string.gne: a library for extracting news body text from HTML
By money at 2021-11-15 • 0 favorites • 1520 views
//modeled after the Python library GNE (GeneralNewsExtractor) by @青南
import string.html;
import string.xml; //ncr (entity decoding) below comes from here
import math;
import string.regex;
namespace string.gne{
import console
content_tag = 'p';
rep = ..string.replace;
ncr = ..string.xml.ncr;
removeTag = ..string.html.removeTag;
match = ..string.match;
push = ..table.push;
len = ..string.len;
join = ..string.join;
split = ..string.split;
abs = ..math.abs;
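//punctuation set (fullwidth and ASCII) used by count_punctuation_num when computing symbol density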
punctuation = {
["!"]=1;[","]=1;["。"]=1;["?"]=1;[";"]=1;[":"]=1;["“"]=1;["”"]=1;["‘"]=1;["’"]=1;["《"]=1;["》"]=1;["("]=1;
[")"]=1;["【"]=1;["】"]=1;["、"]=1;["—"]=1;["…"]=1;["~"]=1;["·"]=1;["〉"]=1;["〈"]=1;
[","]=1;["."]=1;["?"]=1;[":"]=1;[";"]=1;["'"]=1;[" "]=1;
['"']=1;["!"]=1;["%"]=1;["("]=1;[")"]=1
};
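//tags that carry no extractable text; declared here for reference but not used below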
uselesstag={
['img']=1;['svg']=1;['video']=1;['object']=1;['embed']=1;['audio']=1;['applet']=1;
['map']=1;['area']=1;['base']=1;['head']=1;['basefont']=1;['br']=1;['button']=1;
['input']=1;['canvas']=1;['iframe']=1;['frame']=1;['frameset']=1;
}
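//a text node with one of these ancestors is treated as link text rather than body text (see extract)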
uselesstag1={
['a']=1;
}
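//byline prefixes tried in order: 责编 (managing editor), 作者 (author), 编辑 (editor), 文 (by), 撰文 (written by), 来源 (source)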
authorPerlHeads={
"责编";
"作者";
"编辑";
"文";
"撰文";
"来源"
}
//a separator (colon, fullwidth colon, space, 丨 or /), then a short author name captured up to the next separator
authorPerlTail = "[:\: 丨/]\s*([^:\::\s]{2,5})[^:\::]*";
pubdayPerls={
"(202\d-\d{2}-\d{2} \d{2}\:\d{2}\:\d{2})";
"(202\d/\d{2}/\d{2} \d{2}\:\d{2}\:\d{2})";
"(202\d-\d{2}-\d{2} \d{2}\:\d{2})[^\:]";
"(202\d/\d{2}/\d{2} \d{2}\:\d{2})[^\:]";
"(202\d年\d{2}月\d{2}日 \d{2}时\d{2}分)";
"(202\d年\d{2}月\d{2}日 \d{2}点\d{2}分)";
"(202\d-\d{2}-\d{2}) ";
"(202\d/\d{2}/\d{2}) ";
"(202\d年\d{2}月\d{2}日) ";
"(202\d-\d{1,2}-\d{1,2})";
"(202\d/\d{1,2}/\d{1,2})";
"(202\d年\d{1,2}月\d{1,2}日)";
}
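/*
All of the patterns above anchor on "202\d", so only dates in the 2020s are
recognized; widen the prefix (e.g. "20[12]\d") for older archives.
A quick check against a made-up string:
match("published 2021-11-15 10:30:00", pubdayPerls[1]) -> "2021-11-15 10:30:00"
*/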
//variance: s^2 = [(x1-mean)^2 + ... + (xn-mean)^2]/n, or /(n-1) for the sample variance
variance = function(arr) {
var m=#arr;
var sum=0;
for(i=1;m;1){ //sum the values
sum += arr[i];
}
var dAve=sum/m; //arithmetic mean
var dVar=0;
for(i=1;m;1){ //accumulate squared deviations
dVar += (arr[i]-dAve) * (arr[i]-dAve);
}
return dVar/m;
}
//standard deviation: σ = sqrt(s^2)
std = function(arr) {
return ..math.sqrt(variance(arr));
}
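/*
Sanity check with made-up numbers:
variance({2;4;4;4;5;5;7;9}) -> mean 5, squared deviations sum to 32, 32/8 = 4
std({2;4;4;4;5;5;7;9}) -> sqrt(4) = 2
*/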
toText = function(html){
if(!#html) return;
html = rep(html,
"\<\s*pre[^\>]*?\s*\>(.*?)\<\s*/pre\s*>",
function(c){
c = rep(c," ","&nbsp;"); //protect spaces inside <pre>; ncr() decodes the entity back to a space at the end
c = rep(c,'\n',"<br>"); //protect line breaks inside <pre>
return c;
}
);
html = rep(html,"\s+"," ");
html = rep(html,"\<[bB][rR]\s*/*\>",'\r\n');
html = rep(html,"\</*[pP]\>",'\r\n');
html = rep(html,"\</div\>",'\r\n');
html = rep(html,"\</DIV\>",'\r\n');
html = rep(html,"\<.+?\>",'\r\n');
html = rep(html,"[ ]+",'');
html = rep(html,"[\r\n]+",'\r\n');
return ncr(html);
}
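/*
Example with a made-up fragment: toText("<p>hello<br>world</p>") collapses
whitespace, turns <br>, </p>, </div> and any remaining tag into line breaks,
strips plain spaces and decodes entity references, giving roughly "\r\nhello\r\nworld\r\n".
*/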
calc_text_density = function(elm){
/*
Text density formula:
            Ti - LTi
   TDi = --------------
           TGi - LTGi
Ti:   character count of the text under node i
LTi:  character count of the text inside <a> tags under node i
TGi:  number of tags under node i
LTGi: number of <a> tags under node i
*/
var ti_text = toText(elm.innerXml()):"";
var ti = #ti_text;
var lti = {}
var tgi = -1;
var ltgi = 0;
var pcount = 0;
elm.enumNodes(
function(parentElement,index,tagName,childCount,xNode){
if(tagName=='a'){
push(lti, xNode.innerText());
ltgi++;
}
if(tagName==content_tag){
pcount++;
}
tgi++;
}
)
lti = join(lti,'');
lti = #lti
var density
if (tgi == ltgi) density=0
else density = (ti - lti) / (tgi - ltgi);
return {
density=density;
text=ti_text;
ti=ti;
lti=lti;
pcount=pcount;
tgi=tgi;
ltgi=ltgi;
sbdi = calc_sbdi(ti_text, ti, lti)
}
}
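/*
Worked example with made-up numbers: a node holding 500 characters of text
(ti=500), 50 of them inside 5 <a> tags (lti=50, ltgi=5), with 20 tags in total
(tgi=20), gets density = (500 - 50) / (20 - 5) = 30; a link-heavy navigation
block lands far lower, which is what separates body text from page chrome.
*/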
count_punctuation_num = function( text){
if(!text) return 0;
var count = 0
var tab = split(text)
for(i=1;#tab;1){
if(punctuation[tab[i]]){
count++
}
}
return count
}
calc_sbdi = function(text, ti, lti){
/*
             Ti - LTi
   SbDi = --------------
             Sbi + 1
SbDi: symbol density
Sbi:  number of punctuation symbols
*/
var sbi = count_punctuation_num(text)
var sbdi = (ti - lti) / (sbi || 1) //(sbi || 1) only guards against division by zero; the formula above uses Sbi + 1
return sbdi || 1
}
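/*
Worked example with made-up numbers: ti=500, lti=50 and 17 punctuation marks
give sbdi = (500 - 50) / 17, about 26.5; with no punctuation at all the
divisor falls back to 1.
*/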
calc_standard_deviation = function(node_info){
var score_list={}
for(i=1;#node_info;1){
push(score_list, node_info[i].density)
}
return std(score_list)
}
calc_new_score = function(std, node_info){
/*
score = log(std) * ndi * log10(text_tag_count + 2) * log(sbdi)
std:每个节点文本密度的标准差
ndi:节点 i 的文本密度
text_tag_count: 正文所在标签数。例如正文在<p></p>标签里面,这里就是 p 标签数,如果正文在<div></div>标签,这里就是 div 标签数
sbdi:节点 i 的符号密度
*/
for(i=1;#node_info;1){
var info = node_info[i];
//info.score = info.density * info.sbdi * (info.text_tag_count || 1) * (info.tgi?1:0)
info.score = ..math.log(std) * info.density * ..math.log10(info.pcount+2) * ..math.log(info.sbdi)
}
}
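/*
Worked example with made-up numbers: std=10, density=30, pcount=8, sbdi=26.5:
score = ln(10) * 30 * log10(8 + 2) * ln(26.5) ≈ 2.30 * 30 * 1 * 3.28 ≈ 226
*/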
extract_title = function(html){
html = removeTag(html,"script","style");
var doc = ..string.html(html);
//prefer <title>, then fall back to the most prominent heading
var tags = {"title";"h1";"h2";"h3";"h4"};
for(i=1;#tags;1){
var elm = doc.queryEle({tagName=tags[i]});
if(elm and #elm.innerText()) return elm.innerText();
}
}
extract_author = function(html){
for(i=1;#authorPerlHeads;1){
var perl = authorPerlHeads[i]+authorPerlTail;
var author = match(html, perl);
if(author){
return author;
}
}
}
extract_pubDay = function(html){
var tab={}
for(i=1;#pubdayPerls;1){
var pubday = match(html, pubdayPerls[i])
if(pubday){
push(tab, {pubday=pubday;idx=..string.find(html, pubdayPerls[i])})
//return pubday;
}
}
if(#tab){
..table.sort(tab,function(b){
if(owner.idx==b.idx){
return #owner.pubday>#b.pubday;
}
return owner.idx<b.idx;
})
return tab[1].pubday;
}
}
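//candidates are sorted by position in the page, earliest first; on a tie the
//longer, more specific date string wins, so a full timestamp beats a bare date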
extract = function(html){
html = rep(html,"\<\!--.*?--\>" , "");
var author = extract_author(html);
var title = extract_title(html);
var pubday = extract_pubDay(html);
html = match(html,"\<\s*body[^\>]*?\s*\>.+") : html; //keep everything from the <body> tag on, falling back to the whole page
html = removeTag(html,"head","script","style")
var doc = ..string.html(html);
var node_info={}
var onlyTxt={}
doc.enumNodes(
function(parentElement,index,tagName,childCount,xNode){
if(tagName and childCount>1 and tagName!='a'){
push(node_info, calc_text_density(xNode))
}
if(!tagName){
var elm = parentElement;
var lv=5
var founda=0;
while(elm and lv){
if(uselesstag1[elm.tagName]) {
founda=1
break;
}
elm = elm.getParent()
lv--
}
if(!founda and #xNode.text>1 and count_punctuation_num(xNode.text)) push(onlyTxt, xNode.text)
}
}
)
onlyTxt = toText(join(onlyTxt,'\r\n'))
//console.dump(onlyTxt)
var textlen = #onlyTxt;
var std = calc_standard_deviation(node_info)
calc_new_score(std, node_info)
if(#onlyTxt){
for(i=1;#node_info;1){
var info = node_info[i];
var found=0
for(v in ..string.lines(info.text)){
if(v and ..string.indexOf(onlyTxt, v)){
found += #v;
}
}
info.score1 = found/textlen;
info.score2 = #info.text/textlen;
info.score2 = info.score2<1.2?info.score2:0;
//scoring heuristic; tune as needed
info.score = info.score1 * info.score2
//info.score = info.density * info.sbdi * (info.text_tag_count || 1) * (info.tgi?1:0)
info.score = info.score * ..math.log(std) * info.density * ..math.log10(info.pcount+2) * ..math.log(info.sbdi)
}
}
..table.sort(node_info,function(b){
return owner.score>b.score;
})
/*
Inspect the scores:
for(i=1;10;1){
console.dump(node_info[i]);
console.dump("--------------------------------")
}
*/
if(#node_info) return {
content = node_info[1].text;
title = title;
author = author;
pubday = pubday
};
}
}
Example:
import console
import inet.whttp;
var http = inet.whttp()
import string.gne;
var html = http.get("https://www.cnblogs.com/xieqiankun/p/gne_release.html")
var tab = string.gne.extract(html)
console.dump(tab)
console.pause()
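extract returns a table with content, title, author and pubday fields (each may be null when detection fails), so the dump above should print the article body from the cnblogs page along with whatever metadata the patterns picked up.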
3 replies | Last updated 2021-11-29
A new addition to document intelligence: MarkupLM, a model for dynamic markup-language documents
https://mp.weixin.qq.com/s/sEFUe5frk5lKSu7cXqfxgQ
Source code: https://github.com/microsoft/unilm/tree/master/markuplm
I'll give this a try when I have time.

1. python

import requests
from fake_useragent import UserAgent

url1 = "http://www.nxzw.gov.cn/xwzx/zwyw/202111/t20211122_3141633.html"
ua = UserAgent()
headers = {'User-Agent': ua.random}
res = requests.get(url1, headers=headers)

from gne import GeneralNewsExtractor
extractor = GeneralNewsExtractor()
html = res.text
result = extractor.extract(html)
print(result)

2. aardio

I also read the paper behind GNE. I haven't gone through the code in detail yet, but in my tests there is still a gap between the two implementations, and I don't know yet where the problem lies.