I found the following code, in a file named newsfeatures.py, in the book Programming Collective Intelligence.
Here is the code:
import feedparser
import re

feedlist=['http://today.reuters.com/rss/topNews',
          'http://today.reuters.com/rss/domesticNews',
          'http://today.reuters.com/rss/worldNews',
          'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
          'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
          'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
          'http://news.google.com/?output=rss',
          'http://feeds.salon.com/salon/news',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
          'http://rss.cnn.com/rss/edition.rss',
          'http://rss.cnn.com/rss/edition_world.rss',
          'http://rss.cnn.com/rss/edition_us.rss']

def stripHTML(h):
    # Strip HTML tags, keeping only the text between them
    p=''
    s=0
    for c in h:
        if c=='<':
            s=1
        elif c=='>':
            s=0
            p+=' '
        elif s==0:
            p+=c
    return p

def separatewords(text):
    # Split on non-word characters and keep lowercased words longer than 3 letters
    splitter=re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if len(s)>3]

def getarticlewords():
    allwords={}
    articlewords=[]
    articletitles=[]
    ec=0
    # Loop over every feed
    for feed in feedlist:
        f=feedparser.parse(feed)
        # Loop over every article
        for e in f.entries:
            # Ignore articles with identical titles
            if e.title in articletitles: continue
            # Extract the words
            txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8'))
            words=separatewords(txt)
            articlewords.append({})
            articletitles.append(e.title)
            # Increase the counts for this word in allwords and articlewords
            for word in words:
                allwords.setdefault(word,0)
                allwords[word]+=1
                articlewords[ec].setdefault(word,0)
                articlewords[ec][word]+=1
            ec+=1
    return allwords,articlewords,articletitles

def makematrix(allw,articlew):
    wordvec=[]
    # Only take words that are common, but not too common
    for w,c in allw.items():
        if c>3 and c<len(articlew)*0.6:
            wordvec.append(w)
    # Create the word matrix
    l1=[[(word in f and f[word] or 0) for word in wordvec] for f in articlew]
    return l1,wordvec

from numpy import *

def showfeatures(w,h,titles,wordvec,out='features.txt'):
    outfile=file(out,'w')
    pc,wc=shape(h)
    toppatterns=[[] for i in range(len(titles))]
    patternnames=[]
    # Loop over all the features
    for i in range(pc):
        slist=[]
        # Create a list of words and their weights
        for j in range(wc):
            slist.append((h[i,j],wordvec[j]))
        # Reverse sort the word list
        slist.sort()
        slist.reverse()
        # Print the first six elements
        n=[s[1] for s in slist[0:6]]
        outfile.write(str(n)+'\n')
        patternnames.append(n)
        # Create a list of articles for this feature
        flist=[]
        for j in range(len(titles)):
            # Add the article with its weight
            flist.append((w[j,i],titles[j]))
            toppatterns[j].append((w[j,i],i,titles[j]))
        # Reverse sort the list
        flist.sort()
        flist.reverse()
        # Show the top three articles
        for f in flist[0:3]:
            outfile.write(str(f)+'\n')
        outfile.write('\n')
    outfile.close()
    # Return the pattern names for later use
    return toppatterns,patternnames
It is used like this:
>>> import newsfeatures
>>> allw,artw,artt = newsfeatures.getarticlewords()
>>> artt[1]
u'Fatah, Hamas men abducted freed: sources'
As you can see, that line of code produces the news headline:

>>> artt[1]
u'Fatah, Hamas men abducted freed: sources'
What I would like to know is whether there is a way for the program to display not only the headline, but also the source of the headline, i.e. which feed in feedlist it came from.
Can anyone help?
Thanks!
Answer:
In getarticlewords(), replace

articletitles.append(e.title)

with:

articletitles.append(' '.join([e.title, ', from', feed]))
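With that change each stored title carries its feed URL, so artt[1] would come back looking roughly like u'Fatah, Hamas men abducted freed: sources , from http://today.reuters.com/rss/topNews' (the exact feed shown depends on which feed the article was actually fetched from).

If you would rather keep the titles unchanged and track the sources separately, here is a minimal sketch of an alternative, assuming you are free to have getarticlewords() return one extra list (the articlesources name is just something I made up for illustration):

def getarticlewords():
    allwords={}
    articlewords=[]
    articletitles=[]
    articlesources=[]   # hypothetical extra list: one feed URL per kept article
    ec=0
    # Loop over every feed
    for feed in feedlist:
        f=feedparser.parse(feed)
        # Loop over every article
        for e in f.entries:
            # Ignore articles with identical titles (still compares raw titles)
            if e.title in articletitles: continue
            # Extract the words
            txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8'))
            words=separatewords(txt)
            articlewords.append({})
            articletitles.append(e.title)
            articlesources.append(feed)   # remember which feed this article came from
            # Increase the counts for this word in allwords and articlewords
            for word in words:
                allwords.setdefault(word,0)
                allwords[word]+=1
                articlewords[ec].setdefault(word,0)
                articlewords[ec][word]+=1
            ec+=1
    return allwords,articlewords,articletitles,articlesources

Used like:

>>> allw,artw,artt,artsrc = newsfeatures.getarticlewords()
>>> artt[1]        # plain title, as before
>>> artsrc[1]      # the matching URL from feedlist

One thing to watch with the one-line change above: the duplicate check if e.title in articletitles compares the raw title against the annotated strings, so after that change identical articles appearing in different feeds are no longer skipped. Keeping the sources in a separate list, as in the sketch, avoids that side effect. Either way, any code that unpacks exactly three return values from getarticlewords() would need updating if a fourth is added.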