好像页面的处理做得不是很好
#coding='gbk'
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class lhxy(BasicNewsRecipe):
    """Calibre recipe that turns one table-of-contents page into a single feed.

    ``parse_index`` downloads the index page, collects the first link of each
    paragraph inside the article body, and hands those links to calibre as the
    articles of one feed.
    """

    title = u"静觅爬虫学习系列教程"
    description = u"..."
    language = 'zh'
    max_articles_per_feed = 1000
    oldest_article = 500
    remove_javascript = True
    cover_url = 'https://xxx.jpg'  # placeholder: replace with your own cover image URL
    no_stylesheets = True
    # Selectors for the parts of each downloaded page that are kept:
    # the <h1 class="article-title"> and the <article class="article-content">.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'article-title'}),
        dict(name='article', attrs={'class': 'article-content'}),
    ]

    def get_title(self, link):
        """Return the text of the anchor ``link`` without surrounding whitespace.

        The text is extracted with the inherited ``tag_to_string`` helper
        rather than ``link.contents[0]``, so an empty anchor or an anchor whose
        first child is a nested tag (e.g. ``<a><strong>..</strong></a>``)
        yields a string instead of raising.
        """
        return self.tag_to_string(link).strip()

    def parse_index(self):
        """Build the feed list from the table-of-contents page.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where ``articles``
            is a list of ``{'title': ..., 'url': ...}`` dicts, one per
            paragraph of the index page that contains a link with an ``href``.

        Raises:
            ValueError: if the index page has no
                ``<article class="article-content">`` element.
        """
        index_url = 'http://cuiqingcai.com/1052.html'
        soup = self.index_to_soup(index_url)
        container = soup.find('article', attrs={'class': "article-content"})
        if container is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError(
                'No <article class="article-content"> element found in %s'
                % index_url
            )

        articles = []
        for paragraph in container.findAll('p'):
            anchor = paragraph.a  # first <a> inside the paragraph, if any
            if anchor is None:
                continue
            url = anchor.get('href')
            if not url:
                # An anchor without a target cannot be downloaded; skip it.
                continue
            # The title is kept as text (not UTF-8 encoded bytes).
            articles.append({'title': self.get_title(anchor), 'url': url})

        return [(u'静觅爬虫学习系列教程', articles)]
网友评论