Technical approach:
Python + requests + BeautifulSoup + re

The code is as follows:
import requests
from bs4 import BeautifulSoup
import re


def getHTMLcode(url, data):
    """Fetch a page and return its text together with the detected encoding."""
    try:
        r = requests.get(url, headers=data)
        print(r.status_code)
        r.raise_for_status()               # raise an exception on 4xx/5xx responses
        print(r.apparent_encoding)
        r.encoding = r.apparent_encoding   # prefer the encoding guessed from the body
        print(r.encoding)
        return r.text, r.encoding
    except requests.RequestException:
        print('Fetch failed')
        return None, None                  # keep the caller's tuple unpacking safe
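A quick illustration of why the function switches to apparent_encoding: requests initially sets r.encoding from the HTTP response headers (often falling back to ISO-8859-1 when no charset is declared), while r.apparent_encoding is guessed from the response body itself. A minimal sketch; the URL here is only a placeholder:

import requests

r = requests.get('https://example.com')      # placeholder URL for illustration
print(r.encoding)            # encoding taken from the HTTP headers
print(r.apparent_encoding)   # encoding guessed from the body by charset detection
r.encoding = r.apparent_encoding  # set before reading r.text to avoid mojibake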
def parsePage(contain, html):
    """Extract (author, content, vote count) triples from one page."""
    soup = BeautifulSoup(html, "html.parser")
    # find_all(name, attrs, recursive, text, **kwargs)
    items = soup.find_all(name='div', class_='article')
    for item in items:
        print(item)
        print('-------------------------------')
        item = str(item)
        a = []
        # captured groups: author, content, vote count
        pattern = re.compile(
            r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>'
            r'.*?<i class="number">(.*?)</i>', re.S)
        groups = re.findall(pattern, item)  # a list of tuples, one tuple per match
        for g in groups:                    # g is an (author, content, votes) tuple
            a.append(g[0])
            a.append(g[1])
            a.append(g[2])
        contain.append(a)
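Since each item is already a parsed BeautifulSoup tag, the same three fields could also be pulled out by navigating the tree directly, instead of converting the tag back to a string and running a regex over it. A sketch, assuming the page keeps the <h2>/<span>/<i class="number"> structure the regex targets; parse_item_with_bs4 is a hypothetical helper, not part of the original script:

def parse_item_with_bs4(item):
    # Alternative to the regex: navigate the parsed tree directly.
    # Assumes the same <h2>/<span>/<i class="number"> layout as above.
    author = item.find('h2')
    content = item.find('span')
    votes = item.find('i', class_='number')
    if author and content and votes:
        return [author.get_text(strip=True),
                content.get_text(strip=True),
                votes.get_text(strip=True)]
    return None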
def saveArticle(contain, filPath):
    """Append each extracted article to a plain-text file."""
    # open the file once rather than once per article
    with open(filPath, 'a+', encoding='utf-8') as f:
        for article in contain:
            temp = ('Author: ' + article[0].strip('\n') + '\n'
                    + 'Content:\n' + article[1].strip('\n').replace('<br/>', '')
                    + '\nVotes: ' + article[2].strip('\n') + '\n\n\n\n')
            f.write(temp)
def spyder(url, data, depth, filPath):
    """Crawl `depth` pages, parse each one, and save the results."""
    for i in range(depth):
        page_url = url + str(i + 1)   # page numbers start at 1; do not mutate url itself
        html, encoding = getHTMLcode(page_url, data)
        if html is None or encoding == 'ISO-8859-2':
            # skip pages that failed to download or whose encoding was mis-detected
            continue
        contain = []
        parsePage(contain, html)
        saveArticle(contain, filPath)
if __name__ == "__main__":
    depth = 10
    url = 'https://www.qiushibaike.com/hot/page/'
    data = {'User-Agent': 'Mozilla/5.0'}
    # write plain text, so use a .txt file: writing raw text into a .docx
    # would not produce a valid Word document
    filPath = 'newarticle.txt'
    spyder(url, data, depth, filPath)
Key points:

Writing the regular expression:

pattern = re.compile(
    r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<i class="number">(.*?)</i>', re.S)

Notes:
1. Each (.*?) is a non-greedy capture group. re.findall(pattern, item) returns only the captured parts of every match, packed into a tuple, so when a fragment contains several matches the result is a list of tuples.
2. re.S makes . match newlines as well, which is required here because the author block, the content, and the vote count are spread across multiple lines of HTML.
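A minimal self-contained demonstration of both points, using a made-up HTML snippet rather than the real page:

import re

html = '''<div class="author clearfix"><h2>
Alice
</h2></div><span>first joke
second line</span><i class="number">42</i>'''

pattern = re.compile(
    r'<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<i class="number">(.*?)</i>', re.S)
print(re.findall(pattern, html))
# [('\nAlice\n', 'first joke\nsecond line', '42')]  -- a list of tuples;
# without re.S, '.' would stop at newlines and this snippet would not match at all.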