The code for crawling by individual news article:
import requests
import os
from pyquery import PyQuery as pq


def Page(num, url, mainurl):
    r = requests.get(url)  # fetch this index page
    html = r.text
    doc = pq(html)
    y = doc("[class='rebox_news']").html()  # the block holding the list of news links
    z = pq(y)
    j = 0
    for i in z('li').items():
        j += 1
        xx = i("a").attr.href
        pageurl = mainurl + xx[2:]  # links are relative ("./t....shtml"), so strip "./" and prepend the index URL
        name = str(num) + '页' + '第' + str(j) + '篇' + i.text()[-21:-19]  # file name: page number + item number + a couple of date characters from the item text
        new = News(pageurl)  # fetch the article text (a str)
        save(name, new)  # save the scraped content


def News(url):
    r = requests.get(url)
    html = r.text
    article = pq(html)
    ar = article("#News_Body_Txt_A p")  # the article body paragraphs
    new = ar.text()
    return new  # return the article text


def save(name, text):
    os.chdir(r'E:\Senior\database language\Pachong\sort')  # choose the save directory
    news = open(name + '.txt', 'w', encoding="utf-8")  # open the file, creating it if it does not exist
    news.write(text)
    news.close()


if __name__ == '__main__':
    indexurl = 'https://www.fmprc.gov.cn/web/fyrbt_673021/jzhsl_673025/'
    furl = indexurl + 'default.shtml'  # URL of the first index page
    Page(1, furl, indexurl)
    for i in range(1, 67):  # following the pagination pattern of the site
        url = indexurl + 'default_' + str(i) + '.shtml'
        Page(i + 1, url, indexurl)  # pass the freshly built url; passing furl here would just re-crawl the first page every time
Crawling the data by page is actually very similar:
import requests
import os
from pyquery import PyQuery as pq


def Page(num, url, mainurl):
    r = requests.get(url)
    html = r.text
    doc = pq(html)
    y = doc("[class='rebox_news']").html()
    z = pq(y)
    acontext = ''
    name = '第' + str(num) + '页'  # one file per index page
    for i in z('li').items():
        xx = i("a").attr.href
        pageurl = mainurl + xx[2:]
        new = News(pageurl)
        acontext += new  # concatenate every article scraped from this index page
    save(name, acontext)


def News(url):
    r = requests.get(url)
    html = r.text
    article = pq(html)
    ar = article("#News_Body_Txt_A p")
    new = ar.text()
    return new


def save(name, text):
    os.chdir(r'E:\Senior\database language\Pachong\pagesort')
    news = open(name + '.txt', 'w', encoding="utf-8")  # utf-8 so the Chinese text writes correctly regardless of the system default
    news.write(text)
    news.close()


if __name__ == '__main__':
    indexurl = 'https://www.fmprc.gov.cn/web/fyrbt_673021/jzhsl_673025/'
    furl = indexurl + 'default.shtml'
    Page(1, furl, indexurl)
    for i in range(1, 67):
        url = indexurl + 'default_' + str(i) + '.shtml'
        Page(i + 1, url, indexurl)
PyQuery turned out to be really pleasant to use, much more convenient than BeautifulSoup for looking up tags and the like. The only snag was that I didn't know about the .items() method at first and was afraid I'd have to fall back on regular expressions, but a bit more searching turned it up.
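For anyone who hasn't met it before, here is a minimal, self-contained sketch of what .items() does, using a made-up HTML snippet rather than the ministry's page (the list markup and href values are invented for illustration):

from pyquery import PyQuery as pq

# a tiny stand-in for an index page; the real scripts above first select "[class='rebox_news']"
html = '<ul><li><a href="./t1.shtml">News A</a></li><li><a href="./t2.shtml">News B</a></li></ul>'
doc = pq(html)
for li in doc('li').items():   # .items() yields each matched element wrapped as its own PyQuery object
    print(li('a').attr.href)   # so per-element calls like .attr and .text() just work
    print(li.text())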
The lesson: do it step by step, then assemble the pieces.
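Concretely, that meant checking each piece on its own before wiring up the full loop, for example trying the article selector against a single page first. A rough sketch of that kind of check (the article path comes from an old debug print in the script and may well be stale by now):

import requests
from pyquery import PyQuery as pq

indexurl = 'https://www.fmprc.gov.cn/web/fyrbt_673021/jzhsl_673025/'
testurl = indexurl + 't1720844.shtml'  # a single article page; path taken from a debug comment, may no longer exist
html = requests.get(testurl).text
body = pq(html)("#News_Body_Txt_A p").text()  # same selector the News() function uses
print(body[:200])  # eyeball the first 200 characters before assembling the loop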
Results (screenshot)
Files written to disk (screenshot)
Contents of a saved file (screenshot)
One particularly amusing thing: every per-page file came out at exactly 84 KB, which seemed magical. Not so magical after all: I had been crawling the same page every time (the furl-vs-url slip noted in the loop above).
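A quick size check over the output folder would have flagged it straight away; this is just a sketch, assuming the same save directory the page-based script writes to:

import os

folder = r'E:\Senior\database language\Pachong\pagesort'  # the folder the page-based script writes into
for fname in sorted(os.listdir(folder)):
    size = os.path.getsize(os.path.join(folder, fname))
    print(fname, size, 'bytes')  # a long run of identical sizes usually means the same page was fetched every time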