使用工具:Jupyter Notebook
Python 版本:Python 3.7
操作系统:Windows 10
import requests
from bs4 import BeautifulSoup
import json
import pandas
# Request headers sent with every HTTP call; the User-Agent string is that of
# a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Module-level accumulator: parse_data() appends one dict per article here and
# main() turns the list into a DataFrame.
items = []
def in_text(url):
    """Fetch one article page and extract its date line and body text.

    Args:
        url: Address of the article page to download.

    Returns:
        A ``(main_d, in_t)`` tuple. ``main_d`` is the text of the first
        element matching ``.date``; ``in_t`` is the text of every ``<p>``
        inside the first ``#article`` element except the first one, joined
        with newlines. Either value is an empty string when the matching
        element is absent from the page.

    Raises:
        requests.exceptions.RequestException: If the download fails or does
            not answer within the timeout.
    """
    # A timeout keeps one unresponsive article from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = 'utf8'
    soup = BeautifulSoup(res.text, 'lxml')

    date_nodes = soup.select('.date')
    main_d = date_nodes[0].text if date_nodes else ''

    article_nodes = soup.select('#article')
    paragraphs = article_nodes[0].find_all('p') if article_nodes else []
    # The first <p> is skipped (presumably it is not body text -- TODO
    # confirm against a live page). All remaining paragraphs are kept, so the
    # result is the whole body rather than a single paragraph.
    in_t = '\n'.join(paragraph.text for paragraph in paragraphs[1:])
    return main_d, in_t
def parse_data(data):
    """Turn feed entries into article records appended to the global ``items``.

    For every entry the linked article page is fetched through ``in_text`` and
    a dict with the export columns is appended to ``items``.

    Args:
        data: Iterable of dicts, each providing the keys ``title``,
            ``intro``, ``url`` and ``media_name``.

    Raises:
        KeyError: If an entry lacks one of the keys listed above.
    """
    for entry in data:
        # Read every required field before the network round-trip so that a
        # malformed entry fails fast.
        title = entry['title']
        intro = entry['intro']
        in_url = entry['url']
        media_name = entry['media_name']

        main_d, in_t = in_text(in_url)
        items.append({
            '标题': title,
            '原标题': intro,
            '发布日期': main_d,
            '新闻来源': media_name,
            '内文': in_t,
        })
    # The function must return normally: terminating the interpreter here
    # would stop main() before the collected items are written to disk.
def _parse_jsonp(text):
    """Return the JSON value carried by a JSONP response body.

    Bodies of the form ``try{callback(<json>);}catch(e){};`` or
    ``callback(<json>)`` are supported, as well as a bare JSON document.
    """
    stripped = text.strip()
    if stripped.startswith(('{', '[')):
        # No callback wrapper at all: the body is already plain JSON.
        return json.loads(stripped)
    open_paren = stripped.find('(')
    if open_paren == -1:
        # Not JSONP; let json.loads raise its usual, descriptive error.
        return json.loads(stripped)
    payload = stripped[open_paren + 1:].lstrip()
    # raw_decode parses one JSON value and ignores whatever follows it, so
    # the trailing ");}catch(e){};" needs no character-by-character stripping.
    value, _end = json.JSONDecoder().raw_decode(payload)
    return value


def parse_url(url, page):
    """Download one page of the news feed and hand its entries to parse_data.

    Args:
        url: Feed URL template containing one ``{}`` placeholder that is
            filled with the page number.
        page: Page number to request.

    Raises:
        requests.exceptions.RequestException: If the download fails or does
            not answer within the timeout.
        json.JSONDecodeError: If the response carries no valid JSON payload.
        KeyError: If the payload lacks ``result`` or ``data``.
    """
    page_url = url.format(page)
    res = requests.get(url=page_url, headers=headers, timeout=10)
    js_text = _parse_jsonp(res.text)
    data = js_text['result']['data']
    parse_data(data)
def main():
    """Prompt for a page range, crawl every page in it and export the result.

    The start and end page numbers are read from standard input; both ends
    are inclusive. After all pages have been processed the collected
    ``items`` are written to ``news.xlsx``.
    """
    url = 'https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback=feedCardJsonpCallback&_=1561466144181'
    first_page = int(input('请输入起始页码:'))
    last_page = int(input('请输入结束页码:'))

    current = first_page
    while current <= last_page:
        parse_url(url, current)
        current += 1

    # Save the data
    frame = pandas.DataFrame(items)
    frame.to_excel('news.xlsx')


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

生成结果范例:
运行结束后,会在当前工作目录下生成 news.xlsx(原文环境中该目录位于本地 C 盘)。

网友评论