This crawler uses regular expressions, OrderedDict and pandas, and adds a time.sleep so the crawl doesn't run fast enough to get our IP banned by the server. Nothing technically new this time; the main addition is a bit of data cleaning. It feels like the natural next step is proxy IPs plus multithreading (a sketch of that follows the main listing), and pandas really is a library well suited to data mining and analysis!
I have a hunch that if more news were scraped, a lot of interesting conclusions could be drawn from it: social hot topics, the political climate, and so on (a small analysis example also follows the listing).
OK, straight to the code.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import time
from collections import OrderedDict

import requests
import pandas as pd
from bs4 import BeautifulSoup
newsAPI = 'http://api.roll.news.sina.com.cn/zt_list'
CommentsAPI = 'http://comment5.news.sina.com.cn/page/info'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
           'Accept-Language': 'zh-CN,zh;q=0.8'}
# query parameters for the rolling-news API (domestic news channel)
paramsNewsAPI = {'channel': 'news',
                 'cat_1': 'gnxw',
                 'cat_2': '=gdxw1||=gatxw||=zs-pl||=mtjj',
                 'level': '=1||=2',
                 'show_ext': 1,
                 'show_all': 1,
                 'show_num': 22,
                 'tag': '1',
                 'format': 'json',
                 'page': 1,
                 }
# query parameters for the comment API; 'newsid' is overwritten per article
paramsCommentsAPI = {'version': '1',
                     'format': 'json',
                     'channel': 'gn',
                     'newsid': 'comos-ifymenmt5700372',
                     'group': None,
                     'compress': 0,
                     'ie': 'utf-8',
                     'oe': 'utf-8',
                     'page': 1,
                     'page_size': 20
                     }
def getnewsInfo(newslist, curSession):
    newsInfolist = []
    for news in newslist['result']['data']:
        # OrderedDict keeps the column order stable in the final DataFrame
        newsInfo = OrderedDict()
        newsInfo['title'] = news['title']
        newsInfo['source'] = news['media_name']
        newsInfo['link'] = news['url']
        # the docid is embedded in the article URL, e.g. .../doc-ifymenmt5700372.shtml
        newsInfo['newsid'] = re.search(r'doc-(.*)\.shtml', news['url']).group(1)
        newsInfo['keywords'] = news['keywords'].split(',')
        getNewsContent(newsInfo, curSession)
        getNewsComments(newsInfo, curSession)
        # throttle the crawl so the server doesn't ban our IP
        time.sleep(0.1)
        newsInfolist.append(newsInfo)
    return newsInfolist
def getNewsContent(newsInfo, curSession):
    resp = curSession.get(newsInfo['link'])
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    bsobj = BeautifulSoup(resp.text, 'html.parser')
    # the first node of the time-source span is the publication date
    newsInfo['date'] = bsobj.find('span', {'class': 'time-source'}).contents[0].strip()
    # the editor line reads like '责任编辑：XXX'; keep the name after the full-width colon
    newsInfo['editor'] = bsobj.find('p', {'class': 'article-editor'}).get_text().split('：')[-1]
def getNewsComments(newsInfo, curSession):
    # the comment API wants the docid without its leading character, prefixed with 'comos-'
    paramsCommentsAPI['newsid'] = 'comos-{}'.format(newsInfo['newsid'][1:])
    resp = curSession.get(CommentsAPI, params=paramsCommentsAPI)
    result = resp.json()
    if result['result']['status']['code'] != 0:
        # a non-zero status code means no comment data for this article
        newsInfo['comments'] = 0
        newsInfo['commenter'] = 0
    else:
        newsInfo['comments'] = result['result']['count']['show']
        newsInfo['commenter'] = result['result']['count']['total']
newsInfoList = []
with requests.Session() as s:
    s.headers.update(headers)
    # adjust the page range as needed
    for page in range(1, 3):
        paramsNewsAPI['page'] = page  # advance the page parameter each request
        resp = s.get(newsAPI, params=paramsNewsAPI)
        resp.encoding = 'utf-8'
        newsInfoList.extend(getnewsInfo(resp.json(), s))
df = pd.DataFrame(newsInfoList)
df.to_excel('新浪新闻.xlsx')
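
To make the "pandas is great for analysis" claim concrete, here is a minimal sketch of what can be done with the df built above: flatten the per-article keyword lists and count them, which is roughly the "social hot topics" idea. It reuses df and the 'keywords'/'source'/'comments' columns from the script, and Series.explode requires pandas 0.25 or newer.

# each row's 'keywords' field is a list, so flatten the column and
# count how often every keyword appears across all scraped articles
keyword_counts = df['keywords'].explode().value_counts()
print(keyword_counts.head(10))  # the ten most frequent keywords

# total comment volume per source: a rough proxy for which outlets
# draw the most discussion
print(df.groupby('source')['comments'].sum().sort_values(ascending=False))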
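
And on the proxy-IP-plus-multithreading direction mentioned at the top, a minimal sketch under stated assumptions: the proxy address below is a placeholder to be replaced with a real proxy pool, and fetch is a hypothetical worker standing in for the per-article work above. requests routes traffic through a proxies mapping, and concurrent.futures fans the downloads out across threads; headers and newsInfoList are reused from the script.

from concurrent.futures import ThreadPoolExecutor

# placeholder proxy address; in practice rotate through a pool of live proxies
PROXIES = {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}

def fetch(url):
    # hypothetical worker: one article download routed through the proxy
    resp = requests.get(url, headers=headers, proxies=PROXIES, timeout=10)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    return resp.text

urls = [news['link'] for news in newsInfoList]
with ThreadPoolExecutor(max_workers=5) as pool:
    pages = list(pool.map(fetch, urls))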