First, construct the request headers:
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2803301701',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}
Next, fill in the request parameters for the People's Daily account. The uid and containerid values can be read in the browser's developer tools: open the Network panel and inspect the getIndex request.
def get_page(page):
    params = {
        'type': 'uid',
        'value': '2803301701',
        'containerid': '1076032803301701',
        'page': page
    }
Then assemble the request URL:
    url = base_url + urlencode(params)
Send the request and check whether it succeeded (200 means success, 404 not found, 500 a server-side error):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)
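For reference, urlencode simply turns the params dict into a query string, so the page-1 request URL ends up looking like this (a small illustration, not part of the crawler itself):

from urllib.parse import urlencode

params = {'type': 'uid', 'value': '2803301701',
          'containerid': '1076032803301701', 'page': 1}
print('https://m.weibo.cn/api/container/getIndex?' + urlencode(params))
# -> https://m.weibo.cn/api/container/getIndex?type=uid&value=2803301701&containerid=1076032803301701&page=1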
Next, pull the fields we care about out of the response (the mblog object inside each card):
def parse_page(json, page: int):
    if json:
        items = json.get('data').get('cards')  # nested JSON: data -> cards, each card holding an mblog dict
        for index, item in enumerate(items):
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()  # pq strips the HTML tags from the text
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo
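To see what the pq(...).text() call does, here is a tiny illustration; the HTML fragment below is made up, but it mimics the kind of markup the mblog 'text' field contains:

from pyquery import PyQuery as pq

# made-up fragment in the style of an mblog 'text' field
html = '<div>今日要闻 <a href="https://m.weibo.cn/status/123">全文</a></div>'
print(pq(html).text())  # tags are stripped; only the readable text remains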
Finally, the main block drives the crawl and prints the results:
if __name__ =='__main__':
for pagein range(1, max_page +1):
json = get_page(page)
results = parse_page(*json)
for resultin results:
print(result)
The complete code is as follows.
Requirement: crawl 500 Weibo headline posts and save them to MySQL.
Analysis:
Headline URL: https://weibo.com/?category=1760 (verified; it does not change.)
Items arrive in batches of 23-24; the page auto-refreshes via Ajax, loading roughly 23-24 items per refresh.
"""
Database schema, in the mydatabase database:
create table sina (
id varchar(255) primary key,
text TEXT,
attitudes varchar(255),
comments varchar(255),
reposts varchar(255)
)
"""
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import pymysql
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
'Host': 'm.weibo.cn',
'Referer': 'https://m.weibo.cn/u/2803301701',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'
}
max_page = 100
def get_page(page):
    params = {
        'type': 'uid',
        'value': '2803301701',
        'containerid': '1076032803301701',
        'page': page
    }
    url = base_url + urlencode(params)
    # print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:  # check that the request succeeded
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)
def parse_page(json, page: int):
    if json:
        items = json.get('data').get('cards')  # nested JSON: data -> cards, each card holding an mblog dict
        for index, item in enumerate(items):  # TODO: individual items do not all carry the same content
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()  # pq strips the HTML tags from the text
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo
if __name__ == '__main__':
    db = pymysql.connect(host="localhost", user="root", password="root",
                         database="mydatabase", charset="utf8")
    cursor = db.cursor()
    for page in range(1, max_page + 1):
        json = get_page(page)
        results = parse_page(*json)
        for result in results:
            # print(result)
            id = result['id']
            text = result['text']
            attitudes = result['attitudes']
            comments = result['comments']
            reposts = result['reposts']
            sql = "insert into sina (id, text, attitudes, comments, reposts)" \
                  " values (%s, %s, %s, %s, %s)"
            cursor.execute(sql, [id, text, str(attitudes), str(comments), str(reposts)])
            db.commit()
    db.close()
    print("ojbk")