Scraping People's Daily posts from Sina Weibo with Python

Author: 白面金毛九尾狐 | Published 2019-01-10 13:52

First, construct the request headers:

base_url = 'https://m.weibo.cn/api/container/getIndex?'

headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2803301701',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}
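These headers mimic the Ajax calls the mobile site makes itself: the X-Requested-With header marks the request as an XMLHttpRequest, and the Referer points at the People's Daily profile page.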

Next, fill in the query parameters for the People's Daily account: the uid and containerid values, which you can read off the getIndex request under the Network tab of the browser's developer tools.

def get_page(page):
    params = {
        'type': 'uid',
        'value': '2803301701',
        'containerid': '1076032803301701',
        'page': page
    }
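(Note that here containerid is simply the prefix 107603 followed by the uid; that is an observation about this account, not a documented rule.)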

Construct the request URL:

    url = base_url + urlencode(params)
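For page 1 this yields https://m.weibo.cn/api/container/getIndex?type=uid&value=2803301701&containerid=1076032803301701&page=1.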

Check whether the request succeeded (200 = OK, 404 = not found, 500 = server error):

    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)

Extract the fields we need from the response (the mblog object inside each card):


def parse_page(json, page: int):
    if json:
        items = json.get('data').get('cards')  # the response nests dicts: data -> cards -> each card's mblog
        for index, item in enumerate(items):
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()  # pq strips the HTML markup from the text
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo
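As a quick illustration of what pq does to a post's text field (a minimal, self-contained example):

from pyquery import PyQuery as pq

sample = '<div>Headline text <a href="#">full post</a></div>'
print(pq(sample).text())  # -> Headline text full post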

The main block drives the crawl and prints the results:


if __name__ == '__main__':
    for page in range(1, max_page + 1):
        json = get_page(page)
        results = parse_page(*json)
        for result in results:
            print(result)

The complete code follows (it also defines max_page, which the loop above relies on).

Requirement: scrape 500 Weibo headline posts and store them in MySQL.

Analysis:

Headlines URL: https://weibo.com/?category=1760 (verified; it does not change)

Posts arrive in groups of 23-24; the page auto-refreshes via Ajax, each refresh loading another group of roughly 23-24 items.
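To size max_page for the 500-post target, one quick check (a sketch only; it assumes the get_page function above is in scope and the API is reachable) is to fetch a single page and count the cards it returns:

result = get_page(1)
if result:
    data, _ = result
    cards = data.get('data', {}).get('cards', [])
    print(len(cards))  # posts per page; pages needed is roughly 500 / len(cards)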

"""
Database schema, created under mydatabase:
create table sina (
    id varchar(255) primary key,
    text TEXT,
    attitudes varchar(255),
    comments varchar(255),
    reposts varchar(255)
)
"""
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import pymysql

base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2803301701',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}
max_page = 100


def get_page(page):
    params = {
        'type': 'uid',
        'value': '2803301701',
        'containerid': '1076032803301701',
        'page': page
    }
    url = base_url + urlencode(params)
    # print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:  # check whether the request succeeded
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json, page: int):
    if json:
        items = json.get('data').get('cards')  # the response nests dicts: data -> cards -> each card's mblog
        for index, item in enumerate(items):
            item = item.get('mblog')
            if not item:  # some cards are not regular posts and carry no mblog; skip them
                continue
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()  # pq strips the HTML markup from the text
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo


if __name__ == '__main__':
    db = pymysql.connect(host="localhost", user="root", password="root", database="mydatabase", charset="utf8")
    cursor = db.cursor()
    for page in range(1, max_page + 1):
        json = get_page(page)
        if not json:  # the request failed; skip this page
            continue
        results = parse_page(*json)
        for result in results:
            # print(result)
            id = result['id']
            text = result['text']
            attitudes = result['attitudes']
            comments = result['comments']
            reposts = result['reposts']
            sql = "insert into sina (id, text, attitudes, comments, reposts)" \
                  " values (%s, %s, %s, %s, %s)"
            cursor.execute(sql, [id, text, str(attitudes), str(comments), str(reposts)])
            db.commit()
    db.close()
    print("ojbk")
