python百度热搜榜爬取

作者: 側耳听偑 | 来源:发表于2020-11-30 23:14 被阅读0次

三十. 模拟登陆实战 - 爬取微博信息
python百度热搜榜爬取
热点平台搭建（一）——Python爬取热榜数据
python实现爬虫爬取百度热搜汇总
知乎热搜榜单抓取
抓取微博热搜榜单
python爬取百度图片代码
2017-12-31
Python自动发邮件，定制上班划水神器
2019-01-11

# Install the required libraries (bs4, requests) from a terminal:
# pip install bs4 requests
import requests
from bs4 import BeautifulSoup
import bs4

def get_html(url, headers, timeout=10):
    """Fetch *url* with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a host that never
            answers, so a finite default is supplied.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection failures or when the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode using the encoding detected from the body itself, so the Chinese
    # text is not garbled when the declared charset is missing or wrong.
    response.encoding = response.apparent_encoding
    return response.text


def _cell_text(cell):
    """Return the text of a table cell with all spaces and newlines removed."""
    return cell.get_text().replace(' ', '').replace('\n', '')


def get_pages(html):
    """Print every ranked topic found in *html* and collect the topic titles.

    Every ``<tr>`` after the first one is inspected for three cells:
    ``td.first`` (rank), ``td.keyword`` (title) and ``td.last`` (search
    index). Rows missing any of the three are skipped.

    Args:
        html: HTML document text to parse.

    Side effects:
        Prints one formatted line per topic, and appends each title (with the
        literal text ``search`` removed) followed by a newline to the
        module-level string ``s``, which must already exist.
    """
    global s
    soup = BeautifulSoup(html, 'html.parser')
    # Full-width space (U+3000) as fill character for the centred title column.
    tplt = "排名：{0:^4}\t标题：{1:{3}^15}\t热度：{2:^8}"
    collected = []
    # The first <tr> is skipped (presumably the table's header row).
    for row in soup.find_all('tr')[1:]:
        rank_cell = row.find('td', class_='first')
        name_cell = row.find('td', class_='keyword')
        times_cell = row.find('td', class_='last')
        if rank_cell is None or name_cell is None or times_cell is None:
            continue
        topic_rank = _cell_text(rank_cell)
        topic_name = _cell_text(name_cell)
        topic_times = _cell_text(times_cell)
        print(tplt.format(topic_rank, topic_name, topic_times, chr(12288)))
        collected.append(topic_name.replace('search', '') + '\n')
    # Touch the global only when something was found, as the original did.
    if collected:
        s += ''.join(collected)



# Target page and the browser-like header sent with the request.
url = 'http://top.baidu.com/buzz?b=1&fr=20811'
headers = {'User-Agent': 'Mozilla/5.0'}

# get_pages() appends to this module-level accumulator, so it must exist
# before that call.
s = ''
html = get_html(url, headers)
get_pages(html)

# Save the collected titles (one per line) as UTF-8 text.
with open('百度热榜.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(s)