美文网首页
python百度热搜榜爬取

python百度热搜榜爬取

作者: 側耳听偑 | 来源:发表于2020-11-30 23:14 被阅读0次
    # Install the required libraries (bs4, requests) from a terminal:
    # pip install bs4 requests
    import requests
    from bs4 import BeautifulSoup
    import bs4
    
    def get_html(url, headers, timeout=10):
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to fetch.
            headers: HTTP request headers passed through to ``requests.get``.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` waits indefinitely, so an unresponsive
                server would hang the whole script.

        Returns:
            The decoded page content as a ``str``.

        Raises:
            requests.exceptions.RequestException: On connection failure or
                when the timeout elapses.
        """
        r = requests.get(url, headers=headers, timeout=timeout)
        # Decode with the encoding detected from the body itself instead of
        # the one guessed from the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    
    
    def get_pages(html):
        """Print every hot-search row found in *html* and collect its title.

        Each ``<tr>`` after the first is inspected for three cells:
        ``td.first`` (rank), ``td.keyword`` (title) and ``td.last`` (search
        index). Rows missing any of the three are skipped. For every complete
        row a formatted line is printed and the title, followed by a newline,
        is appended to the module-level string ``s``.

        Args:
            html: HTML source of the ranking page.

        Returns:
            None. Results are delivered through ``print`` and the global ``s``.
        """
        global s
        soup = BeautifulSoup(html, 'html.parser')
        # chr(12288) is the full-width (ideographic) space; it is used as the
        # fill character so that centred CJK titles line up in the output.
        row_template = "排名:{0:^4}\t标题:{1:{3}^15}\t热度:{2:^8}"
        collected = []
        # The first <tr> is skipped (presumably the table header -- confirm).
        for row in soup.find_all('tr')[1:]:
            # Look each cell up once and reuse it for both the presence check
            # and the text extraction.
            cells = (
                row.find('td', class_='first'),    # rank
                row.find('td', class_='keyword'),  # title
                row.find('td', class_='last'),     # search index
            )
            if any(cell is None for cell in cells):
                continue
            # Strip spaces and newlines from the text of each cell.
            rank, name, times = (
                cell.get_text().replace(' ', '').replace('\n', '')
                for cell in cells
            )
            print(row_template.format(rank, name, times, chr(12288)))
            # The literal 'search' is removed from the stored title
            # (presumably link text inside the keyword cell -- confirm).
            collected.append(name.replace('search', '') + '\n')
        # Only touch the global when something was found, and extend it with a
        # single join rather than one concatenation per row.
        if collected:
            s = s + ''.join(collected)
    
    
    
    # Accumulator that get_pages() appends the scraped titles to; it has to
    # exist at module level before get_pages() is called.
    s = ''

    # Fetch the Baidu hot-search ranking page with a browser-like User-Agent.
    url = 'http://top.baidu.com/buzz?b=1&fr=20811'
    headers = {
        'User-Agent': 'Mozilla/5.0',
    }
    html = get_html(url, headers)

    # Print the ranking and fill `s` with the titles.
    get_pages(html)

    # Save the collected titles as UTF-8 text.
    with open('百度热榜.txt', mode='w', encoding='utf-8') as f:
        f.write(s)
    

    相关文章

      网友评论

          本文标题:python百度热搜榜爬取

          本文链接:https://www.haomeiwen.com/subject/mpqjwktx.html