美文网首页
抓取虎嗅网

抓取虎嗅网

作者: 丽雁解 | 来源:发表于2018-01-17 23:27 被阅读0次
    from lxml import etree
    import requests
    import json
    
    # Site root; also the prefix used when turning relative article links into absolute URLs.
    root_huxiu_url = 'https://www.huxiu.com/'

    # AJAX endpoint that is POSTed to for one page of a channel's article list.
    post_url = 'https://www.huxiu.com/channel/ajaxGetMore'

    # Headers sent with every request in this script.
    # NOTE(review): the cookie looks like a captured browser session and has
    # presumably expired — confirm and refresh before running.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Cookie': 'aliyungf_tc=AQAAAIokYwn+WwMAN9UmZQmaqDaXEhQv; huxiu_analyzer_wcy_id=4f1wsxk7txc42t0xk7w; Hm_lvt_324368ef52596457d064ca5db8c6618e=1516200027; Hm_lpvt_324368ef52596457d064ca5db8c6618e=1516200027; _ga=GA1.2.1212828852.1516200027; _gid=GA1.2.1755766605.1516200027; screen=%7B%22w%22%3A1366%2C%22h%22%3A768%2C%22d%22%3A1%7D; SERVERID=03a07aad3597ca2bb83bc3f5ca3decf7|1516199779|1516199738',
    }
    
    def get_channel_info(root_url):
        """Collect the channels listed in the site's header menu.

        Args:
            root_url: URL of the page whose header menu is parsed.

        Returns:
            A list of dicts, one per usable menu link, with the keys
            ``channel_name`` (the link text) and ``catId`` (the link href with
            the ``/channel/`` prefix and ``.html`` suffix removed).
        """
        # A timeout keeps the crawler from hanging forever on a stalled connection.
        req = requests.get(root_url, headers=headers, timeout=10)
        selector = etree.HTML(req.text)
        infos = selector.xpath('//ul[@class="header-column header-column1 header-column-zx menu-box"]/li/a')
        items = []
        for info in infos:
            names = info.xpath('text()')
            hrefs = info.xpath('@href')
            # Skip links that lack direct text or an href instead of raising
            # IndexError on the [0] lookup.
            if not names or not hrefs:
                continue
            catId = hrefs[0].replace('/channel/', '').replace('.html', '')
            items.append({'channel_name': names[0], 'catId': catId})
        return items
    
    def get_totalPage(catId):
        """Return the number of list pages available for a channel.

        Requests page 1 of the channel from the ajaxGetMore endpoint and reads
        ``data.total_page`` from the JSON reply.

        Args:
            catId: Channel id sent as the ``catId`` form field.

        Returns:
            The page count converted to ``str``.

        Raises:
            KeyError: If the reply has no ``data`` / ``total_page`` entry.
        """
        post_data = {  # form data
            'huxiu_hash_code': '25ac5e645e763c56a512d97ab1901874',
            'page': 1,
            'catId': catId,
        }
        # Uses the module-level post_url (same endpoint get_article_info posts to)
        # rather than a duplicated local copy; the timeout prevents an indefinite hang.
        html = requests.post(post_url, data=post_data, headers=headers, timeout=10).text
        dict_data = json.loads(html)  # the endpoint answers with JSON
        total_page = dict_data['data']['total_page']
        return str(total_page)
    
    def get_article_info(channel_name,catId,page):
        """Fetch one page of a channel's article list and print each article.

        Posts ``page`` and ``catId`` to the ajaxGetMore endpoint, extracts the
        article links from the HTML fragment in the reply, then downloads every
        article and prints its URL, title and paragraph text.

        Args:
            channel_name: Channel display name; only used in the progress line.
            catId: Channel id sent as the ``catId`` form field.
            page: Page number sent as the ``page`` form field.

        Raises:
            KeyError: If the reply lacks the expected ``data`` entries.
        """
        post_data = {  # form data
            'huxiu_hash_code': '25ac5e645e763c56a512d97ab1901874',
            'page': page,
            'catId': catId,
        }
        html = requests.post(post_url, data=post_data, headers=headers, timeout=10).text
        dict_data = json.loads(html)  # the endpoint answers with JSON
        parse_data = dict_data['data']
        total_page = parse_data['total_page']
        data_html = parse_data['data'].strip()
        print(channel_name,catId,total_page,data_html[0:10])

        # An empty fragment has nothing to parse; handing it to lxml would fail.
        if not data_html:
            return

        selector2 = etree.HTML(data_html)
        articles_url = selector2.xpath('//a/@href')

        # Each article is normally linked twice in the fragment. De-duplicate
        # explicitly (keeping first-seen order) instead of taking every second
        # link, so a single unpaired link cannot shift the stride and cause
        # articles to be skipped or fetched twice.
        seen = set()
        for a_url in articles_url:
            if not a_url.startswith('/article') or a_url in seen:
                continue
            seen.add(a_url)

            # Drop the leading '/' because root_huxiu_url already ends with one.
            article_url = root_huxiu_url + a_url[1:]
            print(article_url)

            req3 = requests.get(article_url, headers=headers, timeout=10)
            selector3 = etree.HTML(req3.text)
            titles = selector3.xpath('//div[@class="article-wrap"]/h1/text()')
            if not titles:
                # Page layout differs (or the page is blocked); skip it rather
                # than abort the whole crawl with an IndexError.
                print('skipped: no title found')
                continue
            title = titles[0].strip()
            content = selector3.xpath('//p/text()')
            whole_content = '\n'.join(content)  # one paragraph per line
            print(title)
            print(whole_content)
    
    def main():
        """Crawl every channel from the home page, page by page.

        For each channel returned by get_channel_info, looks up its page count
        with get_totalPage and calls get_article_info for pages 1..page count.
        """
        channels_info = get_channel_info(root_huxiu_url)
        for one_channel in channels_info:
            print(one_channel)
            pages = get_totalPage(one_channel['catId'])
            print(one_channel['channel_name'] + 'pages:' + str(pages))
            # get_totalPage returns the count as a string; iterating that string
            # would walk its characters ("12" -> '1', '2'), so convert it and
            # walk the real 1-based page range instead.
            for ipage in range(1, int(pages) + 1):
                get_article_info(one_channel['channel_name'], one_channel['catId'], ipage)

    # Run the crawl only when executed as a script, not when imported.
    if __name__ == '__main__':
        main()
    

    相关文章

      网友评论

          本文标题:抓取虎嗅网

          本文链接:https://www.haomeiwen.com/subject/otupoxtx.html