美文网首页
Python今日头条网爬虫

Python今日头条网爬虫

作者: peng_js | 来源:发表于2017-07-02 23:06 被阅读0次

    # encoding=utf8

    import requests
    from requests.exceptions import RequestException
    import urllib
    import json
    import re
    from bs4 import BeautifulSoup
    from config import *
    def get_index(offset, keyword):
        """Fetch one page of Toutiao search results as raw JSON text.

        Args:
            offset: Value sent as the ``offset`` query parameter.
            keyword: Search term sent as the ``keyword`` query parameter.

        Returns:
            The response body text when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status or a failed request).
        """
        params = {
            'offset': offset,
            'format': 'json',
            'keyword': keyword,
            'autoload': 'true',
            'count': 20,
            'cur_tab': 3,
        }
        try:
            # The request must sit inside the try block: it is the call that
            # raises RequestException. Letting requests encode the query
            # string also avoids the Python-2-only urllib.urlencode.
            response = requests.get(
                'http://www.toutiao.com/search_content/', params=params)
        except RequestException:
            print(u'请求索引页出错')
            return None
        if response.status_code == 200:
            return response.text
        return None
    def parse_page_index(html):
        """Yield the article URLs contained in a search-index JSON response.

        Args:
            html: JSON text of the index response, or ``None``/empty when the
                request failed.

        Yields:
            Each non-empty ``article_url`` value found in the entries of the
            top-level ``data`` list. Nothing is yielded when the input is
            empty, is not valid JSON, or has no usable ``data`` list.
        """
        if not html:
            return
        try:
            # json.loads turns the JSON string into Python objects
            # (json.dumps is the reverse direction).
            data = json.loads(html)
        except ValueError:
            # Malformed JSON: json raises ValueError (or a subclass of it).
            return
        if not isinstance(data, dict):
            return
        # 'data' may be absent or null; treat both as "no entries".
        for item in data.get('data') or []:
            url = item.get('article_url') if isinstance(item, dict) else None
            # Entries without an article URL cannot be fetched, so skip them.
            if url:
                yield url
    def get_page_detail(url):
        """Download an article detail page.

        Args:
            url: Address of the article page to fetch.

        Returns:
            The response body text when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status or a failed request).
        """
        try:
            # The request must be inside the try block, otherwise the
            # RequestException handler below can never be reached.
            response = requests.get(url)
        except RequestException:
            print('请求详情页出错')
            return None
        if response.status_code == 200:
            return response.text
        return None
    def parse_page_detail(html, url):
        """Extract the title and gallery image URLs from an article page.

        Args:
            html: Markup of the article page.
            url: Address the page was fetched from; echoed back in the result.

        Returns:
            A dict with the keys ``'title'``, ``'url'`` and ``'images'``, or
            ``None`` when the page has no ``var gallery = ...;`` assignment
            or its payload cannot be decoded as a JSON object.
        """
        soup = BeautifulSoup(html, 'lxml')
        title_tags = soup.select('title')
        # A page without a <title> element gets an empty title, not IndexError.
        title = title_tags[0].get_text() if title_tags else ''

        match = re.search(r'var gallery = (.*?);', html, re.S)
        if not match:
            return None
        try:
            # The non-greedy pattern stops at the first ';', so the captured
            # text may be truncated or may not be JSON at all.
            data = json.loads(match.group(1))
        except ValueError:
            return None
        if not isinstance(data, dict):
            return None

        # 'sub_images' may be missing or null; fall back to no images.
        sub_images = data.get('sub_images') or []
        images = [item.get('url') for item in sub_images]
        return {
            'title': title,
            'url': url,
            'images': images,
        }
    def main():
        """Crawl the first search-index page for '街拍' and print gallery titles.

        Fetches the index page, follows every article URL it lists, and prints
        the title of each article from which gallery data could be extracted.
        Failed downloads and pages without gallery data are skipped.
        """
        index_html = get_index(0, '街拍')
        if not index_html:
            # get_index returns None on failure; there is nothing to parse.
            return
        for url in parse_page_index(index_html):
            if not url:
                continue
            detail_html = get_page_detail(url)
            if not detail_html:
                continue
            result = parse_page_detail(detail_html, url)
            # parse_page_detail returns None when the page has no gallery.
            if result:
                print(result['title'])
    # Run the crawler only when this file is executed as a script. The double
    # underscores of __name__ / '__main__' had been stripped, which made this
    # line raise NameError.
    if __name__ == '__main__':
        main()

    相关文章

      网友评论

          本文标题:Python今日头条网爬虫

          本文链接:https://www.haomeiwen.com/subject/ayiocxtx.html