美文网首页爬虫实验楼 <( ̄︶ ̄)> 
爬虫——获取知乎热点新闻

爬虫——获取知乎热点新闻

作者: 戴眼镜的莫林 | 来源:发表于2017-03-05 17:44 被阅读0次

    本程序基于 Python 2.7。按照程序提示输入账号和密码之后，即可获得知乎热点新闻的标题和链接。如果想获取知乎的其他信息，可以自行修改代码。

    直接上代码啦

    <code>
    import re
    import requests
    import cookielib
    from PIL import Image
    import time
    import json
    import webbrowser
    from attr import attrib
    from lxml import etree
    import urllib2
    import urlparse
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
    filename = 'cookie'
    session = requests.Session()
    session.cookies = cookielib.CookieJar()
    try:
    session.cookies.load(filename=filename, ignore_discard=True)
    except:
    print('cookie fail')
    # <input type="hidden" name="_xsrf" value="f1f90f1cfe8ec5c732ef0d8833ccabe8"/>
    def get_xsrf():
    response = session.get('https://www.zhihu.com', headers=headers)
    html = response.text
    get_xsrf_pattern = re.compile(r'<input type="hidden" name="_xsrf" value="(.*?)"')
    _xsrf = re.findall(get_xsrf_pattern, html)[0]
    return _xsrf
    def get_captcha():
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    response = session.get(captcha_url, headers=headers)
    with open('cptcha.gif', 'wb') as f:
    f.write(response.content)
    im = Image.open('cptcha.gif')
    im.show()
    captcha = raw_input('Verification code:')
    print captcha
    return captcha
    def login(username, password):
    if re.match(r'\d{11}$', account):
    print('phone logining')
    url = 'http://www.zhihu.com/login/phone_num'
    data = {'_xsrf': get_xsrf(),
    'password': password,
    'remember_me': 'true',
    'phone_num': username
    }
    else:
    print('email longing')
    url = 'https://www.zhihu.com/login/email'
    data = {'_xsrf': get_xsrf(),
    'password': password,
    'remember_me': 'true',
    'email': username
    }
    data['captcha'] = get_captcha()
    result = session.post(url, data=data, headers=headers)
    print((json.loads(result.text))['msg']+' codeLogin')
    # session.cookies.save(ignore_discard=True, ignore_expires=True)
    def nextMore(offset, start):
    url = 'https://www.zhihu.com/node/TopStory2FeedList'
    data = {'params': {'offset':offset, 'start':start},
    'method': 'next'
    }
    result = session.post(url, data=data, headers=headers)
    print((json.loads(result.text))['msg'] + ' ')
# Fetch `url` with urllib2, optionally through `proxy`, retrying transient
# server errors up to `num_retries` times.
#
# url         -- absolute URL to fetch
# headers     -- dict of extra HTTP headers (falsy -> empty dict)
# proxy       -- proxy address, or a falsy value to connect directly
# num_retries -- how many retries remain for 5xx responses
# data        -- optional POST body; None means a GET request
#
# Returns the response body as a string, or '' when the download failed.
def download(url, headers, proxy, num_retries, data=None):
    headers = headers or {}
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        # Route requests for this URL's scheme through the given proxy.
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        # Only HTTPError instances carry a status code; plain URLError
        # (e.g. DNS failure) does not.
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html
    if name == 'main':
    account = raw_input('account:')
    secret = raw_input('password:')
    login(account, secret)
    get_url = 'https://www.zhihu.com/explore/recommendations'
    resp = session.get(get_url, headers=headers, allow_redirects=False)
    page = etree.HTML(resp.text)
    i = 1
    while (i<6):
    string = "//div[@id='zh-recommend']/div[2]/div[1]/div[" + str(i) + "]/h2/a"
    hrefs = page.xpath(string)
    for href in hrefs:
    print href.text + '\n' + 'https://www.zhihu.com' + str(href.attrib['href'])
    url = 'https://www.zhihu.com' + str(href.attrib['href'])
    i = i + 1
    webbrowser.open(get_url, new=0, autoraise=True)
    </code>

    By 戴眼镜的莫林

    相关文章

      网友评论

        本文标题:爬虫——获取知乎热点新闻

        本文链接:https://www.haomeiwen.com/subject/cgjegttx.html