Sharing a Zhihu Image Crawler

Author: yousa_ | Published 2019-09-26 13:41

login.py handles the login and saves the authentication token

# -*- coding: utf-8 -*-
'''
  @Description: log in and save the authentication token (cookie)
'''
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()
user = '保密'  # username (redacted in the original post)
pwd = '保密'   # password (redacted in the original post)
try:
    client.login(user, pwd)
    print(u"Login succeeded!")
except NeedCaptchaException:
    # A captcha is required: save it to a file, prompt for the code, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(user, pwd, captcha)
client.save_token('token.pkl')  # save the token for later reuse
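As a side note, the two scripts can be tied together so that a fresh login only happens when no saved token exists. The sketch below is an assumed variant, not part of the original post; it relies on login_in_terminal, the interactive login helper shipped with zhihu_oauth.

# -*- coding: utf-8 -*-
# Sketch only: reuse a saved token if present, otherwise log in interactively.
import os
from zhihu_oauth import ZhihuClient

client = ZhihuClient()
if os.path.isfile('token.pkl'):
    client.load_token('token.pkl')   # reuse the previously saved token
else:
    client.login_in_terminal()       # interactive login (prompts in the console)
    client.save_token('token.pkl')   # persist the token for next time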

pic.py crawls and saves the images from a question's answers

# -*- coding: utf-8 -*-
'''
  @Description: save all images from the answers to a Zhihu question
'''
from __future__ import print_function
from zhihu_oauth import ZhihuClient
import re
import os
import urllib.request

client = ZhihuClient()
client.load_token('token.pkl')  # load the saved login token
# id = 327588950  # https://www.zhihu.com/question/287345713/answer/714145378
# id = 322665913
# id = 22462004
id = 20312271
question = client.question(id)
print(u"Question:", question.title)
print(u"Number of answers:", question.answer_count)
# Create a folder to hold the images
path = question.title.replace('?', '') + u"(图片)"
os.mkdir(path)
index = 1  # image counter
for answer in question.answers:
    content = answer.content  # HTML content of the answer
    re_compile = re.compile(r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
    img_lists = re.findall(re_compile, content)
    for img in img_lists:
        img_url = img[0]  # the first capture group is the image URL
        urllib.request.urlretrieve(img_url, path + u"/%d.jpg" % index)
        print(u"Saved image #%d" % index)
        index += 1
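The loop above always writes files with a .jpg suffix even when the matched URL ends in .png. A small optional tweak, sketched below under the same assumptions (the `path`, `index`, and `img_lists` produced by the script above), keeps the real extension taken from the regex's second capture group:

# Sketch only: preserve the original file extension instead of forcing ".jpg".
for img in img_lists:
    img_url, ext = img  # (url, extension) from the two capture groups
    urllib.request.urlretrieve(img_url, u"%s/%d.%s" % (path, index, ext))
    print(u"Saved image #%d (%s)" % (index, ext))
    index += 1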
