美文网首页
python3爬取头条美女

python3爬取头条美女

作者: Al_不期而遇 | 来源:发表于2018-07-24 13:33 被阅读50次

    import requests

    from urllib.parse import urlencode

    from pyquery import PyQuery as pq

    import os

    from hashlib import md5

    from multiprocessing.pool import Pool

    GROUP_START =1

    GROUP_END =20

    headers = {

        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:59.0) Gecko/20100101 Firefox/59.0',

    }

    def get_page(offset):

    params = {

    'offset' : offset,

    'format' : 'json',

    'keyword' : '美女',

    'autoload':'true',

    'count':'20',

    'cur_tab':'1',

    }

    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)

    try:

    response = requests.get(url=url, headers=headers)

    if response.status_code == 200:

    return response.json()

    except requests.ConnectionError:

    return None

    def get_images(json):

    data = json.get('data')

    if data:

    for item in data:

    image_list = item.get('image_list')

    title = item.get('title')

    if image_list:

    for image in image_list:

    yield{

    'image':image.get('url'),

    'title':title

    }

    def save_image(item):

    if not os.path.exists(item.get('title')):

    os.mkdir(item.get('title'))

    try:

    local_image_url = item.get('image')

    new_image_url = local_image_url.replace('list','large')

    response = requests.get('http:' + new_image_url)

    if response.status_code == 200:

    file_path = '{0}/{1}.{2}'.format(item.get('title'),md5(response.content).hexdigest(),'jpg')

    if not os.path.exists(file_path):

    with open(file_path,'wb') as f:

    f.write(response.content)

    else:

    print('Already Downloaded',file_path)

    except request.ConnectionError:

    print('Failed to save image')

    def main(offset):

    json = get_page(offset)

    for item in get_images(json):

    print(item)

    save_image(item)

    if __name__ == '__main__':

    pool = Pool()

    groups = ([x * 20 for x in range(GROUP_START,GROUP_END + 1)])

    pool.map(main,groups)

    pool.close()

    pool.join()

    刚开始按照书上的方法去爬的时候,发现网站的源码有了一些变化:image_detail 字段已经没有了,图片信息改放在了 image_list 中。

    相关文章

      网友评论

          本文标题:python3爬取头条美女

          本文链接:https://www.haomeiwen.com/subject/kzbemftx.html