美文网首页(一点一点累积)Python
python抓取图片例子(ajax)

python抓取图片例子(ajax)

作者: 词穷又词贫 | 来源:发表于2017-04-19 10:37 被阅读105次

    本例仿照崔庆才先生的案例编写。
    他的个人博客地址是:http://cuiqingcai.com/
    #!/bin/python3.4
    # -*- coding:utf-8 -*-

    import re
    import json
    from bs4 import BeautifulSoup
    from urllib.parse import urlencode
    from requests.exceptions import RequestException
    import requests
    from config import *
    from hashlib import md5
    from multiprocessing import Pool
    from json.decoder import JSONDecoder
    from pymongo import MongoClient
    import os
    
    # Module-level MongoDB handles shared by save_to_mongo().
    # NOTE(review): connect=False presumably defers the actual connection until
    # the first operation, so each Pool worker connects on its own -- confirm
    # against the PyMongo documentation.
    client = MongoClient(host=MONGO_URL, connect=False)
    db = client.get_database(MONGO_DB)
    
    def get_page_index(offset, keyword, timeout=10):
        """Fetch one page of search results and return the raw response text.

        Args:
            offset: Value sent as the ``offset`` query parameter.
            keyword: Value sent as the ``keyword`` query parameter.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a single stalled connection would block the calling
                worker indefinitely.

        Returns:
            The response body as text when the server answers with HTTP 200;
            ``None`` on any other status code or on a ``RequestException``
            (which includes timeouts).
        """
        params = {
            'offset': offset,
            'format': 'json',
            'keyword': keyword,
            'autoload': 'true',
            'count': '20',
            'cur_tab': 1,
        }
        url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
        # Keep the try body limited to the call that can actually raise.
        try:
            response = requests.get(url, timeout=timeout)
        except RequestException:
            print ("请求索引页面出错")
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_page_index(html):
        """Yield the article URLs found in a search-index JSON response.

        Args:
            html: JSON text of the index response. ``None`` or malformed text
                is tolerated and simply yields nothing.

        Yields:
            The ``article_url`` value of every entry in the top-level ``data``
            list. Entries without an ``article_url`` are skipped, because a
            missing URL cannot be fetched by the caller.
        """
        # json.loads raises ValueError (JSONDecodeError is a subclass) on bad
        # text and TypeError on None. The original ``except JSONDecoder``
        # named a non-exception class, so it could never catch either of them.
        try:
            data = json.loads(html)
        except (ValueError, TypeError):
            return
        if not isinstance(data, dict):
            return
        # ``data`` may be present but null; treat that as an empty result set.
        for item in data.get('data') or []:
            article_url = item.get('article_url')
            if article_url:
                yield article_url
    
    def get_page_detail(url, timeout=10):
        """Fetch an article detail page and return its HTML text.

        Args:
            url: Address of the detail page.
            timeout: Seconds to wait for the server before giving up, so that
                a stalled connection cannot block the caller forever.

        Returns:
            The response body as text on HTTP 200; ``None`` on any other
            status code or on a ``RequestException`` (which includes timeouts).
        """
        try:
            response = requests.get(url, timeout=timeout)
        except RequestException:
            print ("请求详情页面出错",url)
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_page_detail(html,url):
        """Extract the title and gallery image URLs from a detail page.

        Every image URL found is also passed to ``download_image``.

        Args:
            html: HTML text of the detail page.
            url: Address the page was fetched from; copied into the result.

        Returns:
            A dict with keys ``title``, ``url`` and ``images`` when the page
            embeds a ``var gallery = ...;`` JSON object that has a
            ``sub_images`` key; otherwise ``None``.
        """
        soup = BeautifulSoup(html,'lxml')
        title_tags = soup.select('title')
        # A page without <title> previously raised IndexError.
        title = title_tags[0].get_text() if title_tags else ''

        match = re.search(r'var gallery = (.*?);', html, re.S)
        if not match:
            return None
        # The non-greedy capture stops at the first ';', so it can be truncated
        # when the JSON itself contains one; treat undecodable text as
        # "no gallery" instead of letting the exception escape.
        try:
            data = json.loads(match.group(1))
        except ValueError:
            return None
        if not isinstance(data, dict) or 'sub_images' not in data:
            return None

        # ``sub_images`` may be null; fall back to an empty gallery.
        sub_images = data.get('sub_images') or []
        images = [item.get('url') for item in sub_images]
        for image in images:
            download_image(image)
        return {
            'title':title,
            'url':url,
            'images':images,
        }
    
    def save_to_mongo(result):
        """Insert one parsed result document into the configured collection.

        Args:
            result: Dict to store; the driver adds an ``_id`` key to it.

        Returns:
            ``True`` when the document was inserted, ``False`` otherwise.
            Driver exceptions are not caught and propagate to the caller.
        """
        # Collection.insert() is deprecated since PyMongo 3.0 and removed in
        # 4.0; insert_one() is the supported single-document equivalent.
        outcome = db[MONGO_TABLE].insert_one(result)
        if outcome.inserted_id is not None:
            print ("存储到Mongodb成功",result)
            return True
        return False
    
    def download_image(url, timeout=10):
        """Download one image and hand its bytes to ``save_image``.

        Args:
            url: Address of the image.
            timeout: Seconds to wait for the server before giving up, so that
                a stalled download cannot block the caller forever.

        Returns:
            Always ``None``. Non-200 responses are ignored and a
            ``RequestException`` (which includes timeouts) is only reported.
        """
        try:
            response = requests.get(url, timeout=timeout)
        except RequestException:
            print ("请求图片出错")
            return None
        if response.status_code == 200:
            save_image(response.content)
        return None
    
    def save_image(content):
        """Write image bytes to ``<cwd>/<md5 of content>.jpg`` if not present.

        The file name is the MD5 hex digest of the content, so identical
        images map to the same file and are stored only once.

        Args:
            content: Raw image bytes.
        """
        file_path = os.path.join(os.getcwd(), md5(content).hexdigest() + '.jpg')
        # Exclusive-create mode makes "skip if it already exists" atomic, which
        # matters because several Pool workers may save the same image at once.
        # The with-block closes the file, so no explicit close() is needed.
        try:
            with open(file_path, 'xb') as f:
                f.write(content)
        except FileExistsError:
            pass
    
    def main(offset):
        """Scrape one page of search results and store every gallery found.

        Fetches the index page for ``offset`` and ``KEYWORD``, then for each
        article URL fetches the detail page, parses it and saves the result.

        Args:
            offset: Search-result offset passed to ``get_page_index``.
        """
        index_html = get_page_index(offset,KEYWORD)
        # get_page_index returns None on failure; there is nothing to parse.
        if not index_html:
            return
        for url in parse_page_index(index_html):
            if not url:
                continue
            detail_html = get_page_detail(url)
            if not detail_html:
                continue
            result = parse_page_detail(detail_html,url)
            if result:
                save_to_mongo(result)
    
    if __name__ == '__main__':
        # One offset per page group; each group index is multiplied by 20 to
        # form the search offset.
        groups = [x * 20 for x in range(GROUP_START,GROUP_END + 1)]
        # The context manager releases the worker processes once map() has
        # returned; map() blocks until every offset has been processed.
        with Pool() as pool:
            pool.map(main,groups)
    

    config.py配置文件

    #!/bin/python3.4
    # -*- coding:utf-8 -*-
    # Host passed to MongoClient by the scraper script.
    MONGO_URL = 'localhost'
    # Name of the MongoDB database the scraper opens.
    MONGO_DB = 'toutiao'
    # Name of the collection that scraped results are inserted into.
    MONGO_TABLE = 'toutiao'
    
    # Inclusive range of page-group indices; the scraper multiplies each index
    # by 20 to obtain the search offset.
    GROUP_START = 1
    GROUP_END = 20
    
    # Search keyword submitted with the index request.
    KEYWORD = '街拍'
    

    通过 images_pattern 正则匹配得到 result,其中的数据转换成 JSON 后的格式如下(节选 sub_images 部分):

    "sub_images":
    [
        {
        "url":"http:\/\/p2.pstatp.com\/origin\/168300027e4c8323ee22",
        "width":700,
        "url_list":
            [
             {"url":"http:\/\/p2.pstatp.com\/origin\/168300027e4c8323ee22"},
              {"url":"http:\/\/pb3.pstatp.com\/origin\/168300027e4c8323ee22"},
              {"url":"http:\/\/pb3.pstatp.com\/origin\/168300027e4c8323ee22"}
            ],
        "uri":"origin\/168300027e4c8323ee22","height":981
        },
    
        {
        "url":"http:\/\/p2.pstatp.com\/origin\/168600026fb5ecf86ba9",
        "width":700,
        "url_list":
            [
                {"url":"http:\/\/p2.pstatp.com\/origin\/168600026fb5ecf86ba9"},
                {"url":"http:\/\/pb3.pstatp.com\/origin\/168600026fb5ecf86ba9"},
                {"url":"http:\/\/pb3.pstatp.com\/origin\/168600026fb5ecf86ba9"}
            ],
        "uri":"origin\/168600026fb5ecf86ba9","height":891
        },
        
        {
        "url":"http:\/\/p3.pstatp.com\/origin\/16870003ef0948da7863",
        "width":700,
        "url_list":
            [
                {"url":"http:\/\/p3.pstatp.com\/origin\/16870003ef0948da7863"},
                {"url":"http:\/\/pb2.pstatp.com\/origin\/16870003ef0948da7863"},
                {"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0948da7863"}
            ],
        "uri":"origin\/16870003ef0948da7863","height":1078
        },
        
        
        {
        "url":"http:\/\/p1.pstatp.com\/origin\/16820003ee9c72717ad5",
        "width":700,
        "url_list":
            [
                {"url":"http:\/\/p1.pstatp.com\/origin\/16820003ee9c72717ad5"},
                {"url":"http:\/\/pb3.pstatp.com\/origin\/16820003ee9c72717ad5"},
                {"url":"http:\/\/pb3.pstatp.com\/origin\/16820003ee9c72717ad5"}
            ],
        "uri":"origin\/16820003ee9c72717ad5","height":999
        },
        
        {
        "url":"http:\/\/p1.pstatp.com\/origin\/16870003ef0b2bbec810",
        "width":960,
        "url_list":
            [
                {"url":"http:\/\/p1.pstatp.com\/origin\/16870003ef0b2bbec810"},
                {"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0b2bbec810"},
                {"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0b2bbec810"}
            ],
        "uri":"origin\/16870003ef0b2bbec810","height":609
        }
    ],
    

    相关文章

      网友评论

        本文标题:python抓取图片例子(ajax)

        本文链接:https://www.haomeiwen.com/subject/eqyczttx.html