美文网首页
2018-06-25

2018-06-25

作者: 加勒比海带_4bbc | 来源:发表于2018-06-25 07:24 被阅读0次

import json
import os
import re
from hashlib import md5
from urllib.parse import urlencode

import bs4
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

def get_page_index(offset=0, keyword='xxx'):
    """Request one page of the search index and return the raw response body.

    Args:
        offset: Value sent as the ``offset`` query parameter. Defaults to 0,
            the value that used to be hard-coded.
        keyword: Value sent as the ``keyword`` query parameter. Defaults to
            'xxx', the value that used to be hard-coded.

    Returns:
        The response text when the server answers with HTTP 200; ``None`` on
        any other status code or when the request raises RequestException.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    # NOTE(review): the endpoint prefix is an empty string, so the URL is made
    # of the query string only -- confirm the intended base URL and fill it in.
    url = '' + urlencode(data)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页出错')
        return None

def parse_page_index(html):
    """Yield the ``article_url`` of each entry in the index JSON's ``data`` list.

    Args:
        html: JSON text of the index response. ``None``, an empty string,
            text that is not valid JSON, or JSON that is not an object all
            produce an empty generator instead of raising.

    Yields:
        ``item.get('article_url')`` for every entry of ``data``; this is
        ``None`` for an entry that has no such key.
    """
    if not html:
        # The index request failed or returned nothing; there is nothing to parse.
        return
    try:
        data = json.loads(html)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass.
        return
    if not isinstance(data, dict):
        return
    # 'data' may be missing or explicitly null; treat both as "no entries".
    for item in data.get('data') or []:
        yield item.get('article_url')

def get_page_detail(url):
    """Download a detail page.

    Args:
        url: Address passed to ``requests.get``.

    Returns:
        The response text for an HTTP 200 answer; ``None`` for any other
        status code or when the request raises RequestException.
    """
    try:
        reply = requests.get(url)
        page = reply.text if reply.status_code == 200 else None
    except RequestException:
        print('请求详细页出错')
        page = None
    return page

def parse_page_detail(html):
    """Print the page title and the payload of its ``var gallery = ...;`` script.

    Args:
        html: Detail-page markup as a string. ``None`` or an empty string is
            ignored.

    Returns:
        None. Both the title and the matched gallery text are printed.
    """
    if not html:
        # The detail request failed or returned nothing; nothing to parse.
        return
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    if title_tags:
        # Pages without a <title> element are tolerated instead of raising IndexError.
        print(title_tags[0].get_text())
    # re.S makes '.' match newlines so a multi-line assignment is captured.
    # The flag is spelled with a capital S; `re.s` does not exist.
    images_pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = images_pattern.search(html)
    if result:
        print(result.group(1))

def save_to_mongo(result):
    """Insert ``result`` into the ``db[MONGO_TABLE]`` collection.

    ``db`` and ``MONGO_TABLE`` are module-level names that this function
    expects to exist; they are not defined in the visible part of the file.

    Args:
        result: A single document (mapping) or a list of documents.

    Returns:
        True when the insert reports a truthy id (or list of ids), otherwise
        False.
    """
    collection = db[MONGO_TABLE]
    # Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
    # insert_one/insert_many are the replacements. The id values they expose
    # are what the legacy insert() used to return.
    if isinstance(result, list):
        inserted = collection.insert_many(result).inserted_ids
    else:
        inserted = collection.insert_one(result).inserted_id
    if inserted:
        print('存储到MongoDB成功', result)
        return True
    return False

def download_image(url):
    """Download an image and return its raw bytes.

    Args:
        url: Address passed to ``requests.get``.

    Returns:
        The response body as bytes for an HTTP 200 answer; ``None`` for any
        other status code or when the request raises RequestException.
    """
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Image data is binary, so return the undecoded body. The
            # previously referenced `add_text` is not a Response attribute.
            return response.content
        return None
    except RequestException:
        print('请求图片错误', url)
        return None

def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    The file name is the MD5 hex digest of the content, so saving the same
    bytes twice does not create a second file.

    Args:
        content: Image data as bytes. ``None`` or empty data is ignored.

    Returns:
        None.
    """
    if not content:
        # Nothing to save, e.g. the download step returned None.
        return
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)

def main():
    """Fetch the first search-index page and print its raw body."""
    # Called without arguments: get_page_index already selects offset 0 and
    # keyword 'xxx' on its own, and passing extra positional arguments to a
    # definition that does not declare them raises TypeError.
    html = get_page_index()
    print(html)

# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':

    main()   


相关文章

网友评论

      本文标题:2018-06-25

      本文链接:https://www.haomeiwen.com/subject/nkseyftx.html