美文网首页
top baidu spider

top baidu spider

作者: 是东东 | 来源:发表于2022-08-09 02:37 被阅读0次
    import json
    
    import requests
    import time
    from lxml import etree
    import re
    
    def replaces(parts):
        """Join the text fragments in *parts*, drop newlines, collapse runs of
        spaces pairwise, and strip the ends.

        Note: a single replace('  ', '') pass suffices — it removes space pairs
        left to right, so a run of n spaces collapses to n % 2 spaces and the
        leftovers are never adjacent (the original applied the pass twice,
        which was a no-op).
        """
        return ''.join(parts).replace('\n', '').replace('  ', '').strip()


    def prints(msg):
        """Print *msg* prefixed with today's date (YYYY-MM-DD)."""
        print(f'{time.strftime("%Y-%m-%d", time.localtime())} {msg}')
    
    
    class Spider(object):
        """Scraper for Baidu's top-search boards (https://top.baidu.com/board).

        Fetches the realtime hot list and the homepage word cloud, returning
        the de-duplicated query strings found on each.
        """

        def __init__(self):
            pass

        def req(self, args_json=None):
            """GET ``args_json['url']`` with ``args_json['headers']``.

            Returns the response body (str) on HTTP 200, otherwise None.
            """
            # Guard the default: the original crashed with AttributeError
            # when called without arguments (None.get).
            args_json = args_json or {}
            headers = args_json.get('headers')
            url = args_json.get('url')
            # Timeout added so a stalled connection cannot hang the crawl.
            response = requests.get(url, headers=headers, timeout=15)
            code = response.status_code
            prints(f'code:{response.status_code}')
            if code == 200:
                return response.text

        def parse_details(self, html_str):
            """Parse the realtime-board HTML.

            Returns ``(result, queries)`` where *result* is a list of dicts
            (title / hot_count / desc / href / crawl_time) and *queries* is
            the list of unique non-empty titles in first-seen order.
            """
            result = []
            queries = []
            try:
                crawl_time = time.strftime('%Y-%m-%d', time.localtime())
                tree = etree.HTML(html_str)
                details = tree.xpath('//div[contains(@class,"category-wrap")]')
                for detail in details:
                    item = {}
                    title = replaces(detail.xpath('.//div[@class="c-single-text-ellipsis"]//text()'))
                    item['title'] = title
                    item['hot_count'] = replaces(detail.xpath('.//div[@class="hot-index_1Bl1a"]/text()'))
                    item['desc'] = replaces(detail.xpath('.//div[contains(@class,"hot-desc_") and contains(@class,"large_")]/text()'))
                    item['href'] = replaces(detail.xpath('.//div[contains(@class,"hot-desc_") and contains(@class,"large_")]/a/@href'))
                    item['crawl_time'] = crawl_time
                    prints(item)
                    result.append(item)
                    # Dedupe titles while preserving first-seen order.
                    if title and title not in queries:
                        queries.append(title)
            except Exception as e:
                prints(f'func parse failed:{e}')
            prints(f'detail queries count:{len(queries)}')
            return result, queries

        def parse_words(self, html_str):
            """Extract word-cloud terms from the homepage board.

            The page embeds its data as JSON inside an HTML comment
            (``<!--s-data:...-->``); pull that out and collect the unique,
            non-empty 'word' values in first-seen order.
            """
            words = []
            try:
                json_str = re.findall('<!--s-data:(.*?)-->', html_str, re.S)
                if json_str:
                    oo = json.loads(json_str[0])
                    wds = oo.get('data', {}).get('homepage', {}).get('cloud', [])
                    for word in wds or []:
                        wd = replaces(word.get('word', ''))
                        if wd and wd not in words:
                            words.append(wd)
            except Exception as e:
                prints(f'func parse failed:{e}')
            prints(f'words query count:{len(words)}')
            return words

        def get_data(self, url):
            """Download *url* and dispatch to the matching parser.

            Returns ``(result, queries)`` for the realtime board, or a plain
            word list for any other board. On a failed request returns empty
            containers of the matching shape so callers can unpack safely.
            """
            headers = {
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Referer': 'https://top.baidu.com/board',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,th;q=0.7,en;q=0.6',
            }
            args_json = {'url': url, 'headers': headers}
            html_str = self.req(args_json)
            if html_str:
                if 'realtime' in url:
                    return self.parse_details(html_str)
                return self.parse_words(html_str)
            # The original tried to write html_str (None here) to a file and
            # contained a syntax error (ww``.write); log and return empty
            # results of the shape the caller expects instead.
            prints(f'request failed, nothing to parse: {url}')
            if 'realtime' in url:
                return [], []
            return []

        def run(self):
            """Crawl both boards and return the merged query list."""
            url = 'https://top.baidu.com/board?tab=realtime'
            result, queries = self.get_data(url)
            url = 'https://top.baidu.com/board?tab=homepage'
            words = self.get_data(url)
            queries.extend(words)
            return queries
    
    
    if __name__ == '__main__':
        # The original read topbaidu.html into a variable that was never used
        # (its consumer was commented out), so it crashed with
        # FileNotFoundError when the cache file was absent; dead code removed.
        spider = Spider()
        spider.run()
    

    相关文章

      网友评论

          本文标题:top baidu spider

          本文链接:https://www.haomeiwen.com/subject/gyigwrtx.html