美文网首页
top baidu spider

top baidu spider

作者: 是东东 | 来源:发表于2022-08-09 02:37 被阅读0次
    import json
    
    import requests
    import time
    from lxml import etree
    import re
    
    def replaces(parts):
        """Join the text fragments in *parts*, drop newlines, collapse runs of
        spaces pairwise, and strip the ends.

        Note: a single replace('  ', '') pass suffices — it removes space pairs
        left to right, so a run of n spaces collapses to n % 2 spaces and the
        leftovers are never adjacent (the original applied the pass twice,
        which was a no-op).
        """
        return ''.join(parts).replace('\n', '').replace('  ', '').strip()


    def prints(msg):
        """Print *msg* prefixed with today's date (YYYY-MM-DD)."""
        print(f'{time.strftime("%Y-%m-%d", time.localtime())} {msg}')
    
    
    class Spider(object):
        """Scraper for Baidu's top-search boards (https://top.baidu.com/board).

        Fetches the realtime hot list and the homepage word cloud, returning
        the de-duplicated query strings found on each.
        """

        def __init__(self):
            pass

        def req(self, args_json=None):
            """GET ``args_json['url']`` with ``args_json['headers']``.

            Returns the response body (str) on HTTP 200, otherwise None.
            """
            # Guard the default: the original crashed with AttributeError
            # when called without arguments (None.get).
            args_json = args_json or {}
            headers = args_json.get('headers')
            url = args_json.get('url')
            # Timeout added so a stalled connection cannot hang the crawl.
            response = requests.get(url, headers=headers, timeout=15)
            code = response.status_code
            prints(f'code:{response.status_code}')
            if code == 200:
                return response.text

        def parse_details(self, html_str):
            """Parse the realtime-board HTML.

            Returns ``(result, queries)`` where *result* is a list of dicts
            (title / hot_count / desc / href / crawl_time) and *queries* is
            the list of unique non-empty titles in first-seen order.
            """
            result = []
            queries = []
            try:
                crawl_time = time.strftime('%Y-%m-%d', time.localtime())
                tree = etree.HTML(html_str)
                details = tree.xpath('//div[contains(@class,"category-wrap")]')
                for detail in details:
                    item = {}
                    title = replaces(detail.xpath('.//div[@class="c-single-text-ellipsis"]//text()'))
                    item['title'] = title
                    item['hot_count'] = replaces(detail.xpath('.//div[@class="hot-index_1Bl1a"]/text()'))
                    item['desc'] = replaces(detail.xpath('.//div[contains(@class,"hot-desc_") and contains(@class,"large_")]/text()'))
                    item['href'] = replaces(detail.xpath('.//div[contains(@class,"hot-desc_") and contains(@class,"large_")]/a/@href'))
                    item['crawl_time'] = crawl_time
                    prints(item)
                    result.append(item)
                    # Dedupe titles while preserving first-seen order.
                    if title and title not in queries:
                        queries.append(title)
            except Exception as e:
                prints(f'func parse failed:{e}')
            prints(f'detail queries count:{len(queries)}')
            return result, queries

        def parse_words(self, html_str):
            """Extract word-cloud terms from the homepage board.

            The page embeds its data as JSON inside an HTML comment
            (``<!--s-data:...-->``); pull that out and collect the unique,
            non-empty 'word' values in first-seen order.
            """
            words = []
            try:
                json_str = re.findall('<!--s-data:(.*?)-->', html_str, re.S)
                if json_str:
                    oo = json.loads(json_str[0])
                    wds = oo.get('data', {}).get('homepage', {}).get('cloud', [])
                    for word in wds or []:
                        wd = replaces(word.get('word', ''))
                        if wd and wd not in words:
                            words.append(wd)
            except Exception as e:
                prints(f'func parse failed:{e}')
            prints(f'words query count:{len(words)}')
            return words

        def get_data(self, url):
            """Download *url* and dispatch to the matching parser.

            Returns ``(result, queries)`` for the realtime board, or a plain
            word list for any other board. On a failed request returns empty
            containers of the matching shape so callers can unpack safely.
            """
            headers = {
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Referer': 'https://top.baidu.com/board',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,th;q=0.7,en;q=0.6',
            }
            args_json = {'url': url, 'headers': headers}
            html_str = self.req(args_json)
            if html_str:
                if 'realtime' in url:
                    return self.parse_details(html_str)
                return self.parse_words(html_str)
            # The original tried to write html_str (None here) to a file and
            # contained a syntax error (ww``.write); log and return empty
            # results of the shape the caller expects instead.
            prints(f'request failed, nothing to parse: {url}')
            if 'realtime' in url:
                return [], []
            return []

        def run(self):
            """Crawl both boards and return the merged query list."""
            url = 'https://top.baidu.com/board?tab=realtime'
            result, queries = self.get_data(url)
            url = 'https://top.baidu.com/board?tab=homepage'
            words = self.get_data(url)
            queries.extend(words)
            return queries
    
    
    if __name__ == '__main__':
        # The original read topbaidu.html into a variable that was never used
        # (its consumer was commented out), so it crashed with
        # FileNotFoundError when the cache file was absent; dead code removed.
        spider = Spider()
        spider.run()
    

    相关文章

      网友评论

          本文标题:top baidu spider

          本文链接:https://www.haomeiwen.com/subject/gyigwrtx.html