import json
import re
import time

import requests
from lxml import etree

# Join an XPath text() result list and strip newlines, spaces and full-width spaces.
replaces = lambda x: ''.join(x).replace('\n', '').replace(' ', '').replace('\u3000', '').strip()
# Print a message prefixed with a timestamp.
prints = lambda msg: print(f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} {msg}')
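# Example: replaces(['  百度 ', '\n热搜 ']) == '百度热搜'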
class Spider:
    def req(self, args_json=None):
        """GET the URL described by args_json; return the page text on HTTP 200."""
        headers = args_json.get('headers')
        url = args_json.get('url')
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        code = response.status_code
        prints(f'code:{code}')
        if code == 200:
            return response.text
    def parse_details(self, html_str):
        """Parse the realtime board; return (items, deduplicated titles)."""
        result = []
        queries = []
        try:
            crawl_time = time.strftime('%Y-%m-%d', time.localtime())
            tree = etree.HTML(html_str)
            details = tree.xpath('//div[contains(@class,"category-wrap")]')
            for detail in details:
                item = {}
                title = replaces(detail.xpath('.//div[@class="c-single-text-ellipsis"]//text()'))
                item['title'] = title
                item['hot_count'] = replaces(detail.xpath('.//div[@class="hot-index_1Bl1a"]/text()'))
                item['desc'] = replaces(detail.xpath('.//div[contains(@class,"hot-desc_") and contains(@class,"large_")]/text()'))
                item['href'] = replaces(detail.xpath('.//div[contains(@class,"hot-desc_") and contains(@class,"large_")]/a/@href'))
                item['crawl_time'] = crawl_time
                prints(item)
                result.append(item)
                if title and title not in queries:
                    queries.append(title)
        except Exception as e:
            prints(f'func parse_details failed:{e}')
        prints(f'detail queries count:{len(queries)}')
        return result, queries
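    # A parsed item looks roughly like this (values illustrative, not from a live run):
    # {'title': '...', 'hot_count': '4998765', 'desc': '...',
    #  'href': 'https://...', 'crawl_time': '2025-01-01'}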
    def parse_words(self, html_str):
        """Extract hot-search words from the JSON Baidu embeds in an HTML comment."""
        words = []
        try:
            json_str = re.findall('<!--s-data:(.*?)-->', html_str, re.S)
            if json_str:
                oo = json.loads(json_str[0])
                wds = oo.get('data', {}).get('homepage', {}).get('cloud', [])
                for word in wds:
                    wd = replaces(word.get('word', ''))
                    if wd and wd not in words:
                        words.append(wd)
        except Exception as e:
            prints(f'func parse_words failed:{e}')
        prints(f'words query count:{len(words)}')
        return words
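    # Shape of the embedded payload, assumed from the .get() chain above:
    #   <!--s-data:{"data":{"homepage":{"cloud":[{"word":"..."}, ...]}}}-->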
    def get_data(self, url):
        """Download one board page and dispatch it to the matching parser."""
        headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'https://top.baidu.com/board',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,th;q=0.7,en;q=0.6',
        }
        args_json = {'url': url, 'headers': headers}
        html_str = self.req(args_json)
        if html_str:
            # Keep a local snapshot for offline debugging (read back in __main__).
            with open('topbaidu.html', 'w', encoding='utf-8') as ww:
                ww.write(html_str)
            if 'realtime' in url:
                return self.parse_details(html_str)
            return self.parse_words(html_str)
        # Request failed: return empty results in the shape the caller expects.
        return ([], []) if 'realtime' in url else []
    def run(self):
        """Crawl both boards and return the merged query list."""
        url = 'https://top.baidu.com/board?tab=realtime'
        result, queries = self.get_data(url)
        url = 'https://top.baidu.com/board?tab=homepage'
        words = self.get_data(url)
        # Merge realtime titles with the homepage word cloud.
        queries.extend(words)
        return queries
if __name__ == '__main__':
    S = Spider()
    queries = S.run()
    prints(f'total queries count:{len(queries)}')
    # Offline replay against the saved snapshot:
    # with open('topbaidu.html', 'r', encoding='utf-8') as rr:
    #     S.parse_details(rr.read())