# Sample search URL for the first result page (query "query", offset sf=0).
target_url = 'https://go.mail.ru/search?q=query&sf=0'
import json
import re
import time
from urllib.parse import quote_plus

import requests
def replaces(x):
    """Return ``x`` formatted as text with newlines removed and both ends trimmed.

    Args:
        x: Any object; it is rendered with f-string formatting first.

    Returns:
        The rendered text without ``'\\n'`` characters and without leading or
        trailing whitespace.
    """
    # A named function instead of a lambda bound to a name, so tracebacks and
    # repr() show "replaces" rather than "<lambda>".
    return f'{x}'.replace('\n', '').strip()
class MailRuPC:
    """Fetch one page of go.mail.ru search results and extract the result list.

    The entry point is :meth:`start`, which takes a dict describing the
    request (``query`` and ``page``) and returns the same dict enriched with
    the request URL, the parsed results and the raw HTML.
    """

    # Matches the JSON object assigned to ``go.dataJson`` inside a <script> tag.
    _DATA_JSON_RE = re.compile(r'go\.dataJson = (.*?)</script>', re.S)
    # Extracts the charset parameter from a Content-Type header value.
    _CHARSET_RE = re.compile(r'charset=([^\s;]+)', re.I)
    # Line breaks and tabs are removed from the embedded JSON before decoding.
    _JSON_NOISE = str.maketrans('', '', '\r\t\n')

    def get_ua(self):
        """Return the desktop Chrome User-Agent string sent with every request."""
        return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'

    def get_cookies(self):
        """Return the cookies sent with a search request.

        Returns:
            A dict with the fixed ``searchuid`` cookie and a ``tmr_detect``
            cookie of the form ``0%7C<unix seconds>`` rebuilt on every call.
        """
        tmr_detect = f'0%7C{int(time.time())}'
        return {
            'searchuid': '7434245071553574513',
            # The cookie is keyed by its name; the timestamped string is the value.
            'tmr_detect': tmr_detect,
        }

    def get_headers(self):
        """Return the HTTP request headers (currently only ``User-Agent``)."""
        return {'User-Agent': self.get_ua()}

    def get_with_proxy(self, target_url, timeout=60):
        """GET ``target_url`` and return ``(status_code, decoded_text)``.

        Despite the name, no proxy is passed to ``requests.get`` here.

        Args:
            target_url: Fully built URL to request.
            timeout: Timeout in seconds forwarded to ``requests.get``.

        Returns:
            Tuple of the HTTP status code and the response body as text.

        Raises:
            requests.RequestException: Network errors are not caught here.
        """
        response = requests.get(
            url=target_url,
            headers=self.get_headers(),
            cookies=self.get_cookies(),
            timeout=timeout,
        )
        # The header may be absent; treat that the same as "no charset given".
        content_type = response.headers.get('Content-Type') or ''
        charset = self._CHARSET_RE.search(content_type)
        if charset:
            response.encoding = charset.group(1).strip('"\'')
        else:
            # Fall back to UTF-8 when the server does not advertise a charset.
            response.encoding = 'utf-8'
        return response.status_code, response.text

    def parse_all(self, html_str):
        """Extract the search results embedded in the page as ``go.dataJson``.

        Args:
            html_str: HTML of a search result page.

        Returns:
            The list stored under ``serp.results`` in the embedded JSON, or an
            empty list when the payload is missing, has no results, or cannot
            be decoded (the error is printed, not raised).
        """
        results = []
        try:
            found = self._DATA_JSON_RE.search(html_str)
            if found:
                payload = found.group(1).translate(self._JSON_NOISE)
                # The assignment ends with ';', which is not part of the JSON.
                payload = payload.strip().rstrip(';')
                data = json.loads(payload)
                # Always hand back a list, even if "results" is absent or null.
                results = data.get('serp', {}).get('results') or []
        except Exception as e:
            # Best-effort parsing: report the problem and return no results.
            msg = 'func parse_all error:%s' % repr(e)
            print(msg)
        return results

    def control(self, line):
        """Fetch and parse the page described by ``line``; update it in place.

        Args:
            line: Dict with ``query`` (raw, not URL-encoded search text) and
                ``page`` (1-based page number as int or numeric string; a
                missing or empty value means page 1).

        Returns:
            The same dict with ``url`` (the request URL), ``result`` (list of
            parsed results) and ``html`` (the raw page text) set.
        """
        query = str(line.get('query', ''))
        page = line.get('page') or 1
        # Ten results per page; never request a negative offset.
        sf = max(int(page) - 1, 0) * 10
        # Encode the query so '&', '#', '+' and spaces cannot corrupt the URL.
        target_url = 'https://go.mail.ru/search?q=%s&sf=%s' % (quote_plus(query), sf)
        line['url'] = target_url

        _status, html_str = self.get_with_proxy(target_url)
        if html_str:
            # Keep a copy of the last fetched page on disk for inspection.
            with open('html_str.html', 'w', encoding='utf-8') as dump:
                dump.write(html_str)
        line['result'] = self.parse_all(html_str)
        line['html'] = html_str
        return line

    def start(self, line):
        """Public entry point; delegates to :meth:`control`."""
        return self.control(line)
if __name__ == '__main__':
    # Demo run: fetch the first result page for the literal query "query"
    # and print the enriched request dict.
    today = time.strftime('%Y-%m-%d')
    line = dict(query='query', crawl_time=today, lang='ru', country='RU', page='1')
    print(MailRuPC().start(line))
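# 网友评论 ("reader comments") - leftover text from the web page this script was copied from; not part of the program.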