美文网首页
采集go.mail.ru

采集go.mail.ru

作者: 是东东 | 来源:发表于2021-12-06 00:22 被阅读0次

# Example go.mail.ru search URL: `q` carries the query text; `sf` appears to be
# the result offset (MailRuPC.control computes it as (page - 1) * 10).
target_url = "https://go.mail.ru/search?q=query&sf=0"

import requests
import re
import time
import json

def replaces(x):
    """Return ``x`` formatted as text, with newlines removed and ends stripped.

    Any value is accepted; it is rendered with f-string formatting first, so
    ``None`` becomes ``'None'`` and numbers become their decimal text.
    """
    return f'{x}'.replace('\n', '').strip()


class MailRuPC(object):
    """Fetch a go.mail.ru search result page and extract its result list.

    The page embeds its data as ``go.dataJson = {...};`` inside a ``<script>``
    tag; ``parse_all`` pulls that JSON out and returns ``serp.results``.
    """

    # URL template: first slot is the (URL-encoded) query, second the offset.
    SEARCH_URL = 'https://go.mail.ru/search?q=%s&sf=%s'
    # Offset step between consecutive pages, as used for the ``sf`` parameter.
    RESULTS_PER_PAGE = 10

    def __init__(self):
        pass

    def get_ua(self):
        """Return the fixed desktop Chrome User-Agent string sent with requests."""
        ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        return ua

    def get_cookies(self):
        """Return the cookies sent with each request.

        Returns:
            dict: a fixed ``searchuid`` plus a ``tmr_detect`` cookie whose value
            is ``0%7C<current unix time in seconds>``.
        """
        timestamp = int(time.time())
        return {
            'searchuid': '7434245071553574513',
            # The cookie is *named* tmr_detect; the timestamped string is its value.
            'tmr_detect': f'0%7C{timestamp}',
        }

    def get_headers(self):
        """Return the HTTP request headers (currently only ``User-Agent``)."""
        return {'User-Agent': self.get_ua()}

    def get_with_proxy(self, target_url, timeout=60, proxies=None):
        """GET ``target_url`` and return ``(status_code, decoded_body)``.

        Args:
            target_url: URL to fetch.
            timeout: request timeout in seconds, passed to ``requests.get``.
            proxies: optional ``requests``-style proxies mapping; ``None``
                (the default) sends the request without an explicit proxy.

        Returns:
            tuple: ``(status_code, html_str)``.

        Raises:
            requests.RequestException: propagated from ``requests.get``.
        """
        response = requests.get(
            url=target_url,
            headers=self.get_headers(),
            cookies=self.get_cookies(),
            timeout=timeout,
            proxies=proxies,
        )
        # The header may be absent, and the charset may be quoted or followed
        # by further parameters, so match only the charset token itself.
        content_type = response.headers.get('Content-Type') or ''
        match = re.search(r'charset\s*=\s*["\']?([\w.:-]+)', content_type, re.I)
        # Without a declared charset, decode as UTF-8.
        response.encoding = match.group(1) if match else 'utf-8'
        return response.status_code, response.text

    def parse_all(self, html_str):
        """Extract the search results embedded in a result page.

        Args:
            html_str: page HTML containing ``go.dataJson = {...}</script>``.

        Returns:
            list: the value of ``serp.results`` from the embedded JSON, or an
            empty list when the marker is missing, the JSON is malformed, or
            the key is absent. Parse failures are printed, not raised.
        """
        try:
            found = re.search(r'go\.dataJson = (.*?)</script>', html_str, re.S)
            if not found:
                return []
            json_str = found.group(1)
            for ch in ('\r', '\t', '\n'):
                json_str = json_str.replace(ch, '')
            # Drop the JavaScript statement terminator (and padding around it).
            json_str = json_str.strip().rstrip(';').rstrip()
            data = json.loads(json_str)
            return data.get('serp', {}).get('results') or []
        except (TypeError, ValueError, AttributeError) as e:
            msg = 'func parse_all error:%s' % repr(e)
            print(msg)
            return []

    def _build_url(self, query, page):
        """Build the search URL for ``query`` at 1-based ``page``."""
        from urllib.parse import quote_plus

        # Pages below 1 are clamped to the first page (offset 0).
        offset = max(int(page) - 1, 0) * self.RESULTS_PER_PAGE
        # Encode the query so spaces, '&', '#', and non-ASCII text stay inside
        # the q parameter. Callers must pass raw (not pre-encoded) text.
        return self.SEARCH_URL % (quote_plus(str(query)), offset)

    def control(self, line):
        """Fetch and parse one result page described by ``line``.

        Args:
            line: dict with ``query`` and a 1-based ``page`` (int or numeric
                string; a missing or empty page is treated as page 1).

        Returns:
            dict: the same ``line`` object, updated in place with ``url`` and,
            when a non-empty body was received, ``result`` and ``html``.
        """
        query = line.get('query')
        page = line.get('page') or 1
        target_url = self._build_url(query, page)
        line['url'] = target_url
        _status, html_str = self.get_with_proxy(target_url)
        if html_str:
            # Keep a copy of the last fetched page on disk for inspection.
            with open('html_str.html', 'w', encoding='utf-8') as ww:
                ww.write(html_str)
            line['result'] = self.parse_all(html_str)
            line['html'] = html_str
        return line

    def start(self, line):
        """Entry point: run ``control`` on ``line`` and return the updated dict."""
        return self.control(line)


if __name__ == '__main__':
    # Demo run: fetch the first result page for the literal query "query"
    # and print the enriched task dict.
    today = time.strftime('%Y-%m-%d')
    task = dict(
        query='query',
        crawl_time=today,
        lang='ru',
        country='RU',
        page='1',
    )
    print(MailRuPC().start(task))

相关文章

网友评论

      本文标题:采集go.mail.ru

      本文链接:https://www.haomeiwen.com/subject/vwfgxrtx.html