(Full Crawler Source Code) Conquering Every Website: Dianping (大众点评)

Author: 雨夜剪魂 | Published 2019-05-24 11:38

    Time to crack Dianping wide open. Enough talk, here is the source code:

    import json

    import re

    from bs4 import BeautifulSoup

    import time

    from selenium import webdriver

    import requests

    import pandas as pd

    from urllib.parse import urlencode

    from threading import Thread

    keyword = input('your keyword')

    output_filename = input('output csv path') + '.csv'

    post_url = 'https://m.dianping.com/isoapi/module'

    true_url = 'https://m.dianping.com/shoplist/4/search?from=m_search&keyword={}'.format(keyword)

    headers = {

        'Connection': 'keep-alive',

        'Content-Length': '234',

        'Origin': 'https://m.dianping.com',

        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',

        'Content-Type': 'application/json',

        'Accept': '*/*',

        'Referer': 'https://m.dianping.com/shoplist/4/search?from=m_search&keyword=%E5%95%86%E5%9C%BA'

    }

    def get_cookies():

        """

        使用selenium获取true_url 的cookies

        :return: cookie

        """

        chromeOptions = webdriver.ChromeOptions()

        ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'

        chromeOptions.add_argument('user-agent="%s"' % ua)

        chromeOptions.add_argument('--proxy-server=http://127.0.0.1:8888')

        driver = webdriver.Chrome(chrome_options=chromeOptions)

        driver.get(true_url)

        for _ in range(15):

            driver.execute_script(

                'window.scrollBy(0, 500)'

            )

            time.sleep(0.5)

        if '请输入图片中的内容' in driver.page_source:

            # A captcha page appeared: solve it manually in the browser window, then press Enter here.
            input('Solve the captcha in the browser, then press Enter to continue...')

        cookies = driver.get_cookies()

        dict_cookies = {cookie['name']: cookie['value'] for cookie in cookies}

        driver.quit()

        return dict_cookies

    def get_data(dict_cookies):

        for p in range(0, 20 * 51, 20):

            data = {

                "pageEnName": "shopList",

                "moduleInfoList": [

                    {

                        "moduleName": "mapiSearch",

                        "query": {

                            "search": {

                                "start": p,

                                "categoryId": 0,

                                "parentCategoryId": 0,

                                "locateCityid": 0,

                                "limit": 20,

                                "sortId": 0,

                                "cityId": 4,

                                "keyword": '商场',

                                "regionId": 0,

                                "maptype": 0

                            }

                        }

                    }

                ]

            }

            r = requests.post(post_url, headers=headers, cookies=dict_cookies, json=data, verify=False)

            json_data = r.json()

            datas = json_data['data']['moduleInfoList'][0]['moduleData']['data']['listData']['list']

            items = []

            for index, data in enumerate(datas):

                item = {}

                for k, v in data.items():

                    # Keep only flat scalar fields; skip nested lists and dicts.
                    if isinstance(v, (list, dict)):

                        continue

                    item[k] = v

                items.append(item)

                header = p == 0 and index == 0  # write the CSV header only once, for the very first row

                print(item)

                df = pd.DataFrame(data=item, index=['0'])

                df.to_csv(output_filename, mode='a', index=False, header=header, encoding='utf_8_sig')

    def read_csv():

        df = pd.read_csv(output_filename, error_bad_lines=False)

        for index, row in df.iterrows():

            print('ShopId: ' + str(row['shopId']))

    if __name__ == '__main__':

        dict_cookies = get_cookies()

        threads = []

        t1 = Thread(target = get_data, args = (dict_cookies, ))

        t2 = Thread(target = read_csv)

        threads.append(t1)

        threads.append(t2)

        for index, t in enumerate(threads):

            if index == 1:

                time.sleep(15)

            t.start()

        for t in threads:

            t.join()

    To explain: selenium first opens the m.dianping.com search page with the keyword already filled in and scrolls through the results, then grabs the cookies; those cookies are used to call the API endpoint, and the data is saved to output_filename (a CSV file). All you need to supply is the keyword and the save path; the output format defaults to CSV.

    If you visit the site with plain, unmodified selenium, you will inevitably run into captchas and "abnormal operation" warnings. Reference: https://www.jianshu.com/p/304f4dfae0bb
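
    The root cause is that the site's anti-bot script probes the browser for webdriver fingerprints, which is exactly why filter_js.py below strips those attribute names out. A stock ChromeDriver session exposes them; the following check is a minimal sketch of my own (not part of the original code) that makes this visible:

    from selenium import webdriver

    probe = webdriver.Chrome()
    probe.get('https://m.dianping.com')
    # An unmodified ChromeDriver reports navigator.webdriver as true,
    # which is one of the attributes the detection script looks for.
    print(probe.execute_script('return navigator.webdriver'))  # typically True
    probe.quit()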

    Instead, use mitmproxy as a man-in-the-middle proxy: selenium browses through the proxy, which filters the traffic, and stripping out certain identifiers is enough to defeat this kind of anti-crawler check. The blocking script, filter_js.py, follows:

    import re

    from mitmproxy import ctx

    def response(flow):
        """Rewrite the response body to hide the webdriver fingerprints."""
        if '/js/yoda.' in flow.request.url:
            # Strip the attribute names the selenium-detection script looks for.
            for webdriver_key in ['webdriver', '__driver_evaluate', '__webdriver_evaluate', '__selenium_evaluate',
                                  '__fxdriver_evaluate', '__driver_unwrapped', '__webdriver_unwrapped',
                                  '__selenium_unwrapped', '__fxdriver_unwrapped', '_Selenium_IDE_Recorder', '_selenium',
                                  'calledSelenium', '_WEBDRIVER_ELEM_CACHE', 'ChromeDriverw', 'driver-evaluate',
                                  'webdriver-evaluate', 'selenium-evaluate', 'webdriverCommand',
                                  'webdriver-evaluate-response', '__webdriverFunc', '__webdriver_script_fn',
                                  '__$webdriverAsyncExecutor', '__lastWatirAlert', '__lastWatirConfirm',
                                  '__lastWatirPrompt', '$chrome_asyncScriptInfo', '$cdc_asdjflasutopfhvcZLmcfl_']:
                ctx.log.info('Remove "{}" from {}.'.format(webdriver_key, flow.request.url))
                flow.response.text = flow.response.text.replace('"{}"'.format(webdriver_key), '"NO-SUCH-ATTR"')
            flow.response.text = flow.response.text.replace('t.webdriver', 'false')
            flow.response.text = flow.response.text.replace('ChromeDriver', '')

    The proxy launch command was shown as a screenshot.
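
    Based on the description below (listen on port 8888 and load the custom filter_js script), the command is presumably along these lines; the exact flags are an assumption, since the screenshot is not reproduced here:

    mitmdump -p 8888 -s filter_js.py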

    mitmproxy listens on port 8888, and the custom filter_js script blocks the selenium detection. From the CSV file you now have each shopId; the URL built from it lets you scrape the shop's detail page, which is the straightforward part.
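
    As a rough illustration of that last step, here is a minimal sketch (not part of the original code) that reads the shopIds back out of the CSV and fetches each mobile detail page; the https://m.dianping.com/shop/{shopId} URL pattern and the reuse of the cookies from get_cookies() are assumptions:

    import time
    import pandas as pd
    import requests

    def fetch_detail_pages(csv_path, cookies, headers):
        """Fetch the mobile detail page for every shopId found in the CSV."""
        df = pd.read_csv(csv_path)
        for shop_id in df['shopId'].dropna().unique():
            detail_url = 'https://m.dianping.com/shop/{}'.format(int(shop_id))
            r = requests.get(detail_url, headers=headers, cookies=cookies)
            print(shop_id, r.status_code, len(r.text))
            time.sleep(1)  # go slowly; aggressive crawling triggers the captcha again

    You would call it with the dict_cookies returned by get_cookies() and the same headers dictionary used above.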
