美文网首页
正则表达式爬虫——上海小区大全

正则表达式爬虫——上海小区大全

作者: 徐_c90e | 来源:发表于2019-05-29 11:06 被阅读0次

    import requests

    import re

    import csv

    import time

    import random

    def get_area_names(url,line_num):
        """Scrape one community listing page and write one CSV row per entry.

        For every ``(link, name)`` pair found on the listing page, the detail
        page behind ``link`` is fetched to locate a map-page URL, and that map
        page is searched for ``lat`` / ``lng`` values. Each row is written
        through the module-level ``writer`` (a ``csv.writer`` the caller must
        have created before calling this function).

        Row shapes written:
          * ``[link, name, lat, lng]`` when coordinates were found;
          * ``[link, name, '', '']``   when the detail or map request failed;
          * ``[link, name]``           when the pages loaded but the expected
            markup was not found.

        Args:
            url: URL of the listing page to scrape.
            line_num: Number of rows written before this call.

        Returns:
            ``line_num`` increased by the number of rows written for this page.

        Raises:
            requests.RequestException: if the listing page itself cannot be
                fetched (per-entry failures are caught and logged instead).
        """
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
            'cookie':'aQQ_ajkguid=EA9803C9-984A-8523-9851-4A030C3F192C; ctid=11; wmda_uuid=6c700ade52714ad2458ecf83c0e7724e; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; wmda_session_id_6289197098934=1559007118314-d3bbb6df-b03b-4667; sessid=DBF5B260-CD64-5A54-ED48-33FC0D0A3D50; lps=http%3A%2F%2Fwww.anjuke.com%2Fshanghai%2Fcm1210%2F%7C; twe=2; __xsptplusUT_8=1; propertys=s5vy32-ps6xxj_; ajk_member_captcha=b5e4bebf3df2a3edce16d50621e6a514; __xsptplus8=8.2.1559007119.1559007564.4%234%7C%7C%7C%7C%7C%23%23DDSca8CHAo0T4Q1PY0Yv9GoBvpZVNqz2%23; _ga=GA1.2.654788080.1559007564; _gid=GA1.2.721651369.1559007564'
        }

        # A timeout keeps a stalled connection from hanging the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)

        # Each match is a (href, link text) tuple taken from anchors such as:
        #   <em><a href="https://www.anjuke.com/shanghai/cm1026072/" target="_blank">黄山始信苑</a>
        result = re.findall('<em><a href="(.*?)".*?target="_blank">(.*?)</a>', res.text, re.S)
        print(result)

        for row in result:
            print(row[1], row[0])

            # Extra columns appended to the row; stays empty when no
            # coordinates could be located in otherwise healthy pages.
            coords = ()

            try:
                res1 = requests.get(row[0], headers=headers, timeout=5)
                result1 = re.findall('_spread_params="commbook_p" href="(.*?)" class="hd-link only_show" target="_blank"', res1.text, re.S)
            except Exception as err:
                # Deliberate best-effort: one bad entry must not abort the page.
                print('查询失败1:', err)
                time.sleep(random.randint(1, 3))
                # Pad with blank coordinates directly instead of issuing a
                # doomed follow-up request to a placeholder address.
                result1 = []
                coords = ('', '')

            # Randomised pause between requests.
            time.sleep(random.randint(1, 5))

            if result1:
                try:
                    res2 = requests.get(result1[0], headers=headers, timeout=5)
                    result2 = re.findall('lat : "(.*?)",.*?lng : "(.*?)"', res2.text, re.S)
                except Exception as err:
                    print('查询失败2:', err)
                    result2 = [('', '')]

                time.sleep(random.randint(2, 5))

                if result2:
                    # Only the first (lat, lng) match is used.
                    coords = result2[0]

            row_to_write = list(row + coords)

            line_num = line_num + 1
            print(line_num, row_to_write)
            writer.writerow(row_to_write)

        return line_num

    if __name__ == '__main__':
        # Script entry point: crawl listing pages 1..35 and dump every entry
        # into a single CSV file.

        # Running count of data rows written, threaded through each page call.
        line_num = 0

        # Raw string: '\上' is not a valid escape sequence, so the plain
        # literal triggers an invalid-escape warning on modern Python. The
        # resulting path is unchanged.
        # `with` guarantees the file is flushed and closed even if a page
        # raises part-way through the crawl.
        with open(r'C:\上海小区大全.csv', 'w', encoding='utf-8', newline='') as f:
            # Kept as a module-level name because get_area_names() writes
            # its rows through the global `writer`.
            writer = csv.writer(f, dialect='excel')
            writer.writerow(['页面链接', '小区名称', '百度纬度', '百度经度'])

            urls = ['https://www.anjuke.com/shanghai/cm/p{}'.format(str(i)) for i in range(1,36)]

            for url in urls:
                print(url)
                line_num = get_area_names(url, line_num)
                # Randomised pause between listing pages.
                time.sleep(random.randint(1, 3))

    相关文章

      网友评论

          本文标题:正则表达式爬虫——上海小区大全

          本文链接:https://www.haomeiwen.com/subject/sfuetctx.html