import requests
import re
import csv
import time
import random
def _lookup_coordinates(detail_url, headers, timeout):
    """Return the coordinates scraped for one community detail page.

    Fetches ``detail_url``, looks for the "commbook_p" link on it, fetches
    that link and extracts the first ``lat : "..."`` / ``lng : "..."`` pair.

    Returns:
        ``(lat, lng)`` as strings when a pair was found,
        ``('', '')`` when either HTTP request failed,
        ``()`` when the link or the coordinates are absent from the page.
    """
    try:
        detail = requests.get(detail_url, headers=headers, timeout=timeout)
        map_links = re.findall('_spread_params="commbook_p" href="(.*?)" class="hd-link only_show" target="_blank"', detail.text, re.S)
    except Exception as err:
        # Best-effort crawl: a failure on one community must not abort the run.
        print('查询失败1:', err)
        time.sleep(random.randint(1, 3))  # extra back-off after a failure
        map_links = None
    time.sleep(random.randint(1, 5))  # pause between requests

    if map_links is None:
        # The detail page could not be fetched, so there is no link to follow.
        # Report empty coordinates instead of requesting a placeholder URL.
        return ('', '')
    if not map_links:
        return ()

    try:
        map_page = requests.get(map_links[0], headers=headers, timeout=timeout)
        coords = re.findall('lat : "(.*?)",.*?lng : "(.*?)"', map_page.text, re.S)
    except Exception as err:
        print('查询失败2:', err)
        coords = (('', ''),)
    time.sleep(random.randint(2, 5))  # pause between requests

    return coords[0] if coords else ()


def get_area_names(url,line_num):
    """Scrape one community-listing page and write one CSV row per community.

    Every ``<em><a href="..." target="_blank">name</a>`` anchor found on the
    page yields a row ``[link, name]``, extended with ``lat, lng`` when
    coordinates could be looked up (empty strings when a lookup request
    failed). Rows are written through the module-level ``writer`` object,
    which must exist before this function is called.

    Args:
        url: Address of the listing page to scrape.
        line_num: Number of rows written so far.

    Returns:
        ``line_num`` increased by the number of rows written for this page.
        If the listing page itself cannot be fetched, the failure is printed
        and ``line_num`` is returned unchanged.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'cookie':'aQQ_ajkguid=EA9803C9-984A-8523-9851-4A030C3F192C; ctid=11; wmda_uuid=6c700ade52714ad2458ecf83c0e7724e; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; wmda_session_id_6289197098934=1559007118314-d3bbb6df-b03b-4667; sessid=DBF5B260-CD64-5A54-ED48-33FC0D0A3D50; lps=http%3A%2F%2Fwww.anjuke.com%2Fshanghai%2Fcm1210%2F%7C; twe=2; __xsptplusUT_8=1; propertys=s5vy32-ps6xxj_; ajk_member_captcha=b5e4bebf3df2a3edce16d50621e6a514; __xsptplus8=8.2.1559007119.1559007564.4%234%7C%7C%7C%7C%7C%23%23DDSca8CHAo0T4Q1PY0Yv9GoBvpZVNqz2%23; _ga=GA1.2.654788080.1559007564; _gid=GA1.2.721651369.1559007564'
    }
    # Seconds to wait on any single request; without a timeout a stalled
    # connection would block the crawl indefinitely.
    timeout = 5

    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except Exception as err:
        print('查询失败0:', err)
        return line_num

    # Example of a matched anchor:
    # <a href="https://www.anjuke.com/shanghai/cm1026072/" target="_blank">黄山始信苑</a>
    result = re.findall('<em><a href="(.*?)".*?target="_blank">(.*?)</a>', res.text, re.S)
    print(result)

    for row in result:
        # row is (link, name)
        print(row[1], row[0])
        row_to_write = list(row + _lookup_coordinates(row[0], headers, timeout))
        line_num += 1
        print(line_num, row_to_write)
        writer.writerow(row_to_write)
    return line_num
if __name__ == '__main__':
    # Crawl listing pages 1-35 and collect every community into one CSV file.
    line_num = 0
    # Raw string: the backslash in the Windows path is a literal character,
    # not the start of an escape sequence.
    # The ``with`` block guarantees the file is flushed and closed even if the
    # crawl stops with an exception.
    with open(r'C:\上海小区大全.csv', 'w', encoding='utf-8', newline='') as f:
        # ``writer`` stays a module-level name because get_area_names()
        # writes its rows through it.
        writer = csv.writer(f, dialect='excel')
        writer.writerow(['页面链接', '小区名称', '百度纬度', '百度经度'])
        urls = ['https://www.anjuke.com/shanghai/cm/p{}'.format(i) for i in range(1, 36)]
        for url in urls:
            print(url)
            line_num = get_area_names(url, line_num)
            time.sleep(random.randint(1, 3))  # pause between listing pages
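# NOTE: stray web-page text ("网友评论", i.e. "reader comments") was pasted here; it is not part of the script.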