1.导入所需库
import requests
import pandas as pd
from lxml import etree
2.爬取各区链接
# Landing page that lists one link per district.
url = 'http://www.jkl.com.cn/cn/shop.aspx'
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
response = requests.get(url, headers=headers).text
html = etree.HTML(response)
# Link texts inside the "infoLis" container, with surrounding whitespace removed.
city_name = html.xpath('//div[@class="infoLis"]//a/text()')
city_name = [i.strip() for i in city_name]
# Each href is prefixed with the site's /cn/ base path to form the URL requested later.
city_url = html.xpath('//div[@class="infoLis"]//a/@href')
city_url = ['http://www.jkl.com.cn/cn/' + i for i in city_url]
3.当只存在一个大区需要翻页时,数据存储
# Visit every district page; the spans con01..con04 are read as shop name,
# address, contact and opening hours (see the DataFrame columns below).
for i in city_url:
    if i == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        # This district is paged: request pager arguments 1, 2 and 3.
        # NOTE(review): presumably the district has exactly three pages - confirm.
        for a in range(1, 4):
            form_data = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': a
            }
            response3 = requests.post(url=i, data=form_data, headers=headers).text
            html2 = etree.HTML(response3)
            city_shop_name = html2.xpath('//span[@class="con01"]/text()')
            city_shop_dis = html2.xpath('//span[@class="con02"]/text()')
            city_shop_phone = html2.xpath('//span[@class="con03"]/text()')
            city_shop_time = html2.xpath('//span[@class="con04"]/text()')
            shop_name = [d.strip() for d in city_shop_name]
            print(shop_name)
            print('*' * 30)
            # Append this page's rows to the CSV without a header row.
            shops = pd.DataFrame({"店铺名称": shop_name, "店铺地址": city_shop_dis, "联系方式": city_shop_phone, "营业时间": city_shop_time})
            shops.to_csv("e:/爬取爬取【京客隆超市】店铺信息.csv", index=False, header=False, mode="a", encoding="ANSI")
    else:
        # Every other district is fetched with a single request.
        response1 = requests.post(url=i, headers=headers).text
        html1 = etree.HTML(response1)
        city_shop_name1 = html1.xpath('//span[@class="con01"]/text()')
        city_shop_dis1 = html1.xpath('//span[@class="con02"]/text()')
        city_shop_phone1 = html1.xpath('//span[@class="con03"]/text()')
        city_shop_time1 = html1.xpath('//span[@class="con04"]/text()')
        shop_name1 = [c.strip() for c in city_shop_name1]
        print(shop_name1)
        # Store the data: append the rows to the CSV without a header row.
        shops = pd.DataFrame({"店铺名称": shop_name1, "店铺地址": city_shop_dis1, "联系方式": city_shop_phone1, "营业时间": city_shop_time1})
        shops.to_csv("e:/爬取爬取【京客隆超市】店铺信息.csv", index=False, header=False, mode="a", encoding="ANSI")
#完整代码
#爬取【京客隆超市】店铺信息
import requests
import pandas as pd
from lxml import etree
# Landing page that lists one link per district.
url = 'http://www.jkl.com.cn/cn/shop.aspx'
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}

# The only district whose shop list is requested page by page.
_PAGED_DISTRICT_URL = 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865'
# Output file; rows from every district and page are appended to it.
_CSV_PATH = "e:/爬取爬取【京客隆超市】店铺信息.csv"


def _scrape_page(page_url, form_data=None, print_separator=False):
    """POST to *page_url*, extract its shops and append them to the CSV.

    Args:
        page_url: URL of a district's shop-list page.
        form_data: Optional form fields sent with the POST (used for the
            pager); ``None`` sends no form body.
        print_separator: When true, print a line of 30 asterisks after the
            shop names.

    Raises:
        ValueError: Raised by ``pd.DataFrame`` if the four extracted lists
            do not all have the same length.
    """
    page = etree.HTML(
        requests.post(url=page_url, data=form_data, headers=headers).text
    )
    # Only the shop names are stripped; the other fields are stored as extracted.
    shop_names = [
        name.strip() for name in page.xpath('//span[@class="con01"]/text()')
    ]
    addresses = page.xpath('//span[@class="con02"]/text()')
    phones = page.xpath('//span[@class="con03"]/text()')
    opening_hours = page.xpath('//span[@class="con04"]/text()')

    print(shop_names)
    if print_separator:
        print('*' * 30)

    shops = pd.DataFrame({
        "店铺名称": shop_names,
        "店铺地址": addresses,
        "联系方式": phones,
        "营业时间": opening_hours,
    })
    # Append mode without a header row, so repeated calls build one table.
    # NOTE(review): "ANSI" appears to be a Windows-only codec alias - confirm
    # before running this on another platform.
    shops.to_csv(_CSV_PATH, index=False, header=False, mode="a", encoding="ANSI")


response = requests.get(url, headers=headers).text
html = etree.HTML(response)
# Link texts inside the "infoLis" container, with surrounding whitespace removed.
city_name = html.xpath('//div[@class="infoLis"]//a/text()')
city_name = [i.strip() for i in city_name]
# Each href is prefixed with the site's /cn/ base path to form the URL requested below.
city_url = html.xpath('//div[@class="infoLis"]//a/@href')
city_url = ['http://www.jkl.com.cn/cn/' + i for i in city_url]

for i in city_url:
    if i == _PAGED_DISTRICT_URL:
        # Request pager arguments 1, 2 and 3.
        # NOTE(review): presumably the district has exactly three pages - confirm.
        for a in range(1, 4):
            _scrape_page(
                i,
                form_data={
                    '__EVENTTARGET': 'AspNetPager1',
                    '__EVENTARGUMENT': a,
                },
                print_separator=True,
            )
    else:
        # Every other district is fetched with a single request.
        _scrape_page(i)
#如果店铺列表不止一页,且只有一个区域需要翻页时
作者:Jw__L
链接:https://juejin.im/post/6864348048642801672
来源:掘金
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
网友评论