Background: marketing needs call for a clear picture of recent activity in the furniture market.
Data: detailed product information scraped from the Hejiaju (和家居) website, including but not limited to ···.
Goal: build a scraper framework that fetches the target data on a schedule and imports it into a database.

【一】Analysis Approach
1、Examine the product URLs


Each product SKU maps to a unique ID, so incrementing the ID enumerates the products and yields their basic information; a quick probe is sketched below.
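A minimal probe sketch to confirm the pattern before committing to a full crawl (the IDs are ones that appear in the cookies used later; the site answers missing IDs with a literal "File not found." body, which step 1's loop relies on):

import requests

# Spot-check a few IDs against the goods/<id>.html pattern
for probe_id in (1463, 2464):
    r = requests.get('http://www.hejiaju.com/goods/%s.html/' % probe_id, timeout=10)
    print(probe_id, r.status_code, r.text[:60])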
2、Scrape basic product information
Inspection shows that the basic attributes we need are contained in the page's first response, so they can be parsed directly out of the HTML with XPath; a small example follows.
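For instance, the product name sits in an h1 inside the buy form. A sketch, assuming html_text holds the first response's body (the XPath is the one used in step 1 of the implementation below):

from lxml import etree

tree = etree.HTML(html_text)  # html_text: the page's first response body (assumption)
name = tree.xpath('//*[@id="ECS_FORMBUY"]/div[1]/h1/text()')  # product name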


3、Identify the async request for product prices
Testing shows the price is loaded asynchronously; the browser's network console reveals the URL it is loaded from.


Observation shows the request varies only by product ID, so each product's price URL can be built by splicing in its ID; a quick check of the endpoint is sketched below.
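A one-off request against the price endpoint for a single ID (a sketch; 2464 is an ID that appears in the cookies used later):

import requests

# Expect a JSON body containing "shop_price" and "market_price"
r = requests.get('http://www.hejiaju.com/goods.php?act=price&id=2464', timeout=10)
print(r.text)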
4、Download the images from the product image links
【二】Implementation Code
1. Fetch product categories & links
# Exploit the regular pattern of product URLs: loop over incrementing IDs and scrape each product's name and categories
import time

import pandas as pd
import requests
from lxml import etree

goods = []
headers = {
    'Cookie': 'area_region=2; goodsId=2464; area_region=2; goodsId=2464; ECS_ID=664f1cd1f37ba6bfa6bedf430b2c0d1096b2f969; ECS[visit_times]=1; session_id_ip=221.237.152.174_664f1cd1f37ba6bfa6bedf430b2c0d10; area_region=2; goodsId=1463; ECS[history]=2464%2C1463%2C1464%2C1648%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; ECS[list_history]=2464%2C1463%2C1464%2C1648%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; _ga=GA1.2.46935259.1537932797; _gid=GA1.2.16826347.1537932797; _gat_gtag_UA_125099464_1=1; Hm_lvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537932797; Hm_lpvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537947312; province=26; city=322; district=2722',
    'Host': 'www.hejiaju.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
number = 1
requests.adapters.DEFAULT_RETRIES = 5
s = requests.Session()
s.keep_alive = False
for i in range(1, 3000):
    url = 'http://www.hejiaju.com/goods/%s.html/' % i
    r = s.get(url, headers=headers, timeout=10)
    if r.text != 'File not found.\n':
        A = etree.HTML(r.text)
        B = A.xpath('//*[@id="ECS_FORMBUY"]/div[1]/h1/text()')  # name
        C = A.xpath('//*[@id="ur_here"]/span[5]/a/text()')      # sub-category
        D = A.xpath('//*[@id="ur_here"]/span[3]/a/text()')      # main category
        good = {
            '名称': B,
            '子分类': C,
            '主分类': D,
            '链接': url,
        }
        goods.append(good)
        print("——" * 20 + str(number) + "——" * 20 + str(i))
        number += 1
        time.sleep(4)
    else:
        print("——" * 20 + 'no page' + "——" * 20 + str(i))
        time.sleep(4)
goods = pd.DataFrame(goods)
goods.to_excel(r'C:\Users\Administrator\Desktop\新.xlsx')
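Steps 2 and 3 below iterate over X['链接'], but X is never defined in the original listings; presumably it is the goods table saved above. A minimal sketch, assuming the Excel file from step 1 is reloaded:

import pandas as pd

# Reload the goods table from step 1; the following steps read its '链接' (link) column
X = pd.read_excel(r'C:\Users\Administrator\Desktop\新.xlsx')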
2. Fetch product attributes
# For each product link, scrape the attributes, image links, and specifications from its static page
import re  # the remaining imports carry over from step 1

requests.adapters.DEFAULT_RETRIES = 5
s = requests.Session()
s.keep_alive = False
headers = {
    'Connection': 'keep-alive',
    'Cookie': 'area_region=2; goodsId=1648; area_region=2; goodsId=2464; session_id_ip=221.237.152.174_664f1cd1f37ba6bfa6bedf430b2c0d10; area_region=2; goodsId=1463; ECS_ID=e89c6f653c6f754290aa884d42a1005e962e589a; ECS[visit_times]=3; ECS[history]=1648%2C2464%2C1463%2C1464%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; ECS[list_history]=1648%2C2464%2C1463%2C1464%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; Hm_lvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537932797,1537947835,1538011225; Hm_lpvt_0c5d16c4fdfede265f1fe61f241c5c3a=1538011225; _ga=GA1.2.46935259.1537932797; _gid=GA1.2.16826347.1537932797; _gat_gtag_UA_125099464_1=1; province=26; city=322; district=2722',
    'Host': 'www.hejiaju.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Referer': 'http://www.hejiaju.com/'
}
label = []
d1 = ['商品型号:', '品牌:', '风格:', '产品结构:', '体积', '包装件数:', '材质:']  # attribute labels to extract
picture = []
specifications = []
sales_volume = []
number = 1
error_url = []
for i in X['链接']:  # X: the goods table from step 1
    url = i
    try:
        r = s.get(url, headers=headers, timeout=10)
        A = etree.HTML(r.text)
        # image links
        B_picture = A.xpath('//*[@id="J_tabSlider"]/div[1]/ul/li/a/img/@src')
        picture.append({'图片': B_picture})
        # attribute text: join the <li> entries with '---', then strip newlines and spaces
        B_label = A.xpath('//*[@id="ncGoodsIntro"]/ul/li/text()')
        C_label = ''
        for j in B_label:
            C_label = C_label + j + '---'
        C_label = C_label.replace('\n', '')
        C_label = ''.join(C_label.split(' '))
        # pull each wanted attribute out with a regex; 'NAN' when absent
        _属性 = {}
        for k in d1:
            D_label = re.findall('%s(.*?)---' % k, C_label)
            if D_label == []:
                D_label = 'NAN'
            else:
                D_label = D_label[0]
            _属性.update({k: D_label})
        label.append(_属性)
        # specifications and sales volume
        B_specifications = A.xpath('//*[@id="ECS_FORMBUY"]/div[2]/div[2]/dl[2]/dd/ul/li/a/text()')
        specifications.append({'规格': B_specifications})
        B_sales_volume = A.xpath('//*[@id="ECS_FORMBUY"]/div[2]/div[1]/dl[1]/dt/em/text()')
        sales_volume.append({'销量': B_sales_volume})
        print('---' * 20 + str(number))
        number += 1
        # time.sleep(2)
    except Exception:
        error_url.append(i)
        time.sleep(5)
picture = pd.DataFrame(picture)
label = pd.DataFrame(label)
specifications = pd.DataFrame(specifications)
sales_volume = pd.DataFrame(sales_volume)
data = pd.concat([label, specifications, sales_volume, picture], axis=1)
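At this point data holds one row of attributes, specifications, sales volume, and image links per successfully fetched product, but no name or link columns. A sketch that re-attaches them from X and saves the result (valid only if no request failed, since failures land in error_url and would shift the rows; the output file name is illustrative):

# Re-attach product name and link; rows align with X only when error_url is empty
full = pd.concat([X.reset_index(drop=True), data], axis=1)
full.to_excel(r'C:\Users\Administrator\Desktop\属性.xlsx')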
3. Fetch product prices
# Build each product's async price URL from its link and scrape the prices
import re
import time

import pandas as pd
import requests

headers = {
    'Cookie': 'area_region=2; goodsId=2464; area_region=2; goodsId=2464; ECS_ID=664f1cd1f37ba6bfa6bedf430b2c0d1096b2f969; ECS[visit_times]=1; session_id_ip=221.237.152.174_664f1cd1f37ba6bfa6bedf430b2c0d10; area_region=2; goodsId=1463; ECS[history]=2464%2C1463%2C1464%2C1648%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; ECS[list_history]=2464%2C1463%2C1464%2C1648%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; _ga=GA1.2.46935259.1537932797; _gid=GA1.2.16826347.1537932797; _gat_gtag_UA_125099464_1=1; Hm_lvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537932797; Hm_lpvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537947312; province=26; city=322; district=2722',
    'Host': 'www.hejiaju.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
pattern_shop = r'"shop_price":"(.*?)"'             # shop price in the JSON response
pattern_market = r'"market_price":"\\uffe5(.*?)"'  # market price, after the escaped ¥ sign
pattern_id = r'goods/(.*?).html'                   # product ID inside the page URL
requests.adapters.DEFAULT_RETRIES = 5
s = requests.Session()
s.keep_alive = False
price = []
number = 1
error_url_2 = []
for i in X['链接']:
    goods_id = re.findall(pattern_id, i)[0]
    url = 'http://www.hejiaju.com/goods.php?act=price&id=' + goods_id + '&tdsourcetag=s_pctim_aiomsg'
    try:
        r = s.get(url, headers=headers, timeout=10)
        A = re.findall(pattern_market, r.text)
        B = re.findall(pattern_shop, r.text)
        price.append({
            'ID': goods_id,
            '店铺价格': B,
            '市场价格': A,
        })
        print('---' * 20 + str(number))
        number += 1
        # time.sleep(3)
    except Exception:
        error_url_2.append(url)
        time.sleep(3)
price = pd.DataFrame(price)
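The stated goal is to load the results into a database on a schedule, but the original post stops at DataFrames. A minimal sketch of the import step, assuming SQLite as the target (file and table names are illustrative):

import sqlite3

# Write the scraped tables into a local SQLite database; list-valued
# columns are stored in their string form via astype(str)
conn = sqlite3.connect('hejiaju.db')
data.astype(str).to_sql('goods_attributes', conn, if_exists='replace', index=False)
price.astype(str).to_sql('goods_prices', conn, if_exists='replace', index=False)
conn.close()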
4. Download images
Clean each image URL out of the link lists collected earlier, then download the images one by one.
# Clean the image addresses out of their stored lists; X = DataFrame holding each product's image addresses
def JPG_url(X):
    jpg_url = []
    for i in X['图片']:
        if ',' not in i:
            # single image: strip the list brackets and quotes
            url = 'http:' + i.strip(r"[]'")
            jpg_url.append(url)
        else:
            # several images: drop quotes and brackets, then split on commas
            i = i.replace("'", '')
            i = i.strip(r"[]'")
            i = i.split(',')
            for j in i:
                url = 'http:' + j.strip()
                jpg_url.append(url)
    # jpg_url = pd.DataFrame(pd.Series(jpg_url), columns=['图片下载地址'])
    return jpg_url

# Download each image to local disk; X = DataFrame with one download address per row
def JPG(X):
    import os
    number = 1
    for url in X['图片下载地址']:
        # local directory where the images are stored
        root = "G://python//"
        path = root + url.split("/")[-1]
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                r = requests.get(url, timeout=10)
                r.raise_for_status()
                # "wb" writes binary; the with-statement closes the file automatically
                with open(path, "wb") as f:
                    f.write(r.content)
                print("downloaded" + '---' * 20 + str(number))
                number += 1
            else:
                print("file already exists")
        except Exception as e:
            print("download failed: " + str(e))
        else:
            time.sleep(1)
    return 'done'
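Chaining the two functions (a sketch; it assumes the image table from step 2 was saved and reloaded, so the '图片' column holds the string form of each list, which is what JPG_url's strip/split cleaning expects; the intermediate DataFrame mirrors the line left commented out inside JPG_url, and the file name is illustrative):

import pandas as pd

picture = pd.read_excel(r'C:\Users\Administrator\Desktop\图片.xlsx')  # illustrative file name
urls = JPG_url(picture)
urls_df = pd.DataFrame(pd.Series(urls), columns=['图片下载地址'])
JPG(urls_df)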