
Furniture Platform (Hejiaju) Scraper

Author: Ziger丶 | Published 2019-03-05 20:52

Background: marketing needs a clear picture of recent activity in the furniture market.
Data: scrape detailed product information from the Hejiaju site, including but not limited to ···.
Goal: build a scraper framework that periodically fetches the target data and loads it into a database.

(Screenshot: the Hejiaju site)

【1】Analysis approach

1. Observe the product URLs
(Example URL 1)
(Example URL 2)

Each product SKU maps to a unique numeric ID, so we can enumerate IDs incrementally to fetch each product's basic information.
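A minimal sketch of the idea (goods_exists is a hypothetical helper; the URL pattern and the "File not found." sentinel are taken from the implementation code below):

import requests

def goods_exists(goods_id, session):
    # The site answers "File not found." for IDs that do not exist
    url = 'http://www.hejiaju.com/goods/%s.html/' % goods_id
    r = session.get(url, timeout=10)
    return r.text != 'File not found.\n'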

2. Scrape the basic product information

On inspection, the attributes we need are all contained in the first (HTML) response, so they can be parsed out directly with XPath.
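For example, the product name can be read straight out of the first response (a minimal sketch; the XPath is the one used in the implementation below, and 2464 is just an example ID):

import requests
from lxml import etree

r = requests.get('http://www.hejiaju.com/goods/2464.html/', timeout=10)
tree = etree.HTML(r.text)
name = tree.xpath('//*[@id="ECS_FORMBUY"]/div[1]/h1/text()')  # product name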



3. Find the async request that loads the product price

Testing shows the price is loaded asynchronously; the network panel of the browser console reveals the URL it is loaded from.


(Screenshot: where the price is loaded)
(Screenshot: the responding URL)

The price request varies only by product ID, so we can splice each ID into the price URL.
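A minimal sketch of that splice (fetch_price is a hypothetical helper; the endpoint and both regexes come from step 3 of the implementation below):

import re
import requests

def fetch_price(goods_id):
    # Only the id parameter changes between products
    url = 'http://www.hejiaju.com/goods.php?act=price&id=%s' % goods_id
    r = requests.get(url, timeout=10)
    shop = re.findall(r'"shop_price":"(.*?)"', r.text)
    market = re.findall(r'"market_price":"\\uffe5(.*?)"', r.text)  # escaped ￥ prefix
    return shop, market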

4. Download the images from the product image links

【2】Implementation code

1. Get the product categories & links
# Find the pattern in the product URLs, then loop over incrementing IDs
# to collect each product's category and link
import requests
import time
import pandas as pd
from lxml import etree

goods = []
headers = {
    'Cookie':'area_region=2; goodsId=2464; area_region=2; goodsId=2464; ECS_ID=664f1cd1f37ba6bfa6bedf430b2c0d1096b2f969; ECS[visit_times]=1; session_id_ip=221.237.152.174_664f1cd1f37ba6bfa6bedf430b2c0d10; area_region=2; goodsId=1463; ECS[history]=2464%2C1463%2C1464%2C1648%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; ECS[list_history]=2464%2C1463%2C1464%2C1648%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; _ga=GA1.2.46935259.1537932797; _gid=GA1.2.16826347.1537932797; _gat_gtag_UA_125099464_1=1; Hm_lvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537932797; Hm_lpvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537947312; province=26; city=322; district=2722',
    'Host':'www.hejiaju.com',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    }
number = 1
requests.adapters.DEFAULT_RETRIES = 5
s = requests.session()   # reuse one session for every request
s.keep_alive = False
for i in range(1, 3000):
    url = 'http://www.hejiaju.com/goods/%s.html/' % i
    r = s.get(url, headers=headers, timeout=10)
    if r.text != 'File not found.\n':   # the site returns this body for missing IDs
        A = etree.HTML(r.text)
        B = A.xpath('//*[@id="ECS_FORMBUY"]/div[1]/h1/text()')  # name
        C = A.xpath('//*[@id="ur_here"]/span[5]/a/text()')      # subcategory
        D = A.xpath('//*[@id="ur_here"]/span[3]/a/text()')      # main category
        good = {
            '名称': B,    # name
            '子分类': C,  # subcategory
            '主分类': D,  # main category
            '链接': url,  # link
            }
        goods.append(good)
        print("——" * 20 + str(number) + "——" * 20 + str(i))
        number += 1
        time.sleep(4)
    else:
        print("——" * 20 + 'no such product' + "——" * 20 + str(i))
        time.sleep(4)
goods = pd.DataFrame(goods)
goods.to_excel(r'C:\Users\Administrator\Desktop\新.xlsx')
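The next two steps iterate over X['链接'], which the original post never defines. Presumably X is the goods table saved above, reloaded from the Excel file; a minimal sketch of that assumption:

import pandas as pd

# Assumption: X is the goods table produced in step 1
X = pd.read_excel(r'C:\Users\Administrator\Desktop\新.xlsx')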
2. Get the product attributes
# For each product link, scrape the on-page attributes, image links,
# specifications, and sales volume
import re

requests.adapters.DEFAULT_RETRIES = 5
s = requests.session()
s.keep_alive = False
headers = {
    'Connection': 'keep-alive',
    'Cookie': 'area_region=2; goodsId=1648; area_region=2; goodsId=2464; session_id_ip=221.237.152.174_664f1cd1f37ba6bfa6bedf430b2c0d10; area_region=2; goodsId=1463; ECS_ID=e89c6f653c6f754290aa884d42a1005e962e589a; ECS[visit_times]=3; ECS[history]=1648%2C2464%2C1463%2C1464%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; ECS[list_history]=1648%2C2464%2C1463%2C1464%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; Hm_lvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537932797,1537947835,1538011225; Hm_lpvt_0c5d16c4fdfede265f1fe61f241c5c3a=1538011225; _ga=GA1.2.46935259.1537932797; _gid=GA1.2.16826347.1537932797; _gat_gtag_UA_125099464_1=1; province=26; city=322; district=2722',
    'Host': 'www.hejiaju.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Referer': 'http://www.hejiaju.com/'
}
label = []
# Attribute labels to pull out: model, brand, style, structure, volume, package count, material
d1 = ['商品型号:','品牌:','风格:','产品结构:','体积','包装件数:','材质:']
picture = []
specifications = []
sales_volume = []
number = 1
error_url = []
for i in X['链接']:
    url = i
    try:
        r = s.get(url, headers=headers, timeout=10)
        A = etree.HTML(r.text)

        # Image links from the product slider
        B_picture = A.xpath('//*[@id="J_tabSlider"]/div[1]/ul/li/a/img/@src')
        picture.append({'图片': B_picture})  # images

        # Flatten the attribute <li> texts into one string, drop whitespace,
        # then cut out each labelled value with a regex
        B_label = A.xpath('//*[@id="ncGoodsIntro"]/ul/li/text()')
        C_label = ''
        for j in B_label:
            C_label = C_label + j + '---'
        C_label = C_label.replace('\n', '').replace(' ', '')
        _属性 = {}  # attributes of this product
        for k in d1:
            D_label = re.findall('%s(.*?)---' % k, C_label)
            _属性[k] = D_label[0] if D_label else 'NAN'
        label.append(_属性)

        # Specifications
        B_specifications = A.xpath('//*[@id="ECS_FORMBUY"]/div[2]/div[2]/dl[2]/dd/ul/li/a/text()')
        specifications.append({'规格': B_specifications})  # specs

        # Sales volume
        B_sales_volume = A.xpath('//*[@id="ECS_FORMBUY"]/div[2]/div[1]/dl[1]/dt/em/text()')
        sales_volume.append({'销量': B_sales_volume})  # sales

        print('---' * 20 + str(number))
        number += 1
#        time.sleep(2)
    except Exception:
        error_url.append(i)
        time.sleep(5)

picture = pd.DataFrame(picture)
label = pd.DataFrame(label)
specifications = pd.DataFrame(specifications)
sales_volume = pd.DataFrame(sales_volume)

# Rows align by scrape order, so the four frames can be joined side by side
data = pd.concat([label, specifications, sales_volume, picture], axis=1)
3. Get the product prices
# Splice each product's ID into the async price URL and scrape the prices

import re
import requests
from lxml import etree
import pandas as pd

headers = {
    'Cookie':'area_region=2; goodsId=2464; area_region=2; goodsId=2464; ECS_ID=664f1cd1f37ba6bfa6bedf430b2c0d1096b2f969; ECS[visit_times]=1; session_id_ip=221.237.152.174_664f1cd1f37ba6bfa6bedf430b2c0d10; area_region=2; goodsId=1463; ECS[history]=2464%2C1463%2C1464%2C1648%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; ECS[list_history]=2464%2C1463%2C1464%2C1648%2C1312%2C2335%2C1332%2C1235%2C1335%2C1333%2C1334; _ga=GA1.2.46935259.1537932797; _gid=GA1.2.16826347.1537932797; _gat_gtag_UA_125099464_1=1; Hm_lvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537932797; Hm_lpvt_0c5d16c4fdfede265f1fe61f241c5c3a=1537947312; province=26; city=322; district=2722',
    'Host':'www.hejiaju.com',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
pattern_shop = r'"shop_price":"(.*?)"'             # shop price field in the response
pattern_market = r'"market_price":"\\uffe5(.*?)"'  # market price field, prefixed by the escaped ￥ sign (\uffe5)
pattern_id = r'goods/(.*?)\.html'                  # product ID inside the page link
requests.adapters.DEFAULT_RETRIES = 5
s = requests.session()
s.keep_alive = False
price = []
number = 1
error_url_2 = []
for i in X['链接']:
    goods_id = re.findall(pattern_id, i)[0]
    url = 'http://www.hejiaju.com/goods.php?act=price&id=' + goods_id + '&tdsourcetag=s_pctim_aiomsg'
    try:
        r = s.get(url, headers=headers, timeout=10)
        A = re.findall(pattern_market, r.text)
        B = re.findall(pattern_shop, r.text)
        p = {
            "ID": goods_id,
            "店铺价格": B,  # shop price
            "市场价格": A   # market price
            }
        price.append(p)
        print('---' * 20 + str(number))
        number += 1
#        time.sleep(3)
    except Exception:
        error_url_2.append(url)
        time.sleep(3)
price = pd.DataFrame(price)
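The price table carries an explicit ID while the attribute table aligns only by row order. One way (not in the original post) to join everything is to extract the ID from each link and merge on it; a hedged sketch, assuming X, data, and price are the frames built above:

# Extract the numeric ID from each product link, then attach attributes and prices
X['ID'] = X['链接'].str.extract(r'goods/(\d+)\.html', expand=False)
result = pd.concat([X.reset_index(drop=True), data], axis=1).merge(price, on='ID', how='left')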
4. Download the images

First clean the individual image URLs out of the links collected earlier, then download each one.

# Clean the image addresses out of the scraped lists; X = the table of per-product image addresses
import os

def JPG_url(X):
    jpg_url = []
    for i in X['图片']:
        # After the Excel round trip each cell is the string form of a list of protocol-relative URLs
        if ',' not in i:
            url = 'http:' + i.strip(r"[]'")
            jpg_url.append(url)
        else:
            i = i.replace("'", '')
            i = i.strip(r"[]'")
            i = i.split(',')
            for j in i:
                j = j.strip()
                url = 'http:' + j
                jpg_url.append(url)
    #jpg_url = pd.DataFrame(pd.Series(jpg_url), columns=['图片下载地址'])  # image download addresses
    return jpg_url

# Download every image to disk; X = the table of image download addresses
def JPG(X):
    number = 1
    for i in X['图片下载地址']:
        url = i
        # Local directory the images are saved to
        root = "G://python//"
        path = root + url.split("/")[-1]
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                r = requests.get(url, timeout=10)
                r.raise_for_status()
                # with closes the file automatically; "wb" writes binary
                with open(path, "wb") as f:
                    f.write(r.content)
                print("downloaded" + '---' * 20 + str(number))
                number += 1
            else:
                print("file already exists")
        except Exception as e:
            print("download failed: " + str(e))
        else:
            # Runs only when no exception occurred: pause between downloads
            time.sleep(1)
    return 'done'
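A hedged usage sketch tying the two helpers together. It assumes `picture` (the image table from step 2) has been round-tripped through Excel so each cell is the string form of a list, which is what JPG_url parses; the column name 图片下载地址 comes from the commented-out line inside JPG_url:

# Flatten the image lists into one URL list, wrap it in the column JPG expects, then download
jpg_urls = JPG_url(picture)
jpg_table = pd.DataFrame(pd.Series(jpg_urls), columns=['图片下载地址'])
JPG(jpg_table)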
