美文网首页网络爬虫天地
淘宝爬虫爬取商品详情和销量

淘宝爬虫爬取商品详情和销量

作者: 探索者_逗你玩儿 | 来源:发表于2019-02-18 11:33 被阅读0次

    废话不说,直接上代码。由于获取销量的接口需要登录后的 cookies,并且需要指定获取的权限,所以需要先在 web 上登录一次,然后再通过代码获取到销量字段。

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import json
    import re
    import urllib
    import urllib.request

    import requests
    from bs4 import BeautifulSoup
    from pycookiecheat import chrome_cookies
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    from module.CookiesUtil import get_cookie_path
    from module.TaobaoItem import TaobaoItem
    
    class TaobaoProcessor(object):
        """Scrape detail, price and sales data from a Taobao item page.

        Independent strategies are offered:

        * ``process`` renders the page in Chrome through Selenium and reads the DOM.
        * ``get_price`` and ``process_html`` issue plain ``requests`` calls.
        * ``taobao_spider`` uses ``urllib`` and attaches the cookies of a local
          Chrome profile to the price/sales request.

        Every public method prints what it finds and returns ``None``.
        """

        # Seconds to wait for a single HTTP response before giving up.
        _TIMEOUT = 15

        # JSON(P) endpoint that serves dynamically loaded price / sales modules.
        _SIB_URL = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm"

        def process(self, url):
            """Open ``url`` in Chrome via Selenium and print the scraped item fields.

            The values are stored on a ``TaobaoItem`` and its ``__dict__`` is
            printed. The browser is always closed, even when a lookup fails.

            :param url: address of the item detail page.
            :raises selenium.common.exceptions.NoSuchElementException: when one of
                the single-element lookups below finds nothing.
            """
            browser = webdriver.Chrome()
            try:
                browser.get(url)
                # Lets the element lookups below wait up to 10 s for late-rendered nodes.
                browser.implicitly_wait(10)

                title = browser.find_element(By.CLASS_NAME, 'tb-main-title').text
                origin_price = (browser.find_element(By.ID, 'J_StrPrice')
                                .find_element(By.CLASS_NAME, 'tb-rmb-num').text)
                # NOTE: the promotional price (class ``tb-promo-price``) is not
                # collected here; ``get_price`` reads it from the JSON endpoint.

                thumbs = browser.find_elements(By.XPATH, '//ul[@id="J_UlThumb"]/li/div/a/img')
                img_list = [self._strip_thumbnail_suffix(thumb.get_attribute('src'))
                            for thumb in thumbs]

                # First SKU group: its label and the newline-separated option texts
                # (presumably sizes, judging by the name "chima" - TODO confirm).
                chima = browser.find_element(By.XPATH, '//*[@id="J_isku"]/div/dl[1]/dt').text
                span_list = browser.find_element(
                    By.XPATH, '//*[@id="J_isku"]/div/dl[1]/dd/ul').text.split("\n")

                # Second SKU group: its label and the inner HTML of every option.
                color_pro = browser.find_element(By.XPATH, '//*[@id="J_isku"]/div/dl[2]/dt').text
                color_list = [
                    option.get_attribute('innerHTML')
                    for option in browser.find_elements(
                        By.XPATH, '//*[@id="J_isku"]/div/dl[2]/dd/ul/li/a/span')
                ]

                attr_val = browser.find_element(By.ID, "attributes").get_attribute("innerHTML")
                description_val = browser.find_element(By.ID, "description").get_attribute("innerHTML")
                sale_count = browser.find_element(By.ID, "J_SellCounter").get_attribute('innerHTML')
                comment_count = browser.find_element(By.ID, "J_RateCounter").text
            finally:
                # Without this every call leaves a Chrome/chromedriver process behind.
                browser.quit()

            item = TaobaoItem()
            item.title = title
            item.origin_price = origin_price
            item.img_list = img_list
            item.chima = chima
            item.color_pro = color_pro
            item.span_list = span_list
            item.color_list = color_list
            item.attr_val = attr_val
            item.description_val = description_val
            item.sale_count = sale_count
            item.comment_count = comment_count

            print(item.__dict__)

        def get_price(self, userid, itemid):
            """Query the price endpoint and print the default promotional price.

            :param userid: seller id, sent as ``sellerId``.
            :param itemid: item id, sent as ``itemId``.
            :raises ValueError: when the reply is not the expected
                ``onSibRequestSuccess(...)`` JSONP document.

            Prints ``None`` when the reply carries no default promotion entry.
            """
            url = (
                "{}?itemId={}&sellerId={}"
                "&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,"
                "activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,"
                "originalPrice,tradeContract&callback=onSibRequestSuccess"
            ).format(self._SIB_URL, itemid, userid)
            _headers = {
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
                'authority': 'detailskip.taobao.com',
                'method': 'GET',
            }
            # requests.get() closes its own connection pool; a bare session did not.
            resp = requests.get(url, headers=_headers, timeout=self._TIMEOUT)
            new_price = self._parse_promo_price(resp.content.decode())
            print(new_price)

        def process_html(self, url):
            """Download ``url`` with ``requests`` and print the prettified HTML."""
            _headers = {
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            }
            resp = requests.get(url, headers=_headers, timeout=self._TIMEOUT)
            # Naming the parser keeps the output identical across machines and
            # avoids bs4's "no parser was explicitly specified" warning.
            soup = BeautifulSoup(resp.content, 'html.parser')
            print(soup.prettify())

        def taobao_spider(self, url):
            """Scrape an item page with ``urllib`` and print title, prices and sales.

            The price/sales request carries the cookies that ``chrome_cookies``
            reads for ``https://item.taobao.com/``. Sales counters that are absent
            from the reply are printed as ``None``.

            :param url: item page address; must contain an ``id=<digits>`` query
                parameter.
            :raises ValueError: when ``url`` carries no item id.

            Network or extraction failures are reported on stdout and end the
            call early instead of raising.
            """
            headers = {
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.3',
                'Referer': 'https://item.taobao.com/item.htm',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
                'Connection': 'keep-alive',
            }
            goods_id = self._extract_goods_id(url)

            try:
                res = self._fetch(url, headers).decode('gbk', 'ignore')
            except (OSError, ValueError) as e:
                # Only URLError carries ``reason``; fall back to the exception itself.
                print('无法打开网页:', getattr(e, 'reason', e))
                return

            try:
                title = self._first_match('<h3 class="tb-main-title" data-title="(.*?)"', res)
                line_price = self._first_match('<em class="tb-rmb-num">(.*?)</em>', res)
                sellerid = self._first_match(r"sellerId\s+:\s\'(.+)\'", res)
                if sellerid is None:
                    raise ValueError('sellerId not found in item page')

                # The long description lives in a separate script document.
                des_js = self._fetch(self._parse_desc_url(res), headers).decode('gbk', 'ignore')
                des_val = self._clean_description(des_js)

                attributes = BeautifulSoup(res, 'html.parser').find('div', id='attributes')
                description = '' if attributes is None else str(attributes)

                # The real (promotional) price and the sales counters are loaded
                # dynamically, so they come from the JSON endpoint, not the page.
                purl = "{}?itemId={}&sellerId={}&modules=price,soldQuantity".format(
                    self._SIB_URL, goods_id, sellerid)
                cookies = chrome_cookies('https://item.taobao.com/', get_cookie_path())
                # Copy so the description request above keeps its own Referer.
                price_headers = dict(headers)
                price_headers['Referer'] = url
                price_headers['Cookie'] = self._build_cookie_header(cookies)
                resp_data = self._fetch(purl, price_headers).decode()

                real_price = self._select_real_price(re.findall('"price":"(.*?)"', resp_data))
                confirm_sell = self._first_match(r'"confirmGoodsCount":"(.*?)"', resp_data)
                sell_total = self._first_match(r'"soldTotalCount":"(.*?)"', resp_data)
                # NOTE: review scraping (rate.tmall.com list_detail_rate.htm) is
                # not implemented here.
            except Exception as e:
                # Boundary catch: the HTML, cookie and network helpers raise
                # unrelated exception types. Report and stop, because the prints
                # below would otherwise hit unbound names.
                print('数据抽取失败!!!', e)
                return

            print('商品名:', title)
            print('划线价格:', line_price)
            print('真实价格:', real_price)
            print('商品链接:', url)
            print('确认订单:', confirm_sell)
            print('30天内销售订单:', sell_total)
            print("描述:", description, des_val)

        @classmethod
        def _fetch(cls, url, headers):
            """Return the raw response body of a GET to ``url``, closing the socket."""
            request = urllib.request.Request(url=url, headers=headers)
            with urllib.request.urlopen(request, timeout=cls._TIMEOUT) as response:
                return response.read()

        @staticmethod
        def _first_match(pattern, text):
            """Return group 1 of the first ``pattern`` match in ``text``, else ``None``."""
            match = re.search(pattern, text)
            return match.group(1) if match else None

        @staticmethod
        def _extract_goods_id(url):
            """Return the digits of the ``id`` query parameter of ``url``.

            Anchoring on ``?``/``&`` keeps parameters such as ``user_id`` from
            being mistaken for the item id.
            """
            match = re.search(r'[?&]id=(\d+)', url)
            if match is None:
                raise ValueError('no item id found in url: {!r}'.format(url))
            return match.group(1)

        @staticmethod
        def _strip_thumbnail_suffix(src):
            """Drop the part after the last ``_`` of an image URL.

            URLs without an underscore (or empty/``None`` values) are returned
            unchanged rather than collapsing to an empty string.
            """
            if not src:
                return src
            head, sep, _tail = src.rpartition("_")
            return head if sep else src

        @staticmethod
        def _parse_promo_price(jsonp_text):
            """Extract ``data.promotion.promoData.def`` price from a JSONP reply.

            ``def`` may be a list of promotion dicts or a single dict. Returns
            ``None`` when no default promotion is present.
            """
            wrapped = re.search(r"onSibRequestSuccess\((.+)\)", jsonp_text, re.S)
            if wrapped is None:
                raise ValueError('response is not an onSibRequestSuccess(...) document')
            data = json.loads(wrapped.group(1))
            promo = data.get('data', {}).get('promotion', {}).get('promoData', {}).get('def')
            if isinstance(promo, list):
                promo = promo[0] if promo else None
            if isinstance(promo, dict):
                return promo.get('price')
            return None

        @staticmethod
        def _parse_desc_url(html):
            """Return the https address of the description script named in ``html``.

            NOTE(review): assumes ``descUrl`` is written as a protocol-switching
            expression whose third ``:``-separated piece holds the quoted,
            scheme-less https variant - confirm against a live page.
            """
            match = re.search(r"descUrl\s+:(.+)", html)
            parts = match.group(1).split(":") if match else []
            quoted = re.search(r"\'(.+)\'", parts[2]) if len(parts) > 2 else None
            if quoted is None:
                raise ValueError('descUrl not found in item page')
            return 'https:' + quoted.group(1)

        @staticmethod
        def _clean_description(js_text):
            """Return the value assigned to ``var desc=`` with quotes and backslashes removed."""
            match = re.search(r"var desc=((.+\s)+)", js_text)
            if match is None:
                raise ValueError('description payload not found')
            body = match.group(1)
            for junk in ("';", "'", "\\"):
                body = body.replace(junk, '')
            return body

        @staticmethod
        def _build_cookie_header(cookies):
            """Serialize a ``{name: value}`` mapping as a ``Cookie`` header value."""
            return "; ".join("{}={}".format(name, value) for name, value in cookies.items())

        @staticmethod
        def _select_real_price(prices):
            """Pick the price to report from the ``"price"`` values of a reply.

            The values may mix fixed prices and ``low-high`` ranges in any order.
            The first range (in reply order) wins; otherwise the smallest fixed
            price is returned as a float. Returns ``None`` for an empty list.
            """
            # dict.fromkeys de-duplicates while keeping reply order, so the choice
            # is reproducible across runs (a set's iteration order is not).
            unique = list(dict.fromkeys(prices))
            for price in unique:
                if '-' in price:
                    return price
            return min(map(float, unique), default=None)
    
    
    if __name__ == '__main__':
        # Manual entry point: scrape one sample item and print the result.
        #
        # Abridged sample of the sib.htm JSONP reply that the price/sales
        # regexes and get_price() are written against:
        #   onSibRequestSuccess({"code": {"code": 0, "message": "SUCCESS"}, "data": {
        #       "price": "99.00",
        #       "originalPrice": {"def": {"price": "99.00"}, ...},
        #       "soldQuantity": {"confirmGoodsCount": "1452", "soldTotalCount": "8863"},
        #       "promotion": {"promoData": {"def": [{"price": "96.00", ...}], ...}},
        #       ...}});
        taobao = TaobaoProcessor()
        url = 'https://item.taobao.com/item.htm?spm=2013.1.w16867253-18554788179.1.7aa41c3dXWLKMm&id=556805373975'
        # Alternative entry points for manual experiments:
        #   taobao.process(url)
        #   taobao.get_price(2448721589, 573379814923)
        #   taobao.process_html(url)
        taobao.taobao_spider(url)
    

    相关文章

      网友评论

        本文标题:淘宝爬虫爬取商品详情和销量

        本文链接:https://www.haomeiwen.com/subject/xwfaeqtx.html