美文网首页
python 抓取 1688商品详情

python 抓取 1688商品详情

作者: 北方蜘蛛 | 来源:发表于2016-06-01 17:18 被阅读1128次

    写得还不够完善:价格抓取还算稳定,但商品详情目前还抓不到。

    #!/usr/bin/python
    import threading
    from time import ctime,sleep
    import pycurl
    import urllib2
    import sys,os
    import StringIO
    from lxml import etree
    import datetime
    # Wall-clock start time; the script prints the elapsed seconds after run() returns.
    starttime = datetime.datetime.now()
    # The helpers below fetch pages over HTTPS with pycurl.
    
    def spider_curl(url):
        """Fetch ``url`` with pycurl and pass the response body to ``show_pach``.

        Args:
            url: Address of the page to download.

        The curl handle is released in a ``finally`` clause so it is not
        leaked when the transfer raises.
        """
        body = StringIO.StringIO()
        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, url)
            # Collect the response body in memory.
            c.setopt(pycurl.WRITEFUNCTION, body.write)
            # NOTE(review): certificate and host-name verification are
            # disabled here, exactly as before. This accepts any TLS
            # certificate; re-enable verification if the data matters.
            c.setopt(pycurl.SSL_VERIFYPEER, 0)
            c.setopt(pycurl.SSL_VERIFYHOST, 0)
            c.perform()
        finally:
            c.close()
        show_pach(body.getvalue(), url)
    
    def _sql_quote(value):
        """Escape backslashes and single quotes for a single-quoted SQL literal."""
        return value.replace("\\", "\\\\").replace("'", "\\'")


    def show_pach(html, url):
        """Extract product fields from a detail page and print an INSERT statement.

        Prints the title, the cover image URL, the comma-separated preview
        image URLs and finally the generated SQL text. Nothing is returned and
        the SQL is only printed, never executed.

        Args:
            html: Markup of the product page, parsed with ``etree.HTML``.
            url: Not used by the extraction; kept so existing callers that
                pass it keep working.

        Raises:
            ValueError: If a price text is present but is not a number.
            KeyError: If a ``data-imgs`` value has no ``"preview"`` entry.
        """
        tree = etree.HTML(html)
        # Every expression starts with "//", so it searches the whole document.
        # Querying the root directly avoids an IndexError on pages where
        # "/html/body" matches nothing.
        title_nodes = tree.xpath("//title")
        thumb_nodes = tree.xpath('//li[@data-imgs]')
        price_nodes = tree.xpath('//span[@class="value"][2]')

        cost = ""
        sale_price = ""
        market_price = ""
        shop_price = 100  # original author's note on this value: "stock"

        # The last matching span supplies the cost, as in an overwrite loop.
        if price_nodes:
            cost = price_nodes[-1].text or ""

        # Derive the prices only when a cost was found: float("") raises
        # ValueError, so without a price both fields stay empty.
        if cost.strip():
            sale_price = float(cost) * 1.4
            market_price = float(cost) * 1.8

        previews = []
        for item in thumb_nodes[:5]:  # at most the first five thumbnails
            # SECURITY(review): eval() runs text taken from the downloaded
            # page. Kept unchanged because the attribute format is not visible
            # here; replace with a real parser (e.g. json.loads) once the
            # format is confirmed.
            info = eval(item.attrib['data-imgs'])
            previews.append(str(info["preview"]))

        # NOTE(review): the cover is the second preview (index 1), not the
        # first, and stays empty with fewer than two thumbnails - confirm
        # this is intended.
        cover = previews[1] if len(previews) > 1 else ""
        img_s = ",".join(previews)

        title = ""
        for node in title_nodes:
            # Drop the trailing 11 characters of the page title (presumably a
            # site suffix - TODO confirm); tolerate an empty <title/>.
            title = (node.text or "")[:-11]

        print(title + "\n")
        print(cover + "\n")
        print(img_s + "\n")

        # Scraped text is escaped before being placed inside quoted literals
        # so that a quote in the title cannot break the statement.
        values = [
            _sql_quote(title),
            '163',
            _sql_quote(cover),
            _sql_quote(cover),
            _sql_quote(img_s),
            str(sale_price),
            str(market_price),
            str(shop_price),
            _sql_quote(cost),
            'content',
        ]
        sql = (
            "INSERT INTO `wpin`.`yge_product` ( `title`, `category_id`, `attach_thumb`,"
            "`attach_image`,`slider`,`sale_price`,`market_price`,`shop_price`,`chengben`, `content`)"
            " VALUES ('" + "','".join(values) + "')"
        )
        print(sql)
    
        
        
        
    def download_img_https(url):
        """Download ``url`` with pycurl and save the bytes under ``file/``.

        The file is named ``nimabi<md5 of url>.jpg``. The ``file`` directory
        must already exist. The URL is printed after the file is written.

        Args:
            url: Address of the image to download.
        """
        # hashlib is not among the module-level imports, so the original
        # failed with NameError on every call; import it locally.
        import hashlib

        payload = StringIO.StringIO()
        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, payload.write)
            # NOTE(review): TLS verification is disabled, as before.
            c.setopt(pycurl.SSL_VERIFYPEER, 0)
            c.setopt(pycurl.SSL_VERIFYHOST, 0)
            c.perform()
        finally:
            # Release the handle even when the transfer fails.
            c.close()

        digest = hashlib.md5(url).hexdigest()
        # "with" closes the file even if the write raises.
        with open("file/nimabi" + digest + ".jpg", "wb") as fk:
            fk.write(payload.getvalue())
        print(url)
        
    def urllibget(i):
        """Download the page at ``i`` with urllib2 and pass it to ``show_pach``.

        Args:
            i: URL of the page to fetch.
        """
        response = urllib2.urlopen(i)
        try:
            html = response.read()
        finally:
            # Close the connection even if reading fails.
            response.close()
        show_pach(html, i)
        
    def run():
        """Prompt for a URL and scrape it when it starts with ``http``."""
        url = raw_input("add one url: ")
        if url.startswith('http'):
            urllibget(url)
            return
        # Anything else is rejected with a hint.
        print("please a true 1688 detail url ")
    
    # Script entry: run one interactive scrape, then report the elapsed
    # seconds measured from starttime.
    run()
    endtime = datetime.datetime.now()
    elapsed = endtime - starttime
    print(elapsed.seconds)
    
    

    相关文章

      网友评论

          本文标题:python 抓取 1688商品详情

          本文链接:https://www.haomeiwen.com/subject/lcvhdttx.html