# 写得还不完善:抓取的价格还算稳定,但详情目前还无法抓到。
#!/usr/bin/python
import threading
from time import ctime,sleep
import pycurl
import urllib2
import sys,os
import StringIO
from lxml import etree
import datetime
# Timestamp taken when the script starts; the last lines of the script
# subtract it from the finish time to report the elapsed seconds.
starttime = datetime.datetime.now()
# HTTPS download via pycurl
def spider_curl(url):
    """Download ``url`` with pycurl and pass the response body to ``show_pach``.

    The whole response is buffered in memory before it is parsed.

    NOTE(review): ``SSL_VERIFYPEER`` and ``SSL_VERIFYHOST`` are both set to 0,
    so the server certificate is not verified. That setting is kept as the
    original author wrote it -- confirm it is still wanted.

    Args:
        url: Address of the page to download.

    Raises:
        pycurl.error: If the transfer fails.
    """
    body = StringIO.StringIO()
    handle = pycurl.Curl()
    try:
        handle.setopt(pycurl.URL, url)
        handle.setopt(pycurl.WRITEFUNCTION, body.write)
        handle.setopt(pycurl.SSL_VERIFYPEER, 0)
        handle.setopt(pycurl.SSL_VERIFYHOST, 0)
        handle.perform()
    finally:
        # Release the curl handle even when the transfer raises.
        handle.close()
    show_pach(body.getvalue(), url)
def show_pach(html, url):
    """Parse a product page and print its title, images and an INSERT statement.

    The following is extracted from ``html``:

    * the text of the ``<title>`` element with its last 11 characters removed
      (presumably a fixed site-name suffix -- TODO confirm);
    * the ``preview`` entry of the ``data-imgs`` attribute of at most the
      first five ``<li data-imgs=...>`` elements; the second one is used as
      the cover image;
    * the text of the elements matched by ``//span[@class="value"][2]``,
      read as the cost price. The sale price is cost * 1.4 and the market
      price is cost * 1.8. When several elements match, the last one wins.

    The title, the cover URL and the comma-separated image list are printed,
    followed by an ``INSERT`` statement for ``wpin.yge_product``. The
    statement is only printed, never executed.

    Args:
        html: Raw HTML of the product page.
        url: Address the page was fetched from. Not used by this function.

    Raises:
        ValueError: If a matched price is not numeric, or a ``data-imgs``
            attribute is not a plain literal.
        KeyError: If a ``data-imgs`` value has no ``preview`` entry.
    """
    # Local import: ``ast`` is not among the imports at the top of the file.
    import ast

    def sql_quote(value):
        # Escape backslashes and single quotes so scraped text cannot
        # terminate the quoted SQL literal it is embedded in.
        return value.replace("\\", "\\\\").replace("'", "\\'")

    tree = etree.HTML(html)
    # Every expression below starts with "//", so it searches the whole
    # document no matter which node it is evaluated from.
    title_nodes = tree.xpath("//title")
    thumb_nodes = tree.xpath('//li[@data-imgs]')
    price_nodes = tree.xpath('//span[@class="value"][2]')

    cost = ""
    sale_price = ""
    market_price = ""
    shop_price = 100  # fixed value; annotated as "stock" by the original author

    for node in price_nodes:
        cost = node.text
        amount = float(cost)
        sale_price = amount * 1.4
        market_price = amount * 1.8

    cover = ""
    previews = []
    for index, node in enumerate(thumb_nodes[:5]):
        # SECURITY: the attribute comes from a downloaded page. It used to be
        # passed to eval(); literal_eval only accepts plain literals, so
        # markup in the page can no longer execute code here.
        info = ast.literal_eval(node.attrib['data-imgs'])
        preview = str(info["preview"])
        if index == 1:
            # NOTE(review): the cover is the *second* image, as in the
            # original code; it stays empty when there is only one image.
            cover = preview
        previews.append(preview)
    slider = ",".join(previews)

    # Default to an empty title so a page without <title> does not crash the
    # string concatenation below.
    title = ""
    for node in title_nodes:
        title = (node.text or "")[:-11]

    print(title + "\n")
    print(cover + "\n")
    print(slider + "\n")

    sql = (
        "INSERT INTO `wpin`.`yge_product` ( `title`, `category_id`, `attach_thumb`,`attach_image`,`slider`,`sale_price`,`market_price`,`shop_price`,`chengben`, `content`) VALUES ('%s','163','%s','%s','%s','%s','%s','%s','%s','content')"
        % (
            sql_quote(title),
            sql_quote(cover),
            sql_quote(cover),
            sql_quote(slider),
            str(sale_price),
            str(market_price),
            str(shop_price),
            str(cost),
        )
    )
    print(sql)
def download_img_https(url):
    """Download ``url`` with pycurl and save the response body under ``file/``.

    The target path is ``file/nimabi<md5>.jpg`` where ``<md5>`` is the MD5 hex
    digest of ``url``, so the same address always maps to the same file. The
    ``file`` directory is created when it does not exist yet. The URL is
    printed once the file has been written.

    NOTE(review): ``SSL_VERIFYPEER`` and ``SSL_VERIFYHOST`` are both set to 0,
    so the server certificate is not verified -- confirm this is still wanted.

    Args:
        url: Address of the image to download.

    Raises:
        pycurl.error: If the transfer fails.
        IOError: If the file cannot be written.
    """
    # Local imports: ``hashlib`` is not among the imports at the top of the
    # file, so the original body failed with a NameError.
    import hashlib
    import os

    body = StringIO.StringIO()
    handle = pycurl.Curl()
    try:
        handle.setopt(pycurl.URL, url)
        handle.setopt(pycurl.WRITEFUNCTION, body.write)
        handle.setopt(pycurl.SSL_VERIFYPEER, 0)
        handle.setopt(pycurl.SSL_VERIFYHOST, 0)
        handle.perform()
    finally:
        # Release the curl handle even when the transfer raises.
        handle.close()

    if not os.path.isdir("file"):
        os.makedirs("file")

    path = "file/nimabi" + hashlib.md5(url).hexdigest() + ".jpg"
    # The context manager closes the file even if the write fails.
    with open(path, "wb") as out:
        out.write(body.getvalue())
    print(url)
def urllibget(i):
    """Fetch the page at ``i`` with urllib2 and pass its body to ``show_pach``.

    Args:
        i: Address of the page to download.

    Raises:
        urllib2.URLError: If the page cannot be opened.
    """
    response = urllib2.urlopen(i)
    try:
        html = response.read()
    finally:
        # Close the connection even when reading the body fails.
        response.close()
    show_pach(html, i)
def run():
    """Ask for one URL on standard input and scrape it.

    Input that does not begin with ``http`` is rejected with a message;
    anything else is handed to ``urllibget``.
    """
    address = raw_input("add one url: ")
    if address.startswith('http'):
        urllibget(address)
        return
    print("please a true 1688 detail url ")
# Script entry: scrape one page, then print the ``seconds`` field of the time
# elapsed since ``starttime`` was recorded.
run()
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)
网友评论