爬取目标:淘宝下某一类目商品的标题、链接、原价、优惠促销价格、评论数等信息(也可进一步爬取详细评论信息)。
源代码
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class Taobao01Item(scrapy.Item):
    """Item holding the fields collected for a single product."""

    title = scrapy.Field()      # product name
    link = scrapy.Field()       # product link
    price = scrapy.Field()      # product price (original price)
    price_now = scrapy.Field()  # promotional price
    comment = scrapy.Field()    # number of comments
爬虫文件 tb01.py
# -*- coding: utf-8 -*-
import re
import urllib.request
from urllib.parse import urlencode

import scrapy
from scrapy.http import Request

from taobao01.items import Taobao01Item
class Tb01Spider(scrapy.Spider):
    """Crawl Taobao search results for a keyword and scrape each product page.

    Request chain:
      * ``parse`` schedules one search request per result page;
      * ``goodlist`` extracts the product ids from a search result page;
      * ``good`` scrapes title, link and price from a product page;
      * ``comment_count`` adds the comment total and emits the item
        (``comment_failed`` emits it without the total if that request fails).

    The ``price_now`` item field is not populated by this spider.
    """

    name = "tb01"
    allowed_domains = ["taobao.com"]
    start_urls = ['http://www.taobao.com/']

    # Search settings. They are plain class attributes so a run can override
    # them with spider arguments, e.g. ``scrapy crawl tb01 -a max_pages=5``
    # (argument values arrive as strings, hence the int() calls below).
    keyword = '零食'
    max_pages = 2
    page_size = 44  # step of the ``s`` offset parameter between pages

    # Placeholder stored when the price element is missing from the page.
    # NOTE(review): this is not a real price; consumers of the data should
    # treat it as "unknown".
    default_price = "100"

    # Compiled once instead of on every callback invocation.
    _NID_RE = re.compile(r'"nid":"(.*?)"')
    _COUNT_RE = re.compile(r'"count":\s*(\d+)')

    def parse(self, response):
        """Schedule one search request per result page for ``keyword``."""
        for page in range(int(self.max_pages)):
            # urlencode percent-escapes the keyword so non-ASCII text or
            # characters such as '&' cannot corrupt the query string.
            query = urlencode({
                "q": self.keyword,
                "ie": "utf8",
                "s": page * int(self.page_size),
            })
            url = "https://s.taobao.com/search?" + query
            print("要爬取的url是:" + url)
            yield Request(url=url, callback=self.goodlist)

    def goodlist(self, response):
        """Request the product page of every ``nid`` found in a result page."""
        # response.text decodes with the encoding Scrapy detected for the
        # response rather than assuming UTF-8.
        for item_id in self._NID_RE.findall(response.text):
            url = "https://item.taobao.com/item.htm?id=" + item_id
            yield Request(url=url, callback=self.good, meta={"id": item_id})

    def good(self, response):
        """Scrape one product page, then request its comment count.

        The item is not yielded here: it travels in ``meta`` to the
        comment-count request so the count can be attached before the item
        is emitted.
        """
        item_id = response.meta["id"]
        comment_url = (
            "https://rate.taobao.com/detailCount.do?callback=jsonp100&itemId="
            + str(item_id)
        )

        # Two page layouts are tried in turn; extract_first() returns None
        # instead of raising when a selector matches nothing.
        title = (
            response.xpath("//h3[@class='tb-main-title']/text()").extract_first()
            or response.xpath("//h1[@data-spm='1000983']/text()").extract_first()
        )
        if title is None:
            self.logger.warning("No title found on %s; skipping", response.url)
            return

        link = response.url
        # NOTE(review): pages that carry the price under
        # ``dl#J_StrPriceModBox`` / ``span.tm-price`` (presumably the Tmall
        # layout) are not handled and receive the placeholder price.
        price = response.xpath(
            "//em[@class='tb-rmb-num']/text()"
        ).extract_first(default=self.default_price)

        print("商品url是:" + link)
        print("商品价格是:" + price)
        print("商品评论url是:" + comment_url)

        item = Taobao01Item()
        item["title"] = title
        item["link"] = link
        item["price"] = price

        # Fetch the count through Scrapy instead of a blocking urlopen()
        # call, which would stall every other in-flight request.
        yield Request(
            url=comment_url,
            callback=self.comment_count,
            errback=self.comment_failed,
            meta={"item": item},
        )

    def comment_count(self, response):
        """Attach the comment total from the JSONP response and emit the item."""
        item = response.meta["item"]
        commentdata = response.body.decode("utf-8", "ignore")
        print("返回的评论字符串是:" + commentdata)

        match = self._COUNT_RE.search(commentdata)
        if match:
            item["comment"] = match.group(1)
            print("评论数是:" + item["comment"])
        else:
            self.logger.warning(
                "No comment count in response from %s", response.url
            )
        print("")
        yield item

    def comment_failed(self, failure):
        """Emit the item without a comment count when the count request fails."""
        request = failure.request
        self.logger.warning(
            "Comment count request failed for %s: %s",
            request.url, failure.value,
        )
        yield request.meta["item"]
pipelines.py
将爬取到的数据插入到数据库:略,可参考博文
http://www.jianshu.com/p/164f3fda2d1c
(本文未完待续)
网友评论