# spider.py
# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import urllib2
from lxml import etree
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from ajk.items import ajkItem
import re
from scrapy.http import Request
class ajkSpider(scrapy.Spider):
    """Crawl 58.com brand-apartment listings and yield one ajkItem per listing.

    NOTE(review): the original subclassed CrawlSpider while also overriding
    parse() — a documented Scrapy pitfall: CrawlSpider drives its Rule
    machinery through parse(), so overriding it silently disabled the
    SgmlLinkExtractor rules (they never ran).  This version derives from
    scrapy.Spider and drops the dead Rule setup; observable behavior is
    unchanged because pagination was already done manually in parse().
    """

    # Spider name used on the command line: `scrapy crawl ajkSpider`.
    name = "ajkSpider"
    # Requests outside these domains are dropped by the offsite middleware.
    allowed_domains = ["58.com"]
    # Crawl entry point (Shenzhen brand-apartment listings, page 104).
    start_urls = [
        "http://sz.58.com/pinpaigongyu/pn/104/"
    ]

    # Item field -> XPath relative to each listing anchor element.
    # Kept as a class constant so parse() stays a simple loop.
    _FIELD_XPATHS = {
        'name': 'div[@class="des"]/h2/text()',
        'onsale': 'div[@class="des"]/p[@class="room"]/text()',
        'notsale': 'div[@class="des"]/p[@class="spec"]/span[@class="spec1"]/text()',
        'proper': 'div[@class="des"]/p[@class="spec"]/span[@class="spec2"]/text()',
        'special': 'div[@class="des"]/p[@class="spec"]/span[@class="spec3"]/text()',
        'price': 'div[@class="small-logo"]/text()',
        'tel': 'div[@class="money"]/span/b/text()',
    }

    def parse(self, response):
        """Extract listing fields from one result page, then follow pagination.

        Yields ajkItem instances; export with e.g.
        `scrapy crawl ajkSpider -o info.csv -t csv`.
        """
        for info in response.xpath('//a[@tongji_label="listclick"]'):
            item = ajkItem()
            for field, xpath in self._FIELD_XPATHS.items():
                values = info.xpath(xpath).extract()
                # Only set fields that were actually found on the page.
                if values:
                    # Python 2: store UTF-8 encoded bytes for CSV export.
                    item[field] = values[0].encode('utf-8')
            yield item

        # Pagination: follow the "next" link and re-enter parse() for it.
        next_page = response.xpath('//div[@class="page"]/a[@class="next"]/@href')
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, callback=self.parse)
# (blog-page artifact: "网友评论" = reader comments — not part of the spider)