Property Listing Statistics Crawler Source Code

Author: 白日sv | Published 2018-02-09 11:22

- [spider.py]

# -*- coding: utf-8 -*-
import scrapy

from ajk.items import ajkItem


# A plain scrapy.Spider suffices here: pagination is followed manually in
# parse(), so CrawlSpider link-extraction rules are unnecessary (and
# CrawlSpider reserves parse() for its own rule handling, so overriding it
# there would break the crawl).
class ajkSpider(scrapy.Spider):
    # Spider name, used on the command line: scrapy crawl ajkSpider
    name = "ajkSpider"
    # Domains the spider may crawl; requests to any other domain are dropped
    allowed_domains = ["58.com"]
    # Entry URL for the crawl
    start_urls = [
        "http://sz.58.com/pinpaigongyu/pn/104/"
    ]

    # Callback: extract each listing's fields into an ajkItem via XPath
    def parse(self, response):
        for info in response.xpath('//a[@tongji_label="listclick"]'):
            item = ajkItem()
            name = info.xpath('div[@class="des"]/h2/text()').extract()
            onsale = info.xpath('div[@class="des"]/p[@class="room"]/text()').extract()
            notsale = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec1"]/text()').extract()
            proper = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec2"]/text()').extract()
            special = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec3"]/text()').extract()
            price = info.xpath('div[@class="small-logo"]/text()').extract()
            tel = info.xpath('div[@class="money"]/span/b/text()').extract()
            # Only assign a field when its selector matched, so listings with
            # missing parts still yield a (sparse) item
            if name:
                item['name'] = name[0]
            if onsale:
                item['onsale'] = onsale[0]
            if notsale:
                item['notsale'] = notsale[0]
            if proper:
                item['proper'] = proper[0]
            if price:
                item['price'] = price[0]
            if special:
                item['special'] = special[0]
            if tel:
                item['tel'] = tel[0]
            yield item

        # Pagination: follow the "next" link and re-enter parse()
        next_page = response.xpath('//div[@class="page"]/a[@class="next"]/@href')
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, callback=self.parse)
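
The spider imports ajkItem from ajk.items, but the post does not include that file. Below is a minimal sketch of what ajk/items.py presumably looks like; the field names are taken directly from the assignments in spider.py above, while the file itself is reconstructed, not from the original post.

- [items.py] (reconstructed sketch)

# -*- coding: utf-8 -*-
import scrapy


class ajkItem(scrapy.Item):
    # Fields inferred from spider.py; comments note the source selector
    name = scrapy.Field()     # des/h2 text (listing title)
    onsale = scrapy.Field()   # p.room text
    notsale = scrapy.Field()  # span.spec1 text
    proper = scrapy.Field()   # span.spec2 text
    special = scrapy.Field()  # span.spec3 text
    price = scrapy.Field()    # div.small-logo text
    tel = scrapy.Field()      # div.money span/b text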

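To export the scraped items to CSV, run the spider with Scrapy's feed export, as the original code comments note: scrapy crawl ajkSpider -o info.csv (older Scrapy versions also accepted an explicit -t csv flag; newer versions infer the format from the file extension).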