Property Listing Statistics Crawler Source Code

Author: 白日sv | Published 2018-02-09 11:22

The following Scrapy spider crawls the brand-apartment (品牌公寓) listings on sz.58.com, extracts the fields of each listing, and follows the pagination links so the results can be exported for statistics.

    [spider.py]

    # -*- coding:utf-8 -*-
    # Python 2 workaround: make UTF-8 the default string encoding so the
    # Chinese text extracted below can be encoded without errors.
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')

    import scrapy
    from ajk.items import ajkItem
    # parse() is overridden below, so subclass scrapy.Spider directly:
    # a CrawlSpider must not override parse(), and link-extraction Rules
    # are unnecessary here because pagination is followed manually.
    class ajkSpider(scrapy.Spider):
        # Name of the spider, used as "scrapy crawl ajkSpider".
        name = "ajkSpider"
        # Only URLs under these domains are followed; all others are dropped.
        allowed_domains = ["58.com"]
        # Entry URL for the crawl (brand-apartment listings, page 104).
        start_urls = [
            "http://sz.58.com/pinpaigongyu/pn/104/"
        ]

        # Callback: extract each listing's fields into an ajkItem using
        # XPath selectors.
        def parse(self, response):
            # Each listing on the page is an <a tongji_label="listclick"> element.
            for info in response.xpath('//a[@tongji_label="listclick"]'):
                item = ajkItem()
                # Extract the raw text of each field; .extract() returns a
                # (possibly empty) list of matching strings.
                name = info.xpath('div[@class="des"]/h2/text()').extract()
                onsale = info.xpath('div[@class="des"]/p[@class="room"]/text()').extract()
                notsale = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec1"]/text()').extract()
                proper = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec2"]/text()').extract()
                special = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec3"]/text()').extract()
                price = info.xpath('div[@class="small-logo"]/text()').extract()
                tel = info.xpath('div[@class="money"]/span/b/text()').extract()

                # Any field can be missing on a given listing, so guard each
                # assignment; the text is encoded to UTF-8 for the CSV output.
                if name:
                    item['name'] = name[0].encode('utf-8')
                if onsale:
                    item['onsale'] = onsale[0].encode('utf-8')
                if notsale:
                    item['notsale'] = notsale[0].encode('utf-8')
                if proper:
                    item['proper'] = proper[0].encode('utf-8')
                if price:
                    item['price'] = price[0].encode('utf-8')
                if special:
                    item['special'] = special[0].encode('utf-8')
                if tel:
                    item['tel'] = tel[0].encode('utf-8')
                yield item

            # Pagination: if a "next page" link exists, request it with this
            # same callback.
            next_page = response.xpath('//div[@class="page"]/a[@class="next"]/@href')
            if next_page:
                url = response.urljoin(next_page[0].extract())
                yield scrapy.Request(url, callback=self.parse)

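The spider is run with "scrapy crawl ajkSpider -o info.csv -t csv", which exports the scraped items to a CSV file.

The spider imports ajkItem from ajk.items, but the post does not include that file. Below is a minimal sketch of what items.py could look like, assuming one Field per key assigned in parse(); the original definition may differ.

    [items.py]

    # -*- coding:utf-8 -*-
    # Hypothetical reconstruction of ajk/items.py (not shown in the post).
    # Field names match the keys the spider assigns in parse().
    import scrapy

    class ajkItem(scrapy.Item):
        name = scrapy.Field()     # listing title (div.des > h2)
        onsale = scrapy.Field()   # text of p.room
        notsale = scrapy.Field()  # text of span.spec1
        proper = scrapy.Field()   # text of span.spec2
        special = scrapy.Field()  # text of span.spec3
        price = scrapy.Field()    # text of div.small-logo
        tel = scrapy.Field()      # contact number (div.money span b)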
