Property Listing Statistics Crawler Source Code

Author: 白日sv | Published 2018-02-09 11:22

- [spider.py]

# -*- coding: utf-8 -*-
import scrapy

from ajk.items import ajkItem


# A plain scrapy.Spider suffices here: pagination is followed manually in
# parse(), so CrawlSpider link-extraction rules are unnecessary (and
# CrawlSpider reserves parse() for its own rule handling, so overriding it
# there would break the crawl).
class ajkSpider(scrapy.Spider):
    # Spider name, used on the command line: scrapy crawl ajkSpider
    name = "ajkSpider"
    # Domains the spider may crawl; requests to any other domain are dropped
    allowed_domains = ["58.com"]
    # Entry URL for the crawl
    start_urls = [
        "http://sz.58.com/pinpaigongyu/pn/104/"
    ]

    # Callback: extract each listing's fields into an ajkItem via XPath
    def parse(self, response):
        for info in response.xpath('//a[@tongji_label="listclick"]'):
            item = ajkItem()
            name = info.xpath('div[@class="des"]/h2/text()').extract()
            onsale = info.xpath('div[@class="des"]/p[@class="room"]/text()').extract()
            notsale = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec1"]/text()').extract()
            proper = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec2"]/text()').extract()
            special = info.xpath('div[@class="des"]/p[@class="spec"]/span[@class="spec3"]/text()').extract()
            price = info.xpath('div[@class="small-logo"]/text()').extract()
            tel = info.xpath('div[@class="money"]/span/b/text()').extract()
            # Only assign a field when its selector matched, so listings with
            # missing parts still yield a (sparse) item
            if name:
                item['name'] = name[0]
            if onsale:
                item['onsale'] = onsale[0]
            if notsale:
                item['notsale'] = notsale[0]
            if proper:
                item['proper'] = proper[0]
            if price:
                item['price'] = price[0]
            if special:
                item['special'] = special[0]
            if tel:
                item['tel'] = tel[0]
            yield item

        # Pagination: follow the "next" link and re-enter parse()
        next_page = response.xpath('//div[@class="page"]/a[@class="next"]/@href')
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, callback=self.parse)
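
The spider imports ajkItem from ajk.items, but the post does not include that file. Below is a minimal sketch of what ajk/items.py presumably looks like; the field names are taken directly from the assignments in spider.py above, while the file itself is reconstructed, not from the original post.

- [items.py] (reconstructed sketch)

# -*- coding: utf-8 -*-
import scrapy


class ajkItem(scrapy.Item):
    # Fields inferred from spider.py; comments note the source selector
    name = scrapy.Field()     # des/h2 text (listing title)
    onsale = scrapy.Field()   # p.room text
    notsale = scrapy.Field()  # span.spec1 text
    proper = scrapy.Field()   # span.spec2 text
    special = scrapy.Field()  # span.spec3 text
    price = scrapy.Field()    # div.small-logo text
    tel = scrapy.Field()      # div.money span/b text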

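To export the scraped items to CSV, run the spider with Scrapy's feed export, as the original code comments note: scrapy crawl ajkSpider -o info.csv (older Scrapy versions also accepted an explicit -t csv flag; newer versions infer the format from the file extension).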