Now that I've opened this Jianshu account, I'll gradually share my past projects here. Going back over old projects and summarizing them usually brings a few new insights.
I. Project Introduction
Requirements
A friend works at a real-estate agency in Shanghai and, for a project there, needed housing-price data for Beijing, Shanghai, Guangzhou, and Shenzhen. I don't know the exact purpose, so I simply crawled the whole country for him.
Goal: crawl nationwide new-home (新房) and second-hand (二手房) listing data from Fang.com (房天下).
Technologies
Python 3.6, PyCharm 2018.1, the Scrapy framework
Crawling approach
1. Analyze and collect the new-home and second-hand URLs for every city in the country;
2. fetch the listing pages, extract the required fields, and follow the pagination.
Crawling challenges
1. When collecting the city links, study the page layout and exclude the overseas cities.

2. Analyze and construct the second-hand and new-home URLs (a quick verification snippet follows this list).
Taking Shenzhen as an example:
Shenzhen URL: https://sz.fang.com/
Shenzhen new-home URL: https://sz.newhouse.fang.com/house/s/
Shenzhen second-hand URL: https://sz.esf.fang.com/
if city == "北京":
    newhouse_url = "https://newhouse.fang.com/house/s/"
    esf_url = "https://esf.fang.com/"
else:
    # build the new-home URL
    url1, url2 = city_url.split(".fang.")
    newhouse_url = url1 + ".newhouse.fang." + url2 + "house/s/"
    # build the second-hand URL
    esf_url = url1 + ".esf.fang." + url2
3. When collecting listing links from the result pages, skip the advertisement entries (in parse_esf below, rows without a community name are simply dropped).

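To verify the URL construction described in point 2, here is a quick standalone check using the Shenzhen URL from the example above (not part of the spider):

# quick sanity check of the URL pattern
city_url = "https://sz.fang.com/"
url1, url2 = city_url.split(".fang.")        # -> "https://sz", "com/"
newhouse_url = url1 + ".newhouse.fang." + url2 + "house/s/"
esf_url = url1 + ".esf.fang." + url2
print(newhouse_url)   # https://sz.newhouse.fang.com/house/s/
print(esf_url)        # https://sz.esf.fang.com/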
II. Project Setup
Open cmd, change into your projects directory, and run scrapy startproject fang to create the Scrapy project;
then cd fang to enter the project;
then scrapy genspider sfw fang.com to create the spider for the fang.com domain.
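After these commands the layout should look like the standard Scrapy skeleton (start.py is the launcher script added by hand in the next section):

fang/
├── scrapy.cfg
├── start.py
└── fang/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── sfw.py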
4. Extracting the listing fields


Once the items are defined, test the extraction in the scrapy shell first, to make sure the unwanted text is stripped out cleanly (see the example below).
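For example, using the Shenzhen new-home page as an arbitrary test URL and the XPaths from the spider below:

scrapy shell "https://sz.newhouse.fang.com/house/s/"
>>> divs = response.xpath("//div[contains(@class, 'nl_con')]/ul/li/div/div[@class='nlc_details']")
>>> divs[0].xpath(".//div[@class='nlcd_name']/a/text()").get()

If the site rejects the default User-Agent, the shell can be started with -s USER_AGENT="..." to override it.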

III. Project Code
start.py (I always add a small launcher script so the spider can be re-run easily while debugging)
# -*- coding: utf-8 -*-
from scrapy import cmdline
cmdline.execute("scrapy crawl sfw".split())
settings.py
# -*- coding: utf-8 -*-
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
# No fixed User-Agent here; a random one is set per request in middlewares.py
# (the fake_useragent library would be another option)
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserAgentDownloadMiddleware': 543,
}
ITEM_PIPELINES = {
    'fang.pipelines.HouseItemPipeline': 300,
}
middlewares.py
Fang.com is fairly forgiving: sending a random User-Agent with each request is enough to avoid getting banned.
# -*- coding: utf-8 -*-
import random

# downloader middleware that sets a random User-Agent on every request
class UserAgentDownloadMiddleware(object):
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Mozilla/5.0 (Windows; U; Win 9x 4.90; SG; rv:1.9.2.4) Gecko/20101104 Netscape/9.1.0285',
        'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
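As the comment in settings.py mentions, the fake_useragent library is an alternative to maintaining this list by hand. A minimal sketch (the class name FakeUserAgentDownloadMiddleware is my own; it would replace the middleware above in DOWNLOADER_MIDDLEWARES):

# alternative middleware that draws User-Agents from fake_useragent
# (assumes `pip install fake-useragent`)
from fake_useragent import UserAgent

class FakeUserAgentDownloadMiddleware(object):
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # .random returns a randomly chosen real-world User-Agent string
        request.headers['User-Agent'] = self.ua.random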
items.py
Based on the page analysis, define the fields we need as items.
# -*- coding: utf-8 -*-
import scrapy

class NewHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the development
    name = scrapy.Field()
    # house type
    house_type = scrapy.Field()
    # price
    price = scrapy.Field()
    # layouts on offer (usually a list, e.g. 2居/3居)
    rooms = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # address
    address = scrapy.Field()
    # district
    district = scrapy.Field()
    # on sale or not
    sale = scrapy.Field()
    # URL of the source page
    origin_url = scrapy.Field()

class ESFHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the community
    name = scrapy.Field()
    # house type
    house_type = scrapy.Field()
    # layout (rooms/halls)
    rooms = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # orientation
    toward = scrapy.Field()
    # year built
    year = scrapy.Field()
    # address
    address = scrapy.Field()
    # built-up area
    area = scrapy.Field()
    # total price
    price = scrapy.Field()
    # price per square metre
    unit = scrapy.Field()
    # URL of the source page
    origin_url = scrapy.Field()
sfw.py
The spider has three parts: collecting the new-home and second-hand URLs for each city, parsing the new-home listings, and parsing the second-hand listings.
# -*- coding: utf-8 -*-
import scrapy
from fang.items import NewHouseItem, ESFHouseItem
import re

class SfwSpider(scrapy.Spider):
    name = 'sfw'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.html']
    def parse(self, response):
        province = None
        trs = response.xpath("//div[@class='outCont']//tr")
        for tr in trs:
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            # the province cell is only filled on the first row of each province
            province_text = (province_td.xpath(".//text()").get() or "").strip()
            if province_text:
                province = province_text
            # skip overseas cities
            if province == "其它":
                continue
            city_td = tds[1]
            city_as = city_td.xpath(".//a")
            for city_a in city_as:
                city = city_a.xpath(".//text()").get()
                city_url = city_a.xpath(".//@href").get()
                # Beijing's URLs are special and don't follow the pattern of the other cities
                if city == "北京":
                    newhouse_url = "https://newhouse.fang.com/house/s/"
                    esf_url = "https://esf.fang.com/"
                else:
                    # build the new-home URL
                    url1, url2 = city_url.split(".fang.")
                    newhouse_url = url1 + ".newhouse.fang." + url2 + "house/s/"
                    # build the second-hand URL
                    esf_url = url1 + ".esf.fang." + url2
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse, meta={'info': (province, city)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf, meta={'info': (province, city)})
    def parse_newhouse(self, response):
        # parse the new-home listing pages
        province, city = response.meta.get('info')
        divs = response.xpath("//div[contains(@class, 'nl_con')]/ul/li/div/div[@class='nlc_details']")
        for div in divs:
            name = div.xpath(".//div[@class='nlcd_name']/a/text()").get().strip()
            house_type = '新房'
            house_type_list = div.xpath(".//div[@class='house_type clearfix']/a/text()").getall()
            house_area_str = "".join(div.xpath(".//div[@class='house_type clearfix']/text()").getall())
            area = re.sub(r"\s|-|/", "", house_area_str)
            # keep only the entries that describe layouts, e.g. "2居", "3居"
            rooms = "/".join(list(filter(lambda x: x.endswith("居"), house_type_list)))
            address = div.xpath(".//div[@class='address']/a/@title").get()
            district_text = "".join(div.xpath(".//div[@class='address']/a//text()").getall())
            # the district name is wrapped in square brackets inside the address text
            district_match = re.search(r".*\[(.*)\].*", district_text)
            district = district_match.group(1) if district_match else ""
            sale = div.xpath("./div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(div.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s", "", price)
            url = div.xpath(".//div[@class='nlcd_name']/a/@href").get()
            origin_url = response.urljoin(url)
            item = NewHouseItem(name=name, house_type=house_type, area=area, rooms=rooms, address=address,
                                district=district, sale=sale, price=price, origin_url=origin_url,
                                province=province, city=city)
            yield item
        # follow the "next page" link
        next_url = response.xpath("//li[@class='fr']/a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_newhouse,
                                 meta={'info': (province, city)})
    def parse_esf(self, response):
        # parse the second-hand listing pages
        province, city = response.meta.get('info')
        dls = response.xpath("//dl[@class='clearfix']")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            name = dl.xpath(".//p[@class='add_shop']/a/@title").get()
            # advertisement entries have no community name, so skip them
            if not name:
                continue
            item['name'] = name
            item['house_type'] = '二手房'
            infos = dl.xpath(".//p[@class='tel_shop']//text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            # the info strings come in no fixed order, so dispatch on keywords
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif "㎡" in info:
                    item['area'] = info
                elif "层" in info:
                    item['floor'] = info
                elif "向" in info:
                    item['toward'] = info
                elif "年" in info:
                    item['year'] = info
            address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            item['address'] = address
            price = "".join(dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall())
            item['price'] = price
            unit = dl.xpath(".//dd[@class='price_right']/span[2]//text()").get()
            item['unit'] = unit
            url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            item['origin_url'] = response.urljoin(url)
            yield item
        # follow the "next page" link
        next_url = response.xpath("//div[@class='page_al']/p/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf,
                                 meta={'info': (province, city)})
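Independently of the MongoDB pipeline, Scrapy's built-in feed export is a quick way to eyeball the spider's output while debugging (comment out ITEM_PIPELINES in settings.py first if MongoDB isn't running yet):

scrapy crawl sfw -o houses.json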
pipelines.py
The data is stored in MongoDB.
# -*- coding: utf-8 -*-
import pymongo

class HouseItemPipeline(object):
    def __init__(self):
        # create a client connected to the local MongoDB instance
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        # select the database
        db = self.client['fangtianxia']
        # select the collection
        self.collection = db['house']

    def process_item(self, item, spider):
        data = dict(item)
        self.collection.insert_one(data)
        return item
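Two optional refinements, not part of the original project: close the Mongo client when the crawl finishes, and add a unique index on origin_url so that re-running the spider does not store duplicates. A sketch of the same pipeline with those additions:

import pymongo

class HouseItemPipeline(object):
    def __init__(self):
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.collection = self.client['fangtianxia']['house']
        # a unique index makes repeated runs idempotent
        self.collection.create_index('origin_url', unique=True)

    def process_item(self, item, spider):
        try:
            self.collection.insert_one(dict(item))
        except pymongo.errors.DuplicateKeyError:
            pass  # this listing was already stored on an earlier run
        return item

    def close_spider(self, spider):
        # release the connection when the crawl ends
        self.client.close()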
IV. Viewing the Results
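With mongod running locally, a few lines of pymongo are enough to inspect what was stored (database and collection names as defined in the pipeline above):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['fangtianxia']['house']
print(collection.count_documents({}))                # total number of stored listings
print(collection.find_one({'house_type': '二手房'}))   # one sample second-hand record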
