#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@文件 :NJHouse.py
@说明 :https://www.njhouse.com.cn(对njhouse的房源信息抓包)
@时间 :2021/12/28 16:42:57
@作者 :wbb
@版本 :1.0
'''
from os import name
from typing import Mapping, NamedTuple
import requests
from bs4 import BeautifulSoup
import xlwt
from fake_useragent import UserAgent
# 传入图片链接,识别图片
from PIL import Image
import pytesseract
from io import BytesIO
def main():
    """Scrape the building's sales page and export the result to Excel."""
    saveDataToExcel(getData())
# 数据保存到Excel
def saveDataToExcel(datasourceDic):
    """Write the scraped building data to ``NJHouse.xls``.

    Worksheet layout:
      * row 0     - building heading, merged across the summary columns
                    (bold, centred)
      * row 1     - summary table column titles
      * row 2     - summary table values (centred)
      * row 4...  - one row per floor: the floor label in column 0, then one
                    cell per room holding room number, area and price

    Args:
        datasourceDic: dict with the keys ``sheetTitle`` (str),
            ``detail_dong`` (dict holding ``dong_header``,
            ``dong_table_th_title_list`` and ``dong_table_td_title_list``)
            and ``dataList`` (list of ``{'ceng': ..., 'roomList': [...]}``
            where each room has ``room``, ``area`` and ``price``).

    Raises:
        KeyError: if one of the keys listed above is missing.
        IndexError: if the summary table has fewer values than titles.
    """
    detail_dong = datasourceDic['detail_dong']
    dong_header = detail_dong['dong_header']
    th_titles = detail_dong['dong_table_th_title_list']
    td_values = detail_dong['dong_table_td_title_list']
    floors = datasourceDic['dataList']

    # Excel worksheet names may not contain []:\?/* or NUL, may not start
    # with an apostrophe, and are limited to 31 characters; an invalid name
    # would make add_sheet() fail before anything is written.
    forbidden = '[]:\\?/*\x00'
    sheet_title = ''.join(
        ch for ch in datasourceDic['sheetTitle'] if ch not in forbidden)
    sheet_title = sheet_title.lstrip("'")[:31] or 'Sheet1'

    # The styles do not depend on the cell, so build them once up front.
    centred = xlwt.Alignment()
    centred.horz = xlwt.Alignment.HORZ_CENTER
    centred.vert = xlwt.Alignment.VERT_CENTER

    bold_font = xlwt.Font()
    bold_font.bold = True

    heading_style = xlwt.XFStyle()
    heading_style.alignment = centred
    heading_style.font = bold_font

    value_style = xlwt.XFStyle()
    value_style.alignment = centred

    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(sheet_title, cell_overwrite_ok=True)

    # Merge the heading over all summary columns; with an empty title list
    # fall back to a single cell instead of an invalid (0, -1) column range.
    last_col = max(len(th_titles) - 1, 0)
    sheet.write_merge(0, 0, 0, last_col, label=dong_header,
                      style=heading_style)

    for col, title in enumerate(th_titles):
        sheet.write(1, col, label=title)
        sheet.write(2, col, label=td_values[col], style=value_style)

    # Floor rows start at row 4, leaving row 3 empty as a separator.
    first_floor_row = 4
    for row, floor in enumerate(floors, start=first_floor_row):
        sheet.write(row, 0, label=floor['ceng'])
        for col, room_info in enumerate(floor['roomList'], start=1):
            room = room_info['room']
            area = room_info['area']
            price = room_info['price']
            # TODO: give each room cell a background colour keyed on
            # room_info['type'] (ks #99cc00, rg #ffff66, qy #ff9900,
            # ba #ff0019, az #ffccff); this needs custom xlwt palette colours.
            sheet.write(row, col, label=f'{room}\n面积:{area}\n价格:{price}')

    workbook.save('NJHouse.xls')
# 爬取网页
def getData(url='https://www.njhouse.com.cn/spf/sales_detail?PRJ_ID=2867150&prjid=2867150&buildid=580130&dm=9幢'):
    """Scrape one building's sales page into a plain dict.

    Args:
        url: address of the ``sales_detail`` page to scrape. Defaults to the
            building that was previously hard-coded in this function.

    Returns:
        dict with the keys
          * ``sheetTitle``     - text of the page's title ``<h2>``
          * ``detail_dong``    - ``dong_header`` (building heading text) plus
            ``dong_table_th_title_list`` / ``dong_table_td_title_list``
            (header and first-row cell texts of the summary table)
          * ``room_info_list`` - legend entries
            ``{'type': css_class, 'title': text}``
          * ``dataList``       - one ``{'ceng': floor_label, 'roomList': [...]}``
            per row of the ``ck_table`` table; every room is
            ``{'type', 'room', 'area', 'price'}``

    Raises:
        IndexError, AttributeError, TypeError: when the page does not have
            the structure the selectors below expect.
    """
    image_base_url = 'https://www.njhouse.com.cn/'
    centre = 'body > div.main > div.business_centers > div'
    block = centre + ' > div.spf_del_block > div > div'

    soup = BeautifulSoup(requestUrl(url).text, "html.parser")

    # Worksheet title.
    sheet_title = soup.select(centre + ' > div.spf_del_title.clearfix > h2')[0].text

    # Building heading; its <font> child (if any) is not part of the name.
    heading = soup.select(block + ' > div:nth-child(1) > h2')[0]
    if heading.font is not None:
        heading.font.decompose()

    # Building summary table: header cells and the first body row.
    summary = soup.select(block + ' > table:nth-child(2)')[0]
    th_titles = [th.text for th in summary.thead.tr.find_all('th')]
    td_values = [td.text for td in summary.tbody.tr.find_all('td')]

    # Colour legend: the <span>'s first CSS class identifies the room status.
    legend = [
        {'type': item.span.get('class')[0], 'title': item.p.text}
        for item in soup.select(block + ' > div.color_nav > ul > li')
    ]

    # Sales table: one <tr> per floor.
    sales_table = soup.find("table", class_='ck_table')
    floors = [
        _parse_floor_row(row, image_base_url)
        for row in sales_table.tbody.find_all('tr')
    ]

    return {
        'sheetTitle': sheet_title,
        'detail_dong': {
            'dong_header': heading.text.strip(),
            'dong_table_th_title_list': th_titles,
            'dong_table_td_title_list': td_values,
        },
        'room_info_list': legend,
        'dataList': floors,
    }


def _parse_floor_row(row, image_base_url):
    """Turn one ``<tr>`` of the sales table into ``{'ceng', 'roomList'}``."""
    floor_label = row.find("td", class_="td_h").text.strip()
    # Take direct child *tags* only: iterating ``row.children`` would also
    # yield whitespace text nodes, which have no ``class`` attribute and
    # would shift the position of the floor-label cell dropped below.
    cells = row.find_all(True, recursive=False)[1:]
    rooms = [
        _parse_room_cell(cell, index == 0, image_base_url)
        for index, cell in enumerate(cells)
    ]
    return {'ceng': floor_label, 'roomList': rooms}


def _parse_room_cell(cell, is_first_in_row, image_base_url):
    """Extract status class, room number, area and price from a room cell."""
    status = cell.get("class")[0]
    links = cell.find_all('a')
    room = links[0].text
    print('爬取的房间号', room)

    if is_first_in_row:
        # For the first room of a row, area and price are read from two
        # <img> elements via OCR; all other cells are read as text.
        # NOTE(review): presumably this mirrors how the site renders the
        # cells - verify against the live page.
        images = links[1].find_all("img")
        area_url = image_base_url + images[0].get('src')
        area = textForImgUrl(area_url).replace("\n", "").strip()
        price_url = image_base_url + images[1].get('src')
        price = textForImgUrl(price_url).replace("\n", "").strip()
    else:
        # Price is whatever follows the last "价格:" marker; area is the
        # value after the last colon in the part before the first marker.
        parts = links[1].get_text().split("价格:")
        price = parts[-1]
        area = parts[0].split(":")[-1]

    return {'type': status, 'room': room, 'area': area, 'price': price}
# 图片链接转文本
# Tesseract OCR识别
def textForImgUrl(imgUrl):
    """Download the image at *imgUrl* and return the text Tesseract finds.

    The image is resized to twice its width and height before being passed
    to ``pytesseract.image_to_string`` (presumably to help recognition of
    small glyphs).
    """
    raw_bytes = requestUrl(imgUrl).content
    picture = Image.open(BytesIO(raw_bytes))
    doubled = picture.resize(tuple(2 * side for side in picture.size))
    return pytesseract.image_to_string(doubled)
# 保存图片到本地
def saveImage(imgUrl):
    """Download *imgUrl* and save the bytes to a file in the working directory.

    The file name is the value of the first query parameter of the URL,
    e.g. ``.../imgmake?num=172355&bg=...`` is stored as ``172355``.

    Args:
        imgUrl: address of the image to download.
    """
    query = imgUrl.split("?")[-1]
    first_param = query.split("&")[0]
    file_name = first_param.split("=")[-1]

    # Fetch first, open second: if the request raises, no empty file is
    # created and an existing file of the same name is not truncated.
    payload = requestUrl(imgUrl).content
    with open(file_name, "wb") as out:
        out.write(payload)
# 进行网络请求
def requestUrl(url, timeout=30):
    """Send a GET request with a desktop-browser User-Agent header.

    Args:
        url: address to fetch.
        timeout: seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection would block the
            script indefinitely.

    Returns:
        The ``requests`` response, with its text encoding forced to UTF-8.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    return response
# Run the scrape-and-export pipeline only when executed as a script.
if __name__ == "__main__":
    main()
    # Manual OCR check for a single image:
    # imgUrl = 'https://www.njhouse.com.cn/common/imgmake?num=172355&bg=%2399CC00'
    # textForImgUrl(imgUrl)
# 网友评论 ("User comments" - apparently a leftover web-page heading, not part of the script)