美文网首页
python - 对njhouse的房源信息抓包

python - 对njhouse的房源信息抓包

作者: 温柔vs先生 | 来源:发表于2021-12-31 14:46 被阅读0次
    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    '''
    @文件        :NJHouse.py
    @说明        :https://www.njhouse.com.cn(对njhouse的房源信息抓包)
    @时间        :2021/12/28 16:42:57
    @作者        :wbb
    @版本        :1.0
    '''
    
    from os import name
    from typing import Mapping, NamedTuple
    import requests
    from bs4 import BeautifulSoup
    import xlwt
    from fake_useragent import UserAgent
    
    # 传入图片链接,识别图片
    from PIL import Image
    import pytesseract
    from io import BytesIO
    
    
    def main():
        """Scrape the building page and export the result to an Excel workbook."""
        saveDataToExcel(getData())
    
    
    # 数据保存到Excel
    
    
    def saveDataToExcel(datasourceDic):
        """Write the scraped building data to ``NJHouse.xls``.

        Layout of the single worksheet:
          * row 0   - building header, merged across the summary columns,
                      bold and centred;
          * row 1   - summary column titles;
          * row 2   - summary column values, centred;
          * rows 4+ - one row per floor: column 0 holds the floor label and
                      columns 1+ hold one cell per room (number, area, price).

        Args:
            datasourceDic: dict with the keys ``sheetTitle``, ``dataList`` and
                ``detail_dong`` in the shape returned by ``getData``.

        Raises:
            KeyError: if one of the required keys is missing.
            IndexError: if the summary value list is shorter than the title list.

        Side effects:
            Creates or overwrites ``NJHouse.xls`` in the current working
            directory. The file is saved exactly once, after all rows have been
            written, so it is also produced when ``dataList`` is empty.
        """
        # Square brackets are stripped from the page title before it is used
        # as the worksheet name.
        sheet_title = datasourceDic['sheetTitle'].replace('[', '').replace(']', '')
        floors = datasourceDic['dataList']
        detail_dong = datasourceDic['detail_dong']
        dong_header = detail_dong['dong_header']
        th_titles = detail_dong['dong_table_th_title_list']
        td_values = detail_dong['dong_table_td_title_list']

        workbook = xlwt.Workbook(encoding='utf-8')
        bookSheet = workbook.add_sheet(sheet_title, cell_overwrite_ok=True)

        # Build the styles once instead of once per column.
        centered = xlwt.Alignment()
        centered.horz = xlwt.Alignment.HORZ_CENTER
        centered.vert = xlwt.Alignment.VERT_CENTER

        bold_font = xlwt.Font()
        bold_font.bold = True

        header_style = xlwt.XFStyle()
        header_style.alignment = centered
        header_style.font = bold_font

        value_style = xlwt.XFStyle()
        value_style.alignment = centered

        # Merged header cell; clamp the last column so an empty title list
        # does not produce a negative merge range.
        last_col = max(len(th_titles) - 1, 0)
        bookSheet.write_merge(0, 0, 0, last_col,
                              label=dong_header, style=header_style)

        for col, title in enumerate(th_titles):
            bookSheet.write(1, col, label=title)
            bookSheet.write(2, col, label=td_values[col], style=value_style)

        # Floor rows start at row 4, leaving row 3 empty as a separator.
        first_floor_row = 4
        for row, floor in enumerate(floors, start=first_floor_row):
            bookSheet.write(row, 0, label=floor['ceng'])

            for col, room_entry in enumerate(floor["roomList"], start=1):
                room = room_entry['room']
                area = room_entry['area']
                price = room_entry['price']
                # TODO: give each room cell a background colour by its status
                # class (needs a custom xlwt palette colour). Intended mapping:
                # ks=#99cc00, rg=#ffff66, qy=#ff9900, ba=#ff0019, az=#ffccff.
                bookSheet.write(
                    row, col, label=f'{room}\n面积:{area}\n价格:{price}')

        workbook.save('NJHouse.xls')
    
    
    # 爬取网页
    def getData():
        """Scrape one building page of njhouse.com.cn into a nested dict.

        The page URL is hard-coded below. The HTML is fetched with
        ``requestUrl`` and parsed with BeautifulSoup's ``html.parser``.

        Returns:
            dict with the keys, in this order:
              * ``sheetTitle``     - text of the page title ``<h2>``;
              * ``detail_dong``    - dict with ``dong_header`` (header text
                without its ``<font>`` child), ``dong_table_th_title_list``
                and ``dong_table_td_title_list`` (cell texts of the summary
                table's header row and first body row);
              * ``room_info_list`` - list of ``{'type', 'title'}`` dicts, one
                per ``<li>`` of the colour legend;
              * ``dataList``       - list of ``{'ceng', 'roomList'}`` dicts,
                one per row of the ``ck_table`` table; every room is a dict
                with ``type``, ``room``, ``area`` and ``price``.

        Raises:
            IndexError, AttributeError, TypeError: when the page does not
                contain the elements the selectors below expect.
        """
        # The single building page being scraped.
        currentUrl = 'https://www.njhouse.com.cn/spf/sales_detail?PRJ_ID=2867150&prjid=2867150&buildid=580130&dm=9幢'
        # Prefix prepended to the <img src> values found in the page.
        imgHeaderUrl = 'https://www.njhouse.com.cn/'

        bs = BeautifulSoup(requestUrl(currentUrl).text, "html.parser")

        datasourceDic = {}

        # Page title, later used as the worksheet name.
        datasourceDic['sheetTitle'] = bs.select(
            'body > div.main > div.business_centers > div > div.spf_del_title.clearfix > h2')[0].text

        # Building details: header line plus a one-row summary table.
        dong_header = bs.select(
            'body > div.main > div.business_centers > div > div.spf_del_block > div > div > div:nth-child(1) > h2')[0]
        # Remove the nested <font> element (when there is one) so that only
        # the header's own text remains.
        if dong_header.font is not None:
            dong_header.font.decompose()

        dong_table = bs.select(
            'body > div.main > div.business_centers > div > div.spf_del_block > div > div > table:nth-child(2)')[0]
        datasourceDic['detail_dong'] = {
            'dong_header': dong_header.text.strip(),
            'dong_table_th_title_list': [
                th.text for th in dong_table.thead.tr.find_all('th')],
            'dong_table_td_title_list': [
                td.text for td in dong_table.tbody.tr.find_all('td')],
        }

        # Colour legend: the first CSS class of each <span> plus its caption.
        legend_items = bs.select(
            'body > div.main > div.business_centers > div > div.spf_del_block > div > div > div.color_nav > ul > li')
        datasourceDic['room_info_list'] = [
            {'type': item.span.get('class')[0], 'title': item.p.text}
            for item in legend_items
        ]

        # Sales table: one <tr> per floor.
        table = bs.find("table", class_='ck_table')
        dataList = []
        for tr in table.tbody.find_all('tr'):
            ceng = tr.find("td", class_="td_h").text.strip()

            # Take direct child *tags* only: html.parser keeps whitespace
            # between cells as text nodes, which have no attributes.
            # The first cell is skipped - presumably it is the floor label
            # (td_h) read above; verify against the live page.
            cells = tr.find_all(True, recursive=False)[1:]

            ceng_room_list = []
            for index, cell in enumerate(cells):
                roomDic = {'type': cell.get("class")[0]}
                links = cell.find_all('a')

                room = links[0].text
                print('爬取的房间号', room)
                roomDic['room'] = room

                if index == 0:
                    # In the first room cell of a row, area and price are read
                    # from two <img> elements (area first, then price) via OCR.
                    images = links[1].find_all("img")
                    area_url = imgHeaderUrl + images[0].get('src')
                    roomDic['area'] = textForImgUrl(
                        area_url).replace("\n", "").strip()
                    price_url = imgHeaderUrl + images[1].get('src')
                    roomDic['price'] = textForImgUrl(
                        price_url).replace("\n", "").strip()
                else:
                    # Remaining cells are parsed from the second link's text:
                    # everything after the last "价格:" is the price, and the
                    # part before the first "价格:" (after its last ":") is
                    # the area.
                    # NOTE(review): the area separator is an ASCII ':' while
                    # the price label uses a full-width ':' - confirm this
                    # matches the page text.
                    pieces = links[1].get_text().split("价格:")
                    roomDic['area'] = pieces[0].split(":")[-1]
                    roomDic['price'] = pieces[-1]

                ceng_room_list.append(roomDic)

            dataList.append({'ceng': ceng, 'roomList': ceng_room_list})

        datasourceDic["dataList"] = dataList
        return datasourceDic
    
    # 图片链接转文本
    # Tesseract OCR识别
    
    
    def textForImgUrl(imgUrl):
        """Download the image at ``imgUrl`` and return the text OCR reads from it.

        The image is scaled to twice its width and height before it is handed
        to ``pytesseract.image_to_string``; the recognised text is returned
        unmodified.
        """
        payload = requestUrl(imgUrl).content
        picture = Image.open(BytesIO(payload))
        doubled = tuple(side * 2 for side in picture.size)
        return pytesseract.image_to_string(picture.resize(doubled))
    
    # 保存图片到本地
    
    
    def saveImage(imgUrl):
        """Download ``imgUrl`` and store the raw bytes in a local file.

        The file name is the value of the first query parameter of the URL,
        e.g. ``.../imgmake?num=172355&bg=%2399CC00`` is saved as ``172355``
        in the current working directory.

        Args:
            imgUrl: URL of the image to download.
        """
        imgPath = imgUrl.split("?")[-1].split("&")[0].split("=")[-1]

        # Download before opening the file, so a failed request does not
        # leave an empty (or truncated) file behind.
        img = requestUrl(imgUrl).content
        with open(imgPath, "wb") as f:
            f.write(img)
    
    
    # 进行网络请求
    def requestUrl(url, timeout=30):
        """GET ``url`` with a browser User-Agent and return the response.

        Args:
            url: address to fetch.
            timeout: seconds to wait for the server before giving up; without
                a timeout ``requests`` may block indefinitely.

        Returns:
            The ``requests.Response``, with its text encoding forced to UTF-8.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            requests.RequestException: on connection errors or timeouts.
        """
        header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36"}

        response = requests.get(url, headers=header, timeout=timeout)
        # Fail here with a clear error instead of passing an error page on to
        # the HTML parser or the OCR step.
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response
    
    
    # Run the scraper only when executed as a script, not when imported.
    if __name__ == '__main__':
        main()
        # Manual check of the OCR helper against a single image URL:
        # imgUrl = 'https://www.njhouse.com.cn/common/imgmake?num=172355&bg=%2399CC00'
        # textForImgUrl(imgUrl)
    
    
    

    相关文章

      网友评论

          本文标题:python - 对njhouse的房源信息抓包

          本文链接:https://www.haomeiwen.com/subject/jaigqrtx.html