Scraping Lianjia listings with Python: using the rent-to-price ratio to help buy a home

By 阿雷_590a | Published 2018-02-08 14:30

    A few words up front

    Having decided to buy a place around Wuchang, I used Python to scrape Lianjia's rental and second-hand listings for three districts of Wuhan: Wuchang, Hongshan, and Donghu Gaoxin. The rent-to-price ratio computed from them then feeds into the buying decision. (I scraped Fang.com (房天下) first, but it was so full of fake listings that the data was useless and I gave up on it.)
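    The metric itself is simple: the rent-to-price ratio is a year's rent divided by the sale price. A minimal sketch with made-up numbers (nothing below comes from the scraper):

    monthly_rent = 2000            # yuan per month (invented)
    sale_price = 100 * 10000       # 100 万元, in yuan (invented)
    ratio = 12 * monthly_rent / sale_price
    print('{:.2%}'.format(ratio))  # 2.40%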

    Scraping rental listings

    Code

    # -*- coding: utf-8 -*-
    """
    Created on Wed Dec 20 10:44:38 2017
    
    @author: huanglei
    """
    
    import requests  
    import random  
    from bs4 import BeautifulSoup  
    import pandas as pd
      
    
    def is_num_by_except(num):
        # Return True if num parses as an integer, False otherwise.
        try:
            int(num)
            return True
        except ValueError:
            return False
    
    
    def spider_1(url):
        # Pick a random User-Agent for each request to look less like a bot.
        user_agent=['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36',  
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',  
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',  
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',  
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',  
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',  
        ]  
        headers={  
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',  
        'Accept-Encoding': 'gzip, deflate, sdch',  
        'Accept-Language': 'zh-CN,zh;q=0.8',  
    'User-Agent': random.choice(user_agent)
        } 
        
        response = requests.get(url,headers=headers)  
        soup = BeautifulSoup(response.text,'lxml')
        page_array = []
    
        titles = soup.select('#house-lst > li > div.info-panel > h2 > a')                                  # listing title
        courts = soup.select('span.region')                                                                # residential complex
        areas = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > span.meters')      # floor area
        zones = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > span.zone')        # layout (rooms/halls)
        prices = soup.select('#house-lst > li > div.info-panel > div.col-3 > div.price > span')            # monthly rent
    
    
        for title, court, area, zone, price in zip(titles, courts, areas, zones, prices):
            data = {
                'title': title.get_text().strip(),
                'court': court.get_text().strip(),
                'roomWay': zone.get_text().strip(),
                'square': area.get_text().strip(),
                'price': price.get_text().strip(),
            }
            # Strip the trailing "平米" unit; keep only plausible listings (> 20 sqm).
            if is_num_by_except(data['square'][:-2]):
                data['square'] = data['square'][:-2]
                # Monthly rent per square meter.
                data['danjia'] = int(int(data['price']) / int(data['square']))
                if int(data['square']) > 20:
                    page_array.append(data)
        return page_array
    
    def pandas_to_xlsx(info, file_name):
        # Dump the scraped records to an Excel file.
        pd_look = pd.DataFrame(info)
        pd_look.to_excel(file_name, sheet_name='武汉租房')
    
    
    
    array_all = []
    list_qu = ['wuchang', 'hongshan', 'donghugaoxin']
    for qu in list_qu:
        page = 1
        url_qu = 'https://wh.lianjia.com/zufang/' + qu
        # Only the first two pages per district.
        while page < 3:
            url = url_qu + '/pg' + str(page)
            try:
                array_all.extend(spider_1(url))
            except Exception:
                # Save whatever has been collected before giving up on this district.
                pandas_to_xlsx(array_all, "链家武汉租房.xlsx")
                break
            print(qu + str(page))
            page = page + 1
    pandas_to_xlsx(array_all, "链家武汉租房.xlsx")
    
    
    df = pd.read_excel("链家武汉租房.xlsx")
    df = df.drop_duplicates()
    df_zufang = df.groupby('court').mean()
    df_zufang.to_excel('租房均一化.xlsx','均一化') 
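    To make the later join concrete: df_zufang ends up indexed by complex name (court), with each row holding that complex's average monthly rent per square meter. A toy illustration of the groupby step, with invented numbers:

    import pandas as pd

    toy = pd.DataFrame({'court': ['A小区', 'A小区', 'B小区'],
                        'danjia': [40, 50, 60]})
    print(toy.groupby('court').mean())
    #        danjia
    # court
    # A小区     45.0
    # B小区     60.0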
    

    A screenshot of the scraped rental data:

    [image]

    Scraping second-hand housing listings

    # -*- coding: utf-8 -*-
    """
    Created on Wed Dec 20 10:44:38 2017
    
    @author: huanglei
    """
    
    
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    import random  
    
    
    
    def spider_1(url):
        
        user_agent=['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36',  
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',  
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',  
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',  
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',  
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',  
        ]  
        headers={  
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',  
        'Accept-Encoding': 'gzip, deflate, sdch',  
        'Accept-Language': 'zh-CN,zh;q=0.8',  
    'User-Agent': random.choice(user_agent)
        }  
        response = requests.get(url,headers=headers)
        soup = BeautifulSoup(response.text,'lxml')
        page_array = []
    
        titles = soup.select('li.clear > div.info.clear > div.title > a')            # listing title
        hrefs = soup.select('ul.sellListContent > li.clear > a.img')                 # listing URL
        details = soup.select("div.address > div.houseInfo")                         # "complex | layout | area | ..."
        prices = soup.select("div.priceInfo > div.totalPrice > span")                # total price (万元)
        danjias = soup.select("div.priceInfo > div.unitPrice > span")                # unit price
        loucengs = soup.select("div.info.clear > div.flood > div.positionInfo")      # floor information
        addresss = soup.select("div.info.clear > div.flood > div.positionInfo > a")  # location
    
    
        for title, href, detail, price, danjia, louceng, address in zip(titles, hrefs, details, prices, danjias, loucengs, addresss):
            data = {
                'title': title.get_text(),
                'href': href.get('href'),
                'detail': detail.get_text().strip(),
                'price': price.get_text(),
                'danjia': danjia.get_text(),
                'louceng': louceng.get_text(),
                'add': address.get_text(),
            }
            # Split "小区名 | 3室2厅 | 89.5平米 | ..." into complex name and floor
            # area (dropping the trailing unit).
            data['court'] = data['detail'].split('|')[0].strip()
            data['area'] = data['detail'].split('|')[2][:-3]
            # Placeholder column; later filled with the complex's average rental
            # unit price and finally overwritten with the rent-to-price ratio.
            data['author'] = ""
            page_array.append(data)

        return page_array
    
            
    def pandas_to_xlsx(info):
        # Dump the scraped records to 链家二手房.xlsx.
        pd_look = pd.DataFrame(info)
        xlsx_n = '链家二手房.xlsx'
        sheet_n = '武汉二手房'
    
        pd_look.to_excel(xlsx_n,sheet_name=sheet_n)
    
    # Return the list of complexes that have rental listings
    # (the index of the grouped rental DataFrame).
    def add_youzufang(info):
        nlist = []
        for index, row in info.iterrows():
            nlist.append(index)
        return nlist
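    Incidentally, the loop above just collects the DataFrame's index, so it is equivalent to a one-liner:

    # Same result as add_youzufang(df_zufang):
    nlist = list(df_zufang.index)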
    
    # Rename the columns to Chinese for the final spreadsheet.
    def chinese(info):
        info.rename(columns={"author": "租售比", "add": "地址", "area": "平米", "court": "小区名", "danjia": "单价"}, inplace=True)
        info.rename(columns={"detail": "细节", "price": "总价", "title": "标题"}, inplace=True)
        return info
    
    # Read the rental data back and collect the complexes that have rental listings.
    df = pd.read_excel("链家武汉租房.xlsx")
    df = df.drop_duplicates()
    df_zufang = df.groupby('court').mean()
    df_zufang.to_excel('租房均一化.xlsx','均一化')
    nlist = add_youzufang(df_zufang)    
    
    
    # Scrape every page of second-hand listings and accumulate the records.
    page = 1
    df_ershoufang = []
    while page < 300:
        # Lianjia search URL; sf1l1l2l3p1p2p3p4 is the site's listing-filter code.
        url = 'https://wh.lianjia.com/ershoufang/sf1l1l2l3p1p2p3p4/pg' + str(page)
        try:
            df_ershoufang.extend(spider_1(url))
            page = page + 1
        except Exception:
            print("error:")
            break
        print(page)
    
    pandas_to_xlsx(df_ershoufang)
    
    df_ershoufang = pd.read_excel("链家二手房.xlsx")

    # Keep only complexes that also appear in the rental data, and copy in each
    # complex's average rental unit price (yuan per sqm per month).
    for index, row in df_ershoufang.iterrows():
        if row['court'] in nlist:
            df_ershoufang.at[index, 'author'] = df_zufang.at[row['court'], 'danjia']
        else:
            df_ershoufang.drop(index, axis=0, inplace=True)

    # Annual rent-to-price ratio in percent: 12 months of rent per sqm over the
    # sale price per sqm. The price is in 万元 and the result is a percentage,
    # so the constant works out to 12 * 100 / 10000 = 0.12.
    df_ershoufang['author'] = 0.12 * df_ershoufang['author'].astype('int') / (df_ershoufang['price'].astype('int') / df_ershoufang['area'].astype('float'))
    df_ershoufang = df_ershoufang.sort_values(by='author', ascending=False)
    df_ershoufang = chinese(df_ershoufang)
    pandas_to_xlsx(df_ershoufang)
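    As a sanity check on the 0.12 factor, here is a worked example with invented numbers (the variable names are mine, not the scraper's):

    rent_per_sqm = 40       # yuan/sqm/month, the complex's average rent
    price_wan = 180         # total sale price in 万元
    area = 90.0             # floor area in sqm
    ratio = 0.12 * rent_per_sqm / (price_wan / area)
    # Long form: 12 * 40 * 90 / (180 * 10000) * 100 = 2.4
    print(ratio)            # 2.4, i.e. a 2.4% annual rent-to-price ratio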
    

    A screenshot of the results:


    [image]

    A few words in closing

    Data analysis is still only a reference point

    The scraped listings don't distinguish between homes with 70-year and 40-year land-use rights, and they can't give me reliable information about commuting or work. In the end I still spent several days viewing places in person before committing.

    The rent-to-price ratio is only a reference point

    I've read plenty of posts saying that in finance circles a rent-to-price ratio (annual rent divided by sale price) of 4% is ideal, and anything below 2.4% signals a bubble. But in an era when Chinese housing prices climb this fast, a home without some bubble in it is hard to find, and who can say whether the benchmark even holds? That said, the homes people around me bought in Beijing and Wuhan three years ago really do clear the 4% ratio.
