美文网首页
Week2_Practice1

Week2_Practice1

作者: Mark狡 | 来源:发表于2016-05-24 10:04 被阅读0次

    MainInformation.py

    函数功能:获取租房页面的信息

    import requests
    from bs4 import BeautifulSoup
    import time
    def _select_last(soup, selector):
        """Return the last element in ``soup`` matching ``selector``, or None."""
        found = soup.select(selector)
        return found[-1] if found else None


    def getMainInformation(url, timeout=10):
        """Scrape the main fields of a single rental-listing page.

        The page at ``url`` is downloaded and parsed with BeautifulSoup/lxml,
        then a fixed set of CSS selectors is used to fill in the result.
        Every field defaults to the placeholder string ``'hello'`` and keeps
        that value when its selector matches nothing.

        Args:
            url: Address of the listing page to fetch.
            timeout: Seconds to wait for the server before ``requests``
                gives up; without it a stalled connection blocks forever.

        Returns:
            dict with the keys ``title``, ``price``, ``sex``, ``name``,
            ``photo``, ``add`` (address) and ``ownerPhoto``.

        Raises:
            requests.exceptions.RequestException: if the download fails or
                times out.
        """
        html = requests.get(url, timeout=timeout)
        bsHtml = BeautifulSoup(html.text, 'lxml')

        data = {
            'title': 'hello',
            'price': 'hello',
            'sex': 'hello',
            'name': 'hello',
            'photo': 'hello',
            'add': 'hello',
            'ownerPhoto': 'hello',
        }

        # Listing title (when several elements match, the last one wins).
        title = _select_last(
            bsHtml,
            'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
        if title is not None:
            data['title'] = title.get_text()

        # Address, with surrounding whitespace removed.
        address = _select_last(
            bsHtml,
            'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
        if address is not None:
            data['add'] = address.get_text().strip()

        # Price text, stored exactly as it appears on the page.
        price = _select_last(bsHtml, '#pricePart > div.day_l > span')
        if price is not None:
            data['price'] = price.get_text()

        # Landlord avatar URL.
        owner_photo = _select_last(
            bsHtml,
            '#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
        if owner_photo is not None:
            data['ownerPhoto'] = owner_photo.get('src')

        # Landlord sex, derived from the first CSS class of the icon <div>:
        # 'member_ico' is reported as male, any other class as female.
        for icon in bsHtml.select('div[class="member_pic"] > div'):
            classes = icon.get('class')
            if not classes:
                # A <div> without a class attribute carries no information;
                # indexing it would raise TypeError.
                continue
            data['sex'] = 'male' if classes[0] == 'member_ico' else 'female'

        # Landlord display name.
        owner_name = _select_last(
            bsHtml,
            '#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
        if owner_name is not None:
            data['name'] = owner_name.get_text()

        # Room photo: only the first 800-wide image is kept.
        photos = bsHtml.select(
            '#detailImageBox > div.pho_show_r > div > ul > li > img[data-width="800"]')
        if photos:
            data['photo'] = photos[0].get('data-src')

        # Pause for one second after each page so consecutive calls are throttled.
        time.sleep(1)
        return data
    
    
    # 测试程序
    # Sample listing used for a manual smoke test.
    url='http://bj.xiaozhu.com/fangzi/1466098635.html'
    # Only hit the network when this file is run directly; the module is also
    # imported by other scripts, and an import must not trigger a page fetch.
    if __name__ == '__main__':
        getMainInformation(url)
    

    getPageHref.py

    函数功能:获取租房链接

    import requests
    from bs4 import BeautifulSoup
    import time
    def getPageHref(url, timeout=10):
        """Collect the listing links found on one search-result page.

        Args:
            url: Address of the search-result page to fetch.
            timeout: Seconds to wait for the server before ``requests``
                gives up; without it a stalled connection blocks forever.

        Returns:
            list of ``href`` strings taken from the result anchors, in page
            order. Anchors that carry no ``href`` are skipped so callers
            never receive ``None``.

        Raises:
            requests.exceptions.RequestException: if the download fails or
                times out.
        """
        html = requests.get(url, timeout=timeout)
        bsHtml = BeautifulSoup(html.text, 'lxml')

        # Each result is an <a class="resule_img_a"> inside the #page_list items.
        anchors = bsHtml.select('#page_list > ul > li > a[class="resule_img_a"]')
        hrefs = [href for href in (a.get('href') for a in anchors) if href]

        # Pause for one second after each page so consecutive calls are throttled.
        time.sleep(1)
        return hrefs
    
    
    # 测试程序
    # Sample search page used for a manual smoke test.
    url='http://bj.xiaozhu.com/search-duanzufang-p1-0/'
    # Only hit the network when this file is run directly; the module is also
    # imported by other scripts, and an import must not trigger a page fetch.
    if __name__ == '__main__':
        getPageHref(url)
    

    Main.py

    函数功能:筛选出房价大于等于400的租房信息

    import getPageHref
    import MainInformation
    import pymongo
    
    
    def main():
        """Print every stored listing whose price is at least 400.

        Connects to the MongoDB server on localhost:27017, reads all documents
        from the ``sheet_tab`` collection of the ``houseMess`` database and
        prints those whose ``price`` field parses to a number >= 400.
        Documents with a missing or non-numeric price (for example the
        ``'hello'`` placeholder) are skipped.
        """
        # Search-result pages 1 and 2; only needed by the crawl step below.
        urls=['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1,3)]
        client=pymongo.MongoClient('localhost',27017)
        try:
            houseMess=client['houseMess']
            sheet_tab=houseMess['sheet_tab']
            # Crawl-and-store step, left disabled as in the original listing;
            # re-enable it to (re)populate the collection before filtering.
            # for url in urls:
            #     urlss=getPageHref.getPageHref(url)
            #     for i in urlss:
            #         data=MainInformation.getMainInformation(i)
            #         sheet_tab.insert_one(data)

            # Keep the listings priced at 400 or more.
            # NOTE(security): the price is text scraped from a web page. It was
            # previously passed to eval(), which executes arbitrary expressions
            # and raises NameError on the 'hello' placeholder; it is now parsed
            # strictly as a number.
            house=[]
            for record in sheet_tab.find():
                try:
                    price=float(record['price'])
                except (KeyError, TypeError, ValueError):
                    continue
                if price>=400:
                    house.append(record)
            for record in house:
                print(record)
        finally:
            # Release the connection even if the query or printing fails.
            client.close()
    main()
    

    相关文章

      网友评论

          本文标题:Week2_Practice1

          本文链接:https://www.haomeiwen.com/subject/zukzrttx.html