美文网首页
python爬虫:爬取房源数据

python爬虫:爬取房源数据

作者: 倪大头 | 来源:发表于2018-01-28 16:32 被阅读381次
    from bs4 import BeautifulSoup
    import requests
    import time
    import pymongo
    
    # Connection to the MongoDB server on this machine; scraped listings are
    # stored in the "sheet_tab" collection of the "walden" database.
    client = pymongo.MongoClient(host='localhost', port=27017)
    walden = client.walden
    sheet_tab = walden.sheet_tab
    
    # URL of one individual listing page (not referenced by the code shown here).
    url = 'https://m.lianjia.com/bj/zufang/101102453003.html'
    # URLs of list pages 1 and 2 (the upper bound of range is exclusive).
    urls = list(map('https://m.lianjia.com/bj/zufang/pg{}'.format, range(1, 3)))
    
    def get_houses(url, data=None):
        """Download one listing page and store every listing found in MongoDB.

        Args:
            url: Address of the list page to fetch.
            data: Unused; kept only so callers that pass it keep working.

        Returns:
            None. Each listing is inserted into ``sheet_tab`` as a document
            with the keys ``title``, ``img``, ``add`` and ``price``.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                does not complete within the timeout.
        """
        # Without a timeout a stalled connection would block the scraper forever.
        wb_data = requests.get(url, timeout=10)
    
        time.sleep(2)  # throttle requests so the site's anti-scraping limits are not hit
    
        if not wb_data.ok:
            # An error response carries no listings; skip it instead of parsing it.
            return
    
        soup = BeautifulSoup(wb_data.text, 'lxml')
    
        titles = soup.select('div.item_list > div.item_main')
        adds = soup.select('div.item_list > div.item_other.text_cut')
        prices = soup.select('div.item_list > div.item_minor > span > em')
        imgs = soup.select('div.mod_media > div > img')
    
        # The four lists are paired by position and zip stops at the shortest one.
        # NOTE(review): if a listing lacks one of these elements, the fields of
        # later listings would be paired with the wrong neighbours - confirm
        # against the page markup.
        for title, img, add, price in zip(titles, imgs, adds, prices):
            # A separate local name avoids shadowing the ``data`` parameter.
            record = {
                'title': title.get_text(),
                'img': img.get('origin-src'),
                'add': add.get_text(),
                'price': price.get_text(),
            }
            sheet_tab.insert_one(record)
    
    # Scrape each list page in turn; get_houses writes the listings to MongoDB.
    for single_url in urls:
        get_houses(single_url)
    
    # Print every stored record whose price is greater than 5000
    # for info in sheet_tab.find():
    #     if int(info['price']) > 5000:
    #         print(info)
    

    相关文章

      网友评论

          本文标题:python爬虫:爬取房源数据

          本文链接:https://www.haomeiwen.com/subject/tuuvaxtx.html