美文网首页
爬取链家租房信息

爬取链家租房信息

作者: 王小坤_GO | 来源:发表于2016-11-19 16:35 被阅读0次

    frombs4importBeautifulSoup

    importrequests

    #准备网络连接

    #pc端

    urls=['http://bj.lianjia.com/zufang/pg{}/'.format(str(i))foriinrange(1,101)]

    #手机端

    murls=['http://m.lianjia.com/bj/zufang/pg{}'.format(str(i))foriinrange(1,101)]

    #爬取PC端的数据

    defpachongpc(url):

    web_date = requests.get(url)

    web_date.encoding ='utf-8'

    soup = BeautifulSoup(web_date.text,'lxml')

    names = soup.select('#house-lst > li > div.info-panel > h2 > a')

    adrs = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > a > span')

    styles = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > span.zone > span')

    areas = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > span.meters')

    prices = soup.select('#house-lst > li > div.info-panel > div.col-3 > div.price > span')

    times = soup.select('#house-lst > li > div.info-panel > div.col-3 > div.price-pre')

    imgs = soup.select('#house-lst > li > div.pic-panel > a > img')

    data = []

    # print(len(names),len(adrs),len(styles),len(areas),len(prices),len(times),len(imgs))

    forname, adr, style, area, price, time, imginzip(names, adrs, styles, areas, prices, times, imgs):

    info = {

    #'name': name.get_text().replace(u'\xa0\xa0', u'').split(' ')[0],

    'adr': adr.get_text().replace(u'\xa0\xa0',u''),

    'style': style.get_text().replace(u'\xa0\xa0',u''),

    'area': area.get_text().replace(u'\xa0\xa0',u''),

    'price': price.get_text().replace(u'\xa0\xa0',u''),

    'time': time.get_text().replace(u'\xa0\xa0',u''),

    'img': img.get('src').replace(u'\xa0\xa0',u'')

    }

    data.append(info)

    print(data)

    #爬取mobile端的数据

    defpachongmo(url):

    web_date = requests.get(url)

    web_date.encoding ='utf-8'

    soup = BeautifulSoup(web_date.text,'lxml')

    names = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_main.text_cut')

    adrs = soup.select(' section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_other')

    styles = soup.select(' section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_minor > div.info')

    prices = soup.select('div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_minor > div.price_total.q_rentprice')

    cate = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.tag_box')

    imgs = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.mod_media > div > img')

    data = []

    # print(len(names),len(adrs),len(styles),len(areas),len(prices),len(times),len(imgs))

    forname, adr, style, price, cate, imginzip(names, adrs, styles, prices, cate, imgs):

    info = {

    #'name': name.get_text().replace(u'\xa0\xa0', u'').split(' ')[0],

    'adr': adr.get_text().replace(u'\xa0\xa0',u''),

    'style': style.get_text().replace(u'\xa0\xa0',u''),

    'price': price.get_text().replace(u'\xa0\xa0',u''),

    'cate':list(cate.stripped_strings),

    'img': img.get('src').replace(u'\xa0\xa0',u'')

    }

    data.append(info)

    print(data)

    if__name__ =="__main__":

    forurlinurls:

    pachongpc(url)

    formurlinmurls:

    pachongmo(murl)

    注:可以使用标签加方括号(属性选择器)的方式来定位到某一个特定的标签,例如 `div > div.property_title > a[target="_blank"]`

    相关文章

      网友评论

          本文标题:爬取链家租房信息

          本文链接:https://www.haomeiwen.com/subject/ufkepttx.html