美文网首页
第一周第三课时

第一周第三课时

作者: 采矿 | 来源:发表于2016-05-21 17:38 被阅读21次
    抓取的详情页链接、详情页的详细信息
    from bs4 import BeautifulSoup
    import requests
    import time
    # URLs of the search-result pages, page numbers 1 through 13.
    sourceurls = [
        'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page)
        for page in range(1, 14)
    ]
    # Detail-page links gathered by get_detail_url.
    detail_urls = []
    # One dict per scraped detail page, appended by get_dtail_info.
    detailku = []
    
    
    def get_detail_url(urls, timeout=10):
        """Collect detail-page links from one search-result page.

        Fetches the page at ``urls`` (a single URL string, despite the plural
        name), appends the ``href`` of every anchor matched by
        ``#page_list > ul > li > a`` to the module-level ``detail_urls`` list,
        then prints the accumulated list and its length.

        Args:
            urls: URL of one search-result page.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` can block indefinitely.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        web_data = requests.get(urls, timeout=timeout)
        # Pause after each fetch so consecutive requests are spaced out.
        time.sleep(2)
        soup = BeautifulSoup(web_data.text, 'lxml')
        for anchor in soup.select('#page_list > ul > li > a'):
            detail_url = anchor.get('href')
            # An anchor without an href would otherwise put None in the list
            # and make the later requests.get(None) call fail.
            if detail_url:
                detail_urls.append(detail_url)
        print(detail_urls, len(detail_urls))
    # sourceurls is a list, so each URL has to be taken out one at a time and
    # passed to the function.
    for single_url in sourceurls:
        get_detail_url(single_url)
    
    
    def get_dtail_info(url, timeout=10):
        """Scrape one detail page and store its fields in ``detailku``.

        Fetches ``url``, takes the first element matched by each CSS selector
        (title, area, daily price, house picture, landlord picture, landlord
        name), builds a dict from them, appends it to the module-level
        ``detailku`` list and prints the dict with the running record count.
        If any selector matches nothing, no record is stored for the page.

        Args:
            url: URL of one detail page.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` can block indefinitely.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        web_data = requests.get(url, timeout=timeout)
        # Pause after each fetch so consecutive requests are spaced out.
        time.sleep(1)
        soup = BeautifulSoup(web_data.text, 'lxml')

        matches = {
            'title': soup.select('h4 > em'),
            'area': soup.select('span.pr5'),
            'day_price': soup.select('div.day_l > span'),
            'house_pic': soup.select('#curBigImage'),
            'landlord_pic': soup.select('div.member_pic > a > img'),
            'landlord_name': soup.select('a.lorder_name'),
        }
        # The earlier version zipped these lists together with a one-character
        # gender string, which implicitly limited the result to the first
        # match of each selector (or nothing when a selector came up empty).
        # The same rule is applied here explicitly.
        if not all(matches.values()):
            return
        first = {field: found[0] for field, found in matches.items()}

        # The code treats the presence of a <div class="member_ico"> as the
        # marker for a male landlord; anything else is recorded as female.
        # NOTE(review): confirm this still matches the site's markup.
        landlord_gender = '男' if soup.find_all('div', 'member_ico') else '女'

        data = {
            'title': first['title'].get_text(),
            'area': first['area'].get_text(),
            'day_price': first['day_price'].get_text(),
            'house_pic': first['house_pic'].get('src'),
            'landlord_pic': first['landlord_pic'].get('src'),
            'landlord_gender': landlord_gender,
            'landlord_name': first['landlord_name'].get_text(),
        }
        detailku.append(data)
        print(data, len(detailku))
    # Visit every collected detail-page link and scrape its fields.
    for detail_sinngle_url in detail_urls:
        get_dtail_info(detail_sinngle_url)
    

    相关文章

      网友评论

          本文标题:第一周第三课时

          本文链接:https://www.haomeiwen.com/subject/hbhorttx.html