2-2homework


Author: OldSix1987 | 2016-08-16 22:31

    Result


    Result.png

    Code


    from bs4 import BeautifulSoup
    import requests
    import time
    import pymongo
    
    # MongoDB setup: database 'tongcheng', one collection for the listing
    # links ('shoujihao') and one for the scraped item details ('infos')
    client = pymongo.MongoClient('localhost', 27017)
    tongcheng = client['tongcheng']
    shoujihao = tongcheng['shoujihao']
    infos = tongcheng['infos']
    
    def get_links_from(page):
        # Fetch one listing page of the shoujihao (phone number) channel
        url = 'http://bj.58.com/shoujihao/pn{}'.format(page)
        wb_data = requests.get(url)
        time.sleep(2)  # throttle requests to stay polite
    
        soup = BeautifulSoup(wb_data.text, 'lxml')
        handle_soup(soup)
        # handle_soup2(soup)  # alternative last-page check, see the Summary below
    
    
    def handle_soup2(soup):
        # Last-page check, variant 1: the result counter '#infocont > span > b' reads 0
        pageinfo = soup.select('#infocont > span > b')
        if int(pageinfo[0].text) == 0:
            print('last page')
        else:
            get_data(soup)
    
    
    def handle_soup(soup):
        # Last-page check, variant 2: a normal page still has a div.boxlist inside #infolist
        if soup.select_one('#infolist > div > ul').find('div', 'boxlist'):
            get_data(soup)
        else:
            print("It's the last page of shoujihao.")
    
    
    def get_data(soup):
        # Pull the phone number (title) and detail-page link out of each listing row
        titles = soup.select('div.boxlist > ul > li > a > strong.number')
        links = soup.select('div.boxlist > ul > li > a.t')
        for title, link in zip(titles, links):
            data = {
                'title': title.text,
                'link': link.get('href')
            }
            shoujihao.insert_one(data)
    
    
    def isexist(soup):
        # A dead 58 detail page loads a redirect script whose src path contains '404'
        js_src = soup.find('script', type='text/javascript')
    
        if js_src and js_src.get('src'):
            return '404' in js_src.get('src').split('/')
        else:
            return None
    
    def get_item_info(url):
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        no_longer_exist = isexist(soup)
        if no_longer_exist:
            print('not exist')
        else:
            prices = soup.select('#main > div.col.detailPrimary.mb15 > div.col_sub.sumary > ul > li > div.su_con > span.price')
    
            for price in prices:
                data = {
                    'title': soup.title.text,
                    # strip line breaks, tabs and spaces from the raw price text
                    'price': price.text.replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '')
                }
                infos.insert_one(data)
    
    if __name__ == '__main__':
        # Step 1: collect detail links from listing pages 1..1000
        for page in range(1, 1001):
            get_links_from(page)
    
        # Step 2: visit each stored link and scrape the item details
        for item in shoujihao.find():
            get_item_info(item['link'])
    
    

    Summary


    1. Checking boundary conditions

    # Detail pages: checking whether a link still exists (404 detection).
    # Typically the dead page loads a redirect script whose src contains a 404 URL.
    # The exact pattern is site-specific; this example is only a reference.
    
    def isexist(soup):
        js_src = soup.find('script', type='text/javascript')
    
        if js_src and js_src.get('src'):
            return '404' in js_src.get('src').split('/')
        else:
            return None
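    An alternative not used in this post: when a server answers dead links with a
    real HTTP 404 status (rather than a "soft" redirect page like 58's), the check
    needs no HTML parsing at all. A minimal sketch, assuming true 404 responses:
    
    import requests
    
    def is_gone(url):
        # Hypothetical helper: relies on the HTTP status code instead of
        # scanning <script> tags; only works if the server sends a real 404
        resp = requests.get(url, timeout=10)
        return resp.status_code == 404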
    
    

    # last_page: force an oversized page number to probe how the site behaves at the
    # boundary: a real 404, or a specially rendered "empty" page? In the latter case,
    # compare lastPage with normalPage and branch on the difference.
    # (A probe sketch follows the two solutions below.)
    
    # solution 1
    def handle_soup2(soup):
        pageinfo = soup.select('#infocont > span > b')
        if int(pageinfo[0].text) == 0:
            print('last page')
        else:
            get_data(soup)
    
    # solution 2
    def handle_soup(soup):
        if soup.select_one('#infolist > div > ul').find('div', 'boxlist'):
            get_data(soup)
        else:
            print("It's the last page of shoujihao.")
    
    

    Solution 1: based on a difference in the page structure

    strcuture.png

    Solution 2: based on a difference in what the page visibly shows

    normal_page.png last_page.png
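    
    The oversized-page probe mentioned above can be as small as this (a sketch;
    100000 is just an arbitrary, deliberately out-of-range page number):
    
    import requests
    
    # Request a page far past the end and inspect what comes back
    resp = requests.get('http://bj.58.com/shoujihao/pn100000')
    print(resp.status_code)   # a real 404, or 200 with a special "empty" page?
    print(len(resp.text))     # a much shorter body often hints at the special page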

    2. Monitoring database writes (just one idea)

    import time
    from page_parsing import url_list
    
    # Print the collection's document count every 5 seconds while the crawler runs
    while True:
        print(url_list.find().count())
        time.sleep(5)
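    
    Note that Cursor.count() was removed from newer pymongo releases; an equivalent
    sketch for pymongo 3.7+, assuming the same tongcheng database as above:
    
    import time
    import pymongo
    
    client = pymongo.MongoClient('localhost', 27017)
    url_list = client['tongcheng']['shoujihao']
    
    while True:
        # count_documents({}) is exact; estimated_document_count() is cheaper on large collections
        print(url_list.count_documents({}))
        time.sleep(5)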
    

    3. Writing with multiple processes

    from multiprocessing import Pool
    
    if __name__ == '__main__':
        pool = Pool()
        # pool = Pool(processes=6)  # controls how many worker processes are started
        pool.map(get_all_links_from, channel_list.split())
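    
    One caveat the snippet glosses over: pymongo's MongoClient is not fork-safe, so
    each worker should open its own connection after the fork. A sketch of the idea
    (the worker body and the channel list here are placeholders, not the post's code):
    
    from multiprocessing import Pool
    import pymongo
    
    def get_all_links_from(channel):
        # Create the client inside the worker, i.e. after the fork
        client = pymongo.MongoClient('localhost', 27017)
        shoujihao = client['tongcheng']['shoujihao']
        # ... crawl `channel` and shoujihao.insert_one(...) as in the main script ...
    
    if __name__ == '__main__':
        with Pool(processes=4) as pool:
            pool.map(get_all_links_from, ['http://bj.58.com/shoujihao/'])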
    

    4. Formatting multiple arguments

    # str() is redundant here: format() converts its arguments on its own
    list_view = '{}{}/pn{}/'.format('http://bj.58.com/shoujihao/', who_sells, page)
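    
    On Python 3.6 and later the same URL reads more naturally as an f-string (a
    sketch; who_sells and page are the same variables as above):
    
    list_view = f'http://bj.58.com/shoujihao/{who_sells}/pn{page}/'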
    

    5. soup.select() returns a list

    pageinfo = soup.select('#infocont > span > b')
    # using pageinfo.text directly here raises an error, because select()
    # returns a list; the element has to be indexed first
    if int(pageinfo[0].text) == 0:
        print('last page')
    else:
        pass
    
    b标签.png
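    
    Since select() can also come back empty, indexing [0] without a guard may raise
    an IndexError; a defensive variant (sketch):
    
    pageinfo = soup.select('#infocont > span > b')
    # check the list is non-empty before indexing into it
    if pageinfo and int(pageinfo[0].text) == 0:
        print('last page')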

    6. soup.select_one() returns only the first matching element

    soup.select_one('#infolist > div > ul').find('div', 'boxlist')
    
    # find() calls can be chained one after another, a bit like chained replace() calls
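    
    The flip side of chaining: select_one() returns None when nothing matches, so a
    long chain can raise an AttributeError. A guarded version (sketch):
    
    ul = soup.select_one('#infolist > div > ul')
    if ul is not None:
        # find('div', 'boxlist') matches a <div class="boxlist"> element
        box = ul.find('div', 'boxlist')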
    

    My grasp of BeautifulSoup and Requests is still too shallow; I need to spend more time with the documentation.
