
Python实战计划: Week 2 Study Notes

Author: 飞飞幻想 | Published 2016-05-30 01:21

    Filtering listings in MongoDB

    import pymongo
    from bs4 import BeautifulSoup
    import requests
    import time
    
    def get_seed(url='http://bj.xiaozhu.com/', page=1):
        return url if page <= 1 else '{}search-duanzufang-p{}-0/'.format(url, page)
    
    def parse_fangzi(url, data=None):
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        title = soup.select('h4 > em')  # title
        address = soup.select('p > span.pr5')  # address
        price = soup.select('#pricePart > div.day_l > span')  # nightly price
        head = soup.select('#floatRightBox > div > div > a > img')  # host avatar
        name = soup.select('#floatRightBox > div > div > h6 > a')  # host nickname
        gender = soup.select('#floatRightBox > div > div.member_pic > div')  # host gender
        image = soup.select('#curBigImage')  # first photo of the listing
        # soup.select always returns a list, so test for emptiness rather than None
        data = {
            'title': title[0].get_text() if len(title) > 0 else None
            , 'address': address[0].get_text().strip() if len(address) > 0 else None
            , 'price': int(price[0].get_text()) if len(price) > 0 else None
            , 'head': head[0].get('src') if len(head) > 0 else None
            , 'name': name[0].get_text() if len(name) > 0 else None
            , 'gender': gender[0].get('class') if len(gender) > 0 else None
            , 'image': image[0].get('src') if len(image) > 0 else None
        }
        # the gender div's class list contains 'member_ico' only for male hosts
        if data['gender'] is not None:
            data['gender'] = '男' if 'member_ico' in data['gender'] else '女'
        return data
    
    def save(sheet):
        seed = 'http://bj.xiaozhu.com/'
        urls = []
        for page in range(1, 4):
            wb_data = requests.get(get_seed(url=seed, page=page))
            soup = BeautifulSoup(wb_data.text, 'lxml')
            for div in soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname'):
                urls.append(div.get('detailurl'))
    
        for url in urls:
            sheet.insert_one(parse_fangzi(url))
            time.sleep(2)
    
    client = pymongo.MongoClient('localhost',27017)
    walden = client['walden']
    xiaozhu = walden['xiaozhu']
    
    # save(xiaozhu)
    for item in xiaozhu.find({'price':{'$gte':500}}):
        print(item)
    
    [Screenshot: 屏幕快照 2016-05-30 上午12.37.33.png]
    • Learned how to work with a MongoDB database from Python (a short sketch follows this list)
    • Learned Python's conditional ("ternary") expressions
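    A minimal sketch tying both notes together, assuming the same walden/xiaozhu collection populated above; the 300–800 price range and the labels are purely illustrative:

    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    xiaozhu = client['walden']['xiaozhu']

    # Python's conditional expression: <a> if <condition> else <b>
    def price_label(price):
        return 'unknown' if price is None else ('expensive' if price >= 500 else 'cheap')

    # $gte / $lt combine into a range filter; sort() orders the cursor by price, descending
    for item in xiaozhu.find({'price': {'$gte': 300, '$lt': 800}}).sort('price', pymongo.DESCENDING):
        print(price_label(item['price']), item['title'], item['price'])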

    Scraping phone numbers

    import pymongo
    from bs4 import BeautifulSoup
    import requests
    import time
    
    count = 0
    client = pymongo.MongoClient('localhost', 27017)
    walden = client['walden']
    shoujihao = walden['shoujihao']
    
    for page in range(1, 2):
        url = 'http://bj.58.com/shoujihao/pn{}/'.format(page)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        infocont = soup.select('#infocont > span')
        infocont = int(infocont[0].get_text()) if len(infocont) != 0 else 0
        # if the result counter cannot be found, stop right away
        if infocont == 0:
            print('共插入 {} 条, 终止于 {}'.format(count, url))
            break
        phones = soup.select('ul > div.boxlist > ul > li > a.t')
        for phone in phones:
            data = {
                'href': phone.get('href')
                , 'title': phone.find('strong', 'number').get_text()
            }
            shoujihao.insert_one(data)
        count += len(phones)
        print('{} -> {} : {}/{}'.format(url,len(phones),count,infocont))
        time.sleep(2)
    
    [Screenshot: 屏幕快照 2016-05-30 上午12.43.40.png]

    Designing a resumable (checkpointed) crawler

    channel_extract.py

    from bs4 import BeautifulSoup
    import requests
    import re
    
    
    # extract every category URL under 58.com Beijing's flea market, except phone numbers (shoujihao)
    def get_channel_urls(url):
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        base = re.search(r'^(https?://[^/]+).*$', url).group(1)
        urls = []
        links = soup.select('ul.ym-submnu > li > b > a')
        for link in links:
            href = link.get('href')
            if href.startswith('/'):
                href = '{}{}'.format(base, href)
            if href not in urls and 'shoujihao' not in href:
                urls.append(href)
        return urls
    

    counts.py

    from bs4 import BeautifulSoup
    import requests
    import time
    import pymongo
    
    client = pymongo.MongoClient('localhost', 27017)
    walden = client['walden']
    _58_channels = walden['_58_channels']
    _58_urls = walden['_58_urls']
    _58_infos = walden['_58_infos']
    while True:
        print('{}/{}/{}'.format(_58_channels.count(), _58_urls.count(), _58_infos.count()))
        time.sleep(5)
    

    main.py

    from multiprocessing import Pool
    from channel_extract import get_channel_urls
    from page_parsing import get_item_info
    from page_parsing import get_links_from
    import time
    
    import pymongo
    
    client = pymongo.MongoClient('localhost', 27017)
    walden = client['walden']
    _58_channels = walden['_58_channels']
    _58_urls = walden['_58_urls']
    
    
    def get_all_links_from(channel):
        if _58_channels.count({'channel': channel}) > 0:
            return
        count = 0
        for page in range(1, 101):
            links = get_links_from(channel, page, 0)
            time.sleep(0.1)
            if links <= 0:
                break
            count += links
        print('{} -> {}'.format(channel, count))
        _58_channels.insert_one({'channel': channel, 'count': count})
    
    
    def get_info(url):
        get_item_info(url)
        time.sleep(0.1)
    
    
    if __name__ == '__main__':
        channels = get_channel_urls('http://bj.58.com/sale.shtml')
        print(len(channels))
        # step 1: parse the channel/category pages to collect item URLs
        pool = Pool()
        pool.map(get_all_links_from, channels)
        pool.close()
        pool.join()
        # optional: reset already-crawled URLs (flag > 0) back to flag=0 to re-crawl everything
        # _58_urls.update_many({'flag': {'$gt': 0}}, {'$set': {'flag': 0}})
        urls = list(map(lambda url: url['url'], _58_urls.find({'flag': {'$eq': 0}})))
        pool = Pool()
        pool.map(get_info, urls)
        pool.close()
        pool.join()
    

    page_parsing.py

    from bs4 import BeautifulSoup
    import requests
    import time
    import pymongo
    import sys
    
    client = pymongo.MongoClient('localhost', 27017)
    walden = client['walden']
    _58_urls = walden['_58_urls']
    _58_infos = walden['_58_infos']
    
    
    def get_links_from(channel, page, who_sells=0):
        # http://bj.58.com/iphonesj/0/pn2/
        list_url = '{}{}/pn{}'.format(channel, who_sells, page)
        wb_data = requests.get(list_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # #infocont > span > b
        infocont = soup.select('#infocont > span > b')
        infocont = int(infocont[0].get_text()) if len(infocont) > 0 else 0
        print('{} -> {}'.format(list_url, infocont))
        if infocont <= 0:
            return 0
        links = soup.select('table > tr > td > a.t')
        for link in links:
            if not link.has_attr('onclick') and not link.has_attr('data-addtype'):
                item_link = link.get('href').split('?')[0]
                data = {
                    'url': item_link
                    , 'flag': 0
                }
                if _58_urls.find({'url': {'$eq': data['url']}}).count() == 0:
                    _58_urls.insert_one(data)
        return len(links)
    
    
    def get_item_info(url):
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # detect removed listings: the 404 page pulls in a script whose src contains '404'
        # (use Tag.get('src'); Tag.src would look up a child tag, not the attribute)
        res = list(filter(lambda a: a.get('src') is not None and '404' in a.get('src'), soup.find_all('script')))
        if len(res) > 0:
            return
        title = soup.select('div.col_sub.mainTitle > h1')
        price = soup.select('span.price.c_f50')
        date = soup.select('li.time')
        area = soup.select('li > div.su_con > span.c_25d')
        # drop the '借钱买' (loan-ad) element that the selector also matches
        area = list(filter(lambda a: '借钱买' not in a.get_text(), area))
        try:
            data = {
                'url': url
                , 'title': title[0].get_text().strip() if len(title) > 0 else None
                , 'price': price[0].get_text().strip().strip('元').strip() if len(price) > 0 else None
                , 'date': date[0].get_text().strip() if len(date) > 0 else None
                , 'area': ''.join(area[0].stripped_strings) if len(area) > 0 else None
            }
        except:
            print('{} -> 异常'.format(url))
            print(sys.exc_info())
        else:
            print('{} -> {}'.format(url, data))
            record = _58_infos.find_one({'url': {'$eq': url}})
            if record == None:
                _58_infos.insert_one(data)
            else:
                _58_infos.update({'_id': record['_id']}, data)
            _58_urls.find_and_modify(query={'url': url}, update={'$inc': {'flag': 1}})
    
    # get_links_from('http://bj.58.com/iphonesj/', 1)
    # get_item_info('http://bj.58.com/zixingche/26131404258880x.shtml')
    
    [Screenshot: 屏幕快照 2016-05-30 上午1.09.55.png]
    • The main idea behind resuming after an interruption: while parsing a channel, store every discovered URL in MongoDB with a marker field flag=0; after a detail page has been parsed, increment that URL's flag; on each run, only URLs with flag=0 are picked up for crawling
    • Also noticed that when the multiprocessing pool starts, MongoDB (PyMongo) emits a warning (judging from the message, several MongoClient connections are opened at the same time; a possible fix is sketched after the quoted warning):
    UserWarning: MongoClient opened before fork. Create MongoClient with connect=False, or create client after forking. See PyMongo's documentation for details: http://api.mongodb.org/python/current/faq.html#using-pymongo-with-multiprocessing>
      "MongoClient opened before fork. Create MongoClient "
    

    Scraping 100,000 product listings

    channel_extract.py: channel extraction

    from bs4 import BeautifulSoup
    import requests
    import re
    
    
    # collect every second-level category under Ganji (ganji.com) Beijing
    def get_channel_urls(url):
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        base = re.search(r'^(https?://[^/]+).*$', url).group(1)
        urls = []
        links = soup.select('dl.fenlei dt a')
        for link in links:
            href = link.get('href')
            if href.startswith('/'):
                href = '{}{}'.format(base, href)
            # note: Zhuanzhuan (转转) listings are filtered out later, in page_parsing
            if href not in urls:
                urls.append(href)
        return urls
    
    # #wrapper > div.content > div:nth-child(1) > div:nth-child(1) > dl > dt > a:nth-child(1)
    # channels = get_channel_urls('http://bj.ganji.com/wu/')
    # print('{}\n{}'.format(len(channels),'\n'.join(channels)))
    
    

    counts.py: a small monitoring script

    import time
    import pymongo
    
    client = pymongo.MongoClient('localhost', 27017)
    walden = client['walden']
    _ganji_channels = walden['_ganji_channels']
    _ganji_urls = walden['_ganji_urls']
    _ganji_infos = walden['_ganji_infos']
    
    while True:
        print('{}/{}/{}'.format(_ganji_channels.count(), _ganji_urls.count(), _ganji_infos.count()))
        time.sleep(5)
    
    

    main.py: the main program

    from multiprocessing import Pool
    from channel_extract import get_channel_urls
    from page_parsing import get_item_info
    from page_parsing import get_links_from
    
    import pymongo
    
    client = pymongo.MongoClient('localhost', 27017)
    walden = client['walden']
    _ganji_channels = walden['_ganji_channels']
    _ganji_urls = walden['_ganji_urls']
    
    
    def get_all_links_from(channel):
        if _ganji_channels.count({'channel': channel}) > 0:
            return
        count = 0
        for page in range(1, 201):
            links = get_links_from(channel, page)
            if links <= 0:
                continue
            count += links
        print('{} -> {}'.format(channel, count))
        _ganji_channels.insert_one({'channel': channel, 'count': count})
    
    
    def get_info(url):
        get_item_info(url)
    
    
    # _ganji_urls.drop()
    # _ganji_urls.find_and_modify(query={'flag': {'$eq': 0}}, update={'$set': {'flag': 0}})
    if __name__ == '__main__':
        _ganji_channels.drop()
        channels = get_channel_urls('http://bj.ganji.com/wu/')
        print(len(channels))
        # step 1: parse the channel/category pages to collect item URLs
        pool = Pool()
        pool.map(get_all_links_from, channels)
        pool.close()
        pool.join()
        urls = list(map(lambda url: url['url'], _ganji_urls.find({'flag': {'$eq': 0}})))
        pool = Pool()
        pool.map(get_info, urls)
        pool.close()
        pool.join()
    
    

    page_parsing.py: the parsing module, used to parse both product list pages and product detail pages

    from bs4 import BeautifulSoup
    import requests
    import pymongo
    import sys
    import random
    
    client = pymongo.MongoClient('localhost', 27017)
    walden = client['walden']
    _ganji_urls = walden['_ganji_urls']
    _ganji_infos = walden['_ganji_infos']
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
        'Connection': 'keep-alive'
    }
    # proxies taken from http://cn-proxy.com/
    proxy_list = [
        '101.96.11.47:80'
        , '101.96.11.43:80'
        , '101.96.11.42:80'
        , '101.96.11.44:80'
        , '112.5.220.199:80'
        , '111.13.109.56:8080'
    ]
    # pick one proxy IP at random (chosen once per process, at import time)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    
    
    def get_links_from(channel, page, who_sells=1):
        # http://bj.ganji.com/jiaju/a1o1/
        list_url = '{}a{}o{}/'.format(channel, who_sells, page)
        wb_data = requests.get(list_url, headers=headers, proxies=proxies)
        if wb_data.status_code != 200:  # page does not exist: return -1
            return -1
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if len(soup.select('ul.pageLink')) == 0:  # no pagination block at the bottom: past the last page
            print('{} -> {} 结束'.format(list_url, soup.select('ul.pageLink')))
            return 0
        links = soup.select('dl.list-bigpic > dt > a')
        for link in links:
            data = {
                'url': link.get('href')
                , 'flag': 0
            }
            # skip Zhuanzhuan (转转) listings
            if 'zhuanzhuan' not in data['url'] and _ganji_urls.find({'url': {'$eq': data['url']}}).count() == 0:
                _ganji_urls.insert_one(data)
        return len(links)
    
    
    def get_item_info(url):
        wb_data = requests.get(url, headers=headers, proxies=proxies)
        # wb_data = requests.get(url, headers=headers)
        if wb_data.status_code != 200:  # page no longer exists: skip it
            return
        soup = BeautifulSoup(wb_data.text, 'lxml')
        title = soup.select('h1.title-name')
        date = soup.select('i.pr-5')
        types = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')
        price = soup.select('ul.det-infor > li > i.f22')
        areas = soup.select('ul.det-infor > li:nth-of-type(3) > a')
        newer = soup.select('ul.second-det-infor > li:nth-of-type(1)')
        try:
            data = {
                'url': url
                , 'title': title[0].get_text().strip()
                , 'date': date[0].get_text().strip().strip('发布').strip() if len(date) > 0 else None
                , 'type': [type.text.strip() for type in types]
                , 'price': price[0].get_text().strip() if len(price) > 0 else None
                , 'area': [area.text.strip() for area in areas if area.text.strip() != "-"]
                , 'newer': '{}{}'.format(newer[0].find('label').get_text(),
                                         newer[0].contents[1].strip()) if len(newer) > 0 else None
            }
        except:
            print(
                '异常 : {} -> title={},date={},types={},price={},areas={},newer={}'.format(url, title, date, types, price,
                                                                                         areas, newer))
            print(sys.exc_info())
        else:
            print('{} -> {}'.format(url, data))
            record = _ganji_infos.find_one({'url': {'$eq': url}})
            if record == None:
                _ganji_infos.insert_one(data)
            else:
                _ganji_infos.update({'_id': record['_id']}, data)
            _ganji_urls.find_and_modify(query={'url': url}, update={'$inc': {'flag': 1}})
    
    # print(get_links_from('http://bj.ganji.com/ershoubijibendiannao/', 1))
    # get_item_info('http://bj.ganji.com/bangong/2136884202x.htm')
    
    
    [Screenshot: 屏幕快照 2016-05-30 上午1.17.13.png]
    • In total, about 20 categories and more than 50,000 product records were scraped
    • Noticed that some pages that are reachable directly from my machine cannot be reached through the proxies (a small fallback sketch follows this list)
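    One pragmatic workaround is to try the proxy first and fall back to a direct request when the proxy fails. This is only a sketch under that assumption; fetch_with_fallback is a hypothetical helper, not part of the original scripts:

    import requests

    def fetch_with_fallback(url, headers=None, proxies=None, timeout=10):
        # try through the proxy first; on a connection error or a non-200 response,
        # retry the same URL without the proxy
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        return requests.get(url, headers=headers, timeout=timeout)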
