[Python scraper] Scraping Fangtianxia (房天下) with aiohttp + asyncio and saving the data to MongoDB

Author: 程序里的小仙女 | Published 2020-06-19 17:48
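The script below is a small end-to-end example: aiohttp + asyncio fetch the second-hand-housing listing pages from hui.lianjia.com concurrently, lxml extracts each house title and total price with XPath, and Motor (the asynchronous MongoDB driver) writes the results into a local database.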
    # -*- coding: utf-8 -*-
    """
     @Time   : 2020/6/19 16:49
     @Author : LinXiao
     @Function:
    """
    # ------------------------------
    """
    2,940 records:
    Cost 3.7898309230804443 seconds
    """
    from asyncio import sleep
    from pprint import pprint
    from time import time

    import asyncio
    import aiohttp
    from fake_useragent import UserAgent
    from lxml import etree
    from motor.motor_asyncio import AsyncIOMotorClient

    url = 'https://hui.lianjia.com/ershoufang/'
    
    
    # Fetch the raw HTML of one page
    async def fetch_content(url):
        ua = UserAgent()

        headers = {
            'User-Agent': ua.random  # random User-Agent on every request
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url) as response:
                await sleep(3)  # must be awaited; a bare sleep(3) just creates a coroutine and does nothing
                res = await response.text()
                # pprint(res)  # debug: dump the raw HTML
                return res
    
    
    # Parse house titles and total prices out of the listing pages
    async def parse(url):
        xpath_house = "//div[@class='info clear']/div[@class='title']/a"
        xpath_house_price = "//div[@class='totalPrice']/span"
        # xpath_pages = '//*[@id="content"]/div/div[1]/div[2]/a'

        fetch_list = []
        result = []

        for p in range(1, 300):
            fetch_list.append(url + f'pg{p}/')  # e.g. https://hui.lianjia.com/ershoufang/pg1/

        # Fetch all 299 pages concurrently
        tasks = [fetch_content(page_url) for page_url in fetch_list]
        pages = await asyncio.gather(*tasks)

        for page in pages:
            html = etree.HTML(page)
            for element_house, element_house_price in zip(html.xpath(xpath_house), html.xpath(xpath_house_price)):
                result.append(f'{element_house.text}:{element_house_price.text}万')
        return result
    
    # Save the parsed results into MongoDB
    async def save(url):
        db_url = 'localhost'
        db_port = 27017
        db_name = "linxao1"
        db_collection = "lx21"

        # Establish the connection
        client = AsyncIOMotorClient(db_url, db_port)

        # Select the database and collection (db here is actually the lx21 collection)
        db = client[db_name][db_collection]

        result = await parse(url)
        docs = []
        for i, house in enumerate(result, 1):
            docs.append({"_id": i,
                         "context": house
                         })
        await db.insert_many(docs)
        print('inserted %d docs' % (len(result)))

        # Insert one extra test document
        document = {'key': 'value'}
        result = await db.test_collection.insert_one(document)
        print('result %s' % repr(result.inserted_id))
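    # Note: save() assigns sequential integers as _id, so running it twice
    # against the same collection raises a duplicate-key BulkWriteError;
    # drop the collection (or use auto-generated ObjectIds) before re-running.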
    
    async def do_insert():
        # Establish the connection
        client = AsyncIOMotorClient('localhost', 27017)

        # Select the linxiao database
        db = client.linxiao
        # lx is the lx collection under the linxiao database
        result = await db.lx.insert_many(
            [{'i': i} for i in range(20)])  # insert_many inserts one or more documents, organized as a list
        print('inserted %d docs' % (len(result.inserted_ids),))
    
    def main():
        loop = asyncio.get_event_loop()
        start = time()
        # loop.run_until_complete(parse(url))
        # loop.run_until_complete(fetch_content(url))
        loop.run_until_complete(save(url))
        end = time()
        print('Cost {} seconds'.format(end - start))
        loop.close()


    if __name__ == '__main__':
        main()
    

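One caveat about the script above: `asyncio.gather` fires all 299 page requests at once, and every call to `fetch_content` builds a fresh `UserAgent()` and a fresh `ClientSession`. Here is a minimal sketch of a gentler variant that shares one session and caps concurrency with an `asyncio.Semaphore`; the names `BASE`, `fetch_page`, `fetch_all`, `max_pages`, `limit`, and the fixed User-Agent string are illustrative, not part of the original code.

    import asyncio
    import aiohttp

    BASE = 'https://hui.lianjia.com/ershoufang/'  # same listing URL as above

    async def fetch_page(session, sem, page_url):
        # the semaphore caps how many requests are in flight at once
        async with sem:
            async with session.get(page_url) as response:
                return await response.text()

    async def fetch_all(max_pages=10, limit=5):
        sem = asyncio.Semaphore(limit)           # at most `limit` concurrent requests
        headers = {'User-Agent': 'Mozilla/5.0'}  # fixed UA for brevity; swap in fake_useragent if preferred
        async with aiohttp.ClientSession(headers=headers) as session:
            # one shared session: aiohttp reuses its connection pool across requests
            tasks = [fetch_page(session, sem, f'{BASE}pg{p}/')
                     for p in range(1, max_pages + 1)]
            return await asyncio.gather(*tasks)

    if __name__ == '__main__':
        pages = asyncio.get_event_loop().run_until_complete(fetch_all())
        print(len(pages), 'pages fetched')

Sharing a single session lets aiohttp reuse TCP connections, and the semaphore keeps at most `limit` requests in flight, which is usually friendlier to the target site than an unbounded burst followed by a fixed `sleep(3)` per request.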