Coroutine Concurrent Crawler

Author: 大卫同學 | Published 2018-12-10 00:05

    Tags: tornado, crawler

This post walks through two Tornado coroutine demos: a small breadth-first crawler of the Tornado docs site driven by tornado.queues.Queue, and a benchmark comparing blocking requests calls against concurrent AsyncHTTPClient fetches.


    from urllib.parse import urljoin
    
    from bs4 import BeautifulSoup
    from tornado import gen, httpclient, ioloop, queues
    
    base_url = "http://www.tornadoweb.org/en/stable/"
    concurrency = 20
    
    async def get_url_links(url):
        # fetch one page and return all of its links as absolute URLs
        response = await httpclient.AsyncHTTPClient().fetch(url)
        html = response.body.decode("utf8")
        soup = BeautifulSoup(html, "html.parser")
        links = [urljoin(base_url, a.get("href")) for a in soup.find_all("a", href=True)]
        return links
    
    async def main():
        seen_set = set()
        q = queues.Queue()
    
        async def fetch_url(current_url):
            # producer: mark the URL as seen, fetch it, enqueue its links
            if current_url in seen_set:
                return
    
            print("fetching: {}".format(current_url))
            seen_set.add(current_url)
            next_urls = await get_url_links(current_url)
            for new_url in next_urls:
                if new_url.startswith(base_url):
                    # follow only links that stay within the target site
                    await q.put(new_url)
    
        async def worker():
            # consumer: pull URLs off the queue until a None sentinel arrives
            async for url in q:
                if url is None:
                    return
                try:
                    await fetch_url(url)
                except Exception as e:
                    print("exception: {}".format(e))
                finally:
                    q.task_done()
    
        # seed the queue with the start URL
        await q.put(base_url)
    
        # start the worker coroutines
        workers = gen.multi([worker() for _ in range(concurrency)])
        await q.join()
    
        # wake each worker with a None sentinel so it can exit
        for _ in range(concurrency):
            await q.put(None)
    
        await workers
    
    
    if __name__ == "__main__":
        import asyncio
        asyncio.get_event_loop().run_until_complete(main())
        # equivalent, using Tornado's own event-loop wrapper:
        # io_loop = ioloop.IOLoop.current()
        # io_loop.run_sync(main)
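
The queue pattern above is not Tornado-specific. As a point of comparison, here is a minimal sketch of the same worker-plus-sentinel shutdown built on the standard library's asyncio.Queue alone (the worker/main names and the toy integer items are illustrative, not part of the original demo):

    import asyncio

    async def worker(q):
        # consume items until a None sentinel arrives
        while True:
            item = await q.get()
            try:
                if item is None:
                    return
                print("processing:", item)
            finally:
                q.task_done()

    async def main():
        q = asyncio.Queue()
        for i in range(5):
            await q.put(i)

        workers = [asyncio.ensure_future(worker(q)) for _ in range(2)]
        await q.join()  # resolves once every queued item is marked done

        for _ in range(2):  # one sentinel per worker
            await q.put(None)
        await asyncio.gather(*workers)

    asyncio.get_event_loop().run_until_complete(main())

The shutdown order matters: q.join() waits until every real item has been processed, and only then are the sentinels enqueued so the now-idle workers can exit.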
    

    Concurrent requests demo: fetch the same URL 100 times, first serially with requests, then with 50 coroutine workers sharing one AsyncHTTPClient, and compare the elapsed times.

    #!/usr/bin/env python
    #-*- coding:utf-8 -*-
    # author:zenwan
    # datetime:2018/10/29 22:54
    # file: PyCharm
    
    from tornado.ioloop import IOLoop
    from tornado import httpclient
    import asyncio
    from tornado import gen, queues
    import requests
    import time
    
    concurrency = 50
    num = 100
    base_url = 'http://mirrors.163.com/fedora/'
    client = httpclient.AsyncHTTPClient()
    res0, res1 = [], []  # res0: blocking results; res1: coroutine results
    async def main():
        q = queues.Queue()
        for i in range(num):
            await q.put(base_url)
    
        async def f():
            # worker: pull URLs off the queue until a None sentinel arrives
            async for url in q:
                if url is None:
                    return
                try:
                    res = await client.fetch(url)
                    res1.append(res.body.decode("utf8").strip()[:10])
                except Exception as e:
                    print(e)
                finally:
                    # mark the item done even on failure, or q.join() hangs
                    q.task_done()
    
        workers = gen.multi([f() for _ in range(concurrency)])
        await q.join()
        for _ in range(concurrency):
            await q.put(None)  # one sentinel per worker so each can exit
        await workers
    
    
    if __name__ == "__main__":
        # io_loop = IOLoop.current()
        # # run_sync stops the event loop once the given coroutine finishes
        # io_loop.run_sync(main)
        from pprint import pprint
        time0 = time.time()
        # blocking baseline: fetch the URL num times serially with requests
        for i in range(num):
            res = requests.get(base_url)
            res0.append(res.text.strip()[:10])
        time1 = time.time()
        #pprint(res0)
        print(time1 - time0)  # elapsed seconds, blocking version
        # coroutine version: same number of fetches, run concurrently
        asyncio.get_event_loop().run_until_complete(main())
        time2 = time.time()
        #pprint(res1)
        print(time2 - time1)  # elapsed seconds, coroutine version
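
One caveat when reading the timings: Tornado's default AsyncHTTPClient (the simple_httpclient implementation) caps simultaneous requests per client at max_clients=10, so the 50 workers above share at most 10 in-flight connections while the remaining fetches queue inside the client. A minimal sketch of raising the cap, assuming the default implementation is in use; the value 50 simply matches the demo's concurrency:

    from tornado import httpclient

    # None keeps the default client implementation; only the cap changes.
    # This must run before the first AsyncHTTPClient() is instantiated.
    httpclient.AsyncHTTPClient.configure(None, max_clients=50)
    client = httpclient.AsyncHTTPClient()

Even without this, the demo is correct; fetches beyond the cap simply wait their turn inside the client rather than in our queues.Queue.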
    
    
