Python: Requests/Aiohttp -- Synchronous vs. Asynchronous Crawling

Author: 疯狂的向日葵 | Published 2017-06-25 21:38

    1. Requests synchronous version

    1.1 Code:

    # -*- coding: utf-8 -*-
    
    '''
    dengta news list
    '''
    
    __author__ = 'Jimmy'
    
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    
    
    class News:
        def __init__(self,title='',abstract='',detailUrl='',impact='',source='',content=''):
            self.title = title
            self.abstract = abstract
            self.detailUrl = detailUrl
            self.impact = impact
            self.source = source
            self.content = content
    
        def printNews(self):
            print('Title: %s\nSource: %s\nAbstract: %s\nImpact: %s\nURL: %s\nContent: %s'
                  % (self.title, self.source, self.abstract, self.impact, self.detailUrl, self.content))


    class Page:
        def __init__(self,newsCount=0,pageCount=0):
            self.newsCount = newsCount
            self.pageCount = pageCount
    
    
    def getNewsPageCount(code):
        url = 'http://www.wedengta.com/stockDetail/0101%s/news/1.html' % code
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        pager = soup.find_all('div', class_='pager')[0]
        newsCount = pager.span.string
        ncount = re.sub(r'\D', '', newsCount)  # keep digits only
        page = Page()
        page.newsCount = int(ncount)
        for c in pager.children:
            # the '末页' ("last page") link's href carries the total page count
            if c.string == '末页':
                url = c['href']
                pageCount = url.split('/')[-1].split('.')[0]
                page.pageCount = int(pageCount)
                break
        return page
    
    def getSingleNewsList(code, page):
        url = 'http://www.wedengta.com/stockDetail/0101%s/news/%d.html' % (code, page)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        nl = soup.find_all('ul', class_='news_list')[0]
        newsList = []
        # select the <a> tags directly; iterating nl.children can also yield
        # bare whitespace nodes that have no ['href']
        for a in nl.select('li a'):
            news = News()
            news.detailUrl = 'http://www.wedengta.com%s' % a['href']
            sc = getSingleNewsDetail(news.detailUrl)
            if sc:
                news.title = a.h3.string
                news.abstract = a.p.string
                news.impact = a.span.string
                news.source = sc[0]
                news.content = sc[1]
                newsList.append(news)
                news.printNews()
        return newsList
    
    def getSingleNewsDetail(url):
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        source = soup.find_all('div', class_='news_info')[0]
        content = soup.find_all('div', id='newsContent')[0]
        # only keep plain-text articles; pages whose content div nests
        # further divs (e.g. image-only news) are skipped
        if content.div is None:
            return [source.string, str(content)]
        else:
            return None
    
    def getAllNewsList(code):
        newsList = []
        print('Fetching news count for %s' % code)
        page = getNewsPageCount(code)
        print('%d pages, %d items in total' % (page.pageCount, page.newsCount))
        if page.newsCount > 0:
            # range() is half-open, so use pageCount + 1 to include the last page;
            # also avoid reusing the name `page` as the loop variable
            for pc in range(1, page.pageCount + 1):
                aNewsList = getSingleNewsList(code, pc)
                newsList.extend(aNewsList)
        return newsList
    
    
    
    start = time.time()
    allNews = getAllNewsList('600585')  # avoid naming the result `list`, which shadows the builtin
    print('Elapsed: %s seconds' % (time.time() - start))
    print(len(allNews))
    
    # getSingleNewsDetail('http://www.wedengta.com/news/newsDetail/1/1498212464_869774_10_1.html')
    # getSingleNewsDetail('http://www.wedengta.com/news/newsDetail/1/1498213693_9569133_9_1.html')
    
    

    1.2 Result:

    [screenshot of the run's console output]
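
    One cheap speedup before reaching for asyncio: reuse a single requests.Session instead of calling requests.get, so all requests to wedengta.com share one keep-alive connection. A minimal sketch of the idea (the fetch helper and the timeout value are my additions, not part of the original code):

    import requests

    # a Session reuses the underlying TCP connection (keep-alive),
    # saving one handshake per request against the same host
    session = requests.Session()

    def fetch(url):
        # hypothetical helper: session.get takes the same arguments as requests.get
        r = session.get(url, timeout=10)
        r.raise_for_status()
        return r.text

    # e.g. inside getNewsPageCount / getSingleNewsList, replace
    # requests.get(url).text with fetch(url)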

    2. The synchronous version is far too slow, since each requests.get blocks until its response arrives. Here is an aiohttp asynchronous version that lets the HTTP round-trips overlap.

    2.1 Code:

    # -*- coding: utf-8 -*-
    
    '''
    aiohttp
    '''
    
    __author__ = 'Jimmy'
    
    import aiohttp
    import asyncio
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    
    class News:
        def __init__(self,title='',abstract='',detailUrl='',impact='',source='',content=''):
            self.title = title
            self.abstract = abstract
            self.detailUrl = detailUrl
            self.impact = impact
            self.source = source
            self.content = content
    
        def printNews(self):
            print('Title: %s\nSource: %s\nAbstract: %s\nImpact: %s\nURL: %s\nContent: %s'
                  % (self.title, self.source, self.abstract, self.impact, self.detailUrl, self.content))


    class Page:
        def __init__(self,newsCount=0,pageCount=0):
            self.newsCount = newsCount
            self.pageCount = pageCount
    
    def getNewsPageCount(code):
        # deliberately still synchronous: this single request runs before the event loop starts
        url = 'http://www.wedengta.com/stockDetail/0101%s/news/1.html' % code
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        pager = soup.find_all('div', class_='pager')[0]
        newsCount = pager.span.string
        ncount = re.sub(r'\D', '', newsCount)  # keep digits only
        page = Page()
        page.newsCount = int(ncount)
        for c in pager.children:
            # the '末页' ("last page") link's href carries the total page count
            if c.string == '末页':
                url = c['href']
                pageCount = url.split('/')[-1].split('.')[0]
                page.pageCount = int(pageCount)
                break
        return page
    
    async def getSingleNewsList(code, page, newsList):
        url = 'http://www.wedengta.com/stockDetail/0101%s/news/%d.html' % (code, page)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as r:
                body = await r.text(encoding='utf-8')
        soup = BeautifulSoup(body, 'html.parser')
        nl = soup.find_all('ul', class_='news_list')[0]
        # select the <a> tags directly; iterating nl.children can also yield
        # bare whitespace nodes that have no ['href']
        for a in nl.select('li a'):
            news = News()
            news.detailUrl = 'http://www.wedengta.com%s' % a['href']
            sc = await getSingleNewsDetail(news.detailUrl)
            if sc:
                news.title = a.h3.string
                news.abstract = a.p.string
                news.impact = a.span.string
                news.source = sc[0]
                news.content = sc[1]
                newsList.append(news)
                news.printNews()
    
    async def getSingleNewsDetail(url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as r:
                body = await r.text(encoding='utf-8')
        soup = BeautifulSoup(body, 'html.parser')
        source = soup.find_all('div', class_='news_info')[0]
        content = soup.find_all('div', id='newsContent')[0]
        # only keep plain-text articles; pages whose content div nests
        # further divs (e.g. image-only news) are skipped
        if content.div is None:
            return [source.string, str(content)]
        else:
            return None
    
    def getAllNewsList(code):
        newsList = []
        print('Fetching news count for %s' % code)
        page = getNewsPageCount(code)
        print('%d pages, %d items in total' % (page.pageCount, page.newsCount))
        if page.newsCount > 0:
            async def crawlPages():
                tasks = [getSingleNewsList(code, pc, newsList) for pc in range(1, page.pageCount + 1)]
                # gather inside a coroutine so every task attaches to the running loop;
                # asyncio.wait no longer accepts bare coroutines on newer Pythons
                await asyncio.gather(*tasks)
            asyncio.run(crawlPages())
        return newsList
    
    
    start = time.time()
    allNews = getAllNewsList('600585')  # avoid naming the result `list`, which shadows the builtin
    print('Elapsed: %s seconds' % (time.time() - start))
    print(len(allNews))
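
    A caveat with the version above: it opens a fresh ClientSession for every page and every detail request, and launches all page coroutines at once, so a stock with many pages fires that many concurrent requests at the site. A hedged sketch of sharing one session and capping concurrency with asyncio.Semaphore (fetch, crawl, and MAX_CONCURRENCY are illustrative names, not from the original):

    import asyncio
    import aiohttp

    MAX_CONCURRENCY = 10  # assumed polite cap; tune as needed

    async def fetch(session, sem, url):
        # the semaphore bounds how many requests are in flight at once
        async with sem:
            async with session.get(url) as r:
                return await r.text(encoding='utf-8')

    async def crawl(urls):
        sem = asyncio.Semaphore(MAX_CONCURRENCY)
        # one shared ClientSession reuses connections across all requests
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(fetch(session, sem, u) for u in urls))

    # usage sketch:
    # urls = ['http://www.wedengta.com/stockDetail/0101600585/news/%d.html' % i for i in range(1, 5)]
    # pages = asyncio.run(crawl(urls))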
    
