1. requests synchronous version
1.1 Code:
# -*- coding: utf-8 -*-
'''
dengta news list
'''
__author__ = 'Jimmy'

import requests
from bs4 import BeautifulSoup
import re
import time


class News:
    def __init__(self, title='', abstract='', detailUrl='', impact='', source='', content=''):
        self.title = title
        self.abstract = abstract
        self.detailUrl = detailUrl
        self.impact = impact
        self.source = source
        self.content = content

    def printNews(self):
        print('标题:%s \n来源:%s\n摘要:%s\n影响:%s\n地址:%s\n内容:%s'
              % (self.title, self.source, self.abstract, self.impact, self.detailUrl, self.content))


class Page:
    def __init__(self, newsCount=0, pageCount=0):
        self.newsCount = newsCount
        self.pageCount = pageCount

def getNewsPageCount(code):
    # Read the total news count and page count from the pager on page 1.
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/1.html' % code
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    paper = soup.find_all('div', class_='pager')[0]
    newsCount = paper.span.string
    ncount = re.sub(r'\D', '', newsCount)  # keep digits only
    page = Page()
    page.newsCount = int(ncount)
    for c in paper.children:
        if c.string == '末页':  # the "last page" link; its href carries the page count
            url = c['href']
            pageCount = url.split('/')[-1].split('.')[0]
            page.pageCount = int(pageCount)
    return page

def getSingleNewsList(code, page):
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/%d.html' % (code, page)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    nl = soup.find_all('ul', class_='news_list')[0]
    newsList = []
    for li in nl.children:
        for a in li:
            news = News()
            news.detailUrl = 'http://www.wedengta.com%s' % a['href']
            sc = getSingleNewsDetail(news.detailUrl)
            if sc:
                news.title = a.h3.string
                news.abstract = a.p.string
                news.impact = a.span.string
                news.source = sc[0]
                news.content = sc[1]
                newsList.append(news)
                news.printNews()
    return newsList

def getSingleNewsDetail(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    source = soup.find_all('div', class_='news_info')[0]
    content = soup.find_all('div', id='newsContent')[0]
    if content.div is None:
        return [source.string, str(content)]
    else:
        return None

def getAllNewsList(code):
    newsList = []
    print('获取%s的资讯个数' % code)
    page = getNewsPageCount(code)
    print('共%d页,合计%d条' % (page.pageCount, page.newsCount))
    if page.newsCount > 0:
        for pageNo in range(1, page.pageCount + 1):  # +1 so the last page is not skipped
            aNewsList = getSingleNewsList(code, pageNo)
            newsList.extend(aNewsList)
    return newsList


start = time.time()
allNews = getAllNewsList('600585')  # renamed from `list` to avoid shadowing the builtin
end = str(time.time() - start)
print('共用时%s' % end)
print(len(allNews))

# getSingleNewsDetail('http://www.wedengta.com/news/newsDetail/1/1498212464_869774_10_1.html')
# getSingleNewsDetail('http://www.wedengta.com/news/newsDetail/1/1498213693_9569133_9_1.html')
1.2 Result:
(screenshot of the run's console output)
2. The synchronous version is far too slow, so here is an aiohttp asynchronous version. The list-page and detail-page fetches become coroutines, and getAllNewsList schedules one task per page so every page is downloaded concurrently on the event loop.
2.1 Code:
# -*- coding: utf-8 -*-
'''
aiohttp
'''
__author__ = 'Jimmy'

import aiohttp
import asyncio
import requests
from bs4 import BeautifulSoup
import re
import time


class News:
    def __init__(self, title='', abstract='', detailUrl='', impact='', source='', content=''):
        self.title = title
        self.abstract = abstract
        self.detailUrl = detailUrl
        self.impact = impact
        self.source = source
        self.content = content

    def printNews(self):
        print('标题:%s \n来源:%s\n摘要:%s\n影响:%s\n地址:%s\n内容:%s'
              % (self.title, self.source, self.abstract, self.impact, self.detailUrl, self.content))


class Page:
    def __init__(self, newsCount=0, pageCount=0):
        self.newsCount = newsCount
        self.pageCount = pageCount

def getNewsPageCount(code):
    # Still synchronous: a single requests call to read the news/page counts before scheduling tasks.
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/1.html' % code
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    paper = soup.find_all('div', class_='pager')[0]
    newsCount = paper.span.string
    ncount = re.sub(r'\D', '', newsCount)
    page = Page()
    page.newsCount = int(ncount)
    for c in paper.children:
        if c.string == '末页':
            url = c['href']
            pageCount = url.split('/')[-1].split('.')[0]
            page.pageCount = int(pageCount)
    return page

async def getSingleNewsList(code, page, newsList):
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/%d.html' % (code, page)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as r:
            body = await r.text(encoding='utf-8')
            soup = BeautifulSoup(body, 'html.parser')
            nl = soup.find_all('ul', class_='news_list')[0]
            for li in nl.children:
                for a in li:
                    news = News()
                    news.detailUrl = 'http://www.wedengta.com%s' % a['href']
                    sc = await getSingleNewsDetail(news.detailUrl)
                    if sc:
                        news.title = a.h3.string
                        news.abstract = a.p.string
                        news.impact = a.span.string
                        news.source = sc[0]
                        news.content = sc[1]
                        newsList.append(news)
                        news.printNews()

async def getSingleNewsDetail(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as r:
            body = await r.text(encoding='utf-8')
            soup = BeautifulSoup(body, 'html.parser')
            source = soup.find_all('div', class_='news_info')[0]
            content = soup.find_all('div', id='newsContent')[0]
            if content.div is None:
                return [source.string, str(content)]
            else:
                return None

def getAllNewsList(code):
    newsList = []
    print('获取%s的资讯个数' % code)
    page = getNewsPageCount(code)
    print('共%d页,合计%d条' % (page.pageCount, page.newsCount))
    if page.newsCount > 0:
        loop = asyncio.get_event_loop()
        # one coroutine per list page, all run concurrently on the event loop
        tasks = [getSingleNewsList(code, pc, newsList) for pc in range(1, page.pageCount + 1)]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
    return newsList


start = time.time()
allNews = getAllNewsList('600585')  # renamed from `list` to avoid shadowing the builtin
end = str(time.time() - start)
print('共用时%s' % end)
print(len(allNews))
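A possible further tweak, not from the original code but a minimal sketch under my own assumptions: the async version above opens a fresh ClientSession for every single request and fires all pages at once. Reusing one shared session and capping concurrency with a semaphore usually reduces connection overhead and is gentler on the site. The names fetchText, crawlPage and MAX_CONCURRENCY, the CSS selector, and the use of asyncio.run/asyncio.gather are illustrative choices, not part of the original post.

# -*- coding: utf-8 -*-
# Sketch only: one shared ClientSession plus a semaphore to bound concurrency.
import asyncio
import aiohttp
from bs4 import BeautifulSoup

MAX_CONCURRENCY = 10  # assumed limit; tune for the target site


async def fetchText(session, sem, url):
    # One GET through the shared session, gated by the semaphore.
    async with sem:
        async with session.get(url) as r:
            return await r.text(encoding='utf-8')


async def crawlPage(session, sem, code, pageNo):
    # Download one list page and return the detail URLs found on it.
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/%d.html' % (code, pageNo)
    body = await fetchText(session, sem, url)
    soup = BeautifulSoup(body, 'html.parser')
    links = soup.select('ul.news_list a[href]')  # assumed equivalent of the find_all logic above
    return ['http://www.wedengta.com%s' % a['href'] for a in links]


async def crawlAll(code, pageCount):
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    async with aiohttp.ClientSession() as session:  # one session for the whole crawl
        tasks = [crawlPage(session, sem, code, p) for p in range(1, pageCount + 1)]
        pages = await asyncio.gather(*tasks)
    return [u for pageUrls in pages for u in pageUrls]


# Usage sketch: the page count would come from getNewsPageCount(code) above.
# detailUrls = asyncio.run(crawlAll('600585', 20))

Detail pages could be fetched the same way through fetchText; the point is only that a single shared session and a semaphore keep the number of simultaneous connections predictable.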