# -*- coding: utf-8 -*-
"""
@Time : 2020/6/19 16:49
@Author : LinXiao
@Purpose : Asynchronously scrape Lianjia listing pages and store the results in MongoDB.
"""
# ------------------------------
from asyncio import sleep
from pprint import pprint
"""
2940 records:
Cost 3.7898309230804443 seconds
"""
from lxml import etree
from time import time
import asyncio
import aiohttp
import asyncio
from fake_useragent import UserAgent
from motor.motor_asyncio import AsyncIOMotorClient
# Earlier test target, kept for reference:
# url = 'https://github.com/StackExchange/StackExchange.Redis/issues/201'
# Base URL of the listing pages to scrape.
url = "https://hui.lianjia.com/ershoufang/"
# Fetch the page content
async def fetch_content(url):
    """Download ``url`` and return the response body as text.

    A new ``aiohttp.ClientSession`` is opened for every call and sends a
    randomly chosen ``User-Agent`` header obtained from ``fake_useragent``.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body.
    """
    headers = {'User-Agent': UserAgent().random}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            # ``sleep`` is asyncio.sleep and returns a coroutine. Without
            # ``await`` it is never run: no pause happens and Python emits a
            # "coroutine ... was never awaited" RuntimeWarning.
            await sleep(3)
            res = await response.text()
            pprint(res)
            return res
# Parse the page content
async def parse(url):
    """Scrape listing titles and total prices from the paginated listing.

    Pages ``pg1`` .. ``pg299`` below ``url`` are downloaded concurrently via
    ``fetch_content``. On each page, every title link is paired with the
    total-price element at the same position.

    Args:
        url: Base listing URL; a trailing slash is optional.

    Returns:
        A list of ``"<title>:<price>万"`` strings, in page order.
    """
    xpath_house = "//div[@class='info clear']/div[@class='title']/a"
    xpath_house_price = "//div[@class='totalPrice']/span"

    # Join with exactly one slash. Plain concatenation onto a base URL that
    # already ends in "/" produced ".../ershoufang///pg1//".
    base = url.rstrip('/')
    page_urls = [f'{base}/pg{p}/' for p in range(1, 300)]

    # gather() keeps the input order, so results stay in page order.
    pages = await asyncio.gather(
        *(fetch_content(page_url) for page_url in page_urls)
    )

    result = []
    for page in pages:
        html = etree.HTML(page)
        if html is None:
            # NOTE(review): etree.HTML appears to return None for a body with
            # no parsable markup (e.g. an empty response); skip such pages
            # instead of failing on ``None.xpath``.
            continue
        titles = html.xpath(xpath_house)
        prices = html.xpath(xpath_house_price)
        result.extend(
            f'{title.text}:{price.text}万'
            for title, price in zip(titles, prices)
        )
    pprint(type(result))
    return result
# Save to the MongoDB database
async def save(url):
    """Scrape the listings under ``url`` and store them in MongoDB.

    Each string returned by ``parse`` becomes one document
    ``{"_id": <1-based position>, "context": <string>}`` in collection
    ``lx21`` of database ``linxao1`` on ``localhost:27017``.

    Args:
        url: Base listing URL, forwarded to ``parse``.
    """
    db_url = 'localhost'
    db_port = 27017
    db_name = "linxao1"
    db_collection = "lx21"
    # Open the connection
    client = AsyncIOMotorClient(db_url, db_port)
    try:
        # Despite its name, ``db`` is the target collection, not a database.
        db = client[db_name][db_collection]
        result = await parse(url)
        pprint(result)
        list1 = []
        # NOTE: the ``_id`` values are just 1-based positions, so a second run
        # against the same collection will collide with existing documents.
        for i, house in enumerate(result, 1):
            print(i, house)
            dic = {"_id": i, "context": house}
            pprint(dic)
            list1.append(dic)
        pprint(list1)
        # insert_many rejects an empty document list, so skip the call when
        # nothing was scraped.
        if list1:
            await db.insert_many(list1)
        print('inserted %d docs' % (len(result)))
        # NOTE(review): ``db`` is a collection, so this attribute access
        # presumably addresses the sub-collection ``lx21.test_collection``.
        # It looks like a leftover smoke test; confirm it is still wanted.
        document = {'key': 'value'}
        result = await db.test_collection.insert_one(document)
        print('result %s' % repr(result.inserted_id))
    finally:
        # Release the client's connections even if scraping or inserting fails.
        client.close()
async def do_insert():
    """Insert the sample documents ``{'i': 0}`` .. ``{'i': 19}`` into MongoDB.

    The documents go to collection ``lx`` of database ``linxiao`` on
    ``localhost:27017``; the number of inserted ids is printed afterwards.
    """
    # Open the connection
    mongo = AsyncIOMotorClient('localhost', 27017)
    # ``lx`` is a collection inside the ``linxiao`` database
    collection = mongo.linxiao.lx
    # insert_many accepts one or many documents, but they must be passed
    # as a list.
    docs = []
    for n in range(20):
        docs.append({'i': n})
    outcome = await collection.insert_many(docs)
    print('inserted %d docs' % (len(outcome.inserted_ids),))
def main():
    """Run ``save(url)`` to completion and print the elapsed wall-clock time.

    The event loop is created explicitly because ``asyncio.get_event_loop()``
    is deprecated when no loop is running. It is closed in ``finally`` so it
    is released even if ``save`` raises.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        start = time()
        loop.run_until_complete(save(url))
        end = time()
        print('Cost {} seconds'.format(end - start))
    finally:
        loop.close()
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
# 网友评论  ("Reader comments": web-page residue copied along with the script; not code)