1. Crawl the data with a multi-task thread pool for high throughput.
2. Create the thread pool and submit each URL together with the function to run, i.e. pool.submit(function, url) (a minimal sketch of this pattern follows the list).
3. Extract the data with XPath.
4. Store the data locally as a JSON file.
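Before the full script, here is a minimal self-contained sketch of the submit-plus-callback pattern from steps 1 and 2. demo_task, demo_done, and the example.com URL are illustrative placeholders, not part of the original crawler:

from concurrent.futures import ThreadPoolExecutor

def demo_task(url):                # placeholder task, stands in for crawlPageDate
    return url, 200                # pretend every request succeeds

def demo_done(future):             # callback, stands in for done()
    url, status = future.result()  # result() returns whatever demo_task returned
    print(url, status)

pool = ThreadPoolExecutor(max_workers=4)
for n in range(1, 4):
    future = pool.submit(demo_task, 'https://example.com/?page=' + str(n))
    future.add_done_callback(demo_done)
pool.shutdown(wait=True)           # wait for all submitted tasks to finish

The callback receives the completed Future object itself, which is why demo_done calls future.result() instead of taking the return values directly.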
# import the thread pool module and the other helpers
from concurrent.futures import ThreadPoolExecutor
import requests
from requests import exceptions
from lxml import etree
import json
import threading
def crawlPageDate(url, **kwargs):
    print(url, kwargs)
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    }
    try:
        # a timeout makes the ConnectTimeout handler below reachable
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            print('request succeeded')
            html = response.text
            # return the page source; the done callback picks it up via future.result()
            return html, 200
    except exceptions.HTTPError as err:
        print(err)
    except exceptions.ConnectTimeout as err:
        print(err)
    except exceptions.RequestException as err:
        print(err)
    return None, 404
def done(future):
    # runs when a crawl task finishes; the argument is the completed Future
    html, status = future.result()
    print(status)
    if html:
        # parse the page source into an element tree so we can run XPath on it
        x_html = etree.HTML(html)
        caipu_list = x_html.xpath('//div[@class="listtyle1"]')
        for cai_div in caipu_list:
            item = {}
            # cover image: the leading '.' keeps the query relative to cai_div,
            # otherwise every item would get the first image on the whole page
            item['coverImage'] = cai_div.xpath('.//img[@class="img"]/@src')[0]
            item['type'] = cai_div.xpath('.//a/strong[@class="gx"]/span/text()')
            if len(item['type']) > 0:
                item['type'] = item['type'][0]
            else:
                item['type'] = '暂无'
            item['title'] = cai_div.xpath('.//div[@class="c1"]/strong/text()')[0]
            print(item)
            # serialize file writes so threads cannot interleave records
            lock.acquire()
            with open('cai.json', 'a', encoding='utf-8') as file:
                json_str = json.dumps(item, ensure_ascii=False) + '\n'
                file.write(json_str)
            lock.release()
if __name__ == '__main__':
    # the lock must exist before any callback can fire
    lock = threading.Lock()
    # create the thread pool; a small fixed worker count is enough here
    pool = ThreadPoolExecutor(max_workers=8)
    for page in range(1, 57):
        # submit a task to the pool:
        # submit(fn, *args, **kwargs) -- fn is the callable, the rest are its arguments
        url = 'https://www.meishij.net/chufang/diy/jiangchangcaipu/?&page=' + str(page)
        result = pool.submit(crawlPageDate, url)
        # add_done_callback takes the function to call when the future completes
        result.add_done_callback(done)
    # wait for all pages to finish before the interpreter exits
    pool.shutdown(wait=True)
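To spot-check the output of step 4, the file can be read back one JSON object per line. This is a minimal sketch, assuming the crawler above has already written cai.json to the working directory:

import json

with open('cai.json', encoding='utf-8') as file:
    items = [json.loads(line) for line in file if line.strip()]
print(len(items), 'records')
if items:
    print(items[0]['title'], items[0]['type'])

Writing one object per line (JSON Lines) pairs well with the append mode used in done(): each record is independent, so a partially finished crawl still leaves a readable file.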