使用multiprocessing的多进程爬虫
# coding:utf8
import time
from multiprocessing import Process, Queue
import requests
# Collect the URLs to crawl: the second tab-separated field of every line in
# alexa.txt.
link_list = []
with open('alexa.txt', 'r') as f:
    # Iterate the file object directly instead of loading every line into a
    # list with readlines() first.
    for line in f:
        fields = line.rstrip('\n').split('\t')
        # A blank or malformed line has no second field; skip it instead of
        # aborting the whole script with IndexError.
        if len(fields) < 2:
            continue
        link_list.append(fields[1])

# Reference point for the elapsed-time report printed at the end of the script.
start = time.time()
class MyProcess(Process):
    """Worker process that calls ``crawl`` on its queue until the queue reports empty."""

    def __init__(self, name, q):
        """Store the display name and the queue this worker reads from.

        Args:
            name: Name assigned to the process (printed by ``run``).
            q: Queue object handed to ``crawl``; ``run`` polls its ``empty()``.
        """
        super().__init__()
        self.q = q
        # Assigned after the base initialiser so it replaces the default name.
        self.name = name

    def run(self):
        """Print start/exit markers and crawl while the queue is not empty."""
        print('Starting', self.pid)
        print('Process_name:%s' % self.name)
        while True:
            if self.q.empty():
                break
            crawl(self.q)
        print('Exiting', self.pid)
def crawl(q, request_timeout=10):
    """Take one URL from ``q``, request it, and print the HTTP status code.

    Args:
        q: Queue of URL strings; only ``q.get(timeout=2)`` is called on it.
        request_timeout: Seconds to wait for the HTTP request before giving
            up, so one unresponsive host cannot stall the worker forever.

    Returns:
        None. If no item arrives within 2 seconds the function returns without
        doing anything. Request failures are printed, not raised.
    """
    # Local import: queue.Empty is not among the file-level imports.
    import queue

    try:
        link = q.get(timeout=2)
    except queue.Empty:
        # The queue can be drained by another worker after the caller checked
        # empty(); treat that as "no work left" instead of crashing.
        return

    try:
        r = requests.get(link, timeout=request_timeout)
        print(r.status_code)
    except Exception as e:
        # Best-effort crawl: report the failing link and carry on.
        print(link, 'Error: ', e)
# Script entry point: queue every link, crawl with three worker processes and
# report how long the whole run took.
if __name__ == '__main__':
    list_size = len(link_list)
    workQueue = Queue(list_size)
    # Fill the queue
    for url in link_list:
        workQueue.put(url)

    # Start all workers before waiting on any of them so they run
    # concurrently.
    workers = [MyProcess('Process_%s' % str(i), workQueue) for i in range(0, 3)]
    for p in workers:
        p.start()
    # Wait for every worker to finish; without this the duration below would
    # only measure how long it took to launch the processes.
    for p in workers:
        p.join()

    end = time.time()
    print('duration: %d' % (end - start))
使用Pool + Queue的多进程爬虫
# coding:utf8
import time
import multiprocessing
from multiprocessing import Pool, Manager
import requests
# Collect the URLs to crawl: the second tab-separated field of every line in
# alexa.txt.
link_list = []
with open('alexa.txt', 'r') as f:
    # Iterate the file object directly instead of loading every line into a
    # list with readlines() first.
    for line in f:
        fields = line.rstrip('\n').split('\t')
        # A blank or malformed line has no second field; skip it instead of
        # aborting the whole script with IndexError.
        if len(fields) < 2:
            continue
        link_list.append(fields[1])

# Reference point for the elapsed-time report printed at the end of the script.
start = time.time()
def crawl(q, index, request_timeout=10):
    """Drain ``q``, requesting each URL and printing its HTTP status code.

    Args:
        q: Queue of URL strings; ``empty()`` and ``get(timeout=2)`` are used.
        index: Worker number, used only to build the ``Process_<index>`` label
            that prefixes every printed line.
        request_timeout: Seconds to wait for each HTTP request before giving
            up, so one unresponsive host cannot stall the worker forever.

    Returns:
        None. Request failures are printed, not raised.
    """
    # Local import: queue.Empty is not among the file-level imports.
    import queue

    process_id = 'Process_%s' % str(index)
    while not q.empty():
        try:
            link = q.get(timeout=2)
        except queue.Empty:
            # Another worker took the last item after the empty() check above;
            # there is nothing left to do.
            break
        try:
            r = requests.get(link, timeout=request_timeout)
            print(process_id, r.status_code)
        except Exception as e:
            # Best-effort crawl: report the failing link and keep going.
            print(process_id, link, 'Error: ', e)
# Script entry point: queue every link, crawl with one pool worker per CPU and
# report how long the whole run took.
if __name__ == '__main__':
    def _report_worker_error(exc):
        # apply_async drops worker exceptions unless the result is fetched;
        # print them so a failed worker does not go unnoticed.
        print('Worker error: ', exc)

    manager = Manager()
    list_size = len(link_list)
    workQueue = manager.Queue(list_size)
    # Fill the queue
    for url in link_list:
        workQueue.put(url)

    pool_num = multiprocessing.cpu_count()
    pool = Pool(processes=pool_num)
    for i in range(pool_num):
        pool.apply_async(
            crawl,
            args=(workQueue, i),
            error_callback=_report_worker_error,
        )
    print('Started project...')

    # No more tasks will be submitted; wait for the running ones to finish so
    # the duration covers the crawl itself.
    pool.close()
    pool.join()

    end = time.time()
    print('duration: %d' % (end - start))
网友评论