Crawling Web Pages with Multiprocessing

Author: BlueCat2016 | Published 2018-07-08 17:43

A multiprocess crawler using multiprocessing
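
Both scripts below read their URL list from alexa.txt. The file itself isn't shown in the original post; judging from the parsing code (line.split('\t')[1]), a plausible format is one tab-separated rank/URL pair per line, e.g.:

1	http://www.baidu.com
2	http://www.google.com
3	http://www.taobao.com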

# coding:utf8

import time
from multiprocessing import Process, Queue
import requests

# Read the target URLs from alexa.txt (one tab-separated rank/URL pair per line)
link_list = []
with open('alexa.txt', 'r') as f:
    file_list = f.readlines()
    for line in file_list:
        link = line.split('\t')[1]
        link = link.replace('\n', '')
        link_list.append(link)

# print(link_list)
start = time.time()


class MyProcess(Process):
    def __init__(self, name, q):
        Process.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print('Starting', self.pid)
        print('Process_name:%s' % self.name)
        # Keep pulling URLs until the shared queue is drained
        while not self.q.empty():
            crawl(self.q)
        print('Exiting', self.pid)


def crawl(q):
    try:
        # The empty() check in run() is only a hint: another worker may
        # drain the queue between that check and this get()
        link = q.get(timeout=2)
    except Exception:
        return
    try:
        # timeout added here (not in the original) so one slow site
        # cannot block the worker indefinitely
        r = requests.get(link, timeout=10)
        print(r.status_code)
    except Exception as e:
        print(link, 'Error: ', e)


if __name__ == '__main__':
    # process_names = ['Process_1', 'Process_2', 'Process_3']
    list_size = len(link_list)
    workQueue = Queue(list_size)

    # Fill the queue
    for url in link_list:
        workQueue.put(url)

    processes = []
    for i in range(0, 3):
        p = MyProcess('Process_%s' % str(i), workQueue)
        # p.daemon = True
        p.start()
        processes.append(p)

    # Wait for the workers; without join() the duration below would be
    # printed before any crawling has finished
    for p in processes:
        p.join()

    end = time.time()
    print('duration: %d' % (end - start))
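
The printed duration only means something next to a baseline. A single-process version of the same loop (a sketch, not part of the original post) can be timed the same way for comparison:

# coding:utf8

import time
import requests

# Same alexa.txt parsing as above
link_list = []
with open('alexa.txt', 'r') as f:
    for line in f.readlines():
        link_list.append(line.split('\t')[1].replace('\n', ''))

start = time.time()
# Fetch every URL sequentially in a single process
for link in link_list:
    try:
        r = requests.get(link, timeout=10)
        print(r.status_code)
    except Exception as e:
        print(link, 'Error: ', e)
end = time.time()
print('serial duration: %d' % (end - start))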

A multiprocess crawler using Pool + Queue

Note that a plain multiprocessing.Queue cannot be handed to Pool workers through apply_async; the queue has to come from a Manager so it can be shared with the pool's child processes, which is why this version uses Manager().Queue().

# coding:utf8

import time
import multiprocessing
from multiprocessing import Pool, Manager
import requests

# Read the target URLs from alexa.txt (one tab-separated rank/URL pair per line)
link_list = []
with open('alexa.txt', 'r') as f:
    file_list = f.readlines()
    for line in file_list:
        link = line.split('\t')[1]
        link = link.replace('\n', '')
        link_list.append(link)

# print(link_list)
start = time.time()


def crawl(q, index):
    process_id = 'Process_%s' % str(index)
    while not q.empty():
        try:
            # The queue may be drained by a sibling worker between the
            # empty() check and this get()
            link = q.get(timeout=2)
        except Exception:
            break
        try:
            r = requests.get(link, timeout=10)
            print(process_id, r.status_code)
        except Exception as e:
            print(process_id, link, 'Error: ', e)


if __name__ == '__main__':
    manager = Manager()
    list_size = len(link_list)
    workQueue = manager.Queue(list_size)

    # Fill the queue
    for url in link_list:
        workQueue.put(url)

    # One worker per CPU core
    pool_num = multiprocessing.cpu_count()
    pool = Pool(processes=pool_num)
    for i in range(pool_num):
        pool.apply_async(crawl, args=(workQueue, i))

    print('Started processes...')
    pool.close()
    pool.join()
    end = time.time()
    print('duration: %d' % (end - start))
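
Since the full URL list is known before any worker starts, the Manager queue isn't strictly needed. Here is a sketch (not from the original post) of the same crawler using Pool.map, where the pool distributes the links to the workers itself:

# coding:utf8

import time
from multiprocessing import Pool, cpu_count
import requests


def fetch(link):
    # Fetch a single URL and report its status code (or the error)
    try:
        r = requests.get(link, timeout=10)
        return link, r.status_code
    except Exception as e:
        return link, 'Error: %s' % e


if __name__ == '__main__':
    link_list = []
    with open('alexa.txt', 'r') as f:
        for line in f.readlines():
            link_list.append(line.split('\t')[1].replace('\n', ''))

    start = time.time()
    # map() splits link_list across the worker processes and collects
    # the results in order
    with Pool(processes=cpu_count()) as pool:
        for link, status in pool.map(fetch, link_list):
            print(link, status)
    end = time.time()
    print('duration: %d' % (end - start))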
