美文网首页
多进程协同selenium

多进程协同selenium

作者: 雨夜剪魂 | 来源:发表于2019-08-19 15:35 被阅读0次

代码如下:

以下代码在 Windows 下测试通过,但偶尔会出现卡死的情况,应该是 Windows 平台的遗留问题。

from threading import Thread
from multiprocessing import Pool
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
import json
from selenium import webdriver
from urllib.parse import urljoin, quote

# Exclusive upper bound for the page numbers generated in main()
# (main() iterates range(21, PAGES, 2)).
PAGES = 34

# Search keyword; main() URL-quotes it before inserting it into BASEURL.
KEYWORD = 'yourkeyword'

# Search URL template with two positional '{}' placeholders that main() fills
# in order: the quoted keyword, then the page number. The 'ev' query parameter
# carries a percent-encoded 'exbrand_...' value that includes "Penfolds".
BASEURL = 'https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&suggest=2.def.0.V19--12s0&ev=exbrand_%E5%A5%94%E5%AF%8C%EF%BC%88Penfolds%EF%BC%89%5E&stock=1&page={}&s=918&click=0'

def _node_text(tag):
    """Return the text of *tag*, or '' when the tag was not found."""
    return tag.get_text() if tag is not None else ''


def _anchor_attr(container, attr):
    """Return attribute *attr* of the first <a> under *container*, or ''."""
    anchor = container.a if container is not None else None
    if anchor is None:
        return ''
    return anchor.get(attr) or ''


def parse(source):
    """Extract product rows from a search-result page and append them to a CSV.

    Every ``<li class="gl-item">`` element yields one row made of price,
    title, link, sales text and shop name. Each row is printed, and all rows
    of the page are appended to ``filename.csv`` (no header, ``utf_8_sig``
    encoding) in a single write.

    A missing sub-element produces an empty field instead of raising, so one
    incomplete product card no longer aborts the rest of the page.

    Args:
        source: HTML text of the page (e.g. ``driver.page_source``).
    """
    soup = BeautifulSoup(source, 'lxml')
    lis = soup.find_all('li', 'gl-item')
    print('总共{}个数据'.format(len(lis)))

    rows = []
    for li in lis:
        # All fields except the price are looked up inside the first <div> of
        # the item; fall back to the <li> itself when that wrapper is absent.
        wrap = li.div if li.div is not None else li

        name_tag = wrap.find('div', 'p-name')
        shop_tag = wrap.find('div', 'p-shop')
        shop_span = shop_tag.span if shop_tag is not None else None

        price = _node_text(li.find('div', 'p-price'))
        title = _anchor_attr(name_tag, 'title')
        href = _anchor_attr(name_tag, 'href')
        # Only prepend the scheme when the link does not already carry one;
        # blindly prefixing would turn 'https://...' into 'http:https://...'.
        if href and not href.startswith(('http://', 'https://')):
            href = 'http:' + href
        sale = _node_text(wrap.find('div', 'p-commit'))
        shopname = _anchor_attr(shop_span, 'title')

        print(price, title, href, sale, shopname)
        rows.append({
            '价格': price,
            '标题': title,
            '链接': href,
            '销量': sale,
            '店铺名': shopname,
        })

    if not rows:
        return

    # One append per page instead of one DataFrame + file open per row.
    df = pd.DataFrame(rows, columns=['价格', '标题', '链接', '销量', '店铺名'])
    df.to_csv('filename.csv', mode='a', index=False, header=False, encoding='utf_8_sig')


def run(url):
    """Load *url* in a fresh Chrome instance, scroll it, and parse the result.

    The page is scrolled down ten times, pausing one second after each step,
    before the page source is handed to ``parse``. Errors raised while loading
    or scrolling are printed and the page is skipped; errors raised by
    ``parse`` propagate to the caller.

    Args:
        url: Address of the search-result page to open.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Scroll down step by step, pausing between steps.
        for _ in range(10):
            driver.execute_script('window.scrollBy(0,1500)')
            time.sleep(1)
    except Exception as e:
        print("err: ", e)
    else:
        parse(driver.page_source)
    finally:
        # quit() ends the whole WebDriver session and its driver process;
        # close() only closes the current window and leaks the driver.
        driver.quit()

def _report_worker_error(exc):
    """Print an exception that escaped from a pool worker task."""
    print('worker err: ', exc)


def main():
    """Build the search URLs and scrape them with a pool of two processes.

    Page numbers run from 21 up to (but excluding) ``PAGES`` in steps of 2.
    Each URL is printed and then submitted to ``run`` on the pool; the
    function returns after all submitted tasks have finished.
    """
    urls = []
    for page in range(21, PAGES, 2):
        url = BASEURL.format(quote(KEYWORD), page)
        print('url: ', url)
        urls.append(url)

    with Pool(2) as p:
        for url in urls:
            # Without an error_callback (or reading the AsyncResult), any
            # exception raised inside a worker is dropped silently.
            p.apply_async(run, args=(url, ), error_callback=_report_worker_error)
        # close() + join() must run inside the with-block: leaving the block
        # terminates the pool, which would kill tasks still in progress.
        p.close()
        p.join()
    print('task over')


# Run the scraper only when this file is executed as a script. main() starts
# a multiprocessing Pool, so the guard also keeps processes that import this
# module from re-running main().
if __name__ == '__main__':
    main()

相关文章

网友评论

      本文标题:多进程协同selenium

      本文链接:https://www.haomeiwen.com/subject/bxtesctx.html