The code is as follows:
Tested OK on Windows, but it can still hang; this is likely a lingering Windows issue (a possible mitigation is sketched after the listing).
from multiprocessing import Pool
from urllib.parse import quote
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
PAGES = 34  # upper bound for the page range iterated in main()
KEYWORD = 'yourkeyword'  # replace with the search term you want
BASEURL = 'https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&suggest=2.def.0.V19--12s0&ev=exbrand_%E5%A5%94%E5%AF%8C%EF%BC%88Penfolds%EF%BC%89%5E&stock=1&page={}&s=918&click=0'
def parse(source):
    soup = BeautifulSoup(source, 'lxml')
    lis = soup.find_all('li', 'gl-item')
    print('{} items in total'.format(len(lis)))
    for li in lis:
        # strip=True trims the whitespace JD leaves inside these nodes
        price = li.find('div', 'p-price').get_text(strip=True)
        title = li.div.find('div', 'p-name').a.get('title')
        href = 'http:' + li.div.find('div', 'p-name').a.get('href')
        sale = li.div.find('div', 'p-commit').get_text(strip=True)
        shopname = li.div.find('div', 'p-shop').span.a.get('title')
        print(price, title, href, sale, shopname)
        df = pd.DataFrame(data={
            '价格': price,
            '标题': title,
            '链接': href,
            '销量': sale,
            '店铺名': shopname
        }, index=[0])
        # append one row at a time; utf_8_sig keeps Excel happy with Chinese text
        df.to_csv('filename.csv', mode='a', index=False, header=False, encoding='utf_8_sig')
def run(url):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # scroll down in steps so JD's lazily loaded items render
        for j in range(10):
            driver.execute_script('window.scrollBy(0, 1500)')
            time.sleep(1)
    except Exception as e:
        print('err:', e)
    else:
        parse(driver.page_source)
    finally:
        # quit() also terminates the chromedriver process; close() only
        # closes the window and can leave orphaned processes behind
        driver.quit()
def main():
    p = Pool(2)
    urls = []
    # JD search results use odd page numbers, hence the step of 2
    for page in range(21, PAGES, 2):
        url = BASEURL.format(quote(KEYWORD), page)
        print('url:', url)
        urls.append(url)
    for url in urls:
        p.apply_async(run, args=(url,))
    p.close()
    p.join()
    print('task over')

if __name__ == '__main__':
    main()
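On the hang mentioned at the top: apply_async swallows worker exceptions unless you call .get() on the returned result, so a crashed or stuck chromedriver can leave p.join() waiting forever. Below is a minimal sketch of a more defensive main(), assuming that is the cause; main_with_timeouts and the 120-second per-page budget are my own additions, not part of the original script.

from multiprocessing import Pool, TimeoutError

def main_with_timeouts():
    urls = [BASEURL.format(quote(KEYWORD), page)
            for page in range(21, PAGES, 2)]
    # the with-block calls terminate() on exit, killing any stuck worker
    with Pool(2) as p:
        results = [p.apply_async(run, args=(url,)) for url in urls]
        for url, res in zip(urls, results):
            try:
                # get() re-raises worker exceptions instead of hiding them
                res.get(timeout=120)  # assumed per-page budget; tune as needed
            except TimeoutError:
                print('timed out:', url)
            except Exception as e:
                print('worker failed:', url, e)
    print('task over')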
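Each pool worker also opens a visible Chrome window, which is slow and noisy. Here is a sketch of a headless driver factory; make_driver is a hypothetical helper of mine, and note that older Selenium releases take chrome_options= instead of options=.

def make_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')     # no visible browser window
    options.add_argument('--disable-gpu')  # historically needed on Windows
    return webdriver.Chrome(options=options)

With this in place, run() would call make_driver() instead of webdriver.Chrome().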
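Finally, parse() above reopens filename.csv once per product. If that becomes a bottleneck, a sketch that batches all rows of a page into a single append follows; parse_batched is my naming, and the output columns are meant to match the original.

def parse_batched(source):
    soup = BeautifulSoup(source, 'lxml')
    rows = []
    for li in soup.find_all('li', 'gl-item'):
        name_div = li.div.find('div', 'p-name')
        rows.append({
            '价格': li.find('div', 'p-price').get_text(strip=True),
            '标题': name_div.a.get('title'),
            '链接': 'http:' + name_div.a.get('href'),
            '销量': li.div.find('div', 'p-commit').get_text(strip=True),
            '店铺名': li.div.find('div', 'p-shop').span.a.get('title'),
        })
    # one file append per page instead of one per item
    pd.DataFrame(rows).to_csv('filename.csv', mode='a', index=False,
                              header=False, encoding='utf_8_sig')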