参考此文:https://www.jianshu.com/p/80c602afc623
#! -*- coding:utf-8 -*-
# 抓取商品信息并保存到数据库或者EXCEL
from multiprocessing.pool import Pool
import pymongo
from selenium import webdriver
import xlwt
# Selenium: start Chrome through the chromedriver binary at this local path.
driver_path = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)

# Excel: a workbook with one sheet whose first row holds the column headers
# (title, listed price, number of buyers).
# In sheet.write(row, col, value) the first index is the row, the second the column.
f = xlwt.Workbook(encoding="utf8")
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
for _col, _header in enumerate(('标题', '标价', '购买人数')):
    sheet01.write(0, _col, _header)

# MongoDB: connection settings and a handle on the target database.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'nvzhuang'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Search-result page to request; the page number is appended after "page=".
url = "https://uland.taobao.com/sem/tbsearch?refpid=mm_26632258_3504122_32538762&keyword=%E5%A5%B3%E8%A3%85&clk1=44c369a534bf95506aa0a87518971645&upsid=44c369a534bf95506aa0a87518971645&page="

# Default page count.
number = 1
# Get the total number of result pages.
def get_maxpage(url, max_retries=3):
    """Return the number of result pages reported by the page at *url*.

    Loads *url* in the module-level ``driver``, reads the text of the
    ``.totalPage`` element, strips the surrounding ``共`` / ``页``
    characters and converts the remainder to an int.  The browser window is
    closed once the count has been read successfully.

    Args:
        url: Address of the search page to open.
        max_retries: Maximum number of load/parse attempts before giving up.

    Returns:
        The page count as an int, or the module-level default ``number``
        when every attempt fails.
    """
    # Retry in a bounded loop instead of recursing: a persistent failure can
    # then neither recurse without end nor lose the value of a later attempt.
    for attempt in range(1, max_retries + 1):
        try:
            driver.get(url)
            total_page = driver.find_element_by_css_selector('.totalPage').text
            page_count = int(str(total_page).strip('共').strip('页'))
        except Exception as exc:
            # Page load, element lookup and int() parsing can all fail here;
            # report the reason and try again.
            print('获取最大页数失败, 第', attempt, '次尝试:', exc)
            continue
        # Only close the window after a successful read, so that a retry
        # never runs against an already closed window.
        driver.close()
        return page_count
    # Every attempt failed: fall back to the module-level default page count.
    return number
# Parse the item elements of one result page.
def _extract_item(div):
    """Build the record dict for one item element of the result list."""
    def text_of(xpath):
        return div.find_element_by_xpath(xpath).text

    return {
        'title': text_of(".//span[@class='title']"),
        'shopName': text_of(".//span[@class='shopNick']"),
        'price': text_of(".//span[@class='pricedetail']/strong"),
        'byNumber': text_of(".//span[@class='payNum']"),
        'score': text_of(".//span[@class='dsr-info-num']"),
        'href': div.find_element_by_xpath(".//a").get_attribute("href"),
        'image': div.find_element_by_xpath(".//img").get_attribute("src"),
    }


def parse_page(pagenum):
    """Scrape result page *pagenum* and store its items in MongoDB.

    Opens ``url + str(pagenum)`` in the module-level ``driver``, turns every
    item element of the result list into a dict (title, shop name, price,
    buyer count, score, link and image URL) and hands the collected list to
    ``save_to_mongo``.

    Args:
        pagenum: Page number appended to the base ``url``.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from selenium.common.exceptions import NoSuchElementException

    print("在正抓取第",pagenum,'页...')
    driver.get(url + str(pagenum))
    divs = driver.find_elements_by_xpath("//div[@id='searchResult']/div[@id='ItemWrapper']/div[@class='item']")

    contents = []
    for div in divs:
        try:
            info = _extract_item(div)
        except NoSuchElementException as exc:
            # One item lacking a field must not throw away the rest of the page.
            print('跳过缺少字段的商品:', exc)
            continue
        contents.append(info)

    if not contents:
        # Nothing was scraped; an empty bulk insert would only fail.
        print('第', pagenum, '页没有抓取到商品')
        return
    # save_infos(contents)  # alternative sink: write the rows to the Excel sheet
    save_to_mongo(contents)
# Save to MongoDB.
def save_to_mongo(result):
    """Insert scraped item(s) into the ``MONGO_TABLE`` collection of ``db``.

    Failures are reported on stdout instead of being raised, so one failed
    write does not abort the scraping run.

    Args:
        result: A single document (dict) or a list of documents.  An empty
            list is ignored.
    """
    is_single = isinstance(result, dict)
    if not is_single and not result:
        # An empty bulk insert is rejected by pymongo; there is nothing to do.
        return
    try:
        collection = db[MONGO_TABLE]
        # Collection.insert() is deprecated and absent from PyMongo 4;
        # insert_one()/insert_many() are its replacements.
        if is_single:
            collection.insert_one(result)
        else:
            collection.insert_many(result)
    except Exception as exc:
        # Best-effort write: keep going, but show why the insert failed.
        print('存储到MONGODB失败', exc)
    else:
        print('保存到MONGODB成功')
# Save to Excel.
def save_infos(contents):
    """Write title, price and buyer count of each item to the sheet and save it.

    Rows are written below the header row of the module-level ``sheet01``;
    the workbook ``f`` is then saved to the desktop ``.xls`` file.

    Args:
        contents: List of item dicts with the keys ``'title'``, ``'price'``
            and ``'byNumber'`` (the legacy key ``'number'`` is accepted as a
            fallback for the buyer count).
    """
    # NOTE(review): rows always start at row 1, so a second call overwrites
    # the rows written by the previous one - confirm whether pages should
    # accumulate instead.
    for row, content in enumerate(contents, start=1):
        # sheet.write(row, col, value): first index is the row, second the column.
        sheet01.write(row, 0, content['title'])
        sheet01.write(row, 1, content['price'])
        # The scraped dicts store the buyer count under 'byNumber'.
        buyers = content['byNumber'] if 'byNumber' in content else content['number']
        sheet01.write(row, 2, buyers)
    f.save(r"C:\Users\Administrator\Desktop\taobao_nvzhuang.xls")
def main():
    """Look up the number of result pages and scrape all of them in parallel.

    The page count comes from ``get_maxpage(url)``; when that yields ``None``
    the module-level default ``number`` is used.  Pages ``1..count`` are then
    distributed over a ``multiprocessing`` pool running ``parse_page``.
    """
    max_page = get_maxpage(url)
    if max_page is None:
        # No page count could be determined; fall back to the default.
        max_page = number

    # Fan the pages out over a pool of worker processes (not threads).
    pool = Pool()
    try:
        pool.map(parse_page, range(1, max_page + 1))
    finally:
        # Release the worker processes even when a page raises.
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()
效果如下:

网友评论