'''
1. Purpose: fetch product information from a given Taobao shop. Method: open the
   shop's home page and sort the listing by sales volume.
2. Output is a list whose elements are dicts; it can later be saved to an Excel
   sheet or similar.
3. Whether Taobao's "sold" count covers the last 30 days or is cumulative is
   unclear; it is most likely the last 30 days.
'''
from selenium import webdriver
import time
import urllib.request
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# NOTE(review): this module-level constant is never referenced; the same URL is
# duplicated as TbSpider.url3 — candidate for removal once confirmed unused.
url = 'https://shop508375259.taobao.com/search.htm?spm=a1z10.1-c.0.0.59f334b5ictxhE&search=y&orderType=hotsell_desc'
class TbSpider(object):
    """Scrape title, price and sales count for items in one Taobao shop.

    Flow: open a Taobao search page so the user can log in manually (QR
    scan), jump to the shop's listing sorted by sales, then page through
    the results collecting item data into ``self.data``.
    """

    # Raw string so the Windows path needs no escaping tricks.
    driver_path = r"D:\chromedriver_win32\chromedriver.exe"

    def __init__(self):
        # NOTE(review): ``executable_path`` is the Selenium 3 calling style;
        # Selenium 4 prefers a Service object — confirm installed version.
        self.driver = webdriver.Chrome(executable_path=TbSpider.driver_path)
        self.keywords = "ipad"
        # Search-result page, used only to reach a login prompt.
        self.url = "https://s.taobao.com/search?q=" + self.keywords
        # Unused sample JD item page, kept for reference.
        self.url2 = "https://item.jd.com/17033085487.html"
        # Shop listing sorted by sales volume (orderType=hotsell_desc).
        self.url3 = 'https://shop508375259.taobao.com/search.htm?spm=a1z10.1-c.0.0.59f334b5ictxhE&search=y&orderType=hotsell_desc'
        # Accumulated item dicts: {'标题': ..., '价格': ..., '已售件数': ...}.
        self.data = []

    def get_html(self):
        """Open the search page for manual login, then go to the shop page.

        A fixed ``time.sleep`` is unreliable here because the manual login
        takes a variable amount of time, so explicit waits are used on
        elements that only appear once each page has loaded.
        """
        self.driver.get(self.url)
        self.driver.maximize_window()
        # Wait (up to 100 s) for the post-login search page to render.
        element = WebDriverWait(self.driver, 100).until(
            EC.presence_of_element_located((By.ID, "tabFilterMall")))
        print('显示等待完成。')
        print(element)
        # Jump to the shop's best-selling listing page.
        self.driver.get(self.url3)
        element = WebDriverWait(self.driver, 100).until(
            EC.presence_of_element_located((By.ID, "J_HesperCats")))
        print('显示等待完成--看到了所有宝贝。')
        print(element)

    def drop_down(self):
        """Scroll the page down in steps so lazy-loaded items render."""
        # Scroll to 10%, 30%, 50%, 70%, 90% of the page height in turn.
        for x in range(1, 11, 2):
            time.sleep(0.5)
            fraction = x / 10
            js = ('document.documentElement.scrollTop = '
                  'document.documentElement.scrollHeight * %f' % fraction)
            self.driver.execute_script(js)
        print('翻页到底了,老铁')

    def get_one_page_info(self):
        """Collect title, price and sales count for every item on this page."""
        # ``find_element(By.XPATH, ...)`` replaces the removed
        # ``find_element(s)_by_xpath`` helpers; also avoids shadowing ``list``.
        rows = self.driver.find_elements(By.XPATH, '//*[@class="item3line1"]')
        for row in rows:
            # Each row holds three items: dl[1] .. dl[3].
            for i in range(1, 4):
                title = row.find_element(
                    By.XPATH, './/dl[' + str(i) + ']/dd[1]/a').text
                price = row.find_element(
                    By.XPATH, './/dl[' + str(i) + ']/dd[1]/div/div/span[2]').text
                sale = row.find_element(
                    By.XPATH, './/dl[' + str(i) + ']/dd[1]/div/div[2]/span').text
                self.data.append({'标题': title, '价格': price, '已售件数': sale})

    def fanye(self):
        """Follow the "next page" link and wait until the new page is ready."""
        next_url = self.driver.find_element(
            By.XPATH,
            '//*[@id="J_ShopSearchResult"]/div/div[2]/div[10]/a[10]'
        ).get_attribute('href')
        self.driver.get(next_url)
        # Wait for the listing container before scrolling.
        element = WebDriverWait(self.driver, 100).until(
            EC.presence_of_element_located((By.ID, "J_HesperCats")))
        print('刷新成功--可以进行向下滚动了。')
        self.drop_down()
        element = WebDriverWait(self.driver, 100).until(
            EC.presence_of_element_located((By.ID, "J_ShopSearchResult")))
        print('翻页成功--可以进行操作了。')

    def run(self):
        """Drive the full crawl: login, then scrape four pages of results."""
        self.get_html()
        self.drop_down()
        for i in range(4):
            self.get_one_page_info()
            self.fanye()
        for da in self.data:
            print(da)
if __name__ == "__main__":
    # Entry point: build the spider and start the crawl.
    TbSpider().run()
# (stray footer text "网友评论" from the copied web page — not part of the script)