# NOTE: This scraper handles only the first result page (no pagination);
# with a small change it could reuse the pagination logic from the Taobao scraper.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from lxml import etree
def search():
    """Open jd.com, search for '鞋子' (shoes), scrape the first result page,
    and return the total number of result pages as text.

    Uses the module-level ``browser`` and ``wait`` globals set up in
    ``__main__``.

    Returns:
        str: text of the total-page-count element (e.g. ``'100'``).

    Raises:
        TimeoutException: if the page still fails to load after the last retry.
    """
    # Retry a bounded number of times instead of recursing on every
    # TimeoutException (the original called search() from its own except
    # clause, which could recurse without limit if the site stays slow).
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            browser.get('https://www.jd.com/')
            # Named 'search_box' to avoid shadowing the builtin `input`.
            search_box = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="key"]')))
            search_box.clear()
            # Pressing ENTER submits the search form directly.
            search_box.send_keys('鞋子', Keys.ENTER)
            total = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="J_bottomPage"]/span[2]/em[1]/b')))
            get_products()
            return total.text
        except TimeoutException:
            if attempt == max_attempts:
                raise
def get_products():
    """Parse the currently loaded JD result page and print one dict per item.

    Reads the page source from the module-level ``browser`` and waits (via
    the module-level ``wait``) for the pager element before parsing, as a
    proxy for "results are loaded".

    Each printed dict pairs an image URL, a comment count, and a shop name
    by position; keys are kept identical to the original ('images',
    'comment', 'name').
    """
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '//*[@class="page clearfix"]')))
    doc = etree.HTML(browser.page_source)
    images = doc.xpath('//*[@id="J_goodsList"]/ul/li/div/div[1]/a/img/@src')
    comments = doc.xpath('//*[@class="p-commit"]//a/text()')
    names = doc.xpath('//*[@class="curr-shop"]/@title')
    # Bug fix: the original stored the WHOLE lists in every record instead
    # of the i-th element, printing all images/comments/names once per item.
    # zip() pairs them element-wise and stops at the shortest list, so
    # mismatched lengths can no longer raise IndexError either.
    for image, comment, name in zip(images, comments, names):
        print({
            'images': image,
            'comment': comment,
            'name': name,
        })
if __name__ == '__main__':
    # `browser` and `wait` stay module-level globals on purpose:
    # search() and get_products() reference them directly.
    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 10)
    try:
        search()
    finally:
        # Bug fix: the original never closed the driver, leaking the
        # chromedriver/Chrome processes after the script finished.
        browser.quit()
# (Residue from the scraped blog post: "网友评论" = "reader comments" footer.)