from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions
import json
import csv
import time
class Spider(object):
    def open_file(self):
        """
        Choose the output format and open the result file.
        :return:
        """
        self.fm = input('Enter the output format (txt, json, csv): ')
        while self.fm not in ('txt', 'json', 'csv'):
            self.fm = input('Invalid input, please re-enter the output format (txt, json, csv): ')
        if self.fm == 'txt':
            self.fd = open('Jd.txt', 'w', encoding='utf-8')
        elif self.fm == 'json':
            self.fd = open('Jd.json', 'w', encoding='utf-8')
        elif self.fm == 'csv':
            self.fd = open('Jd.csv', 'w', encoding='utf-8', newline='')
            # Write the header row once so the CSV columns are self-describing
            csv.writer(self.fd).writerow(('link', 'price', 'name', 'comment'))
    def get_options(self):
        """
        Configure the Chrome webdriver options.
        :return: the configured ChromeOptions object
        """
        options = webdriver.ChromeOptions()
        # Spoof a regular browser user agent (the Chrome switch is
        # 'user-agent', not 'User-Agent')
        options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36')
        options.add_argument('--no-sandbox')  # disable the sandbox
        # Disable GPU hardware acceleration
        options.add_argument('--disable-gpu')
        # Enable incognito mode
        # options.add_argument('--incognito')
        # Hide the "Chrome is being controlled by automated software" banner
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Ignore certificate errors
        options.add_argument('--ignore-certificate-errors')
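        # Optional (an addition, not in the original): run without a visible
        # window. Newer Chrome builds accept '--headless=new'; older ones
        # use plain '--headless'.
        # options.add_argument('--headless=new')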
        return options
    def open_browser(self):
        """
        Create and configure the webdriver instance.
        :return:
        """
        options = self.get_options()
        self.browser = webdriver.Chrome(options=options)
        self.browser.implicitly_wait(10)
        self.wait = WebDriverWait(self.browser, 10)
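    # Note: the Selenium documentation advises against mixing implicit waits
    # with explicit WebDriverWait, since the combined timeout behaviour can be
    # unpredictable; both are kept here to stay close to the original code.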
    def init_variable(self):
        """
        Initialise the shared crawl state.
        :return:
        """
        self.data = zip()  # empty one-shot iterator; refilled by parse_page
        self.isLast = False  # set to True once the last result page is reached
    def parse_page(self):
        """
        Parse one page of search results.
        :return:
        """
        try:
            # Each product card is an <li class="gl-item"> carrying its SKU id
            skus = self.wait.until(EC.presence_of_all_elements_located((By.XPATH, '//li[@class="gl-item"]')))
            skus = [item.get_attribute('data-sku') for item in skus]
            links = ['https://item.jd.com/{sku}.html'.format(sku=item) for item in skus]
            prices = self.wait.until(
                EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[2]/strong/i')))
            prices = [item.text for item in prices]
            names = self.wait.until(
                EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[3]/a/em')))
            names = [item.text for item in names]
            comments = self.wait.until(
                EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[4]/strong')))
            comments = [item.text for item in comments]
            self.data = zip(links, prices, names, comments)
        except selenium.common.exceptions.TimeoutException:
            # Retry once the page (hopefully) finishes loading
            print('parse_page: TimeoutException, retrying')
            self.parse_page()
        except selenium.common.exceptions.StaleElementReferenceException:
            print('parse_page: StaleElementReferenceException, refreshing')
            self.browser.refresh()
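    # Note: zip() produces a one-shot iterator, so self.data can only be
    # consumed once; crawl() therefore writes each page out immediately,
    # before parse_page() replaces the iterator with the next page's data.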
    def turn_page(self):
        """
        Advance to the next page of results.
        :return:
        """
        try:
            self.wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@class="pn-next"]'))).click()
            time.sleep(1)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
        except selenium.common.exceptions.TimeoutException:
            # WebDriverWait raises TimeoutException (never NoSuchElementException)
            # when the "next page" link is missing or disabled, so a timeout is
            # treated as having reached the last page; the original code
            # retried here and never terminated.
            print('turn_page: TimeoutException, assuming last page')
            self.isLast = True
        except selenium.common.exceptions.StaleElementReferenceException:
            print('turn_page: StaleElementReferenceException, refreshing')
            self.browser.refresh()
    def write_to_file(self):
        """
        Write the parsed results to the output file.
        :return:
        """
        if self.fm == 'txt':
            for item in self.data:
                self.fd.write('----------------------------------------\n')
                self.fd.write('link:' + str(item[0]) + '\n')
                self.fd.write('price:' + str(item[1]) + '\n')
                self.fd.write('name:' + str(item[2]) + '\n')
                self.fd.write('comment:' + str(item[3]) + '\n')
        if self.fm == 'json':
            temp = ('link', 'price', 'name', 'comment')
            for item in self.data:
                json.dump(dict(zip(temp, item)), self.fd, ensure_ascii=False)
                # One object per line (JSON Lines); the original concatenated
                # objects with no separator, producing an unparseable file
                self.fd.write('\n')
        if self.fm == 'csv':
            writer = csv.writer(self.fd)
            for item in self.data:
                writer.writerow(item)
    def close_file(self):
        self.fd.close()
    def close_browser(self):
        self.browser.quit()
    def crawl(self):
        """
        Main crawl loop.
        :return:
        """
        self.open_file()
        self.open_browser()
        self.init_variable()
        print('Crawl started')
        # Search JD for the keyword "笔记本" (laptop), URL-encoded below
        self.browser.get('https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8')
        time.sleep(1)
        # Scroll to the bottom so lazily loaded items are rendered
        self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(2)
        count = 0
        while not self.isLast:
            count += 1
            print('Crawling page ' + str(count) + '......')
            self.parse_page()
            self.write_to_file()
            self.turn_page()
        self.close_file()
        self.close_browser()
        print('Crawl finished')
if __name__ == '__main__':
    spider = Spider()
    spider.crawl()
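# A minimal sketch (an addition, not part of the original) of reading the
# JSON output back; it assumes the spider was run with the 'json' format,
# which now writes one object per line (JSON Lines):
#
#     with open('Jd.json', encoding='utf-8') as f:
#         records = [json.loads(line) for line in f]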