美文网首页
selenium for jd

selenium for jd

作者: 是东东 | 来源:发表于2022-09-03 12:40 被阅读0次
    from selenium import webdriver
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    import selenium.common.exceptions
    import json
    import csv
    import time
    
    
    class Spider(object):
        """Scrape JD ("京东") notebook search results with Selenium.

        Workflow: ask the user for an output format, start a Chrome driver,
        then repeatedly parse a result page, persist it, and click "next"
        until the last page is reached.
        """

        def open_file(self):
            """Ask for the output format (txt/json/csv) and open the file.

            Sets ``self.fm`` (chosen format) and ``self.fd`` (open handle).
            The handle is closed later by :meth:`close_file`.
            :return: None
            """
            self.fm = input('请输入文件保存格式(txt、json、csv):')
            # Re-prompt until one of the supported formats is entered.
            while self.fm not in ('txt', 'json', 'csv'):
                self.fm = input('输入错误,请重新输入文件保存格式(txt、json、csv):')
            if self.fm == 'csv':
                # csv.writer manages its own line endings, hence newline=''.
                self.fd = open('Jd.csv', 'w', encoding='utf-8', newline='')
            else:
                # Produces 'Jd.txt' or 'Jd.json', same names as before.
                self.fd = open('Jd.' + self.fm, 'w', encoding='utf-8')

        def getOptions(self):
            """Build and return the ChromeOptions used for the crawl.

            :return: configured ``webdriver.ChromeOptions``
            """
            options = webdriver.ChromeOptions()
            options.add_argument(
                'User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36')
            options.add_argument('--no-sandbox')  # disable the sandbox
            # Disable GPU acceleration.
            options.add_argument('--disable-gpu')
            # Incognito mode (left disabled).
            # options.add_argument('--incognito')
            # Hide the "browser is being controlled by automation" banner.
            options.add_experimental_option('excludeSwitches', ['enable-automation'])
            # Ignore certificate errors.
            options.add_argument('--ignore-certificate-errors')
            return options

        def open_browser(self):
            """Create the Chrome driver plus implicit/explicit waits.

            :return: None
            """
            self.browser = webdriver.Chrome(options=self.getOptions())
            self.browser.implicitly_wait(10)
            self.wait = WebDriverWait(self.browser, 10)

        def init_variable(self):
            # self.data: iterable of (link, price, name, comment) tuples;
            # zip() with no arguments is an empty iterator.
            self.data = zip()
            # self.isLast: set True once the final result page is reached.
            self.isLast = False

        def parse_page(self, _retries=3):
            """Extract link/price/name/comment of every item on the page.

            :param _retries: internal bound on TimeoutException retries.
                Bug fix: the original retried via unbounded recursion, so a
                permanently broken page recursed forever.
            :return: None (result stored in ``self.data``)
            """
            try:
                skus = self.wait.until(EC.presence_of_all_elements_located((By.XPATH, '//li[@class="gl-item"]')))
                skus = [item.get_attribute('data-sku') for item in skus]
                links = ['https://item.jd.com/{sku}.html'.format(sku=item) for item in skus]
                prices = self.wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[2]/strong/i')))
                prices = [item.text for item in prices]
                names = self.wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[3]/a/em')))
                names = [item.text for item in names]
                comments = self.wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[4]/strong')))
                comments = [item.text for item in comments]
                self.data = zip(links, prices, names, comments)
            except selenium.common.exceptions.TimeoutException:
                print('parse_page: TimeoutException')
                if _retries > 0:
                    self.parse_page(_retries - 1)
            except selenium.common.exceptions.StaleElementReferenceException:
                print('parse_page: StaleElementReferenceException')
                self.browser.refresh()

        def turn_page(self, _retries=3):
            """Click the "next page" link; set ``self.isLast`` on the last page.

            Bug fix: ``WebDriverWait.until`` raises TimeoutException — not
            NoSuchElementException — when the link is absent, so the original
            NoSuchElementException branch was dead and the spider recursed
            forever on the last page. After ``_retries`` timeouts the page is
            now treated as the last one.

            :param _retries: bound on TimeoutException retries
            :return: None
            """
            try:
                self.wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@class="pn-next"]'))).click()
                time.sleep(1)
                # Scroll to the bottom so lazy-loaded items are rendered.
                self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                time.sleep(2)
            except selenium.common.exceptions.NoSuchElementException:
                self.isLast = True
            except selenium.common.exceptions.TimeoutException:
                print('turn_page: TimeoutException')
                if _retries > 0:
                    self.turn_page(_retries - 1)
                else:
                    self.isLast = True
            except selenium.common.exceptions.StaleElementReferenceException:
                print('turn_page: StaleElementReferenceException')
                self.browser.refresh()

        def write_to_file(self):
            """Append the records in ``self.data`` to the open output file.

            :return: None
            """
            if self.fm == 'txt':
                for item in self.data:
                    self.fd.write('----------------------------------------\n')
                    self.fd.write('link:' + str(item[0]) + '\n')
                    self.fd.write('price:' + str(item[1]) + '\n')
                    self.fd.write('name:' + str(item[2]) + '\n')
                    self.fd.write('comment:' + str(item[3]) + '\n')
            elif self.fm == 'json':
                temp = ('link', 'price', 'name', 'comment')
                for item in self.data:
                    json.dump(dict(zip(temp, item)), self.fd, ensure_ascii=False)
                    # Bug fix: one object per line (JSON Lines). The original
                    # concatenated objects with no delimiter, producing a file
                    # no JSON parser could read back.
                    self.fd.write('\n')
            elif self.fm == 'csv':
                csv.writer(self.fd).writerows(self.data)

        def close_file(self):
            """Close the output file handle."""
            self.fd.close()

        def close_browser(self):
            """Quit the Chrome driver."""
            self.browser.quit()

        def crawl(self):
            """Main crawl loop: open resources, page through results, clean up.

            Robustness fix: cleanup now runs in ``finally`` so the file handle
            and the browser are released even if a page blows up mid-crawl
            (the original leaked both on any uncaught exception).

            :return: None
            """
            self.open_file()
            self.open_browser()
            self.init_variable()
            print('开始爬取')
            try:
                self.browser.get('https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8')
                time.sleep(1)
                self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                time.sleep(2)
                count = 0
                while not self.isLast:
                    count += 1
                    print('正在爬取第 ' + str(count) + ' 页......')
                    self.parse_page()
                    self.write_to_file()
                    self.turn_page()
            finally:
                self.close_file()
                self.close_browser()
            print('结束爬取')
    
    
    if __name__ == '__main__':
        # Script entry point: build the spider and run the full crawl.
        jd_spider = Spider()
        jd_spider.crawl()
    
    

    相关文章

      网友评论

          本文标题:selenium for jd

          本文链接:https://www.haomeiwen.com/subject/xoyunrtx.html