Scraping Taobao with Selenium and a headless browser, and storing the results in a database

Author: 公元2094年 | Published 2020-05-10 18:09

The script below attaches Selenium to a running Chrome instance, searches Taobao for a keyword, pages through the result list, and writes each product into a MySQL table:

    import re

    import pymysql
    from pyquery import PyQuery as pq
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC

    option = Options()
    # option.add_argument("--headless")  # ignored when attaching via debuggerAddress
    # Attach to a Chrome that is already running with --remote-debugging-port=9222,
    # so an existing (logged-in) Taobao session can be reused.
    option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    driver = webdriver.Chrome(options=option)  # "chrome_options" is deprecated; use "options"
    wait = WebDriverWait(driver, 10)  # explicit waits time out after 10 seconds
    
    def search():
        """Search for the keyword and return the pager's total-page text."""
        try:
            # Wait for the search box and its submit button to be usable.
            input_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_SearchForm > button"))
            )
            input_box.clear()
            input_box.send_keys('美食')  # search keyword: 美食 ("food")
            submit.click()
            # The pager footer holds text like "共 100 页,".
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
                )
            )
            get_product()  # scrape the first results page
            return total.text
        except TimeoutException:
            return search()  # page did not load in time; retry
    
    def next_page(page_n):
        print("Turning to page", page_n)
        try:
            # Type the target page number into the pager's jump box and submit.
            input_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                )
            )
            input_box.clear()
            input_box.send_keys(page_n)
            submit.click()
            # The jump is done once the highlighted pager item shows the target number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                str(page_n)
            ))
            get_product()
        except TimeoutException:
            next_page(page_n)  # retry the same page on timeout
    
    def main():
        total = search()
        # Pull the page count out of text like "共 100 页,".
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):  # page 1 was already scraped by search()
            next_page(i)
    
    def get_product():
        # Wait until the result list is rendered, then parse it with pyquery.
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        ))
        html = driver.page_source
        doc = pq(html)
        items = doc("#mainsrp-itemlist .items .item").items()
        print("Scraping items...")
        conn = pymysql.connect(host='qxx2094.cn', user='root', password='qxxmysql',
                               database="test", port=3306)
        cursor = conn.cursor()
        sql = """
            INSERT INTO sj(image, price, deal, title, shop, location)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        try:
            # Insert every item on the current page.
            for item in items:
                image = item.find('.pic .img').attr("src")
                price = item.find('.price').text()
                deal = item.find('.deal-cnt').text()
                title = item.find('.title').text()
                shop = item.find('.shopname').text()
                location = item.find('.location').text()
                cursor.execute(sql, (image, price, deal, title, shop, location))
            conn.commit()
            print("Insert succeeded")
        except Exception:
            conn.rollback()
            print("Insert failed")
        finally:
            conn.close()

    if __name__ == '__main__':
        main()
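
Because debuggerAddress attaches to a browser instead of launching one, Chrome must already be running with remote debugging enabled before the script starts. A minimal launch sketch, assuming the binary is on PATH as google-chrome and using a scratch profile directory (both are assumptions, not from the post; adjust per platform):

    import subprocess

    # Start a Chrome that the scraper can attach to at 127.0.0.1:9222.
    # "google-chrome" and the profile path are assumptions; on Windows/macOS,
    # point at the local Chrome binary instead.
    subprocess.Popen([
        "google-chrome",
        "--remote-debugging-port=9222",        # matches debuggerAddress above
        "--user-data-dir=/tmp/taobao-profile"  # dedicated profile avoids clashing with a running Chrome
    ])

Logging into Taobao once in that window lets the scraper reuse the authenticated session, which is presumably why the script attaches to a visible browser rather than enabling --headless.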
    
    
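The INSERT statement assumes a table named sj already exists in the test database. The post never shows its schema, so the following DDL is only a sketch: the column names come from the INSERT, while the types are assumptions that store every scraped field as text.

    import pymysql

    # Hypothetical schema for the sj table; the types are assumptions, since
    # the original post does not show the DDL.
    conn = pymysql.connect(host='qxx2094.cn', user='root', password='qxxmysql',
                           database='test', port=3306)
    try:
        with conn.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS sj (
                    id       INT AUTO_INCREMENT PRIMARY KEY,
                    image    VARCHAR(512),   -- product image URL
                    price    VARCHAR(64),    -- price text as shown on the page
                    deal     VARCHAR(64),    -- deal-count text, e.g. "1000人付款"
                    title    VARCHAR(256),
                    shop     VARCHAR(128),
                    location VARCHAR(64)
                ) DEFAULT CHARSET = utf8mb4
            """)
    finally:
        conn.close()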
