
Scraping Taobao with Selenium + a headless browser and storing the results in a database

Author: 公元2094年 | Published 2020-05-10 18:09

The script below drives Chrome through Selenium, searches Taobao for a keyword, pages through the result list, parses each page with pyquery, and inserts every item into a MySQL table.

Code:

import re

import pymysql
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

option = Options()
# option.add_argument("--headless")
# Attach to a Chrome instance that is already running with remote debugging enabled
option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
driver = webdriver.Chrome(options=option)  # `chrome_options` is deprecated; use `options`
wait = WebDriverWait(driver, 10)
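# Note: the debuggerAddress option above attaches Selenium to a Chrome that was
# started beforehand with remote debugging enabled, e.g. with something like
#   chrome --remote-debugging-port=9222 --user-data-dir=/tmp/taobao-profile
# (the exact command and profile path are an illustration, not from the original post).
# To run truly headless, as the title suggests, drop the debuggerAddress line and
# uncomment option.add_argument("--headless") before creating the driver.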

def search():
    # Enter the keyword on the Taobao home page, submit the search,
    # scrape the first result page and return the pager's "total pages" text.
    try:
        search_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_SearchForm > button"))
        )
        search_input.clear()
        search_input.send_keys('美食')  # search keyword ("gourmet food")
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total"))
        )
        get_product()
        return total.text
    except TimeoutException:
        # Retry on timeout
        return search()

def next_page(page_n):
    # Jump to page `page_n` via the pager's page-number input, wait until that
    # page is marked as active, then scrape it.
    print("turning page")
    try:
        page_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input"))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit"))
        )
        page_input.clear()
        page_input.send_keys(page_n)
        submit.click()
        # Wait until the active page number in the pager equals page_n
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"), str(page_n)))
        get_product()
    except TimeoutException:
        # Retry the same page on timeout
        next_page(page_n)

def main():
    total = search()
    # Extract the number of result pages from the pager text
    total = int(re.compile(r'(\d+)').search(total).group(1))
    for i in range(2, total + 1):
        next_page(i)
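# Note: the pager's total element is assumed to read something like "共 100 页，"
# ("100 pages in total"); the regular expression above grabs the first run of digits,
# so `total` becomes 100 and pages 2..100 are then visited in order.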

def get_product():
    # Parse the current result page with pyquery and write every item to MySQL.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    html = driver.page_source
    doc = pq(html)
    items = doc("#mainsrp-itemlist .items .item").items()
    print("start scraping")
    conn = pymysql.connect(host='qxx2094.cn', user='root', password='qxxmysql', database="test", port=3306)
    cursor = conn.cursor()
    sql = """
        INSERT INTO sj(image, price, deal, title, shop, location)
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    try:
        # Insert every item on the page, not just the last one
        for item in items:
            product = (
                item.find('.pic .img').attr("src"),
                item.find('.price').text(),
                item.find('.deal-cnt').text(),
                item.find('.title').text(),
                item.find('.shopname').text(),
                item.find('.location').text(),
            )
            cursor.execute(sql, product)
        conn.commit()
        print("insert succeeded")
    except pymysql.MySQLError:
        conn.rollback()
        print("insert failed")
    finally:
        conn.close()

if __name__ == '__main__':
    main()
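The INSERT statement assumes a table named sj already exists in the test database; the post does not show its definition. A minimal sketch of a compatible schema (column names taken from the code, column types and sizes are my assumption) could be created like this:

import pymysql

# Hypothetical schema for the `sj` table used by get_product();
# the VARCHAR sizes are assumptions, not taken from the original post.
conn = pymysql.connect(host='qxx2094.cn', user='root', password='qxxmysql', database="test", port=3306)
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS sj (
            image    VARCHAR(512),
            price    VARCHAR(64),
            deal     VARCHAR(64),
            title    VARCHAR(256),
            shop     VARCHAR(128),
            location VARCHAR(64)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()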
