Scraping Taobao with Selenium and a headless browser, and storing the results in a database

Author: 公元2094年 | Published 2020-05-10 18:09

The script below attaches Selenium to a running Chrome instance, searches Taobao for a keyword, pages through the result list, and writes each product into a MySQL table:

    import re

    import pymysql
    from pyquery import PyQuery as pq
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC

    option = Options()
    # option.add_argument("--headless")  # ignored when attaching via debuggerAddress
    # Attach to a Chrome that is already running with --remote-debugging-port=9222,
    # so an existing (logged-in) Taobao session can be reused.
    option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    driver = webdriver.Chrome(options=option)  # "chrome_options" is deprecated; use "options"
    wait = WebDriverWait(driver, 10)  # explicit waits time out after 10 seconds
    
    def search():
        """Search for the keyword and return the pager's total-page text."""
        try:
            # Wait for the search box and its submit button to be usable.
            input_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_SearchForm > button"))
            )
            input_box.clear()
            input_box.send_keys('美食')  # search keyword: 美食 ("food")
            submit.click()
            # The pager footer holds text like "共 100 页,".
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
                )
            )
            get_product()  # scrape the first results page
            return total.text
        except TimeoutException:
            return search()  # page did not load in time; retry
    
    def next_page(page_n):
        print("Turning to page", page_n)
        try:
            # Type the target page number into the pager's jump box and submit.
            input_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                )
            )
            input_box.clear()
            input_box.send_keys(page_n)
            submit.click()
            # The jump is done once the highlighted pager item shows the target number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                str(page_n)
            ))
            get_product()
        except TimeoutException:
            next_page(page_n)  # retry the same page on timeout
    
    def main():
        total = search()
        # Pull the page count out of text like "共 100 页,".
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):  # page 1 was already scraped by search()
            next_page(i)
    
    def get_product():
        # Wait until the result list is rendered, then parse it with pyquery.
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        ))
        html = driver.page_source
        doc = pq(html)
        items = doc("#mainsrp-itemlist .items .item").items()
        print("Scraping items...")
        conn = pymysql.connect(host='qxx2094.cn', user='root', password='qxxmysql',
                               database="test", port=3306)
        cursor = conn.cursor()
        sql = """
            INSERT INTO sj(image, price, deal, title, shop, location)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        try:
            # Insert every item on the current page.
            for item in items:
                image = item.find('.pic .img').attr("src")
                price = item.find('.price').text()
                deal = item.find('.deal-cnt').text()
                title = item.find('.title').text()
                shop = item.find('.shopname').text()
                location = item.find('.location').text()
                cursor.execute(sql, (image, price, deal, title, shop, location))
            conn.commit()
            print("Insert succeeded")
        except Exception:
            conn.rollback()
            print("Insert failed")
        finally:
            conn.close()

    if __name__ == '__main__':
        main()
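
Because debuggerAddress attaches to a browser instead of launching one, Chrome must already be running with remote debugging enabled before the script starts. A minimal launch sketch, assuming the binary is on PATH as google-chrome and using a scratch profile directory (both are assumptions, not from the post; adjust per platform):

    import subprocess

    # Start a Chrome that the scraper can attach to at 127.0.0.1:9222.
    # "google-chrome" and the profile path are assumptions; on Windows/macOS,
    # point at the local Chrome binary instead.
    subprocess.Popen([
        "google-chrome",
        "--remote-debugging-port=9222",        # matches debuggerAddress above
        "--user-data-dir=/tmp/taobao-profile"  # dedicated profile avoids clashing with a running Chrome
    ])

Logging into Taobao once in that window lets the scraper reuse the authenticated session, which is presumably why the script attaches to a visible browser rather than enabling --headless.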
    
    
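The INSERT statement assumes a table named sj already exists in the test database. The post never shows its schema, so the following DDL is only a sketch: the column names come from the INSERT, while the types are assumptions that store every scraped field as text.

    import pymysql

    # Hypothetical schema for the sj table; the types are assumptions, since
    # the original post does not show the DDL.
    conn = pymysql.connect(host='qxx2094.cn', user='root', password='qxxmysql',
                           database='test', port=3306)
    try:
        with conn.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS sj (
                    id       INT AUTO_INCREMENT PRIMARY KEY,
                    image    VARCHAR(512),   -- product image URL
                    price    VARCHAR(64),    -- price text as shown on the page
                    deal     VARCHAR(64),    -- deal-count text, e.g. "1000人付款"
                    title    VARCHAR(256),
                    shop     VARCHAR(128),
                    location VARCHAR(64)
                ) DEFAULT CHARSET = utf8mb4
            """)
    finally:
        conn.close()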
