
Using a headless Chrome browser and pyquery to scrape images from Mzitu

By 非鱼2018 | Published 2019-08-10 14:25

    Key techniques used:
    1. selenium + headless Chrome (a minimal sketch of the opener trick follows this list)
    2. pyquery for parsing the rendered HTML
    3. multiprocessing for crawling pages in parallel
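
A minimal sketch of the core trick first: pyquery accepts an `opener` callable, so a function that loads the page in headless Chrome and returns the rendered HTML can be plugged straight in. The URL and selector below are placeholders for illustration, not the ones used in the article.

    # Illustrative sketch only: headless Chrome as a pyquery opener
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from pyquery import PyQuery as pq

    def render_with_chrome(url, **kwargs):
        opts = Options()
        opts.add_argument('--headless')
        driver = webdriver.Chrome(options=opts)
        try:
            driver.get(url)
            return driver.page_source  # HTML after JavaScript has run
        finally:
            driver.quit()

    doc = pq(url='https://example.com', opener=render_with_chrome)  # placeholder URL
    print(doc('title').text())

The full script from the article follows.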

    #coding=utf-8
    import os
    import requests
    from hashlib import md5
    from requests.exceptions import RequestException
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from pyquery import PyQuery as pq
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from multiprocessing import Pool
    
    # Shared headless-Chrome options; each call to open_url builds its own driver from them
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    def open_url(url):
        """Render the page in headless Chrome and return the HTML; used as pyquery's opener."""
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        # Wait up to 10s for the pagination links to appear before reading the source
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "span.page-numbers.dots +a.page-numbers"))
        )
        html = driver.page_source
        driver.quit()
        return html
    def download_image(url):
        """Fetch one image and pass the bytes to save_image."""
        print("downloading %s" % url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                save_image(response.content)
            return None
        except RequestException:
            print("failed to download %s" % url)
            return None
    def save_image(content):
        """Save the image under ./picture, named by the MD5 of its content to skip duplicates."""
        print("saving image")
        save_dir = os.path.join(os.getcwd(), 'picture')
        os.makedirs(save_dir, exist_ok=True)
        file_path = '{0}/{1}.{2}'.format(save_dir, md5(content).hexdigest(), 'jpg')
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as fp:
                fp.write(content)
    def main(pagenum):
        """Parse one listing page and download every image it references."""
        print("parsing page %s" % str(pagenum))
        url = 'https://www.mzitu.com/jiepai/comment-page-' + str(pagenum)
        # Feed the headless-rendered HTML to pyquery via the opener hook
        res = pq(url=url, opener=open_url)
        for i in res('p>img').items():
            # The real image URL is lazy-loaded through the data-original attribute
            print(i.attr('data-original'))
            download_image(i.attr('data-original'))
    if __name__ == '__main__':
        # Read the total page count from the first page's pagination links
        pagenum = 1
        url = 'https://www.mzitu.com/jiepai/comment-page-' + str(pagenum)
        res = pq(url=url, opener=open_url)
        page_count = res('span.page-numbers.dots +a.page-numbers').text()
        print('page_count:%s' % page_count)
        # Crawl every page with a pool of 5 worker processes
        pool = Pool(5)
        pool.map(main, [i + 1 for i in range(int(page_count))])
        pool.close()
        pool.join()
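
Usage note: pool.map blocks until every page has been processed, and each task launches its own headless Chrome instance via open_url, so a pool size of 5 means up to five browsers running at once. Below is a tiny self-contained illustration of the same Pool + map fan-out, with a stand-in worker instead of the real crawler:

    # Illustrative only: the Pool + map pattern used above, with a dummy worker
    from multiprocessing import Pool

    def fake_page_worker(pagenum):
        # stand-in for main(pagenum); just returns a label
        return 'parsed page %d' % pagenum

    if __name__ == '__main__':
        with Pool(5) as pool:
            print(pool.map(fake_page_worker, range(1, 6)))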
     
    
    
    
