downloading images from particul

Author: 狼无雨雪 | Published 2019-07-04 18:37

    Some particular sites, comic sites for example, put no login restriction on image access, and the image URLs follow a regular pattern, so the images can be downloaded with a simple hand-written crawler. Below is the sample program I used to download from 哈哈漫画; interested readers can take a look.

    # Method 1: crawl images from a particular site
    import os

    import requests

    prefix = "有意思的网址/files/"          # placeholder for the redacted site
    base_url = "有意思的网址/files/80606/"  # placeholder album base URL
    begin_page = 27956
    end_page = 63655

    for page_id in range(begin_page, end_page + 1):
        url = base_url + str(page_id) + "/"

        # Mirror the remote directory layout locally.
        dir_path = url.replace(prefix, "")
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        # Fetch 1.jpg, 2.jpg, ... until the first request fails.
        index = 1
        while True:
            url_path = url + str(index) + ".jpg"
            response = requests.get(url_path)
            if not response.ok:
                break

            filename = url_path.replace(prefix, "")
            print(url_path, filename)
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(128):
                    f.write(chunk)

            index += 1
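
    The loop above opens a fresh connection for every request and will hang if the server stalls. A minimal hardening sketch (the helper name fetch_pages and the timeout value are my own, not from the original program): reuse one requests.Session and pass a timeout so a single slow page cannot block the whole crawl.

    import os

    import requests

    session = requests.Session()  # reuse one connection across requests

    # Hypothetical helper, not part of the original script.
    def fetch_pages(page_url, local_dir, timeout=10):
        """Download 1.jpg, 2.jpg, ... from page_url until a request fails."""
        os.makedirs(local_dir, exist_ok=True)
        index = 1
        while True:
            url_path = page_url + str(index) + ".jpg"
            try:
                response = session.get(url_path, timeout=timeout)
            except requests.RequestException:
                break  # network error: give up on this chapter
            if not response.ok:
                break  # e.g. 404: no more images in this chapter
            with open(os.path.join(local_dir, str(index) + ".jpg"), 'wb') as f:
                for chunk in response.iter_content(128):
                    f.write(chunk)
            index += 1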
            
    
    

    There is also a more elegant solution.

    # Method 2: crawl images from a particular site

    """
    really used for fetching image URLs from 不可描述的网站 (redacted)
    """

    import os
    import time

    import requests
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options


    def downloading_images(prefix, url):
        """Download 1.jpg, 2.jpg, ... from the remote directory containing url."""
        filename = url.replace(prefix, '')
        basename = os.path.basename(filename)
        dirname = filename.replace(basename, '')  # local target directory
        dirurl = url.replace(basename, '')        # remote directory URL

        if not os.path.exists(dirname):
            os.makedirs(dirname)

        # Fetch numbered images until the first request fails.
        index = 1
        while True:
            url_path = dirurl + str(index) + ".jpg"
            response = requests.get(url_path)
            if not response.ok:
                break

            filename = dirname + str(index) + ".jpg"
            print(url_path, filename)
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(128):
                    f.write(chunk)

            index += 1


    if __name__ == "__main__":
        prefix = "不可描述的域名"  # placeholder for the redacted domain

        # Each entry: [chapter index page, number of chapters listed on it].
        down_loading_urls = [
            ['不可描述的网站', 49]  # placeholder for the redacted URL
        ]

        for down_loading_url, count in down_loading_urls:
            # Run Chrome headless so the crawl needs no display.
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            browser = webdriver.Chrome(chrome_options=chrome_options)

            try:
                print(down_loading_url)
                browser.get(down_loading_url)
                time.sleep(4)
                for num in range(1, count + 1):
                    # Open the num-th chapter link on the index page.
                    browser.find_element_by_xpath(
                        '/html/body/div[2]/div[3]/div[2]/ul/li[%d]/a' % num).click()
                    time.sleep(4)
                    # The page lazy-loads images; the real URL sits in data-original.
                    values = browser.find_elements_by_xpath(
                        '/html/body/div[2]/article/div/div/img[1]')[0].get_attribute('data-original')
                    downloading_images(prefix, values)
                    browser.back()
                    time.sleep(4)

            except Exception as e:
                print("global", e)
            finally:
                browser.close()
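
    The fixed time.sleep(4) pauses are fragile: too short and the element is not there yet, too long and the crawl drags. A sketch of the same click-and-extract step using Selenium's explicit waits instead (WebDriverWait polls until the condition holds; browser and num are the variables from the script above):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    wait = WebDriverWait(browser, 10)  # poll for up to 10 seconds

    # Wait until the chapter link is clickable, then click it.
    link = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '/html/body/div[2]/div[3]/div[2]/ul/li[%d]/a' % num)))
    link.click()

    # Wait until the lazy-loaded image exists before reading data-original.
    img = wait.until(EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[2]/article/div/div/img[1]')))
    values = img.get_attribute('data-original')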
    
    

    Once the download finishes, it's best to collect all the images into a single folder so they can be copied to a phone for reading.

    # Collect the images into a single folder
    import os
    import shutil

    dirname = "files/80606"
    new_dirname = dirname + "all"
    index = 1

    if not os.path.exists(new_dirname):
        os.makedirs(new_dirname)

    # Chapter directories have numeric names; sort them numerically.
    dirs = sorted(int(value) for value in os.listdir(dirname))

    for dir_ in dirs:
        dir_path = os.path.join(dirname, str(dir_))
        # Sort pages numerically too, so 10.jpg comes after 2.jpg.
        for file in sorted(os.listdir(dir_path), key=lambda n: int(n.split(".")[0])):
            filename = os.path.join(dir_path, file)
            shutil.copy(filename, os.path.join(new_dirname, str(index) + ".jpg"))
            index += 1
    
    
    The variant below does the same for an album whose first chapter folder wraps the real chapter folders one level deeper: it first moves those folders up, then renumbers all pages in reading order.

    import os
    import shutil

    dirname = "files/80648"
    new_dirname = dirname + "all"
    index = 1

    if not os.path.exists(new_dirname):
        os.makedirs(new_dirname)

    dirs = sorted(int(value) for value in os.listdir(dirname))

    # If the first entry is a wrapper directory, move its subdirectories
    # up into dirname and remove the now-empty wrapper.
    t_path = os.path.join(dirname, str(dirs[0]))

    mark = False
    for f in os.listdir(t_path):
        t_t_dir = os.path.join(t_path, f)
        if os.path.isdir(t_t_dir):
            mark = True
            shutil.move(t_t_dir, os.path.join(dirname, f))
    if mark:
        os.rmdir(t_path)

    dirs = sorted(int(value) for value in os.listdir(dirname))

    for dir_ in dirs:
        path_dir = os.path.join(dirname, str(dir_))

        # Sort pages numerically (2.jpg before 10.jpg) before renumbering.
        pages = sorted(int(value.split(".")[0]) for value in os.listdir(path_dir))
        for file in pages:
            filename = os.path.join(path_dir, str(file) + ".jpg")
            new_filename = os.path.join(new_dirname, str(index) + ".jpg")
            print(filename, new_filename)
            shutil.copy(filename, new_filename)
            index += 1
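
    Many phone comic readers also accept a .cbz archive, which is just a zip of the numbered pages and is easier to copy around than thousands of loose files. A minimal sketch packing the flattened folder from above (the .cbz output name is my own choice, not from the original):

    import os
    import zipfile

    dirname = "files/80606all"  # the flattened folder produced above

    # A .cbz is a plain zip archive; store pages in reading order.
    with zipfile.ZipFile(dirname + ".cbz", "w", zipfile.ZIP_STORED) as cbz:
        pages = sorted(os.listdir(dirname), key=lambda n: int(n.split(".")[0]))
        for page in pages:
            cbz.write(os.path.join(dirname, page), arcname=page)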
            
            
    
