Scraper: downloading images from 武当休闲山庄 with requests

Author: Noza_ea8f | Published 2020-01-27 12:33

    Directory structure

    (screenshot: project directory layout)
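
    A sketch of that layout, reconstructed from the imports and paths used in the scripts below:

    project/
    ├── lib/
    │   └── func.py        # the helper functions
    ├── temp/
    │   └── page_urls.py   # generated list of image-page URLs
    ├── pics/              # downloaded images, one folder per theme
    ├── get_link.py        # step 1: collect and save the links
    └── get_pics.py        # step 2: download the images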

    Helper functions

    func.py

    import os, random, re, time, requests
    
    
    def create_dir(foldername):
        '''
        Create a directory: if 'foldername' does not already exist relative to
        the current folder, create it (makedirs also creates missing parents,
        which plain mkdir would choke on for paths like 'pics/<title>')
        :param foldername:
        :return:
        '''
        if not os.path.exists(str(foldername)):
            os.makedirs(str(foldername))
    
    
    def down_page(url):
        # These headers were found online; the Referer defeats the site's hotlink protection. Tested, works fine!
        headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
            'Referer': 'http://www.jvcxp.org/index.php'
        }
        response = requests.get(url=url, headers=headers)
        data = response.content
        return data
    
    
    def del_file(path):
        '''
        Delete a file if it already exists
        :param path: file path
        :return:
        '''
        if os.path.exists(path):
            os.remove(path)
    
    
    def get_imgs_urls(url, cookie_value):
        '''
        Collect the image-page links from one forum listing page
        :param url: listing-page URL
        :param cookie_value: cookie value
        :return: list of image-page URLs
        '''
        # Build the request headers
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': cookie_value,
            'Host': 'www.jvcxp.org',
            'Referer': 'http://www.jvcxp.org/login.php?',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        }
    
        # Open one listing page
        r = requests.get(url, headers=headers)  # send the request
        r.encoding = r.apparent_encoding  # adopt the encoding detected from the response, so there is no guessing and no mojibake
        # print(r.text)  # uncomment to dump the page HTML
        # Pull every thread link off the page with a regular expression
        img_link_re = '<a href="(.*?)" name="readlink" id="a_ajax_'
        img_urls = re.findall(img_link_re, r.text)
        # Prepend the site root to build absolute URLs
        img_urls = ['http://www.jvcxp.org/' + x for x in img_urls]
        # print(img_urls)
        return img_urls
    
    
    def get_imgs(url, cookie_value):
        '''
        Download every image from one image page
        :param url: image-page URL
        :param cookie_value: cookie value
        :return:
        '''
        # Build the request headers
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': cookie_value,
            'Host': 'www.jvcxp.org',
            'Referer': 'http://www.jvcxp.org/login.php?',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        }
    
        # Open one image page
        r = requests.get(url, headers=headers)  # send the request
        r.encoding = r.apparent_encoding  # adopt the encoding detected from the response, so there is no guessing and no mojibake
        # print(r.text)  # uncomment to dump the page HTML
        # Pull the page title out with a regular expression; it becomes the folder name
        dir_name_re = r'<title>(.*?)\|模拍私房 - 武当休闲山庄 - 稳定,和谐,人性化的中文社区</title>'
        matches = re.findall(dir_name_re, r.text)
        dir_name = matches[0] if matches else 'untitled'
        # Create the folder
        try:
            # Try the raw title as the directory name first
            foldername = f'pics/{dir_name}'
            create_dir(foldername=foldername)
        except Exception as e:
            # Creation failed, so the title must contain characters that are
            # illegal in folder names; replace them and retry
            print(e)
            dir_name = replace_not_name_str(dir_name)
            foldername = f'pics/{dir_name}'
            create_dir(foldername=foldername)
    
        # Pull the image URLs out of the page
        img_urls_re = '<img src="(.*?)" border="0" onclick="if'
        img_urls = re.findall(img_urls_re, r.text)
        # print(img_urls)
        # Save the files
        for index, value in enumerate(img_urls):
            try:
                get_img = down_page(value)
            except Exception as e:
                print(e)
                print('Image not downloaded!')
                continue
            with open(f'{foldername}/{index + 1}.jpg', 'wb') as fp:
                fp.write(get_img)
                print(f'Downloading <---{dir_name}--->, image {index + 1}')
            time.sleep(random.randint(1, 5))
        print(f'{url}---{dir_name} done!---------------------------------------')
    
    
    def replace_not_name_str(dir_name):
        '''
        Replace characters that cannot appear in a folder name
        :param dir_name: folder name
        :return: sanitized folder name
        '''
        not_name_str = '/\\:*"<>|?'
        for ch in not_name_str:
            dir_name = dir_name.replace(ch, '-')
    
        return dir_name
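
    These helpers can be smoke-tested on their own. A minimal sketch, assuming it runs from the project root; the image URL here is a placeholder, not a real address from the site:

    # smoke_test.py -- illustrative only; substitute a real image URL from a thread page
    from lib.func import create_dir, down_page

    create_dir('pics')  # backed by makedirs, so nested paths work too
    data = down_page('http://www.jvcxp.org/placeholder.jpg')  # the Referer header is what defeats the hotlink check
    with open('pics/test.jpg', 'wb') as fp:
        fp.write(data)
    print(f'saved {len(data)} bytes')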
    

    Collect the image-page links and save them to a file

    get_link.py

    from lib.func import del_file, get_imgs_urls
    
    # Collect the image-page URLs from every listing page; there are 38 pages in total
    cookie_value = 'PHPSESSID=8d7b065aaca254fe76b3e4076c1bdb4d; ba1b8_cloudClientUid=57571005; ba1b8_threadlog=%2C438%2C; ba1b8_jobpop=0; ba1b8_readlog=%2C3184979%2C3183580%2C3184676%2C; ba1b8_cknum=CQ0OUQAKDVAIBzw%2BVFQEV1QLAAFVV1FcUVQBVQcFVARWBVcCDw4BUwEBAwA; ba1b8_winduser=CQEPVAEwDVNcAANWAwEFV1IBDwkFUVFcAQQOAwUDUwYEUFVRW14%2F; ba1b8_ck_info=%2F%09; ba1b8_lastvisit=0%091580038202%09%2Fthread.php%3Ffid-438-page-1.html; ba1b8_lastpos=F438; ba1b8_ol_offset=10282; ba1b8_ci=thread%091580038202%09%09438'
    page_urls = []
    for i in range(1, 39):
        url = f'http://www.jvcxp.org/thread-htm-fid-438-page-{i}.html'
        img_urls = get_imgs_urls(url, cookie_value)
        print(f'Fetched page {i}')
        page_urls = page_urls + img_urls
    # Delete the previous URL record, if any
    path = 'temp/page_urls.py'
    del_file(path)
    # Write the URLs out as an importable Python list
    with open(path, 'a+', encoding="utf-8") as f:
        f.write('page_urls=[')
        for i in page_urls:
            f.write(f'\'{i}\',' + '\n')
        f.write(']')
    
    print(page_urls)
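
    Note that the temp folder must exist before the file is written (open() does not create directories; create_dir('temp') would cover it). The generated temp/page_urls.py is a plain Python list literal, so the download script can import it directly. Its shape, with placeholder URLs:

    # temp/page_urls.py (generated) -- the URLs here are placeholders
    page_urls=['http://www.jvcxp.org/read-1.html',
    'http://www.jvcxp.org/read-2.html',
    ]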
    

    The idea:

    • the site's images are organized into themes;
    • each theme has its own link;
    • each linked page holds the group of images for that theme.

    Solution 1:

    • first collect the theme links from every listing page of the site;
    • save the links to a file;
    • then fetch all of the images from each linked page, one by one.

    Drawback: re-fetching the images that failed to download is awkward.

    Solution 2:

    • first collect the theme links from every listing page of the site;
    • save the links to a file;
    • then collect the image URLs from each linked page, one by one;
    • download the images from those URLs.

    Advantage: when an image cannot be fetched, its URL gets recorded (a sketch of this follows below); failures are usually request timeouts, so no images are lost this way.
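
    A minimal sketch of that record-and-retry idea, reusing down_page from func.py; the failed_urls.txt log and the retry pass are illustrative additions, not part of the scripts above:

    # retry sketch -- log misses, then replay the log on a later pass
    from lib.func import down_page

    def download_or_log(img_url, dest_path):
        '''Download one image; on failure, append its URL to a log for a later pass.'''
        try:
            data = down_page(img_url)
        except Exception as e:
            print(e)
            with open('failed_urls.txt', 'a', encoding='utf-8') as log:
                log.write(img_url + '\n')  # record the miss instead of losing the image
            return False
        with open(dest_path, 'wb') as fp:
            fp.write(data)
        return True

    # later pass: replay the log (destination names here are illustrative)
    # with open('failed_urls.txt', encoding='utf-8') as log:
    #     for n, line in enumerate(log, 1):
    #         download_or_log(line.strip(), f'pics/retry_{n}.jpg')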

    Download the images

    get_pics.py

    from lib.func import get_imgs
    from temp.page_urls import page_urls
    
    cookie_value = 'PHPSESSID=8d7b065aaca254fe76b3e4076c1bdb4d; ba1b8_cloudClientUid=57571005; ba1b8_threadlog=%2C438%2C; ba1b8_jobpop=0; ba1b8_readlog=%2C3184979%2C3183580%2C3184676%2C; ba1b8_cknum=CQ0OUQAKDVAIBzw%2BVFQEV1QLAAFVV1FcUVQBVQcFVARWBVcCDw4BUwEBAwA; ba1b8_winduser=CQEPVAEwDVNcAANWAwEFV1IBDwkFUVFcAQQOAwUDUwYEUFVRW14%2F; ba1b8_ck_info=%2F%09; ba1b8_lastvisit=0%091580038202%09%2Fthread.php%3Ffid-438-page-1.html; ba1b8_lastpos=F438; ba1b8_ol_offset=10282; ba1b8_ci=thread%091580038202%09%09438'
    # print(page_urls[0])
    # The slice is a resume point: [11:] skips the pages already handled on a previous run
    for url in page_urls[11:]:
        get_imgs(url, cookie_value)
        print(f'Progress: finished item {page_urls.index(url) + 1}!---------')
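
    One caveat: page_urls.index(url) rescans the list on every iteration and returns the first match if a URL appears more than once. A drop-in replacement for that loop using enumerate avoids both issues:

    # same loop, tracking the position directly instead of looking it up
    START = 11  # resume point: number of items already processed
    for n, url in enumerate(page_urls[START:], start=START + 1):
        get_imgs(url, cookie_value)
        print(f'Progress: finished item {n}!---------')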
    

    Result

    (screenshot: the downloaded image folders)

    Not bad at all :-) !
