美文网首页
selenium爬虫下载豆瓣相册

selenium爬虫下载豆瓣相册

作者: 墨色尘埃 | 来源:发表于2020-05-04 22:04 被阅读0次

    想爬豆瓣相册的起因是:想将豆瓣照片转移到其他地方,但是豆瓣的照片下载下来都是webp格式,正常情况下没法打开,所以想到能不能通过selenium爬到数据呢?

    本来想的是从模拟登录开始,然后一步步进到相册里,获取相册图片列表,但是比较麻烦。还有一个更方便的做法:通过F12开发者工具查看请求的url,再带上cookie去模拟请求。

    进到相册首页

    GET请求带cookie,https://www.douban.com/people/62414040/photos

    image.png

    通过对Response的分析

    image.png

    看到相册信息以及相册地址都在下图所示的信息块中
    通过对该信息块进行信息提取,可以得到相册名列表和相册地址url列表。

    image.png

    进到相册详情页

    GET请求带cookie,https://www.douban.com/photos/album/1871536872/
    如果相册里图片数量超过了18张,就会有第二页
    GET请求带cookie,https://www.douban.com/photos/album/1871536872/?m_start=18
    同理,如果有第三页第四页,url是
    https://www.douban.com/photos/album/1871536872/?m_start=36
    https://www.douban.com/photos/album/1871536872/?m_start=54

    image.png

    通过对Response的分析
    看到相册信息以及相册中每张图片url都在下图所示的信息块中

    image.png

    有了图片地址就好办了,使用IO流下载图片

    代码如下

    获取每个相册中图片的地址,返回集合

        /**
         * Collects the image URLs found on one page of a Douban album (HTTP GET with a cookie).
         *
         * <p>For every element with class {@code photo_wrap}, the {@code src} attribute of its
         * {@code <img>} descendants is read, a {@code .webp} file extension is rewritten to
         * {@code .jpg}, and the result is appended to the returned list. Entries without a
         * {@code src} value are skipped.
         *
         * @param passUrl    URL of the album page to fetch
         * @param folderName album name; only used in the log message
         * @return the image URLs in document order; empty if no photo was found or no document
         *         was returned
         * @throws BusinessException if the request helper raises it
         */
        public List<String> getImageUrlList(String passUrl, String folderName) throws BusinessException {
    
            logger.info("获取豆瓣相册" + folderName + "图片地址");
    
            // NOTE(review): a session cookie is hard-coded here; it should come from configuration
            // and must not be committed to source control.
            String passCookie = "gmxq-IDUYXw; douban-fav-remind=1; __yadk_uid=bK748gKAh8REVU6PsYrAF24ZroyVxSYA; " +
                    "__gads=ID=9cf216a578728a98:T=1587890575:S=ALNI_MbpwRrOxhrMiUZPlDmEcg-YeTzuhw; ll=\"118159\"; " +
                    "__utmc=30149280; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.6241; " +
                    "douban-profile-remind=1; ct=y; __utmz=30149280.1588515963.4.3.utmcsr=baidu|utmccn=(organic)" +
                    "|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%9B%B8%E5%86%8C%E4%B8%8B%E8%BD%BD%E6%80%8E%E4%B9%88%E6%98" +
                    "%AFwebp; dbcl2=\"62414040:cC/OMBA010s\"; ck=snAu; gr_user_id=7f5f3e5d-9cf9-4d38-9e3e-8ffebe3b22b1; " +
                    "_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1588520368%2C%22https%3A%2F%2Fwww.baidu" +
                    ".com%2Fbaidu%3Fisource%3Dinfinity%26iname%3Dbaidu%26itype%3Dweb%26tn%3D02003390_42_hao_pg%26ie%3Dutf-8" +
                    "%26wd%3D5kg%25E5%25A4%25A7%25E7%25B1%25B3%25E4%25B8%2580%25E4%25B8%25AA%25E4%25BA%25BA%25E5%2590%2583" +
                    "%25E5%25A4%259A%25E4%25B9%2585%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.835562099.1587890488" +
                    ".1588515963.1588520368.5; __utmt=1; _pk_id.100001.8cb4=d2bb86d9f8460bdf.1587890487.3.1588520483" +
                    ".1588517154.; __utmb=30149280.12.10.1588520368";
    
            List<String> imageUrl = new ArrayList<>();
    
            // Request the page with the cookie attached.
            Document document = RequestUtil.sendGetRequestWithNullReturnDoc(passUrl, passCookie);
            // NOTE(review): the helper's name suggests it may return null on failure - confirm.
            if (document == null) {
                logger.info("No document returned for " + passUrl);
                return imageUrl;
            }
    
            // Each "photo_wrap" element wraps one photo; its <img> carries the image address.
            for (Element photoWrap : document.getElementsByClass("photo_wrap")) {
                String src = photoWrap.getElementsByTag("img").attr("src");
                if (src.isEmpty()) {
                    // No usable address; an empty entry would only fail later during download.
                    continue;
                }
                System.out.println(src);
                imageUrl.add(toJpgUrl(src));
            }
    
            return imageUrl;
        }
    
        /**
         * Rewrites a {@code .webp} file extension to {@code .jpg}, leaving the rest of the URL alone.
         * Only an extension at the end of the URL, or directly before a query string or fragment,
         * is changed, so "webp" appearing elsewhere in the address is not touched.
         */
        private static String toJpgUrl(String src) {
            return src.replaceAll("\\.webp(?=$|[?#])", ".jpg");
        }
    
    

    下载图片方法

    
    
        /**
         * Downloads the resource at {@code urlString} and stores it as
         * {@code <basePath>/<folderName>/<i>.jpg}.
         *
         * <p>The target folder is created when missing. An existing file with the same name is
         * overwritten rather than appended to, so repeating a download cannot corrupt the image.
         * Both streams are closed even when the transfer fails.
         *
         * @param urlString  URL of the image to download
         * @param basePath   directory under which the album folder lives
         * @param folderName name of the album folder inside {@code basePath}
         * @param i          numeric id used as the file name (the original note mentions a
         *                   snowflake id generated via mybatis-plus)
         * @throws Exception if the URL is malformed, the folder cannot be created, or reading or
         *                   writing fails
         */
        public static void download(String urlString, String basePath, String folderName, Long i) throws Exception {
    
            // Build paths with File(parent, child) so the platform's separator is used.
            File folder = new File(basePath, folderName);
            // Re-check isDirectory() after mkdirs() in case the folder appeared concurrently.
            if (!folder.isDirectory() && !folder.mkdirs() && !folder.isDirectory()) {
                throw new java.io.IOException("Cannot create directory: " + folder);
            }
            File target = new File(folder, i + ".jpg");
    
            URLConnection con = new URL(urlString).openConnection();
    
            // try-with-resources closes both streams on every exit path.
            try (InputStream is = con.getInputStream();
                 FileOutputStream os = new FileOutputStream(target)) {
                byte[] bs = new byte[8192];
                int len;
                while ((len = is.read(bs)) != -1) {
                    os.write(bs, 0, len);
                }
            }
            System.out.println(i);
        }
    
        }
    }
    

    获取相册的集合

        /**
         * Builds, for every album listed on the photo overview page, the list of page URLs that
         * together show all photos of that album.
         *
         * <p>Each element with class {@code albumlst} is one album. Its link is read from the
         * {@code album_photo} element and its photo count from the text before "张" in the
         * {@code pl} element. An album page shows 18 photos; further pages are addressed with
         * {@code ?m_start=<offset>}. The outer list always holds one entry per album, in document
         * order, so its indexes line up with the album elements on the page.
         *
         * @param passUrl URL of the album overview page
         * @return one list of page URLs per album; an album with zero photos yields an empty list,
         *         and an album whose count cannot be read yields only its first page
         * @throws BusinessException if the request helper raises it
         */
        public List<List<String>> getAlbumList(String passUrl) throws BusinessException {
            logger.info("图片首页");
            // NOTE(review): a session cookie is hard-coded here; it should come from configuration
            // and must not be committed to source control.
            String passCookie = "gmxq-IDUYXw; douban-fav-remind=1; __yadk_uid=bK748gKAh8REVU6PsYrAF24ZroyVxSYA; " +
                    "__gads=ID=9cf216a578728a98:T=1587890575:S=ALNI_MbpwRrOxhrMiUZPlDmEcg-YeTzuhw; ll=\"118159\"; " +
                    "__utmc=30149280; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.6241; " +
                    "douban-profile-remind=1; ct=y; __utmz=30149280.1588515963.4.3.utmcsr=baidu|utmccn=(organic)" +
                    "|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%9B%B8%E5%86%8C%E4%B8%8B%E8%BD%BD%E6%80%8E%E4%B9%88%E6%98" +
                    "%AFwebp; dbcl2=\"62414040:cC/OMBA010s\"; ck=snAu; gr_user_id=7f5f3e5d-9cf9-4d38-9e3e-8ffebe3b22b1; " +
                    "_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1588520368%2C%22https%3A%2F%2Fwww.baidu" +
                    ".com%2Fbaidu%3Fisource%3Dinfinity%26iname%3Dbaidu%26itype%3Dweb%26tn%3D02003390_42_hao_pg%26ie%3Dutf-8" +
                    "%26wd%3D5kg%25E5%25A4%25A7%25E7%25B1%25B3%25E4%25B8%2580%25E4%25B8%25AA%25E4%25BA%25BA%25E5%2590%2583" +
                    "%25E5%25A4%259A%25E4%25B9%2585%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.835562099.1587890488" +
                    ".1588515963.1588520368.5; __utmt=1; _pk_id.100001.8cb4=d2bb86d9f8460bdf.1587890487.3.1588520483" +
                    ".1588517154.; __utmb=30149280.12.10.1588520368";
    
            // Number of photos shown on one album page.
            final int pageSize = 18;
    
            List<List<String>> albumListAll = new ArrayList<>();
    
            // Request the page with the cookie attached.
            Document document = RequestUtil.sendGetRequestWithNullReturnDoc(passUrl, passCookie);
            // NOTE(review): the helper's name suggests it may return null on failure - confirm.
            if (document == null) {
                logger.info("No document returned for " + passUrl);
                return albumListAll;
            }
    
            for (Element album : document.getElementsByClass("albumlst")) {
                String href = album.getElementsByClass("album_photo").attr("href");
                int totalRecord = parsePhotoCount(album.getElementsByClass("pl").text());
                albumListAll.add(buildPageUrls(href, totalRecord, pageSize));
            }
            return albumListAll;
        }
    
        /**
         * Extracts the number that precedes "张" in the album summary text.
         * Returns -1 when the marker is missing or the text before it is not a number.
         */
        private static int parsePhotoCount(String countText) {
            int marker = countText.indexOf("张");
            if (marker <= 0) {
                return -1;
            }
            try {
                return Integer.parseInt(countText.substring(0, marker).trim());
            } catch (NumberFormatException ignored) {
                // Unreadable count: the caller falls back to the first page only.
                return -1;
            }
        }
    
        /**
         * Lists the page URLs of one album: the bare link for the first page, then
         * {@code ?m_start=<pageSize * n>} for each following page. A negative
         * {@code totalRecord} means "count unknown" and yields the first page only.
         */
        private static List<String> buildPageUrls(String href, int totalRecord, int pageSize) {
            List<String> pages = new ArrayList<>();
            if (totalRecord < 0) {
                pages.add(href);
                return pages;
            }
            // Ceiling division: number of pages needed to show totalRecord photos.
            int totalPageNum = (totalRecord + pageSize - 1) / pageSize;
            for (int page = 0; page < totalPageNum; page++) {
                pages.add(page == 0 ? href : href + "?m_start=" + pageSize * page);
            }
            return pages;
        }
    

    获取相册名的集合

        /**
         * Collects the album names shown on the photo overview page.
         *
         * <p>Each element with class {@code albumlst} is one album; its name is the combined text
         * of the {@code pl2} elements inside it. Names are returned in document order, one per
         * album element.
         *
         * @param passUrl URL of the album overview page
         * @return the album names; empty if no album was found or no document was returned
         * @throws BusinessException if the request helper raises it
         */
        public List<String> getAlbumNameList(String passUrl) throws BusinessException {
            logger.info("图片首页");
            // NOTE(review): a session cookie is hard-coded here; it should come from configuration
            // and must not be committed to source control.
            String passCookie = "gmxq-IDUYXw; douban-fav-remind=1; __yadk_uid=bK748gKAh8REVU6PsYrAF24ZroyVxSYA; " +
                    "__gads=ID=9cf216a578728a98:T=1587890575:S=ALNI_MbpwRrOxhrMiUZPlDmEcg-YeTzuhw; ll=\"118159\"; " +
                    "__utmc=30149280; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.6241; " +
                    "douban-profile-remind=1; ct=y; __utmz=30149280.1588515963.4.3.utmcsr=baidu|utmccn=(organic)" +
                    "|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%9B%B8%E5%86%8C%E4%B8%8B%E8%BD%BD%E6%80%8E%E4%B9%88%E6%98" +
                    "%AFwebp; dbcl2=\"62414040:cC/OMBA010s\"; ck=snAu; gr_user_id=7f5f3e5d-9cf9-4d38-9e3e-8ffebe3b22b1; " +
                    "_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1588520368%2C%22https%3A%2F%2Fwww.baidu" +
                    ".com%2Fbaidu%3Fisource%3Dinfinity%26iname%3Dbaidu%26itype%3Dweb%26tn%3D02003390_42_hao_pg%26ie%3Dutf-8" +
                    "%26wd%3D5kg%25E5%25A4%25A7%25E7%25B1%25B3%25E4%25B8%2580%25E4%25B8%25AA%25E4%25BA%25BA%25E5%2590%2583" +
                    "%25E5%25A4%259A%25E4%25B9%2585%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.835562099.1587890488" +
                    ".1588515963.1588520368.5; __utmt=1; _pk_id.100001.8cb4=d2bb86d9f8460bdf.1587890487.3.1588520483" +
                    ".1588517154.; __utmb=30149280.12.10.1588520368";
    
            List<String> albumNameList = new ArrayList<>();
    
            // Request the page with the cookie attached.
            Document document = RequestUtil.sendGetRequestWithNullReturnDoc(passUrl, passCookie);
            // NOTE(review): the helper's name suggests it may return null on failure - confirm.
            if (document == null) {
                logger.info("No document returned for " + passUrl);
                return albumNameList;
            }
    
            for (Element album : document.getElementsByClass("albumlst")) {
                // The album title is in the "pl2" element(s) of the album block.
                albumNameList.add(album.getElementsByClass("pl2").text());
            }
    
            return albumNameList;
        }
    

    相关文章

      网友评论

          本文标题:selenium爬虫下载豆瓣相册

          本文链接:https://www.haomeiwen.com/subject/hvpyghtx.html