爬虫下载图片

作者: 刘小刀tina | 来源:发表于2020-02-13 18:59 被阅读0次

爬虫下载图片
scrapy总结
Python爬虫入门
python 爬虫下载图片(煎蛋)
python爬虫-图片下载
多线程爬虫
爬虫
《七天爬虫进阶系列》 - 04 爬虫进阶之多线程
Python多线程
【图文详解】python爬虫实战——5分钟做个图片自动下载器

IOUtils类

package com.example.demospringboot.util;

/**
 * @program: demopa
 * @description
 * @author: tina.liu
 * @create: 2020-02-09 11:12
 **/

import java.io.FileOutputStream;
import java.io.InputStream;

/**
 * Stream-to-file helper used by the crawler to persist downloaded content.
 */
public class IOUtils {

    /** Size in bytes of the chunk buffer used while copying. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Copies every remaining byte of {@code inStream} into the file at {@code path},
     * creating the file or truncating an existing one.
     *
     * <p>The copy is best-effort: any failure (unwritable path, read error, null
     * argument) is reported with a stack trace and the method returns normally.
     * The success message is printed only after the file has been closed without error.
     *
     * <p>The output file is always closed before returning. {@code inStream} is NOT
     * closed here; it remains the caller's responsibility.
     *
     * @param path     destination file path, including the file extension
     * @param inStream source of the bytes to write
     */
    public static void download(String path, InputStream inStream) {
        try {
            // try-with-resources guarantees the file handle is released (and data
            // flushed to disk) even when reading or writing fails midway.
            try (FileOutputStream fs = new FileOutputStream(path)) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int byteread;
                while ((byteread = inStream.read(buffer)) != -1) {
                    fs.write(buffer, 0, byteread);
                }
            }
            System.out.println(path + "保存成功！");
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method never throwing.
            e.printStackTrace();
        }
    }
}

ReptileGetList类

package com.example.demospringboot.config;
import com.example.demospringboot.resp.MessageResp;
import com.example.demospringboot.util.IOUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
/**
 * @program: demopa
 * @description Two-level image crawler: reads the album listing page, follows each
 *              album link to collect its pictures, then downloads every thumbnail found.
 * @author: tina.liu
 * @create: 2020-02-09 09:59
 **/
public class ReptileGetList {

    /** Site root prepended to the hrefs found on listing pages. */
    private static final String SITE_ROOT = "https://www.ivsky.com";

    /** Scheme prepended to the img src values found on listing pages. */
    private static final String SCHEME_PREFIX = "https:";

    /** First-level listing page the crawl starts from. */
    private static final String START_URL = "https://www.ivsky.com/tupian/renwutupian/";

    /** CSS class of the div that wraps each thumbnail on a listing page. */
    private static final String THUMB_CLASS = "il_img";

    /** Directory that downloaded images are written into. */
    private static final String SAVE_DIR = "/Users/lvxiaokai/Desktop/tina/img/images/character/";

    /** Delay between second-level page requests, in milliseconds. */
    private static final long CRAWL_DELAY_MILLIS = 800;

    /**
     * Crawls the listing and downloads the cover image of every first-level entry
     * followed by the images of its second-level entries.
     *
     * @param args unused
     * @throws Exception if any single download fails (the run stops at that point)
     */
    public static void main(String[] args) throws Exception {
        List<MessageResp> messageRespList = copy();
        for (MessageResp album : messageRespList) {
            down2(album.getNextSrc(), album.getTitle());
            List<MessageResp> pictures = album.getMessageRespList();
            if (pictures == null) {
                continue;
            }
            for (MessageResp picture : pictures) {
                down2(picture.getNextSrc(), picture.getTitle());
            }
        }
        System.out.println("爬虫程序完成,期待您的下次使用");
    }

    /**
     * Scrapes the start page and, for every entry on it, the page that entry links to.
     *
     * <p>Each returned element describes one first-level entry (id, href, title, image
     * src) and carries the list of second-level entries found on its page; second-level
     * entries are created with a {@code null} child list.
     *
     * <p>Failures are tolerated rather than propagated: if the start page cannot be
     * fetched an empty list is returned; if a second-level page cannot be fetched the
     * first-level entry is kept with an empty child list; thumbnail blocks without an
     * anchor or image are skipped. If the thread is interrupted while pausing between
     * requests, the interrupt flag is restored and the entries collected so far are
     * returned.
     *
     * @return the first-level entries found, never {@code null}
     */
    public static List<MessageResp> copy() {
        List<MessageResp> messageRespList = new ArrayList<MessageResp>();

        Document listingPage = fetch(START_URL);
        if (listingPage == null) {
            return messageRespList;
        }

        for (Element albumDiv : listingPage.getElementsByClass(THUMB_CLASS)) {
            Entry album = readEntry(albumDiv);
            if (album == null) {
                continue;
            }

            List<MessageResp> pictures = new ArrayList<MessageResp>();
            Document albumPage = fetch(album.href);
            if (albumPage != null) {
                for (Element pictureDiv : albumPage.getElementsByClass(THUMB_CLASS)) {
                    Entry picture = readEntry(pictureDiv);
                    if (picture != null) {
                        pictures.add(new MessageResp(newId(), picture.href, picture.title, picture.src, null));
                    }
                }
            }
            messageRespList.add(new MessageResp(newId(), album.href, album.title, album.src, pictures));

            // Throttle requests to the site; stop crawling if we were asked to cancel.
            if (!pause()) {
                break;
            }
        }
        return messageRespList;
    }

    /**
     * Downloads the resource at {@code url2} into the save directory as
     * {@code <title>.jpg} and returns the path of the written file.
     *
     * <p>The title comes from scraped HTML, so characters that are path separators or
     * otherwise illegal in file names are replaced with {@code _}; a null or blank title
     * falls back to a random UUID-based name. Entries sharing a title overwrite each other.
     *
     * @param url2  absolute URL of the image to download
     * @param title title used as the file name (without extension)
     * @return the local path the image was saved to
     * @throws Exception if the URL is malformed, the connection cannot be opened, or the
     *                   save directory cannot be created
     */
    public static String down2(String url2, String title) throws Exception {
        File saveDir = new File(SAVE_DIR);
        // mkdirs() returns false both on failure and when the directory already exists,
        // so only treat it as an error if the directory is still missing afterwards.
        if (!saveDir.mkdirs() && !saveDir.isDirectory()) {
            throw new IOException("Cannot create download directory: " + SAVE_DIR);
        }

        String nextSrc = SAVE_DIR + safeFileName(title) + ".jpg";

        URLConnection connection = new URL(url2).openConnection();
        // Close the network stream once the copy is done so connections are not leaked.
        try (InputStream inputStream = connection.getInputStream()) {
            IOUtils.download(nextSrc, inputStream);
        }
        return nextSrc;
    }

    /** Fetches and parses the page at {@code url}; returns {@code null} (after reporting) on I/O failure. */
    private static Document fetch(String url) {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
            return null;
        }
    }

    /** Extracts href/title/src from one thumbnail div, or returns {@code null} if it has no anchor or image. */
    private static Entry readEntry(Element thumbDiv) {
        Elements anchors = thumbDiv.select("a");
        if (anchors.isEmpty()) {
            return null;
        }
        Element anchor = anchors.get(0);
        Elements images = anchor.select("img");
        if (images.isEmpty()) {
            return null;
        }
        return new Entry(
                SITE_ROOT + anchor.attr("href"),
                anchor.attr("title"),
                SCHEME_PREFIX + images.get(0).attr("src"));
    }

    /** Builds an entry id: the first 16 characters of a random UUID string with the dashes removed. */
    private static String newId() {
        return UUID.randomUUID().toString().substring(0, 16).replaceAll("-", "");
    }

    /** Sleeps for the crawl delay; returns {@code false} (with the interrupt flag restored) if interrupted. */
    private static boolean pause() {
        try {
            Thread.sleep(CRAWL_DELAY_MILLIS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /** Turns a scraped title into a usable file name, falling back to a random name when nothing is left. */
    private static String safeFileName(String title) {
        String cleaned = title == null
                ? ""
                : title.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        if (cleaned.isEmpty()) {
            return UUID.randomUUID().toString().replaceAll("-", "");
        }
        return cleaned;
    }

    /** Link target, title and image source scraped from one thumbnail block. */
    private static final class Entry {
        final String href;
        final String title;
        final String src;

        Entry(String href, String title, String src) {
            this.href = href;
            this.title = title;
            this.src = src;
        }
    }
}