美文网首页
java实现简单爬虫,爬取网站图片

java实现简单爬虫,爬取网站图片

作者: willcoder | 来源:发表于2019-06-10 00:01 被阅读0次
package com.example.demo.SuperSpider;

import org.apache.http.HttpEntity;

import org.apache.http.client.config.RequestConfig;

import org.apache.http.client.methods.CloseableHttpResponse;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.impl.client.HttpClients;

import org.apache.http.util.EntityUtils;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.net.HttpURLConnection;

import java.net.URL;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.HashSet;

import java.util.List;

/**

* Description:    爬虫 爬图片

* Author:         WangZW

* CreateDate:     2019/1/23

*/

/**
 * A small multi-threaded crawler that downloads the images of a web site.
 *
 * <p>Starting from {@link #topUrl}, worker threads take URLs from {@link #waitList},
 * fetch the page, download every {@code <img>} whose {@code src} contains
 * {@link #filter} into {@link #filePath}, and queue every {@code <a href>} found on the
 * page one level deeper. Pages at depth {@link #maxSpiderLevel} or deeper are not fetched.
 *
 * <p>Thread-safety: every read and write of the shared collections performed by this
 * class happens while holding the {@link #lifeObj} monitor. The same monitor is used to
 * park idle workers and to wake them when new URLs are queued.
 */
public class SuperSpider {

    /** Depth at which each queued URL was first discovered (the seed URL is depth 0). */
    public static HashMap<String,Integer> websiteMap = new HashMap<>();

    /** Image URLs (scheme stripped) that were downloaded successfully. */
    public static HashSet<String> imgUrlSet = new HashSet<>();

    /** Image URLs (scheme stripped) whose download failed. */
    public static HashSet<String> errorImgUrlSet = new HashSet<>();

    /** Page URLs that have already been claimed by a worker. */
    public static List<String> overList = new ArrayList<>();

    /** Page URLs waiting to be crawled, in discovery order. */
    public static List<String> waitList = new ArrayList<>();

    /** Page URLs whose fetch raised an exception. */
    public static List<String> exceptionList = new ArrayList<>();

    /** Retained for compatibility; depth is tracked per URL in {@link #websiteMap}. */
    public static int currentLevel = 0;

    /** Pages at this depth or deeper are skipped. */
    public static int maxSpiderLevel = 2;

    /** Number of worker threads started by {@link #main(String[])}. */
    public static int maxThreadNum = 2;

    /** Seed URL of the crawl. */
    public static String topUrl = "http://www.baidu.com/";

    /** Only images whose {@code src} contains this text are downloaded. */
    public static String filter = "www.baidu";

    /** Directory the images are written to. */
    public static String filePath = "/Users/will/Downloads/pic/";

    /** Monitor guarding all shared crawl state; idle workers wait on it. */
    public static final Object lifeObj = new Object();

    /** Connect/read timeout applied to page fetches and image downloads. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Seeds the queue with {@link #topUrl} and starts {@link #maxThreadNum} workers.
     *
     * @param args unused
     */
    public static void main(String[] args) throws Exception {
        synchronized (lifeObj) {
            waitList.add(topUrl);
            // Register the seed at depth 0 so workers can look its level up.
            websiteMap.put(topUrl, 0);
        }
        for (int i = 0; i < maxThreadNum; i++) {
            new SuperSpider().new SpiderThread("Thread_"+i).start();
        }
    }

    /**
     * Crawls one page: downloads its matching images and queues its links.
     *
     * <p>Does nothing when the URL was already claimed or {@code level} has reached
     * {@link #maxSpiderLevel}. A fetch failure is printed, recorded in
     * {@link #exceptionList}, and the page is still considered done.
     *
     * @param url   page to fetch
     * @param level depth of {@code url}; links found on it are queued at {@code level + 1}
     */
    public static void runSpider(String url,int level)  {
        synchronized (lifeObj) {
            if (overList.contains(url) || level >= maxSpiderLevel) {
                return;
            }
            // Claim the URL up front so two workers never crawl the same page.
            overList.add(url);
        }

        String content;
        try {
            content = getHTML(url);
        } catch (Exception e) {
            System.out.println(url);
            e.printStackTrace();
            synchronized (lifeObj) {
                exceptionList.add(url);
            }
            return;
        }

        for (String imgUrl : getImgSrc(content)) {
            downImages(filePath, imgUrl);
        }

        List<String> aUrlList = getAHref(content);
        synchronized (lifeObj) {
            for (String waitUrl : aUrlList) {
                // Queue each URL once and keep the depth of its first discovery.
                if (!websiteMap.containsKey(waitUrl)) {
                    websiteMap.put(waitUrl, level + 1);
                    waitList.add(waitUrl);
                }
            }
            // Wake workers parked on an empty queue.
            lifeObj.notifyAll();
        }
    }

    /**
     * Fetches a page with a browser-like User-Agent and 10 second timeouts.
     *
     * @param url page address
     * @return the response body decoded as UTF-8, or an empty string when the response
     *         carries no entity
     * @throws IOException if the request fails
     */
    private static String getHTML(String url) throws IOException {
        HttpGet httpget = new HttpGet(url);
        // Pretend to be a regular browser.
        httpget.setHeader("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
        RequestConfig config = RequestConfig.custom()
            .setConnectTimeout(TIMEOUT_MILLIS)
            .setSocketTimeout(TIMEOUT_MILLIS)
            .build();
        httpget.setConfig(config);

        // try-with-resources closes the response and the client even when reading fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget)) {
            HttpEntity entity = response.getEntity();
            String content = entity == null ? "" : EntityUtils.toString(entity, "utf-8");
            System.out.println("Status:"+response.getStatusLine().getStatusCode());
            // Both the entity and its Content-Type header may be absent.
            if (entity != null && entity.getContentType() != null) {
                System.out.println("Content-Type:"+entity.getContentType().getValue());
            }
            return content;
        }
    }

    /**
     * Collects the image URLs of a page that should be downloaded.
     *
     * <p>A {@code src} qualifies when it contains {@link #filter}; everything before the
     * filter text (typically the scheme) is stripped. URLs already in {@link #imgUrlSet}
     * or already collected from this page are skipped.
     *
     * @param content page HTML
     * @return scheme-less image URLs, in document order
     */
    private static List<String> getImgSrc(String content){
        Document doc = Jsoup.parse(content);
        // All <img> elements of the page.
        Elements elements = doc.getElementsByTag("img");
        List<String> imgSrcList = new ArrayList<>();
        for (Element element : elements) {
            String src = element.attr("src");
            System.out.println("<img>:" + src);
            int i = src.indexOf(filter);
            if (i == -1) {
                continue;
            }
            if (i > 0) {
                src = src.substring(i);
            }
            // Compare the stripped form: that is the form downImages stores in imgUrlSet.
            boolean alreadyDownloaded;
            synchronized (lifeObj) {
                alreadyDownloaded = imgUrlSet.contains(src);
            }
            if (alreadyDownloaded || imgSrcList.contains(src)) {
                continue;
            }
            imgSrcList.add(src);
        }
        return imgSrcList;
    }

    /**
     * Collects the hyperlinks of a page.
     *
     * @param content page HTML
     * @return the non-empty {@code href} values of all {@code <a>} elements, in document order
     */
    private static List<String> getAHref(String content){
        Document doc = Jsoup.parse(content);
        // All <a> elements of the page.
        Elements elements = doc.getElementsByTag("a");
        List<String> hrefList = new ArrayList<>();
        for (Element element : elements) {
            String href = element.attr("href");
            System.out.println("<a>:" + href);
            // Anchors without an href yield "", which is not a fetchable URL.
            if (!href.isEmpty()) {
                hrefList.add(href);
            }
        }
        return hrefList;
    }

    /**
     * Downloads one image into {@code filePath}, creating the directory if needed.
     *
     * <p>The file is named after the last path segment of {@code imageUrl}. Success is
     * recorded in {@link #imgUrlSet}; any failure is printed and recorded in
     * {@link #errorImgUrlSet} instead of being thrown.
     *
     * @param filePath target directory
     * @param imageUrl image address without scheme; {@code http://} is prepended
     */
    public static void downImages(String filePath,String imageUrl){
        // lastIndexOf returns -1 when there is no slash, which makes this the whole string.
        String fileName = imageUrl.substring(imageUrl.lastIndexOf('/') + 1);

        File dir = new File(filePath);
        if (!dir.exists()) {
            dir.mkdirs();
        }

        HttpURLConnection connection = null;
        try {
            if (fileName.isEmpty()) {
                throw new IOException("no file name in URL");
            }
            URL url = new URL("http://"+imageUrl);
            connection = (HttpURLConnection) url.openConnection();
            // Without timeouts a stalled server would block the worker forever.
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            try (InputStream is = connection.getInputStream();
                 FileOutputStream out = new FileOutputStream(new File(dir, fileName))) {
                byte[] buffer = new byte[8192];
                int read;
                while ((read = is.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
            }
            synchronized (lifeObj) {
                imgUrlSet.add(imageUrl);
            }
        } catch (Exception e) {
            synchronized (lifeObj) {
                errorImgUrlSet.add(imageUrl);
            }
            System.out.println(e.getMessage()+"下载失败+_"+imageUrl);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Worker that repeatedly takes the next URL from {@link #waitList} and crawls it.
     *
     * <p>The worker parks on {@link #lifeObj} while the queue is empty and keeps running
     * until it is interrupted.
     */
    public class SpiderThread extends Thread{

        /**
         * @param name thread name
         */
        public SpiderThread(String name){
            super(name);
        }

        @Override
        public void run() {
            while (true) {
                String nextUrl;
                int level;
                synchronized (lifeObj) {
                    // Check the queue and take from it under one lock; loop to cope with
                    // spurious wake-ups and with another worker taking the URL first.
                    while (waitList.isEmpty()) {
                        System.out.println("当前线程准备就绪,等待连接爬取:" + this.getName());
                        try {
                            lifeObj.wait();
                        } catch (InterruptedException e) {
                            // Restore the flag and stop: interruption is the shutdown signal.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                    nextUrl = waitList.remove(0);
                    // URLs queued without a recorded depth are treated as depth 0.
                    Integer recordedLevel = websiteMap.get(nextUrl);
                    level = recordedLevel == null ? 0 : recordedLevel;
                }
                runSpider(nextUrl, level);
            }
        }
    }
}




maven依赖

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <parent>

        <groupId>org.springframework.boot</groupId>

        <artifactId>spring-boot-starter-parent</artifactId>

        <version>2.1.2.RELEASE</version>

        <relativePath/> <!-- lookup parent from repository -->

    </parent>

    <groupId>com.example</groupId>

    <artifactId>demo</artifactId>

    <version>0.0.1-SNAPSHOT</version>

    <name>demo</name>

    <description>Demo project for Spring Boot</description>

    <properties>

        <java.version>1.8</java.version>

    </properties>

    <dependencies>

        <dependency>

            <groupId>org.springframework.boot</groupId>

            <artifactId>spring-boot-starter-web</artifactId>

        </dependency>

        <dependency>

            <groupId>org.springframework.boot</groupId>

            <artifactId>spring-boot-starter-test</artifactId>

            <scope>test</scope>

        </dependency>

        <dependency>

            <!-- jsoup HTML parser library @ https://jsoup.org/ -->

            <groupId>org.jsoup</groupId>

            <artifactId>jsoup</artifactId>

            <version>1.11.3</version>

        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->

        <dependency>

            <groupId>org.apache.httpcomponents</groupId>

            <artifactId>httpcore</artifactId>

            <version>4.4.10</version>

        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->

        <dependency>

            <groupId>org.apache.httpcomponents</groupId>

            <artifactId>httpclient</artifactId>

            <version>4.5.6</version>

        </dependency>

    </dependencies>

    <build>

        <plugins>

            <plugin>

                <groupId>org.springframework.boot</groupId>

                <artifactId>spring-boot-maven-plugin</artifactId>

            </plugin>

        </plugins>

    </build>

</project>

相关文章

网友评论

      本文标题:java实现简单爬虫,爬取网站图片

      本文链接:https://www.haomeiwen.com/subject/jviytctx.html