java实现简单爬虫，爬取网站图片

作者: willcoder | 来源:发表于2019-06-10 00:01 被阅读0次

java实现简单爬虫，爬取网站图片
使用Java写一个简单爬虫爬取单页面
python-爬虫学习（文字、图片、视频）
python爬虫学习（文字、图片、视频）
最简单的万能爬虫器
用Java实现网络爬虫三之开始爬取
java实现爬虫爬网站图片
【Python爬虫】爬一个专门看小姐姐的网站，写一段紧张刺激的代码
爬虫很难？最适合新人上手的3个Python项目,即学即用！
3 个适合新人上手的Python项目

package com.example.demo.SuperSpider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Simple multi-threaded crawler that downloads images from web pages.
 *
 * <p>Starting from {@link #topUrl}, worker threads take page URLs from {@link #waitList}, fetch
 * each page, download every {@code <img>} whose {@code src} contains {@link #filter}, and queue
 * the page's links one level deeper. Pages whose level is {@link #maxSpiderLevel} or more are
 * not crawled.
 *
 * <p>The shared static collections are plain, non-concurrent collections. Every access made by
 * this class happens inside {@code synchronized (lifeObj)}; external code touching them while
 * the crawler runs must take the same lock.
 *
 * <p>Created: 2019/1/23
 *
 * @author WangZW
 */
public class SuperSpider {

    /** Crawl level recorded for every page URL that has been queued (the start page is level 0). */
    public static HashMap<String,Integer> websiteMap = new HashMap<>();

    /** Image URLs that were downloaded successfully. */
    public static HashSet<String> imgUrlSet = new HashSet<>();

    /** Image URLs whose download failed. */
    public static HashSet<String> errorImgUrlSet = new HashSet<>();

    /** Page URLs that have already been claimed for crawling. */
    public static List<String> overList = new ArrayList<>();

    /** Page URLs waiting to be crawled, processed first-in first-out. */
    public static List<String> waitList = new ArrayList<>();

    /** Page URLs whose fetch or processing threw an exception. */
    public static List<String> exceptionList = new ArrayList<>();

    /** Level of the page most recently taken from {@link #waitList} by any worker. */
    public static int currentLevel = 0;

    /** Pages whose level is greater than or equal to this value are not crawled. */
    public static int maxSpiderLevel = 2;

    /** Number of worker threads started by {@link #main(String[])}. */
    public static int maxThreadNum = 2;

    /** Page the crawl starts from. */
    public static String topUrl = "http://www.baidu.com/";

    /** Substring an image {@code src} must contain to be downloaded. */
    public static String filter = "www.baidu";

    /** Directory the downloaded images are written to. */
    public static String filePath = "/Users/will/Downloads/pic/";

    /** Monitor used for worker wait/notify and as the lock guarding all shared state above. */
    public static final Object lifeObj = new Object();

    /** Connect and read timeout, in milliseconds, for page fetches and image downloads. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Number of workers currently processing a page; guarded by {@link #lifeObj}. */
    private static int activeWorkers = 0;

    /**
     * Queues {@link #topUrl} at level 0 and starts {@link #maxThreadNum} worker threads.
     *
     * @param args unused
     * @throws Exception declared for backward compatibility
     */
    public static void main(String[] args) throws Exception {
        synchronized (lifeObj) {
            waitList.add(topUrl);
            // Without this entry the first worker would unbox a null level and die with an NPE.
            websiteMap.put(topUrl, 0);
        }
        for (int i = 0; i < maxThreadNum; i++) {
            new SuperSpider().new SpiderThread("Thread_"+i).start();
        }
    }

    /**
     * Crawls one page: downloads its matching images and queues its links at {@code level + 1}.
     *
     * <p>Does nothing when the page was already crawled or {@code level} has reached
     * {@link #maxSpiderLevel}. A page whose fetch fails is recorded in {@link #exceptionList}.
     *
     * @param url   absolute URL of the page to crawl
     * @param level crawl depth of {@code url}; the start page is level 0
     */
    public static void runSpider(String url,int level)  {
        if (url == null || level >= maxSpiderLevel) {
            return;
        }
        synchronized (lifeObj) {
            if (overList.contains(url)) {
                return;
            }
            // Claim the page before fetching so two workers never crawl it concurrently.
            overList.add(url);
        }

        String content;
        try {
            content = getHTML(url);
        } catch (Exception e) {
            System.out.println(url);
            e.printStackTrace();
            synchronized (lifeObj) {
                exceptionList.add(url);
            }
            return;
        }

        for (String imgUrl : getImgSrc(content)) {
            downImages(filePath, imgUrl);
        }

        int nextLevel = level + 1;
        if (nextLevel >= maxSpiderLevel) {
            // Links found here would be rejected by the depth check above; do not queue them.
            return;
        }
        List<String> aUrlList = getAHref(content, url);
        synchronized (lifeObj) {
            for (String waitUrl : aUrlList) {
                if (websiteMap.containsKey(waitUrl) || overList.contains(waitUrl)) {
                    continue; // already queued or crawled
                }
                websiteMap.put(waitUrl, nextLevel);
                waitList.add(waitUrl);
            }
            // Wake idle workers so they can pick up the newly queued pages.
            lifeObj.notifyAll();
        }
    }

    /**
     * Fetches the body of a page with an HTTP GET.
     *
     * @param url URL of the page
     * @return the response body, decoded as UTF-8 when the response declares no charset;
     *         an empty string when the response has no body
     * @throws IOException if the request fails
     */
    private static String getHTML(String url) throws IOException {
        HttpGet httpget = new HttpGet(url);
        // Present a browser User-Agent.
        httpget.setHeader("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
        // A proxy could be configured on this builder via setProxy(...) if required.
        RequestConfig config = RequestConfig.custom()
            .setConnectTimeout(TIMEOUT_MILLIS)
            .setSocketTimeout(TIMEOUT_MILLIS)
            .build();
        httpget.setConfig(config);

        // try-with-resources releases the client and the response even when the request throws.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget)) {
            HttpEntity entity = response.getEntity();
            String content = entity == null ? null : EntityUtils.toString(entity, "utf-8");
            System.out.println("Status:"+response.getStatusLine().getStatusCode());
            if (entity != null && entity.getContentType() != null) {
                System.out.println("Content-Type:"+entity.getContentType().getValue());
            }
            return content == null ? "" : content;
        }
    }

    /**
     * Collects the image addresses of a page that contain {@link #filter}.
     *
     * <p>Each returned address starts at the first occurrence of {@link #filter}, i.e. without
     * scheme, which is the form {@link #downImages(String, String)} records in
     * {@link #imgUrlSet}. Addresses already downloaded or repeated on the page are skipped.
     *
     * @param content HTML of the page
     * @return the image addresses to download
     */
    private static List<String> getImgSrc(String content){
        Document doc = Jsoup.parse(content);
        Elements elements = doc.getElementsByTag("img");
        List<String> imgSrcList = new ArrayList<>();
        for (Element element : elements) {
            String src = element.attr("src");
            System.out.println("<img>：" + src);
            // TODO: a plain substring match is a weak validity check; consider real URL parsing.
            int i = src.indexOf(filter);
            if (i == -1) {
                continue;
            }
            if (i > 0) {
                src = src.substring(i);
            }
            // Compare the stripped form: that is what imgUrlSet stores.
            boolean alreadyDownloaded;
            synchronized (lifeObj) {
                alreadyDownloaded = imgUrlSet.contains(src);
            }
            if (alreadyDownloaded || imgSrcList.contains(src)) {
                continue;
            }
            imgSrcList.add(src);
        }
        return imgSrcList;
    }

    /**
     * Collects the hyperlinks of a page as absolute http(s) URLs.
     *
     * <p>Relative links are resolved against {@code baseUrl}; fragments are removed; links that
     * cannot be resolved or use another scheme (for example {@code javascript:}) are dropped.
     *
     * @param content HTML of the page
     * @param baseUrl URL the page was fetched from
     * @return the absolute link targets, in document order
     */
    private static List<String> getAHref(String content, String baseUrl){
        Document doc = Jsoup.parse(content, baseUrl);
        Elements elements = doc.getElementsByTag("a");
        List<String> hrefList = new ArrayList<>();
        for (Element element : elements) {
            System.out.println("<a>：" + element.attr("href"));
            // absUrl returns "" when the link cannot be turned into an absolute URL.
            String href = element.absUrl("href");
            int fragment = href.indexOf('#');
            if (fragment >= 0) {
                href = href.substring(0, fragment);
            }
            if (href.startsWith("http://") || href.startsWith("https://")) {
                hrefList.add(href);
            }
        }
        return hrefList;
    }

    /**
     * Downloads one image into {@code filePath}, creating the directory when necessary.
     *
     * <p>On success {@code imageUrl} is added to {@link #imgUrlSet}; on failure it is added to
     * {@link #errorImgUrlSet} and the error is printed. This method never throws.
     *
     * @param filePath target directory
     * @param imageUrl image address, with or without scheme; {@code http://} is assumed when the
     *                 scheme is missing
     */
    public static void downImages(String filePath,String imageUrl){
        if (imageUrl == null || imageUrl.isEmpty()) {
            return;
        }
        try {
            File dir = new File(filePath);
            // Idempotent, so concurrent workers creating the directory do not fail each other.
            Files.createDirectories(dir.toPath());
            File target = new File(dir, fileNameOf(imageUrl));

            HttpURLConnection connection =
                (HttpURLConnection) new URL(withScheme(imageUrl)).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            try {
                int status = connection.getResponseCode();
                if (status != HttpURLConnection.HTTP_OK) {
                    throw new IOException("HTTP " + status);
                }
                try (InputStream is = connection.getInputStream()) {
                    Files.copy(is, target.toPath(), StandardCopyOption.REPLACE_EXISTING);
                }
            } finally {
                connection.disconnect();
            }
            synchronized (lifeObj) {
                imgUrlSet.add(imageUrl);
            }
        } catch (IOException | RuntimeException e) {
            synchronized (lifeObj) {
                errorImgUrlSet.add(imageUrl);
            }
            System.out.println(e.getMessage()+"下载失败+_"+imageUrl);
        }
    }

    /** Returns {@code imageUrl} unchanged when it has an http(s) scheme, else prefixes http. */
    private static String withScheme(String imageUrl) {
        if (imageUrl.startsWith("http://") || imageUrl.startsWith("https://")) {
            return imageUrl;
        }
        if (imageUrl.startsWith("//")) {
            return "http:" + imageUrl;
        }
        return "http://" + imageUrl;
    }

    /** Derives a local file name from the last path segment of {@code imageUrl}. */
    private static String fileNameOf(String imageUrl) {
        String path = imageUrl;
        int query = path.indexOf('?');
        if (query >= 0) {
            path = path.substring(0, query);
        }
        int fragment = path.indexOf('#');
        if (fragment >= 0) {
            path = path.substring(0, fragment);
        }
        // lastIndexOf yields -1 when there is no '/', which makes this take the whole string.
        String name = path.substring(path.lastIndexOf('/') + 1).replace('\\', '_');
        if (name.isEmpty() || name.equals(".") || name.equals("..")) {
            name = "image_" + Integer.toHexString(imageUrl.hashCode());
        }
        return name;
    }

    /**
     * Worker thread that repeatedly takes a page from {@link #waitList} and crawls it.
     *
     * <p>A worker waits while the queue is empty but another worker is still crawling (and may
     * therefore queue more pages). It terminates once the queue is empty and no worker is
     * active, or when it is interrupted.
     */
    public class SpiderThread extends Thread{

        /**
         * @param name thread name
         */
        public SpiderThread(String name){
            super(name);
        }

        @Override
        public void run() {
            while (true) {
                String nextUrl;
                int level;
                synchronized (lifeObj) {
                    while (waitList.isEmpty()) {
                        if (activeWorkers == 0) {
                            // Nothing queued and nobody can queue more: the crawl is finished.
                            lifeObj.notifyAll();
                            return;
                        }
                        System.out.println("当前线程准备就绪，等待连接爬取：" + this.getName());
                        try {
                            lifeObj.wait();
                        } catch (InterruptedException e) {
                            // Restore the flag and stop this worker.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                    nextUrl = waitList.remove(0);
                    // URLs queued without a recorded level are treated like the start page.
                    level = websiteMap.getOrDefault(nextUrl, 0);
                    currentLevel = level;
                    activeWorkers++;
                }
                try {
                    runSpider(nextUrl, level);
                } catch (RuntimeException e) {
                    // One malformed page must not kill the worker.
                    System.out.println(nextUrl);
                    e.printStackTrace();
                    synchronized (lifeObj) {
                        exceptionList.add(nextUrl);
                    }
                } finally {
                    synchronized (lifeObj) {
                        activeWorkers--;
                        // Lets waiting workers re-check the queue and the termination condition.
                        lifeObj.notifyAll();
                    }
                }
            }
        }
    }
}

maven依赖

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <parent>

        <groupId>org.springframework.boot</groupId>

        <artifactId>spring-boot-starter-parent</artifactId>

        <version>2.1.2.RELEASE</version>

        <relativePath/> <!-- lookup parent from repository -->

    </parent>

    <groupId>com.example</groupId>

    <artifactId>demo</artifactId>

    <version>0.0.1-SNAPSHOT</version>

    <name>demo</name>

    <description>Demo project for Spring Boot</description>

    <properties>

        <java.version>1.8</java.version>

    </properties>

    <dependencies>

        <dependency>

            <groupId>org.springframework.boot</groupId>

            <artifactId>spring-boot-starter-web</artifactId>

        </dependency>

        <dependency>

            <groupId>org.springframework.boot</groupId>

            <artifactId>spring-boot-starter-test</artifactId>

            <scope>test</scope>

        </dependency>

        <dependency>

            <!-- jsoup HTML parser library @ https://jsoup.org/ -->

            <groupId>org.jsoup</groupId>

            <artifactId>jsoup</artifactId>

            <version>1.11.3</version>

        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->

        <dependency>

            <groupId>org.apache.httpcomponents</groupId>

            <artifactId>httpcore</artifactId>

            <version>4.4.10</version>

        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->

        <dependency>

            <groupId>org.apache.httpcomponents</groupId>

            <artifactId>httpclient</artifactId>

            <version>4.5.6</version>

        </dependency>

    </dependencies>

    <build>

        <plugins>

            <plugin>

                <groupId>org.springframework.boot</groupId>

                <artifactId>spring-boot-maven-plugin</artifactId>

            </plugin>

        </plugins>

    </build>

</project>

java实现简单爬虫，爬取网站图片
maven依赖
使用Java写一个简单爬虫爬取单页面
使用Java爬虫爬取人民日报公众号页面图片。使用Java框架Jsoup和HttpClient实现，先看代码。爬取目标...
python-爬虫学习（文字、图片、视频）
爬虫-文字爬取、爬虫-图片爬取、爬虫-视频爬取
python爬虫学习（文字、图片、视频）
爬虫-文字爬取、爬虫-图片爬取、爬虫-视频爬取
最简单的万能爬虫器
最简单的万能爬虫器。项目介绍：爬取指定网站url所有图片以及描述信息；爬取指定网站中任何标签下的任意属性。效果展示...
用Java实现网络爬虫三之开始爬取
title: 用Java实现网络爬虫三之开始爬取；tags: Java 网络爬虫 Spider Crawler；cat...
java实现爬虫爬网站图片
第一步，实现 LinkQueue，对url进行过滤和存储的操作；第二步，收集每一个url下的链接进行过滤产生新的链...
【Python爬虫】爬一个专门看小姐姐的网站，写一段紧张刺激的代码
前言：今天我们通过Python爬取小姐姐图片网站上的美图，零基础学会通用爬虫，当然我们还可以实现多线程爬虫，加快爬...
爬虫很难？最适合新人上手的3个Python项目,即学即用！
今天给大家分享三个极实用的Python爬虫案例。 1、爬取网站美图爬取图片是最常见的爬虫入门项目，不复杂却能很好...
3 个适合新人上手的Python项目
今天给大家分享三个极实用的Python爬虫案例。 1、爬取网站美图爬取图片是最常见的爬虫入门项目，不复杂却能很好...