Crawling Images with WebMagic

Author: LinJF | Published 2019-08-12 16:52

Dependencies

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

Sample code

myImageProcess.java

import com.bootdo.project.controller.pachong.DownLoadUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import javax.management.JMException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Created by ljf on 2019/8/9.
 */
public class myImageProcess implements PageProcessor {
    private Logger logger = Logger.getLogger(this.getClass());

    // Regular expression for the list-page URLs:
    // "." matches any single character, so the dots in the domain are escaped as "\\."
    private static String REGEX_PAGE_URL = "http://www\\.win4000\\.com/zt/gaoqing_[0-9]+.html";
    // Number of list pages to crawl
    public static int PAGE_SIZE = 6;
    // Counter for downloaded images
    public static int INDEX_PHOTO = 1;

    public void process(Page page) {

        // Build the remaining list-page URLs (page 1 is the seed URL) and queue them
        List<String> listPageUrls = new ArrayList<String>();
        for (int i = 2; i < PAGE_SIZE; i++) {
            listPageUrls.add("http://www.win4000.com/zt/gaoqing_" + i + ".html");
        }
        page.addTargetRequests(listPageUrls);


        // List page: its URL matches REGEX_PAGE_URL
        System.out.println(page.getUrl());
        if (page.getUrl().regex(REGEX_PAGE_URL).match()) {
            // Collect the links to every detail page on this list page
            List<String> detailURL = page.getHtml().xpath("//ul[@class='clearfix']/li/a").links().all();
            int x = 1;
            for (String str : detailURL) { // print each detail-page link
                System.out.println(x + "----" + str);
                x++;
            }
            page.addTargetRequests(detailURL);
        } else { // Detail page
            String detailUrl = page.getUrl().toString();
            System.out.println(detailUrl);
            // URL of the actual image
            String picURL = page.getHtml().xpath("//div[@class='pic-meinv']/a").css("img", "src").toString();
            System.out.println(picURL);
            // Index of the current picture within the set, and the set's title
            String currentIndex = page.getHtml().xpath("//div[@class='ptitle']/span/text()").toString();
            String picname = page.getHtml().xpath("//div[@class='ptitle']/h1/text()").toString();
            if (!"1".equals(currentIndex)) {
                // Not the first picture: append the page index so later images do not overwrite the first one
                // picname = picname + "_" + StringUtil.getURLIndex(detailUrl);
            }
            // Total number of pictures in the set; on the first page, queue the remaining detail pages
            String allPic = page.getHtml().xpath("//div[@class='ptitle']/em/text()").toString();
            if (allPic != null && picURL != null && "1".equals(currentIndex)) {
                Integer pageindex = Integer.parseInt(allPic);
                List<String> otherPic = new ArrayList<String>();
                for (int i = 2; i <= pageindex; i++) {
                    otherPic.add(detailUrl.replace(".html", "_" + i + ".html"));
                }
                page.addTargetRequests(otherPic);
            }
            System.out.println(picname);
            try {
                // DownLoadUtils.download(image URL, file name, save directory)
                if (picURL != null) {
                    DownLoadUtils.download(picURL, picname + ".jpg", "E:\\image3\\");
                    System.out.println("Image No. " + (INDEX_PHOTO++));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

    }

    public Site getSite() {
        // Default site settings; see the configuration sketch after this class
        return Site.me();
    }


    public static void main(String[] args) throws JMException {

        Date stdate = new Date();
        System.out.println("Start time: " + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(stdate));
        Spider picSpider = Spider.create(new myImageProcess())
                .addUrl("http://www.win4000.com/zt/gaoqing_1.html")
                .thread(5);
        SpiderMonitor.instance().register(picSpider);
        // run() blocks until the crawl finishes; start() would return immediately
        // and the elapsed time printed below would always be close to zero
        picSpider.run();
        Date edDate = new Date();
        System.out.println("End time: " + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(edDate));
        System.out.println("Total time: " + (edDate.getTime() - stdate.getTime()) / 1000 / 60 + " minutes");
    }
}
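
getSite() above returns WebMagic's defaults. A real crawl of this kind usually also sets a retry count, a delay between requests, and a browser-like User-Agent. The snippet below is a minimal sketch of such a configuration using WebMagic's Site API; the concrete values and the UTF-8 charset are illustrative assumptions, not taken from the original article:

public Site getSite() {
    return Site.me()
            .setRetryTimes(3)      // retry a failed request up to 3 times
            .setSleepTime(1000)    // pause 1 second between requests
            .setTimeOut(10000)     // 10-second connection/read timeout
            .setCharset("utf-8")   // assumed page encoding
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)");
}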

DownLoadUtils.java

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.UUID;

/**
 * Utility class for downloading files
 *
 * @author xwer
 *
 */
public class DownLoadUtils {

    /**
     * Download an image
     *
     * @param urlString
     *            URL of the image
     * @param filename
     *            file name to save the image as
     * @param savePath
     *            directory in which to save the image
     * @throws Exception
     */
    public static void download(String urlString, String filename, String savePath) throws Exception {
        // Build the URL and open a connection
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        // Send a browser-like User-Agent so the server does not reject the request
        con.addRequestProperty("User-Agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");
        // 5-second connection timeout
        con.setConnectTimeout(5 * 1000);

        // Make sure the target directory exists
        File sf = new File(savePath);
        if (!sf.exists()) {
            sf.mkdirs();
        }

        // Copy the response body to the target file in 1 KB chunks;
        // try-with-resources closes both streams even if the copy fails
        try (InputStream is = con.getInputStream();
             OutputStream os = new FileOutputStream(new File(sf, filename))) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        }
    }

    /**
     * Extract the bare file name from a Windows-style path
     *
     * @param fileName full path, possibly containing backslashes
     * @return the part after the last backslash
     */
    public static String subFileName(String fileName) {
        // Position of the last backslash; -1 means the argument is already a bare file name
        int index = fileName.lastIndexOf("\\");
        if (index == -1) {
            return fileName;
        }
        return fileName.substring(index + 1);
    }

    /**
     * Generate a random UUID-based file name that keeps the original extension
     *
     * @param fileName original file name
     * @return random file name with the same extension
     */
    public static String generateRandonFileName(String fileName) {
        // Keep the original extension
        String ext = fileName.substring(fileName.lastIndexOf("."));
        return UUID.randomUUID().toString().replace("-", "") + ext;
    }
}
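
Because different picture sets can share the same title, the crawler above may overwrite files that end up with the same name. generateRandonFileName offers a simple way around that by deriving a random name that keeps the original extension. A small usage sketch follows; the image URL and target directory are placeholders, not taken from the original article:

// Hypothetical driver class for trying DownLoadUtils on its own
public class DownLoadDemo {
    public static void main(String[] args) throws Exception {
        String picURL = "http://pic1.win4000.com/wallpaper/sample.jpg"; // placeholder URL
        // Random UUID-based name that keeps the ".jpg" extension, avoiding collisions
        String fileName = DownLoadUtils.generateRandonFileName(picURL);
        DownLoadUtils.download(picURL, fileName, "E:\\image3\\");
        System.out.println("Saved as " + fileName);
    }
}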
