相关依赖
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
示例代码
myImageProcess.java
import com.bootdo.project.controller.pachong.DownLoadUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import javax.management.JMException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
* Created by ljf on 2019/8/9.
*/
public class myImageProcess implements PageProcessor {
private Logger logger = Logger.getLogger(this.getClass());
//页面URL的正则表达式
//.是匹配所有的字符,//.表示只匹配一个,//.?同理
private static String REGEX_PAGE_URL = "http://www\\.win4000\\.com/zt/gaoqing_[0-9]+.html";
//爬取的页数
public static int PAGE_SIZE = 6;
//下载张数
public static int INDEX_PHOTO =1;
public void process(Page page) {
List<String> SpidertURL = new ArrayList<String>();
for (int i = 2; i < PAGE_SIZE; i++){//添加到目标url中
SpidertURL.add("http://www.win4000.com/zt/gaoqing_" + i + ".html");
}
//添加url到请求中
page.addTargetRequests(SpidertURL);
//是图片列表页面
System.out.println(page.getUrl());
if (page.getUrl().regex(REGEX_PAGE_URL).match()) {
//获得所有详情页的连接
//page.getHtml().xpath("//a[@class=\"title\"]").links().all();
List<String> detailURL = page.getHtml().xpath("//ul[@class='clearfix']/li/a").links().all();
int x = 1;
for (String str:detailURL){//输出所有连接
System.out.println(x+"----"+str);
x++;
}
page.addTargetRequests(detailURL);
} else {//详情页
String detailUrl = page.getUrl().toString();
System.out.println(detailUrl);
String picURL = page.getHtml().xpath("//div[@class='pic-meinv']/a").css("img", "src").toString();
System.out.println(picURL);
String currentIndex = page.getHtml().xpath("//div[@class='ptitle']/span/text()").toString();
String picname = page.getHtml().xpath("//div[@class='ptitle']/h1/text()").toString();
if(!"1".equals(currentIndex)){//如果不是第一页,则图片名称加上页码顺序
// picname = picname+"_"+StringUtil.getURLIndex(detailUrl);
}
String allPic = page.getHtml().xpath("//div[@class='ptitle']/em/text()").toString();
if(allPic!= null && picURL != null && "1".equals(currentIndex)){
Integer pageindex = Integer.parseInt(allPic);
List<String>otherPic = new ArrayList<String>();
for(int i=2;i<=pageindex;i++){
otherPic.add(detailUrl.replaceAll(".html", "_"+i+".html"));
}
page.addTargetRequests(otherPic);
}
System.out.println(picname);
try {
/**
* String 图片地址
* String 图片名称
* String 保存路径*/
if(picURL !=null){
DownLoadUtils.download( picURL, picname + ".jpg", "E:\\image3\\");
System.out.println("第"+(INDEX_PHOTO++)+"张");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
public Site getSite() {
return Site.me();
}
public static void main(String [] args) throws JMException {
Date stdate = new Date();
System.out.println("开始时间:"+new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(stdate));
Spider picSpider = Spider.create(new myImageProcess())
.addUrl("http://www.win4000.com/zt/gaoqing_1.html")
.thread(5);
SpiderMonitor.instance().register(picSpider);
picSpider.start();
Date edDate = new Date();
System.out.println("结束时间:"+new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(edDate));
System.out.println("共耗时"+(edDate.getTime()-stdate.getTime())/1000/60+"分钟");
}
}
DownLoadUtils.java
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.UUID;
/**
 * Download-related helper methods.
 *
 * @author xwer
 */
public class DownLoadUtils {
    /** Connect timeout in milliseconds (5 s). */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;

    /** Maximum time in milliseconds a single read may block, so a stalled server cannot hang the caller forever. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /**
     * Downloads the resource at the given URL into a file.
     *
     * @param urlString
     *            address of the picture
     * @param filename
     *            name of the file to write
     * @param savePath
     *            directory the file is written to; created when missing
     * @throws Exception
     *             if the URL is malformed, the directory cannot be created, or
     *             reading or writing fails
     */
    public static void download(String urlString, String filename, String savePath) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        con.addRequestProperty("User-Agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");
        con.setConnectTimeout(CONNECT_TIMEOUT_MS);
        con.setReadTimeout(READ_TIMEOUT_MS);

        File dir = new File(savePath);
        // mkdirs() returns false when the directory already exists (possibly
        // created concurrently by another thread), hence the second check.
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new java.io.IOException("Cannot create directory: " + dir);
        }
        // Let File join directory and name with the platform's separator.
        File target = new File(dir, filename);

        // try-with-resources closes both streams even when the copy fails.
        try (InputStream is = con.getInputStream();
             OutputStream os = new FileOutputStream(target)) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        }
    }

    /**
     * Strips everything up to and including the last backslash from a file name.
     *
     * @param fileName
     *            file name, possibly prefixed with a backslash-separated path
     * @return the part after the last backslash, or the input when it contains none
     */
    public static String subFileName(String fileName) {
        int index = fileName.lastIndexOf("\\");
        if (index == -1) {
            return fileName;
        }
        return fileName.substring(index + 1);
    }

    /**
     * Generates a random file name from a UUID (dashes removed), keeping the
     * extension of the given file name.
     *
     * @param fileName
     *            original file name whose extension (from the last '.') is kept
     * @return the random name; it has no extension when the input has none
     */
    public static String generateRandonFileName(String fileName) {
        int dot = fileName.lastIndexOf(".");
        // Without this guard substring(-1) would throw for names without an extension.
        String ext = dot == -1 ? "" : fileName.substring(dot);
        return UUID.randomUUID().toString().replace("-", "") + ext;
    }
}
网友评论