First, here is the Quanshuwang site: http://www.quanshuwang.com/
This program is built on the WebMagic crawler framework. WebMagic is a simple, flexible Java crawler framework that lets you quickly build an efficient, easy-to-maintain crawler.
The program needs the WebMagic framework jars; they are easy to find online (the core artifacts on Maven Central are us.codecraft:webmagic-core and us.codecraft:webmagic-extension).
Without further ado, here is the code:
package com.baweihu.wler;
import java.util.UUID;
import com.alibaba.fastjson.JSONObject;
import com.baweihu.entity.Novel;
import com.baweihu.util.FileUtil;
import com.baweihu.util.ImageDownload;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
/**
* @author huluwa
* @version 1.0.0
* */
public class QuanShu implements Runnable, PageProcessor {
    // Site configuration for the target: page charset, retry count, crawl interval and timeout
    private Site site = Site.me().setCharset("gbk").setRetryTimes(0).setSleepTime(0).setTimeOut(8000);
    // Start (inclusive) and end (exclusive) of the book id range handled by this thread
    private int start;
    private int end;
    public QuanShu(int start, int end) {
        super();
        this.start = start;
        this.end = end;
    }
    public QuanShu() {
        super();
    }
    @Override
    public Site getSite() {
        return site;
    }
    @Override
    public void run() {
        // Crawl every book detail page in this thread's id range; each URL gets its own Spider run
        for (; start < end; start++) {
            Spider.create(new QuanShu()).addUrl("http://www.quanshuwang.com/book_" + start + ".html").thread(2).run();
        }
    }
    /**
     * Splits the id range [0, length) into chunks and starts one crawler thread per chunk.
     *
     * @param length    total number of book ids to crawl
     * @param threadNum number of threads to start
     */
    public synchronized void handleList(int length, int threadNum) {
        // Chunk size rounded up so the threads cover the whole range even when it does not divide evenly
        int t = (length + threadNum - 1) / threadNum;
        for (int i = 0; i < threadNum; i++) {
            int start = i * t;
            // Clamp the last chunk so no thread runs past the end of the id range
            int end = Math.min((i + 1) * t, length);
            QuanShu test = new QuanShu(start, end);
            new Thread(test).start();
        }
    }
    @Override
    public void process(Page page) {
        try {
            Html html = page.getHtml();
            // The book's metadata is exposed through <meta> tags in the page head
            String title = html.xpath("/html/head/meta[5]/@content").get();
            if (null == title) {
                return; // No title means this id is not a valid book page, so skip it
            }
            String author = html.xpath("/html/head/meta[9]/@content").get();
            String classfy = html.xpath("/html/head/meta[8]/@content").get();
            String isFinish = html.xpath("/html/head/meta[11]/@content").get();
            String coverUrl = html.xpath("/html/head/meta[7]/@content").get();
            String chapterUrl = html.xpath("/html/head/meta[14]/@content").get();
            String introduce = html.xpath("/html/head/meta[6]/@content").get();
            String updateTime = html.xpath("/html/head/meta[12]/@content").get();
            // Serialize the book info to JSON and store it, together with the cover image, in a per-book folder
            String json = JSONObject.toJSONString(new Novel(title, author, classfy, isFinish, introduce, coverUrl, chapterUrl, updateTime));
            String no = UUID.randomUUID().toString().replace("-", "");
            FileUtil.writeToFile("file/" + no + "/info.txt", json);
            ImageDownload.downloadPicture(coverUrl, "file/" + no + "/cover.jpg");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    public static void main(String[] args) {
        QuanShu qs = new QuanShu();
        // Crawl book ids 0 through 95729 using 12 worker threads
        qs.handleList(95730, 12);
    }
}
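The Novel entity and the FileUtil / ImageDownload helpers from the com.baweihu packages are not shown in the original post. Novel is just an entity class holding the eight String fields passed to its constructor, with getters that fastjson's JSONObject.toJSONString relies on. Below is a minimal sketch of what the two utility classes might look like, assuming writeToFile creates the parent directories and writes the text out, and downloadPicture simply streams the image bytes to disk; the actual helpers in the original project may differ.

// FileUtil.java (sketch)
package com.baweihu.util;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
public class FileUtil {
    // Writes the given text to the target file, creating parent directories as needed
    public static void writeToFile(String path, String content) throws Exception {
        Path target = Paths.get(path);
        Files.createDirectories(target.getParent());
        Files.write(target, content.getBytes(StandardCharsets.UTF_8));
    }
}

// ImageDownload.java (sketch)
package com.baweihu.util;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
public class ImageDownload {
    // Streams the image at the given URL into the target file
    public static void downloadPicture(String urlString, String path) throws Exception {
        Path target = Paths.get(path);
        Files.createDirectories(target.getParent());
        try (InputStream in = new URL(urlString).openStream()) {
            Files.copy(in, target);
        }
    }
}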