Preface
After picking up some webmagic, I wanted to put it to work, so once again I went after Qidian (qidian.com).
Getting Started
1. First, add the dependencies and pick a target to crawl:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

2. Use XPath syntax to locate the div with the cf class:
Selectable div = page.getHtml().xpath("//div[@class=\"focus-wrap box-center mb40 cf\"]");
3. Then use CSS selectors to get classify-list and the dd elements under it:
Selectable div2 = div.$(".classify-list").css(".cf");
4. Use a CSS selector to select all the dd elements, then iterate over them:
List<String> all = div2.css("dd").all();
5. Finally, use XPath and CSS selectors to pull out the link and the text inside each dd.
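For one dd entry that boils down to a couple of calls (a minimal sketch mirroring the full code below; dd here stands for a single entry of the classify list):

Selectable a = dd.xpath("a[@href]");          // the anchor inside the dd
String href = a.$("a", "href").get();         // its href attribute
String name = a.xpath("i/text()").get();      // the visible text, wrapped in an <i> tag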
6. The full code is below:
6.1 Crawl the category names and page paths
import org.springframework.context.annotation.Bean;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class QidianCategoryProcessor implements PageProcessor, Serializable {

    // results collected by EveryCategoryProcessor: category name -> (book title -> book URL)
    private static List<Map<Selectable, Map<Selectable, String>>> out = new ArrayList<>();

    @Override
    public void process(Page page) {
        // locate the focus wrapper, then narrow it down to the classify-list block
        Selectable div = page.getHtml().xpath("//div[@class=\"focus-wrap box-center mb40 cf\"]");
        Selectable div2 = div.$(".classify-list").css(".cf");
        // one <dd> per category
        List<String> all = div2.css("dd").all();
        for (int i = 1; i <= all.size(); i++) {
            Selectable dd = div2.xpath("dd[" + i + "]");
            Selectable a = dd.xpath("a[@href]");
            Selectable href = a.$("a", "href"); // extract the href attribute
            String url;
            if (i != all.size() - 1) {
                url = "https://www.qidian.com/" + href;
            } else {
                // this entry carries a protocol-relative href, so only the scheme is prepended
                url = "https:" + href;
            }
            // get the text through the nearest tag that wraps it
            Selectable name = a.xpath("i/text()");
            EveryCategoryProcessor.in(url, name);
        }
        out = EveryCategoryProcessor.out();
    }

    @Override
    public Site getSite() {
        return Site.me().setTimeOut(5000).setCharset("utf-8").setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36").setSleepTime(100).setRetryTimes(3);
    }

    @Bean
    public String run() {
        Spider.create(new QidianCategoryProcessor()).addUrl("https://www.qidian.com/").thread(5).run();
        return "ok";
    }

    public static List<Map<Selectable, Map<Selectable, String>>> get() {
        return out;
    }
}
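The @Bean method only fires if this class is picked up by a Spring context; outside of Spring, a plain main method does the same job (a minimal sketch reusing the class above; the class name CrawlerMain is made up):

public class CrawlerMain {
    public static void main(String[] args) {
        // start from the Qidian home page; the processor fans out into the per-category spiders itself
        Spider.create(new QidianCategoryProcessor())
                .addUrl("https://www.qidian.com/")
                .thread(5)
                .run();
    }
}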
6.2 Crawl a portion of the data under each category
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class EveryCategoryProcessor implements PageProcessor, Serializable {

    // category name -> (book title -> book URL)
    private static Map<Selectable, Map<Selectable, String>> map = new HashMap<>();
    private Map<Selectable, String> map2 = new HashMap<>();
    private static Selectable name;

    @Override
    public void process(Page page) {
        // the book list of a category page sits in a <tbody>, one book per <tr>
        Selectable tbody = page.getHtml().xpath("//tbody");
        List<String> tr = tbody.css("tr").all();
        for (int i = 1; i <= tr.size(); i++) {
            Selectable row = tbody.xpath("tr[" + i + "]");
            if ((row + "").equals("null")) {
                continue; // skip rows the selector could not resolve
            }
            // the second cell holds the link to the book
            Selectable cell = row.css("td:nth-child(2)");
            Selectable href = cell.$("a", "href");
            Selectable text = cell.xpath("a/text()");
            map2.put(text, "https:" + href);
        }
        map.put(name, map2);
    }

    @Override
    public Site getSite() {
        return Site.me().setTimeOut(5000).setCharset("utf-8").setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36").setSleepTime(100).setRetryTimes(3);
    }

    // called by QidianCategoryProcessor for each category link it finds
    static void in(String url, Selectable name) {
        EveryCategoryProcessor.name = name;
        Spider.create(new EveryCategoryProcessor()).addUrl(url).thread(5).run();
    }

    static List<Map<Selectable, Map<Selectable, String>>> out() {
        List<Map<Selectable, Map<Selectable, String>>> list = new ArrayList<>();
        list.add(map);
        return list;
    }
}
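Once the crawl finishes, the nested result can be read back through QidianCategoryProcessor.get(); a small usage sketch that just prints every category and its books:

List<Map<Selectable, Map<Selectable, String>>> result = QidianCategoryProcessor.get();
for (Map<Selectable, Map<Selectable, String>> categories : result) {
    for (Map.Entry<Selectable, Map<Selectable, String>> category : categories.entrySet()) {
        System.out.println("Category: " + category.getKey());
        // each inner entry maps a book title to its URL
        category.getValue().forEach((title, url) -> System.out.println("  " + title + " -> " + url));
    }
}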
