1. Jsoup
- 官方文档
https://jsoup.org/
Maven 依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.4</version>
</dependency>
1.1 加载 HTML
/**
 * Demonstrates the ways jsoup can load HTML into a {@code Document}.
 * <ul>
 *   <li>Option 1 - complete HTML: {@code Jsoup.parse(String html)}</li>
 *   <li>Option 2 - HTML fragment: {@code Jsoup.parseBodyFragment(String bodyHtml)}</li>
 *   <li>Option 3 - load from a URL: {@code Jsoup.connect(String url)}</li>
 *   <li>Option 4 - load from a file: {@code Jsoup.parse(File file, String charsetName)}
 *       (not exercised in this method)</li>
 * </ul>
 *
 * @throws IOException if fetching the remote page fails
 */
@Test
public void demoOne() throws IOException {
    // Option 1: parse a complete HTML document.
    Document document = Jsoup.parse("<html>...</html>");
    // Option 2: parse a fragment with the dedicated fragment API listed above.
    Document document2 = Jsoup.parseBodyFragment("<div>...</div>");
    // Option 3: fetch and parse a page over HTTP (needs network access).
    Document document3 = Jsoup.connect("https://jsoup.org/").get();
    // Only the first document is printed; the other two just show the API calls.
    System.out.println(document);
}
2. DOM 解析
示例文档 https://www.tutorialspoint.com/jsoup/jsoup_use_dom.htm
文件 test.html
<!doctype html>
<html lang="en">
<head>
<title>jsoup - Using DOM Methods</title>
<meta charset="utf-8">
</head>
<body>
<div id="show">
<img src="/jsoup/images/jsoup-mini-logo.jpg" alt="jsoup tutorial">
<img class="footer_logo" src="/static/images/logo-footer.svg" alt="logo">
</div>
<div class="list">
<ul class="chapters">
<li class="heading">Jsoup Tutorial</li>
<li><a href="/jsoup/index.htm">jsoup - Home</a></li>
<li><a href="/jsoup/jsoup_use_dom.htm">jsoup - Using DOM Methods</a></li>
<li><a href="/jsoup/jsoup_use_selector.htm">jsoup - Using Selector Syntax</a></li>
</ul>
</div>
<div class="content">
<p>Where</p>
<ul>
<li><p><b>document</b> − document object represents the HTML DOM.</p></li>
<li><p><b>Jsoup</b> − main class to parse the given HTML String.</p></li>
<li><p><b>html</b> − HTML String.</p></li>
</ul>
<p><i>JsoupTester.java</i></p>
<pre class="pretty">
Document document = Jsoup.parse(html);
System.out.println(document.title());
Elements paragraphs = document.getElementsByTag("p");
for (Element paragraph : paragraphs) {
System.out.println(paragraph.text());
}
</pre>
</div>
</body>
</html>
内容解析
/**
 * Parses {@code test.html} from the classpath and prints selected parts of it
 * through the {@code JsoupUtil} helpers.
 *
 * @throws Exception if the resource URL cannot be converted to a URI or the file cannot be parsed
 * @throws NullPointerException if {@code test.html} is not on the classpath
 */
@Test
public void demo() throws Exception {
    // getSystemResource returns null when the resource is missing; fail with a clear message
    // instead of a bare NullPointerException on toURI().
    // (ClassLoader.getSystemResource("test.html").getPath() is an alternative that yields a String path.)
    URI uri = java.util.Objects.requireNonNull(
            ClassLoader.getSystemResource("test.html"),
            "test.html not found on the classpath").toURI();
    Document document = Jsoup.parse(new File(uri), "UTF-8");

    JsoupUtil.printTitle(document);
    // JsoupUtil.printBody(document) can be called here to dump the whole <body>.
    JsoupUtil.printElementById(document, "show");

    JsoupUtil.printDocumentTag(document, "p");
    JsoupUtil.printDocumentTag(document, "ul");
    JsoupUtil.printDocumentTag(document, "li");
    // <pre> keeps the original formatting of its text
    JsoupUtil.printDocumentTag(document, "pre");

    JsoupUtil.printElementClass(document, "content");
    JsoupUtil.printElementTag(document, "content", "li");

    JsoupUtil.printDocumentLinkTag(document);
    JsoupUtil.printElementLinkTag(document, "list");
}
工具类JsoupUtil
/**
 * Console-printing helpers for exploring a jsoup {@code Document}.
 * Every method writes to {@code System.out} and returns nothing.
 */
public class JsoupUtil {

    /**
     * Prints the document title.
     *
     * @param document the parsed document
     */
    public static void printTitle(Document document) {
        System.out.println(document.title());
    }

    /**
     * Prints the document's {@code <body>} element.
     *
     * @param document the parsed document
     */
    public static void printBody(Document document) {
        System.out.println(document.body());
    }

    /**
     * Prints href and text of every {@code <a>} found inside elements carrying the given class.
     *
     * @param document  the parsed document
     * @param className CSS class of the container elements to search in
     */
    public static void printElementLinkTag(Document document, String className) {
        for (Element container : document.getElementsByClass(className)) {
            printLinks(container.getElementsByTag("a"));
        }
    }

    /**
     * Prints href and text of every {@code <a>} in the document,
     * e.g. {@code <a href="">xxx</a>}.
     *
     * @param document the parsed document
     */
    public static void printDocumentLinkTag(Document document) {
        printLinks(document.getElementsByTag("a"));
    }

    /**
     * Prints the text of every element in the document with the given tag name.
     *
     * @param document the parsed document
     * @param tag      tag name to look up, e.g. {@code "p"}
     */
    public static void printDocumentTag(Document document, String tag) {
        printTexts(document.getElementsByTag(tag));
    }

    /**
     * Prints the element with the given id, e.g. {@code <div id='idName'>}.
     * When the lookup yields {@code null}, the literal text {@code null} is printed.
     *
     * @param document the parsed document
     * @param idName   the id to look up
     */
    public static void printElementById(Document document, String idName) {
        Element content = document.getElementById(idName);
        System.out.println(content);
    }

    /**
     * Prints the text of every element carrying the given class,
     * e.g. {@code <div class="content">}.
     *
     * @param document  the parsed document
     * @param className CSS class to look up
     */
    public static void printElementClass(Document document, String className) {
        printTexts(document.getElementsByClass(className));
    }

    /**
     * Prints the text of every {@code tag} element found inside elements carrying the given class.
     *
     * @param document  the parsed document
     * @param className CSS class of the container elements to search in
     * @param tag       tag name to look up inside each container
     */
    public static void printElementTag(Document document, String className, String tag) {
        for (Element container : document.getElementsByClass(className)) {
            printElementTag(container, tag);
        }
    }

    /**
     * Prints the text of every {@code tag} element found under the given element.
     *
     * @param element the element to search in
     * @param tag     tag name to look up
     */
    public static void printElementTag(Element element, String tag) {
        printTexts(element.getElementsByTag(tag));
    }

    /**
     * Prints the value of each named attribute of the element, one per line,
     * in the order the names are given.
     *
     * @param element the element to read attributes from
     * @param attrs   attribute names to print
     */
    public static void printElementAttrs(Element element, String... attrs) {
        for (String attr : attrs) {
            System.out.println(element.attr(attr));
        }
    }

    /** Prints an "Href: ..." line followed by a "Text: ..." line for each link. */
    private static void printLinks(Elements links) {
        for (Element link : links) {
            System.out.println("Href: " + link.attr("href"));
            System.out.println("Text: " + link.text());
        }
    }

    /** Prints the text of each element on its own line. */
    private static void printTexts(Elements elements) {
        for (Element element : elements) {
            System.out.println(element.text());
        }
    }
}
3. Selector 选择器
获取DOM
/**
 * Loads and parses {@code test.html} from the classpath as UTF-8.
 *
 * @return the parsed document
 * @throws Exception if the resource URL cannot be converted to a URI or the file cannot be parsed
 * @throws NullPointerException if {@code test.html} is not on the classpath
 */
private Document getDocument() throws Exception {
    // getSystemResource returns null when the resource is missing; fail with a clear message
    // instead of a bare NullPointerException on toURI().
    URI uri = java.util.Objects.requireNonNull(
            ClassLoader.getSystemResource("test.html"),
            "test.html not found on the classpath").toURI();
    return Jsoup.parse(new File(uri), "UTF-8");
}
3.1 <a> 选择
/**
 * Prints the href and text of every anchor matched by the {@code a[href]} selector.
 */
@Test
public void selectorHref() throws Exception {
    // a[href]: <a> elements that carry an href attribute
    Elements anchors = getDocument().select("a[href]");
    for (int i = 0; i < anchors.size(); i++) {
        Element anchor = anchors.get(i);
        String href = anchor.attr("href");
        System.out.println("Href: " + href);
        String label = anchor.text();
        System.out.println("Text: " + label);
    }
}
3.2 按 class 选择 <ul class="chapters">
/**
 * Prints the text of each {@code <li>} inside every {@code <ul class="chapters">}.
 */
@Test
public void selectorClass() throws Exception {
    // ul.chapters: <ul> elements whose class list contains "chapters"
    Elements chapterLists = getDocument().select("ul.chapters");
    for (int i = 0; i < chapterLists.size(); i++) {
        Element chapterList = chapterLists.get(i);
        JsoupUtil.printElementTag(chapterList, "li");
    }
}
3.3 获取第一个匹配的元素
/**
 * Prints the first {@code <li>} element in the document, or {@code null} when there is none.
 */
@Test
public void selectorFirst() throws Exception {
    Document document = getDocument();
    // selectFirst stops at the first match instead of collecting every <li> and taking first().
    Element first = document.selectFirst("li");
    System.out.println(first);
}
3.4 按层级选择
/**
 * Prints the text of every {@code <pre>} that is a direct child of a {@code <div>}, i.e.
 * markup shaped like {@code <div><pre>...</pre></div>}.
 */
@Test
public void selectorLayer() throws Exception {
    // "div > pre": the child combinator matches <pre> elements directly under a <div>
    Elements preBlocks = getDocument().select("div > pre");
    for (int i = 0; i < preBlocks.size(); i++) {
        String content = preBlocks.get(i).text();
        System.out.println(content);
    }
}
3.5 按标签和属性选择 <img src="..." alt="...">
/**
 * Prints the {@code alt} attribute of every {@code <img>}, then prints the first image whose
 * {@code src} ends with {@code .jpg}.
 */
@Test
public void selectorImg() throws Exception {
    Document page = getDocument();

    // every <img> element in the document
    Elements allImages = page.select("img");
    for (int i = 0; i < allImages.size(); i++) {
        JsoupUtil.printElementAttrs(allImages.get(i), "alt");
    }

    // img[src$=.jpg]: <img> elements whose src attribute ends with ".jpg"
    Elements jpgImages = page.select("img[src$=.jpg]");
    Element firstJpg = jpgImages.first();
    System.out.println(firstJpg);
}
网友评论