美文网首页
Jsoup - Java HTML Parser

Jsoup - Java HTML Parser

作者: Tinyspot | 来源:发表于2023-10-18 15:47 被阅读0次

    1. Jsoup

    • 官方文档:https://jsoup.org/
    • Maven 依赖:
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.15.4</version>
    </dependency>
    

    1.1 加载 HTML

    /**
     * Demonstrates the ways jsoup can load HTML into a {@code Document}.
     *
     * <ul>
     *   <li>Way 1, complete HTML: {@code Jsoup.parse(String html)}</li>
     *   <li>Way 2, HTML fragment: {@code Jsoup.parseBodyFragment(String bodyHtml)}</li>
     *   <li>Way 3, load from a URL: {@code Jsoup.connect(String url)}</li>
     *   <li>Way 4, load from a file: {@code Jsoup.parse(File file, String charsetName)}
     *       (not exercised in this method)</li>
     * </ul>
     *
     * @throws IOException declared because the URL-based load performs network I/O
     */
    @Test
    public void demoOne() throws IOException {
        // Way 1: a complete HTML document.
        Document document = Jsoup.parse("<html>...</html>");

        // Way 2: a fragment, loaded with the fragment-specific API listed above.
        Document document2 = Jsoup.parseBodyFragment("<div>...</div>");

        // Way 3: fetch and parse a remote page (requires network access).
        Document document3 = Jsoup.connect("https://jsoup.org/").get();

        System.out.println(document);
    }
    

    2. DOM 解析

    示例文档 https://www.tutorialspoint.com/jsoup/jsoup_use_dom.htm

    文件 test.html

    <!doctype html>
    <html lang="en">
    <head>
        <title>jsoup - Using DOM Methods</title>
        <meta charset="utf-8">
    </head>
    <body>
        <div id="show">
            <img src="/jsoup/images/jsoup-mini-logo.jpg" alt="jsoup tutorial">
            <img class="footer_logo" src="/static/images/logo-footer.svg" alt="logo">
        </div>
        <div class="list">
            <ul class="chapters">
                <li class="heading">Jsoup Tutorial</li>
                <li><a href="/jsoup/index.htm">jsoup - Home</a></li>
                <li><a href="/jsoup/jsoup_use_dom.htm">jsoup - Using DOM Methods</a></li>
                <li><a href="/jsoup/jsoup_use_selector.htm">jsoup - Using Selector Syntax</a></li>
            </ul>
        </div>
        <div class="content">
            <p>Where</p>
            <ul>
                <li><p><b>document</b> − document object represents the HTML DOM.</p></li>
                <li><p><b>Jsoup</b> − main class to parse the given HTML String.</p></li>
                <li><p><b>html</b> − HTML String.</p></li>
            </ul>
            <p><i>JsoupTester.java</i></p>
            <pre class="pretty">
                Document document = Jsoup.parse(html);
                System.out.println(document.title());
                Elements paragraphs = document.getElementsByTag("p");
                for (Element paragraph : paragraphs) {
                 System.out.println(paragraph.text());
                }
            </pre>
        </div>
    </body>
    </html>
    

    内容解析

    /**
     * Parses the classpath resource {@code test.html} and prints several views of it
     * through the {@code JsoupUtil} helpers: title, an element by id, elements by tag,
     * elements by class, and links.
     *
     * @throws IllegalStateException if {@code test.html} is not on the classpath
     * @throws Exception if the resource URL cannot be converted to a URI or the file cannot be parsed
     */
    @Test
    public void demo() throws Exception {
        java.net.URL resource = ClassLoader.getSystemResource("test.html");
        if (resource == null) {
            // getSystemResource returns null for a missing resource; fail with a
            // descriptive message instead of a bare NullPointerException.
            throw new IllegalStateException("test.html not found on the classpath");
        }
        // Alternative: resource.getPath() gives the location as a String.
        Document document = Jsoup.parse(new File(resource.toURI()), "UTF-8");

        JsoupUtil.printTitle(document);
        // JsoupUtil.printBody(document);

        JsoupUtil.printElementById(document, "show");
        JsoupUtil.printDocumentTag(document, "p");
        JsoupUtil.printDocumentTag(document, "ul");
        JsoupUtil.printDocumentTag(document, "li");
        // <pre> keeps the original formatting of its text
        JsoupUtil.printDocumentTag(document, "pre");

        JsoupUtil.printElementClass(document, "content");
        JsoupUtil.printElementTag(document, "content", "li");

        JsoupUtil.printDocumentLinkTag(document);
        JsoupUtil.printElementLinkTag(document, "list");
    }
    

    工具类JsoupUtil

    /**
     * Static helpers that print parts of a jsoup {@code Document} or {@code Element}
     * to standard output.
     */
    public final class JsoupUtil {

        private JsoupUtil() {
            // Static utility class; not meant to be instantiated.
        }

        /**
         * Prints the document title.
         *
         * @param document the parsed document
         */
        public static void printTitle(Document document) {
            System.out.println(document.title());
        }

        /**
         * Prints the document's body element.
         *
         * @param document the parsed document
         */
        public static void printBody(Document document) {
            System.out.println(document.body());
        }

        /**
         * Prints the href and text of every {@code <a>} found inside elements
         * that carry the given class.
         *
         * @param document  the parsed document
         * @param className class name of the container elements to search in
         */
        public static void printElementLinkTag(Document document, String className) {
            for (Element container : document.getElementsByClass(className)) {
                printLinks(container.getElementsByTag("a"));
            }
        }

        /**
         * Prints the href and text of every {@code <a>} in the document,
         * e.g. {@code <a href="">xxx</a>}.
         *
         * @param document the parsed document
         */
        public static void printDocumentLinkTag(Document document) {
            printLinks(document.getElementsByTag("a"));
        }

        /**
         * Prints the text of every element in the document with the given tag name.
         *
         * @param document the parsed document
         * @param tag      tag name to look up, e.g. {@code "p"}
         */
        public static void printDocumentTag(Document document, String tag) {
            printTexts(document.getElementsByTag(tag));
        }

        /**
         * Prints the element with the given id, e.g. {@code <div id='idName'>}.
         * The lookup result is printed as-is, so a missing id prints {@code null}.
         *
         * @param document the parsed document
         * @param idName   id attribute value to look up
         */
        public static void printElementById(Document document, String idName) {
            Element content = document.getElementById(idName);
            System.out.println(content);
        }

        /**
         * Prints the text of every element with the given class,
         * e.g. {@code <div class="content">}.
         *
         * @param document  the parsed document
         * @param className class name to look up
         */
        public static void printElementClass(Document document, String className) {
            printTexts(document.getElementsByClass(className));
        }

        /**
         * Prints the text of every {@code tag} element found inside elements
         * that carry the given class.
         *
         * @param document  the parsed document
         * @param className class name of the container elements to search in
         * @param tag       tag name to look up inside each container
         */
        public static void printElementTag(Document document, String className, String tag) {
            for (Element container : document.getElementsByClass(className)) {
                printElementTag(container, tag);
            }
        }

        /**
         * Prints the text of every {@code tag} element found under the given element.
         *
         * @param element the element to search in
         * @param tag     tag name to look up
         */
        public static void printElementTag(Element element, String tag) {
            printTexts(element.getElementsByTag(tag));
        }

        /**
         * Prints the value of each named attribute of the element, one per line.
         *
         * @param element the element whose attributes are read
         * @param attrs   attribute names to print, in order
         */
        public static void printElementAttrs(Element element, String... attrs) {
            for (String attr : attrs) {
                System.out.println(element.attr(attr));
            }
        }

        /** Prints the href attribute and the text of each given link element. */
        private static void printLinks(Elements links) {
            for (Element link : links) {
                System.out.println("Href: " + link.attr("href"));
                System.out.println("Text: " + link.text());
            }
        }

        /** Prints the text of each given element, one per line. */
        private static void printTexts(Elements elements) {
            for (Element element : elements) {
                System.out.println(element.text());
            }
        }
    }
    

    3. Selector 选择器

    获取DOM

    /**
     * Loads the classpath resource {@code test.html} and parses it as UTF-8.
     *
     * @return the parsed document
     * @throws IllegalStateException if {@code test.html} is not on the classpath
     * @throws Exception if the resource URL cannot be converted to a URI or the file cannot be parsed
     */
    private Document getDocument() throws Exception {
        java.net.URL resource = ClassLoader.getSystemResource("test.html");
        if (resource == null) {
            // getSystemResource returns null for a missing resource; fail with a
            // descriptive message instead of a bare NullPointerException.
            throw new IllegalStateException("test.html not found on the classpath");
        }
        return Jsoup.parse(new File(resource.toURI()), "UTF-8");
    }
    

    3.1 <a> 选择

    /** Prints the href and text of every {@code <a>} element that has an href attribute. */
    @Test
    public void selectorHref() throws Exception {
        // "a[href]" selects <a> elements carrying an href attribute
        Elements anchors = getDocument().select("a[href]");
        for (int i = 0; i < anchors.size(); i++) {
            Element anchor = anchors.get(i);
            String href = anchor.attr("href");
            System.out.println("Href: " + href);
            String text = anchor.text();
            System.out.println("Text: " + text);
        }
    }
    

    3.2 <ul class="chapters">

    /** Prints the text of each {@code <li>} inside every {@code <ul class="chapters">}. */
    @Test
    public void selectorClass() throws Exception {
        // "ul.chapters" selects <ul> elements whose class is "chapters"
        Elements chapterLists = getDocument().select("ul.chapters");
        for (int i = 0; i < chapterLists.size(); i++) {
            Element chapterList = chapterLists.get(i);
            JsoupUtil.printElementTag(chapterList, "li");
        }
    }
    

    3.3 获取第一个匹配的元素

    /** Prints the first {@code <li>} element in the document. */
    @Test
    public void selectorFirst() throws Exception {
        // selectFirst yields the first match for the query
        Element firstItem = getDocument().selectFirst("li");
        System.out.println(firstItem);
    }
    

    3.4 按层级选择

    /**
     * Selects by hierarchy: prints the text of every {@code <pre>} that is a
     * direct child of a {@code <div>}, i.e. markup shaped like
     * {@code <div><pre>...</pre></div>}.
     */
    @Test
    public void selectorLayer() throws Exception {
        // "div > pre" selects <pre> elements that are direct children of a <div>
        Elements preBlocks = getDocument().select("div > pre");
        for (int i = 0; i < preBlocks.size(); i++) {
            Element preBlock = preBlocks.get(i);
            System.out.println(preBlock.text());
        }
    }
    

    3.5 <img src="..." alt="...">

    /**
     * Prints the alt attribute of every {@code <img>}, then the first image
     * whose src ends with {@code .jpg}.
     */
    @Test
    public void selectorImg() throws Exception {
        Document doc = getDocument();

        // "img" selects every <img> element
        Elements allImages = doc.select("img");
        for (int i = 0; i < allImages.size(); i++) {
            JsoupUtil.printElementAttrs(allImages.get(i), "alt");
        }

        // "img[src$=.jpg]" selects images whose src attribute ends with ".jpg"
        Elements jpgImages = doc.select("img[src$=.jpg]");
        Element firstJpg = jpgImages.first();
        System.out.println(firstJpg);
    }
    

    相关文章

      网友评论

          本文标题:Jsoup - Java HTML Parser

          本文链接:https://www.haomeiwen.com/subject/tldcidtx.html