A Simple Jsoup Crawler Example [Notes] ---- Crawling Multiple ifeng News Articles by URL

Author: lhsjohn | Published 2019-03-03 23:27
package com.lhsjohn.spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * ifeng.com news crawler
 * @author lihuashuo
 *
 */
public class JsoupTest {

    public static void main(String[] args) throws Exception {
        String url = "https://www.ifeng.com/";
        // CSS selector for the headline list on the ifeng.com front page
        String selection = "#FNew .fl.FNewM .FNewMTopLis ul";
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        Document doc = Jsoup.connect(url).get();
        Elements elements = doc.select(selection);
        Element element = elements.get(0);
        Elements children = element.children();
        // Collect the text and href of each news link in the first list
        for (Element element2 : children) {
            Elements aElement = element2.select("a");
            if (aElement.attr("href").startsWith("http://news.ifeng.com/a/")) {
                Map<String, String> item = new HashMap<String, String>();
                item.put("text", aElement.text());
                item.put("href", aElement.attr("href"));
                list.add(item);
            }
        }
        // The next sibling element holds a second list of links; collect those too
        Element nextElementSibling = element.nextElementSibling();
        Elements children2 = nextElementSibling.children();

        for (Element element3 : children2) {
            Elements aElement2 = element3.select("a");
            if (aElement2.attr("href").startsWith("http://news.ifeng.com/a/")) {
                Map<String, String> item = new HashMap<String, String>();
                item.put("text", aElement2.text());
                item.put("href", aElement2.attr("href"));
                list.add(item);
            }
        }

        // Print every collected link and the total count
        for (Map<String, String> map : list) {
            System.out.println(map.toString());
        }
        System.out.println(list.size());

        System.out.println("======================开始测试=====================================");
        // The first two articles use the ".yc_main.wrap" page layout; parse them with getArticles()
        String newUrl = list.get(0).get("href");
        System.out.println(newUrl);
        getArticles(newUrl);
        String newsUrl2 = list.get(1).get("href");
        System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        System.out.println(newsUrl2);
        getArticles(newsUrl2);
        // The remaining links use the "#artical" layout and go through getArticles2();
        // note that the link at index 2 is skipped
        for (int i = 3; i < list.size(); i++) {
            String tempUrl = list.get(i).get("href");
            System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
            System.out.println(tempUrl);
            getArticles2(tempUrl);
        }
    }

    /**
     * Parses an article page that uses the ".yc_main.wrap" layout and prints
     * its title, timestamp, source, and body text.
     */
    public static void getArticles(String url) throws Exception {

        Document document = Jsoup.connect(url).get();
        String selection = ".yc_main.wrap";
        Elements elements = document.select(selection);
        Element element = elements.get(0);
        // The ".yc_tit" block holds the title, timestamp, and source line
        Elements elements2 = element.select(".yc_tit");
        if (!elements2.isEmpty()) {
            Element element2 = elements2.get(0);
            Elements children = element2.children();
            for (Element element3 : children) {
                if (element3.tagName().equals("h1")) {
                    System.out.println("Title: " + element3.text());
                }
                if (element3.tagName().equals("p")) {
                    Elements children2 = element3.children();
                    Element element4 = children2.get(0);
                    System.out.println("Time: " + element4.text());
                    Element element5 = children2.get(1);
                    System.out.println("Source: " + element5.text());
                    System.out.println("Source link: " + element5.attr("href"));
                }

            }

        }

        // The article body lives in "#yc_con_txt" inside the left content column
        Elements element6 = element.select(".yc_con_l #yc_con_txt");
        System.out.println("element6 size " + element6.size());
        if (!element6.isEmpty()) {
            System.out.println("-------------content-------------");
            Element element7 = element6.get(0);
            System.out.println(element7.text());
        }
    }

    /**
     * Parses an article page that uses the "#artical" layout and prints
     * its title, timestamp, source, first image, and body text.
     */
    public static void getArticles2(String url) throws Exception {
        Document document = Jsoup.connect(url).get();
        String selection = "#artical";
        Elements elements = document.select(selection);
        System.out.println(elements.size());
        Element element = elements.get(0);
        // Article title
        Element element2 = element.select("#artical_topic").get(0);
        System.out.println(element2.text());
        // Timestamp and source line
        Element element3 = element.select("#artical_sth").get(0);
        Elements children = element3.children();
        Element element4 = children.get(0);
        Element element5 = element4.select(".ss01").get(0);
        System.out.println("Time: " + element5.text());
        Element element6 = element4.select(".ss03 a").get(0);
        System.out.println("Source: " + element6.text());
        System.out.println("Source link: " + element6.attr("href"));
        // Extract the article body
        Element element7 = element.select("#artical_real #main_content").get(0);
        Elements element8 = element7.select(".detailPic img");

        if (!element8.isEmpty()) {
            System.out.println("Image URL: " + element8.get(0).attr("src"));
        }

        System.out.println("Article text: " + element7.text());
    }

}
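
The two link-collection loops in main() can be collapsed into one pass using Jsoup's CSS attribute-prefix selector, a[href^=...], which matches anchors whose href starts with a given prefix. A minimal sketch, assuming the same front-page markup as above (the #FNew container and the http://news.ifeng.com/a/ prefix come from the original code; the class name LinkCollector is illustrative). Note it selects every matching link under #FNew rather than only the two specific lists, so the result set may differ slightly:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class LinkCollector {

    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.connect("https://www.ifeng.com/").get();
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        // a[href^=...] replaces the manual startsWith() check from the loops above
        for (Element a : doc.select("#FNew a[href^=http://news.ifeng.com/a/]")) {
            Map<String, String> item = new HashMap<String, String>();
            item.put("text", a.text());
            item.put("href", a.attr("href"));
            list.add(item);
        }
        System.out.println(list.size() + " links collected");
    }
}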


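Jsoup.connect(url).get() in the code above relies on Jsoup's defaults, including a Java user-agent string that some sites reject. A minimal sketch of a more defensive fetch helper (the helper name fetch, the user-agent value, and the single retry are illustrative assumptions, not part of the original notes):

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class FetchHelper {

    // Illustrative helper: browser-like user agent, explicit timeout, one retry
    static Document fetch(String url) throws IOException {
        try {
            return Jsoup.connect(url)
                    .userAgent("Mozilla/5.0") // assumed value; any browser-like UA works
                    .timeout(10000)           // 10-second timeout, in milliseconds
                    .get();
        } catch (IOException first) {
            // retry once on a transient network failure
            return Jsoup.connect(url)
                    .userAgent("Mozilla/5.0")
                    .timeout(10000)
                    .get();
        }
    }
}

Each Jsoup.connect(...).get() call in JsoupTest could then be replaced with fetch(...).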

