package com.lhsjohn.spider;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.bcel.generic.GETSTATIC;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
/**
 * Crawler for the ifeng.com home page.
 *
 * <p>{@link #main(String[])} downloads the home page, collects the headline
 * links whose {@code href} starts with {@code http://news.ifeng.com/a/} and
 * prints each linked article using one of two page-layout parsers
 * ({@link #getArticles(String)} or {@link #getArticles2(String)}).
 *
 * <p>All results are written to standard output; pages whose markup does not
 * match the expected selectors are reported on standard error and skipped
 * instead of aborting with an {@link IndexOutOfBoundsException}.
 *
 * @author lihuashuo
 */
public class JsoupTest {

    /** Home page that is scanned for headline links. */
    private static final String HOME_URL = "https://www.ifeng.com/";

    /** CSS selector of the headline {@code <ul>} lists on the home page. */
    private static final String HEADLINE_LIST_SELECTOR = "#FNew .fl.FNewM .FNewMTopLis ul";

    /** Only links starting with this prefix are treated as news articles. */
    private static final String ARTICLE_URL_PREFIX = "http://news.ifeng.com/a/";

    /** Visual separator printed between articles. */
    private static final String SEPARATOR = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";

    /**
     * Entry point: collects the headline links and prints every linked article.
     *
     * @param args ignored
     * @throws Exception if the home page or any article page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.connect(HOME_URL).get();

        // first() returns null when nothing matched, unlike get(0) which throws.
        Element firstList = doc.select(HEADLINE_LIST_SELECTOR).first();
        if (firstList == null) {
            System.err.println("No headline list matched selector: " + HEADLINE_LIST_SELECTOR);
            return;
        }

        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        collectArticleLinks(firstList, list);

        // The element following the first list is scanned the same way, when it exists.
        Element secondList = firstList.nextElementSibling();
        if (secondList != null) {
            collectArticleLinks(secondList, list);
        }

        for (Map<String, String> item : list) {
            System.out.println(item.toString());
        }
        System.out.println(list.size());
        System.out.println("======================开始测试=====================================");

        // The first two links are parsed with the ".yc_main" layout parser.
        if (list.size() > 0) {
            String newUrl = list.get(0).get("href");
            System.out.println(newUrl);
            getArticles(newUrl);
        }
        if (list.size() > 1) {
            String newsUrl2 = list.get(1).get("href");
            System.out.println(SEPARATOR);
            System.out.println(newsUrl2);
            getArticles(newsUrl2);
        }

        // NOTE(review): index 2 is never fetched, exactly as in the original code.
        // Whether that entry is skipped on purpose is not visible here - confirm.
        for (int i = 3; i < list.size(); i++) {
            String tempUrl = list.get(i).get("href");
            System.out.println(SEPARATOR);
            System.out.println(tempUrl);
            getArticles2(tempUrl);
        }
    }

    /**
     * Appends one {@code text}/{@code href} entry to {@code sink} for every child
     * of {@code listElement} whose anchors point at a news article.
     *
     * @param listElement element whose direct children are inspected
     * @param sink        list receiving the collected entries
     */
    private static void collectArticleLinks(Element listElement, List<Map<String, String>> sink) {
        for (Element child : listElement.children()) {
            Elements anchors = child.select("a");
            String href = anchors.attr("href");
            if (href.startsWith(ARTICLE_URL_PREFIX)) {
                Map<String, String> item = new HashMap<String, String>();
                item.put("text", anchors.text());
                item.put("href", href);
                sink.add(item);
            }
        }
    }

    /**
     * Fetches an article that uses the {@code .yc_main.wrap} layout and prints its
     * title, time, source, source link and body text to standard output.
     *
     * @param url address of the article page
     * @throws Exception if the page cannot be fetched
     */
    public static void getArticles(String url) throws Exception {
        Document document = Jsoup.connect(url).get();
        Element main = document.select(".yc_main.wrap").first();
        if (main == null) {
            System.err.println("Article container .yc_main.wrap not found: " + url);
            return;
        }

        Element titleBlock = main.select(".yc_tit").first();
        if (titleBlock != null) {
            for (Element child : titleBlock.children()) {
                String tag = child.tagName();
                System.out.println(tag);
                if ("h1".equals(tag)) {
                    System.out.println("标题:" + child.text());
                }
                if ("p".equals(tag)) {
                    // First child of the <p> is printed as the time, second as the source.
                    Elements meta = child.children();
                    if (meta.size() > 0) {
                        System.out.println("时间:" + meta.get(0).text());
                    }
                    if (meta.size() > 1) {
                        Element source = meta.get(1);
                        System.out.println("来源:" + source.text());
                        System.out.println("来源链接:" + source.attr("href"));
                    }
                }
            }
        }

        Elements body = main.select(".yc_con_l #yc_con_txt");
        System.out.println("element3 size " + body.size());
        if (!body.isEmpty()) {
            System.out.println("-------------content-------------");
            System.out.println(body.get(0).text());
        }
    }

    /**
     * Fetches an article that uses the {@code #artical} layout and prints its
     * topic, time, source, source address, first picture address and body text
     * to standard output. Parts that are missing from the page are skipped.
     *
     * @param url address of the article page
     * @throws Exception if the page cannot be fetched
     */
    public static void getArticles2(String url) throws Exception {
        Document document = Jsoup.connect(url).get();
        Elements articles = document.select("#artical");
        System.out.println(articles.size());
        if (articles.isEmpty()) {
            System.err.println("Article container #artical not found: " + url);
            return;
        }
        Element article = articles.get(0);

        Element topic = article.select("#artical_topic").first();
        if (topic != null) {
            System.out.println(topic.text());
        }

        // Time and source are looked up inside the first child of #artical_sth.
        Element info = article.select("#artical_sth").first();
        Elements infoChildren = (info == null) ? null : info.children();
        if (infoChildren != null && !infoChildren.isEmpty()) {
            Element metaRow = infoChildren.get(0);
            Element time = metaRow.select(".ss01").first();
            if (time != null) {
                System.out.println("时间:" + time.text());
            }
            Element source = metaRow.select(".ss03 a").first();
            if (source != null) {
                System.out.println("来源: " + source.text());
                System.out.println("来源地址:" + source.attr("href"));
            }
        }

        // Article content.
        Element content = article.select("#artical_real #main_content").first();
        if (content != null) {
            Elements pictures = content.select(".detailPic img");
            if (!pictures.isEmpty()) {
                System.out.println("图片地址:" + pictures.get(0).attr("src"));
            }
            System.out.println("新闻内容:" + content.text());
        }
    }
}
// Author: lhsjohn
// Reader comments