需要的jar依赖（Maven）：
<!--html解析-->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<!--html动态加载-->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.36.0</version>
</dependency>
直接上代码，基本上各种情况都有用到：
import cn.hutool.core.bean.BeanUtil;
import cn.hutool.core.util.StrUtil;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.jcwl.extented.biz.NewsContentXiBiz;
import com.jcwl.extented.model.NewsContentXiCriteria;
import com.jcwl.extented.utils.SimHash;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.IOException;
import java.net.ConnectException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.time.LocalDateTime;
import java.util.*;
/**
 * Crawler for National People's Congress (NPC) website data.
 * @ClassName: NationalPeopleIsCongressStarter
 * @Description: crawler class for NPC data
 * @author: xl
 * @date: 2021-10-15 10:09
 * @version 1.0
 */
@Slf4j
@Service
public class NationalPeopleIsCongressStarter implements CommonSpiderTask {
private static final String HOMEPAGE_NEWS_URL = "http://www.npc.gov.cn";
private static final String LEGISLATION_URL = "http://www.npc.gov.cn/npc/c183/list.shtml";
private static final String SUPERVISE_URL = "http://www.npc.gov.cn/npc/c184/list.shtml";
private static final String DELEGATE_URL = "http://www.npc.gov.cn/npc/c185/list.shtml";
private static final String ELECTION_URL = "http://www.npc.gov.cn/npc/c204/list.shtml";
private static final String NPC_DEPUTY_MEETING_URL = "http://www.npc.gov.cn/npc/c16455/list.shtml";
private static final String NPC_COMMITTEE_MEETING_URL = "http://www.npc.gov.cn/npc/c16454/list.shtml";
private static final String NPC_CHAIRMAN_MEETING_URL = "http://www.npc.gov.cn/npc/c34618/ssj_wyz_list.shtml";
private static final String NPC_OFFIC_EWORK_URL = "http://www.npc.gov.cn/npc/c22454/list.shtml";
private static final String NPC_LOCAL_PCTASK_URL = "http://www.npc.gov.cn/npc/c190/list.shtml";
private static final String SITE_CODE = "1001";
@Resource
private NewsContentXiBiz newsContentXiBiz;
private SimHash simHash = SimHash.contentSimHash();
/**
 * Runs every NPC crawl task in a fixed order: homepage news, legislation, supervision,
 * delegates, elections, deputy meetings, committee meetings, chairman meetings,
 * office work and local congresses.
 *
 * @param isAppend flag handed on unchanged to every task. None of the tasks in this
 *                 class currently reads it, so its value does not influence the crawl.
 */
public void crawlerNPCNews(boolean isAppend) {
    // Forward the caller's flag instead of a hard-coded "true" so the parameter is honoured
    // as soon as a task starts to consult it.
    psHomepageNewsTask(isAppend);
    psLegislationTask(isAppend);
    psSuperviseTask(isAppend);
    psDelegateTask(isAppend);
    psElectionTask(isAppend);
    psNPCDeputyMeetingTask(isAppend);
    psNPCCommitteeMeetingTask(isAppend);
    psNPCChairmanMeetingTask(isAppend);
    psNPCOfficeworkTask(isAppend);
    psNPCLocalPCTask(isAppend);
}
/**
 * Crawls the homepage news list.
 * <p>
 * Links are taken from the first element with class {@code sliderRight} on
 * {@code HOMEPAGE_NEWS_URL} and each one is stored through {@code parseJson}
 * under category code {@code 1001001}.
 *
 * @param isAppend not read by this method
 */
public void psHomepageNewsTask(boolean isAppend) {
    execute(null, null, "sliderRight", HOMEPAGE_NEWS_URL)
            .forEach(link -> parseJson("1001001", "主页新闻", null, link));
}
/**
 * Crawls the legislation list.
 * <p>
 * Every page URL returned by {@code urlList(LEGISLATION_URL)} is scanned for links inside
 * the first element with class {@code fl s_lw}; each link is stored through
 * {@code parseJson} under category code {@code 1001002}.
 *
 * @param isAppend not read by this method
 */
public void psLegislationTask(boolean isAppend) {
    for (String pageUrl : urlList(LEGISLATION_URL)) {
        execute(null, null, "fl s_lw", pageUrl)
                .forEach(link -> parseJson("1001002", "立法", null, link));
    }
}
/**
 * Crawls the supervision list.
 * <p>
 * Every page URL returned by {@code urlList(SUPERVISE_URL)} is scanned for links inside
 * the first element with class {@code fl s_lw}; each link is stored through
 * {@code parseJson} under category code {@code 1001003}.
 *
 * @param isAppend not read by this method
 */
public void psSuperviseTask(boolean isAppend) {
    for (String pageUrl : urlList(SUPERVISE_URL)) {
        execute(null, null, "fl s_lw", pageUrl)
                .forEach(link -> parseJson("1001003", "监督", null, link));
    }
}
/**
 * Crawls the delegates list.
 * <p>
 * Every page URL returned by {@code urlList(DELEGATE_URL)} is scanned for links inside
 * the first element with class {@code fl s_lw}; each link is stored through
 * {@code parseJson} under category code {@code 1001004}.
 *
 * @param isAppend not read by this method
 */
public void psDelegateTask(boolean isAppend) {
    for (String pageUrl : urlList(DELEGATE_URL)) {
        execute(null, null, "fl s_lw", pageUrl)
                .forEach(link -> parseJson("1001004", "代表", null, link));
    }
}
/**
 * Crawls the elections and appointments list.
 * <p>
 * Every page URL returned by {@code urlList(ELECTION_URL)} is scanned for links inside
 * the first element with class {@code fl s_lw}; each link is stored through
 * {@code parseJson} under category code {@code 1001005}.
 *
 * @param isAppend not read by this method
 */
public void psElectionTask(boolean isAppend) {
    for (String pageUrl : urlList(ELECTION_URL)) {
        execute(null, null, "fl s_lw", pageUrl)
                .forEach(link -> parseJson("1001005", "全国人大选举任免", null, link));
    }
}
/**
 * Crawls the NPC deputy meeting list.
 * <p>
 * Only the single page {@code NPC_DEPUTY_MEETING_URL} is read (no pagination lookup).
 * Links inside the first element with class {@code fl s_lw} are stored through
 * {@code parseJson2} under category code {@code 1001006}.
 *
 * @param isAppend not read by this method
 */
public void psNPCDeputyMeetingTask(boolean isAppend) {
    execute(null, null, "fl s_lw", NPC_DEPUTY_MEETING_URL)
            .forEach(link -> parseJson2("1001006", "人大代表会议列表", null, link));
}
/**
 * Crawls the NPC standing committee meeting list.
 * <p>
 * Only the single page {@code NPC_COMMITTEE_MEETING_URL} is read (no pagination lookup).
 * Links inside the first element with class {@code fl s_lw} are stored through
 * {@code parseJson2} under category code {@code 1001007}.
 *
 * @param isAppend not read by this method
 */
public void psNPCCommitteeMeetingTask(boolean isAppend) {
    execute(null, null, "fl s_lw", NPC_COMMITTEE_MEETING_URL)
            .forEach(link -> parseJson2("1001007", "人大常委会议列表", null, link));
}
/**
 * Crawls the NPC chairman meeting list.
 * <p>
 * Every page URL returned by {@code urlList(NPC_CHAIRMAN_MEETING_URL)} is scanned for
 * links inside the element with id {@code two-left} (this list is located by id rather
 * than by class); each link is stored through {@code parseJson} under category code
 * {@code 1001008}.
 *
 * @param isAppend not read by this method
 */
public void psNPCChairmanMeetingTask(boolean isAppend) {
    for (String pageUrl : urlList(NPC_CHAIRMAN_MEETING_URL)) {
        execute(null, "two-left", null, pageUrl)
                .forEach(link -> parseJson("1001008", "人大委员长会议", null, link));
    }
}
/**
 * Crawls the office work list.
 * <p>
 * Every page URL returned by {@code urlList(NPC_OFFIC_EWORK_URL)} is scanned for links
 * inside the first element with class {@code fl s_lw}; each link is stored through
 * {@code parseJson} under category code {@code 1001009}.
 *
 * @param isAppend not read by this method
 */
public void psNPCOfficeworkTask(boolean isAppend) {
    for (String pageUrl : urlList(NPC_OFFIC_EWORK_URL)) {
        execute(null, null, "fl s_lw", pageUrl)
                .forEach(link -> parseJson("1001009", "机关工作", null, link));
    }
}
/**
 * Crawls the local people's congress list.
 * <p>
 * Every page URL returned by {@code urlList(NPC_LOCAL_PCTASK_URL)} is scanned for links
 * inside the first element with class {@code fl s_lw}; each link is stored through
 * {@code parseJson} under category code {@code 1001010}.
 *
 * @param isAppend not read by this method
 */
public void psNPCLocalPCTask(boolean isAppend) {
    for (String pageUrl : urlList(NPC_LOCAL_PCTASK_URL)) {
        execute(null, null, "fl s_lw", pageUrl)
                .forEach(link -> parseJson("1001010", "地方人大", null, link));
    }
}
/**
 * Collects the URL of every page of a paginated list.
 * <p>
 * The page is loaded with HtmlUnit (JavaScript enabled) and the anchors inside elements
 * with class {@code pagination_index_num} are turned into page URLs: the anchor whose
 * text is {@code "1"} maps to {@code url} itself, every other anchor's {@code href} is
 * appended to the directory part of {@code url}.
 * <p>
 * If the page cannot be loaded the failure is logged and only the start URL is returned.
 *
 * @param url address of the first page of the list
 * @return the start URL followed by the discovered page URLs, in document order and
 *         without duplicates; never {@code null}
 */
public static List<String> urlList(String url) {
    List<String> list = new ArrayList<>();
    list.add(url);

    String pageXml;
    // try-with-resources closes the client only after the rendered markup has been read.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setActiveXNative(false);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());

        HtmlPage page = webClient.getPage(url);
        // Wait for background scripts while the client is still open, then snapshot the DOM.
        webClient.waitForBackgroundJavaScript(30000);
        pageXml = page.asXml();
    } catch (Exception e) {
        log.error("urlList(爬取URL列表报错)", e);
        // Without a page there is no pagination to read; fall back to the start URL only.
        return list;
    }

    Document document = Jsoup.parse(pageXml);
    Elements pageLinks = document.getElementsByClass("pagination_index_num").select("a");
    String baseDir = StringUtils.substringBeforeLast(url, "/");
    for (Element pageLink : pageLinks) {
        String pageUrl = "1".equals(pageLink.text())
                ? url
                : baseDir + "/" + pageLink.attr("href");
        // The start URL is already in the list, and the same page may be linked more than once.
        if (!list.contains(pageUrl)) {
            list.add(pageUrl);
        }
    }
    return list;
}
/**
 * Extracts the article links (title and URL) from a list page.
 * <p>
 * The container is the element with id {@code divId} when {@code divId} is not
 * {@code null}; otherwise it is the first element with class {@code divClass}. Every
 * anchor inside an {@code li} of any {@code ul} in that container yields one map with
 * the keys {@code "title"} (anchor text) and {@code "urlA"} (link target).
 * <p>
 * When an {@code li} holds exactly one anchor, an {@code href} that does not contain
 * {@code www.npc.gov.cn} is prefixed with {@code HOMEPAGE_NEWS_URL}; when it holds several
 * anchors the {@code href} values are used as they are.
 *
 * @param startTime not read by this method
 * @param divId     id of the container element, or {@code null} to locate it by class
 * @param divClass  class of the container element, used only when {@code divId} is {@code null}
 * @param url       address of the list page
 * @return the extracted links; empty when the page cannot be fetched or the container
 *         is missing, never {@code null}
 */
public static List<Map<String, String>> execute(String startTime, String divId, String divClass, String url) {
    List<Map<String, String>> list = new ArrayList<>();
    try {
        Document document = Jsoup.connect(url).get();

        Elements elementsUl;
        if (divId != null) {
            Element container = document.getElementById(divId);
            if (container == null) {
                // getElementById returns null for an unknown id; report it instead of failing with an NPE.
                log.warn("这个地址报错(execute:未找到id为{}的元素):{}", divId, url);
                return list;
            }
            elementsUl = container.getElementsByTag("ul");
        } else {
            Elements containers = document.getElementsByClass(divClass);
            if (containers.isEmpty()) {
                log.warn("这个地址报错(execute:IndexOutOfBoundsException):{}", url);
                return list;
            }
            elementsUl = containers.get(0).getElementsByTag("ul");
        }

        for (Element ul : elementsUl) {
            for (Element li : ul.getElementsByTag("li")) {
                Elements anchors = li.select("a");
                boolean singleAnchor = anchors.size() == 1;
                for (Element anchor : anchors) {
                    String href = anchor.attr("href");
                    String urlA;
                    if (singleAnchor && !href.contains("www.npc.gov.cn")) {
                        // Site-relative link of a single-anchor item: prefix the site root.
                        urlA = HOMEPAGE_NEWS_URL + href;
                    } else {
                        urlA = href;
                    }
                    Map<String, String> map = new HashMap<>();
                    map.put("title", anchor.text());
                    map.put("urlA", urlA);
                    list.add(map);
                }
            }
        }
    } catch (IOException e) {
        log.error("execute(爬取URL报错)", e);
    }
    return list;
}
/**
 * Downloads one article page, extracts its content and stores it through
 * {@code newsContentXiBiz}.
 * <p>
 * On success the record carries the page title (text before the last {@code "_"}),
 * source name, publish time, the HTML of the element with id {@code Zoom} followed by
 * the HTML of all elements with class {@code editor}, a sim-hash of that HTML, and
 * {@code spiderFlag = 1}, {@code genZzyq = 2}.
 * <p>
 * Failure handling:
 * <ul>
 *   <li>{@link ConnectException}: a stub record is stored with {@code spiderFlag = 0},
 *       {@code genZzyq = 0};</li>
 *   <li>{@link IndexOutOfBoundsException} (an expected page element is missing): a record
 *       with the page title is stored with {@code spiderFlag = 0}, {@code genZzyq = 1};</li>
 *   <li>any other exception: logged only, nothing is stored.</li>
 * </ul>
 * NOTE(review): the business meaning of the {@code genZzyq} values is not visible in this
 * file - confirm against the model definition.
 *
 * @param newsClazz     category code
 * @param newsClazzName category display name
 * @param startTime     only echoed in the completion log line
 * @param map           link data with the keys {@code "title"} and {@code "urlA"}
 */
private void parseJson(String newsClazz, String newsClazzName, String startTime, Map<String, String> map) {
    String title = map.get("title");
    String url = map.get("urlA");
    Document document = null;

    NewsContentXiCriteria content = new NewsContentXiCriteria();
    content.setId(String.valueOf(System.currentTimeMillis()));
    content.setNewsTitleSubtitle(title);
    content.setNewsUrl(String.valueOf(url));
    content.setNewsClazz(newsClazz);
    content.setNewsClazzName(newsClazzName);
    content.setCreateTime(LocalDateTime.now());
    content.setUpdateTime(LocalDateTime.now());
    content.setSiteCode(SITE_CODE);

    try {
        document = Jsoup.connect(url).get();

        // Source name and publish time come from the first "fontsize" element.
        Element origin = document.getElementsByClass("fontsize").get(0);
        Map<String, String> originNameAndPublishTime = getOriginNameAndPublishTime(origin, newsClazz);
        String originName = originNameAndPublishTime.get("originName");
        String publishTime = originNameAndPublishTime.get("publishTime");

        // Main article body. A page without a "Zoom" element ends up in the generic catch below.
        Element newsContent = document.getElementById("Zoom");

        // Images: relative sources are resolved against the directory of the article URL.
        String pageDir = StringUtils.substringBeforeLast(url, "/");
        for (Element img : newsContent.getElementsByTag("img")) {
            String src = img.attr("src");
            if (!isAbsoluteUrl(src)) {
                img.attr("src", pageDir + "/" + src);
            }
        }

        // Videos: relative sources are resolved against the site root.
        for (Element video : newsContent.getElementsByTag("video")) {
            String src = video.attr("src");
            if (!isAbsoluteUrl(src)) {
                video.attr("src", HOMEPAGE_NEWS_URL + "/" + src);
            }
        }

        // Append the editor / responsible-editor blocks after the article body.
        StringBuilder html = new StringBuilder(newsContent.toString());
        for (Element editor : document.getElementsByClass("editor")) {
            html.append(editor.toString());
        }
        String newsContents = html.toString();

        content.setNewsTitle(StringUtils.substringBeforeLast(document.getElementsByTag("title").text(), "_"));
        content.setPublishTime(publishTime);
        content.setSpiderFlag(1);
        content.setNewsContent(newsContents);
        content.setOriginName(originName);
        content.setGenZzyq(2);
        if (StrUtil.isNotEmpty(newsContents)) {
            content.setSimhash(simHash.hashFromHtml(newsContents));
        }
        newsContentXiBiz.add(BeanUtil.toBean(content, NewsContentXiCriteria.class));
        log.info("完成执行爬取, 分类:【{}】, 最新时间:【{}】", newsClazzName, startTime);
    } catch (ConnectException e) {
        content.setSpiderFlag(0);
        content.setGenZzyq(0);
        newsContentXiBiz.add(BeanUtil.toBean(content, NewsContentXiCriteria.class));
        log.error("这个地址报错(parseJson:ConnectException):是因为访问不到:{}", url);
        log.error("这个地址报错(parseJson:ConnectException):{}", content.getId(), e);
    } catch (IndexOutOfBoundsException e) {
        // The page was fetched but lacks an expected element; keep what is known about it.
        if (document != null) {
            content.setNewsTitle(StringUtils.substringBeforeLast(document.getElementsByTag("title").text(), "_"));
        }
        content.setSpiderFlag(0);
        content.setGenZzyq(1);
        newsContentXiBiz.add(BeanUtil.toBean(content, NewsContentXiCriteria.class));
        log.error("这个地址报错(parseJson:IndexOutOfBoundsException):{}", url);
        log.error("这个地址报错(parseJson:IndexOutOfBoundsException):{}", title);
        log.error("这个地址报错(parseJson:IndexOutOfBoundsException+contentId):{}", content.getId(), e);
    } catch (IllegalArgumentException e) {
        log.error("这个地址报错(parseJson:IllegalArgumentException):{}", url);
        log.error("这个地址报错(parseJson:IllegalArgumentException):{}", title, e);
    } catch (Exception e) {
        log.error("这个地址报错:(parseJson:Exception):{}", url);
        log.error("这个地址报错:(parseJson:Exception):{}", title, e);
    }
}

/** Tells whether a media source already carries a scheme or host and must not be prefixed. */
private static boolean isAbsoluteUrl(String src) {
    return src.startsWith("http://") || src.startsWith("https://") || src.startsWith("//");
}
/**
 * Stores a link record without downloading the page content.
 * <p>
 * Only the title, URL and category are stored, with {@code spiderFlag = 0}. The URL is
 * probed once over HTTP: {@code genZzyq} is set to {@code 1} when the response code is
 * 200 and to {@code 0} when the probe throws; for any other response code it is left at
 * the model's default. The record is stored in every case.
 * <p>
 * NOTE(review): the business meaning of the {@code genZzyq} values is not visible in this
 * file - confirm against the model definition.
 *
 * @param newsClazz     category code
 * @param newsClazzName category display name
 * @param startTime     not read by this method
 * @param map           link data with the keys {@code "title"} and {@code "urlA"}
 */
private void parseJson2(String newsClazz, String newsClazzName, String startTime, Map<String, String> map) {
    String title = map.get("title");
    String url = map.get("urlA");

    NewsContentXiCriteria content = new NewsContentXiCriteria();
    content.setId(String.valueOf(System.currentTimeMillis()));
    content.setNewsTitleSubtitle(title);
    content.setNewsUrl(String.valueOf(url));
    content.setNewsClazz(newsClazz);
    content.setNewsClazzName(newsClazzName);
    content.setCreateTime(LocalDateTime.now());
    content.setUpdateTime(LocalDateTime.now());
    content.setSiteCode(SITE_CODE);
    content.setSpiderFlag(0);

    HttpURLConnection connection = null;
    try {
        connection = (HttpURLConnection) new URL(url).openConnection();
        connection.setUseCaches(false);
        // Reachability probe: only the status code matters, the body is never read.
        if (200 == connection.getResponseCode()) {
            content.setGenZzyq(1);
        }
    } catch (Exception e) {
        content.setGenZzyq(0);
        log.error("这个地址报错(parseJson2:Exception):是因为访问不到:{}", url);
        log.error("这个地址报错(parseJson2:Exception):是因为访问不到:{}", title);
        log.error("这个地址报错(parseJson2:Exception):{}", content.getId(), e);
    } finally {
        // Release the connection; the original never disconnected it.
        if (connection != null) {
            connection.disconnect();
        }
    }
    newsContentXiBiz.add(BeanUtil.toBean(content, NewsContentXiCriteria.class));
}
/**
 * Derives the source name and publish time from an article's info bar.
 * <p>
 * The bar's text is split on single spaces. If the first {@code span} with class
 * {@code fr} contains the text "浏览", token 5 is the source name and tokens 6 and 7,
 * joined by a space with the first character dropped, form the publish time. Otherwise
 * token 1 is the source name and the span's own text is the publish time. For category
 * {@code 1001005} the source name is always the fixed value "中国人大网" and the source
 * tokens are not read.
 *
 * @param origin    the info-bar element (the caller passes the page's first element
 *                  with class {@code fontsize})
 * @param newsClazz category code of the article
 * @return a map with the keys {@code "originName"} and {@code "publishTime"}
 * @throws IndexOutOfBoundsException if the span or one of the required tokens is missing
 */
private Map<String, String> getOriginNameAndPublishTime(Element origin, String newsClazz) {
    final boolean fixedOrigin = newsClazz.equals("1001005");

    final String barText = origin.text();
    final Element rightSpan = origin.select(".fontsize span[class='fr']").get(0);
    final boolean hasViewCounter = rightSpan.text().contains("浏览");
    final List<String> tokens = new ArrayList<>(Arrays.asList(barText.split(" ")));

    final String originName;
    final String publishTime;
    if (hasViewCounter) {
        // Source first (when it is read from the bar), then the two time tokens.
        originName = fixedOrigin ? "中国人大网" : tokens.get(5);
        publishTime = (tokens.get(6) + " " + tokens.get(7)).substring(1);
    } else {
        originName = fixedOrigin ? "中国人大网" : tokens.get(1);
        publishTime = rightSpan.text();
    }

    final Map<String, String> result = new HashMap<>();
    result.put("originName", originName);
    result.put("publishTime", publishTime);
    return result;
}
}
网友评论