Jsoup爬取凤凰网2018-09-22日部分新闻[笔记] --

作者: lhsjohn | 来源:发表于2019-03-03 23:33 被阅读0次

Jsoup爬取凤凰网2018-09-22日部分新闻[笔记] --
Java爬虫实战—利用xpath表达式抓取页面信息
jsoup+okhttp实现网页搜索表单的爬取
Java简单的爬虫实践
使用Java写一个简单爬虫爬取单页面
python爬虫
爬虫实践－基于Jsoup爬取Facebook群组成员信息
Android笔记之JSoup爬取豆瓣同城
Java实现的简单小爬虫
如何用JAVA爬取AJAX加载后的页面(利用phantomjs)

这个方法的缺点是速度慢，优点是暂时不用考虑数据共享的问题。不过单线程爬取还是有很大局限的，
这里先写一个单线程版本，下次在此基础上优化出多线程版本。



package com.lhsjohn.spider;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest2 {

    /**
     * Crawls the candidate article pages whose three-digit URL suffix runs from 118 to 999.
     *
     * <p>Each suffix is printed, then handed to {@code crawNews}. The suffixes are simply
     * enumerated, so some of them may not correspond to an existing page; a page that fails
     * to download is reported on standard error and skipped instead of ending the run.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if {@code crawNews} fails with anything other than an I/O error
     */
    public static void main(String[] args) throws Exception {
        for (int index = 118; index <= 999; index++) {
            // Zero-pad to three digits; Locale.ROOT keeps the digits ASCII on every platform.
            String paddedIndex = String.format(java.util.Locale.ROOT, "%03d", index);
            System.out.println(paddedIndex);
            try {
                crawNews(paddedIndex);
            } catch (IOException e) {
                // Fetch failures (HTTP error status, timeout, ...) are expected for guessed
                // URLs, so log the failure and move on to the next candidate.
                System.err.println("跳过 " + paddedIndex + ": " + e);
            }
        }
    }

    /**
     * Downloads one article page and prints the metadata found in it to standard output.
     *
     * <p>Two page layouts are recognised:
     * <ul>
     *   <li>Pages containing a {@code .yc_main.wrap} element: inside its first {@code .yc_tit}
     *       descendant, every {@code h1} child is printed as the title, and for every {@code p}
     *       child the first child element is printed as the time and the second as the source
     *       (text plus its {@code href} attribute).</li>
     *   <li>Any other page: the text of the first {@code #artical #artical_topic} match is
     *       printed, if there is one; otherwise nothing is printed.</li>
     * </ul>
     *
     * @param index text inserted into the article URL between {@code 60080} and
     *     {@code _0.shtml}
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void crawNews(String index) throws Exception {
        String url = "http://news.ifeng.com/a/20180922/60080" + index + "_0.shtml";
        // 30 s is ample for a single page; a multi-hour timeout would let one hung
        // connection stall the whole crawl.
        org.jsoup.nodes.Document document = Jsoup.connect(url).timeout(30000).get();

        Elements containers = document.select(".yc_main.wrap");
        if (containers.isEmpty()) {
            // Fallback layout: only the topic line is available.
            Elements topics = document.select("#artical #artical_topic");
            if (!topics.isEmpty()) {
                System.out.println(topics.get(0).text());
            }
            return;
        }

        Elements titleBlocks = containers.get(0).select(".yc_tit");
        if (titleBlocks.isEmpty()) {
            return;
        }

        for (Element child : titleBlocks.get(0).children()) {
            String tag = child.tagName();
            if ("h1".equals(tag)) {
                System.out.println("标题:" + child.text());
            } else if ("p".equals(tag)) {
                Elements parts = child.children();
                // A <p> may carry fewer than two child elements; print only what exists
                // rather than failing with IndexOutOfBoundsException.
                if (!parts.isEmpty()) {
                    System.out.println("时间:" + parts.get(0).text());
                }
                if (parts.size() > 1) {
                    Element source = parts.get(1);
                    System.out.println("来源:" + source.text());
                    System.out.println("来源链接:" + source.attr("href"));
                }
            }
        }
    }