美文网首页
Jsoup爬取凤凰网2018-09-22日部分新闻[笔记] --

Jsoup爬取凤凰网2018-09-22日部分新闻[笔记] --

作者: lhsjohn | 来源:发表于2019-03-03 23:33 被阅读0次

    这个方法的缺点是速度慢,优点是暂时不用考虑数据共享的问题。不过单线程爬取仍有很大的局限,
    这里暂时先写一个单线程版本,下次在此基础上优化出多线程版本。

    
    
    package com.lhsjohn.spider;
    
    import java.io.IOException;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class JsoupTest2 {
    
        /**
         * Crawls article pages one at a time on a single thread.
         *
         * <p>Walks the numeric suffixes 118 through 999, prints each one as a
         * zero-padded three-digit string, and passes it to
         * {@link #crawNews(String)}. A page that fails with an
         * {@link IOException} is reported on standard error and skipped so the
         * remaining pages are still visited.
         *
         * @param args command-line arguments; not used
         * @throws Exception if {@code crawNews} fails with anything other than
         *         an {@link IOException}
         */
        public static void main(String[] args) throws Exception {
            for (int index = 118; index <= 999; index++) {
                // Locale.ROOT keeps the digits ASCII whatever the default locale is.
                String suffix = String.format(java.util.Locale.ROOT, "%03d", index);
                System.out.println(suffix);
                try {
                    crawNews(suffix);
                } catch (IOException e) {
                    // One unreachable page must not abort the rest of the crawl.
                    System.err.println("skipped " + suffix + ": " + e);
                }
            }
        }
    
        /**
         * Fetches one article page and prints its headline data to standard output.
         *
         * <p>When the page has a {@code .yc_main.wrap} container, the direct
         * children of its first {@code .yc_tit} block are scanned: the text of
         * each {@code h1} is printed as the title, and for each {@code p} the
         * first child is printed as the time and the second child as the source
         * name plus its {@code href}. When the container is absent, the text of
         * the first {@code #artical #artical_topic} match is printed instead, if
         * there is one. Nothing is printed when neither layout matches.
         *
         * @param index suffix inserted into the article URL between
         *        {@code 60080} and {@code _0.shtml}
         * @throws Exception if the page cannot be fetched
         */
        public static void crawNews(String index) throws Exception {
            // Bounded timeout (jsoup takes milliseconds) so that a stalled
            // connection cannot hang the single-threaded crawl for hours.
            final int timeoutMillis = 30000;
            String url = "http://news.ifeng.com/a/20180922/60080" + index + "_0.shtml";
            org.jsoup.nodes.Document document = Jsoup.connect(url).timeout(timeoutMillis).get();

            Elements containers = document.select(".yc_main.wrap");
            if (containers.isEmpty()) {
                // Alternative page layout: only a topic line is available.
                Elements topics = document.select("#artical #artical_topic");
                if (!topics.isEmpty()) {
                    System.out.println(topics.get(0).text());
                }
                return;
            }

            Elements titleBlocks = containers.get(0).select(".yc_tit");
            if (titleBlocks.isEmpty()) {
                return;
            }

            for (Element child : titleBlocks.get(0).children()) {
                String tag = child.tagName();
                if (tag.equals("h1")) {
                    System.out.println("标题:" + child.text());
                } else if (tag.equals("p")) {
                    Elements parts = child.children();
                    // Guard each index: a <p> with fewer children than expected
                    // would otherwise throw IndexOutOfBoundsException.
                    if (!parts.isEmpty()) {
                        System.out.println("时间:" + parts.get(0).text());
                    }
                    if (parts.size() > 1) {
                        Element source = parts.get(1);
                        System.out.println("来源:" + source.text());
                        System.out.println("来源链接:" + source.attr("href"));
                    }
                }
            }
        }
    
    
    
    
    
    
    
    
    

    相关文章

      网友评论

          本文标题:Jsoup爬取凤凰网2018-09-22日部分新闻[笔记] --

          本文链接:https://www.haomeiwen.com/subject/zduvuqtx.html