美文网首页
笔记:Android用jsoup抓取网页HTML解析数据

笔记:Android用jsoup抓取网页HTML解析数据

作者: RoJacKing | 来源:发表于2018-04-28 11:17 被阅读2306次

    ( jsoup入门 ) 做个笔记,方便以后忘了的时候翻看。这里只为测试而测试,只考虑实现入门学习,其他问题暂不考虑。另外这里只用了select抓取(因为用得顺手,当然也可以用getElementXxxx()系列方法)。由于时间有限,这个网页没有抓完,只抓取了一部分,以后有时间再补充。

    直接复制粘贴到工程中就可以看效果了

    哦对了,最近有看过Charles一些文章,Charles是一个Mac和Windows平台都可以使用的抓包工具,有空仔细研究研究

    jsoup官方文档
    https://jsoup.org/cookbook/
    中文文档
    http://www.open-open.com/jsoup/
    下载jar包地址
    http://jsoup.org/download
    抓取地址
    这里抓取的是泡在网上的日子
    http://www.jcodecraeer.com

    准备工作

    1、在Android Studio的app模块下的build.gradle中
    添加依赖:compile 'org.jsoup:jsoup:1.11.3'(较新版本的Gradle插件请改用implementation),然后点击Sync Now同步工程
    现在官网的最新版本就是1.11.3

    查看网页
    右键(泡在网上的日子)--检查

    logo 导航栏1

    1524884602(1).jpg

    导航栏1_1


    1524884691(1).jpg

    排行榜


    1524884778(1).jpg

    内容1


    1524884829(1).jpg

    banner


    1524884931(1).jpg

    MainActivity.java

    public class MainActivity extends AppCompatActivity {
    
        private JsoupBean jsoupBean;
        @Override
        protected void onCreate(Bundle savedInstanceState) {
            super.onCreate(savedInstanceState);
            setContentView(R.layout.activity_main);
    
            //抓取后的数据放到这个bean中
            jsoupBean = new JsoupBean();
    
            //这里需要放在子线程中完成,否则报这个错android.os.NetworkOnMainThreadException
            new Thread(new Runnable() {
                @Override
                public void run() {
                    jsoupData();
                }
            }).start();
    
            findViewById(R.id.test).setOnClickListener(new View.OnClickListener() {
                @Override
                public void onClick(View view) {
                    //抓完后打印一下logo,看看有没有把数据存到bean中
                    Log.e("wwww",jsoupBean.toString();
                    
                }
            });
        }
    
        private void jsoupData() {
            //抓取的目标网址
            String url = "http://www.jcodecraeer.com";
    
            try {//捕捉异常
    
                Document document = Jsoup.connect(url).get();//这里可用get也可以post方式,具体区别请自行了解
    
                //=======================logo    这个是泡在网上的日子的logo================
                //jsoupBean.setLogoImg();将数据放到bean的集合中list,其他也雷同,下面不做解释了
                //document.select("a.logo-t>img").attr("src")查找a标签class="logo-t"下的子标签img的属性src的值
                //document.select("a.logo-t").text()查找a标签class="logo-t"包含的内容
    
                jsoupBean.setLogoImg(url + document.select("a.logo-t>img").attr("src"));//select的api的详细用法请查看官方文档,这里也做简单说明
                jsoupBean.setLogoUrl(url);
                jsoupBean.setLogoName(document.select("a.logo-t").text());
    
    
                //======================导航栏1================
    
                Elements nv1_elements_list = document.select("ul.nav-ul>li");//查找class="nav-ul"的ul下的所有li,这里得到的是一个Elements数据
                List<String> Nv1_NameList = new ArrayList<>();
                List<String> Nv1_UrlList = new ArrayList<>();
    
                for (Element element : nv1_elements_list) {//遍历数组
                    //                Log.e("wwww",element.select("a").attr("href"));
                    //                Log.e("wwww",element.select("a").text());
    
                    Nv1_NameList.add(element.select("a").text());//查找element下的a标签的内容
                    if (element.select("a").attr("href").equals("/")) {//查找element下的a标签的href属性值
                        Nv1_UrlList.add(url);//由于 泡在网上的日子返回的是“/”,这里判断了一下再添加到Nv1_UrlList中
                    } else {
                        Nv1_UrlList.add(url + element.select("a").attr("href"));
                    }
                }
                jsoupBean.setNv1_NameList(Nv1_NameList);
                jsoupBean.setNv1_UrlList(Nv1_UrlList);
                //============导航栏1-1===================
                //            Log.e("wwww",document.select("a.lg_app").attr("href"));
                //            Log.e("wwww",document.select("a.lg_app").text());
    
                List<String> Nv1_1_NameList = new ArrayList<>();
                List<String> Nv1_1_UrlList = new ArrayList<>();
    
                Nv1_1_NameList.add(0, url + document.select("a.lg_app").attr("href"));//a.lg_app  :标签.class的值      attr("href")    :属性href的值
                Nv1_1_UrlList.add(0, document.select("a.lg_app").text());//text()    :标签内的值
    
                //            Log.e("wwww",document.select("div.search_cont>form").attr("action"));
                //            Log.e("wwww",document.select("input.in_search").attr("value"));
    
                Nv1_1_NameList.add(1, document.select("div.search_cont>form").attr("action"));//>form  :直接子标签form
                Nv1_1_UrlList.add(1, document.select("input.in_search").attr("value"));
    
                //first()   get(1)    :由于查出来的是一个Elements(数组),所以这两个表示数组的0 1下标对应的值
                //            Log.e("wwww",document.select("div#login_info>a").get(1).toString());
    
                Elements nv_1_1_elements = document.select("div#login_info>a");//标签#id
                for (Element element : nv_1_1_elements) {//循环遍历数组nv_1_1_elements
                    //                Log.e("wwww",element.attr("href"));
                    //                Log.e("wwww",element.text());
    
                    Nv1_1_NameList.add(url + element.attr("href"));
                    Nv1_1_UrlList.add(element.text());
                }
                jsoupBean.setNv1_1_NameList(Nv1_1_NameList);
                jsoupBean.setNv1_1_UrlList(Nv1_1_UrlList);
    
                //============导航栏2===========================
                Elements nv_2_elements = document.select("ul#nav>li");
                List<String> Nv2_NameList = new ArrayList<>();
                List<String> Nv2_UrlList = new ArrayList<>();
    
                for (Element element : nv_2_elements) {//循环遍历数组nv_2_elements
                    //                Log.e("wwww",element.select("a").attr("href"));
                    //                Log.e("wwww",element.select("a").text());
    
                    Nv2_NameList.add(element.select("a").text());
                    Nv2_UrlList.add(url + element.select("a").attr("href"));
                }
                jsoupBean.setNv2_NameList(Nv2_NameList);
                jsoupBean.setNv2_UrlList(Nv2_UrlList);
                //=================advertigical广告     抓取不到数据,先不理,明天有空问一下公司的网页前段大神为什么,我猜应该是js注入========================
                Elements advert_elements = document.select("div.col-md-6");
                for (Element element : advert_elements) {//循环遍历数组advert_elements
    
                    //                Log.e("wwww",element.select("a").attr("href"));
                    //                Log.e("wwww",element.select("a").toString());
                    //                Log.e("wwww",element.select("a>img[src$=.png]").toString());
    
                }
                //===============banner============================
    
                Elements banner_elements = document.select("div.item");
    
                List<String> banner_ContentList = new ArrayList<>();
                List<String> banner_UrlList = new ArrayList<>();
                List<String> banner_ImgList = new ArrayList<>();
    
                for (Element element : banner_elements) {//循环遍历数组banner_elements
                    //                Log.e("wwww", element.select("a").attr("href"));
                    //                Log.e("wwww", element.select("h3").text());
                    //                Log.e("wwww", element.select("a>img").attr("src"));
    
                    banner_ContentList.add(element.select("h3").text());
                    banner_UrlList.add(element.select("a").attr("href"));
                    banner_ImgList.add(url + element.select("a>img").attr("src"));
                }
                jsoupBean.setBanner_ContentList(banner_ContentList);
                jsoupBean.setBanner_UrlList(banner_UrlList);
                jsoupBean.setBanner_ImgList(banner_ImgList);
                //====================content_1========================
                Elements content1_elements = document.select("ul.arclist>li");
                List<String> content1_UrlList = new ArrayList<>();
                List<String> content1_ContentList = new ArrayList<>();
    
                for (Element element : content1_elements) {//循环遍历数组
                    //                Log.e("wwww", element.select("a").attr("href"));
                    //                Log.e("wwww", element.select("a").text());
    
                    content1_UrlList.add(element.select("a").attr("href"));
                    content1_ContentList.add(element.select("a").text());
                }
                jsoupBean.setContent1_ContentList(content1_ContentList);
                jsoupBean.setContent1_UrlList(content1_UrlList);
                //================Rank  排行榜=======================
                Elements rank_elements = document.select("ul.nav>li");
                List<String> rank_UrlList = new ArrayList<>();
                List<String> rank_ContentList = new ArrayList<>();
    
                for (Element element : rank_elements) {//循环遍历数组
                    Log.e("wwww", element.select("a").attr("href"));
                    Log.e("wwww", element.select("a").text());
    
                    rank_UrlList.add(element.select("a").attr("href"));
                    rank_ContentList.add(element.select("a").text());
                }
                jsoupBean.setRank_ContentList(rank_ContentList);
                jsoupBean.setRank_UrlList(rank_UrlList);
    
            } catch (Exception e) {
                Log.e("wwwwwwwww==", e.toString());
            }
        }
    }
    

    JsoupBean.java 数据bean

    /**
     * Plain mutable holder for the values scraped from the page.
     *
     * <p>Every field starts out as {@code null} and is only changed through its setter.
     * {@link #toString()} prints all fields in declaration order.
     */
    public class JsoupBean {

        // ---- logo ----
        private String logoUrl;
        private String logoName;
        private String logoImg;

        // ---- top-left navigation bar ----
        private List<String> nv1_NameList;
        private List<String> nv1_UrlList;

        // ---- top-right navigation bar ----
        private List<String> nv1_1_NameList;
        private List<String> nv1_1_UrlList;

        // ---- navigation bar 2 ----
        private List<String> nv2_NameList;
        private List<String> nv2_UrlList;

        // ---- adverts ----
        private List<String> advert_Img_List;
        private List<String> advert_Url_List;

        // ---- banner ----
        private List<String> banner_ContentList;
        private List<String> banner_UrlList;
        private List<String> banner_ImgList;

        // ---- content 1 (next to the banner) ----
        private List<String> content1_UrlList;
        private List<String> content1_ContentList;

        // ---- ranking titles ----
        private List<String> rank_UrlList;
        private List<String> rank_ContentList;

        // ======================= logo =======================

        public String getLogoUrl() {
            return this.logoUrl;
        }

        public void setLogoUrl(String logoUrl) {
            this.logoUrl = logoUrl;
        }

        public String getLogoName() {
            return this.logoName;
        }

        public void setLogoName(String logoName) {
            this.logoName = logoName;
        }

        public String getLogoImg() {
            return this.logoImg;
        }

        public void setLogoImg(String logoImg) {
            this.logoImg = logoImg;
        }

        // ======================= navigation bar 1 =======================

        public List<String> getNv1_NameList() {
            return this.nv1_NameList;
        }

        public void setNv1_NameList(List<String> nv1_NameList) {
            this.nv1_NameList = nv1_NameList;
        }

        public List<String> getNv1_UrlList() {
            return this.nv1_UrlList;
        }

        public void setNv1_UrlList(List<String> nv1_UrlList) {
            this.nv1_UrlList = nv1_UrlList;
        }

        // ======================= navigation bar 1-1 =======================

        public List<String> getNv1_1_NameList() {
            return this.nv1_1_NameList;
        }

        public void setNv1_1_NameList(List<String> nv1_1_NameList) {
            this.nv1_1_NameList = nv1_1_NameList;
        }

        public List<String> getNv1_1_UrlList() {
            return this.nv1_1_UrlList;
        }

        public void setNv1_1_UrlList(List<String> nv1_1_UrlList) {
            this.nv1_1_UrlList = nv1_1_UrlList;
        }

        // ======================= navigation bar 2 =======================

        public List<String> getNv2_NameList() {
            return this.nv2_NameList;
        }

        public void setNv2_NameList(List<String> nv2_NameList) {
            this.nv2_NameList = nv2_NameList;
        }

        public List<String> getNv2_UrlList() {
            return this.nv2_UrlList;
        }

        public void setNv2_UrlList(List<String> nv2_UrlList) {
            this.nv2_UrlList = nv2_UrlList;
        }

        // ======================= adverts =======================

        public List<String> getAdvert_Img_List() {
            return this.advert_Img_List;
        }

        public void setAdvert_Img_List(List<String> advert_Img_List) {
            this.advert_Img_List = advert_Img_List;
        }

        /** Alias of {@link #getAdvert_Img_List()}: reads the same {@code advert_Img_List} field. */
        public List<String> getAdvert_Name_List() {
            return getAdvert_Img_List();
        }

        /** Alias of {@link #setAdvert_Img_List(List)}: writes the same {@code advert_Img_List} field. */
        public void setAdvert_Name_List(List<String> advert_Img_List) {
            setAdvert_Img_List(advert_Img_List);
        }

        public List<String> getAdvert_Url_List() {
            return this.advert_Url_List;
        }

        public void setAdvert_Url_List(List<String> advert_Url_List) {
            this.advert_Url_List = advert_Url_List;
        }

        // ======================= banner =======================

        public List<String> getBanner_ContentList() {
            return this.banner_ContentList;
        }

        public void setBanner_ContentList(List<String> banner_ContentList) {
            this.banner_ContentList = banner_ContentList;
        }

        public List<String> getBanner_UrlList() {
            return this.banner_UrlList;
        }

        public void setBanner_UrlList(List<String> banner_UrlList) {
            this.banner_UrlList = banner_UrlList;
        }

        public List<String> getBanner_ImgList() {
            return this.banner_ImgList;
        }

        public void setBanner_ImgList(List<String> banner_ImgList) {
            this.banner_ImgList = banner_ImgList;
        }

        // ======================= content 1 =======================

        public List<String> getContent1_UrlList() {
            return this.content1_UrlList;
        }

        public void setContent1_UrlList(List<String> content1_UrlList) {
            this.content1_UrlList = content1_UrlList;
        }

        public List<String> getContent1_ContentList() {
            return this.content1_ContentList;
        }

        public void setContent1_ContentList(List<String> content1_ContentList) {
            this.content1_ContentList = content1_ContentList;
        }

        // ======================= ranking =======================

        public List<String> getRank_UrlList() {
            return this.rank_UrlList;
        }

        public void setRank_UrlList(List<String> rank_UrlList) {
            this.rank_UrlList = rank_UrlList;
        }

        public List<String> getRank_ContentList() {
            return this.rank_ContentList;
        }

        public void setRank_ContentList(List<String> rank_ContentList) {
            this.rank_ContentList = rank_ContentList;
        }

        /**
         * Renders all fields as {@code JsoupBean{name=value, ...}}; the three logo strings are
         * wrapped in single quotes, the lists use their own string form.
         */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("JsoupBean{");
            text.append("logoUrl='").append(logoUrl).append('\'');
            text.append(", logoName='").append(logoName).append('\'');
            text.append(", logoImg='").append(logoImg).append('\'');
            text.append(", nv1_NameList=").append(nv1_NameList);
            text.append(", nv1_UrlList=").append(nv1_UrlList);
            text.append(", nv1_1_NameList=").append(nv1_1_NameList);
            text.append(", nv1_1_UrlList=").append(nv1_1_UrlList);
            text.append(", nv2_NameList=").append(nv2_NameList);
            text.append(", nv2_UrlList=").append(nv2_UrlList);
            text.append(", advert_Img_List=").append(advert_Img_List);
            text.append(", advert_Url_List=").append(advert_Url_List);
            text.append(", banner_ContentList=").append(banner_ContentList);
            text.append(", banner_UrlList=").append(banner_UrlList);
            text.append(", banner_ImgList=").append(banner_ImgList);
            text.append(", content1_UrlList=").append(content1_UrlList);
            text.append(", content1_ContentList=").append(content1_ContentList);
            text.append(", rank_UrlList=").append(rank_UrlList);
            text.append(", rank_ContentList=").append(rank_ContentList);
            text.append('}');
            return text.toString();
        }
    }
    
    XML界面就不上了,就一个button点击按钮,点击打印log

    本文章仅供学习之用,禁止任何商业用途,若有所需或转载请与作者联系

    相关文章

      网友评论

          本文标题:笔记:Android用jsoup抓取网页HTML解析数据

          本文链接:https://www.haomeiwen.com/subject/fdakgxtx.html