(jsoup入门)做个笔记,方便以后忘了可以翻看。这里只为测试而测试,其他问题不考虑,只考虑实现入门学习。另外,这里只用了select抓取(因为用得顺手,也可以用getElementXxx()系列方法)。由于时间有限,这个网页没有抓完,只抓取了一部分,以后有时间再补充。
直接复制粘贴到工程中就可以看效果了
哦对了,最近有看过Charles一些文章,Charles是一个Mac和Windows平台都可以使用的抓包工具,有空仔细研究研究
jsoup官方文档:
https://jsoup.org/cookbook/
中文文档:
http://www.open-open.com/jsoup/
下载jar包地址
http://jsoup.org/download
抓取地址
这里抓取的是泡在网上的日子
http://www.jcodecraeer.com
准备工作
1、在Android Studio的app模块下的build.gradle中
添加这行依赖:compile 'org.jsoup:jsoup:1.11.3'
然后Sync now同步代码
现在官网的最新版本就是1.11.3
查看网页
右键(泡在网上的日子)--检查
logo 导航栏1
1524884602(1).jpg
导航栏1_1
1524884691(1).jpg
排行榜
1524884778(1).jpg
内容1
1524884829(1).jpg
banner
1524884931(1).jpg
MainActivity.java
public class MainActivity extends AppCompatActivity {
private JsoupBean jsoupBean;
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
//抓取后的数据放到这个bean中
jsoupBean = new JsoupBean();
//这里需要放在子线程中完成,否则报这个错android.os.NetworkOnMainThreadException
new Thread(new Runnable() {
@Override
public void run() {
jsoupData();
}
}).start();
findViewById(R.id.test).setOnClickListener(new View.OnClickListener() {
@Override
public void onClick(View view) {
//抓完后打印一下logo,看看有没有把数据存到bean中
Log.e("wwww",jsoupBean.toString();
}
});
}
private void jsoupData() {
//抓取的目标网址
String url = "http://www.jcodecraeer.com";
try {//捕捉异常
Document document = Jsoup.connect(url).get();//这里可用get也可以post方式,具体区别请自行了解
//=======================logo 这个是泡在网上的日子的logo================
//jsoupBean.setLogoImg();将数据放到bean的集合中list,其他也雷同,下面不做解释了
//document.select("a.logo-t>img").attr("src")查找a标签class="logo-t"下的子标签img的属性src的值
//document.select("a.logo-t").text()查找a标签class="logo-t"包含的内容
jsoupBean.setLogoImg(url + document.select("a.logo-t>img").attr("src"));//select的api的详细用法请查看官方文档,这里也做简单说明
jsoupBean.setLogoUrl(url);
jsoupBean.setLogoName(document.select("a.logo-t").text());
//======================导航栏1================
Elements nv1_elements_list = document.select("ul.nav-ul>li");//查找class="nav-ul"的ul下的所有li,这里得到的是一个Elements数据
List<String> Nv1_NameList = new ArrayList<>();
List<String> Nv1_UrlList = new ArrayList<>();
for (Element element : nv1_elements_list) {//遍历数组
// Log.e("wwww",element.select("a").attr("href"));
// Log.e("wwww",element.select("a").text());
Nv1_NameList.add(element.select("a").text());//查找element下的a标签的内容
if (element.select("a").attr("href").equals("/")) {//查找element下的a标签的href属性值
Nv1_UrlList.add(url);//由于 泡在网上的日子返回的是“/”,这里判断了一下再添加到Nv1_UrlList中
} else {
Nv1_UrlList.add(url + element.select("a").attr("href"));
}
}
jsoupBean.setNv1_NameList(Nv1_NameList);
jsoupBean.setNv1_UrlList(Nv1_UrlList);
//============导航栏1-1===================
// Log.e("wwww",document.select("a.lg_app").attr("href"));
// Log.e("wwww",document.select("a.lg_app").text());
List<String> Nv1_1_NameList = new ArrayList<>();
List<String> Nv1_1_UrlList = new ArrayList<>();
Nv1_1_NameList.add(0, url + document.select("a.lg_app").attr("href"));//a.lg_app :标签.class的值 attr("href") :属性href的值
Nv1_1_UrlList.add(0, document.select("a.lg_app").text());//text() :标签内的值
// Log.e("wwww",document.select("div.search_cont>form").attr("action"));
// Log.e("wwww",document.select("input.in_search").attr("value"));
Nv1_1_NameList.add(1, document.select("div.search_cont>form").attr("action"));//>form :直接子标签form
Nv1_1_UrlList.add(1, document.select("input.in_search").attr("value"));
//first() get(1) :由于查出来的是一个Elements(数组),所以这两个表示数组的0 1下标对应的值
// Log.e("wwww",document.select("div#login_info>a").get(1).toString());
Elements nv_1_1_elements = document.select("div#login_info>a");//标签#id
for (Element element : nv_1_1_elements) {//循环遍历数组nv_1_1_elements
// Log.e("wwww",element.attr("href"));
// Log.e("wwww",element.text());
Nv1_1_NameList.add(url + element.attr("href"));
Nv1_1_UrlList.add(element.text());
}
jsoupBean.setNv1_1_NameList(Nv1_1_NameList);
jsoupBean.setNv1_1_UrlList(Nv1_1_UrlList);
//============导航栏2===========================
Elements nv_2_elements = document.select("ul#nav>li");
List<String> Nv2_NameList = new ArrayList<>();
List<String> Nv2_UrlList = new ArrayList<>();
for (Element element : nv_2_elements) {//循环遍历数组nv_2_elements
// Log.e("wwww",element.select("a").attr("href"));
// Log.e("wwww",element.select("a").text());
Nv2_NameList.add(element.select("a").text());
Nv2_UrlList.add(url + element.select("a").attr("href"));
}
jsoupBean.setNv2_NameList(Nv2_NameList);
jsoupBean.setNv2_UrlList(Nv2_UrlList);
//=================advertigical广告 抓取不到数据,先不理,明天有空问一下公司的网页前段大神为什么,我猜应该是js注入========================
Elements advert_elements = document.select("div.col-md-6");
for (Element element : advert_elements) {//循环遍历数组advert_elements
// Log.e("wwww",element.select("a").attr("href"));
// Log.e("wwww",element.select("a").toString());
// Log.e("wwww",element.select("a>img[src$=.png]").toString());
}
//===============banner============================
Elements banner_elements = document.select("div.item");
List<String> banner_ContentList = new ArrayList<>();
List<String> banner_UrlList = new ArrayList<>();
List<String> banner_ImgList = new ArrayList<>();
for (Element element : banner_elements) {//循环遍历数组banner_elements
// Log.e("wwww", element.select("a").attr("href"));
// Log.e("wwww", element.select("h3").text());
// Log.e("wwww", element.select("a>img").attr("src"));
banner_ContentList.add(element.select("h3").text());
banner_UrlList.add(element.select("a").attr("href"));
banner_ImgList.add(url + element.select("a>img").attr("src"));
}
jsoupBean.setBanner_ContentList(banner_ContentList);
jsoupBean.setBanner_UrlList(banner_UrlList);
jsoupBean.setBanner_ImgList(banner_ImgList);
//====================content_1========================
Elements content1_elements = document.select("ul.arclist>li");
List<String> content1_UrlList = new ArrayList<>();
List<String> content1_ContentList = new ArrayList<>();
for (Element element : content1_elements) {//循环遍历数组
// Log.e("wwww", element.select("a").attr("href"));
// Log.e("wwww", element.select("a").text());
content1_UrlList.add(element.select("a").attr("href"));
content1_ContentList.add(element.select("a").text());
}
jsoupBean.setContent1_ContentList(content1_ContentList);
jsoupBean.setContent1_UrlList(content1_UrlList);
//================Rank 排行榜=======================
Elements rank_elements = document.select("ul.nav>li");
List<String> rank_UrlList = new ArrayList<>();
List<String> rank_ContentList = new ArrayList<>();
for (Element element : rank_elements) {//循环遍历数组
Log.e("wwww", element.select("a").attr("href"));
Log.e("wwww", element.select("a").text());
rank_UrlList.add(element.select("a").attr("href"));
rank_ContentList.add(element.select("a").text());
}
jsoupBean.setRank_ContentList(rank_ContentList);
jsoupBean.setRank_UrlList(rank_UrlList);
} catch (Exception e) {
Log.e("wwwwwwwww==", e.toString());
}
}
}
JsoupBean.java 数据bean
/**
 * Plain mutable data holder for the values scraped from the web page.
 *
 * <p>Every field starts out as {@code null} and is only changed through its
 * setter; getters return whatever reference was last stored.
 */
public class JsoupBean {

    // ---- Logo ----
    private String logoUrl;
    private String logoName;
    private String logoImg;

    // ---- Top-left navigation bar ----
    private List<String> nv1_NameList;
    private List<String> nv1_UrlList;

    // ---- Top-right navigation bar ----
    private List<String> nv1_1_NameList;
    private List<String> nv1_1_UrlList;

    // ---- Navigation bar 2 ----
    private List<String> nv2_NameList;
    private List<String> nv2_UrlList;

    // ---- Advertisement area ----
    private List<String> advert_Img_List;
    private List<String> advert_Url_List;

    // ---- Banner ----
    private List<String> banner_ContentList;
    private List<String> banner_UrlList;
    private List<String> banner_ImgList;

    // ---- Content 1 (next to the banner) ----
    private List<String> content1_UrlList;
    private List<String> content1_ContentList;

    // ---- Ranking titles ----
    private List<String> rank_UrlList;
    private List<String> rank_ContentList;

    // ======================= Logo =======================

    public String getLogoUrl() {
        return logoUrl;
    }

    public void setLogoUrl(String logoUrl) {
        this.logoUrl = logoUrl;
    }

    public String getLogoName() {
        return logoName;
    }

    public void setLogoName(String logoName) {
        this.logoName = logoName;
    }

    public String getLogoImg() {
        return logoImg;
    }

    public void setLogoImg(String logoImg) {
        this.logoImg = logoImg;
    }

    // ================= Navigation bar 1 =================

    public List<String> getNv1_NameList() {
        return nv1_NameList;
    }

    public void setNv1_NameList(List<String> nv1_NameList) {
        this.nv1_NameList = nv1_NameList;
    }

    public List<String> getNv1_UrlList() {
        return nv1_UrlList;
    }

    public void setNv1_UrlList(List<String> nv1_UrlList) {
        this.nv1_UrlList = nv1_UrlList;
    }

    // ================ Navigation bar 1-1 ================

    public List<String> getNv1_1_NameList() {
        return nv1_1_NameList;
    }

    public void setNv1_1_NameList(List<String> nv1_1_NameList) {
        this.nv1_1_NameList = nv1_1_NameList;
    }

    public List<String> getNv1_1_UrlList() {
        return nv1_1_UrlList;
    }

    public void setNv1_1_UrlList(List<String> nv1_1_UrlList) {
        this.nv1_1_UrlList = nv1_1_UrlList;
    }

    // ================= Navigation bar 2 =================

    public List<String> getNv2_NameList() {
        return nv2_NameList;
    }

    public void setNv2_NameList(List<String> nv2_NameList) {
        this.nv2_NameList = nv2_NameList;
    }

    public List<String> getNv2_UrlList() {
        return nv2_UrlList;
    }

    public void setNv2_UrlList(List<String> nv2_UrlList) {
        this.nv2_UrlList = nv2_UrlList;
    }

    // ================== Advertisements ==================

    public List<String> getAdvert_Img_List() {
        return advert_Img_List;
    }

    public void setAdvert_Img_List(List<String> advert_Img_List) {
        this.advert_Img_List = advert_Img_List;
    }

    /**
     * Alias accessor: despite its name it reads the same {@code advert_Img_List}
     * field as {@link #getAdvert_Img_List()}.
     */
    public List<String> getAdvert_Name_List() {
        return advert_Img_List;
    }

    /**
     * Alias mutator: despite its name it writes the same {@code advert_Img_List}
     * field as {@link #setAdvert_Img_List(List)}.
     */
    public void setAdvert_Name_List(List<String> advert_Img_List) {
        this.advert_Img_List = advert_Img_List;
    }

    public List<String> getAdvert_Url_List() {
        return advert_Url_List;
    }

    public void setAdvert_Url_List(List<String> advert_Url_List) {
        this.advert_Url_List = advert_Url_List;
    }

    // ====================== Banner ======================

    public List<String> getBanner_ContentList() {
        return banner_ContentList;
    }

    public void setBanner_ContentList(List<String> banner_ContentList) {
        this.banner_ContentList = banner_ContentList;
    }

    public List<String> getBanner_UrlList() {
        return banner_UrlList;
    }

    public void setBanner_UrlList(List<String> banner_UrlList) {
        this.banner_UrlList = banner_UrlList;
    }

    public List<String> getBanner_ImgList() {
        return banner_ImgList;
    }

    public void setBanner_ImgList(List<String> banner_ImgList) {
        this.banner_ImgList = banner_ImgList;
    }

    // ===================== Content 1 ====================

    public List<String> getContent1_UrlList() {
        return content1_UrlList;
    }

    public void setContent1_UrlList(List<String> content1_UrlList) {
        this.content1_UrlList = content1_UrlList;
    }

    public List<String> getContent1_ContentList() {
        return content1_ContentList;
    }

    public void setContent1_ContentList(List<String> content1_ContentList) {
        this.content1_ContentList = content1_ContentList;
    }

    // ====================== Ranking =====================

    public List<String> getRank_UrlList() {
        return rank_UrlList;
    }

    public void setRank_UrlList(List<String> rank_UrlList) {
        this.rank_UrlList = rank_UrlList;
    }

    public List<String> getRank_ContentList() {
        return rank_ContentList;
    }

    public void setRank_ContentList(List<String> rank_ContentList) {
        this.rank_ContentList = rank_ContentList;
    }

    /**
     * Renders every field as {@code name=value} inside {@code JsoupBean{...}};
     * the three logo strings are wrapped in single quotes, lists use their own
     * {@code toString()}, and unset fields appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("JsoupBean{");
        text.append("logoUrl='").append(logoUrl).append('\'');
        text.append(", logoName='").append(logoName).append('\'');
        text.append(", logoImg='").append(logoImg).append('\'');
        text.append(", nv1_NameList=").append(nv1_NameList);
        text.append(", nv1_UrlList=").append(nv1_UrlList);
        text.append(", nv1_1_NameList=").append(nv1_1_NameList);
        text.append(", nv1_1_UrlList=").append(nv1_1_UrlList);
        text.append(", nv2_NameList=").append(nv2_NameList);
        text.append(", nv2_UrlList=").append(nv2_UrlList);
        text.append(", advert_Img_List=").append(advert_Img_List);
        text.append(", advert_Url_List=").append(advert_Url_List);
        text.append(", banner_ContentList=").append(banner_ContentList);
        text.append(", banner_UrlList=").append(banner_UrlList);
        text.append(", banner_ImgList=").append(banner_ImgList);
        text.append(", content1_UrlList=").append(content1_UrlList);
        text.append(", content1_ContentList=").append(content1_ContentList);
        text.append(", rank_UrlList=").append(rank_UrlList);
        text.append(", rank_ContentList=").append(rank_ContentList);
        text.append('}');
        return text.toString();
    }
}
XML界面就不上了,就一个button点击按钮,点击打印log
本文章仅供学习之用,禁止任何商业用途,若有所需或转载请与作者联系
网友评论