最近要做一个 Java 爬虫项目,由于之前接触得少,所以有点茫然。万事不决问百度,发现还是 Python 爬虫用得多,没办法,只好在网上抄代码。不知道是不是我搜索的方法不对,能找到的资料很少,茫然了一阵,花了不少时间(大概一周),网上抄来的代码又难改。后来组长叫我拿东西出来,哦豁,被批评了,还要求我两天之内拿出成果,感觉头发都要掉完了。还好组长给我指了方向,说用 jsoup 和 HttpClient。这两个库我最初也搜到过,但当时看的其他人的 Java 爬虫源码都是自己创建 connection,过程繁琐。于是开始学习 jsoup 和 HttpClient,边学边做,刚好两天完成爬虫项目,准确地说是一天半,剩下半天在这里写文章。jsoup 和 HttpClient 节省了大量时间。
爬虫项目仅用于学习交流
目标
爬取网站 https://birdnet.cn/atlas.php
然后按照“目科检索”下的分类逐级建立文件夹,并把图片放入对应的文件夹
图片2.png
这里应该有56张图片,包括两张鸟种描述和鸟种分析,因为
图片3.png 图片4.png
这里贴源码
图片1.png
使用的主要是httpclient和jsoup
HttpClientUtil来源于网上工具包
/**
 * Minimal blocking HTTP helper built on Apache HttpClient.
 *
 * <p>Every request is executed with a freshly created client that is closed
 * afterwards, uses 45-second socket / connect / connection-request timeouts,
 * and has its response body decoded as UTF-8. Failures are reported with
 * {@code printStackTrace()} and surface to the caller as a {@code null} return
 * value, so callers must null-check the result.
 */
public class HttpClientUtil {

    /** Created eagerly during class initialization, so no locking is needed in {@link #getInstance()}. */
    private static final HttpClientUtil INSTANCE = new HttpClientUtil();

    /** Timeouts (in milliseconds) applied to every request sent through this class. */
    private final RequestConfig requestConfig = RequestConfig.custom()
            .setSocketTimeout(45000)
            .setConnectTimeout(45000)
            .setConnectionRequestTimeout(45000)
            .build();

    private HttpClientUtil() {
    }

    /**
     * Returns the shared instance.
     *
     * @return the single {@code HttpClientUtil} instance
     */
    public static HttpClientUtil getInstance() {
        return INSTANCE;
    }

    /**
     * Sends a POST request without a body.
     *
     * @param httpUrl target URL
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    public String sendHttpPost(String httpUrl) {
        HttpPost httpPost = new HttpPost(httpUrl);
        return sendHttpPost(httpPost);
    }

    /**
     * Sends a POST request whose body is an already encoded form string.
     *
     * @param httpUrl target URL
     * @param params  form body in the shape {@code key1=value1&key2=value2}
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    public String sendHttpPost(String httpUrl, String params) {
        HttpPost httpPost = new HttpPost(httpUrl);
        try {
            StringEntity stringEntity = new StringEntity(params, "UTF-8");
            stringEntity.setContentType("application/x-www-form-urlencoded");
            httpPost.setEntity(stringEntity);
        } catch (Exception e) {
            // Best effort: if the body cannot be built the request is still sent without it.
            e.printStackTrace();
        }
        return sendHttpPost(httpPost);
    }

    /**
     * Sends a POST request with URL-encoded form parameters.
     *
     * @param httpUrl target URL
     * @param maps    form parameters; {@code null} is treated as "no parameters"
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    public String sendHttpPost(String httpUrl, Map<String, String> maps) {
        HttpPost httpPost = new HttpPost(httpUrl);
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
        if (maps != null) {
            for (Map.Entry<String, String> entry : maps.entrySet()) {
                nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(nameValuePairs, "UTF-8"));
        } catch (Exception e) {
            // Best effort: if the body cannot be built the request is still sent without it.
            e.printStackTrace();
        }
        return sendHttpPost(httpPost);
    }

    /**
     * Sends a multipart POST request carrying text fields and file attachments.
     * Every file is attached under the part name {@code "files"}.
     *
     * @param httpUrl   target URL
     * @param maps      text fields; {@code null} is treated as "no fields"
     * @param fileLists files to attach; {@code null} is treated as "no files"
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    public String sendHttpPost(String httpUrl, Map<String, String> maps, List<File> fileLists) {
        HttpPost httpPost = new HttpPost(httpUrl);
        MultipartEntityBuilder meBuilder = MultipartEntityBuilder.create();
        if (maps != null) {
            for (Map.Entry<String, String> entry : maps.entrySet()) {
                meBuilder.addPart(entry.getKey(), new StringBody(entry.getValue(), ContentType.TEXT_PLAIN));
            }
        }
        if (fileLists != null) {
            for (File file : fileLists) {
                meBuilder.addPart("files", new FileBody(file));
            }
        }
        httpPost.setEntity(meBuilder.build());
        return sendHttpPost(httpPost);
    }

    /**
     * Executes a prepared POST request with a default client.
     *
     * @param httpPost request to execute
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    private String sendHttpPost(HttpPost httpPost) {
        httpPost.setConfig(requestConfig);
        // try-with-resources closes the response first, then the client, on every path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpPost)) {
            return readBody(response);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sends a GET request.
     *
     * @param httpUrl target URL
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    public String sendHttpGet(String httpUrl) {
        HttpGet httpGet = new HttpGet(httpUrl);
        return sendHttpGet(httpGet);
    }

    /**
     * Sends a GET request through a client configured with an explicit TLS hostname verifier.
     *
     * @param httpUrl target URL
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    public String sendHttpsGet(String httpUrl) {
        HttpGet httpGet = new HttpGet(httpUrl);
        return sendHttpsGet(httpGet);
    }

    /**
     * Sends a body-less POST request through a client configured with an explicit
     * TLS hostname verifier.
     *
     * @param httpUrl target URL
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    public String sendHttpsPost(String httpUrl) {
        HttpPost httpPost = new HttpPost(httpUrl);
        return sendHttpsPost(httpPost);
    }

    /**
     * Executes a prepared GET request with a default client.
     *
     * @param httpGet request to execute
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    private String sendHttpGet(HttpGet httpGet) {
        httpGet.setConfig(requestConfig);
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            return readBody(response);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Executes a prepared GET request with the hostname-verifying client.
     *
     * @param httpGet request to execute
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    private String sendHttpsGet(HttpGet httpGet) {
        httpGet.setConfig(requestConfig);
        try (CloseableHttpClient httpClient = newHttpsClient();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            return readBody(response);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Executes a prepared POST request with the hostname-verifying client.
     *
     * @param httpPost request to execute
     * @return the response body decoded as UTF-8, or {@code null} if the request failed
     */
    private String sendHttpsPost(HttpPost httpPost) {
        httpPost.setConfig(requestConfig);
        try (CloseableHttpClient httpClient = newHttpsClient();
                CloseableHttpResponse response = httpClient.execute(httpPost)) {
            return readBody(response);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Builds a client whose hostname verifier uses the default public-suffix list.
     *
     * <p>The previous code passed the request URL itself to
     * {@code PublicSuffixMatcherLoader.load(URL)}, which fetched the target page an
     * extra time and parsed its content as if it were a public-suffix list.
     */
    private static CloseableHttpClient newHttpsClient() {
        PublicSuffixMatcher publicSuffixMatcher = PublicSuffixMatcherLoader.getDefault();
        DefaultHostnameVerifier hostnameVerifier = new DefaultHostnameVerifier(publicSuffixMatcher);
        return HttpClients.custom().setSSLHostnameVerifier(hostnameVerifier).build();
    }

    /** Decodes the response body as UTF-8; returns {@code null} when the response carries no entity. */
    private static String readBody(CloseableHttpResponse response) throws IOException {
        HttpEntity entity = response.getEntity();
        return entity == null ? null : EntityUtils.toString(entity, "UTF-8");
    }
}
craw.java
public class craw {
/** Root directory that receives the level-1 / level-2 / level-3 folder tree. */
private static final String ROOT_DIR = "C:\\Bird";

/** Endpoint that answers every cascading-dropdown and picture-list request. */
private static final String LIST_URL = "https://birdnet.cn/atlas.php?mod=show&action=atlaslist";

/** Strips everything but digits from the pager's title attribute. */
private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

/** Fixed pause before every image download, to keep the load on the site low. */
private static final long DOWNLOAD_PAUSE_MS = 10000;

/**
 * Crawls the atlas of birdnet.cn and stores the pictures under
 * {@code C:\Bird\<level 1>\<level 2>\<level 3>}.
 *
 * <p>The three cascading dropdowns ({@code collect_1}, {@code collect_2},
 * {@code collect_3}) are walked depth-first. For every level-3 entry all pages of
 * its picture list are downloaded, plus the pictures found on the detail page
 * linked from the first list item. The crawl is deliberately slow: random pauses
 * between dropdown requests and a fixed pause before each download.
 *
 * <p>If the thread is interrupted, the interrupt flag is restored and the crawl
 * stops instead of carrying on with the next entry.
 *
 * @param args unused
 * @throws IOException kept for signature compatibility
 */
public static void main(String[] args) throws IOException {
    File root = new File(ROOT_DIR);
    if (!root.exists()) {
        root.mkdir();
        System.out.println("创建文件夹路径为:" + ROOT_DIR);
    }
    try {
        crawlAtlas();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}

/** Walks the three dropdown levels and downloads the pictures of every level-3 entry. */
private static void crawlAtlas() throws InterruptedException {
    Map<String, String> form = new HashMap<>();
    int downloaded = 0;

    Document home = parseOrEmpty(HttpClientUtil.getInstance().sendHttpPost("https://birdnet.cn/atlas.php"));
    Elements level1Options = home.getElementsByClass("pih")
            .select("#search_from_1")
            .select("[name='collect_1'] > option");

    for (Element level1 : level1Options) {
        pauseRandomly();
        String level1Id = level1.attr("value");
        if (level1Id.isEmpty()) {
            continue; // the "please choose" placeholder option
        }
        String level1Name = level1.text();
        form.put("collect_1", level1Id);
        // Drop selections left over from the previous branch so the request matches this one.
        form.remove("collect_2");
        form.remove("collect_3");

        Elements level2Options = fetchList(form).select("#collect_2> option");
        for (Element level2 : level2Options) {
            pauseRandomly();
            String level2Id = level2.attr("value");
            if (level2Id.isEmpty()) {
                continue;
            }
            String level2Name = level2.text();
            form.put("collect_2", level2Id);
            form.remove("collect_3");

            Elements level3Options = fetchList(form).select("#collect_3> option");
            for (Element level3 : level3Options) {
                String level3Id = level3.attr("value");
                if (level3Id.isEmpty()) {
                    continue;
                }
                String targetDir = ROOT_DIR + "\\" + level1Name + "\\" + level2Name + "\\" + level3.text();
                try {
                    File dir = new File(targetDir);
                    if (!dir.exists()) {
                        dir.mkdirs();
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
                form.put("collect_3", level3Id);
                downloaded = downloadEntry(form, targetDir, downloaded);
            }
        }
    }
}

/**
 * Downloads every page of the picture list selected by {@code form}.
 *
 * @return the running download counter after this entry
 */
private static int downloadEntry(Map<String, String> form, String targetDir, int downloaded)
        throws InterruptedException {
    Document firstPage = fetchList(form);
    int pages = pageCount(firstPage);
    // Only the first item of the first page links to the detail pictures we want.
    int count = downloadListPage(firstPage, targetDir, true, downloaded);
    for (int page = 2; page <= pages; page++) {
        form.put("page", String.valueOf(page));
        count = downloadListPage(fetchList(form), targetDir, false, count);
    }
    // Must not leak into the next entry's requests.
    form.remove("page");
    return count;
}

/**
 * Downloads every picture of one list page.
 *
 * @param withDetail whether the first item's detail page should be fetched as well
 * @return the running download counter after this page
 */
private static int downloadListPage(Document listPage, String targetDir, boolean withDetail, int downloaded)
        throws InterruptedException {
    Elements items = listPage.select("div.picturel > *").select("ul> *");
    boolean detailPending = withDetail;
    int count = downloaded;
    for (Element item : items) {
        if (detailPending) {
            downloadDetailPictures(item, targetDir);
            detailPending = false;
        }
        String src = item.select("img[src]").attr("src");
        if (src.isEmpty()) {
            continue; // nothing to download for this item, so do not count it
        }
        long start = System.currentTimeMillis();
        Thread.sleep(DOWNLOAD_PAUSE_MS);
        downImages(targetDir, src);
        System.out.println("下载图片耗时=" + (System.currentTimeMillis() - start) + "ms");
        count++;
        System.out.println("已下载" + count + "张图片");
        System.out.println("地址为" + targetDir);
    }
    return count;
}

/** Follows the item's link and downloads the pictures inside the {@code mahko cl} element of that page. */
private static void downloadDetailPictures(Element item, String targetDir) throws InterruptedException {
    String href = item.select("a[href]").attr("href");
    if (href.isEmpty()) {
        return;
    }
    Document detail = parseOrEmpty(HttpClientUtil.getInstance().sendHttpPost("https://birdnet.cn/" + href));
    Elements pictures = detail.getElementsByClass("mahko cl").select("img[src]");
    for (Element picture : pictures) {
        long start = System.currentTimeMillis();
        Thread.sleep(DOWNLOAD_PAUSE_MS);
        downImages(targetDir, "https://birdnet.cn//" + picture.attr("src"));
        System.out.println("下载图片耗时=" + (System.currentTimeMillis() - start) + "ms");
    }
}

/**
 * Reads the number of list pages from the digits of the pager's {@code title}
 * attribute; falls back to 1 when there is no pager or it carries no usable number.
 */
private static int pageCount(Document listPage) {
    String title = listPage.getElementsByClass("pg").select("span").attr("title");
    if (title.isEmpty()) {
        System.out.println("-----------------" + 1);
        return 1;
    }
    String digits = NON_DIGITS.matcher(title).replaceAll("").trim();
    if (digits.isEmpty()) {
        return 1;
    }
    try {
        return Integer.parseInt(digits);
    } catch (NumberFormatException e) {
        return 1; // more digits than an int can hold
    }
}

/** Posts the current form to the list endpoint and parses the answer. */
private static Document fetchList(Map<String, String> form) {
    return parseOrEmpty(HttpClientUtil.getInstance().sendHttpPost(LIST_URL, form));
}

/**
 * Parses a response body. A failed request ({@code null} body) becomes an empty
 * document, so the selectors simply match nothing and the crawl moves on.
 */
private static Document parseOrEmpty(String html) {
    return Jsoup.parse(html == null ? "" : html);
}

/** Sleeps for a random time below five seconds. */
private static void pauseRandomly() throws InterruptedException {
    Thread.sleep((int) (Math.random() * 10000 / 2));
}
/**
 * Downloads one image into the given directory.
 *
 * <p>The local file is named after the last path segment of {@code imgUrl}. Before
 * the request is made, that segment is URL-encoded as UTF-8 (with spaces sent as
 * {@code %20}) so names containing Chinese characters or spaces can be fetched.
 * The directory is created if it does not exist. Errors are printed and the method
 * returns normally; a {@code null} or empty URL, or one that ends in {@code '/'},
 * is skipped.
 *
 * @param filePath directory that receives the file
 * @param imgUrl   image URL
 */
public static void downImages(String filePath, String imgUrl) {
    if (imgUrl == null || imgUrl.isEmpty()) {
        return;
    }
    int tailStart = imgUrl.lastIndexOf('/') + 1;
    String fileName = imgUrl.substring(tailStart);
    if (fileName.isEmpty()) {
        // No file name to write to: the target would be the directory itself.
        return;
    }

    File dir = new File(filePath);
    if (!dir.exists()) {
        dir.mkdirs();
    }

    try {
        // URLEncoder turns spaces into '+', which must be sent as %20 in a URL path.
        String urlTail = URLEncoder.encode(fileName, "UTF-8");
        imgUrl = imgUrl.substring(0, tailStart) + urlTail.replace("+", "%20");
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }

    File file = new File(filePath + File.separator + fileName);
    try {
        URL url = new URL(imgUrl);
        URLConnection connection = url.openConnection();
        connection.setConnectTimeout(10 * 1000);
        // Without a read timeout a stalled server would block the crawl forever.
        connection.setReadTimeout(10 * 1000);
        // try-with-resources closes both streams even when reading or writing fails.
        try (InputStream in = connection.getInputStream();
                BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
            byte[] buf = new byte[1024];
            int size;
            while (-1 != (size = in.read(buf))) {
                out.write(buf, 0, size);
            }
        }
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
这是核心代码
达到图上效果
图片2.png
至于为什么不用多线程:请求量过大可能会造成网站崩溃,所以尽量爬慢一点。
网友评论