Preface
I've had some free time recently, so I took my first steps into Java web crawling.
Reference articles:
jsoup
1:https://blog.csdn.net/championhengyi/article/details/68491306
2:http://www.open-open.com/jsoup/dom-navigation.htm
Code:
https://www.cnblogs.com/1996swg/p/7355577.html
I also found a very handy site that generates a URL from a local image:
http://thyrsi.com//
Enough talk, on to the code.
- Part one of the code: fields and entry point
static DataSource dataSource = MyDataSource.getDataSource();
static QueryRunner qr = new QueryRunner(dataSource);
private static int count = 1; //counter used to number the local output files
//pages waiting to be crawled
private static List<String> waitUrl = new ArrayList<>();
//urls that have already been crawled
private static Set<String> set = new HashSet<>();
private static String savepath = "e:/spider/dang01/";
private static int dataSize = 0; //total number of rows inserted so far
public static void main(String[] args) throws IOException {
String firstUrl = "http://search.dangdang.com/?key=%BB%FA%D0%B5%B1%ED&act=input&page_index=1";
getUrls(firstUrl);
System.out.println("最后插入数据的条数为: "+ dataSize);
}
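The fields above lean on a few things the post never shows: MyDataSource, QueryRunner from Apache Commons DbUtils, and the DangDangModel bean used later in the parser. Here is a rough sketch of what they might look like; apart from the names already used in the post (getDataSource, watchID, watchName, watchPrice), everything in it, including the DBCP pool and the connection settings, is an assumption:

import javax.sql.DataSource;
import org.apache.commons.dbcp2.BasicDataSource;

//hypothetical pool helper; connection details are placeholders
class MyDataSource {
    static DataSource getDataSource() {
        BasicDataSource ds = new BasicDataSource();
        ds.setDriverClassName("com.mysql.jdbc.Driver");
        ds.setUrl("jdbc:mysql://localhost:3306/test?characterEncoding=utf8");
        ds.setUsername("root");
        ds.setPassword("root");
        return ds;
    }
}

//bean matching the fields used by parseData() below
class DangDangModel {
    private String watchID;
    private String watchName;
    private double watchPrice;

    public String getWatchID() { return watchID; }
    public void setWatchID(String watchID) { this.watchID = watchID; }
    public String getWatchName() { return watchName; }
    public void setWatchName(String watchName) { this.watchName = watchName; }
    public double getWatchPrice() { return watchPrice; }
    public void setWatchPrice(double watchPrice) { this.watchPrice = watchPrice; }

    @Override
    public String toString() {
        return watchID + " | " + watchName + " | " + watchPrice;
    }
}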
- The core crawler
public static void getUrls(String url) throws IOException {
System.out.println("爬取的url是:" + url);
HttpClient aDefault = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(url);
HttpResponse response = aDefault.execute(httpGet);
int statusCode = response.getStatusLine().getStatusCode();
Pattern p = Pattern.compile("<a .*href=.+</a>");
List<String> finalList = new ArrayList<>();
String group = null;
if (200 == statusCode) {
String result = EntityUtils.toString(response.getEntity(), "UTF-8"); //"UTF-8" is only the fallback; the charset declared in the response headers takes precedence
parseData(result);
Matcher matcher = p.matcher(result);
while (matcher.find()) {
group = matcher.group();
if (group.contains("下一页")) { //"下一页" is the "next page" link text on the site, so it must stay in Chinese
try {
//the page index in the link may have two or even three digits
int startIndex = group.indexOf("index");
int titleIndex = group.indexOf("title");
int pageindex = Integer.valueOf(group.substring(startIndex + 6, titleIndex - 2));
System.out.println("**********************" + pageindex);
if (pageindex > 10) {
return;
}
String lastUrl = "http://search.dangdang.com/?key=%BB%FA%D0%B5%B1%ED&act=input&page_index=";
lastUrl += pageindex;
System.out.println("下一个将要爬取的url: " + lastUrl);
waitUrl.add(lastUrl);
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
// System.out.println(JSON.toJSON(waitUrl));
set.add(url); //remember this url as already crawled
if (waitUrl.isEmpty()) {
return; //no "next page" link was found, so stop
}
String nextUrl = waitUrl.get(0);
waitUrl.remove(0);
getUrls(nextUrl);
}
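As a side note, the next-page link could also be located with jsoup instead of the regex above, which is a bit less brittle if Dangdang changes the anchor markup. A small sketch, assuming the link text on the page contains "下一页"; findNextPageUrl is a hypothetical helper, not part of the original code:

//alternative next-page extraction with jsoup (a sketch, not the original approach)
static String findNextPageUrl(String html) {
    Document doc = Jsoup.parse(html);
    //first <a> whose text contains "下一页"
    Element next = doc.selectFirst("a:contains(下一页)");
    //note: the href may be relative and need the site prefix prepended
    return next == null ? null : next.attr("href");
}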
- The parsing code
public static void parseData(String data) throws IOException {
List<DangDangModel> result = new ArrayList<>();
Document document = Jsoup.parse(data);
//drill down to the product list <ul> in the search results and grab its <li> items
Elements elements = document.select("div[class=con shoplist]")
.select("div[id=search_nature_rg]")
.select("ul[class=bigimg cloth_shoplist]ul[id=component_0__0__8609]")
.select("li");
System.out.println(elements.size());
for (Element element : elements) {
String bookID = element.attr("id");
String bookPrice = element.select("span[class=price_n]").text();
String bookName = element.select("p[class=name]").select("a").text();
if (StringUtils.isEmpty(bookID) || StringUtils.isEmpty(bookPrice) || StringUtils.isEmpty(bookName)) {
continue;
}
bookPrice = bookPrice.substring(1); //drop the leading "¥" so the price parses as a number
DangDangModel ddModel = new DangDangModel();
ddModel.setWatchID(bookID);
ddModel.setWatchName(bookName);
ddModel.setWatchPrice(Double.valueOf(bookPrice));
result.add(ddModel);
}
Object[][] sqlParams = new Object[result.size()][3];
for (int i = 0; i < result.size(); i++) {
sqlParams[i][0] = result.get(i).getWatchID();
sqlParams[i][1] = result.get(i).getWatchName();
sqlParams[i][2] = result.get(i).getWatchPrice();
}
String sql = "insert into dangdangmodel(watchID,watchName,watchPrice) values(?,?,?)";
try {
qr.batch(sql, sqlParams);
} catch (SQLException e) {
e.printStackTrace();
}
dataSize+=result.size();
File file = new File(savepath + count + ".txt");
file.getParentFile().mkdirs(); //make sure the output directory exists
FileOutputStream fileOutputStream = new FileOutputStream(file, true);
for (Object o : result) {
fileOutputStream.write(o.toString().getBytes());
fileOutputStream.write("\r\n".getBytes());
}
fileOutputStream.close();
count++;
}
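The batch insert assumes the dangdangmodel table already exists. For completeness, a possible one-off setup using the same QueryRunner; the column types are guesses based on the model fields, not taken from the original post:

//hypothetical table setup; adjust the types to whatever the real table uses
String ddl = "CREATE TABLE IF NOT EXISTS dangdangmodel ("
        + " watchID varchar(64),"
        + " watchName varchar(255),"
        + " watchPrice decimal(10,2))";
try {
    qr.update(ddl);
} catch (SQLException e) {
    e.printStackTrace();
}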
Results
- Database: screenshot of the inserted rows (image not reproduced here)
- Local files: nothing left to show, I already deleted them
Summary
This is about as simple as a crawler gets:
- Use HttpClient to send a GET request and fetch the whole page, then use a regex to pull out the URL of the next page and crawl it recursively (an iterative variant is sketched after this list).
- Use jsoup to parse the fetched page, then save the results into an existing database.
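For reference, here is a minimal iterative sketch of that first step: a queue of pending URLs plus a visited set instead of recursion, so a deep crawl cannot blow the stack and an empty queue simply ends the loop. The class and method names are mine, and the parsing/enqueueing steps are left as comments:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

//iterative variant of the crawl loop (sketch)
public class IterativeCrawl {
    public static void crawl(String firstUrl) throws Exception {
        Deque<String> queue = new ArrayDeque<>();
        Set<String> visited = new HashSet<>();
        queue.add(firstUrl);
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            while (!queue.isEmpty()) {
                String url = queue.poll();
                if (!visited.add(url)) {
                    continue; //already crawled
                }
                String html = EntityUtils.toString(
                        client.execute(new HttpGet(url)).getEntity(), "UTF-8");
                //parseData(html);        //reuse the parser from the section above
                //queue.add(nextPageUrl); //enqueue the "下一页" link extracted from html
            }
        }
    }
}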