Maven引入
<!-- Apache HttpClient: supplies HttpGet / CloseableHttpClient / EntityUtils, which the controller uses to fetch the article page -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<!-- jsoup: supplies Jsoup.parse / Document / Elements, which the controller uses to find and rewrite the <img> tags -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
写在application.properties配置文件中
# Directory where downloaded images are stored; read by the controller via @Value("${upload.path}")
upload.path=G:/imgs
# Address of the official-account article to crawl; read via @Value("${getUrl}")
getUrl=https://mp.weixin.qq.com/s/fdllA87IDpUZ34OFBVZdWw
# Base address of the official-account site; read via @Value("${wexinUrl}").
# The key spelling "wexinUrl" must stay identical to the placeholder used in the controller.
wexinUrl=https://mp.weixin.qq.com
定义Controller
@RestController
@RequestMapping("/crawler")
public class CrawlerController {
//存储图片的路径,写在配置文件里面
@Value("${upload.path}")
private String path;
//公众号文章地址,写在配置文件里面
@Value("${getUrl}")
private String url;
//公众号总地址
@Value("${wexinUrl}")
private String wexinUrl;
@RequestMapping("/getContent")
public String getContent() {
String content =null;
String imgDir =path;
// 输入网址,创建发起Get请求的对象
HttpGet httpGet =new HttpGet(url);
// 创建httpClient对象,类似于打开浏览器
CloseableHttpClient httpClient =HttpClients.createDefault();
// 类似于浏览器输入网址后,按回车
CloseableHttpResponse execute =null;
try {
execute =httpClient.execute(httpGet);
// 解析获取数据,判断状态码是不是200
if (execute.getStatusLine().getStatusCode() ==200) {
HttpEntity entity = execute.getEntity();
content =EntityUtils.toString(entity,"utf8");
Document doc =Jsoup.parse(content);
//找到图片标签
Elements img =doc.select("img");
for (int i =0; i
// 图片地址
String imgUrl =img.get(i).attr("data-src");
File sf =new File(imgDir);
if (!sf.exists()) {
sf.mkdirs();
}
// 这里是一个公众号的二维码的图片,先不处理了
// String id = img.get(i).attr("id");
// if ("js_pc_qr_code_img".equalsIgnoreCase(id)) {
// imgUrl = wexinUrl + img.get(i).attr("src");
// }
if (imgUrl !=null && !imgUrl.equals("")) {
String fileName =DateTimeUitls.getString("yyyyMMddHHmmssSS") +".png";
String imgPath =imgDir +File.separator +fileName;
File imgFile =new File(imgPath);
if (!imgFile.exists()) {
// 下载图片
// 构造URL
URL url =new URL(imgUrl);
// 打开连接
URLConnection con =url.openConnection();
//设置请求超时为5s
con.setConnectTimeout(5 *1000);
// 输入流
InputStream in =con.getInputStream();
// 1K的数据缓冲
byte[]bs =new byte[1024];
// 读取到的数据长度
int len;
// 输出的文件流
OutputStream os =new FileOutputStream(imgPath);
// 开始读取
while ((len =in.read(bs)) != -1) {
os.write(bs,0, len);
}
os.close();
in.close();
}
//重新赋值为本地路径,
// img.get(i).attr("data-src", imgPath);
// img.get(i).attr("src", imgPath);
//上面访问图片可能访问不到,建议定义访问图片的请求方法,所以修改成下面的的路径访问方式
img.get(i).attr("data-src","/crawler/readImg/" +fileName);
img.get(i).attr("src","/crawler/readImg/" +fileName);
//导出html
content =doc.outerHtml();
}
}
}
}catch (Exception e) {
e.printStackTrace();
}finally {
try {
execute.close();
}catch (IOException e) {
e.printStackTrace();
}
try {
httpClient.close();
}catch (IOException e) {
e.printStackTrace();
}
}
return content;
}
//建议定义访问图片的请求方法
@RequestMapping("/readImg/{fileName}")
public void readImg(@PathVariable("fileName")String fileName,HttpServletResponse response) {
try {
// fileImage 为服务器存储的实际路径 如c:\aa\bb.jpg
String fileImage =path +File.separator + fileName;
FileInputStream hFile =new FileInputStream(fileImage);// 以byte流的方式打开文件
int i =hFile.available();// 得到文件大小
byte data[] =new byte[i];
hFile.read(data);// 读数据
hFile.close();
response.setContentType("image/*");// 设置返回的文件类型
OutputStream toClient = response.getOutputStream();// 得到向客户端输出二进制数据的对象
toClient.write(data);// 输出数据
toClient.close();
}catch (IOException e) {
// 错误处理
PrintWriter toClient;
try {
// 得到向客户端输出文本的对象
toClient = response.getWriter();
response.setContentType("text/html;charset=utf8");
toClient.write("无法打开图片!");
toClient.close();
}catch (IOException e1) {
e1.printStackTrace();
}
}
}
}
时间格式工具类
/**
 * Small helper for rendering the current time as text.
 */
public class DateTimeUitls {

    /**
     * Formats the current system time.
     *
     * @param pattern a {@link SimpleDateFormat} pattern, e.g. {@code "yyyyMMddHHmmssSS"}
     * @return the current time rendered with {@code pattern}
     * @throws NullPointerException     if {@code pattern} is {@code null}
     * @throws IllegalArgumentException if {@code pattern} is not a valid pattern
     */
    public static String getString(String pattern){
        // A fresh formatter per call: SimpleDateFormat instances are not thread-safe.
        final SimpleDateFormat formatter = new SimpleDateFormat(pattern);
        final Date now = new Date();
        return formatter.format(now);
    }
}
网友评论