import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebClientOptions;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.sun.tools.javac.Main;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
/**
 * Small demo that loads a JavaScript-rendered page with HtmlUnit, hands the
 * rendered markup to jsoup, and prints the text and absolute link of the
 * {@code a.h12} anchors.
 *
 * @author chenglong (alias: Xiaoha)
 * @since 2021-09-17 20:40
 */
public class Testmain {

    /** Page to fetch; also given to jsoup as the base URI so {@code abs:href} can be resolved. */
    private static final String PAGE_URL =
            "http://www.csrc.gov.cn/pub/shanghai/shfdqyxx/shfdgzzjbg/202106/t20210621_400053.htm";

    /** Upper bound, in milliseconds, on waiting for the page's background JavaScript. */
    private static final long JS_WAIT_MILLIS = 3000;

    /**
     * Fetches {@link #PAGE_URL}, lets its scripts run, and prints
     * {@code <anchor text> TAB <absolute href>} to standard output.
     *
     * @param args unused
     * @throws IOException if the page cannot be retrieved
     */
    public static void main(String[] args) throws IOException {
        // WebClient is AutoCloseable: closing it shuts down its windows and
        // stops background JavaScript processing, so nothing leaks on exit.
        try (WebClient webClient = new WebClient()) {
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setThrowExceptionOnScriptError(false);

            HtmlPage page = webClient.getPage(PAGE_URL);
            // Must run AFTER the page is loaded: before getPage() there are no
            // background jobs yet and the call would return immediately.
            webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);

            // jsoup only sees a String here, so it cannot know where the markup
            // came from; the base URI has to be supplied explicitly or
            // "abs:href" would resolve to an empty string for relative links.
            Document parsed = Jsoup.parse(page.asXml(), PAGE_URL);
            String text = parsed.select("a.h12").text();
            String attr = parsed.select("a.h12").attr("abs:href");
            System.out.println(text + "\t" + attr);
        }
    }
}
/**
 * Loads the hard-coded kcb.sse.com.cn page with a Chrome-emulating HtmlUnit
 * client, gives its background JavaScript up to one second to finish, and
 * returns the rendered markup as a jsoup {@link Document}.
 *
 * @return the parsed document, or {@code null} if fetching the page failed
 *         (the failure is printed to standard error)
 * @throws IOException if the request URL cannot be constructed
 * @throws InterruptedException kept in the signature for source compatibility
 *         with existing callers; the current implementation does not throw it
 */
public static Document getDocument() throws IOException, InterruptedException{
    // Build the request before opening the client so that a URL failure
    // cannot leave an unclosed WebClient behind.
    WebRequest request = new WebRequest(new URL("http://kcb.sse.com.cn/renewal/xmxq/index_refund.shtml?auditId=1000055&anchor_type=2&bussinesType=6"));
    request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0");

    // try-with-resources closes the client (windows + JS threads) on every path.
    try (WebClient wc = new WebClient(BrowserVersion.CHROME)) {
        // Accept untrusted/self-signed certificates
        wc.getOptions().setUseInsecureSSL(true);
        // Enable the JavaScript engine
        wc.getOptions().setJavaScriptEnabled(true);
        // Skip CSS processing
        wc.getOptions().setCssEnabled(false);
        // Do not throw when a page script fails
        wc.getOptions().setThrowExceptionOnScriptError(false);
        // Do not throw on failing HTTP status codes
        wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
        // Do not use native ActiveX
        wc.getOptions().setActiveXNative(false);
        // Ajax controller that re-synchronizes Ajax calls with the page load
        wc.setAjaxController(new NicelyResynchronizingAjaxController());
        // Connection/read timeout in milliseconds
        wc.getOptions().setTimeout(1000000);
        // Do not send the Do-Not-Track header
        wc.getOptions().setDoNotTrackEnabled(false);

        // Emulate a browser opening the target address
        HtmlPage htmlPage = wc.getPage(request);
        // Wait for background JavaScript AFTER the load (calling this before
        // getPage() has nothing to wait for). Same one-second budget the
        // previous fixed sleep used, but returns as soon as the jobs are done.
        wc.waitForBackgroundJavaScript(1000);

        // Serialize the rendered DOM and hand it to jsoup
        return Jsoup.parse(htmlPage.asXml());
    } catch (FailingHttpStatusCodeException | IOException e) {
        // Best-effort contract preserved: report the failure and return null.
        e.printStackTrace();
        return null;
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the HtmlUnit + jsoup scraping demo. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>getData</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>16</maven.compiler.source>
        <maven.compiler.target>16</maven.compiler.target>
        <!-- Pin the source encoding: without it Maven falls back to the
             platform default charset, which makes the build platform
             dependent for sources that contain non-ASCII text. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Headless browser used to execute page JavaScript.
             https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit
             NOTE(review): 2.27 is an old release; check whether a newer
             version should be used. -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.27</version>
        </dependency>
        <!-- HTML parser / CSS-selector queries.
             https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- Apache HTTP client.
             https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.13</version>
        </dependency>
    </dependencies>
</project>
网友评论