美文网首页
爬虫等待js加载完成后再进行爬取

爬虫等待js加载完成后再进行爬取

作者: 开心的小哈 | 来源:发表于2021-09-17 21:06 被阅读0次
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebClientOptions;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.sun.tools.javac.Main;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;

/**
 * Crawls a page whose content is rendered by JavaScript: HtmlUnit loads the
 * page and executes its scripts, then the resulting DOM is handed to Jsoup
 * for CSS-selector extraction.
 *
 * @author chenglong
 * @create 2021-09-17 20:40
 */
public class Testmain {
    /** Target page; also used as the base URI so Jsoup can resolve abs:href. */
    private static final String PAGE_URL =
            "http://www.csrc.gov.cn/pub/shanghai/shfdqyxx/shfdgzzjbg/202106/t20210621_400053.htm";

    public static void main(String[] args) throws IOException {
        // WebClient is AutoCloseable; the original leaked it. try-with-resources
        // guarantees the simulated browser and its connections are released.
        try (WebClient webClient = new WebClient()) {
            webClient.getOptions().setCssEnabled(false);                  // CSS is irrelevant for scraping
            webClient.getOptions().setJavaScriptEnabled(true);            // page builds its DOM via JS
            webClient.getOptions().setThrowExceptionOnScriptError(false); // tolerate broken site JS

            HtmlPage page = webClient.getPage(PAGE_URL);
            // BUG FIX: waitForBackgroundJavaScript must run AFTER the page is
            // loaded. The original called it before getPage(), where it waits
            // on an empty window and has no effect.
            webClient.waitForBackgroundJavaScript(3000);

            // Pass the base URI explicitly: when Jsoup parses a plain String it
            // has no base to resolve "abs:href" against; with the base URI the
            // relative links become absolute URLs.
            Document parse = Jsoup.parse(page.asXml(), PAGE_URL);
            String text = parse.select("a.h12").text();
            String attr = parse.select("a.h12").attr("abs:href");
            System.out.println(text + "\t" + attr);
        }
    }
}

 /**
  * Loads a JS-rendered page with HtmlUnit and returns the fully-executed DOM
  * as a Jsoup {@link Document}.
  *
  * <p>Best-effort: fetch/parse failures are logged and {@code null} is
  * returned, preserving the original contract.
  *
  * @return the parsed document, or {@code null} if the page could not be fetched
  * @throws IOException declared for interface compatibility (fetch errors are
  *         in fact swallowed and reported via the null return)
  * @throws InterruptedException declared for interface compatibility
  */
 public static Document getDocument() throws IOException, InterruptedException {
     // BUG FIX: the original never closed the WebClient (resource leak);
     // it is AutoCloseable, so try-with-resources releases it on every path.
     try (WebClient wc = new WebClient(BrowserVersion.CHROME)) {
         wc.getOptions().setUseInsecureSSL(true);                     // accept invalid/self-signed SSL
         wc.getOptions().setJavaScriptEnabled(true);                  // page content is built by JS
         wc.getOptions().setCssEnabled(false);                        // CSS not needed for scraping
         wc.getOptions().setThrowExceptionOnScriptError(false);       // tolerate site JS errors
         wc.getOptions().setThrowExceptionOnFailingStatusCode(false); // tolerate non-2xx statuses
         wc.getOptions().setActiveXNative(false);                     // no ActiveX
         wc.getOptions().setTimeout(1000000);                         // connect/read timeout in ms
         wc.getOptions().setDoNotTrackEnabled(false);                 // do not send the DNT header
         // Resynchronize Ajax calls so XHR responses are available after load.
         wc.setAjaxController(new NicelyResynchronizingAjaxController());

         WebRequest request = new WebRequest(new URL(
                 "http://kcb.sse.com.cn/renewal/xmxq/index_refund.shtml?auditId=1000055&anchor_type=2&bussinesType=6"));
         request.setAdditionalHeader("User-Agent",
                 "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0");

         // Simulate the browser opening the target URL.
         HtmlPage htmlPage = wc.getPage(request);
         // BUG FIX: waitForBackgroundJavaScript must be called AFTER the page
         // is loaded; the original called it before getPage() (no effect) and
         // compensated with an unreliable Thread.sleep(1000).
         wc.waitForBackgroundJavaScript(600 * 1000);

         // Hand the executed DOM (as XML) to Jsoup for selection.
         return Jsoup.parse(htmlPage.asXml());
     } catch (FailingHttpStatusCodeException | IOException e) {
         // Preserve the original best-effort behavior: report and return null.
         // (MalformedURLException is a subclass of IOException, so the
         // original's three catch blocks collapse into this multi-catch.)
         e.printStackTrace();
     }
     return null;
 }
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build file for the crawler examples above: HtmlUnit executes the
     page's JavaScript, Jsoup parses/selects from the resulting DOM, and
     Apache HttpClient is available for plain HTTP fetches. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>getData</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Compile for Java 16 (source and bytecode target). -->
        <maven.compiler.source>16</maven.compiler.source>
        <maven.compiler.target>16</maven.compiler.target>
    </properties>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
        <!-- Headless browser with a JavaScript engine; provides WebClient/HtmlPage.
             NOTE(review): 2.27 is quite old — consider upgrading, but verify
             API compatibility (e.g. option setters, AjaxController) first. -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.27</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <!-- HTML parser with CSS-selector querying (Jsoup.parse / select). -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <!-- Imported by the sample code; only needed for non-JS HTTP requests. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.13</version>
        </dependency>


    </dependencies>


</project>

相关文章

网友评论

      本文标题:爬虫等待js加载完成后再进行爬取

      本文链接:https://www.haomeiwen.com/subject/gxflgltx.html