
Java crawler: parsing with Xsoup (XPath)

Author: HAO延WEI | Published 2020-01-02 14:18

Git repository: https://github.com/code4craft/xsoup

Add the dependency (webmagic-core bundles Xsoup):

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.5.3</version>
        </dependency>
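
A minimal sketch of the Xsoup API itself (the HTML fragment and XPath expressions below are illustrative, not from the original article): Xsoup evaluates an XPath expression against a Jsoup Document and returns either the first match or all matches.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.xsoup.Xsoup;

import java.util.List;

public class XsoupQuickStart {

    public static void main(String[] args) {
        // Illustrative HTML fragment
        String html = "<html><body><a href='https://github.com/code4craft/xsoup'>xsoup</a></body></html>";
        Document doc = Jsoup.parse(html);

        // Evaluate an XPath expression and take the first match
        String href = Xsoup.compile("//a/@href").evaluate(doc).get();

        // Or collect every match as a list of strings
        List<String> links = Xsoup.compile("//a/@href").evaluate(doc).list();

        System.out.println(href);   // https://github.com/code4craft/xsoup
        System.out.println(links);
    }
}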

Example:

package com.example.power_spider.test;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.xsoup.Xsoup;
import java.io.IOException;



public class HttpClientTest {

    public static String getResponse() throws IOException {
        // 1. Create an HttpClient -- roughly equivalent to opening a browser
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        String html = null;
        try {
            // 2. Build the GET request -- like typing the URL into the address bar
            HttpGet httpget = new HttpGet("https://auto.gasgoo.com/a/70147658.html");

            httpget.setHeader("Accept", "text/html, */*; q=0.01");
            httpget.setHeader("Accept-Encoding", "gzip, deflate, sdch");
            httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpget.setHeader("Connection", "keep-alive");
            httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36");

            // 3. Execute the GET request -- like pressing Enter
            response = httpClient.execute(httpget);

            // 4. Only process the body when the status code is 200
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                // 5. Read the response entity as a string
                HttpEntity httpEntity = response.getEntity();
                html = EntityUtils.toString(httpEntity, "utf-8");

            } else {
                // Non-200 status (e.g. 404, page not found); handle as needed -- omitted here
                System.out.println("Status code is not 200");
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } finally {
            // 6. Close the response and the client
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
        return html;
    }

    public static void main(String[] args) throws IOException {
        
        String html = getResponse();
        Document document = Jsoup.parse(html);
        // Evaluate XPath expressions against the parsed document
        String title = Xsoup.compile("//title").evaluate(document).get();
        String result = Xsoup.compile("//div[@class='scrap minwidth']").evaluate(document).getElements().text();
        // Xsoup.select() parses the raw HTML and evaluates the expression in one call
        String data = Xsoup.select(html, "//title").getElements().text();
        System.out.println("===>>>" + title);
        System.out.println("===>>>" + result);
        System.out.println(data);
    }

}
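
If the same expressions are applied to many pages, the compiled evaluator can be reused instead of recompiled on every call. A sketch under that assumption (the class and field names here are illustrative, not part of the original example):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;

// Illustrative helper: compile each XPath expression once and reuse it for every page
public class ArticleExtractor {

    private static final XPathEvaluator TITLE = Xsoup.compile("//title/text()");
    private static final XPathEvaluator BODY = Xsoup.compile("//div[@class='scrap minwidth']");

    public String extractTitle(String html) {
        Document document = Jsoup.parse(html);
        return TITLE.evaluate(document).get();
    }

    public String extractBody(String html) {
        Document document = Jsoup.parse(html);
        return BODY.evaluate(document).getElements().text();
    }
}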
