美文网首页
JAVA抓取网页方法

JAVA抓取网页方法

作者: 龍狮虎 | 来源:发表于2017-11-21 14:14 被阅读0次

    package com.xxx.core;

    import org.apache.http.client.methods.CloseableHttpResponse;

    import org.apache.http.client.methods.HttpGet;

    import org.apache.http.impl.client.CloseableHttpClient;

    import org.apache.http.impl.client.HttpClients;

    import org.apache.http.util.EntityUtils;

    import org.jsoup.Jsoup;

    import org.jsoup.nodes.Document;

    import org.jsoup.select.Elements;

    import java.io.*;

    /**
     * Small scraping utility: downloads a web page and a JSON endpoint over HTTP and
     * can persist either the parsed HTML table rows or the raw JSON text to a file.
     */
    public class GlodonFinanceInfo {

        /**
         * Downloads the hard-coded quote page and JSON endpoint and prints the JSON
         * response to standard output.
         *
         * @param args ignored; the URLs and the output path are hard-coded below
         */
        public static void main(String[] args) {
            String url = "http://quote.eastmoney.com/sz002410.html?StockCode=002410";
            String jsonUrl = "http://nuff.eastmoney.com/EM_Finance2015TradeInterface/JS.ashx?id=0024102";
            String filePath = "E:\\text2.txt";

            GlodonFinanceInfo info = new GlodonFinanceInfo();
            String html = info.getHtml(url, "gb2312");
            String json = info.getHtml(jsonUrl, "utf-8");
            System.out.println(json);

            // Persisting is intentionally left disabled, as in the original. To enable it,
            // call info.saveHtml(filePath, html) or info.saveJson(filePath, json).
        }

        /**
         * Performs an HTTP GET and decodes the response body with the given charset.
         *
         * <p>I/O failures (including an unsupported charset name) are printed to
         * standard error and result in an empty string. Unchecked exceptions, such as
         * the one thrown for a malformed URL, propagate to the caller instead of being
         * silently discarded.
         *
         * @param url  address to fetch
         * @param code charset name used to decode the response bytes, e.g. "utf-8"
         * @return the decoded body when the status code is 200 and a body is present;
         *         otherwise an empty string
         */
        private String getHtml(String url, String code) {
            String html = "";
            // try-with-resources closes the response first, then the client, on every path.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
                int statusCode = response.getStatusLine().getStatusCode();
                // A response may carry no entity; treat that like a non-200 answer.
                if (statusCode == 200 && response.getEntity() != null) {
                    byte[] bytes = EntityUtils.toByteArray(response.getEntity());
                    html = new String(bytes, code);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            // Returning here (not from a finally block) keeps unchecked exceptions visible.
            return html;
        }

        /**
         * Parses {@code text} as HTML, selects the {@code tr} rows under elements with
         * CSS class {@code cwzb}, and writes every row except the first to
         * {@code filePath} as UTF-8: cell texts joined by {@code $}, one row per
         * CRLF-terminated line. The target file is created or truncated.
         *
         * <p>I/O failures are printed to standard error.
         *
         * @param filePath path of the file to write
         * @param text     HTML document source
         */
        private void saveHtml(String filePath, String text) {
            try (FileOutputStream out = new FileOutputStream(new File(filePath));
                 OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8")) {
                Document parse = Jsoup.parse(text);
                Elements table = parse.select(".cwzb");
                Elements tr = table.select("tr");

                // Row 0 is skipped; iteration starts at the second row.
                for (int i = 1; i < tr.size(); i++) {
                    if (i == 4) {
                        // Removes every <div> nested in ANY selected row, so it affects
                        // the text of row 4 and all rows after it.
                        // NOTE(review): presumably only row 4 was meant to be stripped;
                        // confirm against the page markup before narrowing this.
                        tr.select("div").remove();
                    }

                    Elements td = tr.get(i).select("td");
                    for (int j = 0; j < td.size(); j++) {
                        writer.write(td.get(j).text());
                        // Separator between cells only, not after the last one.
                        if (j < td.size() - 1) {
                            writer.write("$");
                        }
                    }
                    writer.write("\r\n");
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /**
         * Writes {@code text} unchanged to {@code filePath} as UTF-8, creating or
         * truncating the file.
         *
         * <p>I/O failures are printed to standard error.
         *
         * @param filePath path of the file to write
         * @param text     content to store
         */
        private void saveJson(String filePath, String text) {
            try (FileOutputStream out = new FileOutputStream(new File(filePath));
                 OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8")) {
                // The original opened the file but never wrote the payload.
                writer.write(text);
            } catch (IOException e) {
                // Covers FileNotFoundException and UnsupportedEncodingException as well.
                e.printStackTrace();
            }
        }
    }

    相关文章

      网友评论

          本文标题:JAVA抓取网页方法

          本文链接:https://www.haomeiwen.com/subject/kyqwvxtx.html