使用jsoup简单抓取应用市场数据

作者: 浪漫晨风 | 来源:发表于2018-05-15 10:19 被阅读73次

    统计本应用在各个应用市场的下载量是一项既麻烦又低效的工作,不符合广大程序员的工作习惯。我是做 Android 开发的,深知 Android 应用市场数量众多、逐个查看十分繁琐,于是百度了一下,从网上找到了 jsoup 这个库来解析网页,方便我们完成各应用市场下载量的统计工作。

    1.为了方便封装,便于扩展,封装了一个规则类,以及规则异常

    /**
     * Describes a single scraping request: the page to fetch, the request
     * parameters to send with it, and how to pick the element of interest out
     * of the returned HTML.
     */
    public class Rule {

        /** Request method constant: HTTP GET. */
        public static final int GET = 0;
        /** Request method constant: HTTP POST. */
        public static final int POST = 1;

        /** Selector kind: {@code resultTagName} is looked up as a class name. */
        public static final int CLASS = 0;
        /** Selector kind: {@code resultTagName} is looked up as an element id. */
        public static final int ID = 1;

        /** Address of the page to request. */
        private String url;

        /** Names of the request parameters. */
        private String[] params;

        /** Values of the request parameters, index-aligned with {@link #params}. */
        private String[] values;

        /**
         * Name used for the first filtering pass over the returned HTML; how it
         * is interpreted depends on {@link #type}, so set the type first.
         */
        private String resultTagName;

        /** How {@link #resultTagName} is interpreted; {@link #ID} unless changed. */
        private int type = ID;

        /** Request method, {@link #GET} or {@link #POST}; {@link #GET} unless changed. */
        private int requestMoethod = GET;

        /** Creates a rule with no URL or parameters and the default type and method. */
        public Rule() {
        }

        /**
         * Creates a fully specified rule.
         *
         * @param url            address of the page to request
         * @param params         request parameter names, may be {@code null}
         * @param values         request parameter values, may be {@code null}
         * @param resultTagName  class name or id used to select the result element
         * @param type           {@link #CLASS} or {@link #ID}
         * @param requestMoethod {@link #GET} or {@link #POST}
         */
        public Rule(String url, String[] params, String[] values,
                String resultTagName, int type, int requestMoethod) {
            this.url = url;
            this.params = params;
            this.values = values;
            this.resultTagName = resultTagName;
            this.type = type;
            this.requestMoethod = requestMoethod;
        }

        /** @return address of the page to request */
        public String getUrl() {
            return this.url;
        }

        /** @param url address of the page to request */
        public void setUrl(String url) {
            this.url = url;
        }

        /** @return request parameter names (the stored array itself, not a copy) */
        public String[] getParams() {
            return this.params;
        }

        /** @param params request parameter names */
        public void setParams(String[] params) {
            this.params = params;
        }

        /** @return request parameter values (the stored array itself, not a copy) */
        public String[] getValues() {
            return this.values;
        }

        /** @param values request parameter values */
        public void setValues(String[] values) {
            this.values = values;
        }

        /** @return class name or id used to select the result element */
        public String getResultTagName() {
            return this.resultTagName;
        }

        /** @param resultTagName class name or id used to select the result element */
        public void setResultTagName(String resultTagName) {
            this.resultTagName = resultTagName;
        }

        /** @return {@link #CLASS} or {@link #ID} */
        public int getType() {
            return this.type;
        }

        /** @param type {@link #CLASS} or {@link #ID} */
        public void setType(int type) {
            this.type = type;
        }

        /** @return {@link #GET} or {@link #POST} */
        public int getRequestMoethod() {
            return this.requestMoethod;
        }

        /** @param requestMoethod {@link #GET} or {@link #POST} */
        public void setRequestMoethod(int requestMoethod) {
            this.requestMoethod = requestMoethod;
        }

    }
    

    规则异常类

    /**
     * Unchecked exception signalling a problem with a scraping rule. Each
     * constructor simply forwards to the matching {@link RuntimeException}
     * constructor.
     */
    public class RuleException extends RuntimeException {

        /** Creates an exception with neither a detail message nor a cause. */
        public RuleException() {
            super();
        }

        /**
         * Creates an exception with a detail message only.
         *
         * @param message description of the problem
         */
        public RuleException(String message) {
            super(message);
        }

        /**
         * Creates an exception that wraps an underlying cause.
         *
         * @param cause the underlying failure
         */
        public RuleException(Throwable cause) {
            super(cause);
        }

        /**
         * Creates an exception with both a detail message and a cause.
         *
         * @param message description of the problem
         * @param cause   the underlying failure
         */
        public RuleException(String message, Throwable cause) {
            super(message, cause);
        }

    }
    

    2.抓取主要类:

    /**
     * Fetches the page described by a {@link Rule} with jsoup and returns one
     * element selected from the parsed document.
     */
    public class ExtractService
    {
        /** Timeout handed to jsoup's {@code Connection.timeout}, in milliseconds. */
        private static final int TIMEOUT_MILLIS = 100000;

        /**
         * Validates the rule, performs the HTTP request it describes and selects
         * the result element.
         *
         * @param rule the scraping rule; must have an http(s) URL
         * @return the first element with the given class for {@link Rule#CLASS};
         *         the first sibling of the element with the given id for
         *         {@link Rule#ID}; {@code null} when nothing matches, when the
         *         rule's type is neither of those, or when the request fails with
         *         an {@link IOException} (the stack trace is printed in that case)
         * @throws RuleException if the rule is null, its URL is empty or not
         *         http(s), its parameter names and values do not pair up, or its
         *         request method is neither {@link Rule#GET} nor {@link Rule#POST}
         */
        public static Element  extract(Rule rule)
        {
            validateRule(rule);

            String resultTagName = rule.getResultTagName();
            try
            {
                Document doc = fetch(rule);

                // Pick the result element out of the parsed document.
                switch (rule.getType())
                {
                case Rule.CLASS:
                    return doc.getElementsByClass(resultTagName).first();
                case Rule.ID:
                    Element byId = doc.getElementById(resultTagName);
                    // NOTE(review): this returns the first sibling of the matched
                    // element, which is not necessarily the element carrying the
                    // id - confirm that this is intended.
                    return byId == null ? null : byId.firstElementSibling();
                default:
                    return null;
                }
            } catch (IOException e)
            {
                // Callers have always received null on a failed request, so the
                // failure is reported on stderr rather than rethrown.
                e.printStackTrace();
                return null;
            }
        }

        /**
         * Builds the jsoup connection for the rule, attaches the request
         * parameters and executes the request with the rule's method.
         */
        private static Document fetch(Rule rule) throws IOException
        {
            Connection conn = Jsoup.connect(rule.getUrl());

            String[] params = rule.getParams();
            if (params != null)
            {
                // validateRule guarantees values has the same length as params.
                String[] values = rule.getValues();
                for (int i = 0; i < params.length; i++)
                {
                    conn.data(params[i], values[i]);
                }
            }

            conn.timeout(TIMEOUT_MILLIS);
            return rule.getRequestMoethod() == Rule.POST ? conn.post() : conn.get();
        }

        /**
         * Performs the necessary checks on the rule before any request is made.
         *
         * @throws RuleException if any check fails
         */
        private static void validateRule(Rule rule)
        {
            if (rule == null)
            {
                throw new RuleException("rule不能为空!");
            }

            String url = rule.getUrl();
            if (TextUtil.isEmpty(url))
            {
                throw new RuleException("url不能为空!");
            }
            if (!url.startsWith("http://") && !url.startsWith("https://"))
            {
                throw new RuleException("url的格式不正确!");
            }

            String[] params = rule.getParams();
            if (params != null)
            {
                // A missing values array counts as zero values, so parameter
                // names without values are rejected here instead of failing
                // later while the request is being built.
                String[] values = rule.getValues();
                int valueCount = values == null ? 0 : values.length;
                if (params.length != valueCount)
                {
                    throw new RuleException("参数的键值对个数不匹配!");
                }
            }

            int requestType = rule.getRequestMoethod();
            if (requestType != Rule.GET && requestType != Rule.POST)
            {
                throw new RuleException("请求类型不正确!");
            }
        }


    }
    

    3.运行主要类:先用浏览器的网页分析工具查看各应用市场页面的源码,找到下载量所在的元素,再据此配置规则,抓取我们想要的数值

        /**
         * Entry point: runs the 360, Baidu and Anzhi market queries one after
         * another; each one prints its own result line.
         *
         * @param args command-line arguments; not used
         */
        public static void main(String[] args) {
            Market360();
            MarketBaidu();
            MarketAnzhi();
        }
    
        /**
         * Searches http://zhushou.360.cn for the app by keyword and prints the
         * text of the first element with class {@code downNum}, prefixed with
         * {@code 360Market:}.
         *
         * NOTE(review): {@code ExtractService.extract} can return null, in which
         * case the {@code text()} call below throws a NullPointerException.
         */
        public static void Market360() {
            String[] queryKeys = { "kw" };
            String[] queryValues = { "名医网·健康E家" };
            Rule searchRule = new Rule("http://zhushou.360.cn/search/index/",
                    queryKeys, queryValues, "downNum", Rule.CLASS, Rule.GET);

            String downloads = ExtractService.extract(searchRule).text();
            System.out.println("360Market:" + downloads);
        }
    
        /**
         * Searches http://shouji.baidu.com for the app by keyword and prints the
         * text of the first element with class {@code download-num}, prefixed
         * with {@code baiduMarket:}.
         *
         * NOTE(review): {@code ExtractService.extract} can return null, in which
         * case the {@code text()} call below throws a NullPointerException.
         */
        public static void MarketBaidu() {
            // Request data is percent-encoded by jsoup when the request is sent,
            // so the "f" value is given in raw form; passing the pre-encoded
            // "%40" form would be encoded a second time (to "%2540").
            Rule rule = new Rule("http://shouji.baidu.com/s", new String[] { "wd",
                    "data_type", "f" }, new String[] { "名医网·健康E家", "app",
                    "header_all@input@btn_search" }, "download-num",
                    Rule.CLASS, Rule.GET);
            System.out
                    .println("baiduMarket:" + ExtractService.extract(rule).text());
        }
        /**
         * Fetches the app's page on http://www.anzhi.com (no request parameters)
         * and prints the text of the first element with class {@code spaceleft},
         * prefixed with {@code AnzhiMarket:}.
         *
         * NOTE(review): {@code ExtractService.extract} can return null, in which
         * case the {@code text()} call below throws a NullPointerException.
         */
        public static void MarketAnzhi() {
            String pageUrl = "http://www.anzhi.com/pkg/f187_com.zzu.ehome.main.ehome.html";
            Rule pageRule = new Rule(pageUrl, null, null, "spaceleft", Rule.CLASS, Rule.GET);

            String downloads = ExtractService.extract(pageRule).text();
            System.out.println("AnzhiMarket:" + downloads);
        }
    
    
    123.png

    相关文章

      网友评论

        本文标题:使用jsoup简单抓取应用市场数据

        本文链接:https://www.haomeiwen.com/subject/crcedftx.html