http协议下的网络爬虫

作者: KingSun_阳 | 来源:发表于2015-12-10 09:33 被阅读0次

http协议下的网络爬虫
爬虫简介
Python库之网络爬虫
Robots协议
Python爬虫基础教程之requests模块
Python 爬虫协议及建议
爬虫的"盗亦有道"-Robots协议
人生不得已——Python爬虫 robots协议
关于爬虫
爬虫入门系列（六）：正则表达式完全指南（下）

主管让做个抓取淘宝数据的功能，但是淘宝的比较难，我先从扒新浪新闻开始。

环境：Apache 提供免费的 HttpClient 源码和 JAR 包下载，可以登录这里下载，笔者用的是 4.5.1 版本。

参考 Apache 提供的例子，使用正则表达式做出如下程序。

```java


/**
 * Minimal crawler built on Apache HttpClient: downloads an index page, follows every
 * single-quoted absolute {@code http://} link found in it, and prints the title and the
 * plain-text paragraphs of each linked page to standard output.
 *
 * <p>Extraction is done with regular expressions, so only {@code <title>} and {@code <p>}
 * elements that contain no nested tags are recognised.
 */
public class Main {

    /** Matches the page title; group 1 is the text between the tags. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<title>([^<]*)</title>");

    /** Matches a paragraph that contains no nested markup; group 1 is its text. */
    private static final Pattern PARAGRAPH_PATTERN = Pattern.compile("<p>([^<]*)</p>");

    /** Matches a single-quoted absolute http link; group 1 is the URL itself. */
    private static final Pattern LINK_PATTERN = Pattern.compile("href='(http://[^']*)'");

    /** Index page whose links are crawled by {@link #main(String[])}. */
    private static final String START_URL = "http://news.sina.com.cn/hotnews/";

    /**
     * Downloads {@code url} and returns the response body as text, logging the request
     * URI and the status line on the way.
     *
     * @param url      address to fetch
     * @param encoding charset name handed to {@code EntityUtils.toString} as the default
     *                 to use when the response does not declare one
     * @return the decoded body, or an empty string when the response carries no entity
     * @throws java.io.IOException if the request, the read, or closing a resource fails
     */
    private static String fetch(String url, String encoding) throws java.io.IOException {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            HttpGet httpget = new HttpGet(url);
            System.out.println("Executing request " + httpget.getURI());
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                HttpEntity entity = response.getEntity();
                // A response is not required to have a body, and EntityUtils.toString
                // rejects a null entity, so treat "no body" as empty text.
                if (entity == null) {
                    return "";
                }
                // Reading the entity to the end releases the connection; no abort is needed.
                return EntityUtils.toString(entity, encoding);
            } finally {
                response.close();
            }
        } finally {
            httpclient.close();
        }
    }

    /**
     * Fetches one article page and prints its title (prefixed with {@code ---}) followed
     * by every tag-free paragraph, one per line.
     *
     * <p>URLs containing {@code "comments"} are decoded with utf-8 as the default charset,
     * all others with gbk. The chosen charset name is printed before the request is made.
     *
     * @param url address of the page to fetch; must not be {@code null}
     * @throws Exception if the page cannot be fetched or read
     */
    public static void Detail(String url) throws Exception {
        String encoding = url.contains("comments") ? "utf-8" : "gbk";
        System.out.println(encoding);

        String html = fetch(url, encoding);

        Matcher title = TITLE_PATTERN.matcher(html);
        if (title.find()) {
            System.out.println("---" + title.group(1));
        }

        Matcher paragraph = PARAGRAPH_PATTERN.matcher(html);
        while (paragraph.find()) {
            System.out.println(paragraph.group(1));
        }
    }

    /**
     * Entry point: fetches the index page, then prints each discovered link, the details
     * of the page it points to, and a running 1-based counter.
     *
     * <p>A failure on one linked page is reported on standard error and the crawl moves on
     * to the next link; only a failure to fetch the index page itself is propagated.
     *
     * @param args unused
     * @throws Exception if the index page cannot be fetched or read
     */
    public static void main(String[] args) throws Exception {
        String index = fetch(START_URL, "UTF-8");

        Matcher link = LINK_PATTERN.matcher(index);
        int i = 1;
        while (link.find()) {
            String url = link.group(1);
            System.out.println(url);
            try {
                Detail(url);
            } catch (Exception e) {
                // One dead or malformed link must not stop the rest of the crawl.
                System.err.println("Skipping " + url + ": " + e);
            }
            System.out.println(i++);
        }
    }
}
```