美文网首页
Curl爬虫案例

Curl爬虫案例

作者: aoshi | 来源:发表于2021-11-23 11:19 被阅读0次

    新版

    <?php
    /**
     * Created by PhpStorm.
     * User: aoshi
     * Date: 2020/12/28
     * Time: 15:18
     */
    
    namespace Cron\Controller;
    
    class CrawltestController extends BaseController
    {
    
        protected $cookie = array();
        protected $referer = '';
    
        /**
         * Log in to the target site, then crawl the first ten list pages.
         *
         * Flow:
         *  1. GET the login URL so the server can issue its session cookies.
         *  2. POST the login form with those cookies attached.
         *  3. Fetch the list pages 100 rows at a time, sending the previously
         *     visited list page as Referer, and hand each page body to explanHtml().
         *
         * The method dumps the collected cookies and terminates the script when done.
         *
         * @return void  never returns normally; the script exits.
         */
        public function login() {
            // Step 1: plain GET to collect the initial cookies.
            $url = '********';
            $agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36';
            $res = $this->curlCore($url,'get',array('agent'=>$agent));
            if(!$res) {
                exit('this is over width first curl error');
            }
            // explainHeader() appends every Set-Cookie value to $this->cookie.
            $this->explainHeader($res['response_header']);

            // Step 2: submit the login form; getHeader() now includes the cookies from step 1.
            $data = array(
                'account'=>'****.hz.cn',
                'password'=>'*****',
                'Sumit'=>'submit'
            );
            $params = array(
                'content_type'=>'urlencode',
                'data'=>$data,
                'referer'=>$url,
                'agent'=>$agent,
                'headers'=>$this->getHeader('post'),
            );

            $resSecond = $this->curlCore($url,'post',$params);
            // Check the result of the POST itself (the first response was already verified above).
            if(!$resSecond) {
                exit('this is over width second curl error');
            }
            $this->explainHeader($resSecond['response_header']);

            // Step 3: walk the list pages.
            $baseUrl = '********';
            $referBase = '*******';
            $listUrl = '';      // no list page visited yet
            for($i = 0;$i<10;$i++) {
                // Referer is the previously requested list page, or the base referer on the first pass.
                $this->referer = $listUrl ? $listUrl : $referBase;
                $listUrl = $i ? $baseUrl . '?offset=' . $i *100 : $baseUrl;

                $params = array(
                    'referer'=>$this->referer,
                    'agent'=>$agent,
                    'headers'=>$this->getHeader(),
                );
                $listRes = $this->curlCore($listUrl,'get',$params);
                if($listRes) {
                    // curlCore() returns an array; the parser needs only the HTML body.
                    $this->explanHtml($listRes['response_body']);
                } else {
                    echo 'list request failed: ' . $listUrl . PHP_EOL;
                }
                sleep(2);       // throttle between page requests
            }

            var_export($this->cookie);
            exit;
        }
    
        /**
         * Build the request header lines passed to CURLOPT_HTTPHEADER.
         *
         * A Cookie header is included only when $this->cookie holds at least one
         * entry; the entries are joined into a single "a=b; c=d" line.
         *
         * @param   string  $method     HTTP method, case-insensitive. 'post' produces a
         *                              form-urlencoded Content-Type; any other value produces
         *                              a JSON Content-Type plus an Accept header.
         * @return  array   list of "Name: value" strings
         * */
        public function getHeader($method = 'get') {
            $method = strtoupper($method);
            $headersMap = array();

            // Skip the Cookie header entirely while no cookie has been collected.
            if($this->cookie) {
                $headersMap['Cookie'] = $this->cookie;
            }

            if($method == 'POST') {
                $headersMap['Content-Type'] = 'application/x-www-form-urlencoded';
            } else {
                // The value must not repeat the header name.
                $headersMap['Content-Type'] = 'application/json;charset=utf-8';
                $headersMap['Accept'] = 'application/json';
            }

            $headers = array();
            foreach($headersMap as $headerKey => $headerVal) {
                if(is_array($headerVal)) {
                    // Collapse a multi-part value onto one line: strip the trailing
                    // "; " left on each stored piece and re-join with a single "; ".
                    $pieces = array();
                    foreach($headerVal as $piece) {
                        $piece = rtrim(trim($piece), "; \t");
                        if($piece !== '') {
                            $pieces[] = $piece;
                        }
                    }
                    $headerVal = implode('; ',$pieces);
                }
                // No whitespace is allowed between the field name and the colon.
                $headers[] = $headerKey . ': ' . $headerVal;
            }
            return $headers;
        }
    
        /**
         * Core cURL wrapper: perform one HTTP request and return the response split
         * into header and body, together with timing information.
         *
         * @param   string      $url        request URL
         * @param   string      $method     HTTP method: GET, POST, PUT or DELETE (case-insensitive)
         * @param   array       $params     optional settings; every key may be omitted:
         *                                    content_type  'urlencode' | 'json' - how an array `data` is encoded;
         *                                                  any other value passes the array to cURL unchanged
         *                                    data          request body (array, or an already encoded string)
         *                                    referer       Referer header value
         *                                    agent         User-Agent header value
         *                                    headers       list of raw header lines for CURLOPT_HTTPHEADER
         *                                    time_out      total timeout in seconds (default 10)
         *                                    ssl           truthy to enable peer/host certificate verification explicitly
         *                                    cacert_pem    CA bundle file, used when `ssl` is set
         *                                    cacert_path   CA certificate directory, used when `ssl` is set
         * @return  array|false  false when curl_exec() fails (the reason is stored in $this->error);
         *                       otherwise an array with the keys response_header, response_body,
         *                       requestHeader, connectTime, preTransferTime, startTransferTime,
         *                       redirectTime, totalTime and responseContentType
         * */
        public function curlCore($url,$method = 'GET',$params = array()) {
            $method = strtoupper($method);
            $timeOut = !empty($params['time_out']) ? $params['time_out'] : 10;

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);

            if(!empty($params['ssl'])) {
                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
                // 2 = verify that the certificate name matches the host; 1 is no longer supported by libcurl.
                curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
                if(!empty($params['cacert_pem'])) {
                    curl_setopt($ch, CURLOPT_CAINFO, $params['cacert_pem']);
                }
                if(!empty($params['cacert_path'])) {
                    curl_setopt($ch, CURLOPT_CAPATH, $params['cacert_path']);
                }
            }

            // Encode the request body. A non-array body (e.g. a pre-built string) is sent as given.
            $data = isset($params['data']) ? $params['data'] : '';
            if(is_array($data)) {
                $contentType = isset($params['content_type']) ? $params['content_type'] : '';
                switch($contentType) {
                    case 'urlencode':       // key1=val1&key2=val2, for application/x-www-form-urlencoded
                        $data = http_build_query($data);
                        break;
                    case 'json':
                        $data = json_encode($data);
                        break;
                    default:                // array passed through: cURL sends it as multipart/form-data
                        break;
                }
            }

            switch($method) {
                case 'GET':
                    curl_setopt($ch, CURLOPT_HTTPGET, true);
                    break;
                case 'POST':
                case 'PUT':
                case 'DELETE':
                    // NOTE(review): a custom request method is reused by libcurl when it follows
                    // redirects, so a redirect after POST is re-sent as POST - confirm this is intended.
                    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
                    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
                    break;
            }

            // Redirect handling: follow at most 3 redirects and fill in Referer automatically.
            curl_setopt($ch, CURLOPT_AUTOREFERER, true);
            curl_setopt($ch, CURLOPT_MAXREDIRS, 3);
            curl_setopt($ch, CURLOPT_UNRESTRICTED_AUTH, true);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

            curl_setopt($ch, CURLOPT_HEADER, true);             // response headers are returned in front of the body
            curl_setopt($ch, CURLINFO_HEADER_OUT, true);        // record the outgoing request headers for curl_getinfo()
            curl_setopt($ch, CURLOPT_NOBODY, false);
            if(!empty($params['referer'])) {
                curl_setopt($ch, CURLOPT_REFERER, $params['referer']);
            }
            if(!empty($params['agent'])) {
                curl_setopt($ch, CURLOPT_USERAGENT, $params['agent']);
            }

            curl_setopt($ch, CURLOPT_TIMEOUT, $timeOut);
            if(!empty($params['headers'])) {
                curl_setopt($ch, CURLOPT_HTTPHEADER, $params['headers']);
            }

            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);     // return the response instead of printing it
            $responseStream = curl_exec($ch);                   // header block(s) followed by the body

            $responseHeaderSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $requestHeader = curl_getinfo($ch, CURLINFO_HEADER_OUT);
            $connectTime = curl_getinfo($ch, CURLINFO_CONNECT_TIME);
            $preTransferTime = curl_getinfo($ch, CURLINFO_PRETRANSFER_TIME);
            $startTransferTime = curl_getinfo($ch, CURLINFO_STARTTRANSFER_TIME);
            $redirectTime = curl_getinfo($ch, CURLINFO_REDIRECT_TIME);
            $totalTime = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
            $responseContentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            $curlError = curl_error($ch);       // must be read before the handle is closed

            curl_close($ch);

            if($responseStream === false) {
                // Report the real cURL failure; fall back to the generic message when none is available.
                $this->error = $curlError !== '' ? $curlError : '参数错误';
                return false;
            }

            $responseHeader = substr($responseStream, 0, $responseHeaderSize);
            $responseBody = substr($responseStream, $responseHeaderSize);
            $responseBody = $responseBody ? $responseBody : '';     // substr() may yield false for an empty body

            return array(
                'response_header'=>$responseHeader,
                'response_body'=>$responseBody,
                'requestHeader'=>$requestHeader,
                'connectTime'=>$connectTime,
                'preTransferTime'=>$preTransferTime,
                'startTransferTime'=>$startTransferTime,
                'redirectTime'=>$redirectTime,
                'totalTime'=>$totalTime,
                'responseContentType'=>$responseContentType,
            );
        }
    
        /**
         * Send a GET or form POST request, dump the request/response headers and
         * record any cookies the response sets.
         *
         * @param   string  $url         request URL
         * @param   string  $referUrl    Referer header value; skipped when empty
         * @param   array   $headers     raw header lines for CURLOPT_HTTPHEADER; skipped when empty
         * @param   int     $requestTyp  2 sends a form-urlencoded POST with $data; any other value sends a GET
         * @param   array   $data        POST fields, used only when $requestTyp is 2
         * @return  string  raw response: header block followed by the body.
         *                  The script exits if the transfer fails.
         * */
        protected function curlRequest($url,$referUrl,$headers,$requestTyp = 1,$data = array()) {
            $agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36';

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_HEADER, true);             // include response headers in the returned string
            curl_setopt($ch, CURLINFO_HEADER_OUT, true);        // record outgoing request headers
            if($referUrl){
                curl_setopt($ch, CURLOPT_REFERER, $referUrl);
            }
            curl_setopt($ch, CURLOPT_USERAGENT, $agent);
            curl_setopt($ch, CURLOPT_TIMEOUT, 10);
            if($headers) {
                curl_setopt($ch, CURLOPT_HTTPHEADER,$headers);
            }

            if($requestTyp == 2) {
                curl_setopt($ch, CURLOPT_POST, true);
                curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($data));
            }

            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            $return_str = curl_exec($ch);

            $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $requestStr = curl_getinfo($ch, CURLINFO_HEADER_OUT);

            // Close the handle exactly once, after all info has been read.
            curl_close($ch);

            // Bail out before touching the response: curl_exec() returns false on failure.
            if($return_str === false) {
                exit('error with stop');
            }

            $header = substr($return_str, 0, $header_size);

            var_export($requestStr);
            echo PHP_EOL;
            var_export($header);
            echo PHP_EOL;
            // explainHeader() takes the header text only; it stores Set-Cookie values in $this->cookie.
            $this->explainHeader($header);

            return $return_str;
        }
    
        /**
         * Send a GET or form POST request, dump the request headers and record any
         * cookies the response sets.
         *
         * @param   string  $url         request URL
         * @param   string  $referUrl    Referer header value; skipped when empty
         * @param   array   $headers     raw header lines for CURLOPT_HTTPHEADER; skipped when empty
         * @param   int     $requestTyp  2 sends a form-urlencoded POST with $data; any other value sends a GET
         * @param   array   $data        POST fields, used only when $requestTyp is 2
         * @return  string  raw response: header block followed by the body.
         *                  The script exits if the transfer fails.
         * */
        protected function curlRequestNew($url,$referUrl,$headers,$requestTyp = 1,$data = array()) {
            $agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36';

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_HEADER, true);             // include response headers in the returned string
            curl_setopt($ch, CURLINFO_HEADER_OUT, true);        // record outgoing request headers for curl_getinfo()
            if($referUrl){
                curl_setopt($ch, CURLOPT_REFERER, $referUrl);
            }
            curl_setopt($ch, CURLOPT_USERAGENT, $agent);
            curl_setopt($ch, CURLOPT_TIMEOUT, 10);
            if($headers) {
                curl_setopt($ch, CURLOPT_HTTPHEADER,$headers);
            }

            if($requestTyp == 2) {
                // Regular form POST; the fields are sent as an urlencoded string.
                curl_setopt($ch, CURLOPT_POST, true);
                curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($data));
            }

            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);     // return the response instead of printing it
            $return_str = curl_exec($ch);

            $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $requestStr = curl_getinfo($ch, CURLINFO_HEADER_OUT);

            // Close the handle exactly once, after all info has been read.
            curl_close($ch);

            // Bail out before touching the response: curl_exec() returns false on failure.
            if($return_str === false) {
                exit('error with stop');
            }

            $header = substr($return_str, 0, $header_size);

            var_export($requestStr);
            echo PHP_EOL;
            // explainHeader() takes the header text only; it stores Set-Cookie values in $this->cookie.
            $this->explainHeader($header);

            return $return_str;
        }
    
        /**
         * Parse a raw response header block into a map and collect its cookies.
         *
         * Each Set-Cookie value, cut off at its "path=/" attribute when present, is
         * appended to $this->cookie. The split header lines are dumped with
         * var_export() as debug output.
         *
         * @param   string      $header     raw response header text with CRLF line endings
         * @return  array       lower-cased header name => trimmed value. A header that occurs
         *                      several times maps to a flat list of its values, newest first.
         *                      When present, 'set-cookie' maps to the full $this->cookie list.
         * */
        public function explainHeader($header) {
            $headArr = explode("\r\n",$header);
            var_export($headArr);
            $map = array();
            foreach($headArr as $val) {
                $pos = strpos($val,':');
                // Skip lines without a "name:" prefix (status lines, blank separators).
                if(!$pos) {
                    continue;
                }
                $mapKey = strtolower(trim(substr($val,0,$pos)));        // header name
                $mapValue = trim(substr($val,($pos+1)));                // header value

                if($mapKey == 'set-cookie') {
                    // Drop the path restriction and everything after it; attribute names are case-insensitive.
                    $subLength = stripos($mapValue,'path=/');
                    if($subLength) {
                        $mapValue = substr($mapValue,0,$subLength);
                    }
                    $this->cookie[] = $mapValue;
                }

                $mapValue = trim($mapValue);
                if(!isset($map[$mapKey])) {
                    $map[$mapKey] = $mapValue;
                } elseif(is_array($map[$mapKey])) {
                    // Third and later occurrences: keep the list flat, newest first.
                    array_unshift($map[$mapKey],$mapValue);
                } else {
                    $map[$mapKey] = array($mapValue,$map[$mapKey]);
                }
            }
            if(isset($map['set-cookie'])) {
                $map['set-cookie'] = $this->cookie;
            }
            return $map;
        }
    
    
        /**
         * Crawl the first ten list pages and pass each page's HTML to explanHtml().
         *
         * Pages are requested 100 rows apart through the "offset" query parameter.
         * The previously requested page is sent as Referer; the first request uses
         * the base referer. The script exits when the loop finishes.
         * */
        public function getList() {
            $baseUrl = '*************';
            $referBase = '***********************';
            $url = '';
            $page = 0;
            while($page < 10) {
                // $url still holds the page fetched on the previous pass (empty on the first one).
                $referUrl = $url ? $url : $referBase;
                $url = $page ? $baseUrl . '?offset=' . $page *100 : $baseUrl;

                $this->explanHtml($this->curlGet($url,$referUrl));
                sleep(2);

                $page++;
            }
            exit('this is over');
        }
    
        /**
         * Fetch a URL with a GET request and return the response body.
         *
         * A fixed Cookie header and browser User-Agent are sent with every request.
         * NOTE(review): certificate verification is switched off here and the session
         * cookie is hard-coded - confirm both are acceptable before reusing this.
         *
         * @param   string  $url        request URL
         * @param   string  $referUrl   Referer header value; skipped when empty
         * @return  string|false        response body, or false when curl_exec() fails
         * */
        protected function curlGet($url,$referUrl) {
            $options = array(
                CURLOPT_SSL_VERIFYPEER => FALSE,
                CURLOPT_SSL_VERIFYHOST => FALSE,
                CURLOPT_URL => $url,
                CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
                CURLOPT_TIMEOUT => 10,      // upper bound on the whole transfer, in seconds
                CURLOPT_HTTPHEADER => array(
                    "Content-type:application/json;charset=utf-8",
                    "Accept:application/json",
                    "Cookie: ASPSESSIONIDACQSQDSB=GHGNMIJACHNKAHHMJLEGMIDO; ASPSESSIONIDCAQRRDSA=KCDNMALAGDKKMLKLIFECCION; User=UserLocation=HANGZHOU&UserNameEN=Coco+Cao&Logintime=28&UserDept=SALESHEAD&AccountName=cococao%2Ehz%2Ecn"
                ),
                CURLOPT_RETURNTRANSFER => true,
            );
            if($referUrl){
                $options[CURLOPT_REFERER] = $referUrl;
            }

            $handle = curl_init();
            curl_setopt_array($handle, $options);
            $response = curl_exec($handle);
            curl_close($handle);

            return $response;
        }
    
        /**
         * Extract company rows from a list page.
         *
         * Every match of the table pattern yields a link plus five text columns.
         * NOTE(review): the first extracted row is dumped and the script exits
         * immediately, so the database insert below is never reached - this looks
         * like leftover debugging; confirm before relying on persistence.
         *
         * @param   string  $htmlStr    HTML of one list page
         * */
        protected function explanHtml($htmlStr){
            $pattern = '/<table[^>]*?>\s*?<tr>\s*?<td[^>]*?><a href="(.*?)" class="style10">(.*?)<\/a><\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<\/tr>\s*?<\/table>/i';
            preg_match_all( $pattern , $htmlStr , $results );

            foreach($results[1] as $idx => $link) {
                // Capture groups 1-6 map onto the stored columns in order.
                $saveData = array(
                    'link' => $link,
                    'en_name' => $results[2][$idx],
                    'zh_name' => $results[3][$idx],
                    'nature' => $results[4][$idx],
                    'sales' => $results[5][$idx],
                    'supervisor' => $results[6][$idx],
                    'content' => '',
                );
                var_export($saveData);
                exit;

                try{
                    // With debug mode enabled the framework raises the error itself.
                    if(!M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->add($saveData)) {
                        throw new \Exception($saveData['zh_name']);
                    }
                } catch (\Exception $e) {
                    echo 'Caught exception: ' . $e->getMessage() . PHP_EOL;
                }
            }
        }
    
        /**
         * Download the detail page of each selected row and store its HTML in the
         * row's `content` column.
         *
         * The Referer is the list page the row is assumed to sit on, derived from
         * the row id rounded down to a multiple of 100. The query currently selects
         * only the row with id 1. The script exits when the loop finishes.
         * */
        public function getInfo() {
            $baseUrl = '***********************';
            $referBase = '****************************';
            $rows = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->field('id,link')->where('id = 1')->select();

            foreach($rows as $index => $row) {
                $offset = floor($row['id'] / 100) * 100;
                $referUrl = $offset ? $referBase . '?offset=' . $offset : $referBase;

                $detailHtml = $this->curlGet($baseUrl . $row['link'],$referUrl);
                M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->save(array('id'=>$row['id'],'content'=>$detailHtml));

                // Pause for a second on every 100th row (including the first).
                if($index % 100 == 0) {
                    sleep(1);
                }
            }
            exit('this is over');
        }
    
        /**
         * Report gaps in the id sequence: print every id from 1 to 999 for which
         * no row is found, one per line, then exit.
         * */
        public function testId() {
            $id = 1;
            while($id < 1000) {
                if(!M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->find($id)) {
                    echo $id . PHP_EOL;
                }
                $id++;
            }
            exit('this is over');
        }
    }
    
    <?php
    /**
     * Created by PhpStorm.
     * User: aoshi
     * Date: 2020/12/28
     * Time: 15:18
     */
    
    namespace Cron\Controller;
    
    class CrawltestController extends BaseController
    {
        /**
         * Crawl the first ten list pages and pass each page's HTML to explanHtml().
         *
         * Despite its name this method performs no login request; curlGet() sends a
         * fixed Cookie header with every request. Pages are requested 100 rows apart
         * through the "offset" query parameter, with the previously requested page as
         * Referer. The script exits when the loop finishes.
         * */
         public function login() {
             $baseUrl = '******************************';
             $referBase = '******************************';
             $url = '';      // no page requested yet, so the first pass uses $referBase
             for($i = 0;$i<10;$i++) {
                 if($url) {
                     $referUrl = $url;
                 } else {
                     $referUrl = $referBase;
                 }
                 if($i) {
                     $url = $baseUrl . '?offset=' . $i *100;
                 } else {
                     $url = $baseUrl;
                 }
                 $htmlStr = $this->curlGet($url,$referUrl);
                 $this->explanHtml($htmlStr);
                 sleep(2);       // throttle between page requests

             }
             exit('this is over');
         }
    
         /**
          * Fetch a URL with a GET request and return the response body.
          *
          * A fixed Cookie header and browser User-Agent are sent with every request.
          * NOTE(review): certificate verification is switched off here and the session
          * cookie is hard-coded - confirm both are acceptable before reusing this.
          *
          * @param   string  $url        request URL
          * @param   string  $referUrl   Referer header value; skipped when empty
          * @return  string|false        response body, or false when curl_exec() fails
          * */
         protected function curlGet($url,$referUrl) {
             $requestHeaders = array(
                 "Content-type:application/json;charset=utf-8",
                 "Accept:application/json",
                 "Cookie: ASPSESSIONIDACQSQDSB=GHGNMIJACHNKAHHMJLEGMIDO; ASPSESSIONIDCAQRRDSA=KCDNMALAGDKKMLKLIFECCION; User=UserLocation=HANGZHOU&UserNameEN=Coco+Cao&Logintime=28&UserDept=SALESHEAD&AccountName=cococao%2Ehz%2Ecn"
             );
             $userAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36';

             $options = array(
                 CURLOPT_SSL_VERIFYPEER => FALSE,
                 CURLOPT_SSL_VERIFYHOST => FALSE,
                 CURLOPT_URL => $url,
                 CURLOPT_USERAGENT => $userAgent,
                 CURLOPT_TIMEOUT => 10,      // upper bound on the whole transfer, in seconds
                 CURLOPT_HTTPHEADER => $requestHeaders,
                 CURLOPT_RETURNTRANSFER => true,
             );
             if($referUrl){
                 $options[CURLOPT_REFERER] = $referUrl;
             }

             $handle = curl_init();
             curl_setopt_array($handle, $options);
             $body = curl_exec($handle);
             curl_close($handle);

             return $body;
         }
    
         /**
          * Extract company rows from a list page and insert each one into the
          * PccAsicLlc table.
          *
          * Every match of the table pattern yields a link plus five text columns;
          * `content` is stored empty. A failed insert is reported on standard output
          * with the row's zh_name and does not stop the loop.
          *
          * @param   string  $htmlStr    HTML of one list page
          * */
         protected function explanHtml($htmlStr){
             $pattern = '/<table[^>]*?>\s*?<tr>\s*?<td[^>]*?><a href="(.*?)" class="style10">(.*?)<\/a><\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<\/tr>\s*?<\/table>/i';
             preg_match_all( $pattern , $htmlStr , $results );

             foreach($results[1] as $idx => $link) {
                 // Capture groups 1-6 map onto the stored columns in order.
                 $saveData = array(
                     'link' => $link,
                     'en_name' => $results[2][$idx],
                     'zh_name' => $results[3][$idx],
                     'nature' => $results[4][$idx],
                     'sales' => $results[5][$idx],
                     'supervisor' => $results[6][$idx],
                     'content' => '',
                 );
                 try{
                     // With debug mode enabled the framework raises the error itself.
                     if(!M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->add($saveData)) {
                         throw new \Exception($saveData['zh_name']);
                     }
                 } catch (\Exception $e) {
                     echo 'Caught exception: ' . $e->getMessage() . PHP_EOL;
                 }
             }
         }
    
         /**
          * Download the detail page of each selected row and store its HTML in the
          * row's `content` column.
          *
          * The Referer is the list page the row is assumed to sit on, derived from
          * the row id rounded down to a multiple of 100. The query currently selects
          * only the row with id 1. The script exits when the loop finishes.
          * */
         public function getInfo() {
             $baseUrl = '******************************/';
             $referBase = '******************************';
             $rows = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->field('id,link')->where('id = 1')->select();

             foreach($rows as $index => $row) {
                 $offset = floor($row['id'] / 100) * 100;
                 $referUrl = $offset ? $referBase . '?offset=' . $offset : $referBase;

                 $detailHtml = $this->curlGet($baseUrl . $row['link'],$referUrl);
                 M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->save(array('id'=>$row['id'],'content'=>$detailHtml));

                 // Pause for a second on every 100th row (including the first).
                 if($index % 100 == 0) {
                     sleep(1);
                 }
             }
             exit('this is over');
         }
    
         /**
          * Report gaps in the id sequence: print every id from 1 to 999 for which
          * no row is found, one per line, then exit.
          * */
         public function testId() {
             foreach(range(1, 999) as $id) {
                 $row = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->find($id);
                 if($row) {
                     continue;
                 }
                 echo $id . PHP_EOL;
             }
             exit('this is over');
         }
    }
    

    相关文章

      网友评论

          本文标题:Curl爬虫案例

          本文链接:https://www.haomeiwen.com/subject/arqtoktx.html