美文网首页扩展类
php网页内容抓取

php网页内容抓取

作者: 七百年前 | 来源:发表于2016-08-01 15:17 被阅读46次

    1. 根据目标网页的 HTML 结构编写抓取规则(下例先用 cURL 获取页面,再用正则表达式逐行解析表格并入库)

    /**
     * Invoke caiji() once for every id from 700 to 749 (inclusive) and dump
     * each id after its call has returned, as a crude progress indicator.
     *
     * @return void
     */
    public function comment(){
        foreach (range(700, 749) as $pageId) {
            $this->caiji($pageId);
            var_dump($pageId);
        }
    }
    
    /**
     * Download one remote page and store its table rows as Comment_cc records.
     *
     * The page is fetched with cURL, converted to UTF-8 and split into <tr>
     * rows with a regular expression. The first two rows are discarded; for
     * every remaining row, <td> cells 1 through 12 (cell 0 is ignored) are
     * mapped onto the record fields and inserted via M('Comment_cc')->add().
     * A cell that is absent from a row is stored as null.
     *
     * @param mixed $id Value appended to the base URL to form the page address.
     * @return void
     */
    private function caiji($id){
        $v = 'http://www.xx.com'.$id;

        // The earlier file_get_contents($url) call was removed: $url was never
        // defined and its result was never used, so it could only raise an error.
        $ch = curl_init();
        $timeout = 5;
        curl_setopt($ch, CURLOPT_URL, $v);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        $content = curl_exec($ch);
        // Release the handle on PHP versions where it is still a resource.
        curl_close($ch);

        // curl_exec() yields false on failure; there is nothing to parse then.
        if ($content === false || $content === '') {
            return;
        }

        $content = mb_convert_encoding($content, 'utf-8', 'GBK,UTF-8,ASCII');

        // Row-matching expression: captures every <tr>...</tr> element.
        $preg = "/<tr.*?>(.*?)<\/tr>/ism";
        if (!preg_match_all($preg, $content, $matches)) {
            return;
        }

        // Skip the first two rows of the page; only the rest are imported.
        $rows = array_slice($matches[0], 2);

        // Position of each <td> within a row => column of the stored record.
        $fields = array(
            1  => 's_id',
            2  => 'pid',
            3  => 'uid',
            4  => 'name',
            5  => 'car_type',
            6  => 'mobile',
            7  => 'zhan_id',
            8  => 'city',
            9  => 'pay',
            10 => 'title',
            11 => 'comment',
            12 => 'time',
        );

        foreach ($rows as $row) {
            preg_match_all("/<td.*?>(.*?)<\/td>/ism", $row, $cells);

            // Build a fresh array per row; appending to or indexing an empty
            // string is an error on PHP 7.1 and later.
            $record = array();
            foreach ($fields as $index => $field) {
                $record[$field] = isset($cells[1][$index]) ? $cells[1][$index] : null;
            }

            M('Comment_cc')->add($record);
        }
    }
    

    2. PHP 获取文章摘要(去除 HTML 标签和 HTML 实体后,按指定宽度截取)

    /**
     * Produce a plain-text excerpt from an HTML fragment.
     *
     * HTML tags are stripped first, then named entities of the form
     * "&letters;" are deleted. When $cut is numeric and greater than zero the
     * result is trimmed with mb_strimwidth() to a width of $cut, with $str
     * used as the trim marker; otherwise the cleaned text is returned whole.
     *
     * @param string $data HTML or text to summarise.
     * @param mixed  $cut  Target width; non-numeric or non-positive disables trimming.
     * @param string $str  Marker appended when the text is trimmed.
     * @return string Cleaned and optionally trimmed text.
     */
    function cutArticle($data,$cut=0,$str="....")
    {
        $plain = preg_replace("/&[a-zA-Z]+;/", '', strip_tags($data));

        if (is_numeric($cut) && $cut > 0) {
            return mb_strimwidth($plain, 0, $cut, $str);
        }

        return $plain;
    }

    相关文章

      网友评论

        本文标题:php网页内容抓取

        本文链接:https://www.haomeiwen.com/subject/pdnxsttx.html