php网页内容抓取

作者: 七百年前 | 来源:发表于2016-08-01 15:17 被阅读46次

php网页内容抓取
php实战开发网络爬虫，实现采集功能
PHP抓取网页
Python实用练手小案例
Python 爬虫_动态网页抓取
Python抓取网页内容乱码
puppeteer + nodejs 抓取网页内容
python爬虫(四)_urllib2库的基本使用
QueryList异步抓取网页数据
【HtmlUnit】网页爬虫进阶篇

1. 根据目标网页的 HTML 结构编写抓取规则

/**
 * Runs the scraper over a fixed range of page ids.
 *
 * Calls caiji() for every id from 700 up to and including 749 and dumps
 * each id with var_dump() once its page has been processed.
 */
public function comment()
{
    for ($i = 700; $i < 750; $i++) {
        // caiji() returns no value, so its result is not captured.
        $this->caiji($i);
        var_dump($i);
    }
}

/**
 * Downloads one page and stores the data rows of its HTML table.
 *
 * The page is fetched with cURL, converted to UTF-8 and split into <tr>
 * rows with a regular expression. The first two rows are discarded
 * (presumably header rows - confirm against the target page), and cells
 * 1 to 12 of every remaining row are saved through M('Comment_cc')->add().
 *
 * @param mixed $id Value appended to the base URL to build the page address.
 *
 * @return void
 */
private function caiji($id)
{
    // NOTE(review): the id is appended directly to the host name with no
    // separator; the base URL looks like a placeholder - confirm the real prefix.
    $v = 'http://www.xx.com'.$id;

    $ch = curl_init();
    $timeout = 5;
    curl_setopt($ch, CURLOPT_URL, $v);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    $content = curl_exec($ch);

    // curl_exec() yields false when the transfer fails; there is nothing to parse then.
    if ($content === false || $content === '') {
        return;
    }

    $content = mb_convert_encoding($content, 'utf-8', 'GBK,UTF-8,ASCII');

    $preg = "/<tr.*?>(.*?)<\/tr>/ism"; // captures every table row
    preg_match_all($preg, $content, $matches);

    // Drop the first two matched rows before processing the rest.
    unset($matches['0']['0']);
    unset($matches['0']['1']);

    if (empty($matches['0'])) {
        return;
    }

    // Column name => index of the <td> cell (cell 0 is not stored).
    $fields = [
        's_id'     => 1,
        'pid'      => 2,
        'uid'      => 3,
        'name'     => 4,
        'car_type' => 5,
        'mobile'   => 6,
        'zhan_id'  => 7,
        'city'     => 8,
        'pay'      => 9,
        'title'    => 10,
        'comment'  => 11,
        'time'     => 12,
    ];

    foreach ($matches['0'] as $row) {
        preg_match_all("/<td.*?>(.*?)<\/td>/ism", $row, $cells);

        // Build each record as a real array; the record used to start out as
        // a string, which cannot take string keys on current PHP versions.
        $record = [];
        foreach ($fields as $field => $index) {
            // A row with fewer cells stores null instead of raising a notice.
            $record[$field] = isset($cells['1'][$index]) ? $cells['1'][$index] : null;
        }

        M('Comment_cc')->add($record);
    }
}

2. PHP 获取文章摘要

/**
 * Builds a plain-text excerpt from an HTML fragment.
 *
 * HTML tags are stripped and HTML character references are removed (named
 * ones such as &nbsp; as well as numeric ones such as &#39; or &#x27;).
 * When $cut is a positive number the text is then trimmed to that display
 * width with mb_strimwidth(), and $str is appended as the trim marker.
 *
 * @param string $data HTML or text to summarise.
 * @param mixed  $cut  Maximum display width; 0, negative or non-numeric
 *                     values leave the text untruncated.
 * @param string $str  Marker appended when the text is truncated.
 *
 * @return string The cleaned and optionally truncated text.
 */
function cutArticle($data, $cut = 0, $str = "....")
{
    $data = strip_tags($data); // remove HTML markup

    // Remove named, decimal and hexadecimal character references.
    $pattern = "/&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#[xX][0-9a-fA-F]+);/";
    $data = preg_replace($pattern, '', $data);

    if (!is_numeric($cut)) {
        return $data;
    }

    // mb_strimwidth() expects an integer width; cast numeric strings and floats up front.
    $width = (int) $cut;
    if ($width > 0) {
        $data = mb_strimwidth($data, 0, $width, $str);
    }

    return $data;
}