美文网首页
微信文章爬虫获取

微信文章爬虫获取

作者: LauEl | 来源:发表于2020-01-15 08:26 被阅读0次
    /**
         * Scrape a WeChat official-account article and return its title and cleaned-up HTML body.
         *
         * The article link is read from the POST field "url"; the method itself takes no arguments.
         * Two link shapes are handled:
         *   - a short share link (https://mp.weixin.qq.com/s/<id>): the page is fetched first and the
         *     canonical article URL is taken from its og:url meta tag;
         *   - any other mp.weixin.qq.com link: fetched directly.
         *
         * After extraction the markup is normalised according to $label_replace and, when $is_up_oss
         * is true, every image is re-uploaded through Oss::uploadByString() and its URL rewritten.
         *
         * @return mixed The result of $this->error('匹配失败') on any failure, otherwise the result of
         *               $this->success() carrying the keys 'title' and 'content'.
         */
        public function search()
        {
            $is_up_oss     = true;  // whether images are re-uploaded to OSS
            $label_replace = '<p>'; // tag rewrite mode: 'p', 'span', or '<p>' (strip all attributes)

            $url = request()->post('url');

            try {
                if (!is_string($url) || trim($url) === '') {
                    throw new \RuntimeException('No article URL supplied');
                }
                $url = trim($url);

                // Short share links need one extra hop: the real article URL is in the og:url meta tag.
                if (preg_match("/https:\/\/mp\.weixin\.qq\.com\/s\/[a-zA-Z0-9]*$/", $url) === 1) {
                    $wx_content = $this->fetchWeixinPage($url);
                    $rule = "/(?<=<meta property=\"og:url\" content=\").*?(?=\" \/>)/";
                    if (preg_match($rule, $wx_content, $api_url) !== 1) {
                        throw new \RuntimeException('og:url meta tag not found');
                    }
                    $content_url = $api_url[0];
                } else {
                    $content_url = $url;
                }

                // Fetch the real article page.
                $wx_article = $this->fetchWeixinPage($content_url);

                // Extract the title.
                $title_rule = "/\<h2 class=\"rich_media_title\".*?id=\"activity-name\".*?\>\K[\s\S]*?(?=\<\/h2\>)/";
                if (preg_match($title_rule, $wx_article, $titles) !== 1) {
                    throw new \RuntimeException('Article title not found');
                }
                $title = trim($titles[0]);

                // Extract the article body.
                $content_rule = "/\<div class=\"rich_media_content \".*?id=\"js_content\".*?\>\K[\s\S]*?(?=\<ul id=\"js_hotspot_area\" class=\"article_extend_area\"\>\<\/ul\>)/";
                if (preg_match($content_rule, $wx_article, $contents) !== 1) {
                    throw new \RuntimeException('Article content not found');
                }
                $content = $contents[0];

                switch ($label_replace) {
                    case 'span':
                        // Turn block-level tags into display:block spans (kept as spans so a rich-text
                        // editor does not rewrite them).
                        $content = preg_replace(["/<section(.*?)style=\"/","/<div(.*?)style=\"/","/<p(.*?)style=\"/"], ['<span$1style="display:block;','<span$1style="display:block;','<span$1style="display:block;'], $content);
                        // Rewrite the matching closing tags as well.
                        $content = strtr($content, ['data-src' => 'src','</section>' => '</span>','</div>' => '</span>','</p>' => '</span>']);
                        break;
                    case 'p':
                        // Turn block-level tags into <p> and styled spans into inline <p> (layout may break).
                        $content = preg_replace(["/<section/","/<div/","/<span(.*?)style=\"/"], ['<p','<p','<p$1style="display:inline;'], $content);
                        // Rewrite the matching closing tags as well.
                        $content = strtr($content, ['</section>' => '</p>','</div>' => '</p>','</span>' => '</p>']);
                        break;
                    case '<p>':
                        // Drop every attribute and collapse the listed tags into bare <p> / </p>.
                        $content = preg_replace("/(<\/?)(div|p|a|span|section|h1|h2|h3|h4|h5|h6|strong|table|tr|th|td|ul|li)[\s\S]*?(>)/", '$1p$3', $content);
                        break;
                }

                $preg_replace = [
                    '/<p>\s*?(<br>)*?\s*?<\/p>\n?/' => '', // remove empty paragraphs
                    '/<\/p>\s*?(<br>)*?\s*?<p>\n?/' => '', // remove line breaks between two paragraphs
                    '/(<img.*?>)/' => '<p>$1</p>', // wrap every image in its own paragraph
                ];
                $content = preg_replace(array_keys($preg_replace), array_values($preg_replace), $content);

                // preg_replace() yields null on a PCRE error (e.g. backtrack limit); fail instead of
                // silently returning an empty article.
                if (!is_string($content)) {
                    throw new \RuntimeException('Content normalisation failed');
                }

                // Promote the lazy-load attribute so images actually carry a src.
                $content = strtr($content, ['data-src' => 'src']);

                // Re-host the images on OSS.
                if ($is_up_oss) {
                    $img_rule = "/<img.*?src=\"\K[^\"]+/";
                    preg_match_all($img_rule, $content, $imgs);
                    $img_strtr = [];
                    // array_unique: an image used several times only needs one upload.
                    foreach (array_unique($imgs[0]) as $v) {
                        $img_data   = file_get_contents($v);
                        // A failed download is handled like a failed upload rather than uploading `false`.
                        $oss_result = $img_data === false
                            ? null
                            : Oss::uploadByString($img_data, 'wx_tencent_img');

                        if ($oss_result !== null && $oss_result['state'] == 1) {
                            $img_strtr[$v] = Img($oss_result['data']['path']);
                        } else {
                            // Remove the image that could not be re-hosted. [^>]* keeps the match inside
                            // this single tag, and preg_quote() escapes every regex metacharacter in the URL.
                            $pattern  = '/<img[^>]*?src="' . preg_quote($v, '/') . '"[^>]*>/';
                            $stripped = preg_replace($pattern, '<br>', $content);
                            if (is_string($stripped)) {
                                $content = $stripped;
                            }
                        }
                    }
                    $content = strtr($content, $img_strtr);
                }
            } catch (\Exception $e) {
                return $this->error('匹配失败');
            }

            return $this->success('成功', '', [
                'title' => $title,
                'content'  => $content
            ]);
        }

        /**
         * Download a page, accepting only http(s) URLs on mp.weixin.qq.com.
         *
         * The URL originates from user input, so it is checked before being handed to
         * file_get_contents(); otherwise local paths or arbitrary hosts could be read.
         *
         * @param string $url Absolute URL of the page to download.
         * @return string Non-empty response body.
         * @throws \RuntimeException When the URL is not an allowed link or the download fails.
         */
        private function fetchWeixinPage($url)
        {
            $parts  = parse_url($url);
            $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : '';
            $host   = isset($parts['host']) ? strtolower($parts['host']) : '';

            if (!in_array($scheme, ['http', 'https'], true) || $host !== 'mp.weixin.qq.com') {
                throw new \RuntimeException('URL is not a WeChat article link');
            }

            $html = file_get_contents($url);
            if ($html === false || $html === '') {
                throw new \RuntimeException('Unable to download page');
            }

            return $html;
        }
    

    相关文章

      网友评论

          本文标题:微信文章爬虫获取

          本文链接:https://www.haomeiwen.com/subject/zygzactx.html