微信文章爬虫获取

作者: LauEl | 来源:发表于2020-01-15 08:26 被阅读0次

「爬虫」11爬虫之微信爬虫
教你如何入手用python实现简单爬虫微信公众号并下载视频
微信文章爬虫获取
wechat_spider 微信爬虫
抓取微信公众号文章
2019-07-19 机灵鹤的日常
微信客户端公众号爬虫
基于Python对知网（CNKI）主题文献爬虫
Python爬虫项目整理
爬虫案例

/**
     * Scrape a WeChat Official Account article and return its title and cleaned-up HTML body.
     *
     * The article link is not a method argument: it is read from the POST field "url".
     * Two link shapes are handled:
     *  - a short link copied from inside WeChat (https://mp.weixin.qq.com/s/<id>): the page is
     *    fetched first and the real article address is taken from its og:url meta tag;
     *  - any other link (e.g. copied from a browser address bar): fetched directly.
     *
     * Because the URL is untrusted input, only http(s) URLs on mp.weixin.qq.com are fetched
     * for the article itself, and only http(s) URLs are fetched for images.
     *
     * @return mixed Whatever $this->success() returns (carrying 'title' and 'content'), or
     *               whatever $this->error() returns when fetching or matching fails.
     */
    public function search()
    {
        $is_up_oss     = true;  // whether article images are re-uploaded to Aliyun OSS
        $label_replace = '<p>'; // tag rewrite mode: 'span', 'p', or '<p>' (drop all attributes, every listed tag becomes <p>)

        $url = request()->post('url');

        // Hosts the article itself may be loaded from.
        $wx_hosts = ['mp.weixin.qq.com'];

        // Download a remote resource after validating the URL. Throws on any problem so that
        // callers inside the try block below end up in the common failure path.
        $fetch = static function ($target, $allowed_hosts = null) {
            $parts  = is_string($target) ? parse_url($target) : false;
            $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : '';
            $host   = isset($parts['host']) ? strtolower($parts['host']) : '';

            // Rejects file://, php://, data: and friends as well as malformed input.
            if (!in_array($scheme, ['http', 'https'], true) || $host === '') {
                throw new \RuntimeException('Unsupported URL');
            }
            if ($allowed_hosts !== null && !in_array($host, $allowed_hosts, true)) {
                throw new \RuntimeException('Host not allowed');
            }

            $body = file_get_contents($target);
            if ($body === false || $body === '') {
                throw new \RuntimeException('Empty response');
            }

            return $body;
        };

        try {
            if (!is_string($url) || trim($url) === '') {
                throw new \RuntimeException('Missing url');
            }
            $url = trim($url);

            // Detect the link type: a match means a short link copied from inside WeChat,
            // no match means a link taken from a browser.
            preg_match("/https:\/\/mp\.weixin\.qq\.com\/s\/[a-zA-Z0-9]*$/", $url, $url_type);
            if ($url_type) {
                // Load the short-link page
                $wx_content = $fetch($url, $wx_hosts);
                // Extract the real article address from the og:url meta tag
                $rule = "/(?<=<meta property=\"og:url\" content=\").*?(?=\" \/>)/";
                if (!preg_match($rule, $wx_content, $api_url)) {
                    throw new \RuntimeException('og:url not found');
                }
                // Attribute values are HTML-escaped (&amp;), so decode before using as a URL
                $content_url = html_entity_decode($api_url[0], ENT_QUOTES, 'UTF-8');
            } else {
                $content_url = $url;
            }

            // Load the actual article page
            $wx_article = $fetch($content_url, $wx_hosts);

            // Extract the title
            $title_rule = "/\<h2 class=\"rich_media_title\".*?id=\"activity-name\".*?\>\K[\s\S]*?(?=\<\/h2\>)/";
            if (!preg_match($title_rule, $wx_article, $titles)) {
                throw new \RuntimeException('Title not found');
            }
            $title = trim($titles[0]);

            // Extract the article body
            $content_rule = "/\<div class=\"rich_media_content \".*?id=\"js_content\".*?\>\K[\s\S]*?(?=\<ul id=\"js_hotspot_area\" class=\"article_extend_area\"\>\<\/ul\>)/";
            if (!preg_match($content_rule, $wx_article, $contents)) {
                throw new \RuntimeException('Content not found');
            }
            $content = $contents[0];

            switch ($label_replace) {
                case 'span':
                    // Turn block-level tags into <span style="display:block"> (keeps the Baidu
                    // rich-text editor from rewriting them)
                    $content = preg_replace(["/<section(.*?)style=\"/","/<div(.*?)style=\"/","/<p(.*?)style=\"/"], ['<span$1style="display:block;','<span$1style="display:block;','<span$1style="display:block;'], $content);
                    // Rewrite the matching closing tags as well
                    $content = strtr($content, ['data-src' => 'src','</section>' => '</span>','</div>' => '</span>','</p>' => '</span>']);
                    break;
                case 'p':
                    // Turn block-level tags into <p> (pairs with front-end editing, but the
                    // layout may get scrambled)
                    $content = preg_replace(["/<section/","/<div/","/<span(.*?)style=\"/"], ['<p','<p','<p$1style="display:inline;'], $content);
                    // Rewrite the matching closing tags as well
                    $content = strtr($content, ['</section>' => '</p>','</div>' => '</p>','</span>' => '</p>']);
                    break;
                case '<p>':
                    // Drop every attribute/style and turn all listed tags into bare <p> tags
                    // (pairs with front-end editing, but the layout may get scrambled)
                    $content = preg_replace("/(<\/?)(div|p|a|span|section|h1|h2|h3|h4|h5|h6|strong|table|tr|th|td|ul|li)[\s\S]*?(>)/", '$1p$3', $content);
                    break;
                default:
                    // Unknown mode: leave the markup untouched
                    break;
            }

            $preg_replace = [
                '/<p>\s*?(<br>)*?\s*?<\/p>\n?/' => '', // remove empty paragraphs / stray line breaks
                '/<\/p>\s*?(<br>)*?\s*?<p>\n?/' => '', // remove line breaks between two adjacent <p> tags
                '/(<img.*?>)/' => '<p>$1</p>', // wrap every <img> in its own <p>
            ];
            $content = preg_replace(array_keys($preg_replace), array_values($preg_replace), $content);

            // preg_replace() yields null on a PCRE error (e.g. backtrack limit on huge input)
            if (!is_string($content)) {
                throw new \RuntimeException('Content rewrite failed');
            }

            // WeChat lazy-loads images through data-src; promote it to the real src attribute
            $content = strtr($content, ['data-src' => 'src']);

            // TODO (work in progress): give images that carry a fixed width a higher-priority
            // width so small images (emoji, arrows) are not stretched to 100%.
            //$content = preg_replace('/(<img[^>]*?)width="([^;"]*)/', '$1 style="width:$2 !important', $content);

            // Re-host the images on Aliyun OSS
            if ($is_up_oss) {
                $img_rule = "/<img.*?src=\"\K[^\"]+/";
                preg_match_all($img_rule, $content, $imgs);
                $img_strtr = [];
                // array_unique: an image referenced several times is downloaded/uploaded once
                foreach (array_unique($imgs[0]) as $v) {
                    $oss_path = null;
                    try {
                        // $v is kept verbatim as the replacement key; only the fetch URL is decoded
                        $img_data   = $fetch(html_entity_decode($v, ENT_QUOTES, 'UTF-8'));
                        $oss_result = Oss::uploadByString($img_data, 'wx_tencent_img');
                        // Loose comparison kept on purpose: state may arrive as 1 or '1'
                        if (isset($oss_result['state'], $oss_result['data']['path']) && $oss_result['state'] == 1) {
                            $oss_path = $oss_result['data']['path'];
                        }
                    } catch (\Exception $img_error) {
                        // A single broken image must not fail the whole article; it is dropped below
                    }

                    if ($oss_path !== null) {
                        $img_strtr[$v] = Img($oss_path);
                    } else {
                        // Remove the image that could not be re-hosted. preg_quote() escapes every
                        // regex metacharacter in the URL, and [^>]* keeps the match inside one tag
                        // so neighbouring images on the same line are not swallowed.
                        $content = preg_replace('/<img[^>]*?src="' . preg_quote($v, '/') . '"[^>]*>/', '<br>', $content);
                    }
                }
                if ($img_strtr) {
                    $content = strtr($content, $img_strtr);
                }
            }
        } catch (\Exception $e) {
            return $this->error('匹配失败');
        }

        return $this->success('成功', '', [
            'title' => $title,
            'content'  => $content
        ]);
    }