美文网首页
如何抓取微信公众号文章里面的视频

如何抓取微信公众号文章里面的视频

作者: chinariver | 来源:发表于2018-05-04 11:46 被阅读0次

    最近需要抓取一些微信公众号的文章,并把文章里的视频和视频封面下载下来。这个问题已经解决,下面直接上代码。

    我是用 Yii2.0 框架的 console 脚本写的,使用其他框架或脚本时可按需修改。

    代码如下:

    controller中:

    /**
     * Fetch a WeChat official-account article and download the first embedded
     * video together with its cover image.
     *
     * Flow: request the article with `f=json`, find the first
     * `<iframe ... data-src="...">` in `content_noencode`, pull the `vid` out
     * of that URL, ask Tools::getinfo() and the getkey endpoint for a playable
     * URL, then hand the cover URL and the first video URL to
     * self::DownloadGzhInfo().
     *
     * @param string $url Article URL; both the long form (with a query string)
     *                    and the short form are accepted.
     *
     * @return bool|null false when the URL is empty, the article or the video
     *                   metadata cannot be read, or no video is embedded;
     *                   null (no explicit return) once both downloads have
     *                   been handed to DownloadGzhInfo().
     */
    public function actionGetwx($url)
    {
        if (empty($url)) {
            echo '请输入公众号文章地址';

            return false;
        }

        // Long-form example:
        // http://mp.weixin.qq.com/s?__biz=MzI0NTc1MTczNA==&mid=2247485130&idx=1&sn=945cfb8b0cfdd99f1b730889de0216e2&chksm=e9488c13de3f05057be6c6b065f8e44d43c566cb9ee3a4f35cf8084382742159181ea480b935&scene=27
        // Strip the fragment first (for short links too) so that f=json ends
        // up in the query string instead of after the '#'.
        $articleUrl = str_replace('#wechat_redirect', '', $url);
        $json = $articleUrl . (strpos($articleUrl, '?') !== false ? '&f=json' : '?f=json');

        $data = json_decode((string) Tools::curl_request($json), true);

        echo '开始采集了...';
        echo '======文章内容====';
        print_r($data);

        // $data holds the article details; without the raw HTML there is
        // nothing to scan for a video.
        if (!is_array($data) || !isset($data['content_noencode'])) {
            echo '没有视频匹配到,不采集';

            return false;
        }
        $html = $data['content_noencode'];

        preg_match_all('/<iframe (.*?)data-src="(.*?)">/', $html, $matchs);

        // No embedded video: stop here.
        if (empty($matchs[2])) {
            echo '没有视频匹配到,不采集';

            return false;
        }

        // e.g. https://v.qq.com/iframe/preview.html?vid=i1324786hv8&width=500&height=375&auto=0
        // Stop at '&' (covers both "&" and "&amp;"), at a quote, or at the end
        // of the string, so a vid in last position is still found.
        if (!preg_match('/vid=([^&"\s]+)/', $matchs[2][0], $vidMatch)) {
            echo '没有视频匹配到,不采集';

            return false;
        }
        $vid = $vidMatch[1];

        echo '=======vid=========';
        print_r($vid);
        echo '======vid==========';

        $video_json = Tools::getinfo($vid);

        echo '=====getinfo=====';
        print_r($video_json);
        echo '======getinfo====';

        // getinfo yields null / an incomplete structure when the lookup fails.
        if (empty($video_json['vl']['vi'][0]) || empty($video_json['fl']['fi'])) {
            echo '没有视频匹配到,不采集';

            return false;
        }

        $video = $video_json['vl']['vi'][0];
        $title = $video['ti'];

        // Pick the last listed stream (treated as the highest quality).
        $fn_pre = $video['lnk'];
        $host = $video['ul']['ui'][0]['url'];
        $streams = $video_json['fl']['fi'];
        $lastStream = end($streams);
        $part_format_id = $lastStream['id'];

        echo '========计数=======' . $video['cl']['fc'];

        // A segment count of 0 still means one file. Iterate exactly
        // $seg_cnt times; the previous "<= $seg_cnt + 1" bound requested one
        // extra segment whenever the count was positive.
        $seg_cnt = max(1, (int) $video['cl']['fc']);

        $part_urls = [];
        for ($part = 1; $part <= $seg_cnt; $part++) {
            $filename = $fn_pre . '.p' . ($part_format_id % 10000) . '.' . $part . '.mp4';
            $key_api = 'http://vv.video.qq.com/getkey?otype=json&platform=11&format='
                . $part_format_id . '&vid=' . $vid . '&filename=' . $filename . '&appver=3.2.19.333';

            $part_info = Tools::curl($key_api);

            // The endpoint answers with "QZOutputJson={...};".
            $key_json = null;
            if (preg_match('/QZOutputJson=(.*);$/Uis', (string) $part_info, $keyMatch)) {
                $key_json = json_decode($keyMatch[1], true);
            }

            echo '=======getkey=============';
            print_r($key_json);
            echo '========getkey=============';

            if (empty($key_json['key'])) {
                // No per-segment key: fall back to the whole-file key from getinfo.
                $part_urls[] = $host . $fn_pre . '.mp4?vkey=' . $video['fvkey'];
            } else {
                $part_urls[] = $host . $filename . '?vkey=' . $key_json['key'];
            }
        }

        // Resolved download addresses.
        print_r($part_urls);

        // Only the first segment is downloaded.
        $videoUrl = $part_urls[0];

        // Cover image URL pattern, e.g.
        // https://shp.qpic.cn/qqvideo_ori/0/i1324786hv8_496_280/0
        $imageUrl = sprintf('https://shp.qpic.cn/qqvideo_ori/0/%s_496_280/0', $vid);

        $gzhName = $data['nick_name'];
        $savePath = '/opt/sdb/samba/pub/miaomiao/';

        // Download the cover image.
        self::DownloadGzhInfo($imageUrl, $savePath, $gzhName, $data['title'], 'png', $title);

        // Download the video.
        self::DownloadGzhInfo($videoUrl, $savePath, $gzhName, $data['title'], 'mp4', $title);
    }

        public static function DownloadGzhInfo($url, $save_path, $gzhName, $title, $type, $videoTitle)

    {

            $title = Tools::strFilter($title);

            $cmd = 'cd ' . $save_path;

            exec($cmd);

            $file = $save_path . $gzhName;

            if (!file_exists($file) && !mkdir($file, 0777, true)) {

                return false;

    }

            $save_path_dir = $file . '/' . $title;

            if (!file_exists($save_path_dir) && !mkdir($save_path_dir, 0777, true)) {

                return false;

    }

            if ($type == 'png') {

                $cmd = 'wget -c ' . $url . ' -O ' . $save_path_dir . '/"' . $videoTitle . '".' . $type;

            } else {

                $expension = Tools::getExt($url);

                $cmd = 'wget -c ' . $url . ' -O ' . $save_path_dir . '/"' . $videoTitle . '".' . $expension;

    }

            exec($cmd);

       }

    用到的Tools方法:

    /**
     * Perform an HTTP request with cURL (follows redirects, 60 second timeout).
     *
     * @param string       $url          URL to request.
     * @param array|string $post         POST payload; leave empty to send a GET. Arrays are
     *                                   encoded with http_build_query(), strings are sent as-is.
     * @param string       $cookie       Cookie string to send with the request, if any.
     * @param int|bool     $returnCookie When truthy, response headers are captured and the
     *                                   result is an array instead of a string.
     *
     * @return string|array The response body; or, when $returnCookie is truthy,
     *                      ['cookie' => first Set-Cookie name=value pair ('' if none),
     *                       'content' => response body]; or the cURL error message
     *                      string when the transfer fails.
     */
    public static function curl_request($url, $post = '', $cookie = '', $returnCookie = 0)
    {
        $curl = curl_init();

        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
        curl_setopt($curl, CURLOPT_REFERER, "http://XXX");

        if ($post) {
            curl_setopt($curl, CURLOPT_POST, 1);
            // http_build_query() only accepts arrays/objects; a pre-encoded
            // string body is passed through untouched.
            curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
        }

        if ($cookie) {
            curl_setopt($curl, CURLOPT_COOKIE, $cookie);
        }

        curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
        curl_setopt($curl, CURLOPT_TIMEOUT, 60);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

        $data = curl_exec($curl);

        if (curl_errno($curl)) {
            // Read the message before closing so the handle is released on this path too.
            $error = curl_error($curl);
            curl_close($curl);

            return $error;
        }

        // Total size of all header blocks (there is one per followed redirect).
        $headerSize = (int) curl_getinfo($curl, CURLINFO_HEADER_SIZE);
        curl_close($curl);

        if (!$returnCookie) {
            return $data;
        }

        // Split by the reported header size rather than at the first blank
        // line, which would leave later redirect headers inside the body.
        $raw = (string) $data;
        $header = (string) substr($raw, 0, $headerSize);
        $body = (string) substr($raw, $headerSize);

        // Header names are case-insensitive, and a cookie without attributes
        // has no trailing ';', so stop at ';' or the end of the line.
        $cookieValue = '';
        if (preg_match('/^Set-Cookie:\s*([^;\r\n]*)/mi', $header, $matches)) {
            $cookieValue = trim($matches[1]);
        }

        return [
            'cookie' => $cookieValue,
            'content' => $body,
        ];
    }

    /**
     * Query the vv.video.qq.com `getinfo` endpoint for a video id and decode
     * the JSON wrapped in its `QZOutputJson=...;` response.
     *
     * @param string $vid Video id, as found in the iframe's `vid` query parameter.
     *
     * @return array|null Decoded response as an associative array, or null when
     *                    the request fails, the wrapper is missing, or the
     *                    payload is not valid JSON.
     */
    public static function getinfo($vid)
    {
        $infourl = 'https://vv.video.qq.com/getinfo?otype=json&appver=3.2.19.333&platform=11&defnpayver=1&vid='
            . rawurlencode((string) $vid);

        $data = self::curl($infourl);

        // self::curl() hands back curl_exec()'s result, which is false on
        // failure; also bail out when the expected wrapper is not present
        // instead of reading an undefined capture group.
        if (!is_string($data) || !preg_match('/QZOutputJson=(.*);$/Uis', $data, $info)) {
            return null;
        }

        return json_decode($info[1], true);
    }

    /**
     * Return the file extension of the last path segment of a URL.
     *
     * Query string and fragment are ignored. Only the text after the final
     * dot is returned, so "a.tar.gz" yields "gz".
     *
     * @param string $url URL to inspect.
     *
     * @return string The extension without the dot, or the literal string
     *                'no extension' when the URL has no path or its last
     *                segment contains no dot.
     */
    public static function getExt($url)
    {
        // parse_url() yields null when there is no path component and false
        // for seriously malformed URLs; both mean "nothing to inspect".
        $path = parse_url((string) $url, PHP_URL_PATH);
        if (!is_string($path) || $path === '') {
            return 'no extension';
        }

        $file = basename($path);
        $dot = strrpos($file, '.');
        if ($dot === false) {
            return 'no extension';
        }

        return (string) substr($file, $dot + 1);
    }

    /**
     * Strip whitespace and punctuation from a string so it can be used as a
     * directory or file name.
     *
     * Removes spaces plus the listed ASCII punctuation marks and their
     * Chinese-input counterparts, then trims the result.
     *
     * @param string $str Text to clean (UTF-8).
     *
     * @return string The text with every listed character removed.
     */
    public static function strFilter($str)
    {
        // Each ASCII mark is followed by the form a Chinese input method
        // produces for the same key. Several of those counterparts had been
        // lost (the list held exact ASCII duplicates, which were no-ops); the
        // full-width forms are restored here. Order is kept because
        // str_replace() applies the needles sequentially.
        $unwanted = [
            '    ', ' ',
            '`', '·', '~',
            '!', '!',
            '@', '#',
            '$', '¥',
            '%',
            '^', '……',
            '&', '*',
            '(', ')', '(', ')',
            '-',
            '_', '——',
            '+', '=', '|', '\\',
            '[', ']', '【', '】',
            '{', '}',
            ';', ';',
            ':', ':',
            '\'', '"', '“', '”',
            ',', ',',
            '<', '>', '《', '》',
            '.', '。',
            '/', '、',
            '?', '?',
        ];

        return trim(str_replace($unwanted, '', $str));
    }

    /**
     * GET a URL with cURL using browser-like headers and a per-host cookie jar
     * stored next to this file (`<dir of this file>/<host>.cookie`).
     *
     * @param string $url    URL to request.
     * @param array  $option Optional CURLOPT_* => value overrides applied on top
     *                       of the defaults below.
     *
     * @return string|bool The response body, or false when curl_exec() fails.
     */
    public static function curl($url, $option = [])
    {
        // "scheme://host/..." splits into [scheme:, '', host, ...].
        $split = explode('/', $url);
        $host = isset($split[2]) ? $split[2] : '';
        $cookiejar = str_replace('\\', '/', dirname(__FILE__)) . '/' . $host . '.cookie';

        $options = [
            CURLOPT_URL => $url,
            CURLOPT_HTTPHEADER => [
                // Each entry must be a plain "Name: value" line. Three of these
                // used to carry stray quotes ("Accept-Encoding': 'gzip..."), so
                // they were sent as malformed header lines.
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Charset: UTF-8,*;q=0.5',
                'Accept-Language: en-US,en;q=0.8',
                'User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
            ],
            // An empty string makes cURL advertise every encoding it supports
            // and decode the response itself; a hand-written Accept-Encoding
            // header would leave callers with a compressed body.
            CURLOPT_ENCODING => '',
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_FOLLOWLOCATION => 1,
            // NOTE(review): TLS peer/host verification is disabled here, which
            // permits man-in-the-middle tampering. Left unchanged to avoid
            // breaking hosts without a CA bundle; confirm whether it can be enabled.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_CONNECTTIMEOUT => 5,
            CURLOPT_TIMEOUT => 5,
            CURLOPT_COOKIEJAR => $cookiejar,
            CURLOPT_COOKIEFILE => $cookiejar,
        ];

        // Honour caller overrides (the parameter was previously ignored).
        // array_replace() keeps the integer CURLOPT_* keys intact.
        if (!empty($option) && is_array($option)) {
            $options = array_replace($options, $option);
        }

        $ch = curl_init();
        curl_setopt_array($ch, $options);
        $response = curl_exec($ch);
        curl_close($ch);

        return $response;
    }

    相关文章

      网友评论

          本文标题:如何抓取微信公众号文章里面的视频

          本文链接:https://www.haomeiwen.com/subject/dnhdrftx.html