数据采集(也就是常说的“爬虫”)用 PHP 同样可以实现:把基于 curl 的抓取逻辑封装成一个函数,即可方便地采集并利用目标页面的数据。
<?php
/**
 * Fetch the body of a URL with cURL.
 *
 * Optionally sends spoofed client-IP headers and a referer, and picks a
 * User-Agent at random from a small pool so repeated requests do not all
 * present the same agent string. Redirects are followed.
 *
 * @param string $url   URL to fetch.
 * @param string $reurl Referer to send; an empty value omits the header.
 * @param string $ip    Value for the CLIENT-IP / X-FORWARDED-FOR headers;
 *                      an empty value omits both headers.
 *
 * @return string|false Response body on success, false on failure.
 */
public function g_url_contents($url, $reurl = '', $ip = '')
{
    $ch = curl_init();
    if ($ch === false) {
        // Could not create a cURL handle; report failure the same way curl_exec() does.
        return false;
    }
    $timeout = 30;

    // URL to fetch.
    curl_setopt($ch, CURLOPT_URL, $url);

    // Spoof the originating IP via commonly inspected request headers.
    if ($ip) {
        $header = array(
            'CLIENT-IP:' . $ip,
            'X-FORWARDED-FOR:' . $ip,
        );
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }

    // Pool of User-Agent strings. Each literal is kept on a single line:
    // a line break inside the string would be sent as part of the header.
    $binfo = array(
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.2; AskTbPTV/5.17.0.25589; Alexa Toolbar)',
        'Mozilla/5.0 (Windows NT 5.1; rv:22.0) Gecko/20100101 Firefox/22.0',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET4.0C; Alexa Toolbar)',
        'Mozilla/4.0(compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    );
    // Add the current request's own User-Agent only when one exists
    // (it is absent under CLI or when the client sent no such header).
    if (!empty($_SERVER['HTTP_USER_AGENT'])) {
        $binfo[] = $_SERVER['HTTP_USER_AGENT'];
    }
    // Pick uniformly over the whole pool; the bounds follow the pool size so
    // no entry is silently unreachable.
    $user_agent = $binfo[mt_rand(0, count($binfo) - 1)];

    // Return the response from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Maximum number of seconds to wait while connecting.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // Send the chosen User-Agent header.
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
    // Follow any "Location:" redirect headers the server returns.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

    // Spoof the referer when one is supplied.
    if ($reurl) {
        curl_setopt($ch, CURLOPT_REFERER, $reurl);
    }

    // Execute the request; yields the body as a string, or false on failure.
    $c = curl_exec($ch);
    // Release the handle.
    curl_close($ch);

    return $c;
}
网友评论