大家都知道,你采集别人的网站,可能封掉你的IP地址,那么肯定就不能用自己的真实IP地址了,今天我们就浅谈PHP伪装IP地址,进行数据采集!
首先,我用的是TP5框架,把IP伪装及数据采集封装到common.php中了,代码如下:
<?php
// +----------------------------------------------------------------------
// | ThinkPHP [ WE CAN DO IT JUST THINK ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2016 http://thinkphp.cn All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: 流年 <liu21st@gmail.com>
// +----------------------------------------------------------------------// 应用公共文件
use think\Db;/************************************* 封装公共方法 *****************************************/
/**
 * Fetch a URL with a GET request while sending randomized client-identity headers.
 *
 * A random address built from a fixed list of two-octet prefixes is sent in the
 * CLIENT-IP and X-FORWARDED-FOR request headers, together with a randomly chosen
 * desktop User-Agent and a fixed Baidu referer. These are ordinary request
 * headers; they do not change the source address of the connection itself.
 *
 * @param string $url Address to fetch.
 * @return mixed Whatever curl_exec() returns: the response body, or false on failure.
 */
function pretendIpData($url)
{
    // Prefixes are strings on purpose: a bare literal such as 119.120 is a
    // float and would be concatenated as "119.12".
    $prefixes = array(
        '119.120',
        '124.174',
        '116.249',
        '118.125',
        '42.175',
        '124.162',
        '211.167',
        '58.206',
        '117.24',
        '203.93',
    );

    // Pick one prefix and append two random octets.
    $ip = $prefixes[array_rand($prefixes)] . '.' . rand(1, 255) . '.' . rand(1, 255);

    // Referer presented to the target site.
    $referUrl = "http://www.baidu.com";

    // Desktop browser User-Agent strings; the keys are labels only.
    $agentArray = [
        "safari 5.1 – MAC" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "safari 5.1 – Windows" => "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Firefox 38esr" => "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "IE 11" => "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "IE 9.0" => "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "IE 8.0" => "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "IE 7.0" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "IE 6.0" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Firefox 4.0.1 – MAC" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Firefox 4.0.1 – Windows" => "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera 11.11 – MAC" => "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera 11.11 – Windows" => "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Chrome 17.0 – MAC" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "傲游(Maxthon)" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "腾讯TT" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "世界之窗(The World) 2.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "世界之窗(The World) 3.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "360浏览器" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "搜狗浏览器 1.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Avant" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Green Browser" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    ];

    // array_rand() returns a key, which is then used to look up the string.
    $userAgent = $agentArray[array_rand($agentArray, 1)];

    // Headers carrying the generated address.
    $header = array(
        'CLIENT-IP:' . $ip,
        'X-FORWARDED-FOR:' . $ip,
    );

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);                // address to fetch
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);     // return the body instead of printing it
    curl_setopt($curl, CURLOPT_REFERER, $referUrl);
    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);    // certificate verification is skipped

    $info = curl_exec($curl);

    // Release the handle, as the POST helper in this file does.
    curl_close($curl);

    return $info;
}

/**
 * Fetch data with randomized client-identity headers - POST request.
 *
 * @param string $url  Address to post to.
 * @param array  $data Fields sent as the POST body.
 * @return mixed
 */
function getPostIpData($url,$data=array())
{
    // Sends $data to $url as a POST request. A random address is placed in the
    // CLIENT-IP and X-FORWARDED-FOR request headers, and a random desktop
    // User-Agent plus a fixed Baidu referer are presented. These are ordinary
    // request headers; the source address of the connection is not changed.
    // Returns whatever curl_exec() returns (response body, or false on failure);
    // on a transport error the cURL error message is also echoed.

    // Prefixes are strings on purpose: a bare literal such as 119.120 is a
    // float and would be concatenated as "119.12".
    $prefixes = array(
        '119.120',
        '124.174',
        '116.249',
        '118.125',
        '42.175',
        '124.162',
        '211.167',
        '58.206',
        '117.24',
        '203.93',
    );

    // Pick one prefix and append two random octets.
    $ip = $prefixes[array_rand($prefixes)] . '.' . rand(1, 255) . '.' . rand(1, 255);

    // Referer presented to the target site.
    $referUrl = "http://www.baidu.com";

    // Desktop browser User-Agent strings; the keys are labels only.
    $agentArray = [
        "safari 5.1 – MAC" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "safari 5.1 – Windows" => "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Firefox 38esr" => "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "IE 11" => "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "IE 9.0" => "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "IE 8.0" => "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "IE 7.0" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "IE 6.0" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Firefox 4.0.1 – MAC" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Firefox 4.0.1 – Windows" => "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera 11.11 – MAC" => "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera 11.11 – Windows" => "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Chrome 17.0 – MAC" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "傲游(Maxthon)" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "腾讯TT" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "世界之窗(The World) 2.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "世界之窗(The World) 3.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "360浏览器" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "搜狗浏览器 1.x" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Avant" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Green Browser" => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    ];

    // array_rand() returns a key, which is then used to look up the string.
    $userAgent = $agentArray[array_rand($agentArray, 1)];

    // Headers carrying the generated address.
    $header = array(
        'CLIENT-IP:' . $ip,
        'X-FORWARDED-FOR:' . $ip,
    );

    $curl = curl_init();                                  // start a cURL session
    curl_setopt($curl, CURLOPT_URL, $url);                // address to request
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);     // return the body instead of printing it
    curl_setopt($curl, CURLOPT_REFERER, $referUrl);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);        // certificate verification is skipped
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 0);        // host name in the certificate is not checked
    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);        // follow redirects
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);           // set Referer automatically when redirected
    curl_setopt($curl, CURLOPT_POST, 1);                  // regular POST request
    // When $data is an array, cURL sends it as multipart/form-data.
    curl_setopt($curl, CURLOPT_POSTFIELDS, $data);
    curl_setopt($curl, CURLOPT_TIMEOUT, 30);              // give up after 30 seconds

    $info = curl_exec($curl);
    if (curl_errno($curl)) {
        // Report the transport error on the output stream.
        echo 'Errno'.curl_error($curl);
    }
    curl_close($curl);                                    // end the cURL session

    // Raw response; the caller decides how to decode it.
    return $info;
}

/**
 * Convert a timestamp to a date string - millisecond-precision timestamps are accepted.
 *
 * @param $time
 * @return false|string
 */
function timeDate($time)
{
    // Keep only the first ten characters, so the millisecond digits of a
    // 13-digit timestamp are discarded and a seconds value remains.
    $seconds = substr($time, 0, 10);

    // Formatted in the default timezone configured for PHP.
    return date('Y-m-d H:i:s', $seconds);
}

/**
 * Sum the numbers of a comma-separated string.
 *
 * @param string $numberString Values separated by ",".
 * @return float|int
 */
function strSum($numberString)
{
    // Split on commas and let array_sum() add the pieces up.
    return array_sum(explode(',', $numberString));
}

/**
 * Random numbers taken from 1-10.
 *
 * @param int $num How many numbers to take.
 * @return array
 */
function randStr($num)
{
    // Returns $num distinct values drawn at random from 1..10, in ascending
    // order. At most ten values can be returned.

    // Shuffle the candidates 1..10 into random order.
    $numbers = range(1, 10);
    shuffle($numbers);

    // Take the first $num of the shuffled values.
    $result = array_slice($numbers, 0, $num);

    // Sort the picked values ascending, compared as numbers.
    sort($result, SORT_NUMERIC);

    return $result;
}

/**
 * Odd/even check - 1 means odd, 2 means even.
 *
 * @param $num
 * @return bool|int
 */
function isDouble($num)
{
    // Anything that is not numeric is rejected outright.
    if (!is_numeric($num)) {
        return false;
    }

    // Numeric values that are not of int type (numeric strings, floats)
    // are reported as 0.
    if (!is_int($num)) {
        return 0;
    }

    // 2 for an even integer, 1 for an odd one.
    return ($num % 2 == 0) ? 2 : 1;
}
GET请求,我们要在控制器中,调用此方法,控制器代码如下:
<?php
namespace app\index\controller\choose;

use think\Controller;
use think\Db;

/**
 * Example controller that runs the GET scraping helper.
 */
class ChooseGd extends Controller
{
    /**
     * Fetch the target URL with pretendIpData() and dump the decoded result.
     */
    public function collection()
    {
        $url = '你要采集的网址';

        // Decode the JSON response into an associative array.
        // pretendIpData() is the GET helper from the common function file.
        $data = json_decode(pretendIpData($url), true);

        dump($data);
        die;
    }
}
最后,看看打印结果:
POST请求,我们要在控制器中,调用此方法,控制器代码如下:
这个是我们要抓取的POST请求,接下来调用上面封装的POST请求方法:控制器代码如下:
<?php
namespace app\index\controller\quick;

use think\Controller;
use think\Db;

/**
 * Example controller that runs the POST scraping helper.
 */
class QuickXx extends Controller
{
    /**
     * Post the form fields to the target URL with getPostIpData() and dump the decoded result.
     */
    public function collection()
    {
        // Form fields expected by the endpoint being scraped.
        $postData = [];
        $postData['lotterytype'] = 'GP_K3_JiLin';

        // Address to scrape.
        $url = '你要抓取的网址';

        // Decode the JSON response into an associative array.
        $data = json_decode(getPostIpData($url, $postData), true);

        dump($data);
        die;
    }
}
结果打印如下:(顺便打印一下header头)
希望能帮到你!