小弟想利用curl来抓取某网站的内容
https://www.youtube.com/trendsdashboard#age0=25-34&loc0=twn
使用以下mycurl()的程式码 , 有抓到东西
抓到的head如下:
HTTP/1.1 200 OK Date: Wed, 30 Mar 2016 09:23:41 GMT Server: gwiseguy/2.0
X-Frame-Options: SAMEORIGIN X-XSS-Protection: 1; mode=block;
report=https://www.google.com/appserve/security-bugs/log/youtube
Cache-Control: no-cache Content-Type: text/html; charset=utf-8 Expires: Tue,
27 Apr 1971 19:44:06 EST Strict-Transport-Security: max-age=604800
X-Content-Type-Options: nosniff Accept-Ranges: none Vary: Accept-Encoding
Transfer-Encoding: chunked
后来我也有把cookie 送上去
但不知为什么抓不到原始网页下方影片的资讯(连结href, 点击次数...等资讯)
想请教大家自己是哪个地方出错?
或是目标网页有什么新技术 造成无法抓取呢? 感谢~~
/**
 * Fetch the YouTube Trends Dashboard page with cURL, echo the raw response
 * (response headers followed by the body) and save it to WEBget.txt in the
 * current working directory.
 *
 * If the transfer fails, the cURL error message is echoed instead and
 * WEBget.txt is left untouched, so an earlier successful capture is not
 * overwritten with an empty file.
 */
function mycurl()
{
// Target page. The original literal ended with a stray space, which is not
// part of the URL and has been removed.
// The "#age0=25-34&loc0=twn" part is a URL fragment: fragments are handled by
// the client only and are never transmitted to the server, so the response
// cannot depend on it.
// NOTE(review): the per-video data (links, view counts) is presumably filled
// in by client-side JavaScript after the page loads, which would explain why
// it is absent from the fetched HTML - confirm with the browser's network tab.
$target1 = "https://www.youtube.com/trendsdashboard#age0=25-34&loc0=twn";

// Browser-like User-Agent. Kept on a single logical line: a raw line break
// inside the literal would end up inside the request header value.
$user_agent = 'Mozilla/5.0 (Windows NT 6.1;WOW64) AppleWebKit/537.11 (KHTML, '
    . 'like Gecko) Chrome/23.0.1271.95 Safari/537.11';

// Cookies sent explicitly with the request; pairs are separated by "; ".
$cookies = "VISITOR_INFO1_LIVE=4kLW9NhJVbI; YSC=KTyxERZA5Io; PREF=f1=50000000";

// Absolute path of the cookie store, used both for reading (COOKIEFILE) and
// for writing received cookies back (COOKIEJAR).
$cookie_file = "C:\\xampp\\htdocs\\mark\\cookie.txt";

$s = curl_init();

// (A) Request setup: target and User-Agent.
curl_setopt($s, CURLOPT_URL, $target1);
curl_setopt($s, CURLOPT_USERAGENT, $user_agent);

// Return the response as a string instead of printing it directly.
curl_setopt($s, CURLOPT_RETURNTRANSFER, true);
// Include the response headers in front of the body in the returned string.
curl_setopt($s, CURLOPT_HEADER, true);

// Cookie handling.
curl_setopt($s, CURLOPT_COOKIE, $cookies);
curl_setopt($s, CURLOPT_COOKIEJAR, $cookie_file);
curl_setopt($s, CURLOPT_COOKIEFILE, $cookie_file);

// HTTPS: verify the server certificate and that it matches the host name.
// Disabling these checks (as the original did) lets any intermediary read the
// cookies above. If verification fails because no CA bundle is configured
// (common on Windows/XAMPP), point the `curl.cainfo` php.ini setting or
// CURLOPT_CAINFO at a CA bundle file rather than turning verification off.
curl_setopt($s, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($s, CURLOPT_SSL_VERIFYHOST, 2);

// Give up if the whole transfer takes longer than 15 seconds.
curl_setopt($s, CURLOPT_TIMEOUT, 15);

// Redirects: follow them, set the Referer automatically, at most 3 hops.
curl_setopt($s, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($s, CURLOPT_AUTOREFERER, true);
curl_setopt($s, CURLOPT_MAXREDIRS, 3);

// (B) Execute the request and surface any transfer error.
$download_page = curl_exec($s);
if ($download_page === false) {
    $errors = curl_error($s);
    echo "cURL error (" . curl_errno($s) . "): " . $errors . "<br>";
    return;
}

// Show the raw response and write it to WEBget.txt.
echo $download_page."<br>"."抓取结END!";
if (file_put_contents("WEBget.txt", $download_page) === false) {
    echo "<br>Failed to write WEBget.txt";
}
}