這幾天關注了一下PHP的採集程序,才發現用PHP採集內容是這麼方便,把經常用到的採集函數在這裡總結一下,方便以後使用!
獲取所有鏈接內容和地址
/**
 * Collect the target and the link text of every anchor in an HTML fragment.
 *
 * Only anchors whose first attribute is href are recognised, and the link
 * text must not contain a '>' (i.e. no nested tags).
 *
 * NOTE(review): the quotes and angle brackets of this pattern were lost in
 * the published listing; it has been reconstructed from the surviving
 * characters. Confirm against the original source if available.
 *
 * @param string $code HTML to scan.
 * @return array ['name' => list of link texts, 'url' => list of href values],
 *               both in document order and index-aligned.
 */
function getAllURL($code)
{
    // The href value may be double-quoted, single-quoted or bare.
    $pattern = '/<a\s+href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/i';
    preg_match_all($pattern, $code, $arr);

    return array('name' => $arr[2], 'url' => $arr[1]);
}
獲取所有的圖片地址
/**
 * Collect the absolute URLs of all images referenced by <img> tags.
 *
 * Only double-quoted src values that start with http:// or https:// and end
 * in a known image extension are returned.
 *
 * NOTE(review): the leading "<img[^>" of the pattern and all quotes were lost
 * in the published listing; the pattern is a reconstruction.
 *
 * @param string $code HTML to scan.
 * @return array List of image URLs in document order.
 */
function getImgSrc($code)
{
    // Negated character classes keep the match inside one src attribute;
    // a lazy ".+" could run past the closing quote into later attributes.
    // "bnp" is kept from the original extension list for compatibility.
    $reg = '/<img[^>]*src="(https?:\/\/[^"\/]+\/[^"]+\.(?:jpg|jpeg|gif|bmp|bnp|png))"/i';
    preg_match_all($reg, $code, $img_array, PREG_PATTERN_ORDER);

    return $img_array[1];
}
當前的腳本網址
/**
 * Return the URL (path plus query string) of the currently running script.
 *
 * Uses $_SERVER['REQUEST_URI'] when it is non-empty; otherwise falls back to
 * $_SERVER['PHP_SELF'], appending '?' and $_SERVER['QUERY_STRING'] when the
 * query string is non-empty.
 *
 * @return string
 */
function getSelfURL()
{
    if (!empty($_SERVER['REQUEST_URI'])) {
        return $_SERVER['REQUEST_URI'];
    }

    // Guard against PHP_SELF being absent so no notice is raised.
    $scriptName = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '';
    if (empty($_SERVER['QUERY_STRING'])) {
        return $scriptName;
    }

    return $scriptName . '?' . $_SERVER['QUERY_STRING'];
}
把全角數字轉為半角數字
/**
 * Convert full-width digits to ASCII digits and strip everything that is not
 * a digit or a dot.
 *
 * Zeros at the very start of the input are removed as well. When nothing is
 * left, integer 0 is returned.
 *
 * The full-width literals below assume this file is saved as UTF-8. In the
 * published listing they had been normalised to ASCII digits, which turned
 * the conversion into a no-op.
 *
 * @param string $fnum Text containing a number, possibly in full-width digits.
 * @return string|int The cleaned digit string, or 0 when it is empty.
 */
function getAlabNum($fnum)
{
    $fullWidth = array('０', '１', '２', '３', '４', '５', '６', '７', '８', '９');
    $halfWidth = array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
    $fnum = str_replace($fullWidth, $halfWidth, $fnum);

    // preg_replace stands in for ereg_replace, which no longer exists in PHP 7+.
    $fnum = preg_replace('/[^0-9.]|^0+/', '', $fnum);
    if ($fnum == '') {
        $fnum = 0;
    }

    return $fnum;
}
文本轉HTML(轉義HTML標記,換行轉為<br/>)
/**
 * Prepare plain text for display inside an HTML page.
 *
 * Angle brackets are escaped and every run of line breaks becomes a single
 * "<br/>" followed by CRLF.
 *
 * NOTE(review): all string literals of this function were lost in the
 * published listing. The '<' / '>' escapes and the "<br/>\r\n" replacement
 * follow from the surviving fragments; the first replacement (two ASCII
 * spaces to one full-width space) is a best guess and should be confirmed.
 *
 * @param string $txt Plain text.
 * @return string HTML-safe text with explicit line breaks.
 */
function text2Html($txt)
{
    $txt = str_replace('  ', '　', $txt);
    $txt = str_replace('<', '&lt;', $txt);
    $txt = str_replace('>', '&gt;', $txt);

    // Greedy on purpose: with the former "U" flag each CR and LF matched
    // separately, so a single "\r\n" produced two <br/> tags.
    $txt = preg_replace('/[\r\n]+/', "<br/>\r\n", $txt);

    return $txt;
}
清除HTML標記
/**
 * Neutralise HTML markup by escaping angle brackets.
 *
 * Only '<' and '>' are escaped; '&' and quote characters are left untouched.
 *
 * NOTE(review): the literals were lost in the published listing and have been
 * reconstructed as the '<' -> '&lt;' and '>' -> '&gt;' pair.
 *
 * @param string $str Text that may contain markup.
 * @return string Text with angle brackets replaced by entities.
 */
function clearHtml($str)
{
    return str_replace(array('<', '>'), array('&lt;', '&gt;'), $str);
}
相對路徑轉化成絕對路徑
/**
 * Rewrite root-relative href/src attributes into absolute URLs.
 *
 * The scheme and host are taken from $feed_url. Only double-quoted attribute
 * values that begin with a single '/' are rewritten; protocol-relative values
 * ("//host/...") and document-relative values ("img/a.png") are left alone.
 *
 * @param string $content  HTML whose links should be rewritten.
 * @param string $feed_url URL the content was fetched from.
 * @return string The rewritten HTML, or $content unchanged when no host or no
 *                http/https/ftp scheme can be derived from $feed_url.
 */
function relative2Absolute($content, $feed_url)
{
    preg_match('/(http|https|ftp):\/\//', $feed_url, $protocol);

    // Strip the scheme, then everything from the first '/' on, leaving the host.
    $server_url = preg_replace('/(http|https|ftp|news):\/\//', '', $feed_url);
    $server_url = preg_replace('/\/.*/', '', $server_url);

    if ($server_url == '' || !isset($protocol[0])) {
        return $content;
    }

    $base = $protocol[0] . $server_url . '/';

    // (?!\/) skips protocol-relative URLs, which would otherwise be turned
    // into "scheme://host//other-host/...".
    return preg_replace('/(href|src)="\/(?!\/)/', '$1="' . $base, $content);
}
獲取指定標記中的內容
/**
 * Return the text found between a start marker and an end marker.
 *
 * The result is the segment that follows the first occurrence of $start, cut
 * at the next occurrence of either marker.
 *
 * @param string $str   Text to search.
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 * @return string|null The enclosed text; '' when $start does not occur in
 *                     $str; null when either marker is the empty string.
 */
function getTagData($str, $start, $end)
{
    if ($start == '' || $end == '') {
        return;
    }

    $pieces = explode($start, $str);
    if (!isset($pieces[1])) {
        // Start marker absent: return '' instead of reading an undefined
        // offset, which raised a warning.
        return '';
    }

    $pieces = explode($end, $pieces[1]);

    return $pieces[0];
}
HTML表格的每行轉為CSV格式數組
/**
 * Turn each row of an HTML table into one CSV-style string.
 *
 * Every cell is wrapped in double quotes and cells are joined with commas,
 * e.g. '"a","b"'. All remaining markup and ALL whitespace, including spaces
 * inside cell text, are removed.
 *
 * NOTE(review): quotes and angle brackets were lost in the published listing.
 * The tag patterns are reconstructed from the surviving fragments. The two
 * literals removed just before the split were lost completely and are assumed
 * to be the non-breaking space (entity and UTF-8 character) and the ASCII
 * space; confirm against the original source.
 *
 * @param string $table HTML of a table.
 * @return array One CSV-style string per table row.
 */
function getTrArray($table)
{
    $table = preg_replace('/<td[^>]*?>/si', '"', $table);
    // Case-insensitive, to agree with the /i flag used for the opening tags.
    $table = str_ireplace('</td>', '",', $table);
    $table = str_ireplace('</tr>', '{tr}', $table);

    // Strip all remaining HTML tags.
    $table = preg_replace('/<[\/\!]*?[^<>]*?>/si', '', $table);

    // Strip whitespace. \s+ also removes a lone line break that is directly
    // followed by text, which the earlier newline-plus-whitespace pattern
    // left in place between rows.
    $table = preg_replace('/\s+/', '', $table);
    $table = str_replace(array('&nbsp;', "\xC2\xA0"), '', $table);

    $rows = explode(',{tr}', $table);
    array_pop($rows); // the piece after the last row marker is not a row

    return $rows;
}
將HTML表格的每行每列轉為數組,採集表格數據
/**
 * Turn an HTML table into an array of rows, each an array of cell texts.
 *
 * All markup and ALL whitespace, including spaces inside cell text, are
 * removed from the cell values.
 *
 * NOTE(review): quotes and angle brackets were lost in the published listing.
 * The tag patterns are reconstructed from the surviving fragments. The two
 * literals removed just before the split were lost completely and are assumed
 * to be the non-breaking space (entity and UTF-8 character) and the ASCII
 * space; confirm against the original source.
 *
 * @param string $table HTML of a table.
 * @return array List of rows; each row is a list of cell strings. Empty array
 *               when no row is found.
 */
function getTdArray($table)
{
    // Drop the opening tags; the closing tags become row / cell separators.
    $table = preg_replace('/<table[^>]*?>/si', '', $table);
    $table = preg_replace('/<tr[^>]*?>/si', '', $table);
    $table = preg_replace('/<td[^>]*?>/si', '', $table);
    // Case-insensitive, to agree with the /i flag used for the opening tags.
    $table = str_ireplace('</tr>', '{tr}', $table);
    $table = str_ireplace('</td>', '{td}', $table);

    // Strip all remaining HTML tags.
    $table = preg_replace('/<[\/\!]*?[^<>]*?>/si', '', $table);

    // Strip whitespace. \s+ also removes a lone line break that is directly
    // followed by text, which the earlier newline-plus-whitespace pattern
    // left in place between rows.
    $table = preg_replace('/\s+/', '', $table);
    $table = str_replace(array('&nbsp;', "\xC2\xA0"), '', $table);

    $rows = explode('{tr}', $table);
    array_pop($rows); // the piece after the last row marker is not a row

    // Initialised up front so that a table without rows yields an empty
    // array rather than an undefined variable.
    $td_array = array();
    foreach ($rows as $tr) {
        $td = explode('{td}', $tr);
        array_pop($td); // the piece after the last cell marker is not a cell
        $td_array[] = $td;
    }

    return $td_array;
}
返回字符串中的所有單詞 $distinct=true 去除重復
/**
 * Return all English words (runs of ASCII letters) found in a string.
 *
 * The result is sorted with sort() and re-indexed from 0.
 *
 * @param string $str      Text to scan.
 * @param bool   $distinct When true (default), duplicate words are removed.
 * @return array Sorted list of words.
 */
function splitEnStr($str, $distinct = true)
{
    preg_match_all('/([a-zA-Z]+)/', $str, $match);
    $words = $match[1];

    if ($distinct == true) {
        $words = array_unique($words);
    }
    sort($words);

    return $words;
}
相關推薦:
PHP採集程序中常用的函數
php 使用CURL函數採集
以上就是php中常用的採集函數的總結(附代碼)的詳細內容,PHP教程
鄭重聲明:本文版權歸原作者所有,轉載文章僅為傳播更多信息之目的,如作者信息標記有誤,請第一時間聯繫我們修改或刪除,多謝。
新聞熱點
疑難解答