|
- PHP获取百度相关搜索和原创程度
- 获取原创程度的方法是 get_em_times:返回的数值越高、文本长度越长,说明越不原创。
- 用于采集项目的搜索引擎优化
/**
 * Helpers that scrape a Baidu result page for a query: the "related searches"
 * links and the number of highlighted (<em>) keyword hits.
 *
 * Relies on the project-level Text::convert_encoding() helper, which is not
 * defined in this file.
 */
class baidu {
    /**
     * Build the request path (including query string) for a Baidu search.
     *
     * The query is passed through Text::convert_encoding($words, 'GBK', 'UTF-8')
     * (presumably UTF-8 -> GBK; confirm against the Text helper) and then
     * URL-encoded.
     *
     * @param string $words search query
     * @return string path of the form "/s?wd=..."
     */
    public static function build_request_uri($words){
        $encoded = urlencode(Text::convert_encoding($words,'GBK','UTF-8'));
        return '/s?wd='.$encoded;
    }

    /**
     * Extract the link texts found inside the <div id='rs'> block of the
     * result page for the given query.
     *
     * @param string $words search query
     * @return array|false list of link inner HTML strings, or FALSE when the
     *                     block or its links cannot be found
     */
    public static function get_relate_search($words){
        $content = self::get_content($words);
        // "#" is used as the delimiter so the "/" of closing tags needs no
        // escaping, and the pattern is double-quoted so the single quotes around
        // rs do not terminate the PHP string. The "s" modifier lets the block
        // span several lines.
        if(preg_match("#<div id='rs'>(((?!</div>).)*)</div>#s", $content, $block)){
            if(preg_match_all('#<a((?!>).)*>(.*?)</a>#s', $block[1], $links)) return $links[2];
        }
        return FALSE;
    }

    /**
     * Split the query on spaces and count how many <em>...</em> highlights on
     * the result page match one of the pieces exactly.
     *
     * @param string $words space-separated query
     * @return int total number of exact <em> matches over all pieces
     */
    public static function get_em_times($words){
        $content = self::get_content($words);
        // "#" delimiter: the pattern itself contains "/" in "</em>".
        if(!preg_match_all('#<em>(((?!</em>).)*)</em>#', $content, $result) || count($result[1])==0)
            return 0;
        // Map of highlighted text => number of occurrences on the page.
        $word_table = array_count_values($result[1]);
        $times = 0;
        foreach (explode(' ', $words) as $pice){
            // Consecutive spaces yield empty pieces; they are not keywords.
            if($pice === '') continue;
            if(array_key_exists($pice, $word_table)) $times += $word_table[$pice];
        }
        return $times;
    }

    /**
     * Fetch the Baidu result page for the query, passed through
     * Text::convert_encoding($content, 'UTF-8', 'GBK'). Successful downloads
     * are memoised per query string for the lifetime of the process.
     *
     * @param string $words search query
     * @return string page content, or an empty string when the download fails
     */
    public static function get_content($words){
        static $loaded_contents = array();
        if(!isset($loaded_contents[$words])){
            $content = file_get_contents('http://www.baidu.com'.self::build_request_uri($words));
            if($content === FALSE){
                // Download failed: do not cache, so a later call can retry.
                return '';
            }
            $loaded_contents[$words] = Text::convert_encoding($content,'UTF-8','GBK');
        }
        return $loaded_contents[$words];
    }

    /**
     * Tell whether the given page content refers to Baidu's verification page,
     * i.e. Baidu is asking for a verification step instead of serving results.
     *
     * @param string $content page content
     * @return bool TRUE when the verification URL occurs in $content
     */
    function check_veriy($content){
        // Baidu requires verification: abnormal response.
        return strpos($content,'http://verify.baidu.com/')!==FALSE;
    }
}
复制代码 |
|