Гость
Покинул форум
Сообщений всего: 66
Дата рег-ции: Нояб. 2010
Помог: 1 раз(а)
[+][+][+][+]
|
Написал скрипт паука (часть кода позаимствовал, ещё два человека помогали; за основу взят скрипт Sphider). Вот код: PHP:
скопировать код в буфер обмена
<title>Индексируем сайт...</title> <?PHP /*
 * Spider entry script (the accompanying post says it is based on Sphider).
 * Visible flow: pulls in commonfuncs.php, conf.php, database.php, messages.php
 * and spiderfuncs.php, reads command-line flags from $_SERVER['argv'], then
 * calls index_all() when $all == 1 or index_site() otherwise.
 *
 * NOTE(review): this paste appears to have lost statements and line breaks.
 *  - Braces do not balance in the top-level code or inside several functions.
 *  - $row and $rows are read right after mysql_query() with no visible fetch
 *    or row-count call; $file, $fl, $newmd5sum, $urlparts are also used
 *    without a visible assignment.
 *  - Every `//` comment now runs to the end of its (very long) physical line,
 *    so the code that follows it on that line is commented out as pasted.
 * Restore the original line structure before trying to run or debug this.
 *
 * NOTE(review): SQL statements are assembled by string interpolation and the
 * visible code does not escape values such as $url - verify before reuse.
 * NOTE(review): mysql_*, eregi_replace() and each() look like legacy APIs -
 * confirm they exist on the target PHP version.
 */ $include_dir = "../include"; require_once ("$include_dir/commonfuncs.php"); $all = 0; $settings_dir = "../settings"; require_once ("$settings_dir/conf.php"); include "$settings_dir/database.php"; include "messages.php"; echo '<style>body {background: #ededed; padding: 0; margin: 0;}</style><div align="center"><img src="/admin/images/loading.gif" alt="loading"><br></div>'; include "spiderfuncs.php"; $delay_time = 0; $command_line = 0; if (isset($_SERVER['argv']) && $_SERVER['argc'] >= 2 ) { $command_line = 1; $ac = 1; //argument counter while ($ac < (count($_SERVER['argv']))) { $arg = $_SERVER['argv'][$ac]; if ($arg == '-all') { $all = 1; break; } else if ($arg == '-u') { $url = $_SERVER['argv'][$ac+1]; $ac= $ac+2; } else if ($arg == '-f') { $soption = 'full'; $ac++; } else if ($arg == '-d') { $soption = 'level'; $maxlevel = $_SERVER['argv'][$ac+1];; $ac= $ac+2; } else if ($arg == '-l') { $domaincb = 1; $ac++; } else if ($arg == '-r') { $reindex = 1; $ac++; } else if ($arg == '-m') { $ac= $ac+2; } else if ($arg == '-n') { $ac= $ac+2; } else { commandline_help(); } } } if (isset($soption) && $soption == 'full') { $maxlevel = -1; } $domaincb = 0; } $reindex=0; } $maxlevel=0; } if ($keep_log) { if ($log_format=="html") { $log_file = $log_dir."/".Date("ymdHi").".html"; } else { $log_file = $log_dir."/".Date("ymdHi").".log"; } if (!$log_handle = fopen($log_file, 'w')) { die ("Вход опция установлена, но не может открыть файл для logging."); } } if ($all == 1) { index_all(); } else { if ($reindex == 1 && $command_line == 1) { $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where url='$url'"); $url = $row[0]; $maxlevel = $row[1]; $in= $row[2]; $out = $row[3]; $domaincb = $row[4]; if ($domaincb=='') { $domaincb=0; } if ($maxlevel == -1) { $soption = 'full'; } else { $soption = 'level'; } } } $in = ""; } $out = ""; } index_site($url, $reindex, $maxlevel, $soption, 
$in, $out, $domaincb); } /* Returns (float)$usec + (float)$sec. NOTE(review): neither variable is assigned in the visible body - a statement seems to be missing. */ function microtime_float(){ return ((float)$usec + (float)$sec); } /* Indexes a single URL: checks url_status(), queues a redirect target and the links found by get_links() into the temp table, then inserts or refreshes the page row in the links table and stores its keywords via save_keywords(). NOTE(review): the statement $url_status['state'] == "redirected" is a comparison whose result is discarded; the "update ... links" SQL is only assigned to $query with no visible mysql_query() call. */ function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) { global $entities, $min_delay; global $command_line; global $min_words_per_page; global $supdomain; global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr; $needsReindex = 1; $deletable = 0; $url_status = url_status($url); $thislevel = $level - 1; if (strstr($url_status['state'], "Relocation")) { $url = eregi_replace(" ", "", url_purify ($url_status['path'], $url, $can_leave_domain)); if ($url <> '') { $result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'"); if ($rows == 0) { mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')"); } } $url_status['state'] == "redirected"; } /* if ($indexdate <> '' && $url_status['date'] <> '') { if ($indexdate > $url_status['date']) { $url_status['state'] = "Date checked. Page contents not changed"; $needsReindex = 0; } }*/ ini_set("user_agent", $user_agent); if ($url_status['state'] == 'ok') { $OKtoIndex = 1; $file_read_error = 0; if (time() - $delay_time < $min_delay) { } if ($file === FALSE) { $file_read_error = 1; } } else { if ($fl) { while ($buffer = @fgets($fl, 4096 )) { $file .= $buffer; } } else { $file_read_error = 1; } } if ($file_read_error) { $contents = getFileContents($url); $file = $contents['file']; } preg_match("@<head[^>]*>(.*?)<\/head>@si",$file, $regs); $headdata = $regs[1]; preg_match("/<meta +http-equiv *=[\"']?Content-Type[\"']? 
*content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res); $content = $res[1]; } if ( $content == "" ) { } $codepages = array("windows-1251", "utf-8", "koi8-r"); for ($i=0;$i<count($codepages);$i++){ $cur_codepage = $codepages[$i]; break; } } if ( $cur_codepage != "UTF-8" || $cur_codepage != "" ) { $file = iconv($cur_codepage, "windows-1251", $file); } printPageSizeReport($pageSize); if ($url_status['content'] != 'text') { $file = extract_text($file, $url_status['content']); } printStandardReport('starting', $command_line); if ($md5sum == $newmd5sum) { printStandardReport('md5notChanged',$command_line); $OKtoIndex = 0; } else if (isDuplicateMD5($newmd5sum)) { $OKtoIndex = 0; printStandardReport('duplicate',$command_line); } if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) { $newdomain = $urlparts['host']; $type = 0; /* if ($newdomain <> $domain) $domainChanged = 1; if ($domaincb==1) { $start = strlen($newdomain) - strlen($supdomain); if (substr($newdomain, $start) == $supdomain) { $domainChanged = 0; } }*/ // remove link to css file //get all links from file $data = clean_file($file, $url, $url_status['content']); if ($data['noindex'] == 1) { $OKtoIndex = 0; $deletable = 1; printStandardReport('metaNoindex',$command_line); } $wordarray = unique_array (explode(" ", $data['content'])); if ($data['nofollow'] != 1) { $links = get_links($file, $url, $can_leave_domain, $data['base']); $links = distinct_array($links); $all_links = count($links); $numoflinks = 0; //if there are any, add to the temp table, but only if there isnt such url already while ($thislink = each($links)) { if ($tmp_urls[$thislink[1]] != 1) { $tmp_urls[$thislink[1]] = 1; $numoflinks++; mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink[1]', '$level', '$sessid')"); } } } } else { printStandardReport('noFollow',$command_line); } if ($OKtoIndex == 1) { $title = $data['title']; $host = $data['host']; $path = $data['path']; $fulltxt = $data['fulltext']; 
$desc = substr($data['description'], 0 ,254 ); $domain_for_db = $url_parts['host']; if (isset($domain_arr[$domain_for_db])) { $dom_id = $domain_arr[$domain_for_db]; } else { mysql_query("insert into ".$mysql_table_prefix."domains (domain) values ('$domain_for_db')"); $domain_arr[$domain_for_db] = $dom_id; } $wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords']); //if there are words to index, add the link to the database, get its id, and add the word + their relation if (is_array($wordarray) && count($wordarray) > $min_words_per_page) { if ($md5sum == '') { mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', $thislevel)"); $result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'"); $link_id = $row[0]; save_keywords($wordarray, $link_id, $dom_id); printStandardReport('indexed', $command_line); }else if (($md5sum <> '') && ($md5sum <> $newmd5sum)) { //if page has changed, start updating $result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'"); $link_id = $row[0]; for ($i=0;$i<=15; $i++) { mysql_query ("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id"); } save_keywords($wordarray, $link_id, $dom_id); $query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level=$thislevel where link_id=$link_id"; printStandardReport('re-indexed', $command_line); } }else { printStandardReport('minWords', $command_line); } } } } else { $deletable = 1; printUrlStatus($url_status['state'], $command_line); } if ($reindex ==1 && $deletable == 1) { check_for_removal($url); } else if ($reindex == 1) { } if (!isset($all_links)) { $all_links = 0; } if (!isset($numoflinks)) { $numoflinks = 0; } 
printLinksReport($numoflinks, $all_links, $command_line); } /* Crawls one site level by level: inserts or updates its row in the sites table, seeds the temp table with the start URL (or resumes from the pending table), then for each queued link that passes the check_robot_txt() omit list and check_include() calls index_url(). Clears the temp and pending rows at the end. NOTE(review): $sessid, $compurl, $urlparts, $row and $rows are used without a visible assignment, and the loop that fills $links is incomplete as pasted. */ function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) { global $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords; if (!isset($all_keywords)) { $result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords"); } } if ($compurl['path'] == '') $url = $url . "/"; $domain = $urlparts['host']; if (isset($urlparts['port'])) { $port = (int)$urlparts['port']; }else { $port = 80; } $result = mysql_query("select site_id from ".$mysql_table_prefix."sites where url='$url'"); $site_id = $row[0]; if ($site_id != "" && $reindex == 1) { mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')"); $result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id"); $site_link = $row['url']; $link_level = $row['level']; if ($site_link != $url) { mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$site_link', $link_level, '$sessid')"); } } $qry = "update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," . "disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id"; } else if ($site_id == '') { mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " . "values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', $can_leave_domain)"); $result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'"); $site_id = $row[0]; } else { mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," . 
"disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id"); } $result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id='$site_id'"); $pending = $row[0]; $level = 0; $domain_arr = get_domains(); if ($pending == '') { mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')"); } else if ($pending != '') { printStandardReport('continueSuspended',$command_line); mysql_query("select temp_id, level, count from ".$mysql_table_prefix."pending where site_id='$site_id'"); $sessid = $row[1]; $level = $row[2]; $pend_count = $row[3] + 1; $num = $row[4]; $pending = 1; $tmp_urls = get_temp_urls($sessid); } if ($reindex != 1) { mysql_query ("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id', '$sessid', '0', '0')"); } $omit = check_robot_txt($url); printHeader ($omit, $url, $command_line); $mainurl = $url; $num = 0; while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) { if ($pending == 1) { $count = $pend_count; $pending = 0; } else $count = 0; $result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid' order by link"); if ($rows == 0) { break; } $i = 0; $links[] = $row['link']; } while ($count < count($links)) { $num++; $thislink = $links[$count]; $forbidden = 0; foreach ($omit as $omiturl) { $omiturl = trim($omiturl); if ($omiturl_parts['scheme'] == '') { $check_omit = $urlparts['host'] . 
$omiturl; } else { $check_omit = $omiturl; } if (strpos($thislink, $check_omit)) { printRobotsReport($num, $thislink, $command_line); check_for_removal($thislink); $forbidden = 1; break; } } if (!check_include($thislink, $url_inc, $url_not_inc )) { printUrlStringReport($num, $thislink, $command_line); check_for_removal($thislink); $forbidden = 1; } if ($forbidden == 0) { printRetrieving($num, $thislink, $command_line); $query = "select md5sum, indexdate from ".$mysql_table_prefix."links where url='$thislink'"; if ($rows == 0) { index_url($thislink, $level+1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex); mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id"); }else if ($rows <> 0 && $reindex == 1) { $md5sum = $row['md5sum']; $indexdate = $row['indexdate']; index_url($thislink, $level+1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex); mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id"); }else { printStandardReport('inDatabase',$command_line); } } $count++; } $level++; } mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'"); mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'"); printStandardReport('completed',$command_line); } /* Selects url, spider_depth, required, disallowed and can_leave_domain from the sites table and calls index_site() with $reindex = 1; a spider_depth of -1 selects the 'full' option. NOTE(review): the row-fetch loop is not visible - $row is read without being assigned. */ function index_all() { global $mysql_table_prefix; $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites"); $url = $row[0]; $depth = $row[1]; $include = $row[2]; $not_include = $row[3]; $can_leave_domain = $row[4]; if ($can_leave_domain=='') { $can_leave_domain=0; } if ($depth == -1) { $soption = 'full'; } else { $soption = 'level'; } index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain); } } /* Returns $tmp_urls, an array keyed by link with value 1, built from temp-table rows whose id equals $sessid. NOTE(review): the fetch loop around the assignment is missing in this paste. */ function get_temp_urls ($sessid) { global $mysql_table_prefix; $result = mysql_query("select link from ".$mysql_table_prefix."temp 
where id='$sessid'"); $tmp_urls[$row[0]] = 1; } return $tmp_urls; } /* Returns $domains, an array mapping domain name to domain_id, built from the domains table. NOTE(review): the fetch loop around the assignment is missing in this paste. */ function get_domains () { global $mysql_table_prefix; $result = mysql_query("select domain_id, domain from ".$mysql_table_prefix."domains"); $domains[$row[1]] = $row[0]; } return $domains; //almost the end } function commandline_help() { print "Usage: php spider.php <options>\n\n"; print "Опции:\n"; print " -all\t\t Переиндексировать все\n"; print " -u <url>\t Set url to index\n"; print " -f\t\t Задать индексации глубину до полной (неограниченная глубина)\n"; print " -d <num>\t Задать индексации глубину <num>\n"; print " -l\t\t Разрешить паука, чтобы оставить первоначальный домен\n"; print " -r\t\t Задать паука, чтобы индексировать сайт\n"; print " -m <string>\t Задать строке (S), что URL должен включать (используйте \\n в качестве разделителя между несколькими строками)\n"; print " -n <string>\t Задать строке (S), что URL не должен включать (используйте \\n в качестве разделителя между несколькими строками)\n"; } printStandardReport('quit',$command_line); if ($email_log) { $indexed = ($all==1) ? 'ALL' : $url; $log_report = ""; if ($log_handle) { $log_report = "Лог сохраняются в $log_file"; } mail($admin_email, "Доклад Magic Search:", "Magic Search закончил индексирование $indexed at ".date("y-m-d H:i:s").". ".$log_report); } if ( $log_handle) { } ?>
Но этот паук почему-то индексирует только тот сайт, который введёшь: даже если на нём есть ссылки на другие сайты, он по ним не переходит.
В чём дело? (Если чего-то недостаточно, могу выложить весь движок.)
|