Back to Home


<?php
// PHP XML sitemap generator


/**
 * Crawl the site starting at $CONFIG["baseurl"] and write the XML sitemap.
 *
 * Seeds the pending-URL queue $SPIDER["temp"] with the base URL, then keeps
 * handing batches of at most four pending URLs to multiGetURL() until the
 * queue is empty. multiGetURL() may append newly discovered URLs to the
 * queue, which is why the queue size is re-checked on every round. Finally
 * the string returned by genXmlSitemap() is written to
 * $CONFIG["sitemap_file"].
 *
 * Reads:  $CONFIG["baseurl"], $CONFIG["sitemap_file"]
 * Writes: $SPIDER["temp"], $SPIDER["baseurl"]
 *
 * If the sitemap file cannot be opened, an E_USER_WARNING is raised and
 * nothing is written.
 */
function spider() {
	global $CONFIG, $SPIDER;

	$SPIDER["temp"][0] = $CONFIG["baseurl"];
	$SPIDER["baseurl"] = $CONFIG["baseurl"];

	while(sizeof($SPIDER["temp"]) > 0) {
		// Start every round with an empty batch. Without this reset the batch
		// keeps growing and every previously fetched URL is fetched (and
		// recorded) again on each round.
		$urls = array();
		while(sizeof($urls) < 4 && sizeof($SPIDER["temp"]) > 0) {
			$urls[] = array_pop($SPIDER["temp"]);
		}
		multiGetURL($urls);
	}

	$fp = fopen($CONFIG["sitemap_file"], "w+");
	if($fp === false) {
		// fopen() has already emitted its own warning; add one that names the
		// configured target and stop instead of writing to an invalid handle.
		trigger_error("Cannot open sitemap file for writing: ".$CONFIG["sitemap_file"], E_USER_WARNING);
		return;
	}
	$xml_sitemap = genXmlSitemap();
	fputs($fp, $xml_sitemap);
	fclose($fp);
}

/**
 * Queue a link for crawling if it is a new, internal http URL.
 *
 * The link is appended to $SPIDER["temp"] only when all of these hold:
 *   - its scheme is exactly "http",
 *   - urlInSpider() does not already know it,
 *   - isLinkExternal() reports it as internal relative to $SPIDER["baseurl"],
 *   - urlInTemp() does not already have it queued.
 *
 * NOTE(review): "https" links are not followed, matching the original
 * behaviour; confirm whether that is still intended.
 *
 * @param string $html Markup of the page the link came from (not used here).
 * @param string $href Link target to examine.
 * @param string $url  Address of the page the link came from (not used here).
 * @return false|null  false for "javascript:" links, no value otherwise.
 */
function handleHref($html, $href, $url) {
	global $SPIDER;

	// parse_url() returns false for seriously malformed URLs and omits the
	// "scheme" key for relative ones; treat both as "no scheme" instead of
	// reading a missing array offset.
	$url_info = parse_url($href);
	$scheme = "";
	if(is_array($url_info) && isset($url_info["scheme"])) {
		$scheme = $url_info["scheme"];
	}

	if($scheme == "javascript") {
		return false;
	}
	if($scheme != "http") {
		return;
	}
	if(urlInSpider($href)) {
		return;
	}
	if(isLinkExternal($href, $SPIDER["baseurl"])) {
		return;
	}
	if(!urlInTemp($href)) {
		$SPIDER["temp"][] = $href;
	}
}

/**
 * Pass the link target of every anchor tag on a page to handleHref().
 *
 * Anchor tags are extracted with getTags(), their href field is read with
 * getTagField() and run through correctUrl() against $SPIDER["baseurl"]
 * before being handed on.
 *
 * @param string $url  Address of the page being scanned.
 * @param string $html Markup of that page.
 */
function getAnchors($url, $html) {
	global $SPIDER;

	$tags = getTags($html, '<a', '>');
	$total = sizeof($tags);
	$idx = 0;
	while($idx < $total) {
		$raw = getTagField($tags[$idx], "href=");
		handleHref($html, correctUrl($raw, $SPIDER["baseurl"]), $url);
		$idx++;
	}
}

/**
 * Fetch a batch of URLs and scan each page that came back non-empty.
 *
 * All URLs are fetched through curlMultiGetPage(). A progress line is echoed
 * for every URL; those whose result does not compare equal to "" are
 * appended to $SPIDER["spider"] and their markup is passed to getAnchors().
 *
 * @param array $urls URLs to fetch, indexed from 0.
 */
function multiGetURL($urls) {
	global $SPIDER;

	$pages = curlMultiGetPage($urls);
	$total = sizeof($urls);
	for($n = 0; $n < $total; $n++) {
		$current = $urls[$n];
		echo "Checking ", $current, " ...\n";
		if($pages[$n] == "") {
			// Nothing came back for this URL; skip it.
			continue;
		}
		$SPIDER["spider"][] = $current;
		getAnchors($current, $pages[$n]);
	}
}

/**
 * Tell whether a URL is already listed in $SPIDER["spider"].
 *
 * Returns false when that list does not exist yet, instead of passing a
 * non-array to in_array(). The comparison is strict, so only an identical
 * string matches.
 *
 * @param string $url URL to look up.
 * @return bool
 */
function urlInSpider($url) {
	global $SPIDER;
	if(!isset($SPIDER["spider"]) || !is_array($SPIDER["spider"])) {
		return false;
	}
	return in_array($url, $SPIDER["spider"], true);
}

/**
 * Tell whether a URL is already waiting in the queue $SPIDER["temp"].
 *
 * Returns false when the queue does not exist yet, instead of passing a
 * non-array to in_array(). The comparison is strict, so only an identical
 * string matches.
 *
 * @param string $url URL to look up.
 * @return bool
 */
function urlInTemp($url) {
	global $SPIDER;
	if(!isset($SPIDER["temp"]) || !is_array($SPIDER["temp"])) {
		return false;
	}
	return in_array($url, $SPIDER["temp"], true);
}

?>