#!/usr/bin/env php send();
// NOTE(review): the line above is reproduced verbatim from this view. The
// opening PHP tag and the setup code that precedes `send()` (presumably where
// $url, $es, $res and $supportedCrawlTypes are defined) are not visible here;
// confirm against the complete file before merging.

// Only continue with pages that were fetched successfully.
if ($res->getStatus() !== 200) {
    echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n";
    exit(3);
}

// Strip parameters such as "; charset=utf-8" from the content type.
// The string cast keeps explode() from receiving a non-string when the header
// is absent; trim() tolerates optional whitespace in front of the semicolon.
$mimetype = trim(explode(';', (string) $res->getHeader('content-type'))[0]);
if (!in_array($mimetype, $supportedCrawlTypes)) {
    echo "MIME type not supported for crawling: $mimetype\n";
    exit(4);
}

//FIXME: mime type switch for cdata
$doc = new \DOMDocument();
//@ to hide parse warning messages in invalid html
//NOTE(review): this loads $url a second time instead of reusing $res; confirm
// whether the already fetched response body could be parsed instead.
@$doc->loadHTMLFile($url);

//FIXME: extract base url from html
$base = new \Net_URL2($url);

$xpath = new \DOMXPath($doc);
$links = $xpath->evaluate('//a');
//FIXME: link rel, img, video

// Normalised link URLs already handled during this run (URL => true).
$alreadySeen = [];

foreach ($links as $link) {
    $linkTitle = $link->textContent;

    // getAttribute() returns an empty string when the attribute is missing.
    $href = $link->getAttribute('href');
    if ($href === '' || $href[0] === '#') {
        //link on this page
        continue;
    }

    // Make the link absolute and drop the fragment so that "page#a" and
    // "page#b" count as the same URL.
    $linkUrlObj = $base->resolve($href);
    $linkUrlObj->setFragment(false);
    $linkUrl = (string) $linkUrlObj;

    if (isset($alreadySeen[$linkUrl])) {
        continue;
    }
    // Remember the URL right away so repeated links that get skipped below
    // do not trigger another isKnown() lookup.
    $alreadySeen[$linkUrl] = true;

    // Only web links are of interest; skips mailto:, javascript:, ftp: etc.
    if (!in_array($linkUrlObj->getScheme(), ['http', 'https'], true)) {
        continue;
    }

    if ($es->isKnown($linkUrl)) {
        continue;
    }

    //FIXME: check target type
    //FIXME: check nofollow
    $es->markQueued($linkUrl);
    addToIndex($linkUrl, $linkTitle, $url);
    if (Helper::isUrlAllowed($linkUrl)) {
        addToCrawl($linkUrl);
    }
}

/**
 * Queue a URL for indexing through the "phinde_index" Gearman function.
 *
 * Prints a progress line and terminates the script with exit code 2 when
 * the job cannot be queued.
 *
 * @param string $linkUrl   Absolute URL that shall be indexed
 * @param string $linkTitle Text content of the link pointing to the URL
 * @param string $sourceUrl URL of the page the link was found on
 *
 * @return void
 */
function addToIndex($linkUrl, $linkTitle, $sourceUrl)
{
    echo "Queuing for indexing: $linkUrl\n";
    queueGearmanJob(
        'phinde_index',
        [
            'url'    => $linkUrl,
            'title'  => $linkTitle,
            'source' => $sourceUrl
        ],
        'indexing',
        $linkUrl
    );
}

/**
 * Queue a URL for crawling through the "phinde_crawl" Gearman function.
 *
 * Prints a progress line and terminates the script with exit code 2 when
 * the job cannot be queued.
 *
 * @param string $linkUrl Absolute URL that shall be crawled
 *
 * @return void
 */
function addToCrawl($linkUrl)
{
    echo "Queuing for crawling: $linkUrl\n";
    queueGearmanJob(
        'phinde_crawl',
        ['url' => $linkUrl],
        'crawling',
        $linkUrl
    );
}

/**
 * Submit a background job to the Gearman server on 127.0.0.1.
 *
 * The payload is passed to the job as a PHP-serialized string. If the client
 * reports anything other than GEARMAN_SUCCESS, an error message is printed
 * and the script exits with code 2.
 *
 * @param string $function Name of the Gearman function to call
 * @param array  $payload  Job data
 * @param string $activity Word used in the error message
 *                         ("indexing" or "crawling")
 * @param string $linkUrl  URL the job is about; only used in the error message
 *
 * @return void
 */
function queueGearmanJob($function, array $payload, $activity, $linkUrl)
{
    $gmclient = new \GearmanClient();
    $gmclient->addServer('127.0.0.1');
    $gmclient->doBackground($function, serialize($payload));

    if ($gmclient->returnCode() !== GEARMAN_SUCCESS) {
        echo 'Error queueing URL ' . $activity . ' for ' . $linkUrl . "\n"
            . 'Error code: ' . $gmclient->returnCode() . "\n";
        exit(2);
    }
}
?>