X-Git-Url: https://git.cweiske.de/phinde.git/blobdiff_plain/f67e8f0bc3f51f2d280a86a8c7cffa68d812efe1..cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8:/bin/crawl.php diff --git a/bin/crawl.php b/bin/crawl.php index e39a622..e9a6218 100755 --- a/bin/crawl.php +++ b/bin/crawl.php @@ -3,136 +3,23 @@ namespace phinde; require_once __DIR__ . '/../src/init.php'; -$supportedCrawlTypes = array( - 'text/html', 'application/xhtml+xml' -); - - if ($argc < 2) { echo "No URL given\n"; exit(1); } -$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); - $url = $argv[1]; +$url = Helper::addSchema($url); if (!Helper::isUrlAllowed($url)) { echo "Domain is not allowed; not crawling\n"; exit(2); } - -$req = new \HTTP_Request2($url); -//FIXME: send supported mime types in header -$res = $req->send(); -if ($res->getStatus() !== 200) { - echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; - exit(3); -} -$mimetype = explode(';', $res->getHeader('content-type'))[0]; -if (!in_array($mimetype, $supportedCrawlTypes)) { - echo "MIME type not supported for crawling: $mimetype\n"; - exit(4); -} - -//FIXME: mime type switch for cdata -$doc = new \DOMDocument(); -//@ to hide parse warning messages in invalid html -@$doc->loadHTMLFile($url); - -//FIXME: extract base url from html -$base = new \Net_URL2($url); - -$xpath = new \DOMXPath($doc); -$links = $xpath->evaluate('//a'); -//FIXME: link rel, img, video - -$alreadySeen = array(); - -foreach ($links as $link) { - $linkTitle = $link->textContent; - $href = ''; - foreach ($link->attributes as $attribute) { - if ($attribute->name == 'href') { - $href = $attribute->textContent; - } - } - if ($href == '' || $href{0} == '#') { - //link on this page - continue; - } - - $linkUrlObj = $base->resolve($href); - $linkUrlObj->setFragment(false); - $linkUrl = (string) $linkUrlObj; - if (isset($alreadySeen[$linkUrl])) { - continue; - } - - switch ($linkUrlObj->getScheme()) { - case 'http': - case 'https': - break; - default: - continue 2; - } - - if ($es->isKnown($linkUrl)) { - continue; - } - - //FIXME: check target type - //FIXME: check nofollow - //var_dump($linkTitle, $linkUrl); - $es->markQueued($linkUrl); - addToIndex($linkUrl, $linkTitle, $url); - if (Helper::isUrlAllowed($linkUrl)) { - addToCrawl($linkUrl); - } - $alreadySeen[$linkUrl] = true; -} - -function addToIndex($linkUrl, $linkTitle, $sourceUrl) -{ - echo "Queuing for indexing: $linkUrl\n"; - $gmclient = new \GearmanClient(); - $gmclient->addServer('127.0.0.1'); - $gmclient->doBackground( - 'phinde_index', - serialize( - array( - 'url' => $linkUrl, - 'title' => $linkTitle, - 'source' => $sourceUrl - ) - ) - ); - if ($gmclient->returnCode() != GEARMAN_SUCCESS) { - echo 'Error queueing URL indexing for ' - . $linkUrl . "\n" - . 'Error code: ' . $gmclient->returnCode() . "\n"; - exit(2); - } -} - -function addToCrawl($linkUrl) -{ - echo "Queuing for crawling: $linkUrl\n"; - $gmclient = new \GearmanClient(); - $gmclient->addServer('127.0.0.1'); - $gmclient->doBackground( - 'phinde_crawl', - serialize( - array( - 'url' => $linkUrl - ) - ) - ); - if ($gmclient->returnCode() != GEARMAN_SUCCESS) { - echo 'Error queueing URL crawling for ' - . $linkUrl . "\n" - . 'Error code: ' . $gmclient->returnCode() . "\n"; - exit(2); - } +try { + $crawler = new Crawler(); + $crawler->crawl($url); +} catch (\Exception $e) { + echo $e->getMessage() . "\n"; + exit(10); } ?> \ No newline at end of file