diff options
Diffstat (limited to 'bin')
| -rwxr-xr-x | bin/crawl.php | 47 | ||||
| -rwxr-xr-x | bin/index.php | 245 | ||||
| -rwxr-xr-x | bin/phinde-worker.php | 55 | ||||
| -rwxr-xr-x | bin/process.php | 95 |
4 files changed, 108 insertions, 334 deletions
diff --git a/bin/crawl.php b/bin/crawl.php deleted file mode 100755 index 0d57bb3..0000000 --- a/bin/crawl.php +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env php -<?php -namespace phinde; -require_once __DIR__ . '/../src/init.php'; - -$cc = new \Console_CommandLine(); -$cc->description = 'phinde URL crawler'; -$cc->version = '0.0.1'; -$cc->addOption( - 'showLinksOnly', - array( - 'short_name' => '-s', - 'long_name' => '--show-links', - 'description' => 'Only show which URLs were found', - 'action' => 'StoreTrue', - 'default' => false - ) -); -$cc->addArgument( - 'url', - array( - 'description' => 'URL to crawl', - 'multiple' => false - ) -); -try { - $res = $cc->parse(); -} catch (\Exception $e) { - $cc->displayError($e->getMessage()); -} - -$url = $res->args['url']; -$url = Helper::addSchema($url); -if (!Helper::isUrlAllowed($url)) { - echo "Domain is not allowed; not crawling\n"; - exit(2); -} - -try { - $crawler = new Crawler(); - $crawler->setShowLinksOnly($res->options['showLinksOnly']); - $crawler->crawl($url); -} catch (\Exception $e) { - echo $e->getMessage() . "\n"; - exit(10); -} -?>
\ No newline at end of file diff --git a/bin/index.php b/bin/index.php deleted file mode 100755 index 5985a3e..0000000 --- a/bin/index.php +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/env php -<?php -namespace phinde; -// index a given URL -require_once __DIR__ . '/../src/init.php'; - -$supportedIndexTypes = array( - 'application/xhtml+xml', - 'text/html', -); - -if ($argc < 2) { - echo "No URL given\n"; - exit(1); -} - -function removeTags($doc, $tag) { - $elems = array(); - foreach ($doc->getElementsbyTagName($tag) as $elem) { - $elems[] = $elem; - } - foreach ($elems as $elem) { - $elem->parentNode->removeChild($elem); - } -} - -$url = $argv[1]; - -$req = new \HTTP_Request2($url); -$req->setConfig('follow_redirects', true); -$req->setConfig('connect_timeout', 5); -$req->setConfig('timeout', 10); -$req->setConfig('ssl_verify_peer', false); -//FIXME: size limit - -$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); -$existingDoc = $es->get($url); -if ($existingDoc && $existingDoc->status == 'indexed') { - $nMoDate = strtotime($existingDoc->modate); - $refreshtime = $GLOBALS['phinde']['refreshtime']; - if (time() - $nMoDate < $refreshtime) { - echo "URL already indexed less than $refreshtime seconds ago: $url\n"; - exit(0); - } - - $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate)); -} -//FIXME: sourcetitle, sourcelink - -$res = $req->send(); -//FIXME: try-catch - -if ($res->getStatus() === 304) { - //not modified since last time - //FIXME: store "last try" time - exit(0); -} else if ($res->getStatus() !== 200) { - //FIXME: delete if 401 gone or 404 when updating - echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; - //FIXME: update status - exit(3); -} - -$mimetype = explode(';', $res->getHeader('content-type'))[0]; -if (!in_array($mimetype, $supportedIndexTypes)) { - echo "MIME type not supported for indexing: $mimetype\n"; - //FIXME: update status - exit(4); -} - - -//FIXME: update index only if changed since last index time -//FIXME: extract base url from html -//FIXME: check if effective url needs updating -$url = $res->getEffectiveUrl(); -$base = new \Net_URL2($url); - -$indexDoc = new \stdClass(); - -//FIXME: MIME type switch -$doc = new \DOMDocument(); -//@ to hide parse warning messages in invalid html -@$doc->loadHTML($res->getBody()); -$dx = new \DOMXPath($doc); - -$xbase = $dx->evaluate('/html/head/base[@href]')->item(0); -if ($xbase) { - $base = $base->resolve( - $xbase->attributes->getNamedItem('href')->textContent - ); -} - -$meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]') - ->item(0); -if ($meta) { - $robots = $meta->attributes->getNamedItem('content')->textContent; - foreach (explode(',', $robots) as $value) { - if (trim($value) == 'noindex') { - echo "URL does not want to be indexed: $url\n"; - exit(0); - } - } -} - -//remove script tags -removeTags($doc, 'script'); -removeTags($doc, 'style'); -removeTags($doc, 'nav'); - -//default content: <body> -$xpContext = $doc->getElementsByTagName('body')->item(0); -//FIXME: follow meta refresh, no body -// example: https://www.gnu.org/software/coreutils/ - -//use microformats content if it exists -$xpElems = $dx->query( - "//*[contains(concat(' ', normalize-space(@class), ' '), ' e-content ')]" -); -if ($xpElems->length) { - $xpContext = $xpElems->item(0); -} else if ($doc->getElementById('content')) { - //if there is an element with ID "content", we'll use this - $xpContext = $doc->getElementById('content'); -} - -$indexDoc->url = $url; -$indexDoc->schemalessUrl = Helper::noSchema($url); -$indexDoc->type = 'html'; -$indexDoc->subtype = ''; -$indexDoc->mimetype = $mimetype; -$indexDoc->domain = parse_url($url, PHP_URL_HOST); - -//$indexDoc->source = 'FIXME'; -//$indexDoc->sourcetitle = 'FIXME'; - -$indexDoc->author = new \stdClass(); - -$arXpElems = $dx->query('/html/head/meta[@name="author" and @content]'); -if ($arXpElems->length) { - $indexDoc->author->name = trim( - $arXpElems->item(0)->attributes->getNamedItem('content')->textContent - ); -} -$arXpElems = $dx->query('/html/head/link[@rel="author" and @href]'); -if ($arXpElems->length) { - $indexDoc->author->url = trim( - $base->resolve( - $arXpElems->item(0)->attributes->getNamedItem('href')->textContent - ) - ); -} - - -$arXpElems = $dx->query('/html/head/title'); -if ($arXpElems->length) { - $indexDoc->title = trim( - $arXpElems->item(0)->textContent - ); -} - -foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) { - $indexDoc->$headlinetype = array(); - foreach ($xpContext->getElementsByTagName($headlinetype) as $xheadline) { - array_push( - $indexDoc->$headlinetype, - trim($xheadline->textContent) - ); - } -} - -//FIXME: split paragraphs -//FIXME: insert space after br -$indexDoc->text = array(); -$indexDoc->text[] = trim( - str_replace( - array("\r\n", "\n", "\r", ' '), - ' ', - $xpContext->textContent - ) -); - -//tags -$tags = array(); -foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) { - $keywords = $xkeywords->attributes->getNamedItem('content')->textContent; - foreach (explode(',', $keywords) as $keyword) { - $tags[trim($keyword)] = true; - } -} -$indexDoc->tags = array_keys($tags); - -//dates -$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]'); -if ($arXpdates->length) { - $indexDoc->crdate = date( - 'c', - strtotime( - $arXpdates->item(0)->attributes->getNamedItem('content')->textContent - ) - ); -} -//FIXME: keep creation date from database, or use modified date if we -// do not have it there - -$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]'); -if ($arXpdates->length) { - $indexDoc->modate = date( - 'c', - strtotime( - $arXpdates->item(0)->attributes->getNamedItem('content')->textContent - ) - ); -} else { - $lm = $res->getHeader('last-modified'); - if ($lm !== null) { - $indexDoc->modate = date('c', strtotime($lm)); - } else { - //use current time since we don't have any other data - $indexDoc->modate = date('c'); - } -} - -//language -//there may be "en-US" and "de-DE" -$xlang = $doc->documentElement->attributes->getNamedItem('lang'); -if ($xlang) { - $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2)); -} -//FIXME: fallback, autodetection -//FIXME: check noindex - -//var_dump($indexDoc);die(); - -$indexDoc->status = 'indexed'; - -//FIXME: update index if it exists already -$r = new Elasticsearch_Request( - $GLOBALS['phinde']['elasticsearch'] . 'document/' . rawurlencode($url), - \HTTP_Request2::METHOD_PUT -); -$r->setBody(json_encode($indexDoc)); -$r->send(); - - -?> diff --git a/bin/phinde-worker.php b/bin/phinde-worker.php index 939db1f..1e94535 100755 --- a/bin/phinde-worker.php +++ b/bin/phinde-worker.php @@ -6,51 +6,22 @@ chdir(dirname($argv[0])); require_once __DIR__ . '/../src/init.php'; -$cc = new \Console_CommandLine(); -$cc->description = 'phinde queue worker'; -$cc->version = '0.0.1'; -$cc->addArgument( - 'queues', - array( - 'description' => 'Queue(s) to process', - 'multiple' => true, - 'default' => array('crawl', 'index'), - 'choices' => array('crawl', 'index'), - 'optional' => true, - ) -); -try { - $res = $cc->parse(); -} catch (\Exception $e) { - $cc->displayError($e->getMessage()); -} - -$queues = array_flip(array_unique($res->args['queues'])); - $gmworker = new \GearmanWorker(); $gmworker->addServer('127.0.0.1'); -if (isset($queues['crawl'])) { - $gmworker->addFunction( - $GLOBALS['phinde']['queuePrefix'] . 'phinde_crawl', - function(\GearmanJob $job) { - $data = unserialize($job->workload()); - echo "-- Crawling " . $data['url'] . "\n"; - passthru('./crawl.php ' . escapeshellarg($data['url'])); - } - ); -} -if (isset($queues['index'])) { - $gmworker->addFunction( - $GLOBALS['phinde']['queuePrefix'] . 'phinde_index', - function(\GearmanJob $job) { - $data = unserialize($job->workload()); - echo "-- Indexing " . $data['url'] . "\n"; - passthru('./index.php ' . escapeshellarg($data['url'])); - //exit(); - } - ); -} +$gmworker->addFunction( + $GLOBALS['phinde']['queuePrefix'] . 'phinde_process', + function(\GearmanJob $job) { + $data = unserialize($job->workload()); + echo "-- Processing " . $data['url'] + . ' (' . implode(',', $data['actions']) . ')' + . "\n"; + passthru( + './process.php ' . escapeshellarg($data['url']) + . ' ' . implode(' ', $data['actions']) + ); + } +); while ($gmworker->work()) { if ($gmworker->returnCode() != GEARMAN_SUCCESS) { diff --git a/bin/process.php b/bin/process.php new file mode 100755 index 0000000..ababb03 --- /dev/null +++ b/bin/process.php @@ -0,0 +1,95 @@ +#!/usr/bin/env php +<?php +namespace phinde; +require_once __DIR__ . '/../src/init.php'; + +$cc = new \Console_CommandLine(); +$cc->description = 'phinde URL processor'; +$cc->version = '0.0.1'; +$cc->addOption( + 'force', + array( + 'short_name' => '-f', + 'long_name' => '--force', + 'description' => 'Always process URL, even when it did not change', + 'action' => 'StoreTrue', + 'default' => false + ) +); +$cc->addOption( + 'showLinksOnly', + array( + 'short_name' => '-s', + 'long_name' => '--show-links', + 'description' => 'Only show which URLs were found', + 'action' => 'StoreTrue', + 'default' => false + ) +); +$cc->addArgument( + 'url', + array( + 'description' => 'URL to process', + 'multiple' => false + ) +); +$cc->addArgument( + 'actions', + array( + 'description' => 'Actions to take', + 'multiple' => true, + 'optional' => true, + 'choices' => array('index', 'crawl'), + 'default' => array('index', 'crawl'), + ) +); +try { + $res = $cc->parse(); +} catch (\Exception $e) { + $cc->displayError($e->getMessage()); +} + +$url = $res->args['url']; +$url = Helper::addSchema($url); +$urlObj = new \Net_URL2($url); +$url = $urlObj->getNormalizedURL(); +if (!Helper::isUrlAllowed($url)) { + echo "Domain is not allowed; not crawling\n"; + exit(2); +} + +try { + $actions = array(); + foreach ($res->args['actions'] as $action) { + if ($action == 'crawl') { + $crawler = new Crawler(); + $crawler->setShowLinksOnly($res->options['showLinksOnly']); + $actions[$action] = $crawler; + } else if ($action == 'index') { + $actions[$action] = new Indexer(); + } + } + + $fetcher = new Fetcher(); + $retrieved = $fetcher->fetch($url, $actions, $res->options['force']); + if ($retrieved === false) { + exit(0); + } + + $update = false; + foreach ($actions as $key => $action) { + echo "step: $key\n"; + $update |= $action->run($retrieved); + } + + if ($update) { + //FIXME: update index if it exists already + $fetcher->storeDoc($retrieved->url, $retrieved->esDoc); + } else { + echo "Not updating\n"; + } +} catch (\Exception $e) { + echo $e->getMessage() . "\n"; + exit(10); +} +?>
\ No newline at end of file |
