From: Christian Weiske Date: Mon, 7 Nov 2016 20:41:36 +0000 (+0100) Subject: Big patch merging crawling+indexing into one command, new json document structure X-Git-Tag: v0.2.0~16 X-Git-Url: https://git.cweiske.de/phinde.git/commitdiff_plain/d7651fd96dcfa2829519504e4c8ec1ce511cd57f Big patch merging crawling+indexing into one command, new json document structure --- diff --git a/bin/crawl.php b/bin/crawl.php deleted file mode 100755 index 0d57bb3..0000000 --- a/bin/crawl.php +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env php -description = 'phinde URL crawler'; -$cc->version = '0.0.1'; -$cc->addOption( - 'showLinksOnly', - array( - 'short_name' => '-s', - 'long_name' => '--show-links', - 'description' => 'Only show which URLs were found', - 'action' => 'StoreTrue', - 'default' => false - ) -); -$cc->addArgument( - 'url', - array( - 'description' => 'URL to crawl', - 'multiple' => false - ) -); -try { - $res = $cc->parse(); -} catch (\Exception $e) { - $cc->displayError($e->getMessage()); -} - -$url = $res->args['url']; -$url = Helper::addSchema($url); -if (!Helper::isUrlAllowed($url)) { - echo "Domain is not allowed; not crawling\n"; - exit(2); -} - -try { - $crawler = new Crawler(); - $crawler->setShowLinksOnly($res->options['showLinksOnly']); - $crawler->crawl($url); -} catch (\Exception $e) { - echo $e->getMessage() . "\n"; - exit(10); -} -?> \ No newline at end of file diff --git a/bin/index.php b/bin/index.php deleted file mode 100755 index 5985a3e..0000000 --- a/bin/index.php +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/env php -getElementsbyTagName($tag) as $elem) { - $elems[] = $elem; - } - foreach ($elems as $elem) { - $elem->parentNode->removeChild($elem); - } -} - -$url = $argv[1]; - -$req = new \HTTP_Request2($url); -$req->setConfig('follow_redirects', true); -$req->setConfig('connect_timeout', 5); -$req->setConfig('timeout', 10); -$req->setConfig('ssl_verify_peer', false); -//FIXME: size limit - -$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); -$existingDoc = $es->get($url); -if ($existingDoc && $existingDoc->status == 'indexed') { - $nMoDate = strtotime($existingDoc->modate); - $refreshtime = $GLOBALS['phinde']['refreshtime']; - if (time() - $nMoDate < $refreshtime) { - echo "URL already indexed less than $refreshtime seconds ago: $url\n"; - exit(0); - } - - $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate)); -} -//FIXME: sourcetitle, sourcelink - -$res = $req->send(); -//FIXME: try-catch - -if ($res->getStatus() === 304) { - //not modified since last time - //FIXME: store "last try" time - exit(0); -} else if ($res->getStatus() !== 200) { - //FIXME: delete if 401 gone or 404 when updating - echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; - //FIXME: update status - exit(3); -} - -$mimetype = explode(';', $res->getHeader('content-type'))[0]; -if (!in_array($mimetype, $supportedIndexTypes)) { - echo "MIME type not supported for indexing: $mimetype\n"; - //FIXME: update status - exit(4); -} - - -//FIXME: update index only if changed since last index time -//FIXME: extract base url from html -//FIXME: check if effective url needs updating -$url = $res->getEffectiveUrl(); -$base = new \Net_URL2($url); - -$indexDoc = new \stdClass(); - -//FIXME: MIME type switch -$doc = new \DOMDocument(); -//@ to hide parse warning messages in invalid html -@$doc->loadHTML($res->getBody()); -$dx = new \DOMXPath($doc); - -$xbase = $dx->evaluate('/html/head/base[@href]')->item(0); -if ($xbase) { - $base = $base->resolve( - $xbase->attributes->getNamedItem('href')->textContent - ); -} - -$meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]') - ->item(0); -if ($meta) { - $robots = $meta->attributes->getNamedItem('content')->textContent; - foreach (explode(',', $robots) as $value) { - if (trim($value) == 'noindex') { - echo "URL does not want to be indexed: $url\n"; - exit(0); - } - } -} - -//remove script tags -removeTags($doc, 'script'); -removeTags($doc, 'style'); -removeTags($doc, 'nav'); - -//default content: -$xpContext = $doc->getElementsByTagName('body')->item(0); -//FIXME: follow meta refresh, no body -// example: https://www.gnu.org/software/coreutils/ - -//use microformats content if it exists -$xpElems = $dx->query( - "//*[contains(concat(' ', normalize-space(@class), ' '), ' e-content ')]" -); -if ($xpElems->length) { - $xpContext = $xpElems->item(0); -} else if ($doc->getElementById('content')) { - //if there is an element with ID "content", we'll use this - $xpContext = $doc->getElementById('content'); -} - -$indexDoc->url = $url; -$indexDoc->schemalessUrl = Helper::noSchema($url); -$indexDoc->type = 'html'; -$indexDoc->subtype = ''; -$indexDoc->mimetype = $mimetype; -$indexDoc->domain = parse_url($url, PHP_URL_HOST); - -//$indexDoc->source = 'FIXME'; -//$indexDoc->sourcetitle = 'FIXME'; - -$indexDoc->author = new \stdClass(); - -$arXpElems = $dx->query('/html/head/meta[@name="author" and @content]'); -if ($arXpElems->length) { - $indexDoc->author->name = trim( - $arXpElems->item(0)->attributes->getNamedItem('content')->textContent - ); -} -$arXpElems = $dx->query('/html/head/link[@rel="author" and @href]'); -if ($arXpElems->length) { - $indexDoc->author->url = trim( - $base->resolve( - $arXpElems->item(0)->attributes->getNamedItem('href')->textContent - ) - ); -} - - -$arXpElems = $dx->query('/html/head/title'); -if ($arXpElems->length) { - $indexDoc->title = trim( - $arXpElems->item(0)->textContent - ); -} - -foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) { - $indexDoc->$headlinetype = array(); - foreach ($xpContext->getElementsByTagName($headlinetype) as $xheadline) { - array_push( - $indexDoc->$headlinetype, - trim($xheadline->textContent) - ); - } -} - -//FIXME: split paragraphs -//FIXME: insert space after br -$indexDoc->text = array(); -$indexDoc->text[] = trim( - str_replace( - array("\r\n", "\n", "\r", ' '), - ' ', - $xpContext->textContent - ) -); - -//tags -$tags = array(); -foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) { - $keywords = $xkeywords->attributes->getNamedItem('content')->textContent; - foreach (explode(',', $keywords) as $keyword) { - $tags[trim($keyword)] = true; - } -} -$indexDoc->tags = array_keys($tags); - -//dates -$arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]'); -if ($arXpdates->length) { - $indexDoc->crdate = date( - 'c', - strtotime( - $arXpdates->item(0)->attributes->getNamedItem('content')->textContent - ) - ); -} -//FIXME: keep creation date from database, or use modified date if we -// do not have it there - -$arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]'); -if ($arXpdates->length) { - $indexDoc->modate = date( - 'c', - strtotime( - $arXpdates->item(0)->attributes->getNamedItem('content')->textContent - ) - ); -} else { - $lm = $res->getHeader('last-modified'); - if ($lm !== null) { - $indexDoc->modate = date('c', strtotime($lm)); - } else { - //use current time since we don't have any other data - $indexDoc->modate = date('c'); - } -} - -//language -//there may be "en-US" and "de-DE" -$xlang = $doc->documentElement->attributes->getNamedItem('lang'); -if ($xlang) { - $indexDoc->language = strtolower(substr($xlang->textContent, 0, 2)); -} -//FIXME: fallback, autodetection -//FIXME: check noindex - -//var_dump($indexDoc);die(); - -$indexDoc->status = 'indexed'; - -//FIXME: update index if it exists already -$r = new Elasticsearch_Request( - $GLOBALS['phinde']['elasticsearch'] . 'document/' . rawurlencode($url), - \HTTP_Request2::METHOD_PUT -); -$r->setBody(json_encode($indexDoc)); -$r->send(); - - -?> diff --git a/bin/phinde-worker.php b/bin/phinde-worker.php index 939db1f..1e94535 100755 --- a/bin/phinde-worker.php +++ b/bin/phinde-worker.php @@ -6,51 +6,22 @@ chdir(dirname($argv[0])); require_once __DIR__ . '/../src/init.php'; -$cc = new \Console_CommandLine(); -$cc->description = 'phinde queue worker'; -$cc->version = '0.0.1'; -$cc->addArgument( - 'queues', - array( - 'description' => 'Queue(s) to process', - 'multiple' => true, - 'default' => array('crawl', 'index'), - 'choices' => array('crawl', 'index'), - 'optional' => true, - ) -); -try { - $res = $cc->parse(); -} catch (\Exception $e) { - $cc->displayError($e->getMessage()); -} - -$queues = array_flip(array_unique($res->args['queues'])); - $gmworker = new \GearmanWorker(); $gmworker->addServer('127.0.0.1'); -if (isset($queues['crawl'])) { - $gmworker->addFunction( - $GLOBALS['phinde']['queuePrefix'] . 'phinde_crawl', - function(\GearmanJob $job) { - $data = unserialize($job->workload()); - echo "-- Crawling " . $data['url'] . "\n"; - passthru('./crawl.php ' . escapeshellarg($data['url'])); - } - ); -} -if (isset($queues['index'])) { - $gmworker->addFunction( - $GLOBALS['phinde']['queuePrefix'] . 'phinde_index', - function(\GearmanJob $job) { - $data = unserialize($job->workload()); - echo "-- Indexing " . $data['url'] . "\n"; - passthru('./index.php ' . escapeshellarg($data['url'])); - //exit(); - } - ); -} +$gmworker->addFunction( + $GLOBALS['phinde']['queuePrefix'] . 'phinde_process', + function(\GearmanJob $job) { + $data = unserialize($job->workload()); + echo "-- Processing " . $data['url'] + . ' (' . implode(',', $data['actions']) . ')' + . "\n"; + passthru( + './process.php ' . escapeshellarg($data['url']) + . ' ' . implode(' ', $data['actions']) + ); + } +); while ($gmworker->work()) { if ($gmworker->returnCode() != GEARMAN_SUCCESS) { diff --git a/bin/process.php b/bin/process.php new file mode 100755 index 0000000..ababb03 --- /dev/null +++ b/bin/process.php @@ -0,0 +1,95 @@ +#!/usr/bin/env php +description = 'phinde URL processor'; +$cc->version = '0.0.1'; +$cc->addOption( + 'force', + array( + 'short_name' => '-f', + 'long_name' => '--force', + 'description' => 'Always process URL, even when it did not change', + 'action' => 'StoreTrue', + 'default' => false + ) +); +$cc->addOption( + 'showLinksOnly', + array( + 'short_name' => '-s', + 'long_name' => '--show-links', + 'description' => 'Only show which URLs were found', + 'action' => 'StoreTrue', + 'default' => false + ) +); +$cc->addArgument( + 'url', + array( + 'description' => 'URL to process', + 'multiple' => false + ) +); +$cc->addArgument( + 'actions', + array( + 'description' => 'Actions to take', + 'multiple' => true, + 'optional' => true, + 'choices' => array('index', 'crawl'), + 'default' => array('index', 'crawl'), + ) +); +try { + $res = $cc->parse(); +} catch (\Exception $e) { + $cc->displayError($e->getMessage()); +} + +$url = $res->args['url']; +$url = Helper::addSchema($url); +$urlObj = new \Net_URL2($url); +$url = $urlObj->getNormalizedURL(); +if (!Helper::isUrlAllowed($url)) { + echo "Domain is not allowed; not crawling\n"; + exit(2); +} + +try { + $actions = array(); + foreach ($res->args['actions'] as $action) { + if ($action == 'crawl') { + $crawler = new Crawler(); + $crawler->setShowLinksOnly($res->options['showLinksOnly']); + $actions[$action] = $crawler; + } else if ($action == 'index') { + $actions[$action] = new Indexer(); + } + } + + $fetcher = new Fetcher(); + $retrieved = $fetcher->fetch($url, $actions, $res->options['force']); + if ($retrieved === false) { + exit(0); + } + + $update = false; + foreach ($actions as $key => $action) { + echo "step: $key\n"; + $update |= $action->run($retrieved); + } + + if ($update) { + //FIXME: update index if it exists already + $fetcher->storeDoc($retrieved->url, $retrieved->esDoc); + } else { + echo "Not updating\n"; + } +} catch (\Exception $e) { + echo $e->getMessage() . "\n"; + exit(10); +} +?> \ No newline at end of file diff --git a/data/elasticsearch-mapping.json b/data/elasticsearch-mapping.json index 617c69f..aad27d2 100644 --- a/data/elasticsearch-mapping.json +++ b/data/elasticsearch-mapping.json @@ -3,8 +3,27 @@ "document": { "properties": { "status": { - "type": "string", - "index": "not_analyzed" + "type": "object", + "properties": { + "code": { + "type": "integer" + }, + "location": { + "type": "string" + }, + "processed": { + "type": "date" + }, + "crdate": { + "type": "date" + }, + "modate": { + "type": "date" + }, + "findable": { + "type": "boolean" + } + } }, "type": { "type": "string", @@ -76,12 +95,6 @@ "tags": { "type": "string", "boost": 1.5 - }, - "crdate": { - "type": "date" - }, - "modate": { - "type": "date" } } } diff --git a/data/templates/opensearch.htm b/data/templates/opensearch.htm index 1ba805a..6565516 100644 --- a/data/templates/opensearch.htm +++ b/data/templates/opensearch.htm @@ -27,8 +27,8 @@ {{doc.title}} {{doc.url}} - {% if doc.modate %} - {{doc.modate|date('c')}} + {% if doc.status.modate %} + {{doc.status.modate|date('c')}} {% endif %} {{doc.htmlText|striptags}} diff --git a/data/templates/search/hit-chat.htm b/data/templates/search/hit-chat.htm index 6739670..501b843 100644 --- a/data/templates/search/hit-chat.htm +++ b/data/templates/search/hit-chat.htm @@ -15,8 +15,8 @@ at {% endif %} - {% if doc.modate %} - {{doc.modate|date("Y-m-d H:i")}} + {% if doc.status.modate %} + {{doc.status.modate|date("Y-m-d H:i")}} {% endif %} diff --git a/data/templates/search/hit.htm b/data/templates/search/hit.htm index dababcb..f61a271 100644 --- a/data/templates/search/hit.htm +++ b/data/templates/search/hit.htm @@ -16,7 +16,7 @@
{{ellipsis(doc.extra.cleanUrl, 60)}}
- {% if doc.modate %} + {% if doc.status.modate %} {{doc.extra.day|date("Y-m-d")}} {% endif %} {{doc.htmlText|raw}} diff --git a/docs/elasticsearch/doc-html.json b/docs/elasticsearch/doc-html.json index 9f93b8e..5045572 100644 --- a/docs/elasticsearch/doc-html.json +++ b/docs/elasticsearch/doc-html.json @@ -1,5 +1,11 @@ { - "status": "indexed", + "status": { + "processed": '2016-10-06T05:13:42+02:00', + "code": 200, + "location": "", + "crdate": "2016-01-30T12:23:42+01:00", + "modate": "2016-01-30T12:23:42+01:00" + }, "type": "html", "subtype": "rsvp", "mimetype": "application/xhtml+xml", @@ -38,6 +44,4 @@ "Another paragraph in the text" ], "tags": ["example", "test", "documentation"], - "crdate": "2016-01-30T12:23:42+01:00", - "modate": "2016-01-30T12:23:42+01:00", } diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index 1cf9bdc..38e3c3f 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -11,7 +11,7 @@ class Crawler */ protected $showLinksOnly = false; - static $supportedIndexTypes = array( + static $supportedTypes = array( 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom', 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html', 'text/html' => '\\phinde\\LinkExtractor\\Html', @@ -23,59 +23,28 @@ class Crawler $this->queue = new Queue(); } - public function crawl($url) + public function run(Retrieved $retrieved) { - $res = $this->fetch($url); - if ($res === false) { - return; - } - - $linkInfos = $this->extractLinks($res); + $linkInfos = $this->extractLinks($retrieved->httpRes); $linkInfos = $this->filterLinks($linkInfos); if ($this->showLinksOnly) { $this->showLinks($linkInfos); + return false; } else { $this->enqueue($linkInfos); + return true; } } - protected function fetch($url) - { - $existingDoc = $this->es->get($url); - - $req = new HttpRequest($url); - $req->setHeader( - 'accept', - implode(',', array_keys(static::$supportedIndexTypes)) - ); - if ($existingDoc && isset($existingDoc->modate)) { - $nMoDate = strtotime($existingDoc->modate); - $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate)); - } - - $res = $req->send(); - if ($res->getStatus() === 304) { - //not modified since last time, so don't crawl again - $this->log('Not modified since last fetch'); - return false; - } else if ($res->getStatus() !== 200) { - throw new \Exception( - "Response code is not 200 but " - . $res->getStatus() . ", stopping" - ); - } - return $res; - } - protected function extractLinks(\HTTP_Request2_Response $res) { $mimetype = explode(';', $res->getHeader('content-type'))[0]; - if (!isset(static::$supportedIndexTypes[$mimetype])) { + if (!isset(static::$supportedTypes[$mimetype])) { echo "MIME type not supported for indexing: $mimetype\n"; return array(); } - $class = static::$supportedIndexTypes[$mimetype]; + $class = static::$supportedTypes[$mimetype]; $extractor = new $class(); return $extractor->extract($res); } @@ -112,15 +81,17 @@ class Crawler } if ($linkInfo->crawl || $linkInfo->index) { $this->es->markQueued($linkInfo->url); - } - if ($linkInfo->index) { - $this->queue->addToIndex( - $linkInfo->url, $linkInfo->title, $linkInfo->source + $actions = array(); + if ($linkInfo->index) { + $actions[] = 'index'; + } + if ($linkInfo->crawl) { + $actions[] = 'crawl'; + } + $this->queue->addToProcessList( + $linkInfo->url, $actions ); } - if ($linkInfo->crawl) { - $this->queue->addToCrawl($linkInfo->url); - } } } @@ -142,10 +113,5 @@ class Crawler { $this->showLinksOnly = $showLinksOnly; } - - protected function log($msg) - { - echo $msg . "\n"; - } } ?> diff --git a/src/phinde/Elasticsearch.php b/src/phinde/Elasticsearch.php index 2887beb..9babfee 100644 --- a/src/phinde/Elasticsearch.php +++ b/src/phinde/Elasticsearch.php @@ -10,10 +10,15 @@ class Elasticsearch $this->baseUrl = $baseUrl; } + public static function getDocId($url) + { + return hash('sha256', $url); + } + public function isKnown($url) { $r = new Elasticsearch_Request( - $this->baseUrl . 'document/' . rawurlencode($url), + $this->baseUrl . 'document/' . static::getDocId($url), \HTTP_Request2::METHOD_HEAD ); $r->allow404 = true; @@ -24,7 +29,7 @@ class Elasticsearch public function get($url) { $r = new Elasticsearch_Request( - $this->baseUrl . 'document/' . rawurlencode($url), + $this->baseUrl . 'document/' . static::getDocId($url), \HTTP_Request2::METHOD_GET ); $r->allow404 = true; @@ -39,12 +44,15 @@ class Elasticsearch public function markQueued($url) { $r = new Elasticsearch_Request( - $this->baseUrl . 'document/' . rawurlencode($url), + $this->baseUrl . 'document/' . static::getDocId($url), \HTTP_Request2::METHOD_PUT ); - $doc = array( - 'status' => 'queued', - 'url' => $url + $doc = (object) array( + 'url' => $url, + 'status' => (object) array( + 'processed' => null, + 'findable' => false, + ) ); $r->setBody(json_encode($doc)); $r->send(); @@ -109,12 +117,12 @@ class Elasticsearch ); $qMust[] = array( 'term' => array( - 'status' => 'indexed' + 'status.findable' => true ) ); if ($sort == 'date') { - $sortCfg = array('modate' => array('order' => 'desc')); + $sortCfg = array('status.modate' => array('order' => 'desc')); } else { $sortCfg = array(); } @@ -133,7 +141,7 @@ class Elasticsearch 'url', 'title', 'author', - 'modate', + 'status.modate', ), 'query' => array( 'bool' => array( diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php new file mode 100644 index 0000000..b5644af --- /dev/null +++ b/src/phinde/Fetcher.php @@ -0,0 +1,93 @@ +es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); + } + + /** + * @return Retrieved HTTP response and elasticsearch document + */ + public function fetch($url, $actions, $force = false) + { + $esDoc = $this->es->get($url); + if (isset($esDoc->status->location) + && $esDoc->status->location != '' + ) { + //TODO: what if location redirects change? + $url = $esDoc->status->location; + $esDoc = $this->es->get($url); + } + + $types = array(); + foreach ($actions as $action) { + $types = array_merge($action::$supportedTypes); + } + $types = array_unique($types); + + $req = new HttpRequest($url); + $req->setHeader('accept', implode(',', $types)); + if (!$force && $esDoc + && isset($esDoc->status->processed) + && $esDoc->status->processed != '' + ) { + $nCrawlTime = strtotime($esDoc->status->processed); + $req->setHeader('If-Modified-Since: ' . gmdate('r', $nCrawlTime)); + } + + $res = $req->send(); + if ($res->getStatus() === 304) { + //not modified since last time, so don't crawl again + echo "Not modified since last fetch\n"; + return false; + } else if ($res->getStatus() !== 200) { + throw new \Exception( + "Response code is not 200 but " + . $res->getStatus() . ", stopping" + ); + } + + $effUrl = $res->getEffectiveUrl(); + if ($effUrl != $url) { + $this->storeRedirect($url, $effUrl); + $url = $effUrl; + $esDoc = $this->es->get($url); + } + //FIXME: etag, hash on content + + $retrieved = new Retrieved(); + $retrieved->httpRes = $res; + $retrieved->esDoc = $esDoc; + $retrieved->url = $url; + return $retrieved; + } + + protected function storeRedirect($url, $target) + { + $esDoc = new \stdClass(); + $esDoc->status = (object) array( + 'location' => $target + ); + $esDoc->url = $url; + $this->storeDoc($url, $esDoc); + } + + public function storeDoc($url, $esDoc) + { + echo "Store $url\n"; + $esDoc->status->processed = gmdate('c'); + $r = new Elasticsearch_Request( + $GLOBALS['phinde']['elasticsearch'] . 'document/' + . ElasticSearch::getDocId($url), + \HTTP_Request2::METHOD_PUT + ); + $r->setBody(json_encode($esDoc)); + $r->send(); + } +} +?> diff --git a/src/phinde/Indexer.php b/src/phinde/Indexer.php new file mode 100644 index 0000000..98b52c3 --- /dev/null +++ b/src/phinde/Indexer.php @@ -0,0 +1,205 @@ +httpRes; + $esDoc = $retrieved->esDoc; + $url = $retrieved->url; + + $mimetype = explode(';', $res->getHeader('content-type'))[0]; + if (!in_array($mimetype, static::$supportedTypes)) { + echo "MIME type not supported for indexing: $mimetype\n"; + return false; + } + + if ($esDoc === null) { + $esDoc = new \stdClass(); + } + if (!isset($esDoc->status)) { + $esDoc->status = new \stdClass(); + } + + //FIXME: update index only if changed since last index time + //FIXME: extract base url from html + //FIXME: check if effective url needs updating + + $base = new \Net_URL2($url); + + //FIXME: MIME type switch + $doc = new \DOMDocument(); + //@ to hide parse warning messages in invalid html + @$doc->loadHTML($res->getBody()); + $dx = new \DOMXPath($doc); + + $xbase = $dx->evaluate('/html/head/base[@href]')->item(0); + if ($xbase) { + $base = $base->resolve( + $xbase->attributes->getNamedItem('href')->textContent + ); + } + + $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]') + ->item(0); + if ($meta) { + $robots = $meta->attributes->getNamedItem('content')->textContent; + foreach (explode(',', $robots) as $value) { + if (trim($value) == 'noindex') { + echo "URL does not want to be indexed: $url\n"; + exit(0); + } + } + } + + //remove script tags + $this->removeTags($doc, 'script'); + $this->removeTags($doc, 'style'); + $this->removeTags($doc, 'nav'); + + //default content: + $xpContext = $doc->getElementsByTagName('body')->item(0); + //FIXME: follow meta refresh, no body + // example: https://www.gnu.org/software/coreutils/ + + //use microformats content if it exists + $xpElems = $dx->query( + "//*[contains(concat(' ', normalize-space(@class), ' '), ' e-content ')]" + ); + if ($xpElems->length) { + $xpContext = $xpElems->item(0); + } else if ($doc->getElementById('content')) { + //if there is an element with ID "content", we'll use this + $xpContext = $doc->getElementById('content'); + } + + $esDoc->url = $url; + $esDoc->schemalessUrl = Helper::noSchema($url); + $esDoc->type = 'html'; + $esDoc->subtype = ''; + $esDoc->mimetype = $mimetype; + $esDoc->domain = parse_url($url, PHP_URL_HOST); + + //$esDoc->source = 'FIXME'; + //$esDoc->sourcetitle = 'FIXME'; + + $esDoc->author = new \stdClass(); + + $arXpElems = $dx->query('/html/head/meta[@name="author" and @content]'); + if ($arXpElems->length) { + $esDoc->author->name = trim( + $arXpElems->item(0)->attributes->getNamedItem('content')->textContent + ); + } + $arXpElems = $dx->query('/html/head/link[@rel="author" and @href]'); + if ($arXpElems->length) { + $esDoc->author->url = trim( + $base->resolve( + $arXpElems->item(0)->attributes->getNamedItem('href')->textContent + ) + ); + } + + + $arXpElems = $dx->query('/html/head/title'); + if ($arXpElems->length) { + $esDoc->title = trim( + $arXpElems->item(0)->textContent + ); + } + + foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) { + $esDoc->$headlinetype = array(); + foreach ($xpContext->getElementsByTagName($headlinetype) as $xheadline) { + array_push( + $esDoc->$headlinetype, + trim($xheadline->textContent) + ); + } + } + + //FIXME: split paragraphs + //FIXME: insert space after br + $esDoc->text = array(); + $esDoc->text[] = trim( + str_replace( + array("\r\n", "\n", "\r", ' '), + ' ', + $xpContext->textContent + ) + ); + + //tags + $tags = array(); + foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) { + $keywords = $xkeywords->attributes->getNamedItem('content')->textContent; + foreach (explode(',', $keywords) as $keyword) { + $tags[trim($keyword)] = true; + } + } + $esDoc->tags = array_keys($tags); + + //dates + $arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]'); + if ($arXpdates->length) { + $esDoc->status->crdate = gmdate( + 'c', + strtotime( + $arXpdates->item(0)->attributes->getNamedItem('content')->textContent + ) + ); + } + //FIXME: keep creation date from database, or use modified date if we + // do not have it there + + $arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]'); + if ($arXpdates->length) { + $esDoc->status->modate = gmdate( + 'c', + strtotime( + $arXpdates->item(0)->attributes->getNamedItem('content')->textContent + ) + ); + } else { + $lm = $res->getHeader('last-modified'); + if ($lm !== null) { + $esDoc->status->modate = gmdate('c', strtotime($lm)); + } else { + //use current time since we don't have any other data + $esDoc->status->modate = gmdate('c'); + } + } + $esDoc->status->findable = true; + + //language + //there may be "en-US" and "de-DE" + $xlang = $doc->documentElement->attributes->getNamedItem('lang'); + if ($xlang) { + $esDoc->language = strtolower(substr($xlang->textContent, 0, 2)); + } + //FIXME: fallback, autodetection + //FIXME: check noindex + + //var_dump($esDoc);die(); + + $retrieved->esDoc = $esDoc; + return true; + } + + function removeTags($doc, $tag) { + $elems = array(); + foreach ($doc->getElementsbyTagName($tag) as $elem) { + $elems[] = $elem; + } + foreach ($elems as $elem) { + $elem->parentNode->removeChild($elem); + } + } +} +?> diff --git a/src/phinde/Queue.php b/src/phinde/Queue.php index 406f27e..6c30faa 100644 --- a/src/phinde/Queue.php +++ b/src/phinde/Queue.php @@ -11,40 +11,22 @@ class Queue $this->gmclient->addServer('127.0.0.1'); } - public function addToIndex($linkUrl, $linkTitle, $sourceUrl) + public function addToProcessList($linkUrl, $actions) { - echo "Queuing for indexing: $linkUrl\n"; + echo "Queuing for processing: $linkUrl" + . ' (' . implode(',', $actions) . ')' + . "\n"; $this->gmclient->doBackground( - $GLOBALS['phinde']['queuePrefix'] . 'phinde_index', + $GLOBALS['phinde']['queuePrefix'] . 'phinde_process', serialize( array( - 'url' => $linkUrl, - 'title' => $linkTitle, - 'source' => $sourceUrl + 'url' => $linkUrl, + 'actions' => $actions, ) ) ); if ($this->gmclient->returnCode() != GEARMAN_SUCCESS) { - echo 'Error queueing URL indexing for ' - . $linkUrl . "\n" - . 'Error code: ' . $this->gmclient->returnCode() . "\n"; - exit(2); - } - } - - public function addToCrawl($linkUrl) - { - echo "Queuing for crawling: $linkUrl\n"; - $this->gmclient->doBackground( - $GLOBALS['phinde']['queuePrefix'] . 'phinde_crawl', - serialize( - array( - 'url' => $linkUrl - ) - ) - ); - if ($this->gmclient->returnCode() != GEARMAN_SUCCESS) { - echo 'Error queueing URL crawling for ' + echo 'Error queueing URL processing for ' . $linkUrl . "\n" . 'Error code: ' . $this->gmclient->returnCode() . "\n"; exit(2); diff --git a/src/phinde/Retrieved.php b/src/phinde/Retrieved.php new file mode 100644 index 0000000..5812b71 --- /dev/null +++ b/src/phinde/Retrieved.php @@ -0,0 +1,26 @@ + diff --git a/www/index.php b/www/index.php index 5261156..8bf8147 100644 --- a/www/index.php +++ b/www/index.php @@ -115,8 +115,8 @@ foreach ($res->hits->hits as &$hit) { $doc->extra = new \stdClass(); $doc->extra->cleanUrl = preg_replace('#^.*://#', '', $doc->url); - if (isset($doc->modate)) { - $doc->extra->day = substr($doc->modate, 0, 10); + if (isset($doc->status->modate)) { + $doc->extra->day = substr($doc->status->modate, 0, 10); } }