diff options
| author | Christian Weiske <cweiske@cweiske.de> | 2016-11-07 21:41:36 +0100 |
|---|---|---|
| committer | Christian Weiske <cweiske@cweiske.de> | 2016-11-07 21:41:36 +0100 |
| commit | d7651fd96dcfa2829519504e4c8ec1ce511cd57f (patch) | |
| tree | e24d7a9f90060b0fee5a652de43bd0627f1c5bde /src/phinde | |
| parent | f90790c6b2a54c9b1c8a0aeaf1f23e6aa67d7aca (diff) | |
| download | phinde-d7651fd96dcfa2829519504e4c8ec1ce511cd57f.tar.gz phinde-d7651fd96dcfa2829519504e4c8ec1ce511cd57f.zip | |
Big patch merging crawling+indexing into one command, new json document structure
Diffstat (limited to 'src/phinde')
| -rw-r--r-- | src/phinde/Crawler.php | 66 | ||||
| -rw-r--r-- | src/phinde/Elasticsearch.php | 26 | ||||
| -rw-r--r-- | src/phinde/Fetcher.php | 93 | ||||
| -rw-r--r-- | src/phinde/Indexer.php | 205 | ||||
| -rw-r--r-- | src/phinde/Queue.php | 34 | ||||
| -rw-r--r-- | src/phinde/Retrieved.php | 26 |
6 files changed, 365 insertions, 85 deletions
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index 1cf9bdc..38e3c3f 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -11,7 +11,7 @@ class Crawler */ protected $showLinksOnly = false; - static $supportedIndexTypes = array( + static $supportedTypes = array( 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom', 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html', 'text/html' => '\\phinde\\LinkExtractor\\Html', @@ -23,59 +23,28 @@ class Crawler $this->queue = new Queue(); } - public function crawl($url) + public function run(Retrieved $retrieved) { - $res = $this->fetch($url); - if ($res === false) { - return; - } - - $linkInfos = $this->extractLinks($res); + $linkInfos = $this->extractLinks($retrieved->httpRes); $linkInfos = $this->filterLinks($linkInfos); if ($this->showLinksOnly) { $this->showLinks($linkInfos); + return false; } else { $this->enqueue($linkInfos); + return true; } } - protected function fetch($url) - { - $existingDoc = $this->es->get($url); - - $req = new HttpRequest($url); - $req->setHeader( - 'accept', - implode(',', array_keys(static::$supportedIndexTypes)) - ); - if ($existingDoc && isset($existingDoc->modate)) { - $nMoDate = strtotime($existingDoc->modate); - $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate)); - } - - $res = $req->send(); - if ($res->getStatus() === 304) { - //not modified since last time, so don't crawl again - $this->log('Not modified since last fetch'); - return false; - } else if ($res->getStatus() !== 200) { - throw new \Exception( - "Response code is not 200 but " - . $res->getStatus() . ", stopping" - ); - } - return $res; - } - protected function extractLinks(\HTTP_Request2_Response $res) { $mimetype = explode(';', $res->getHeader('content-type'))[0]; - if (!isset(static::$supportedIndexTypes[$mimetype])) { + if (!isset(static::$supportedTypes[$mimetype])) { echo "MIME type not supported for indexing: $mimetype\n"; return array(); } - $class = static::$supportedIndexTypes[$mimetype]; + $class = static::$supportedTypes[$mimetype]; $extractor = new $class(); return $extractor->extract($res); } @@ -112,15 +81,17 @@ class Crawler } if ($linkInfo->crawl || $linkInfo->index) { $this->es->markQueued($linkInfo->url); - } - if ($linkInfo->index) { - $this->queue->addToIndex( - $linkInfo->url, $linkInfo->title, $linkInfo->source + $actions = array(); + if ($linkInfo->index) { + $actions[] = 'index'; + } + if ($linkInfo->crawl) { + $actions[] = 'crawl'; + } + $this->queue->addToProcessList( + $linkInfo->url, $actions ); } - if ($linkInfo->crawl) { - $this->queue->addToCrawl($linkInfo->url); - } } } @@ -142,10 +113,5 @@ class Crawler { $this->showLinksOnly = $showLinksOnly; } - - protected function log($msg) - { - echo $msg . "\n"; - } } ?> diff --git a/src/phinde/Elasticsearch.php b/src/phinde/Elasticsearch.php index 2887beb..9babfee 100644 --- a/src/phinde/Elasticsearch.php +++ b/src/phinde/Elasticsearch.php @@ -10,10 +10,15 @@ class Elasticsearch $this->baseUrl = $baseUrl; } + public static function getDocId($url) + { + return hash('sha256', $url); + } + public function isKnown($url) { $r = new Elasticsearch_Request( - $this->baseUrl . 'document/' . rawurlencode($url), + $this->baseUrl . 'document/' . static::getDocId($url), \HTTP_Request2::METHOD_HEAD ); $r->allow404 = true; @@ -24,7 +29,7 @@ class Elasticsearch public function get($url) { $r = new Elasticsearch_Request( - $this->baseUrl . 'document/' . rawurlencode($url), + $this->baseUrl . 'document/' . static::getDocId($url), \HTTP_Request2::METHOD_GET ); $r->allow404 = true; @@ -39,12 +44,15 @@ class Elasticsearch public function markQueued($url) { $r = new Elasticsearch_Request( - $this->baseUrl . 'document/' . rawurlencode($url), + $this->baseUrl . 'document/' . static::getDocId($url), \HTTP_Request2::METHOD_PUT ); - $doc = array( - 'status' => 'queued', - 'url' => $url + $doc = (object) array( + 'url' => $url, + 'status' => (object) array( + 'processed' => null, + 'findable' => false, + ) ); $r->setBody(json_encode($doc)); $r->send(); @@ -109,12 +117,12 @@ class Elasticsearch ); $qMust[] = array( 'term' => array( - 'status' => 'indexed' + 'status.findable' => true ) ); if ($sort == 'date') { - $sortCfg = array('modate' => array('order' => 'desc')); + $sortCfg = array('status.modate' => array('order' => 'desc')); } else { $sortCfg = array(); } @@ -133,7 +141,7 @@ class Elasticsearch 'url', 'title', 'author', - 'modate', + 'status.modate', ), 'query' => array( 'bool' => array( diff --git a/src/phinde/Fetcher.php b/src/phinde/Fetcher.php new file mode 100644 index 0000000..b5644af --- /dev/null +++ b/src/phinde/Fetcher.php @@ -0,0 +1,93 @@ +<?php +namespace phinde; + +class Fetcher +{ + protected $es; + + public function __construct() + { + $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); + } + + /** + * @return Retrieved HTTP response and elasticsearch document + */ + public function fetch($url, $actions, $force = false) + { + $esDoc = $this->es->get($url); + if (isset($esDoc->status->location) + && $esDoc->status->location != '' + ) { + //TODO: what if location redirects change? + $url = $esDoc->status->location; + $esDoc = $this->es->get($url); + } + + $types = array(); + foreach ($actions as $action) { + $types = array_merge($action::$supportedTypes); + } + $types = array_unique($types); + + $req = new HttpRequest($url); + $req->setHeader('accept', implode(',', $types)); + if (!$force && $esDoc + && isset($esDoc->status->processed) + && $esDoc->status->processed != '' + ) { + $nCrawlTime = strtotime($esDoc->status->processed); + $req->setHeader('If-Modified-Since: ' . gmdate('r', $nCrawlTime)); + } + + $res = $req->send(); + if ($res->getStatus() === 304) { + //not modified since last time, so don't crawl again + echo "Not modified since last fetch\n"; + return false; + } else if ($res->getStatus() !== 200) { + throw new \Exception( + "Response code is not 200 but " + . $res->getStatus() . ", stopping" + ); + } + + $effUrl = $res->getEffectiveUrl(); + if ($effUrl != $url) { + $this->storeRedirect($url, $effUrl); + $url = $effUrl; + $esDoc = $this->es->get($url); + } + //FIXME: etag, hash on content + + $retrieved = new Retrieved(); + $retrieved->httpRes = $res; + $retrieved->esDoc = $esDoc; + $retrieved->url = $url; + return $retrieved; + } + + protected function storeRedirect($url, $target) + { + $esDoc = new \stdClass(); + $esDoc->status = (object) array( + 'location' => $target + ); + $esDoc->url = $url; + $this->storeDoc($url, $esDoc); + } + + public function storeDoc($url, $esDoc) + { + echo "Store $url\n"; + $esDoc->status->processed = gmdate('c'); + $r = new Elasticsearch_Request( + $GLOBALS['phinde']['elasticsearch'] . 'document/' + . ElasticSearch::getDocId($url), + \HTTP_Request2::METHOD_PUT + ); + $r->setBody(json_encode($esDoc)); + $r->send(); + } +} +?> diff --git a/src/phinde/Indexer.php b/src/phinde/Indexer.php new file mode 100644 index 0000000..98b52c3 --- /dev/null +++ b/src/phinde/Indexer.php @@ -0,0 +1,205 @@ +<?php +namespace phinde; + +class Indexer +{ + static $supportedTypes = array( + 'application/xhtml+xml', + 'text/html', + ); + + public function run(Retrieved $retrieved) + { + $res = $retrieved->httpRes; + $esDoc = $retrieved->esDoc; + $url = $retrieved->url; + + $mimetype = explode(';', $res->getHeader('content-type'))[0]; + if (!in_array($mimetype, static::$supportedTypes)) { + echo "MIME type not supported for indexing: $mimetype\n"; + return false; + } + + if ($esDoc === null) { + $esDoc = new \stdClass(); + } + if (!isset($esDoc->status)) { + $esDoc->status = new \stdClass(); + } + + //FIXME: update index only if changed since last index time + //FIXME: extract base url from html + //FIXME: check if effective url needs updating + + $base = new \Net_URL2($url); + + //FIXME: MIME type switch + $doc = new \DOMDocument(); + //@ to hide parse warning messages in invalid html + @$doc->loadHTML($res->getBody()); + $dx = new \DOMXPath($doc); + + $xbase = $dx->evaluate('/html/head/base[@href]')->item(0); + if ($xbase) { + $base = $base->resolve( + $xbase->attributes->getNamedItem('href')->textContent + ); + } + + $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]') + ->item(0); + if ($meta) { + $robots = $meta->attributes->getNamedItem('content')->textContent; + foreach (explode(',', $robots) as $value) { + if (trim($value) == 'noindex') { + echo "URL does not want to be indexed: $url\n"; + exit(0); + } + } + } + + //remove script tags + $this->removeTags($doc, 'script'); + $this->removeTags($doc, 'style'); + $this->removeTags($doc, 'nav'); + + //default content: <body> + $xpContext = $doc->getElementsByTagName('body')->item(0); + //FIXME: follow meta refresh, no body + // example: https://www.gnu.org/software/coreutils/ + + //use microformats content if it exists + $xpElems = $dx->query( + "//*[contains(concat(' ', normalize-space(@class), ' '), ' e-content ')]" + ); + if ($xpElems->length) { + $xpContext = $xpElems->item(0); + } else if ($doc->getElementById('content')) { + //if there is an element with ID "content", we'll use this + $xpContext = $doc->getElementById('content'); + } + + $esDoc->url = $url; + $esDoc->schemalessUrl = Helper::noSchema($url); + $esDoc->type = 'html'; + $esDoc->subtype = ''; + $esDoc->mimetype = $mimetype; + $esDoc->domain = parse_url($url, PHP_URL_HOST); + + //$esDoc->source = 'FIXME'; + //$esDoc->sourcetitle = 'FIXME'; + + $esDoc->author = new \stdClass(); + + $arXpElems = $dx->query('/html/head/meta[@name="author" and @content]'); + if ($arXpElems->length) { + $esDoc->author->name = trim( + $arXpElems->item(0)->attributes->getNamedItem('content')->textContent + ); + } + $arXpElems = $dx->query('/html/head/link[@rel="author" and @href]'); + if ($arXpElems->length) { + $esDoc->author->url = trim( + $base->resolve( + $arXpElems->item(0)->attributes->getNamedItem('href')->textContent + ) + ); + } + + + $arXpElems = $dx->query('/html/head/title'); + if ($arXpElems->length) { + $esDoc->title = trim( + $arXpElems->item(0)->textContent + ); + } + + foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) { + $esDoc->$headlinetype = array(); + foreach ($xpContext->getElementsByTagName($headlinetype) as $xheadline) { + array_push( + $esDoc->$headlinetype, + trim($xheadline->textContent) + ); + } + } + + //FIXME: split paragraphs + //FIXME: insert space after br + $esDoc->text = array(); + $esDoc->text[] = trim( + str_replace( + array("\r\n", "\n", "\r", ' '), + ' ', + $xpContext->textContent + ) + ); + + //tags + $tags = array(); + foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) { + $keywords = $xkeywords->attributes->getNamedItem('content')->textContent; + foreach (explode(',', $keywords) as $keyword) { + $tags[trim($keyword)] = true; + } + } + $esDoc->tags = array_keys($tags); + + //dates + $arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]'); + if ($arXpdates->length) { + $esDoc->status->crdate = gmdate( + 'c', + strtotime( + $arXpdates->item(0)->attributes->getNamedItem('content')->textContent + ) + ); + } + //FIXME: keep creation date from database, or use modified date if we + // do not have it there + + $arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]'); + if ($arXpdates->length) { + $esDoc->status->modate = gmdate( + 'c', + strtotime( + $arXpdates->item(0)->attributes->getNamedItem('content')->textContent + ) + ); + } else { + $lm = $res->getHeader('last-modified'); + if ($lm !== null) { + $esDoc->status->modate = gmdate('c', strtotime($lm)); + } else { + //use current time since we don't have any other data + $esDoc->status->modate = gmdate('c'); + } + } + $esDoc->status->findable = true; + + //language + //there may be "en-US" and "de-DE" + $xlang = $doc->documentElement->attributes->getNamedItem('lang'); + if ($xlang) { + $esDoc->language = strtolower(substr($xlang->textContent, 0, 2)); + } + //FIXME: fallback, autodetection + //FIXME: check noindex + + //var_dump($esDoc);die(); + + $retrieved->esDoc = $esDoc; + return true; + } + + function removeTags($doc, $tag) { + $elems = array(); + foreach ($doc->getElementsbyTagName($tag) as $elem) { + $elems[] = $elem; + } + foreach ($elems as $elem) { + $elem->parentNode->removeChild($elem); + } + } +} +?> diff --git a/src/phinde/Queue.php b/src/phinde/Queue.php index 406f27e..6c30faa 100644 --- a/src/phinde/Queue.php +++ b/src/phinde/Queue.php @@ -11,40 +11,22 @@ class Queue $this->gmclient->addServer('127.0.0.1'); } - public function addToIndex($linkUrl, $linkTitle, $sourceUrl) + public function addToProcessList($linkUrl, $actions) { - echo "Queuing for indexing: $linkUrl\n"; + echo "Queuing for processing: $linkUrl" + . ' (' . implode(',', $actions) . ')' + . "\n"; $this->gmclient->doBackground( - $GLOBALS['phinde']['queuePrefix'] . 'phinde_index', + $GLOBALS['phinde']['queuePrefix'] . 'phinde_process', serialize( array( - 'url' => $linkUrl, - 'title' => $linkTitle, - 'source' => $sourceUrl + 'url' => $linkUrl, + 'actions' => $actions, ) ) ); if ($this->gmclient->returnCode() != GEARMAN_SUCCESS) { - echo 'Error queueing URL indexing for ' - . $linkUrl . "\n" - . 'Error code: ' . $this->gmclient->returnCode() . "\n"; - exit(2); - } - } - - public function addToCrawl($linkUrl) - { - echo "Queuing for crawling: $linkUrl\n"; - $this->gmclient->doBackground( - $GLOBALS['phinde']['queuePrefix'] . 'phinde_crawl', - serialize( - array( - 'url' => $linkUrl - ) - ) - ); - if ($this->gmclient->returnCode() != GEARMAN_SUCCESS) { - echo 'Error queueing URL crawling for ' + echo 'Error queueing URL processing for ' . $linkUrl . "\n" . 'Error code: ' . $this->gmclient->returnCode() . "\n"; exit(2); diff --git a/src/phinde/Retrieved.php b/src/phinde/Retrieved.php new file mode 100644 index 0000000..5812b71 --- /dev/null +++ b/src/phinde/Retrieved.php @@ -0,0 +1,26 @@ +<?php +namespace phinde; + +/** + * Information retrieved by Fetcher + */ +class Retrieved +{ + /** + * @var \HTTP_Request2_Response + */ + public $httpRes; + + /** + * Existing elasticsearch document + * + * @var object + */ + public $esDoc; + + /** + * URL of document + */ + public $url; +} +?> |
