From: Christian Weiske Date: Mon, 1 Feb 2016 19:18:59 +0000 (+0100) Subject: first kinda working version X-Git-Tag: v0.1.0~29 X-Git-Url: https://git.cweiske.de/phinde.git/commitdiff_plain/7b4425b096fa8c18d0db9fd9b1ae96d63ee8af55 first kinda working version --- 7b4425b096fa8c18d0db9fd9b1ae96d63ee8af55 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d041e45 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/data/config.php diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..121893c --- /dev/null +++ b/README.rst @@ -0,0 +1,6 @@ +Dependencies +============ +- PHP 5.5+ +- elasticsearch 2.0 +- gearman +- Net_URL2 diff --git a/bin/crawl.php b/bin/crawl.php new file mode 100755 index 0000000..26cf994 --- /dev/null +++ b/bin/crawl.php @@ -0,0 +1,145 @@ +#!/usr/bin/env php +send(); +if ($res->getStatus() !== 200) { + echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; + exit(3); +} +$mimetype = explode(';', $res->getHeader('content-type'))[0]; +if (!in_array($mimetype, $supportedCrawlTypes)) { + echo "MIME type not supported for crawling: $mimetype\n"; + exit(4); +} + +//FIXME: mime type switch for cdata +$doc = new \DOMDocument(); +//@ to hide parse warning messages in invalid html +@$doc->loadHTMLFile($url); + +//FIXME: extract base url from html +$base = new \Net_URL2($url); + +$xpath = new \DOMXPath($doc); +$links = $xpath->evaluate('//a'); +//FIXME: link rel, img, video + +$alreadySeen = array(); + +foreach ($links as $link) { + $linkTitle = $link->textContent; + $href = ''; + foreach ($link->attributes as $attribute) { + if ($attribute->name == 'href') { + $href = $attribute->textContent; + } + } + if ($href == '' || $href{0} == '#') { + //link on this page + continue; + } + + $linkUrlObj = $base->resolve($href); + $linkUrlObj->setFragment(false); + $linkUrl = (string) $linkUrlObj; + if (isset($alreadySeen[$linkUrl])) { + continue; + } + + switch ($linkUrlObj->getScheme()) { + case 'http': + case 'https': + break; + default: + continue 2; + } + + if ($es->isKnown($linkUrl)) { + continue; + } + + //FIXME: check target type + //FIXME: check nofollow + //var_dump($linkTitle, $linkUrl); + $es->markQueued($linkUrl); + addToIndex($linkUrl, $linkTitle, $url); + if (isUrlAllowed($linkUrl)) { + addToCrawl($linkUrl); + } + $alreadySeen[$linkUrl] = true; +} + +function addToIndex($linkUrl, $linkTitle, $sourceUrl) +{ + echo "Queuing for indexing: $linkUrl\n"; + $gmclient = new \GearmanClient(); + $gmclient->addServer('127.0.0.1'); + $gmclient->doBackground( + 'phinde_index', + serialize( + array( + 'url' => $linkUrl, + 'title' => $linkTitle, + 'source' => $sourceUrl + ) + ) + ); + if ($gmclient->returnCode() != GEARMAN_SUCCESS) { + echo 'Error queueing URL indexing for ' + . $linkUrl . "\n" + . 'Error code: ' . $gmclient->returnCode() . "\n"; + exit(2); + } +} + +function addToCrawl($linkUrl) +{ + echo "Queuing for crawling: $linkUrl\n"; + $gmclient = new \GearmanClient(); + $gmclient->addServer('127.0.0.1'); + $gmclient->doBackground( + 'phinde_crawl', + serialize( + array( + 'url' => $linkUrl + ) + ) + ); + if ($gmclient->returnCode() != GEARMAN_SUCCESS) { + echo 'Error queueing URL crawling for ' + . $linkUrl . "\n" + . 'Error code: ' . $gmclient->returnCode() . "\n"; + exit(2); + } +} +?> \ No newline at end of file diff --git a/bin/index.php b/bin/index.php new file mode 100755 index 0000000..6a13afd --- /dev/null +++ b/bin/index.php @@ -0,0 +1,164 @@ +#!/usr/bin/env php +get($url); +if ($existingDoc && $existingDoc->status == 'indexed') { + echo "URL already indexed: $url\n"; + exit(0); +} +//FIXME: sourcetitle, sourcelink + +//FIXME: enable redirects +//FIXME: enable ssl +$req = new \HTTP_Request2($url); +$req->setConfig('connect_timeout', 5); +$req->setConfig('timeout', 10); +$res = $req->send(); +//FIXME: try-catch + +//FIXME: delete if 401 gone or 404 when updating +if ($res->getStatus() !== 200) { + echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; + //FIXME: update status + exit(3); +} + +$mimetype = explode(';', $res->getHeader('content-type'))[0]; +if (!in_array($mimetype, $supportedIndexTypes)) { + echo "MIME type not supported for indexing: $mimetype\n"; + //FIXME: update status + exit(4); +} + + +//FIXME: update index only if changed since last index time +//FIXME: extract base url from html +//FIXME: use final URL after redirects +$base = new \Net_URL2($url); + +$indexDoc = new \stdClass(); + +//FIXME: MIME type switch +$doc = new \DOMDocument(); +//@ to hide parse warning messages in invalid html +@$doc->loadHTML($res->getBody()); +$sx = simplexml_import_dom($doc); + +$indexDoc->url = $url; +$indexDoc->type = 'html'; +$indexDoc->subtype = ''; +$indexDoc->mimetype = $mimetype; +$indexDoc->domain = parse_url($url, PHP_URL_HOST); + +//$indexDoc->source = 'FIXME'; +//$indexDoc->sourcetitle = 'FIXME'; + +$indexDoc->author = new \stdClass(); + +$arSxElems = $sx->xpath('/html/head/meta[@name="author"]'); +if (count($arSxElems)) { + $indexDoc->author->name = trim($arSxElems[0]['content']); +} +$arSxElems = $sx->xpath('/html/head/link[@rel="author"]'); +if (count($arSxElems)) { + $indexDoc->author->url = (string) $base->resolve($arSxElems[0]['href']); +} + +$indexDoc->title = (string) $sx->head->title; +foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) { + $indexDoc->$headlinetype = array(); + //FIXME: limit to h-entry children + foreach ($sx->xpath('//' . $headlinetype) as $xheadline) { + array_push( + $indexDoc->$headlinetype, + trim(dom_import_simplexml($xheadline)->textContent) + ); + } +} + +//FIXME: limit to h-entry e-content +//FIXME: insert space after br +//FIXME: remove javascript +$indexDoc->text = array(); +foreach ($doc->getElementsByTagName('body') as $body) { + $indexDoc->text[] = trim( + str_replace( + array("\r\n", "\n", "\r", ' '), + ' ', + $body->textContent + ) + ); +} + +//tags +$tags = array(); +foreach ($sx->xpath('/html/head/meta[@name="keywords"]') as $xkeywords) { + foreach (explode(',', $xkeywords['content']) as $keyword) { + $tags[trim($keyword)] = true; + } +} +$indexDoc->tags = array_keys($tags); + +//dates +$arSxdates = $sx->xpath('/html/head/meta[@name="DC.date.created"]'); +if (count($arSxdates)) { + $indexDoc->crdate = date('c', strtotime((string) $arSxdates[0]['content'])); +} +//FIXME: keep creation date from database, or use modified date if we +// do not have it there + +$arSxdates = $sx->xpath('/html/head/meta[@name="DC.date.modified"]'); +if (count($arSxdates)) { + $indexDoc->modate = date('c', strtotime((string) $arSxdates[0]['content'])); +} else { + $lm = $res->getHeader('last-modified'); + if ($lm !== null) { + $indexDoc->modate = date('c', strtotime($lm)); + } else { + //use current time since we don't have any other data + $indexDoc->modate = date('c'); + } +} + +//language +//there may be "en-US" and "de-DE" +$indexDoc->language = substr((string) $sx['lang'], 0, 2); +//FIXME: fallback, autodetection +//FIXME: check noindex + + +//var_dump($indexDoc); + +$indexDoc->status = 'indexed'; + +//FIXME: update index if it exists already +$r = new Elasticsearch_Request( + $GLOBALS['phinde']['elasticsearch'] . 'document/' . rawurlencode($url), + \HTTP_Request2::METHOD_PUT +); +$r->setBody(json_encode($indexDoc)); +$r->send(); + + +?> diff --git a/bin/phinde-worker.php b/bin/phinde-worker.php new file mode 100755 index 0000000..e8253ff --- /dev/null +++ b/bin/phinde-worker.php @@ -0,0 +1,34 @@ +#!/usr/bin/env php +addServer('127.0.0.1'); + +$gmworker->addFunction( + 'phinde_crawl', + function(\GearmanJob $job) { + $data = unserialize($job->workload()); + echo "-- Crawling " . $data['url'] . "\n"; + passthru('./crawl.php ' . escapeshellarg($data['url'])); + } +); +$gmworker->addFunction( + 'phinde_index', + function(\GearmanJob $job) { + $data = unserialize($job->workload()); + echo "-- Indexing " . $data['url'] . "\n"; + passthru('./index.php ' . escapeshellarg($data['url'])); + //exit(); + } +); + +while ($gmworker->work()) { + if ($gmworker->returnCode() != GEARMAN_SUCCESS) { + echo 'Error running job: ' . $gmworker->returnCode() . "\n"; + break; + } +} +?> diff --git a/bin/setup.php b/bin/setup.php new file mode 100755 index 0000000..7dacedd --- /dev/null +++ b/bin/setup.php @@ -0,0 +1,27 @@ +#!/usr/bin/env php +allow404 = true; +$r->send(); + +//recreate it +$r = new Elasticsearch_Request( + $GLOBALS['phinde']['elasticsearch'], + \HTTP_Request2::METHOD_PUT +); +$r->setBody( + file_get_contents(__DIR__ . '/../data/elasticsearch-mapping.json') +); +$r->send(); +?> diff --git a/data/elasticsearch-mapping.json b/data/elasticsearch-mapping.json new file mode 100644 index 0000000..ec9bcc9 --- /dev/null +++ b/data/elasticsearch-mapping.json @@ -0,0 +1,40 @@ +{ + "mappings": { + "document": { + "properties": { + "status": { + "type": "string", + "index": "not_analyzed" + }, + "type": { + "type": "string", + "index": "not_analyzed" + }, + "subtype": { + "type": "string", + "index": "not_analyzed" + }, + "mimetype": { + "type": "string", + "index": "not_analyzed" + }, + "url": { + "type": "string", + "index": "not_analyzed" + }, + "domain": { + "type": "string", + "index": "not_analyzed" + }, + "source": { + "type": "string", + "index": "not_analyzed" + }, + "language": { + "type": "string", + "index": "not_analyzed" + } + } + } + } +} diff --git a/docs/elasticsearch/doc-html.json b/docs/elasticsearch/doc-html.json new file mode 100644 index 0000000..43d4206 --- /dev/null +++ b/docs/elasticsearch/doc-html.json @@ -0,0 +1,41 @@ +{ + 'type': 'html', + 'subtype': 'rsvp', + 'mimetype': 'application/xhtml+xml', + 'url': 'http://example.org/foo.htm', + 'domain': 'example.org', + 'source': [ + 'http://example.org/linkfarm.htm' + ], + 'sourcetitle': [ + 'Click here', + 'Something about bar', + ], + 'language': 'en', + 'author': { + 'name': 'Alice Example', + 'url': 'http://example.org/alice.htm', + }, + 'title': 'Sample HTML page', + 'h1': [ + 'Headline 1', + 'Another headline 1' + ], + 'h2': [ + 'Subheadline' + ], + 'h3': [ + 'Subsub', + 'Another Subsub' + ], + 'h4': [], + 'h5': [], + 'h6': [], + 'text': [ + 'HTML converted to plain text', + 'Another paragraph in the text' + ], + 'tags': ['example', 'test', 'documentation'], + 'crdate': '2016-01-30T12:23:42+01:00', + 'modate': '2016-01-30T12:23:42+01:00', +} diff --git a/docs/elasticsearch/doc-image.json b/docs/elasticsearch/doc-image.json new file mode 100644 index 0000000..59e2231 --- /dev/null +++ b/docs/elasticsearch/doc-image.json @@ -0,0 +1,14 @@ +{ + 'type': 'image', + 'mimetype': 'image/png', + 'url': 'http://example.org/image.png', + 'domain': 'example.org', + 'source': 'http://example.org/foo.htm', + 'sourcetitle': 'Alt text from linking HTML page', + 'text': [ + 'EXIF-description text' + ], + 'tags': [], + 'crdate': '2016-01-30T12:23:42+01:00', + 'modate': '2016-01-30T12:23:42+01:00', +} diff --git a/src/Elasticsearch.php b/src/Elasticsearch.php new file mode 100644 index 0000000..b3f3067 --- /dev/null +++ b/src/Elasticsearch.php @@ -0,0 +1,138 @@ +baseUrl = $baseUrl; + } + + /** + * @link https://www.elastic.co/guide/en/elasticsearch/guide/current/_finding_exact_values.html + */ + public function isKnown($url) + { + $r = new Elasticsearch_Request( + $this->baseUrl . 'document/_search/exists', + \HTTP_Request2::METHOD_GET + ); + $r->allow404 = true; + $r->setBody( + json_encode( + array( + 'query' => array( + 'filtered' => array( + 'filter' => array( + 'term' => array( + 'url' => $url + ) + ) + ) + ) + ) + ) + ); + $res = json_decode($r->send()->getBody()); + return $res->exists; + } + + public function get($url) + { + $r = new Elasticsearch_Request( + $this->baseUrl . 'document/' . rawurlencode($url), + \HTTP_Request2::METHOD_GET + ); + $r->allow404 = true; + $res = $r->send(); + if ($res->getStatus() != 200) { + return null; + } + $d = json_decode($res->getBody()); + return $d->_source; + } + + public function markQueued($url) + { + $r = new Elasticsearch_Request( + $this->baseUrl . 'document/' . rawurlencode($url), + \HTTP_Request2::METHOD_PUT + ); + $doc = array( + 'status' => 'queued', + 'url' => $url + ); + $r->setBody(json_encode($doc)); + $r->send(); + } + + public function search($query, $page, $perPage) + { + $r = new Elasticsearch_Request( + $this->baseUrl . 'document/_search', + \HTTP_Request2::METHOD_GET + ); + $doc = array( + '_source' => array( + 'url', + 'title', + 'author', + 'modate', + ), + 'query' => array( + 'bool' => array( + 'must' => array( + array( + 'query_string' => array( + 'default_field' => '_all', + 'query' => $query + ) + ), + array( + 'term' => array( + 'status' => 'indexed' + ) + ) + ) + ) + ), + 'aggregations' => array( + 'tags' => array( + 'terms' => array( + 'field' => 'tags' + ) + ), + 'language' => array( + 'terms' => array( + 'field' => 'language' + ) + ), + 'domain' => array( + 'terms' => array( + 'field' => 'domain' + ) + ), + 'type' => array( + 'terms' => array( + 'field' => 'type' + ) + ) + ), + 'from' => $page * $perPage, + 'size' => $perPage, + 'sort' => array( + //array('modate' => array('order' => 'desc')) + ) + ); + //unset($doc['_source']); + + //ini_set('xdebug.var_display_max_depth', 10); + //return json_decode(json_encode($doc)); + $r->setBody(json_encode($doc)); + $res = $r->send(); + return json_decode($res->getBody()); + } +} +?> diff --git a/src/Elasticsearch/Request.php b/src/Elasticsearch/Request.php new file mode 100644 index 0000000..7bb6add --- /dev/null +++ b/src/Elasticsearch/Request.php @@ -0,0 +1,35 @@ +getStatus() / 100); + if ($mainCode === 2) { + return $res; + } + + if ($this->allow404 && $res->getStatus() == 404) { + return $res; + } + $js = json_decode($res->getBody()); + if (isset($js->error)) { + $error = json_encode($js->error); + } else { + $error = $res->getBody(); + } + + throw new \Exception( + 'Error in elasticsearch communication at ' + . $this->getMethod() . ' ' . (string) $this->getUrl() + . ' (status code ' . $res->getStatus() . '): ' + . $error + ); + } +} + +?> diff --git a/src/Html/Pager.php b/src/Html/Pager.php new file mode 100644 index 0000000..a14a53d --- /dev/null +++ b/src/Html/Pager.php @@ -0,0 +1,66 @@ +pager = \Pager::factory( + array( + 'mode' => 'Sliding', + 'perPage' => $perPage, + 'delta' => 2, + 'totalItems' => $itemCount, + 'currentPage' => $currentPage, + 'urlVar' => 'page', + 'append' => $append, + 'path' => '', + 'fileName' => $filename, + 'separator' => '###', + 'spacesBeforeSeparator' => 0, + 'spacesAfterSeparator' => 0, + 'curPageSpanPre' => '', + 'curPageSpanPost' => '', + 'firstPagePre' => '', + 'firstPageText' => 'first', + 'firstPagePost' => '', + 'lastPagePre' => '', + 'lastPageText' => 'last', + 'lastPagePost' => '', + 'prevImg' => '« prev', + 'nextImg' => 'next »', + ) + ); + } + + + public function getLinks() + { + $arLinks = $this->pager->getLinks(); + $arLinks['pages'] = explode('###', $arLinks['pages']); + return $arLinks; + } + + public function numPages() + { + return $this->pager->numPages(); + } +} + +?> diff --git a/src/functions.php b/src/functions.php new file mode 100644 index 0000000..fd91360 --- /dev/null +++ b/src/functions.php @@ -0,0 +1,11 @@ + diff --git a/www/index.php b/www/index.php new file mode 100644 index 0000000..96d6f7a --- /dev/null +++ b/www/index.php @@ -0,0 +1,65 @@ +search($query, $page, $perPage); + +$pager = new Html_Pager( + $res->hits->total, $perPage, $page + 1, + '?q=' . $query +); + +foreach ($res->hits->hits as $hit) { + $doc = $hit->_source; + if ($doc->title == '') { + $doc->title = '(no title)'; + } + echo '

' + . '' + . htmlspecialchars($doc->title) + . ''; + if (isset($doc->author->name)) { + echo ' by ' + . htmlspecialchars($doc->author->name) + . ''; + } + echo '
' + . htmlspecialchars(preg_replace('#^.*://#', '', $doc->url)) + . ''; + if (isset($doc->modate)) { + echo '
Changed: ' . substr($doc->modate, 0, 10); + } + echo '

'; +} + +$links = $pager->getLinks(); +echo $links['back'] + . ' ' . implode(' ', $links['pages']) + . ' ' . $links['next']; +//var_dump($links); +var_dump($res->aggregations->domain); +?>