From 7b4425b096fa8c18d0db9fd9b1ae96d63ee8af55 Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Mon, 1 Feb 2016 20:18:59 +0100 Subject: first kinda working version --- bin/crawl.php | 145 ++++++++++++++++++++++++++++++++++++++++++++ bin/index.php | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++ bin/phinde-worker.php | 34 +++++++++++ bin/setup.php | 27 +++++++++ 4 files changed, 370 insertions(+) create mode 100755 bin/crawl.php create mode 100755 bin/index.php create mode 100755 bin/phinde-worker.php create mode 100755 bin/setup.php (limited to 'bin') diff --git a/bin/crawl.php b/bin/crawl.php new file mode 100755 index 0000000..26cf994 --- /dev/null +++ b/bin/crawl.php @@ -0,0 +1,145 @@ +#!/usr/bin/env php +send(); +if ($res->getStatus() !== 200) { + echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; + exit(3); +} +$mimetype = explode(';', $res->getHeader('content-type'))[0]; +if (!in_array($mimetype, $supportedCrawlTypes)) { + echo "MIME type not supported for crawling: $mimetype\n"; + exit(4); +} + +//FIXME: mime type switch for cdata +$doc = new \DOMDocument(); +//@ to hide parse warning messages in invalid html +@$doc->loadHTMLFile($url); + +//FIXME: extract base url from html +$base = new \Net_URL2($url); + +$xpath = new \DOMXPath($doc); +$links = $xpath->evaluate('//a'); +//FIXME: link rel, img, video + +$alreadySeen = array(); + +foreach ($links as $link) { + $linkTitle = $link->textContent; + $href = ''; + foreach ($link->attributes as $attribute) { + if ($attribute->name == 'href') { + $href = $attribute->textContent; + } + } + if ($href == '' || $href{0} == '#') { + //link on this page + continue; + } + + $linkUrlObj = $base->resolve($href); + $linkUrlObj->setFragment(false); + $linkUrl = (string) $linkUrlObj; + if (isset($alreadySeen[$linkUrl])) { + continue; + } + + switch ($linkUrlObj->getScheme()) { + case 'http': + case 'https': + break; + default: + continue 2; + } + + if ($es->isKnown($linkUrl)) { + continue; + } + + //FIXME: check target type + //FIXME: check nofollow + //var_dump($linkTitle, $linkUrl); + $es->markQueued($linkUrl); + addToIndex($linkUrl, $linkTitle, $url); + if (isUrlAllowed($linkUrl)) { + addToCrawl($linkUrl); + } + $alreadySeen[$linkUrl] = true; +} + +function addToIndex($linkUrl, $linkTitle, $sourceUrl) +{ + echo "Queuing for indexing: $linkUrl\n"; + $gmclient = new \GearmanClient(); + $gmclient->addServer('127.0.0.1'); + $gmclient->doBackground( + 'phinde_index', + serialize( + array( + 'url' => $linkUrl, + 'title' => $linkTitle, + 'source' => $sourceUrl + ) + ) + ); + if ($gmclient->returnCode() != GEARMAN_SUCCESS) { + echo 'Error queueing URL indexing for ' + . $linkUrl . "\n" + . 'Error code: ' . $gmclient->returnCode() . "\n"; + exit(2); + } +} + +function addToCrawl($linkUrl) +{ + echo "Queuing for crawling: $linkUrl\n"; + $gmclient = new \GearmanClient(); + $gmclient->addServer('127.0.0.1'); + $gmclient->doBackground( + 'phinde_crawl', + serialize( + array( + 'url' => $linkUrl + ) + ) + ); + if ($gmclient->returnCode() != GEARMAN_SUCCESS) { + echo 'Error queueing URL crawling for ' + . $linkUrl . "\n" + . 'Error code: ' . $gmclient->returnCode() . "\n"; + exit(2); + } +} +?> \ No newline at end of file diff --git a/bin/index.php b/bin/index.php new file mode 100755 index 0000000..6a13afd --- /dev/null +++ b/bin/index.php @@ -0,0 +1,164 @@ +#!/usr/bin/env php +get($url); +if ($existingDoc && $existingDoc->status == 'indexed') { + echo "URL already indexed: $url\n"; + exit(0); +} +//FIXME: sourcetitle, sourcelink + +//FIXME: enable redirects +//FIXME: enable ssl +$req = new \HTTP_Request2($url); +$req->setConfig('connect_timeout', 5); +$req->setConfig('timeout', 10); +$res = $req->send(); +//FIXME: try-catch + +//FIXME: delete if 401 gone or 404 when updating +if ($res->getStatus() !== 200) { + echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; + //FIXME: update status + exit(3); +} + +$mimetype = explode(';', $res->getHeader('content-type'))[0]; +if (!in_array($mimetype, $supportedIndexTypes)) { + echo "MIME type not supported for indexing: $mimetype\n"; + //FIXME: update status + exit(4); +} + + +//FIXME: update index only if changed since last index time +//FIXME: extract base url from html +//FIXME: use final URL after redirects +$base = new \Net_URL2($url); + +$indexDoc = new \stdClass(); + +//FIXME: MIME type switch +$doc = new \DOMDocument(); +//@ to hide parse warning messages in invalid html +@$doc->loadHTML($res->getBody()); +$sx = simplexml_import_dom($doc); + +$indexDoc->url = $url; +$indexDoc->type = 'html'; +$indexDoc->subtype = ''; +$indexDoc->mimetype = $mimetype; +$indexDoc->domain = parse_url($url, PHP_URL_HOST); + +//$indexDoc->source = 'FIXME'; +//$indexDoc->sourcetitle = 'FIXME'; + +$indexDoc->author = new \stdClass(); + +$arSxElems = $sx->xpath('/html/head/meta[@name="author"]'); +if (count($arSxElems)) { + $indexDoc->author->name = trim($arSxElems[0]['content']); +} +$arSxElems = $sx->xpath('/html/head/link[@rel="author"]'); +if (count($arSxElems)) { + $indexDoc->author->url = (string) $base->resolve($arSxElems[0]['href']); +} + +$indexDoc->title = (string) $sx->head->title; +foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) { + $indexDoc->$headlinetype = array(); + //FIXME: limit to h-entry children + foreach ($sx->xpath('//' . $headlinetype) as $xheadline) { + array_push( + $indexDoc->$headlinetype, + trim(dom_import_simplexml($xheadline)->textContent) + ); + } +} + +//FIXME: limit to h-entry e-content +//FIXME: insert space after br +//FIXME: remove javascript +$indexDoc->text = array(); +foreach ($doc->getElementsByTagName('body') as $body) { + $indexDoc->text[] = trim( + str_replace( + array("\r\n", "\n", "\r", ' '), + ' ', + $body->textContent + ) + ); +} + +//tags +$tags = array(); +foreach ($sx->xpath('/html/head/meta[@name="keywords"]') as $xkeywords) { + foreach (explode(',', $xkeywords['content']) as $keyword) { + $tags[trim($keyword)] = true; + } +} +$indexDoc->tags = array_keys($tags); + +//dates +$arSxdates = $sx->xpath('/html/head/meta[@name="DC.date.created"]'); +if (count($arSxdates)) { + $indexDoc->crdate = date('c', strtotime((string) $arSxdates[0]['content'])); +} +//FIXME: keep creation date from database, or use modified date if we +// do not have it there + +$arSxdates = $sx->xpath('/html/head/meta[@name="DC.date.modified"]'); +if (count($arSxdates)) { + $indexDoc->modate = date('c', strtotime((string) $arSxdates[0]['content'])); +} else { + $lm = $res->getHeader('last-modified'); + if ($lm !== null) { + $indexDoc->modate = date('c', strtotime($lm)); + } else { + //use current time since we don't have any other data + $indexDoc->modate = date('c'); + } +} + +//language +//there may be "en-US" and "de-DE" +$indexDoc->language = substr((string) $sx['lang'], 0, 2); +//FIXME: fallback, autodetection +//FIXME: check noindex + + +//var_dump($indexDoc); + +$indexDoc->status = 'indexed'; + +//FIXME: update index if it exists already +$r = new Elasticsearch_Request( + $GLOBALS['phinde']['elasticsearch'] . 'document/' . rawurlencode($url), + \HTTP_Request2::METHOD_PUT +); +$r->setBody(json_encode($indexDoc)); +$r->send(); + + +?> diff --git a/bin/phinde-worker.php b/bin/phinde-worker.php new file mode 100755 index 0000000..e8253ff --- /dev/null +++ b/bin/phinde-worker.php @@ -0,0 +1,34 @@ +#!/usr/bin/env php +addServer('127.0.0.1'); + +$gmworker->addFunction( + 'phinde_crawl', + function(\GearmanJob $job) { + $data = unserialize($job->workload()); + echo "-- Crawling " . $data['url'] . "\n"; + passthru('./crawl.php ' . escapeshellarg($data['url'])); + } +); +$gmworker->addFunction( + 'phinde_index', + function(\GearmanJob $job) { + $data = unserialize($job->workload()); + echo "-- Indexing " . $data['url'] . "\n"; + passthru('./index.php ' . escapeshellarg($data['url'])); + //exit(); + } +); + +while ($gmworker->work()) { + if ($gmworker->returnCode() != GEARMAN_SUCCESS) { + echo 'Error running job: ' . $gmworker->returnCode() . "\n"; + break; + } +} +?> diff --git a/bin/setup.php b/bin/setup.php new file mode 100755 index 0000000..7dacedd --- /dev/null +++ b/bin/setup.php @@ -0,0 +1,27 @@ +#!/usr/bin/env php +allow404 = true; +$r->send(); + +//recreate it +$r = new Elasticsearch_Request( + $GLOBALS['phinde']['elasticsearch'], + \HTTP_Request2::METHOD_PUT +); +$r->setBody( + file_get_contents(__DIR__ . '/../data/elasticsearch-mapping.json') +); +$r->send(); +?> -- cgit v1.2.3