From cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8 Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Wed, 10 Feb 2016 14:56:20 +0100 Subject: [PATCH] rework crawler; add atom link extraction --- bin/crawl.php | 127 ++---------------------------- src/phinde/Crawler.php | 70 ++++++++++++++++ src/phinde/Helper.php | 10 +++ src/phinde/HttpRequest.php | 16 ++++ src/phinde/LinkExtractor/Atom.php | 35 ++++++++ src/phinde/LinkExtractor/Html.php | 67 ++++++++++++++++ src/phinde/LinkInfo.php | 17 ++++ src/phinde/Queue.php | 54 +++++++++++++ 8 files changed, 276 insertions(+), 120 deletions(-) create mode 100644 src/phinde/Crawler.php create mode 100644 src/phinde/HttpRequest.php create mode 100644 src/phinde/LinkExtractor/Atom.php create mode 100644 src/phinde/LinkExtractor/Html.php create mode 100644 src/phinde/LinkInfo.php create mode 100644 src/phinde/Queue.php diff --git a/bin/crawl.php b/bin/crawl.php index e39a622..e9a6218 100755 --- a/bin/crawl.php +++ b/bin/crawl.php @@ -3,136 +3,23 @@ namespace phinde; require_once __DIR__ . '/../src/init.php'; -$supportedCrawlTypes = array( - 'text/html', 'application/xhtml+xml' -); - - if ($argc < 2) { echo "No URL given\n"; exit(1); } -$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); - $url = $argv[1]; +$url = Helper::addSchema($url); if (!Helper::isUrlAllowed($url)) { echo "Domain is not allowed; not crawling\n"; exit(2); } - -$req = new \HTTP_Request2($url); -//FIXME: send supported mime types in header -$res = $req->send(); -if ($res->getStatus() !== 200) { - echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n"; - exit(3); -} -$mimetype = explode(';', $res->getHeader('content-type'))[0]; -if (!in_array($mimetype, $supportedCrawlTypes)) { - echo "MIME type not supported for crawling: $mimetype\n"; - exit(4); -} - -//FIXME: mime type switch for cdata -$doc = new \DOMDocument(); -//@ to hide parse warning messages in invalid html -@$doc->loadHTMLFile($url); - -//FIXME: extract base url from html -$base = new \Net_URL2($url); - -$xpath = new \DOMXPath($doc); -$links = $xpath->evaluate('//a'); -//FIXME: link rel, img, video - -$alreadySeen = array(); - -foreach ($links as $link) { - $linkTitle = $link->textContent; - $href = ''; - foreach ($link->attributes as $attribute) { - if ($attribute->name == 'href') { - $href = $attribute->textContent; - } - } - if ($href == '' || $href{0} == '#') { - //link on this page - continue; - } - - $linkUrlObj = $base->resolve($href); - $linkUrlObj->setFragment(false); - $linkUrl = (string) $linkUrlObj; - if (isset($alreadySeen[$linkUrl])) { - continue; - } - - switch ($linkUrlObj->getScheme()) { - case 'http': - case 'https': - break; - default: - continue 2; - } - - if ($es->isKnown($linkUrl)) { - continue; - } - - //FIXME: check target type - //FIXME: check nofollow - //var_dump($linkTitle, $linkUrl); - $es->markQueued($linkUrl); - addToIndex($linkUrl, $linkTitle, $url); - if (Helper::isUrlAllowed($linkUrl)) { - addToCrawl($linkUrl); - } - $alreadySeen[$linkUrl] = true; -} - -function addToIndex($linkUrl, $linkTitle, $sourceUrl) -{ - echo "Queuing for indexing: $linkUrl\n"; - $gmclient = new \GearmanClient(); - $gmclient->addServer('127.0.0.1'); - $gmclient->doBackground( - 'phinde_index', - serialize( - array( - 'url' => $linkUrl, - 'title' => $linkTitle, - 'source' => $sourceUrl - ) - ) - ); - if ($gmclient->returnCode() != GEARMAN_SUCCESS) { - echo 'Error queueing URL indexing for ' - . $linkUrl . "\n" - . 'Error code: ' . $gmclient->returnCode() . "\n"; - exit(2); - } -} - -function addToCrawl($linkUrl) -{ - echo "Queuing for crawling: $linkUrl\n"; - $gmclient = new \GearmanClient(); - $gmclient->addServer('127.0.0.1'); - $gmclient->doBackground( - 'phinde_crawl', - serialize( - array( - 'url' => $linkUrl - ) - ) - ); - if ($gmclient->returnCode() != GEARMAN_SUCCESS) { - echo 'Error queueing URL crawling for ' - . $linkUrl . "\n" - . 'Error code: ' . $gmclient->returnCode() . "\n"; - exit(2); - } +try { + $crawler = new Crawler(); + $crawler->crawl($url); +} catch (\Exception $e) { + echo $e->getMessage() . "\n"; + exit(10); } ?> \ No newline at end of file diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php new file mode 100644 index 0000000..53320ec --- /dev/null +++ b/src/phinde/Crawler.php @@ -0,0 +1,70 @@ + '\\phinde\\LinkExtractor\\Atom', + 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html', + 'text/html' => '\\phinde\\LinkExtractor\\Html', + ); + + public function __construct() + { + $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']); + $this->queue = new Queue(); + } + + public function crawl($url) + { + $res = $this->fetch($url); + $linkInfos = $this->extractLinks($res); + $this->enqueue($linkInfos); + } + + protected function fetch($url) + { + $req = new HttpRequest($url); + $res = $req->send(); + if ($res->getStatus() !== 200) { + throw new \Exception( + "Response code is not 200 but " + . $res->getStatus() . ", stopping" + ); + } + return $res; + } + + protected function extractLinks(\HTTP_Request2_Response $res) + { + $mimetype = explode(';', $res->getHeader('content-type'))[0]; + if (!isset(static::$supportedIndexTypes[$mimetype])) { + echo "MIME type not supported for indexing: $mimetype\n"; + return array(); + } + + $class = static::$supportedIndexTypes[$mimetype]; + $extractor = new $class(); + return $extractor->extract($res); + } + + protected function enqueue($linkInfos) + { + foreach ($linkInfos as $linkInfo) { + if ($this->es->isKnown($linkInfo->url)) { + continue; + } + $this->es->markQueued($linkInfo->url); + $this->queue->addToIndex( + $linkInfo->url, $linkInfo->title, $linkInfo->source + ); + if (Helper::isUrlAllowed($linkInfo->url)) { + $this->queue->addToCrawl($linkInfo->url); + } + } + } +} +?> diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php index 0b98521..40ea751 100644 --- a/src/phinde/Helper.php +++ b/src/phinde/Helper.php @@ -20,5 +20,15 @@ class Helper $url ); } + + public static function addSchema($url) + { + if (substr($url, 0, 7) == 'http://' + || substr($url, 0, 8) == 'https://' + ) { + return $url; + } + return 'http://' . $url; + } } ?> diff --git a/src/phinde/HttpRequest.php b/src/phinde/HttpRequest.php new file mode 100644 index 0000000..e68bd84 --- /dev/null +++ b/src/phinde/HttpRequest.php @@ -0,0 +1,16 @@ +setConfig('follow_redirects', true); + $this->setConfig('connect_timeout', 5); + $this->setConfig('timeout', 10); + $this->setConfig('ssl_verify_peer', false); + $this->setHeader('user-agent', 'phinde/bot'); + } +} +?> diff --git a/src/phinde/LinkExtractor/Atom.php b/src/phinde/LinkExtractor/Atom.php new file mode 100644 index 0000000..bb4d90b --- /dev/null +++ b/src/phinde/LinkExtractor/Atom.php @@ -0,0 +1,35 @@ +getEffectiveUrl(); + $base = new \Net_URL2($url); + + $sx = simplexml_load_string($res->getBody()); + $linkInfos = array(); + $alreadySeen = array(); + + foreach ($sx->entry as $entry) { + $linkTitle = (string) $entry->title; + foreach ($entry->link as $xlink) { + $linkUrl = (string) $base->resolve((string) $xlink['href']); + if (isset($alreadySeen[$linkUrl])) { + continue; + } + + if ($xlink['rel'] == 'alternate') { + $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url); + } + $alreadySeen[$linkUrl] = true; + } + } + + return $linkInfos; + } +} +?> diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php new file mode 100644 index 0000000..538d6c4 --- /dev/null +++ b/src/phinde/LinkExtractor/Html.php @@ -0,0 +1,67 @@ +getEffectiveUrl(); + + $linkInfos = array(); + + //FIXME: mime type switch for cdata + $doc = new \DOMDocument(); + //@ to hide parse warning messages in invalid html + @$doc->loadHTML($res->getBody()); + + //FIXME: extract base url from html + $base = new \Net_URL2($url); + + $xpath = new \DOMXPath($doc); + $links = $xpath->evaluate('//a'); + //FIXME: link rel, img, video + + $alreadySeen = array(); + + foreach ($links as $link) { + $linkTitle = $link->textContent; + $href = ''; + foreach ($link->attributes as $attribute) { + if ($attribute->name == 'href') { + $href = $attribute->textContent; + } + } + if ($href == '' || $href{0} == '#') { + //link on this page + continue; + } + + $linkUrlObj = $base->resolve($href); + $linkUrlObj->setFragment(false); + $linkUrl = (string) $linkUrlObj; + if (isset($alreadySeen[$linkUrl])) { + continue; + } + + switch ($linkUrlObj->getScheme()) { + case 'http': + case 'https': + break; + default: + continue 2; + } + + //FIXME: check target type + //FIXME: check nofollow + $linkInfos[] = new LinkInfo( + $linkUrl, $linkTitle, $url + ); + $alreadySeen[$linkUrl] = true; + } + + return $linkInfos; + } +} +?> diff --git a/src/phinde/LinkInfo.php b/src/phinde/LinkInfo.php new file mode 100644 index 0000000..4e3980c --- /dev/null +++ b/src/phinde/LinkInfo.php @@ -0,0 +1,17 @@ +url = $url; + $this->title = $title; + $this->source = $source; + } +} +?> diff --git a/src/phinde/Queue.php b/src/phinde/Queue.php new file mode 100644 index 0000000..98f6462 --- /dev/null +++ b/src/phinde/Queue.php @@ -0,0 +1,54 @@ +gmclient = new \GearmanClient(); + $this->gmclient->addServer('127.0.0.1'); + } + + public function addToIndex($linkUrl, $linkTitle, $sourceUrl) + { + echo "Queuing for indexing: $linkUrl\n"; + $this->gmclient->doBackground( + 'phinde_index', + serialize( + array( + 'url' => $linkUrl, + 'title' => $linkTitle, + 'source' => $sourceUrl + ) + ) + ); + if ($this->gmclient->returnCode() != GEARMAN_SUCCESS) { + echo 'Error queueing URL indexing for ' + . $linkUrl . "\n" + . 'Error code: ' . $this->gmclient->returnCode() . "\n"; + exit(2); + } + } + + public function addToCrawl($linkUrl) + { + echo "Queuing for crawling: $linkUrl\n"; + $this->gmclient->doBackground( + 'phinde_crawl', + serialize( + array( + 'url' => $linkUrl + ) + ) + ); + if ($this->gmclient->returnCode() != GEARMAN_SUCCESS) { + echo 'Error queueing URL crawling for ' + . $linkUrl . "\n" + . 'Error code: ' . $this->gmclient->returnCode() . "\n"; + exit(2); + } + } +} +?> -- 2.30.2