about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/phinde/Crawler.php 70
-rw-r--r-- src/phinde/Helper.php 10
-rw-r--r-- src/phinde/HttpRequest.php 16
-rw-r--r-- src/phinde/LinkExtractor/Atom.php 35
-rw-r--r-- src/phinde/LinkExtractor/Html.php 67
-rw-r--r-- src/phinde/LinkInfo.php 17
-rw-r--r-- src/phinde/Queue.php 54
7 files changed, 269 insertions, 0 deletions
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
new file mode 100644
index 0000000..53320ec
--- /dev/null
+++ b/src/phinde/Crawler.php
@@ -0,0 +1,70 @@
+<?php
+namespace phinde;
+
+/**
+ * Fetches a single URL, extracts the links found in the response and
+ * hands every link that is not yet known over to the job queue.
+ */
+class Crawler
+{
+    /**
+     * Client used to ask whether a URL is already known and to mark
+     * URLs as queued.
+     *
+     * @var Elasticsearch
+     */
+    protected $es;
+
+    /**
+     * Queue that receives the index and crawl jobs.
+     *
+     * @var Queue
+     */
+    protected $queue;
+
+    /**
+     * Maps a lower-case MIME type to the link extractor class that
+     * is able to handle it.
+     *
+     * @var array
+     */
+    public static $supportedIndexTypes = array(
+        'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
+        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
+        'text/html'             => '\\phinde\\LinkExtractor\\Html',
+    );
+
+    /**
+     * Creates the search index client from the global phinde
+     * configuration and a fresh job queue.
+     */
+    public function __construct()
+    {
+        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+        $this->queue = new Queue();
+    }
+
+    /**
+     * Fetch the URL, extract its links and queue the unknown ones.
+     *
+     * @param string $url URL to crawl
+     *
+     * @return void
+     *
+     * @throws \Exception When the response status is not 200
+     */
+    public function crawl($url)
+    {
+        $res = $this->fetch($url);
+        $linkInfos = $this->extractLinks($res);
+        $this->enqueue($linkInfos);
+    }
+
+    /**
+     * Send a request for the URL and return its response.
+     *
+     * @param string $url URL to request
+     *
+     * @return \HTTP_Request2_Response Response with status 200
+     *
+     * @throws \Exception When the response status is not 200
+     */
+    protected function fetch($url)
+    {
+        $req = new HttpRequest($url);
+        $res = $req->send();
+
+        $status = $res->getStatus();
+        if ($status !== 200) {
+            throw new \Exception(
+                "Response code is not 200 but "
+                . $status . ", stopping"
+            );
+        }
+        return $res;
+    }
+
+    /**
+     * Pick the extractor matching the response's MIME type and let it
+     * collect the links.
+     *
+     * @param \HTTP_Request2_Response $res Fetched response
+     *
+     * @return array LinkInfo objects; empty when the MIME type is
+     *               not supported
+     */
+    protected function extractLinks(\HTTP_Request2_Response $res)
+    {
+        // The header may be absent, may carry parameters
+        // ("text/html ; charset=utf-8") and is case-insensitive, so it
+        // is normalized before it is used as lookup key.
+        $contentType = (string) $res->getHeader('content-type');
+        $mimetype = strtolower(trim(explode(';', $contentType)[0]));
+
+        if (!isset(static::$supportedIndexTypes[$mimetype])) {
+            echo "MIME type not supported for indexing: $mimetype\n";
+            return array();
+        }
+
+        $class = static::$supportedIndexTypes[$mimetype];
+        $extractor = new $class();
+        return $extractor->extract($res);
+    }
+
+    /**
+     * Queue every link that is not known yet.
+     *
+     * Each unknown link is marked as queued and added to the index
+     * queue; it is added to the crawl queue as well only when
+     * Helper::isUrlAllowed() accepts its URL.
+     *
+     * @param array $linkInfos LinkInfo objects to process
+     *
+     * @return void
+     */
+    protected function enqueue($linkInfos)
+    {
+        foreach ($linkInfos as $linkInfo) {
+            if ($this->es->isKnown($linkInfo->url)) {
+                continue;
+            }
+            $this->es->markQueued($linkInfo->url);
+            $this->queue->addToIndex(
+                $linkInfo->url, $linkInfo->title, $linkInfo->source
+            );
+            if (Helper::isUrlAllowed($linkInfo->url)) {
+                $this->queue->addToCrawl($linkInfo->url);
+            }
+        }
+    }
+}
+?>
diff --git a/src/phinde/Helper.php b/src/phinde/Helper.php
index 0b98521..40ea751 100644
--- a/src/phinde/Helper.php
+++ b/src/phinde/Helper.php
@@ -20,5 +20,15 @@ class Helper
$url
);
}
+
+    /**
+     * Prefix a URL with "http://" unless it already starts with an
+     * http or https scheme.
+     *
+     * URI schemes are case-insensitive, so the prefix is compared
+     * without regard to case; "HTTPS://example.org" is returned as is.
+     *
+     * @param string $url URL, with or without scheme
+     *
+     * @return string URL that starts with an http or https scheme
+     */
+    public static function addSchema($url)
+    {
+        if (strncasecmp($url, 'http://', 7) === 0
+            || strncasecmp($url, 'https://', 8) === 0
+        ) {
+            return $url;
+        }
+        return 'http://' . $url;
+    }
}
?>
diff --git a/src/phinde/HttpRequest.php b/src/phinde/HttpRequest.php
new file mode 100644
index 0000000..e68bd84
--- /dev/null
+++ b/src/phinde/HttpRequest.php
@@ -0,0 +1,16 @@
+<?php
+namespace phinde;
+
+/**
+ * HTTP_Request2 with the settings the crawler uses for every request.
+ */
+class HttpRequest extends \HTTP_Request2
+{
+    /**
+     * Create a request for the URL and apply the crawler defaults:
+     * redirects are followed, connecting may take 5 and the whole
+     * request 10 (timeout settings), SSL peer verification is switched
+     * off and the user agent is "phinde/bot".
+     *
+     * @param string $url URL to request
+     */
+    public function __construct($url)
+    {
+        parent::__construct($url);
+
+        // setConfig() applies an array of options one by one, in order.
+        $this->setConfig(
+            array(
+                'follow_redirects' => true,
+                'connect_timeout'  => 5,
+                'timeout'          => 10,
+                'ssl_verify_peer'  => false,
+            )
+        );
+        $this->setHeader('user-agent', 'phinde/bot');
+    }
+}
+?>
diff --git a/src/phinde/LinkExtractor/Atom.php b/src/phinde/LinkExtractor/Atom.php
new file mode 100644
index 0000000..bb4d90b
--- /dev/null
+++ b/src/phinde/LinkExtractor/Atom.php
@@ -0,0 +1,35 @@
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the "alternate" links of all entries in an Atom feed.
+ */
+class Atom
+{
+    /**
+     * Collect one LinkInfo per distinct alternate link in the feed.
+     *
+     * Link URLs are resolved against the effective URL of the response;
+     * the entry title is used as link title and the feed URL as source.
+     *
+     * @param \HTTP_Request2_Response $res Response containing the feed
+     *
+     * @return array LinkInfo objects; empty when the body is not
+     *               parseable XML
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url  = $res->getEffectiveUrl();
+        $base = new \Net_URL2($url);
+
+        $sx = simplexml_load_string($res->getBody());
+        if ($sx === false) {
+            // Body is not well-formed XML, so there is nothing to extract
+            return array();
+        }
+
+        $linkInfos   = array();
+        $alreadySeen = array();
+
+        foreach ($sx->entry as $entry) {
+            $linkTitle = (string) $entry->title;
+            foreach ($entry->link as $xlink) {
+                // A link without "rel" attribute has to be interpreted
+                // as rel="alternate" (RFC 4287, section 4.2.7.2).
+                $rel = isset($xlink['rel'])
+                    ? (string) $xlink['rel']
+                    : 'alternate';
+                if ($rel !== 'alternate') {
+                    // Links of other relation types are ignored and must
+                    // not hide an alternate link to the same URL.
+                    continue;
+                }
+
+                $linkUrl = (string) $base->resolve((string) $xlink['href']);
+                if (isset($alreadySeen[$linkUrl])) {
+                    continue;
+                }
+
+                $alreadySeen[$linkUrl] = true;
+                $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
+            }
+        }
+
+        return $linkInfos;
+    }
+}
+?>
diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php
new file mode 100644
index 0000000..538d6c4
--- /dev/null
+++ b/src/phinde/LinkExtractor/Html.php
@@ -0,0 +1,67 @@
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the targets of all <a> elements of an HTML document.
+ */
+class Html
+{
+    /**
+     * Collect one LinkInfo per distinct http/https link in the document.
+     *
+     * Link URLs are resolved against the effective URL of the response
+     * and stripped of their fragment. The text content of the <a>
+     * element is used as link title, the document URL as source.
+     *
+     * @param \HTTP_Request2_Response $res Response containing the HTML
+     *
+     * @return array LinkInfo objects; empty when the body is empty
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url = $res->getEffectiveUrl();
+
+        $linkInfos = array();
+
+        $body = (string) $res->getBody();
+        if (trim($body) === '') {
+            // loadHTML() rejects empty input; on PHP 8 it throws, which
+            // the @ operator below does not suppress.
+            return $linkInfos;
+        }
+
+        //FIXME: mime type switch for cdata
+        $doc = new \DOMDocument();
+        //@ to hide parse warning messages in invalid html
+        @$doc->loadHTML($body);
+
+        //FIXME: extract base url from html
+        $base = new \Net_URL2($url);
+
+        $xpath = new \DOMXPath($doc);
+        $links = $xpath->evaluate('//a');
+        //FIXME: link rel, img, video
+
+        $alreadySeen = array();
+
+        foreach ($links as $link) {
+            $linkTitle = $link->textContent;
+
+            // getAttribute() yields '' when the attribute is missing;
+            // surrounding whitespace is not part of the URL.
+            $href = trim($link->getAttribute('href'));
+            if ($href === '' || $href[0] === '#') {
+                //link on this page
+                continue;
+            }
+
+            $linkUrlObj = $base->resolve($href);
+            $linkUrlObj->setFragment(false);
+            $linkUrl = (string) $linkUrlObj;
+            if (isset($alreadySeen[$linkUrl])) {
+                continue;
+            }
+
+            // Only web links are of interest (no mailto:, javascript:, ...).
+            // Schemes are case-insensitive.
+            $scheme = strtolower((string) $linkUrlObj->getScheme());
+            if ($scheme !== 'http' && $scheme !== 'https') {
+                continue;
+            }
+
+            //FIXME: check target type
+            //FIXME: check nofollow
+            $linkInfos[] = new LinkInfo(
+                $linkUrl, $linkTitle, $url
+            );
+            $alreadySeen[$linkUrl] = true;
+        }
+
+        return $linkInfos;
+    }
+}
+?>
diff --git a/src/phinde/LinkInfo.php b/src/phinde/LinkInfo.php
new file mode 100644
index 0000000..4e3980c
--- /dev/null
+++ b/src/phinde/LinkInfo.php
@@ -0,0 +1,17 @@
+<?php
+namespace phinde;
+
+/**
+ * Plain data holder describing one link found on a page.
+ */
+class LinkInfo
+{
+    /**
+     * Target URL of the link
+     *
+     * @var string
+     */
+    public $url;
+
+    /**
+     * Title of the link, if any
+     *
+     * @var string|null
+     */
+    public $title;
+
+    /**
+     * URL of the document the link was found in, if any
+     *
+     * @var string|null
+     */
+    public $source;
+
+    /**
+     * Store the given values in the public properties of the same name.
+     *
+     * @param string      $url    Target URL of the link
+     * @param string|null $title  Title of the link
+     * @param string|null $source URL of the document containing the link
+     */
+    public function __construct($url, $title = null, $source = null)
+    {
+        $this->source = $source;
+        $this->title  = $title;
+        $this->url    = $url;
+    }
+}
+?>
diff --git a/src/phinde/Queue.php b/src/phinde/Queue.php
new file mode 100644
index 0000000..98f6462
--- /dev/null
+++ b/src/phinde/Queue.php
@@ -0,0 +1,54 @@
+<?php
+namespace phinde;
+
+/**
+ * Puts index and crawl jobs into the Gearman job queue.
+ */
+class Queue
+{
+    /**
+     * Client connected to the Gearman job server.
+     *
+     * @var \GearmanClient
+     */
+    protected $gmclient;
+
+    /**
+     * Create the Gearman client and register the server on 127.0.0.1.
+     */
+    public function __construct()
+    {
+        $this->gmclient = new \GearmanClient();
+        $this->gmclient->addServer('127.0.0.1');
+    }
+
+    /**
+     * Queue a "phinde_index" background job for the link.
+     *
+     * Terminates the script with exit code 2 when queuing fails.
+     *
+     * @param string $linkUrl   URL that shall be indexed
+     * @param string $linkTitle Title of the link
+     * @param string $sourceUrl URL of the document the link was found in
+     *
+     * @return void
+     */
+    public function addToIndex($linkUrl, $linkTitle, $sourceUrl)
+    {
+        $this->queueJob(
+            'phinde_index',
+            'indexing',
+            $linkUrl,
+            array(
+                'url'    => $linkUrl,
+                'title'  => $linkTitle,
+                'source' => $sourceUrl
+            )
+        );
+    }
+
+    /**
+     * Queue a "phinde_crawl" background job for the link.
+     *
+     * Terminates the script with exit code 2 when queuing fails.
+     *
+     * @param string $linkUrl URL that shall be crawled
+     *
+     * @return void
+     */
+    public function addToCrawl($linkUrl)
+    {
+        $this->queueJob(
+            'phinde_crawl',
+            'crawling',
+            $linkUrl,
+            array(
+                'url' => $linkUrl
+            )
+        );
+    }
+
+    /**
+     * Submit a background job with a serialized payload and stop the
+     * script with exit code 2 if the client does not report success.
+     *
+     * @param string $function Name of the Gearman function to run
+     * @param string $action   Word describing the job in the messages
+     *                         ("indexing" or "crawling")
+     * @param string $linkUrl  URL the job is about, used in the messages
+     * @param array  $payload  Job data; serialized before submission
+     *
+     * @return void
+     */
+    protected function queueJob($function, $action, $linkUrl, array $payload)
+    {
+        echo "Queuing for $action: $linkUrl\n";
+        $this->gmclient->doBackground($function, serialize($payload));
+
+        $returnCode = $this->gmclient->returnCode();
+        if ($returnCode !== GEARMAN_SUCCESS) {
+            echo 'Error queueing URL ' . $action . ' for '
+                . $linkUrl . "\n"
+                . 'Error code: ' . $returnCode . "\n";
+            exit(2);
+        }
+    }
+}
+?>