rework crawler; add atom link extraction
authorChristian Weiske <cweiske@cweiske.de>
Wed, 10 Feb 2016 13:56:20 +0000 (14:56 +0100)
committerChristian Weiske <cweiske@cweiske.de>
Wed, 10 Feb 2016 13:56:20 +0000 (14:56 +0100)
bin/crawl.php
src/phinde/Crawler.php [new file with mode: 0644]
src/phinde/Helper.php
src/phinde/HttpRequest.php [new file with mode: 0644]
src/phinde/LinkExtractor/Atom.php [new file with mode: 0644]
src/phinde/LinkExtractor/Html.php [new file with mode: 0644]
src/phinde/LinkInfo.php [new file with mode: 0644]
src/phinde/Queue.php [new file with mode: 0644]

index e39a622..e9a6218 100755 (executable)
 namespace phinde;
 require_once __DIR__ . '/../src/init.php';
 
-$supportedCrawlTypes = array(
-    'text/html', 'application/xhtml+xml'
-);
-
-
 if ($argc < 2) {
     echo "No URL given\n";
     exit(1);
 }
 
-$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
-
 $url = $argv[1];
+$url = Helper::addSchema($url);
 if (!Helper::isUrlAllowed($url)) {
     echo "Domain is not allowed; not crawling\n";
     exit(2);
 }
 
-
-$req = new \HTTP_Request2($url);
-//FIXME: send supported mime types in header
-$res = $req->send();
-if ($res->getStatus() !== 200) {
-    echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n";
-    exit(3);
-}
-$mimetype = explode(';', $res->getHeader('content-type'))[0];
-if (!in_array($mimetype, $supportedCrawlTypes)) {
-    echo "MIME type not supported for crawling: $mimetype\n";
-    exit(4);
-}
-
-//FIXME: mime type switch for cdata
-$doc = new \DOMDocument();
-//@ to hide parse warning messages in invalid html
-@$doc->loadHTMLFile($url);
-
-//FIXME: extract base url from html
-$base = new \Net_URL2($url);
-
-$xpath = new \DOMXPath($doc);
-$links = $xpath->evaluate('//a');
-//FIXME: link rel, img, video
-
-$alreadySeen = array();
-
-foreach ($links as $link) {
-    $linkTitle = $link->textContent;
-    $href = '';
-    foreach ($link->attributes as $attribute) {
-        if ($attribute->name == 'href') {
-            $href = $attribute->textContent;
-        }
-    }
-    if ($href == '' || $href{0} == '#') {
-        //link on this page
-        continue;
-    }
-
-    $linkUrlObj = $base->resolve($href);
-    $linkUrlObj->setFragment(false);
-    $linkUrl    = (string) $linkUrlObj;
-    if (isset($alreadySeen[$linkUrl])) {
-        continue;
-    }
-
-    switch ($linkUrlObj->getScheme()) {
-    case 'http':
-    case 'https':
-        break;
-    default:
-        continue 2;
-    }
-
-    if ($es->isKnown($linkUrl)) {
-        continue;
-    }
-
-    //FIXME: check target type
-    //FIXME: check nofollow
-    //var_dump($linkTitle, $linkUrl);
-    $es->markQueued($linkUrl);
-    addToIndex($linkUrl, $linkTitle, $url);
-    if (Helper::isUrlAllowed($linkUrl)) {
-        addToCrawl($linkUrl);
-    }
-    $alreadySeen[$linkUrl] = true;
-}
-
-function addToIndex($linkUrl, $linkTitle, $sourceUrl)
-{
-    echo "Queuing for indexing: $linkUrl\n";
-    $gmclient = new \GearmanClient();
-    $gmclient->addServer('127.0.0.1');
-    $gmclient->doBackground(
-        'phinde_index',
-        serialize(
-            array(
-                'url'    => $linkUrl,
-                'title'  => $linkTitle,
-                'source' => $sourceUrl
-            )
-        )
-    );
-    if ($gmclient->returnCode() != GEARMAN_SUCCESS) {
-        echo 'Error queueing URL indexing for '
-            . $linkUrl . "\n"
-            . 'Error code: ' . $gmclient->returnCode() . "\n";
-        exit(2);
-    }
-}
-
-function addToCrawl($linkUrl)
-{
-    echo "Queuing for crawling: $linkUrl\n";
-    $gmclient = new \GearmanClient();
-    $gmclient->addServer('127.0.0.1');
-    $gmclient->doBackground(
-        'phinde_crawl',
-        serialize(
-            array(
-                'url' => $linkUrl
-            )
-        )
-    );
-    if ($gmclient->returnCode() != GEARMAN_SUCCESS) {
-        echo 'Error queueing URL crawling for '
-            . $linkUrl . "\n"
-            . 'Error code: ' . $gmclient->returnCode() . "\n";
-        exit(2);
-    }
+try {
+    $crawler = new Crawler();
+    $crawler->crawl($url);
+} catch (\Exception $e) {
+    echo $e->getMessage() . "\n";
+    exit(10);
 }
 ?>
\ No newline at end of file
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
new file mode 100644 (file)
index 0000000..53320ec
--- /dev/null
@@ -0,0 +1,70 @@
+<?php
+namespace phinde;
+
+/**
+ * Fetches one URL, extracts the links from the response and puts every
+ * link that is not yet known into the indexing queue and, when the URL
+ * is allowed, into the crawl queue as well.
+ */
+class Crawler
+{
+    /**
+     * Elasticsearch client; asked via isKnown() and told via markQueued()
+     */
+    protected $es;
+
+    /**
+     * Job queue receiving the index and crawl tasks
+     */
+    protected $queue;
+
+    /**
+     * Map of (lower-case) MIME type to the link extractor class
+     * that is instantiated for responses of that type
+     */
+    public static $supportedIndexTypes = array(
+        'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
+        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
+        'text/html'             => '\\phinde\\LinkExtractor\\Html',
+    );
+
+    public function __construct()
+    {
+        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+        $this->queue = new Queue();
+    }
+
+    /**
+     * Fetch the URL, extract its links and enqueue them.
+     *
+     * @param string $url URL to crawl
+     *
+     * @return void
+     *
+     * @throws \Exception When the response status is not 200
+     */
+    public function crawl($url)
+    {
+        $res       = $this->fetch($url);
+        $linkInfos = $this->extractLinks($res);
+        $this->enqueue($linkInfos);
+    }
+
+    /**
+     * Request the URL and return the response.
+     *
+     * @param string $url URL to fetch
+     *
+     * @return \HTTP_Request2_Response Response with status code 200
+     *
+     * @throws \Exception When the response status is not 200
+     */
+    protected function fetch($url)
+    {
+        $req = new HttpRequest($url);
+        $res = $req->send();
+        if ($res->getStatus() !== 200) {
+            throw new \Exception(
+                "Response code is not 200 but "
+                . $res->getStatus() . ", stopping"
+            );
+        }
+        return $res;
+    }
+
+    /**
+     * Pick the extractor matching the response's MIME type and run it.
+     *
+     * @param \HTTP_Request2_Response $res Fetched response
+     *
+     * @return array Extracted link objects; empty when the MIME type
+     *               has no extractor
+     */
+    protected function extractLinks(\HTTP_Request2_Response $res)
+    {
+        //The header may be missing, MIME types are case-insensitive and
+        // there may be whitespace in front of the ";charset=.." parameter
+        $contentType = (string) $res->getHeader('content-type');
+        $mimetype    = strtolower(trim(explode(';', $contentType)[0]));
+        if (!isset(static::$supportedIndexTypes[$mimetype])) {
+            echo "MIME type not supported for indexing: $mimetype\n";
+            return array();
+        }
+
+        $class = static::$supportedIndexTypes[$mimetype];
+        $extractor = new $class();
+        return $extractor->extract($res);
+    }
+
+    /**
+     * Queue all links that Elasticsearch does not know yet.
+     *
+     * Each new link is marked as queued and sent to the index queue;
+     * it is sent to the crawl queue only when Helper::isUrlAllowed()
+     * accepts its URL.
+     *
+     * @param array $linkInfos Objects with url, title and source properties
+     *
+     * @return void
+     */
+    protected function enqueue($linkInfos)
+    {
+        foreach ($linkInfos as $linkInfo) {
+            if ($this->es->isKnown($linkInfo->url)) {
+                continue;
+            }
+            $this->es->markQueued($linkInfo->url);
+            $this->queue->addToIndex(
+                $linkInfo->url, $linkInfo->title, $linkInfo->source
+            );
+            if (Helper::isUrlAllowed($linkInfo->url)) {
+                $this->queue->addToCrawl($linkInfo->url);
+            }
+        }
+    }
+}
+?>
index 0b98521..40ea751 100644 (file)
@@ -20,5 +20,15 @@ class Helper
             $url
         );
     }
+
+    /**
+     * Prefix the URL with "http://" unless it already begins with an
+     * http or https scheme.
+     *
+     * The scheme is compared case-insensitively, so that input like
+     * "HTTP://example.org" is returned unchanged instead of being turned
+     * into "http://HTTP://example.org".
+     *
+     * @param string $url URL with or without scheme
+     *
+     * @return string URL starting with an http or https scheme
+     */
+    public static function addSchema($url)
+    {
+        if (strncasecmp($url, 'http://', 7) === 0
+            || strncasecmp($url, 'https://', 8) === 0
+        ) {
+            return $url;
+        }
+        return 'http://' . $url;
+    }
 }
 ?>
diff --git a/src/phinde/HttpRequest.php b/src/phinde/HttpRequest.php
new file mode 100644 (file)
index 0000000..e68bd84
--- /dev/null
@@ -0,0 +1,16 @@
+<?php
+namespace phinde;
+
+/**
+ * HTTP_Request2 with the crawler's default settings applied:
+ * redirects are followed, timeouts are set and the bot's
+ * user agent header is sent.
+ */
+class HttpRequest extends \HTTP_Request2
+{
+    /**
+     * Create the request and apply the default configuration.
+     *
+     * @param string $url URL to request
+     */
+    public function __construct($url)
+    {
+        parent::__construct($url);
+        $this->setConfig(
+            array(
+                'follow_redirects' => true,
+                'connect_timeout'  => 5,
+                'timeout'          => 10,
+                //NOTE(review): peer certificate verification is disabled,
+                // presumably to reach hosts with self-signed
+                // certificates - confirm this is intended
+                'ssl_verify_peer'  => false,
+            )
+        );
+        $this->setHeader('user-agent', 'phinde/bot');
+    }
+}
+?>
diff --git a/src/phinde/LinkExtractor/Atom.php b/src/phinde/LinkExtractor/Atom.php
new file mode 100644 (file)
index 0000000..bb4d90b
--- /dev/null
@@ -0,0 +1,35 @@
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the "alternate" links of all entries in an Atom feed.
+ */
+class Atom
+{
+    /**
+     * Collect one LinkInfo per distinct alternate link URL.
+     *
+     * The entry title is used as link title, the effective URL of the
+     * response as link source. Relative hrefs are resolved against the
+     * effective URL.
+     *
+     * @param \HTTP_Request2_Response $res Response containing the feed
+     *
+     * @return array LinkInfo objects; empty when the body cannot be
+     *               parsed as XML
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url  = $res->getEffectiveUrl();
+        $base = new \Net_URL2($url);
+
+        $sx = simplexml_load_string($res->getBody());
+        if ($sx === false) {
+            //body is not well-formed XML, so there is nothing to extract
+            return array();
+        }
+
+        $linkInfos   = array();
+        $alreadySeen = array();
+
+        foreach ($sx->entry as $entry) {
+            $linkTitle = (string) $entry->title;
+            foreach ($entry->link as $xlink) {
+                $href = (string) $xlink['href'];
+                if ($href === '') {
+                    //would resolve to the feed URL itself
+                    continue;
+                }
+
+                //a link without "rel" attribute counts as rel="alternate"
+                $rel = (string) $xlink['rel'];
+                if ($rel !== '' && $rel !== 'alternate') {
+                    continue;
+                }
+
+                $linkUrl = (string) $base->resolve($href);
+                if (isset($alreadySeen[$linkUrl])) {
+                    continue;
+                }
+
+                $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
+                //only alternate links are remembered, so that another
+                // relation to the same URL cannot hide the alternate one
+                $alreadySeen[$linkUrl] = true;
+            }
+        }
+
+        return $linkInfos;
+    }
+}
+?>
diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php
new file mode 100644 (file)
index 0000000..538d6c4
--- /dev/null
@@ -0,0 +1,67 @@
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the http and https links of all <a> elements in a HTML document.
+ */
+class Html
+{
+    /**
+     * Collect one LinkInfo per distinct link target.
+     *
+     * Links without href and links that only consist of a fragment are
+     * skipped. Relative hrefs are resolved against the effective URL of
+     * the response, and the fragment is removed from the result.
+     *
+     * @param \HTTP_Request2_Response $res Response containing the HTML
+     *
+     * @return array LinkInfo objects with the link text as title and
+     *               the effective response URL as source
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url = $res->getEffectiveUrl();
+
+        $linkInfos = array();
+
+        $body = (string) $res->getBody();
+        if ($body === '') {
+            //loadHTML() rejects empty input (ValueError on PHP 8)
+            return $linkInfos;
+        }
+
+        //FIXME: mime type switch for cdata
+        $doc = new \DOMDocument();
+        //@ to hide parse warning messages in invalid html
+        @$doc->loadHTML($body);
+
+        //FIXME: extract base url from html
+        $base = new \Net_URL2($url);
+
+        $xpath = new \DOMXPath($doc);
+        $links = $xpath->evaluate('//a');
+        //FIXME: link rel, img, video
+
+        $alreadySeen = array();
+
+        foreach ($links as $link) {
+            $linkTitle = $link->textContent;
+            //empty string when the attribute does not exist
+            $href = $link->getAttribute('href');
+            if ($href == '' || $href[0] == '#') {
+                //link on this page
+                continue;
+            }
+
+            $linkUrlObj = $base->resolve($href);
+            $linkUrlObj->setFragment(false);
+            $linkUrl    = (string) $linkUrlObj;
+            if (isset($alreadySeen[$linkUrl])) {
+                continue;
+            }
+
+            $scheme = $linkUrlObj->getScheme();
+            if ($scheme !== 'http' && $scheme !== 'https') {
+                //mailto:, javascript:, ftp: and the like
+                continue;
+            }
+
+            //FIXME: check target type
+            //FIXME: check nofollow
+            $linkInfos[] = new LinkInfo(
+                $linkUrl, $linkTitle, $url
+            );
+            $alreadySeen[$linkUrl] = true;
+        }
+
+        return $linkInfos;
+    }
+}
+?>
diff --git a/src/phinde/LinkInfo.php b/src/phinde/LinkInfo.php
new file mode 100644 (file)
index 0000000..4e3980c
--- /dev/null
@@ -0,0 +1,17 @@
+<?php
+namespace phinde;
+
+/**
+ * Plain data holder describing a single link.
+ */
+class LinkInfo
+{
+    /**
+     * Link target
+     */
+    public $url;
+
+    /**
+     * Link title, null when not given
+     */
+    public $title;
+
+    /**
+     * Source of the link, null when not given
+     */
+    public $source;
+
+    /**
+     * Store the given values in the public properties.
+     *
+     * @param mixed $url    Link target
+     * @param mixed $title  Link title
+     * @param mixed $source Source of the link
+     */
+    public function __construct($url, $title = null, $source = null)
+    {
+        $this->source = $source;
+        $this->title  = $title;
+        $this->url    = $url;
+    }
+}
+?>
diff --git a/src/phinde/Queue.php b/src/phinde/Queue.php
new file mode 100644 (file)
index 0000000..98f6462
--- /dev/null
@@ -0,0 +1,54 @@
+<?php
+namespace phinde;
+
+/**
+ * Sends index and crawl tasks as background jobs to the gearman
+ * server at 127.0.0.1.
+ */
+class Queue
+{
+    /**
+     * Gearman client connected in the constructor
+     */
+    protected $gmclient;
+
+    public function __construct()
+    {
+        $client = new \GearmanClient();
+        $client->addServer('127.0.0.1');
+        $this->gmclient = $client;
+    }
+
+    /**
+     * Submit a "phinde_index" background job for the link.
+     *
+     * Terminates the script with exit code 2 when queueing fails.
+     *
+     * @param string $linkUrl   URL that shall be indexed
+     * @param string $linkTitle Title of the link
+     * @param string $sourceUrl URL the link was found on
+     *
+     * @return void
+     */
+    public function addToIndex($linkUrl, $linkTitle, $sourceUrl)
+    {
+        echo "Queuing for indexing: $linkUrl\n";
+        $workload = array(
+            'url'    => $linkUrl,
+            'title'  => $linkTitle,
+            'source' => $sourceUrl
+        );
+        $this->submit(
+            'phinde_index', $workload,
+            'Error queueing URL indexing for ', $linkUrl
+        );
+    }
+
+    /**
+     * Submit a "phinde_crawl" background job for the link.
+     *
+     * Terminates the script with exit code 2 when queueing fails.
+     *
+     * @param string $linkUrl URL that shall be crawled
+     *
+     * @return void
+     */
+    public function addToCrawl($linkUrl)
+    {
+        echo "Queuing for crawling: $linkUrl\n";
+        $workload = array(
+            'url' => $linkUrl
+        );
+        $this->submit(
+            'phinde_crawl', $workload,
+            'Error queueing URL crawling for ', $linkUrl
+        );
+    }
+
+    /**
+     * Send the serialized workload as background job and stop the
+     * script with exit code 2 if gearman does not report success.
+     *
+     * @param string $function    Name of the gearman function
+     * @param array  $workload    Job data, gets serialized
+     * @param string $errorPrefix Text printed in front of the URL on error
+     * @param string $linkUrl     URL the job is about
+     *
+     * @return void
+     */
+    protected function submit($function, $workload, $errorPrefix, $linkUrl)
+    {
+        $this->gmclient->doBackground($function, serialize($workload));
+        if ($this->gmclient->returnCode() == GEARMAN_SUCCESS) {
+            return;
+        }
+
+        echo $errorPrefix
+            . $linkUrl . "\n"
+            . 'Error code: ' . $this->gmclient->returnCode() . "\n";
+        exit(2);
+    }
+}
+?>