namespace phinde;
require_once __DIR__ . '/../src/init.php';
-$supportedCrawlTypes = array(
- 'text/html', 'application/xhtml+xml'
-);
-
-
if ($argc < 2) {
echo "No URL given\n";
exit(1);
}
-$es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
-
$url = $argv[1];
+$url = Helper::addSchema($url); // prepend "http://" when the argument has no http(s) scheme
if (!Helper::isUrlAllowed($url)) {
echo "Domain is not allowed; not crawling\n";
exit(2);
}
-
-$req = new \HTTP_Request2($url);
-//FIXME: send supported mime types in header
-$res = $req->send();
-if ($res->getStatus() !== 200) {
- echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n";
- exit(3);
-}
-$mimetype = explode(';', $res->getHeader('content-type'))[0];
-if (!in_array($mimetype, $supportedCrawlTypes)) {
- echo "MIME type not supported for crawling: $mimetype\n";
- exit(4);
-}
-
-//FIXME: mime type switch for cdata
-$doc = new \DOMDocument();
-//@ to hide parse warning messages in invalid html
-@$doc->loadHTMLFile($url);
-
-//FIXME: extract base url from html
-$base = new \Net_URL2($url);
-
-$xpath = new \DOMXPath($doc);
-$links = $xpath->evaluate('//a');
-//FIXME: link rel, img, video
-
-$alreadySeen = array();
-
-foreach ($links as $link) {
- $linkTitle = $link->textContent;
- $href = '';
- foreach ($link->attributes as $attribute) {
- if ($attribute->name == 'href') {
- $href = $attribute->textContent;
- }
- }
- if ($href == '' || $href{0} == '#') {
- //link on this page
- continue;
- }
-
- $linkUrlObj = $base->resolve($href);
- $linkUrlObj->setFragment(false);
- $linkUrl = (string) $linkUrlObj;
- if (isset($alreadySeen[$linkUrl])) {
- continue;
- }
-
- switch ($linkUrlObj->getScheme()) {
- case 'http':
- case 'https':
- break;
- default:
- continue 2;
- }
-
- if ($es->isKnown($linkUrl)) {
- continue;
- }
-
- //FIXME: check target type
- //FIXME: check nofollow
- //var_dump($linkTitle, $linkUrl);
- $es->markQueued($linkUrl);
- addToIndex($linkUrl, $linkTitle, $url);
- if (Helper::isUrlAllowed($linkUrl)) {
- addToCrawl($linkUrl);
- }
- $alreadySeen[$linkUrl] = true;
-}
-
-function addToIndex($linkUrl, $linkTitle, $sourceUrl)
-{
- echo "Queuing for indexing: $linkUrl\n";
- $gmclient = new \GearmanClient();
- $gmclient->addServer('127.0.0.1');
- $gmclient->doBackground(
- 'phinde_index',
- serialize(
- array(
- 'url' => $linkUrl,
- 'title' => $linkTitle,
- 'source' => $sourceUrl
- )
- )
- );
- if ($gmclient->returnCode() != GEARMAN_SUCCESS) {
- echo 'Error queueing URL indexing for '
- . $linkUrl . "\n"
- . 'Error code: ' . $gmclient->returnCode() . "\n";
- exit(2);
- }
-}
-
-function addToCrawl($linkUrl)
-{
- echo "Queuing for crawling: $linkUrl\n";
- $gmclient = new \GearmanClient();
- $gmclient->addServer('127.0.0.1');
- $gmclient->doBackground(
- 'phinde_crawl',
- serialize(
- array(
- 'url' => $linkUrl
- )
- )
- );
- if ($gmclient->returnCode() != GEARMAN_SUCCESS) {
- echo 'Error queueing URL crawling for '
- . $linkUrl . "\n"
- . 'Error code: ' . $gmclient->returnCode() . "\n";
- exit(2);
- }
+try { // run the crawl; any exception is reported in the catch branch
+ $crawler = new Crawler();
+ $crawler->crawl($url); // fetches $url, extracts its links and enqueues them
+} catch (\Exception $e) {
+ echo $e->getMessage() . "\n";
+ exit(10); // exit status used when the crawl threw an exception
}
?>
\ No newline at end of file
--- /dev/null
+<?php
+namespace phinde;
+
+/**
+ * Fetches a single URL, extracts the links it contains and queues every
+ * link that is not yet known for indexing and, when allowed, for crawling.
+ */
+class Crawler
+{
+    /**
+     * Search index client; asked whether a URL is known and told when a
+     * URL has been queued.
+     *
+     * @var Elasticsearch
+     */
+    protected $es;
+
+    /**
+     * Job queue that receives the index and crawl tasks.
+     *
+     * @var Queue
+     */
+    protected $queue;
+
+    /**
+     * Maps a lower-case MIME type to the link extractor class handling it.
+     *
+     * @var array
+     */
+    public static $supportedIndexTypes = array(
+        'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
+        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
+        'text/html'             => '\\phinde\\LinkExtractor\\Html',
+    );
+
+    /**
+     * Create the search index client from the global configuration and
+     * the job queue.
+     */
+    public function __construct()
+    {
+        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
+        $this->queue = new Queue();
+    }
+
+    /**
+     * Fetch the URL, extract its links and enqueue them.
+     *
+     * @param string $url URL to crawl
+     *
+     * @return void
+     *
+     * @throws \Exception When the response status is not 200
+     */
+    public function crawl($url)
+    {
+        $res = $this->fetch($url);
+        $linkInfos = $this->extractLinks($res);
+        $this->enqueue($linkInfos);
+    }
+
+    /**
+     * Request the URL and return the response.
+     *
+     * @param string $url URL to fetch
+     *
+     * @return \HTTP_Request2_Response Response with status 200
+     *
+     * @throws \Exception When the response status is not 200
+     */
+    protected function fetch($url)
+    {
+        $req = new HttpRequest($url);
+        $res = $req->send();
+        if ($res->getStatus() !== 200) {
+            throw new \Exception(
+                "Response code is not 200 but "
+                . $res->getStatus() . ", stopping"
+            );
+        }
+        return $res;
+    }
+
+    /**
+     * Pick the link extractor matching the response's MIME type and let
+     * it extract the links.
+     *
+     * @param \HTTP_Request2_Response $res Fetched document
+     *
+     * @return array LinkInfo objects; empty when the MIME type is unsupported
+     */
+    protected function extractLinks(\HTTP_Request2_Response $res)
+    {
+        // Media types are case-insensitive and may carry parameters and
+        // padding ("Text/HTML ; charset=utf-8"). A missing header is
+        // treated as an empty, unsupported type.
+        $header   = (string) $res->getHeader('content-type');
+        $mimetype = strtolower(trim(explode(';', $header)[0]));
+        if (!isset(static::$supportedIndexTypes[$mimetype])) {
+            echo "MIME type not supported for indexing: $mimetype\n";
+            return array();
+        }
+
+        $class = static::$supportedIndexTypes[$mimetype];
+        $extractor = new $class();
+        return $extractor->extract($res);
+    }
+
+    /**
+     * Queue all links that the search index does not know yet.
+     *
+     * Every unknown link is marked as queued and sent to the index queue;
+     * it is sent to the crawl queue as well when Helper::isUrlAllowed()
+     * accepts its URL.
+     *
+     * @param array $linkInfos LinkInfo objects to enqueue
+     *
+     * @return void
+     */
+    protected function enqueue($linkInfos)
+    {
+        foreach ($linkInfos as $linkInfo) {
+            if ($this->es->isKnown($linkInfo->url)) {
+                continue;
+            }
+            $this->es->markQueued($linkInfo->url);
+            $this->queue->addToIndex(
+                $linkInfo->url, $linkInfo->title, $linkInfo->source
+            );
+            if (Helper::isUrlAllowed($linkInfo->url)) {
+                $this->queue->addToCrawl($linkInfo->url);
+            }
+        }
+    }
+}
+?>
$url
);
}
+
+    /**
+     * Prefix a URL with "http://" unless it already has an http(s) scheme.
+     *
+     * The scheme is compared case-insensitively, so "HTTPS://example.org"
+     * is returned unchanged instead of being prefixed a second time.
+     *
+     * @param string $url URL with or without scheme
+     *
+     * @return string URL that starts with an http or https scheme
+     */
+    public static function addSchema($url)
+    {
+        if (preg_match('#^https?://#i', $url) === 1) {
+            return $url;
+        }
+        return 'http://' . $url;
+    }
}
?>
--- /dev/null
+<?php
+namespace phinde;
+
+/**
+ * HTTP request preconfigured for the crawler: redirects are followed,
+ * timeouts are short and the bot's user agent is sent.
+ */
+class HttpRequest extends \HTTP_Request2
+{
+    /**
+     * Set up the request for the given URL with the crawler defaults.
+     *
+     * @param string $url URL to request
+     */
+    public function __construct($url)
+    {
+        parent::__construct($url);
+
+        // NOTE(review): peer certificate verification is switched off here.
+        $this->setConfig(
+            array(
+                'follow_redirects' => true,
+                'connect_timeout'  => 5,
+                'timeout'          => 10,
+                'ssl_verify_peer'  => false,
+            )
+        );
+        $this->setHeader('user-agent', 'phinde/bot');
+    }
+}
+?>
--- /dev/null
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the entry links of an Atom feed.
+ */
+class Atom
+{
+    /**
+     * Collect the "alternate" links of all feed entries.
+     *
+     * Relative links are resolved against the effective URL of the
+     * response. Each resulting URL is reported only once.
+     *
+     * @param \HTTP_Request2_Response $res Response holding the feed XML
+     *
+     * @return array LinkInfo objects; empty when the XML cannot be parsed
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url  = $res->getEffectiveUrl();
+        $base = new \Net_URL2($url);
+
+        $sx = simplexml_load_string($res->getBody());
+        if ($sx === false) {
+            // Broken XML: there is nothing to iterate over.
+            echo "Feed XML could not be parsed: $url\n";
+            return array();
+        }
+
+        $linkInfos   = array();
+        $alreadySeen = array();
+
+        foreach ($sx->entry as $entry) {
+            $linkTitle = (string) $entry->title;
+            foreach ($entry->link as $xlink) {
+                // RFC 4287: a link without "rel" is an "alternate" link.
+                $rel = (string) $xlink['rel'];
+                if ($rel !== '' && $rel !== 'alternate') {
+                    continue;
+                }
+
+                $href = trim((string) $xlink['href']);
+                if ($href === '') {
+                    // Would resolve to the feed URL itself.
+                    continue;
+                }
+
+                // Only links that are actually reported count as seen, so
+                // a "self" or "replies" link cannot hide an alternate one.
+                $linkUrl = (string) $base->resolve($href);
+                if (isset($alreadySeen[$linkUrl])) {
+                    continue;
+                }
+                $alreadySeen[$linkUrl] = true;
+
+                $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
+            }
+        }
+
+        return $linkInfos;
+    }
+}
+?>
--- /dev/null
+<?php
+namespace phinde\LinkExtractor;
+
+use phinde\LinkInfo;
+
+/**
+ * Extracts the hyperlinks of an HTML document.
+ */
+class Html
+{
+    /**
+     * Collect the targets of all <a> elements that point to http or
+     * https URLs.
+     *
+     * Relative links are resolved against the effective URL of the
+     * response, fragments are removed and every resulting URL is
+     * reported only once. Links that are empty or only reference a
+     * fragment of the page itself are skipped.
+     *
+     * @param \HTTP_Request2_Response $res Response holding the HTML
+     *
+     * @return array LinkInfo objects
+     */
+    public function extract(\HTTP_Request2_Response $res)
+    {
+        $url = $res->getEffectiveUrl();
+
+        $linkInfos = array();
+
+        $body = (string) $res->getBody();
+        if (trim($body) === '') {
+            // DOMDocument::loadHTML() throws a ValueError on empty input
+            // as of PHP 8; an empty document has no links anyway.
+            return $linkInfos;
+        }
+
+        //FIXME: mime type switch for cdata
+        $doc = new \DOMDocument();
+        //@ to hide parse warning messages in invalid html
+        @$doc->loadHTML($body);
+
+        //FIXME: extract base url from html
+        $base = new \Net_URL2($url);
+
+        $xpath = new \DOMXPath($doc);
+        $links = $xpath->evaluate('//a');
+        //FIXME: link rel, img, video
+
+        $alreadySeen = array();
+
+        foreach ($links as $link) {
+            $linkTitle = $link->textContent;
+            // getAttribute() returns '' when the attribute is missing;
+            // HTML permits whitespace around the URL.
+            $href = trim($link->getAttribute('href'));
+            if ($href === '' || $href[0] === '#') {
+                //link on this page
+                continue;
+            }
+
+            $linkUrlObj = $base->resolve($href);
+            $linkUrlObj->setFragment(false);
+            $linkUrl = (string) $linkUrlObj;
+            if (isset($alreadySeen[$linkUrl])) {
+                continue;
+            }
+
+            $scheme = $linkUrlObj->getScheme();
+            if ($scheme !== 'http' && $scheme !== 'https') {
+                // mailto:, javascript:, ftp: and the like are not crawled
+                continue;
+            }
+
+            //FIXME: check target type
+            //FIXME: check nofollow
+            $linkInfos[] = new LinkInfo(
+                $linkUrl, $linkTitle, $url
+            );
+            $alreadySeen[$linkUrl] = true;
+        }
+
+        return $linkInfos;
+    }
+}
+?>
--- /dev/null
+<?php
+namespace phinde;
+
+/**
+ * Plain value object describing one link found in a crawled document.
+ */
+class LinkInfo
+{
+    /**
+     * Address the link points to.
+     */
+    public $url;
+
+    /**
+     * Title given to the link by the extractor, or null.
+     */
+    public $title;
+
+    /**
+     * URL of the document the link was found in, or null.
+     */
+    public $source;
+
+    /**
+     * Store the link data as given.
+     *
+     * @param string      $url    Address the link points to
+     * @param string|null $title  Title of the link
+     * @param string|null $source URL of the document containing the link
+     */
+    public function __construct($url, $title = null, $source = null)
+    {
+        $this->source = $source;
+        $this->title  = $title;
+        $this->url    = $url;
+    }
+}
+?>
--- /dev/null
+<?php
+namespace phinde;
+
+/**
+ * Hands index and crawl tasks to the Gearman server on 127.0.0.1 as
+ * background jobs.
+ */
+class Queue
+{
+    /**
+     * Gearman client all jobs are submitted through.
+     *
+     * @var \GearmanClient
+     */
+    protected $gmclient;
+
+    /**
+     * Create the Gearman client and register the local server.
+     */
+    public function __construct()
+    {
+        $client = new \GearmanClient();
+        $client->addServer('127.0.0.1');
+        $this->gmclient = $client;
+    }
+
+    /**
+     * Queue a URL for indexing.
+     *
+     * Ends the script with exit code 2 when the job cannot be queued.
+     *
+     * @param string $linkUrl   URL to index
+     * @param string $linkTitle Title of the link
+     * @param string $sourceUrl URL of the document the link was found in
+     *
+     * @return void
+     */
+    public function addToIndex($linkUrl, $linkTitle, $sourceUrl)
+    {
+        echo "Queuing for indexing: $linkUrl\n";
+        $this->submit(
+            'phinde_index',
+            array(
+                'url' => $linkUrl,
+                'title' => $linkTitle,
+                'source' => $sourceUrl
+            ),
+            'Error queueing URL indexing for ',
+            $linkUrl
+        );
+    }
+
+    /**
+     * Queue a URL for crawling.
+     *
+     * Ends the script with exit code 2 when the job cannot be queued.
+     *
+     * @param string $linkUrl URL to crawl
+     *
+     * @return void
+     */
+    public function addToCrawl($linkUrl)
+    {
+        echo "Queuing for crawling: $linkUrl\n";
+        $this->submit(
+            'phinde_crawl',
+            array(
+                'url' => $linkUrl
+            ),
+            'Error queueing URL crawling for ',
+            $linkUrl
+        );
+    }
+
+    /**
+     * Submit a serialized workload as background job; on failure print
+     * the error and stop the script.
+     *
+     * @param string $function    Name of the Gearman function
+     * @param array  $workload    Job data, serialized before sending
+     * @param string $errorPrefix Start of the error message
+     * @param string $linkUrl     URL named in the error message
+     *
+     * @return void
+     */
+    private function submit($function, array $workload, $errorPrefix, $linkUrl)
+    {
+        $this->gmclient->doBackground($function, serialize($workload));
+
+        $code = $this->gmclient->returnCode();
+        if ($code == GEARMAN_SUCCESS) {
+            return;
+        }
+
+        echo $errorPrefix
+            . $linkUrl . "\n"
+            . 'Error code: ' . $code . "\n";
+        exit(2);
+    }
+}
+?>