From a2e7177d78911d219bc5be86c1cc86989b36983f Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Thu, 11 Feb 2016 08:43:01 +0100 Subject: [PATCH] debug option for crawler --- README.rst | 1 + bin/crawl.php | 30 ++++++++++++++++++++++++++---- src/phinde/Crawler.php | 27 ++++++++++++++++++++++++++- 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 40f8c55..3d10657 100644 --- a/README.rst +++ b/README.rst @@ -32,6 +32,7 @@ Dependencies - PHP 5.5+ - elasticsearch 2.0 - gearman +- Console_CommandLine - Net_URL2 diff --git a/bin/crawl.php b/bin/crawl.php index e9a6218..0d57bb3 100755 --- a/bin/crawl.php +++ b/bin/crawl.php @@ -3,12 +3,33 @@ namespace phinde; require_once __DIR__ . '/../src/init.php'; -if ($argc < 2) { - echo "No URL given\n"; - exit(1); +$cc = new \Console_CommandLine(); +$cc->description = 'phinde URL crawler'; +$cc->version = '0.0.1'; +$cc->addOption( + 'showLinksOnly', + array( + 'short_name' => '-s', + 'long_name' => '--show-links', + 'description' => 'Only show which URLs were found', + 'action' => 'StoreTrue', + 'default' => false + ) +); +$cc->addArgument( + 'url', + array( + 'description' => 'URL to crawl', + 'multiple' => false + ) +); +try { + $res = $cc->parse(); +} catch (\Exception $e) { + $cc->displayError($e->getMessage()); } -$url = $argv[1]; +$url = $res->args['url']; $url = Helper::addSchema($url); if (!Helper::isUrlAllowed($url)) { echo "Domain is not allowed; not crawling\n"; @@ -17,6 +38,7 @@ if (!Helper::isUrlAllowed($url)) { try { $crawler = new Crawler(); + $crawler->setShowLinksOnly($res->options['showLinksOnly']); $crawler->crawl($url); } catch (\Exception $e) { echo $e->getMessage() . 
"\n"; diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php index f3158aa..9b14878 100644 --- a/src/phinde/Crawler.php +++ b/src/phinde/Crawler.php @@ -6,6 +6,11 @@ class Crawler protected $es; protected $queue; + /** + * When true, extracted links are only printed (showLinks()) instead of being queued (enqueue()) + */ + protected $showLinksOnly = false; + static $supportedIndexTypes = array( 'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom', 'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html', @@ -22,7 +27,11 @@ class Crawler { $res = $this->fetch($url); $linkInfos = $this->extractLinks($res); - $this->enqueue($linkInfos); + if ($this->showLinksOnly) { + $this->showLinks($linkInfos); + } else { + $this->enqueue($linkInfos); + } } protected function fetch($url) @@ -70,5 +79,21 @@ class Crawler } } } + + protected function showLinks($linkInfos) + { + foreach ($linkInfos as $linkInfo) { + echo $linkInfo->url . "\n"; + if ($linkInfo->title) { + echo ' title: ' . $linkInfo->title . "\n"; + echo ' source: ' . $linkInfo->source . "\n"; + } + } + } + + public function setShowLinksOnly($showLinksOnly) + { + $this->showLinksOnly = $showLinksOnly; + } } ?> -- 2.30.2