about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README.rst 1
-rwxr-xr-x bin/crawl.php 30
-rw-r--r-- src/phinde/Crawler.php 27
3 files changed, 53 insertions, 5 deletions
diff --git a/README.rst b/README.rst
index 40f8c55..3d10657 100644
--- a/README.rst
+++ b/README.rst
@@ -32,6 +32,7 @@ Dependencies
- PHP 5.5+
- elasticsearch 2.0
- gearman
+- Console_CommandLine
- Net_URL2
diff --git a/bin/crawl.php b/bin/crawl.php
index e9a6218..0d57bb3 100755
--- a/bin/crawl.php
+++ b/bin/crawl.php
@@ -3,12 +3,33 @@
namespace phinde;
require_once __DIR__ . '/../src/init.php';
-if ($argc < 2) {
- echo "No URL given\n";
- exit(1);
+$cc = new \Console_CommandLine();
+$cc->description = 'phinde URL crawler';
+$cc->version = '0.0.1';
+$cc->addOption(
+ 'showLinksOnly',
+ array(
+ 'short_name' => '-s',
+ 'long_name' => '--show-links',
+ 'description' => 'Only show which URLs were found',
+ 'action' => 'StoreTrue',
+ 'default' => false
+ )
+);
+$cc->addArgument(
+ 'url',
+ array(
+ 'description' => 'URL to crawl',
+ 'multiple' => false
+ )
+);
+try {
+ $res = $cc->parse();
+} catch (\Exception $e) {
+ $cc->displayError($e->getMessage());
}
-$url = $argv[1];
+$url = $res->args['url'];
$url = Helper::addSchema($url);
if (!Helper::isUrlAllowed($url)) {
echo "Domain is not allowed; not crawling\n";
@@ -17,6 +38,7 @@ if (!Helper::isUrlAllowed($url)) {
try {
$crawler = new Crawler();
+ $crawler->setShowLinksOnly($res->options['showLinksOnly']);
$crawler->crawl($url);
} catch (\Exception $e) {
echo $e->getMessage() . "\n";
diff --git a/src/phinde/Crawler.php b/src/phinde/Crawler.php
index f3158aa..9b14878 100644
--- a/src/phinde/Crawler.php
+++ b/src/phinde/Crawler.php
@@ -6,6 +6,11 @@ class Crawler
protected $es;
protected $queue;
+ /**
+ * If the links only should be shown, not queued
+ */
+ protected $showLinksOnly = false;
+
static $supportedIndexTypes = array(
'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
@@ -22,7 +27,11 @@ class Crawler
{
$res = $this->fetch($url);
$linkInfos = $this->extractLinks($res);
- $this->enqueue($linkInfos);
+ if ($this->showLinksOnly) {
+ $this->showLinks($linkInfos);
+ } else {
+ $this->enqueue($linkInfos);
+ }
}
protected function fetch($url)
@@ -70,5 +79,21 @@ class Crawler
}
}
}
+
+ protected function showLinks($linkInfos)
+ {
+ foreach ($linkInfos as $linkInfo) {
+ echo $linkInfo->url . "\n";
+ if ($linkInfo->title) {
+ echo ' title: ' . $linkInfo->title . "\n";
+ echo ' source: ' . $linkInfo->source . "\n";
+ }
+ }
+ }
+
+ public function setShowLinksOnly($showLinksOnly)
+ {
+ $this->showLinksOnly = $showLinksOnly;
+ }
}
?>