aboutsummaryrefslogtreecommitdiff
path: root/bin/process.php
diff options
context:
space:
mode:
authorChristian Weiske <cweiske@cweiske.de>2016-11-07 21:41:36 +0100
committerChristian Weiske <cweiske@cweiske.de>2016-11-07 21:41:36 +0100
commitd7651fd96dcfa2829519504e4c8ec1ce511cd57f (patch)
treee24d7a9f90060b0fee5a652de43bd0627f1c5bde /bin/process.php
parentf90790c6b2a54c9b1c8a0aeaf1f23e6aa67d7aca (diff)
downloadphinde-d7651fd96dcfa2829519504e4c8ec1ce511cd57f.tar.gz
phinde-d7651fd96dcfa2829519504e4c8ec1ce511cd57f.zip
Big patch merging crawling+indexing into one command, new json document structure
Diffstat (limited to 'bin/process.php')
-rwxr-xr-xbin/process.php95
1 files changed, 95 insertions, 0 deletions
diff --git a/bin/process.php b/bin/process.php
new file mode 100755
index 0000000..ababb03
--- /dev/null
+++ b/bin/process.php
@@ -0,0 +1,95 @@
+#!/usr/bin/env php
+<?php
+// phinde URL processor: fetch a single URL and run the requested
+// actions ("crawl" and/or "index") on the retrieved document.
+//
+// Exit codes: 0 = success or nothing to do, 2 = disallowed domain,
+// 10 = unhandled exception during processing.
+namespace phinde;
+require_once __DIR__ . '/../src/init.php';
+
+// Build the command-line definition (PEAR Console_CommandLine).
+$cc = new \Console_CommandLine();
+$cc->description = 'phinde URL processor';
+$cc->version = '0.0.1';
+// -f/--force: process the URL even when it did not change.
+$cc->addOption(
+    'force',
+    array(
+        'short_name' => '-f',
+        'long_name' => '--force',
+        'description' => 'Always process URL, even when it did not change',
+        'action' => 'StoreTrue',
+        'default' => false
+    )
+);
+// -s/--show-links: crawler dry-run; only report discovered URLs.
+$cc->addOption(
+    'showLinksOnly',
+    array(
+        'short_name' => '-s',
+        'long_name' => '--show-links',
+        'description' => 'Only show which URLs were found',
+        'action' => 'StoreTrue',
+        'default' => false
+    )
+);
+// Required positional argument: the URL to process.
+$cc->addArgument(
+    'url',
+    array(
+        'description' => 'URL to process',
+        'multiple' => false
+    )
+);
+// Optional list of actions; defaults to running both index and crawl.
+$cc->addArgument(
+    'actions',
+    array(
+        'description' => 'Actions to take',
+        'multiple' => true,
+        'optional' => true,
+        'choices' => array('index', 'crawl'),
+        'default' => array('index', 'crawl'),
+    )
+);
+try {
+    $res = $cc->parse();
+} catch (\Exception $e) {
+    // NOTE(review): assumes displayError() prints and exits the
+    // process; otherwise $res would be undefined below — confirm.
+    $cc->displayError($e->getMessage());
+}
+
+// Normalize the URL (add a missing schema, canonical form) before
+// checking it against the allowed-domain rules.
+$url = $res->args['url'];
+$url = Helper::addSchema($url);
+$urlObj = new \Net_URL2($url);
+$url = $urlObj->getNormalizedURL();
+if (!Helper::isUrlAllowed($url)) {
+    echo "Domain is not allowed; not crawling\n";
+    exit(2);
+}
+
+try {
+    // Map each requested action name to its worker object,
+    // keyed by action name (unknown names are silently skipped).
+    $actions = array();
+    foreach ($res->args['actions'] as $action) {
+        if ($action == 'crawl') {
+            $crawler = new Crawler();
+            $crawler->setShowLinksOnly($res->options['showLinksOnly']);
+            $actions[$action] = $crawler;
+        } else if ($action == 'index') {
+            $actions[$action] = new Indexer();
+        }
+    }
+
+    // Fetch the document; a false return means there is nothing to
+    // process (presumably: unchanged and --force not given — verify
+    // against Fetcher::fetch()), so exit successfully.
+    $fetcher = new Fetcher();
+    $retrieved = $fetcher->fetch($url, $actions, $res->options['force']);
+    if ($retrieved === false) {
+        exit(0);
+    }
+
+    // Run every action; any action returning true flags the document
+    // for storage ($update becomes int via |=, truthy check below).
+    $update = false;
+    foreach ($actions as $key => $action) {
+        echo "step: $key\n";
+        $update |= $action->run($retrieved);
+    }
+
+    if ($update) {
+        //FIXME: update index if it exists already
+        $fetcher->storeDoc($retrieved->url, $retrieved->esDoc);
+    } else {
+        echo "Not updating\n";
+    }
+} catch (\Exception $e) {
+    // Any processing failure: report the message and exit non-zero.
+    echo $e->getMessage() . "\n";
+    exit(10);
+}
+?> \ No newline at end of file