// Bootstrap: pull in the project's init script, resolved relative to this file.
4 require_once __DIR__ . '/../src/init.php';
// Create the Console_CommandLine parser and set the tool's description and version.
6 $cc = new \Console_CommandLine();
7 $cc->description = 'phinde URL processor';
8 $cc->version = '0.0.1';
// Option "--force": a StoreTrue flag, i.e. a switch without a value.
13 'long_name' => '--force',
14 'description' => 'Always process URL, even when it did not change',
15 'action' => 'StoreTrue',
// Option "--show-links": another StoreTrue flag.
// NOTE(review): presumably registered under the name "showLinksOnly", which is
// the key read from $res->options further down - confirm against the full file.
23 'long_name' => '--show-links',
24 'description' => 'Only show which URLs were found',
25 'action' => 'StoreTrue',
// Positional argument holding the URL to work on.
32 'description' => 'URL to process',
// Positional argument listing the actions to run; restricted to "index" and
// "crawl", and defaulting to both.
39 'description' => 'Actions to take',
42 'choices' => array('index', 'crawl'),
43 'default' => array('index', 'crawl'),
48 } catch (\Exception $e) {
// Hand the exception message to the parser's own error display.
49 $cc->displayError($e->getMessage());
// Take the URL from the parsed arguments and bring it into canonical form:
// Helper::addSchema() first (presumably prepends a scheme when missing - TODO
// confirm), then Net_URL2 normalization.
52 $url = $res->args['url'];
53 $url = Helper::addSchema($url);
54 $urlObj = new \Net_URL2($url);
55 $url = $urlObj->getNormalizedURL();
// Log an error when Helper::isUrlAllowed() rejects the normalized URL.
56 if (!Helper::isUrlAllowed($url)) {
57 Log::error("Domain is not allowed; not crawling");
// Build the $actions map: one handler object per requested action, keyed by
// the action name.
63 foreach ($res->args['actions'] as $action) {
64 if ($action == 'crawl') {
// The crawler receives the value of the "showLinksOnly" command-line option.
65 $crawler = new Crawler();
66 $crawler->setShowLinksOnly($res->options['showLinksOnly']);
67 $actions[$action] = $crawler;
68 } else if ($action == 'index') {
69 $actions[$action] = new Indexer();
// Fetch the URL; the action handlers and the "force" option are passed to the
// fetcher as well.
73 $fetcher = new Fetcher();
74 $retrieved = $fetcher->fetch($url, $actions, $res->options['force']);
// fetch() returning exactly false is handled as a special case (the branch
// body is not shown here).
75 if ($retrieved === false) {
// Run every action handler on the retrieved document, logging each step name.
80 foreach ($actions as $key => $action) {
81 Log::info("step: $key");
// Accumulate the handlers' results with a bitwise OR.
// NOTE(review): "|=" yields an integer, not a boolean - verify that the code
// consuming $update only relies on truthiness.
82 $update |= $action->run($retrieved);
86 //FIXME: update index if it exists already
// Persist the document through the fetcher, using the URL and esDoc taken
// from the retrieved object.
87 $fetcher->storeDoc($retrieved->url, $retrieved->esDoc);
89 Log::info("Not updating");
// Any exception raised above is logged as an error.
91 } catch (\Exception $e) {
92 Log::error($e->getMessage());