aboutsummaryrefslogtreecommitdiff
path: root/bin/process.php
blob: 9fc47527b9dd12ad43e8115fc038a9d5f5e765af (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env php
<?php
namespace phinde;
require_once __DIR__ . '/../src/init.php';

// Command line interface: two boolean switches, one URL argument and an
// optional list of actions.
$cc = new \Console_CommandLine();
$cc->description = 'phinde URL processor';
$cc->version     = '0.0.1';

// Both switches are StoreTrue flags defaulting to false; only their names
// and help texts differ, so they are registered from a small table.
// Each entry: option name => (short name, long name, description).
$switches = array(
    'force' => array(
        '-f',
        '--force',
        'Always process URL, even when it did not change',
    ),
    'showLinksOnly' => array(
        '-s',
        '--show-links',
        'Only show which URLs were found',
    ),
);
foreach ($switches as $switchName => $switchSpec) {
    list($shortName, $longName, $helpText) = $switchSpec;
    $cc->addOption(
        $switchName,
        array(
            'short_name'  => $shortName,
            'long_name'   => $longName,
            'description' => $helpText,
            'action'      => 'StoreTrue',
            'default'     => false
        )
    );
}

// First positional argument: exactly one URL.
$cc->addArgument(
    'url',
    array('description' => 'URL to process', 'multiple' => false)
);

// Remaining positional arguments: action names. The list of allowed
// choices doubles as the default value.
$knownActions = array('index', 'crawl');
$cc->addArgument(
    'actions',
    array(
        'description' => 'Actions to take',
        'multiple'    => true,
        'optional'    => true,
        'choices'     => $knownActions,
        'default'     => $knownActions,
    )
);

try {
    $res = $cc->parse();
} catch (\Exception $parseError) {
    // NOTE(review): the code below reads $res unconditionally, so this
    // relies on displayError() terminating the script (PEAR's
    // Console_CommandLine exits by default) - confirm.
    $cc->displayError($parseError->getMessage());
}

// Take the URL given on the command line, pass it through
// Helper::addSchema() and let Net_URL2 produce its normalized form.
$urlObj = new \Net_URL2(Helper::addSchema($res->args['url']));
$url    = $urlObj->getNormalizedURL();

// Main pipeline: build the action handlers, fetch the URL, run every
// handler on the fetched result and store the document if any handler
// asked for an update. Any exception is logged and ends the script with
// exit code 10.
try {
    // One handler object per requested action, keyed by the action name.
    $handlers = array();
    foreach ($res->args['actions'] as $actionName) {
        switch ($actionName) {
        case 'crawl':
            $linkCrawler = new Crawler();
            $linkCrawler->setShowLinksOnly($res->options['showLinksOnly']);
            $handlers[$actionName] = $linkCrawler;
            break;
        case 'index':
            $handlers[$actionName] = new Indexer();
            break;
        }
    }

    $fetcher = new Fetcher();
    $fetched = $fetcher->fetch($url, $handlers, $res->options['force']);
    if ($fetched === false) {
        // The fetcher returned nothing to work on; this is not treated
        // as an error.
        exit(0);
    }

    // Collect the handlers' return values; a single truthy one is enough
    // to trigger storing the document afterwards.
    $needsStore = false;
    foreach ($handlers as $key => $handler) {
        Log::info("step: $key");
        $needsStore = $needsStore | $handler->run($fetched);
    }

    if (!$needsStore) {
        Log::info("Not updating");
    } else {
        //FIXME: update index if it exists already
        $fetcher->storeDoc($fetched->url, $fetched->esDoc);
    }
} catch (\Exception $failure) {
    Log::error($failure->getMessage());
    exit(10);
}
?>