aboutsummaryrefslogtreecommitdiff
path: root/bin/crawl.php
blob: 0d57bb3232f8a5490de75a81a00b6b73ccad1708 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env php
<?php
namespace phinde;
require_once __DIR__ . '/../src/init.php';

// Describe the command line interface: one boolean flag plus one positional
// URL argument.
$cliParser = new \Console_CommandLine();
$cliParser->description = 'phinde URL crawler';
$cliParser->version = '0.0.1';

// -s / --show-links: a "StoreTrue" flag that defaults to false and is
// registered under the option name "showLinksOnly".
$showLinksSpec = array(
    'short_name'  => '-s',
    'long_name'   => '--show-links',
    'description' => 'Only show which URLs were found',
    'action'      => 'StoreTrue',
    'default'     => false
);
$cliParser->addOption('showLinksOnly', $showLinksSpec);

// Positional "url" argument; "multiple" is switched off.
$urlArgumentSpec = array(
    'description' => 'URL to crawl',
    'multiple'    => false
);
$cliParser->addArgument('url', $urlArgumentSpec);

// Parse the command line. Any exception is reported through the parser's own
// error display.
// NOTE(review): $res is read unconditionally further down, so this relies on
// displayError() terminating the script - confirm against Console_CommandLine.
try {
    $res = $cliParser->parse();
} catch (\Exception $parseFailure) {
    $cliParser->displayError($parseFailure->getMessage());
}

// Take the URL from the parsed command line, pass it through
// Helper::addSchema() and refuse to continue when Helper::isUrlAllowed()
// rejects it (exit code 2).
$url = Helper::addSchema($res->args['url']);
if (!Helper::isUrlAllowed($url)) {
    // Diagnostics belong on STDERR so they cannot get mixed into whatever
    // the script prints on STDOUT.
    fwrite(STDERR, "Domain is not allowed; not crawling\n");
    exit(2);
}

// Run the crawler on the URL, forwarding the --show-links flag. Any exception
// raised while crawling ends the script with exit code 10.
try {
    $crawler = new Crawler();
    $crawler->setShowLinksOnly($res->options['showLinksOnly']);
    $crawler->crawl($url);
} catch (\Exception $e) {
    // Report the failure on STDERR rather than STDOUT (see above).
    fwrite(STDERR, $e->getMessage() . "\n");
    exit(10);
}
?>