aboutsummaryrefslogtreecommitdiff
path: root/src/phinde/Crawler.php
blob: 9b148789437bac97dbdf897048487e48b2ca0121 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
<?php
namespace phinde;

/**
 * Fetches a single URL, extracts the links contained in the response and
 * either prints them or hands them over to the queue.
 */
class Crawler
{
    /**
     * Elasticsearch client; asked whether a URL is already known and used
     * to mark URLs as queued.
     */
    protected $es;

    /**
     * Queue that receives the "add to index" and "add to crawl" jobs.
     */
    protected $queue;

    /**
     * If the links only should be shown, not queued
     */
    protected $showLinksOnly = false;

    /**
     * Maps MIME types to the link extractor class that handles them.
     * The keys are also sent as the "accept" header when fetching.
     * Keys must be lowercase; lookups are done on a lowercased MIME type.
     */
    public static $supportedIndexTypes = array(
        'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
        'text/html'             => '\\phinde\\LinkExtractor\\Html',
    );

    /**
     * Creates the Elasticsearch client from the global phinde
     * configuration and a fresh queue.
     */
    public function __construct()
    {
        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
        $this->queue = new Queue();
    }

    /**
     * Fetches the URL, extracts its links and shows or queues them,
     * depending on the "show links only" setting.
     *
     * @param string $url URL to crawl
     *
     * @return void
     *
     * @throws \Exception When the response status is not 200
     */
    public function crawl($url)
    {
        $res       = $this->fetch($url);
        $linkInfos = $this->extractLinks($res);
        if ($this->showLinksOnly) {
            $this->showLinks($linkInfos);
        } else {
            $this->enqueue($linkInfos);
        }
    }

    /**
     * Requests the URL, advertising all supported MIME types in the
     * "accept" header.
     *
     * @param string $url URL to fetch
     *
     * @return object Response returned by the request's send() method
     *
     * @throws \Exception When the response status is not 200
     */
    protected function fetch($url)
    {
        $req = new HttpRequest($url);
        $req->setHeader(
            'accept',
            implode(',', array_keys(static::$supportedIndexTypes))
        );
        $res = $req->send();
        if ($res->getStatus() !== 200) {
            throw new \Exception(
                "Response code is not 200 but "
                . $res->getStatus() . ", stopping"
            );
        }
        return $res;
    }

    /**
     * Picks the link extractor matching the response's MIME type and
     * lets it extract the links.
     *
     * Prints a notice and returns an empty array when no extractor is
     * registered for the MIME type.
     *
     * @param \HTTP_Request2_Response $res Response to extract links from
     *
     * @return array Whatever the extractor's extract() method returns,
     *               or an empty array for unsupported MIME types
     */
    protected function extractLinks(\HTTP_Request2_Response $res)
    {
        $mimetype = $this->normalizeMimeType(
            $res->getHeader('content-type')
        );
        if (!isset(static::$supportedIndexTypes[$mimetype])) {
            echo "MIME type not supported for indexing: $mimetype\n";
            return array();
        }

        $class = static::$supportedIndexTypes[$mimetype];
        $extractor = new $class();
        return $extractor->extract($res);
    }

    /**
     * Reduces a Content-Type header value to its bare, lowercase MIME type.
     *
     * Parameters after the first ";" are dropped, surrounding whitespace
     * is removed and the type is lowercased, so that values such as
     * "Text/HTML ; charset=utf-8" still match the lowercase keys of
     * $supportedIndexTypes.
     *
     * @param string|null $contentType Raw header value; null or empty
     *                                 when the header is missing
     *
     * @return string Lowercase MIME type, empty string if none was given
     */
    protected function normalizeMimeType($contentType)
    {
        // Cast first: a missing header must not reach explode() as null.
        $parts = explode(';', (string) $contentType);
        return strtolower(trim($parts[0]));
    }

    /**
     * Queues every link that Elasticsearch does not know yet.
     *
     * Each new URL is marked as queued and added to the index queue;
     * it is added to the crawl queue only if Helper::isUrlAllowed()
     * accepts it.
     *
     * @param array $linkInfos Link objects with url, title and source
     *                         properties
     *
     * @return void
     */
    protected function enqueue($linkInfos)
    {
        foreach ($linkInfos as $linkInfo) {
            if ($this->es->isKnown($linkInfo->url)) {
                continue;
            }
            $this->es->markQueued($linkInfo->url);
            $this->queue->addToIndex(
                $linkInfo->url, $linkInfo->title, $linkInfo->source
            );
            if (Helper::isUrlAllowed($linkInfo->url)) {
                $this->queue->addToCrawl($linkInfo->url);
            }
        }
    }

    /**
     * Prints each link's URL; title and source are printed only for
     * links that have a title.
     *
     * @param array $linkInfos Link objects with url, title and source
     *                         properties
     *
     * @return void
     */
    protected function showLinks($linkInfos)
    {
        foreach ($linkInfos as $linkInfo) {
            echo $linkInfo->url . "\n";
            if ($linkInfo->title) {
                echo '  title: ' . $linkInfo->title . "\n";
                echo '  source: ' . $linkInfo->source . "\n";
            }
        }
    }

    /**
     * Switches between printing the extracted links and queueing them.
     *
     * @param boolean $showLinksOnly True to only print links
     *
     * @return void
     */
    public function setShowLinksOnly($showLinksOnly)
    {
        $this->showLinksOnly = $showLinksOnly;
    }
}
?>