aboutsummaryrefslogtreecommitdiff
path: root/src/phinde/Crawler.php
blob: 4d596b40e0abc49031fa4127211defb78aed58c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
<?php
namespace phinde;

/**
 * Extracts the links from an already retrieved HTTP response, decides for
 * each link whether it may be crawled and/or indexed, and then either
 * queues the links for processing or only prints them.
 */
class Crawler
{
    /**
     * Elasticsearch helper, used to check if a URL is already known
     * and to mark URLs as queued.
     */
    protected $es;

    /**
     * Queue that receives the URLs together with their actions.
     */
    protected $queue;

    /**
     * If the links only should be shown, not queued
     */
    protected $showLinksOnly = false;

    /**
     * Maps (lower-case) MIME types to the link extractor class
     * that is able to handle them.
     */
    public static $supportedTypes = array(
        'application/atom+xml'  => '\\phinde\\LinkExtractor\\Atom',
        'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
        'text/html'             => '\\phinde\\LinkExtractor\\Html',
    );

    /**
     * Creates the Elasticsearch helper from the global phinde
     * configuration and a fresh queue object.
     */
    public function __construct()
    {
        $this->es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
        $this->queue = new Queue();
    }

    /**
     * Extract, filter and then queue (or only show) the links
     * of the given retrieval result.
     *
     * @param Retrieved $retrieved Retrieval result; its "httpRes" property
     *                             is handed to the link extractor
     *
     * @return boolean True when the links were queued,
     *                 false when they were only shown
     */
    public function run(Retrieved $retrieved)
    {
        $linkInfos = $this->extractLinks($retrieved->httpRes);
        $linkInfos = $this->filterLinks($linkInfos);
        if ($this->showLinksOnly) {
            $this->showLinks($linkInfos);
            return false;
        }

        $this->enqueue($linkInfos);
        return true;
    }

    /**
     * Run the link extractor matching the MIME type of the response.
     *
     * @param \HTTP_Request2_Response $res HTTP response to extract from
     *
     * @return array Link info objects as returned by the extractor;
     *               empty array when the MIME type is not supported
     */
    protected function extractLinks(\HTTP_Request2_Response $res)
    {
        // A response without Content-Type header gives us null here;
        // cast it so that explode() never gets a non-string.
        $contentType = (string) $res->getHeader('content-type');

        // Strip parameters such as "; charset=utf-8". Media types are
        // case-insensitive and may be surrounded by whitespace, so
        // normalize before looking them up in the lower-case map.
        $mimetype = strtolower(trim(explode(';', $contentType)[0]));

        if (!isset(static::$supportedTypes[$mimetype])) {
            Log::info("MIME type not supported for crawling: $mimetype");
            return array();
        }

        $class = static::$supportedTypes[$mimetype];
        $extractor = new $class();
        return $extractor->extract($res);
    }

    /**
     * Rewrite the URL of each link and annotate the link with the
     * "known", "crawl" and "index" flags.
     *
     * Despite the name no link is dropped here; the flags are
     * evaluated later by enqueue() and showLinks().
     *
     * @param array $linkInfos Link info objects with a "url" property
     *
     * @return array The same objects, with rewritten URL and flags set
     */
    protected function filterLinks($linkInfos)
    {
        $filteredLinkInfos = array();
        foreach ($linkInfos as $linkInfo) {
            $linkInfo->url = Helper::rewriteUrl($linkInfo->url);
            $allowed = Helper::isUrlAllowed($linkInfo->url);
            $crawl   = $allowed;
            $index   = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;

            if ($crawl) {
                // Blacklist entries are regular expression fragments
                // that get wrapped in "#" delimiters.
                foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) {
                    if (preg_match('#' . $bl . '#', $linkInfo->url)) {
                        $crawl = false;
                        // One hit is enough, skip the remaining patterns
                        break;
                    }
                }
            }

            $linkInfo->known = $this->es->isKnown($linkInfo->url);
            $linkInfo->crawl = $crawl;
            $linkInfo->index = $index;
            $filteredLinkInfos[] = $linkInfo;
        }
        return $filteredLinkInfos;
    }

    /**
     * Queue all links that are not known yet and that shall be
     * crawled and/or indexed.
     *
     * Each URL is queued only once per call, even if it is contained
     * several times in the list.
     *
     * @param array $linkInfos Link info objects annotated by filterLinks()
     *
     * @return void
     */
    protected function enqueue($linkInfos)
    {
        // The "known" flag was determined before anything got queued,
        // so repeated links on one page all look unknown. Track the
        // URLs handled in this run to prevent duplicate queue entries.
        $queued = array();

        foreach ($linkInfos as $linkInfo) {
            if ($linkInfo->known) {
                continue;
            }
            if (!$linkInfo->crawl && !$linkInfo->index) {
                continue;
            }

            $key = (string) $linkInfo->url;
            if (isset($queued[$key])) {
                continue;
            }
            $queued[$key] = true;

            $this->es->markQueued($linkInfo->url);
            $actions = array();
            if ($linkInfo->index) {
                $actions[] = 'index';
            }
            if ($linkInfo->crawl) {
                $actions[] = 'crawl';
            }
            $this->queue->addToProcessList(
                $linkInfo->url, $actions
            );
        }
    }

    /**
     * Log the URL of every link and, for links that have a title,
     * also title, source and the known/crawl/index flags.
     *
     * @param array $linkInfos Link info objects annotated by filterLinks()
     *
     * @return void
     */
    protected function showLinks($linkInfos)
    {
        foreach ($linkInfos as $linkInfo) {
            Log::msg($linkInfo->url);
            // NOTE(review): source and flags are only printed for links
            // with a title - confirm that this is intended.
            if ($linkInfo->title) {
                Log::msg('   title: ' . $linkInfo->title);
                Log::msg('  source: ' . $linkInfo->source);
                Log::msg(
                    '   known: ' . intval($linkInfo->known)
                    . ', crawl: ' . intval($linkInfo->crawl)
                    . ', index: ' . intval($linkInfo->index)
                );
            }
        }
    }

    /**
     * Switch between queueing the links and only showing them.
     *
     * @param boolean $showLinksOnly True to only show links in run()
     *
     * @return void
     */
    public function setShowLinksOnly($showLinksOnly)
    {
        $this->showLinksOnly = $showLinksOnly;
    }
}
?>