aboutsummaryrefslogtreecommitdiff
path: root/src/phinde/LinkExtractor/Html.php
blob: 538d6c4fd0ccefd7da222561eaefd3d7fa71d77e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
<?php
namespace phinde\LinkExtractor;

use phinde\LinkInfo;

/**
 * Extracts the links of the anchor elements found in an HTML response.
 */
class Html
{
    /**
     * Collect the unique http/https link targets of all <a> elements
     * in the response body.
     *
     * Relative links are resolved against the effective URL of the
     * response. Fragments are stripped, so links that differ only in
     * their fragment are reported once. Links without href, links that
     * only point to a fragment of the same page and links with a scheme
     * other than http/https are skipped.
     *
     * @param \HTTP_Request2_Response $res Response whose body is parsed
     *                                     as HTML
     *
     * @return LinkInfo[] One object per unique link URL, built from the
     *                    link URL, the link text and the effective URL
     *                    of the response
     */
    public function extract(\HTTP_Request2_Response $res)
    {
        $url  = $res->getEffectiveUrl();
        $body = (string) $res->getBody();

        if ($body === '') {
            // Nothing to parse. DOMDocument::loadHTML() throws a ValueError
            // for an empty string on PHP 8, which "@" cannot suppress.
            return array();
        }

        //FIXME: mime type switch for cdata
        $doc = new \DOMDocument();
        //@ to hide parse warning messages in invalid html
        @$doc->loadHTML($body);

        //FIXME: extract base url from html
        $base = new \Net_URL2($url);

        $xpath = new \DOMXPath($doc);
        $links = $xpath->evaluate('//a');
        //FIXME: link rel, img, video

        $linkInfos   = array();
        $alreadySeen = array();

        foreach ($links as $link) {
            // getAttribute() returns an empty string when href is missing.
            // HTML allows whitespace around the URL, so strip it before
            // looking at the first character.
            $href = trim($link->getAttribute('href'));
            if ($href === '' || $href[0] === '#') {
                //link on this page
                continue;
            }

            $linkUrlObj = $base->resolve($href);
            // Drop the fragment so the same document is only listed once
            $linkUrlObj->setFragment(false);
            $linkUrl = (string) $linkUrlObj;
            if (isset($alreadySeen[$linkUrl])) {
                continue;
            }

            // Only web links are of interest; skip mailto:, javascript: etc.
            if (!in_array($linkUrlObj->getScheme(), array('http', 'https'), true)) {
                continue;
            }

            //FIXME: check target type
            //FIXME: check nofollow
            $linkInfos[] = new LinkInfo(
                $linkUrl, $link->textContent, $url
            );
            $alreadySeen[$linkUrl] = true;
        }

        return $linkInfos;
    }
}
?>