blob: 538d6c4fd0ccefd7da222561eaefd3d7fa71d77e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
<?php
namespace phinde\LinkExtractor;
use phinde\LinkInfo;
/**
 * Extracts the outgoing hyperlinks (<a href="...">) of an HTML document.
 */
class Html
{
    /**
     * Collect the distinct http/https link targets found in a response body.
     *
     * The body is parsed as HTML. Every <a> element's href is resolved
     * against the document base URL (the first <base href> if the document
     * has one, otherwise the effective URL of the response), stripped of its
     * fragment, and reported once. Links without href, pure fragment links
     * and links with a scheme other than http/https are skipped.
     *
     * @param \HTTP_Request2_Response $res Response whose body is parsed
     *
     * @return LinkInfo[] One object per distinct link URL, in document
     *                    order, built from the link URL, the link's text
     *                    content and the effective URL of the response
     */
    public function extract(\HTTP_Request2_Response $res)
    {
        $url = $res->getEffectiveUrl();
        $linkInfos = array();

        $body = (string) $res->getBody();
        if ($body === '') {
            // DOMDocument::loadHTML() rejects an empty string (ValueError
            // on PHP 8, warning before that); there are no links anyway.
            return $linkInfos;
        }

        //FIXME: mime type switch for cdata
        $doc = new \DOMDocument();
        // Collect parse errors of invalid HTML internally instead of
        // emitting warnings, then restore the caller's libxml setting.
        $previousSetting = libxml_use_internal_errors(true);
        $doc->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $xpath = new \DOMXPath($doc);

        // Relative links are resolved against <base href> when present;
        // string() yields the first matching attribute in document order.
        $base = new \Net_URL2($url);
        $baseHref = trim(
            (string) $xpath->evaluate('string(//base/@href)'), " \t\n\r\x0C"
        );
        if ($baseHref !== '') {
            $base = $base->resolve($baseHref);
        }

        //FIXME: link rel, img, video
        $alreadySeen = array();
        foreach ($xpath->evaluate('//a') as $link) {
            $linkTitle = $link->textContent;

            // getAttribute() returns '' when the attribute is missing.
            // HTML allows whitespace around the URL in an attribute value.
            $href = trim($link->getAttribute('href'), " \t\n\r\x0C");
            if ($href === '' || $href[0] === '#') {
                //link on this page
                continue;
            }

            $linkUrlObj = $base->resolve($href);
            // The fragment does not identify a different document.
            $linkUrlObj->setFragment(false);
            $linkUrl = (string) $linkUrlObj;
            if (isset($alreadySeen[$linkUrl])) {
                continue;
            }

            // Only web links are of interest (no mailto:, javascript:, ...)
            $scheme = $linkUrlObj->getScheme();
            if (!in_array($scheme, array('http', 'https'), true)) {
                continue;
            }

            //FIXME: check target type
            //FIXME: check nofollow
            $linkInfos[] = new LinkInfo(
                $linkUrl, $linkTitle, $url
            );
            $alreadySeen[$linkUrl] = true;
        }

        return $linkInfos;
    }
}
?>
|