blob: 0d6f3d8aa1159bd9053801464033abf063fad09b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
<?php
namespace phinde\LinkExtractor;
use phinde\LinkInfo;
/**
 * Extracts the followable hyperlinks from an HTML HTTP response.
 */
class Html
{
    /**
     * Collect the distinct http/https targets of all <a> elements.
     *
     * Relative links are resolved against the effective URL of the
     * response and URL fragments are stripped. No links are returned when
     * the document carries a robots meta tag containing "nofollow";
     * individual links with a "nofollow" rel token are skipped.
     *
     * @param \HTTP_Request2_Response $res Response whose body is parsed as HTML
     *
     * @return LinkInfo[] One LinkInfo (link URL, link text, source URL)
     *                    per distinct link URL, in document order
     */
    public function extract(\HTTP_Request2_Response $res)
    {
        $url = $res->getEffectiveUrl();

        $body = (string) $res->getBody();
        if ($body === '') {
            //DOMDocument::loadHTML() throws a ValueError for empty input
            // on PHP 8; an empty document has no links anyway
            return array();
        }

        //FIXME: mime type switch for cdata
        $doc = new \DOMDocument();
        //collect parse errors of invalid html internally instead of
        // emitting warnings, then restore the previous libxml setting
        $previousErrorMode = libxml_use_internal_errors(true);
        $doc->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        //FIXME: extract base url from html
        $base = new \Net_URL2($url);

        $dx = new \DOMXPath($doc);

        $metas = $dx->evaluate('/html/head/meta[@name="robots"]');
        foreach ($metas as $meta) {
            //"content" is the standard attribute for robots directives;
            // "value" is still accepted because earlier versions read it
            foreach (array('content', 'value') as $attributeName) {
                if ($meta->hasAttribute($attributeName)
                    && $this->hasNofollow($meta->getAttribute($attributeName))
                ) {
                    //we shall not follow the links
                    return array();
                }
            }
        }

        $links = $dx->evaluate('//a');
        //FIXME: link rel, img, video

        $linkInfos   = array();
        $alreadySeen = array();
        foreach ($links as $link) {
            //getAttribute() returns an empty string for missing attributes
            if ($this->hasNofollow($link->getAttribute('rel'))) {
                //we shall not follow this link
                continue;
            }

            //URLs in href may be surrounded by whitespace
            $href = trim($link->getAttribute('href'));
            if ($href === '' || $href[0] === '#') {
                //link on this page
                continue;
            }

            $linkUrlObj = $base->resolve($href);
            $linkUrlObj->setFragment(false);
            $linkUrl = (string) $linkUrlObj;
            if (isset($alreadySeen[$linkUrl])) {
                continue;
            }

            $scheme = $linkUrlObj->getScheme();
            if ($scheme !== 'http' && $scheme !== 'https') {
                //mailto:, javascript:, ftp: and the like are not crawled
                continue;
            }

            //FIXME: check target type
            $linkInfos[] = new LinkInfo(
                $linkUrl, $link->textContent, $url
            );
            $alreadySeen[$linkUrl] = true;
        }

        return $linkInfos;
    }

    /**
     * Tell whether a rel attribute or robots directive list contains the
     * "nofollow" token.
     *
     * Tokens may be separated by whitespace (rel attribute) and/or commas
     * (robots meta tag); the comparison is case-insensitive.
     *
     * @param string $value Raw attribute value
     *
     * @return boolean True when "nofollow" is one of the tokens
     */
    private function hasNofollow($value)
    {
        $tokens = preg_split(
            '/[\s,]+/', strtolower((string) $value), -1, PREG_SPLIT_NO_EMPTY
        );
        if ($tokens === false) {
            return false;
        }
        return in_array('nofollow', $tokens, true);
    }
}
?>
|