2 namespace phinde\LinkExtractor;
9 public function extract(\HTTP_Request2_Response $res)
11 $url = Helper::removeAnchor($res->getEffectiveUrl());
15 //FIXME: mime type switch for cdata
16 $doc = new \DOMDocument();
17 //@ to hide parse warning messages in invalid html
18 @$doc->loadHTML($res->getBody());
20 //FIXME: extract base url from html
21 $base = new \Net_URL2($url);
23 $dx = new \DOMXPath($doc);
25 $xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
27 $base = $base->resolve(
28 $xbase->attributes->getNamedItem('href')->textContent
32 $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
35 $robots = $meta->attributes->getNamedItem('content')->textContent;
36 foreach (explode(',', $robots) as $value) {
37 if (trim($value) == 'nofollow') {
38 //we shall not follow the links
44 $links = $dx->evaluate('//a');
45 //FIXME: link rel, img, video
47 $alreadySeen = array($url => true);
49 foreach ($links as $link) {
50 $linkTitle = Helper::sanitizeTitle($link->textContent);
52 foreach ($link->attributes as $attribute) {
53 if ($attribute->name == 'href') {
54 $href = $attribute->textContent;
55 } else if ($attribute->name == 'rel') {
56 foreach (explode(',', $attribute->textContent) as $value) {
57 if (trim($value) == 'nofollow') {
58 //we shall not follow this link
64 if ($href == '' || $href{0} == '#') {
69 $linkUrlObj = $base->resolve($href);
70 $linkUrlObj->setFragment(false);
71 $linkUrl = (string) $linkUrlObj;
72 if (isset($alreadySeen[$linkUrl])) {
76 switch ($linkUrlObj->getScheme()) {
84 //FIXME: check target type
85 $linkInfos[] = new LinkInfo(
86 $linkUrl, $linkTitle, $url
88 $alreadySeen[$linkUrl] = true;