2 namespace phinde\LinkExtractor;
9 public function extract(\HTTP_Request2_Response $res)
11 $url = $res->getEffectiveUrl();
15 //FIXME: mime type switch for cdata
16 $doc = new \DOMDocument();
17 //@ to hide parse warning messages in invalid html
18 @$doc->loadHTML($res->getBody());
20 //FIXME: extract base url from html
21 $base = new \Net_URL2($url);
23 $dx = new \DOMXPath($doc);
25 $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
28 $robots = $meta->attributes->getNamedItem('content')->textContent;
29 foreach (explode(',', $robots) as $value) {
30 if (trim($value) == 'nofollow') {
31 //we shall not follow the links
37 $links = $dx->evaluate('//a');
38 //FIXME: link rel, img, video
40 $alreadySeen = array($url => true);
42 foreach ($links as $link) {
43 $linkTitle = Helper::sanitizeTitle($link->textContent);
45 foreach ($link->attributes as $attribute) {
46 if ($attribute->name == 'href') {
47 $href = $attribute->textContent;
48 } else if ($attribute->name == 'rel') {
49 foreach (explode(',', $attribute->textContent) as $value) {
50 if (trim($value) == 'nofollow') {
51 //we shall not follow this link
57 if ($href == '' || $href{0} == '#') {
62 $linkUrlObj = $base->resolve($href);
63 $linkUrlObj->setFragment(false);
64 $linkUrl = (string) $linkUrlObj;
65 if (isset($alreadySeen[$linkUrl])) {
69 switch ($linkUrlObj->getScheme()) {
77 //FIXME: check target type
78 $linkInfos[] = new LinkInfo(
79 $linkUrl, $linkTitle, $url
81 $alreadySeen[$linkUrl] = true;