2 namespace phinde\LinkExtractor;
/**
 * Scans the HTML body of an HTTP response for <a> links.
 *
 * The body is parsed with DOMDocument, a robots meta tag is checked for
 * "nofollow", and every <a> element's href is resolved against the
 * response URL. A LinkInfo is appended to $linkInfos for each link that
 * reaches the end of the loop.
 *
 * NOTE(review): the initialisation of $linkInfos, the bodies of the
 * skip branches and the final return are not visible in this view;
 * presumably the method returns the $linkInfos list - confirm.
 *
 * @param \HTTP_Request2_Response $res Response whose body is scanned
 */
8 public function extract(\HTTP_Request2_Response $res)
// URL reported by the response object; used below as the base for
// resolving hrefs and passed to every LinkInfo as the third argument.
10 $url = $res->getEffectiveUrl();
14 //FIXME: mime type switch for cdata
15 $doc = new \DOMDocument();
16 //@ to hide parse warning messages in invalid html
17 @$doc->loadHTML($res->getBody());
19 //FIXME: extract base url from html
// Until the FIXME above is addressed, the response URL itself is the base.
20 $base = new \Net_URL2($url);
22 $dx = new \DOMXPath($doc);
// Select <meta name="robots"> elements under /html/head that carry a
// content attribute.
24 $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
// The content attribute is treated as a comma-separated list; each entry
// is trimmed and compared against the exact string 'nofollow'.
27 $robots = $meta->attributes->getNamedItem('content')->textContent;
28 foreach (explode(',', $robots) as $value) {
29 if (trim($value) == 'nofollow') {
30 //we shall not follow the links
// All <a> elements anywhere in the document.
36 $links = $dx->evaluate('//a');
37 //FIXME: link rel, img, video
// Keyed by the resolved, fragment-less link URL; used to detect duplicates.
39 $alreadySeen = array();
41 foreach ($links as $link) {
// The text content of the <a> element serves as the link title.
42 $linkTitle = $link->textContent;
// Walk the element's attributes to pick up href and inspect rel.
44 foreach ($link->attributes as $attribute) {
45 if ($attribute->name == 'href') {
46 $href = $attribute->textContent;
47 } else if ($attribute->name == 'rel') {
// NOTE(review): rel is split on commas here, but HTML defines rel as a
// space-separated token list, so rel="nofollow noopener" would not
// match 'nofollow' - confirm whether this is intended.
48 foreach (explode(',', $attribute->textContent) as $value) {
49 if (trim($value) == 'nofollow') {
50 //we shall not follow this link
// Detects empty hrefs and hrefs that start with '#'.
// NOTE(review): $href{0} is the curly-brace string offset syntax, which
// was deprecated in PHP 7.4 and removed in PHP 8.0; $href[0] is the
// supported form.
56 if ($href == '' || $href{0} == '#') {
// Resolve the href against the base URL and drop the fragment, so that
// URLs differing only in their #fragment yield the same $linkUrl key.
61 $linkUrlObj = $base->resolve($href);
62 $linkUrlObj->setFragment(false);
63 $linkUrl = (string) $linkUrlObj;
// Check whether this URL was already recorded for this document.
64 if (isset($alreadySeen[$linkUrl])) {
// Branch on the URL scheme (the individual cases are not visible here).
68 switch ($linkUrlObj->getScheme()) {
76 //FIXME: check target type
// Record the link as (target URL, link text, URL of the containing page)
// and mark the target URL as seen.
77 $linkInfos[] = new LinkInfo(
78 $linkUrl, $linkTitle, $url
80 $alreadySeen[$linkUrl] = true;