From cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8 Mon Sep 17 00:00:00 2001 From: Christian Weiske Date: Wed, 10 Feb 2016 14:56:20 +0100 Subject: rework crawler; add atom link extraction --- src/phinde/LinkExtractor/Atom.php | 35 ++++++++++++++++++++ src/phinde/LinkExtractor/Html.php | 67 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 src/phinde/LinkExtractor/Atom.php create mode 100644 src/phinde/LinkExtractor/Html.php (limited to 'src/phinde/LinkExtractor') diff --git a/src/phinde/LinkExtractor/Atom.php b/src/phinde/LinkExtractor/Atom.php new file mode 100644 index 0000000..bb4d90b --- /dev/null +++ b/src/phinde/LinkExtractor/Atom.php @@ -0,0 +1,35 @@ +getEffectiveUrl(); + $base = new \Net_URL2($url); + + $sx = simplexml_load_string($res->getBody()); + $linkInfos = array(); + $alreadySeen = array(); + + foreach ($sx->entry as $entry) { + $linkTitle = (string) $entry->title; + foreach ($entry->link as $xlink) { + $linkUrl = (string) $base->resolve((string) $xlink['href']); + if (isset($alreadySeen[$linkUrl])) { + continue; + } + + if ($xlink['rel'] == 'alternate') { + $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url); + } + $alreadySeen[$linkUrl] = true; + } + } + + return $linkInfos; + } +} +?> diff --git a/src/phinde/LinkExtractor/Html.php b/src/phinde/LinkExtractor/Html.php new file mode 100644 index 0000000..538d6c4 --- /dev/null +++ b/src/phinde/LinkExtractor/Html.php @@ -0,0 +1,67 @@ +getEffectiveUrl(); + + $linkInfos = array(); + + //FIXME: mime type switch for cdata + $doc = new \DOMDocument(); + //@ to hide parse warning messages in invalid html + @$doc->loadHTML($res->getBody()); + + //FIXME: extract base url from html + $base = new \Net_URL2($url); + + $xpath = new \DOMXPath($doc); + $links = $xpath->evaluate('//a'); + //FIXME: link rel, img, video + + $alreadySeen = array(); + + foreach ($links as $link) { + $linkTitle = $link->textContent; + $href = ''; + foreach ($link->attributes as $attribute) { + if ($attribute->name == 'href') { + $href = $attribute->textContent; + } + } + if ($href == '' || $href{0} == '#') { + //link on this page + continue; + } + + $linkUrlObj = $base->resolve($href); + $linkUrlObj->setFragment(false); + $linkUrl = (string) $linkUrlObj; + if (isset($alreadySeen[$linkUrl])) { + continue; + } + + switch ($linkUrlObj->getScheme()) { + case 'http': + case 'https': + break; + default: + continue 2; + } + + //FIXME: check target type + //FIXME: check nofollow + $linkInfos[] = new LinkInfo( + $linkUrl, $linkTitle, $url + ); + $alreadySeen[$linkUrl] = true; + } + + return $linkInfos; + } +} +?> -- cgit v1.2.3