diff options
| author | Christian Weiske <cweiske@cweiske.de> | 2016-02-10 14:56:20 +0100 |
|---|---|---|
| committer | Christian Weiske <cweiske@cweiske.de> | 2016-02-10 14:56:20 +0100 |
| commit | cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8 (patch) | |
| tree | 8cc7ee5d841f868e38ccc0b54d8cc6d33a852ed7 /src/phinde/LinkExtractor/Atom.php | |
| parent | f67e8f0bc3f51f2d280a86a8c7cffa68d812efe1 (diff) | |
| download | phinde-cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8.tar.gz phinde-cd02bac646f42a0cb402ff2dc8240aa01f1f0fb8.zip | |
rework crawler; add atom link extraction
Diffstat (limited to 'src/phinde/LinkExtractor/Atom.php')
| -rw-r--r-- | src/phinde/LinkExtractor/Atom.php | 35 |
1 files changed, 35 insertions, 0 deletions
<?php
namespace phinde\LinkExtractor;

use phinde\LinkInfo;

/**
 * Extracts the entry links from an Atom feed document.
 */
class Atom
{
    /**
     * Collect one LinkInfo per distinct "alternate" link found in the
     * feed's entries.
     *
     * Each link's href is resolved against the effective URL of the
     * response, so relative references become absolute. The title of the
     * enclosing entry is used as the link title, and the effective URL is
     * passed as the third LinkInfo constructor argument.
     *
     * Only direct "entry" children of the document root are inspected.
     *
     * @param \HTTP_Request2_Response $res Response whose body is the feed XML
     *
     * @return LinkInfo[] Links in document order, without duplicate URLs;
     *                    empty when the body cannot be parsed as XML
     */
    public function extract(\HTTP_Request2_Response $res)
    {
        $url  = $res->getEffectiveUrl();
        $base = new \Net_URL2($url);

        $linkInfos = array();

        $sx = simplexml_load_string($res->getBody());
        if ($sx === false) {
            // Body is not well-formed XML: nothing can be extracted, and
            // dereferencing the boolean below would only raise warnings.
            return $linkInfos;
        }

        $alreadySeen = array();
        foreach ($sx->entry as $entry) {
            $linkTitle = (string) $entry->title;
            foreach ($entry->link as $xlink) {
                // RFC 4287, section 4.2.7.2: a link without a "rel"
                // attribute has to be interpreted as rel="alternate".
                $rel = (string) $xlink['rel'];
                if ($rel !== '' && $rel !== 'alternate') {
                    continue;
                }

                // Resolving an empty reference would yield the feed URL
                // itself, which is not a link to an entry.
                $href = (string) $xlink['href'];
                if ($href === '') {
                    continue;
                }

                $linkUrl = (string) $base->resolve($href);
                if (isset($alreadySeen[$linkUrl])) {
                    continue;
                }

                // Only URLs that were actually collected count as seen, so
                // a non-alternate link can never hide a later alternate
                // link pointing to the same URL.
                $alreadySeen[$linkUrl] = true;
                $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
            }
        }

        return $linkInfos;
    }
}
