/**
 * Collection of protected helpers for extracting content from DOM documents:
 * author data lookup plus small XPath, inner-HTML and URL utilities.
 *
 * NOTE(review): every helper visible here is protected, so the class is
 * presumably meant to be extended by concrete extractors - confirm.
 */
4 class Content_Extractor_Base
/**
 * Constructor; requires a Logger instance.
 *
 * NOTE(review): the body is not visible in this excerpt - presumably it
 * stores $log on the instance; confirm.
 *
 * @param Logger $log Logger object
 */
6 public function __construct(Logger $log)
/**
 * Fill the "author_name", "author_image" and "author_url" keys of $data.
 *
 * All three keys are reset to null first. The code then looks for elements
 * with the "p-author" class: when exactly one is found, name, photo and URL
 * are read from that element; otherwise page-level metadata from the
 * document head (meta name="author", link rel="author") is queried.
 *
 * NOTE(review): several lines of this method are not visible in this
 * excerpt (closing arguments, the target of the link rel="author" lookup,
 * and whether the fallback branch returns early) - confirm against the
 * full file before changing the control flow.
 *
 * @param mixed  $hentry Context node passed to getFirst() for the page-level lookups
 * @param mixed  $xpath  XPath object; evaluate() is called on it
 * @param array  $data   Result array, modified by reference
 * @param string $source Not used on any visible line - presumably the base
 *                       URL handed to absUrl(); confirm
 */
11 protected function extractAuthorData($hentry, $xpath, &$data, $source)
// Reset all author keys so they exist even when nothing is found.
13 $data['author_name'] = null;
14 $data['author_image'] = null;
15 $data['author_url'] = null;
// Find elements carrying the "p-author" class.
// NOTE(review): no context node is visible in this call, so the relative
// path is presumably evaluated against the document instead of $hentry - confirm.
17 $authors = $xpath->evaluate(
18 './/*[' . $this->xpc('p-author') . ']'
// Anything other than exactly one match (none, or several) takes the fallback.
20 if ($authors->length != 1) {
21 //no p-author, so use page author data
// Author name from <meta name="author" content="..."> in the head
// (plain or XHTML-namespaced elements).
22 $data['author_name'] = $this->getFirst(
23 '/*[self::html or self::h:html]/*[self::head or self::h:head]'
24 . '/*[(self::meta or self::h:meta) and @name="author"]',
25 'content', $hentry, $xpath
// href of <link rel="author"> in the head.
// NOTE(review): the statement this expression belongs to is not visible -
// presumably it is assigned to $data['author_url']; confirm.
31 '/*[self::html or self::h:html]/*[self::head or self::h:head]'
32 . '/*[(self::link or self::h:link) and @rel="author"]',
33 'href', $hentry, $xpath
// Exactly one p-author element: read the details from it.
40 $author = $authors->item(0);
// Name comes from a "p-name" or "fn" classed descendant.
42 $data['author_name'] = $this->getFirst(
43 './/*[' . $this->xpc('p-name') . ' or ' . $this->xpc('fn') . ']',
// Photo URL comes from the src attribute of a "u-photo" descendant.
46 $data['author_image'] = $this->getFirst(
47 './/*[' . $this->xpc('u-photo') . ']',
48 'src', $author, $xpath
// Author URL: href of a "u-url" descendant, passed through absUrl().
50 $data['author_url'] = $this->absUrl(
52 './/*[' . $this->xpc('u-url') . ']',
53 'href', $author, $xpath
/**
 * Evaluate an XPath expression and return data from the first matching node.
 *
 * @param string            $xpathExpr XPath expression to evaluate
 * @param string|null|false $attrName  false: return the matched node itself;
 *                                     null (or any value loosely equal to
 *                                     null): return the node's nodeValue;
 *                                     otherwise: name of the attribute whose
 *                                     value is returned
 * @param mixed             $elem      Context node handed to evaluate()
 * @param mixed             $xpath     XPath object; evaluate() is called on it
 *
 * @return mixed The node, a string value, or null when the expression
 *               matches nothing or the requested attribute does not exist
 */
protected function getFirst($xpathExpr, $attrName, $elem, $xpath)
{
    $items = $xpath->evaluate($xpathExpr, $elem);
    // evaluate() may return a scalar for typed expressions; only node
    // lists with at least one entry are usable here.
    if (!$items instanceof \DOMNodeList || $items->length == 0) {
        return null;
    }

    $first = $items->item(0);
    if ($attrName === false) {
        return $first;
    }
    // Loose comparison kept on purpose so that existing callers passing
    // null or an empty value still get the node value.
    if ($attrName == null) {
        return $first->nodeValue;
    }

    // Non-element nodes have no attribute map, and getNamedItem() returns
    // null for a missing attribute - report both as "not found" instead of
    // dereferencing null.
    $attributes = $first->attributes;
    $attribute  = $attributes !== null
        ? $attributes->getNamedItem($attrName)
        : null;

    return $attribute !== null ? $attribute->nodeValue : null;
}
/**
 * Serialize the child nodes of an element to an HTML string.
 *
 * @param mixed $element Node whose childNodes are serialized
 *
 * @return string Concatenated HTML of all children, trimmed
 */
protected function innerHtml($element)
{
    $fragments = array();
    foreach ($element->childNodes as $node) {
        // Copy each child into its own scratch document so saveHTML()
        // serializes just that subtree.
        $scratch = new \DOMDocument();
        $scratch->appendChild($scratch->importNode($node, true));
        $fragments[] = rtrim($scratch->saveHTML(), "\n");
    }

    return trim(implode('', $fragments));
}
/**
 * Create a DOMXPath object with the XHTML namespace registered as "h".
 *
 * @param mixed $node Value passed to the DOMXPath constructor
 *
 * @return \DOMXPath
 */
protected function getXpath($node)
{
    $query = new \DOMXPath($node);
    $query->registerNamespace('h', 'http://www.w3.org/1999/xhtml');

    return $query;
}
/**
 * Build an XPath condition that matches a class name as a whole
 * space-separated token of the "class" attribute.
 *
 * The class attribute is whitespace-normalized and padded with spaces, and
 * the needle is padded the same way, so "foo" does not match "foobar".
 * $class is inserted verbatim between double quotes, so it must not
 * contain a double quote itself.
 *
 * @param string $class Class name to look for
 *
 * @return string XPath condition for use inside a predicate
 */
protected function xpc($class)
{
    $paddedClassList = 'concat(" ", normalize-space(@class), " ")';
    $paddedNeedle    = '" ' . $class . ' "';

    return 'contains(' . $paddedClassList . ',' . $paddedNeedle . ')';
}
/**
 * Quote a string as an XPath 1.0 string literal.
 *
 * XPath has no escape sequences and does not decode HTML entities, so the
 * value is emitted verbatim: in double quotes when it contains no double
 * quote, in single quotes when it contains no single quote, and as a
 * concat() call of double-quoted pieces joined by '"' when it contains both.
 *
 * @param string $str Value to quote
 *
 * @return string XPath expression that evaluates to exactly $str
 */
protected function xpq($str)
{
    $str = (string) $str;

    if (strpos($str, '"') === false) {
        return '"' . $str . '"';
    }
    if (strpos($str, "'") === false) {
        return "'" . $str . "'";
    }

    // Both quote kinds present: split on the double quote and glue the
    // pieces back together with a single-quoted '"' literal.
    $pieces = explode('"', $str);

    return 'concat("' . implode('", \'"\', "', $pieces) . '")';
}
/**
 * Resolve $url against $source and return the result as a string.
 *
 * A Net_URL2 object is built from $source and its resolve() method is
 * called with $url; the result is cast to string.
 *
 * NOTE(review): the lines between the signature and the Net_URL2
 * construction are not visible in this excerpt - there may be a guard
 * (e.g. for a null $url); confirm before relying on null handling.
 *
 * @param string $url    URL to resolve
 * @param string $source URL used as the resolution base
 *
 * @return string Resolved URL (on the visible return path)
 */
107 protected function absUrl($url, $source)
112 $sourceUrl = new \Net_URL2($source);
113 return (string)$sourceUrl->resolve($url);