/**
 * Extracts microformats comment ("reply") data from an HTML document.
 *
 * A comment is an h-entry/hentry element containing a link marked
 * "u-in-reply-to" whose href equals the URL that was replied to.
 */
class Content_Extractor_Comment
{
    /**
     * Logger handed in by the creator of this object.
     *
     * Declared explicitly because dynamically created properties are
     * deprecated as of PHP 8.2; public matches the visibility a dynamic
     * property would have had.
     *
     * @var Logger
     */
    public $log;

    /**
     * @param Logger $log Logger instance kept for later use
     */
    public function __construct(Logger $log)
    {
        // NOTE(review): the constructor body was not visible in the reviewed
        // listing; storing the logger on $this->log is assumed - confirm.
        $this->log = $log;
    }

    /**
     * Try to extract comment data from HTML
     *
     * The first h-entry/hentry element that contains an anchor with class
     * "u-in-reply-to" and an href equal to $target is used.
     *
     * @param \DOMDocument $doc    Parsed HTML document
     * @param string       $source URL this HTML has been loaded from
     *                             (not used by the extraction itself)
     * @param string       $target URL the reply should be to
     *
     * @return mixed NULL if nothing found, otherwise an array with the keys
     *               content, title, author_name, author_image, author_url
     */
    public function extract(\DOMDocument $doc, $source, $target)
    {
        $xpath = $this->getXpath($doc);

        // NOTE(review): the line joining the two predicate halves was not
        // visible in the reviewed listing. The reply link is looked up among
        // the descendants of the entry (".//a") so that an unrelated link
        // elsewhere on the page cannot make every entry match - confirm.
        $hentries = $xpath->query(
            '//*[(' . $this->xpc('h-entry') . ' or ' . $this->xpc('hentry') . ') and '
            . './/a['
            . $this->xpc('u-in-reply-to') . ' and @href=' . $this->xpq($target)
            . ']'
            . ']'
        );

        // query() yields false for a malformed expression; treat it like
        // "nothing found" instead of dereferencing a boolean.
        if ($hentries === false || $hentries->length == 0) {
            return null;
        }

        $data = [
            'content' => null,
            'title'   => null,
        ];
        $hentry = $hentries->item(0);

        $this->extractAuthorData($hentry, $xpath, $data, $doc);

        $content = $this->getFirst(
            './/*[' . $this->xpc('e-content') . ']', false, $hentry, $xpath
        );
        if ($content) {
            $data['content'] = $this->innerHtml($content);
        }

        // NOTE(review): passing false makes "title" the DOM node itself, not
        // its text. That looks unintended, but it is kept as-is because the
        // consumers of this array are not visible here - confirm.
        $data['title'] = $this->getFirst(
            './/*[' . $this->xpc('p-name') . ']', false, $hentry, $xpath
        );

        return $data;
    }

    /**
     * Fill the author_* keys of $data from the p-author element of an entry.
     *
     * All three keys are always reset to null first. They are only filled
     * when the entry contains exactly one p-author element.
     *
     * @param \DOMNode  $hentry Entry element to search in
     * @param \DOMXPath $xpath  XPath object bound to the entry's document
     * @param array     $data   Result array, modified in place
     * @param mixed     $d      Unused; kept for signature compatibility
     *
     * @return mixed FALSE when there is not exactly one author element,
     *               NULL otherwise
     */
    protected function extractAuthorData($hentry, $xpath, &$data, $d)
    {
        $data['author_name']  = null;
        $data['author_image'] = null;
        $data['author_url']   = null;

        // The expression is relative (".//"), so the entry must be passed as
        // context node. Without it the whole document is searched and a
        // second p-author anywhere on the page discards the author data.
        $authors = $xpath->evaluate(
            './/*[' . $this->xpc('p-author') . ']',
            $hentry
        );
        if (!$authors instanceof \DOMNodeList || $authors->length != 1) {
            return false;
        }

        $author = $authors->item(0);

        $data['author_name'] = $this->getFirst(
            './/*[' . $this->xpc('p-name') . ' or ' . $this->xpc('fn') . ']',
            null, $author, $xpath
        );
        $data['author_image'] = $this->getFirst(
            './/*[' . $this->xpc('u-photo') . ']',
            'src', $author, $xpath
        );
        $data['author_url'] = $this->getFirst(
            './/*[' . $this->xpc('u-url') . ']',
            'href', $author, $xpath
        );
    }

    /**
     * Evaluate an XPath expression and return data of the first match.
     *
     * @param string    $xpathExpr XPath expression, evaluated relative to $elem
     * @param mixed     $attrName  FALSE: return the matched node itself,
     *                             NULL: return the node's text value,
     *                             string: return the value of that attribute
     * @param \DOMNode  $elem      Context node
     * @param \DOMXPath $xpath     XPath object to evaluate with
     *
     * @return mixed NULL when nothing matched or the requested attribute does
     *               not exist, otherwise the node or string described above
     */
    protected function getFirst($xpathExpr, $attrName, $elem, $xpath)
    {
        $items = $xpath->evaluate($xpathExpr, $elem);
        if (!$items instanceof \DOMNodeList || $items->length == 0) {
            return null;
        }

        $node = $items->item(0);
        if ($attrName === false) {
            return $node;
        }
        if ($attrName == null) {
            return $node->nodeValue;
        }

        // The matched element may lack the attribute (e.g. u-photo without
        // src); report that as null instead of failing on a null dereference.
        if (!$node instanceof \DOMElement || !$node->hasAttribute($attrName)) {
            return null;
        }
        return $node->getAttribute($attrName);
    }

    /**
     * Serialize the children of an element to an HTML string.
     *
     * Each child is imported into a temporary document and serialized on its
     * own; the newline saveHTML() appends is removed per child and the
     * combined result is trimmed.
     *
     * @param \DOMNode $element Element whose children are serialized
     *
     * @return string HTML of all child nodes
     */
    protected function innerHtml($element)
    {
        $innerHTML = '';
        foreach ($element->childNodes as $child) {
            $tmp_dom = new \DOMDocument();
            $tmp_dom->appendChild($tmp_dom->importNode($child, true));
            $innerHTML .= rtrim($tmp_dom->saveHTML(), "\n");
        }
        return trim($innerHTML);
    }

    /**
     * Create an XPath object for a document, with the XHTML namespace
     * registered under the prefix "h".
     *
     * @param \DOMDocument $node Document to query
     *
     * @return \DOMXPath
     */
    protected function getXpath($node)
    {
        $xpath = new \DOMXPath($node);
        $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
        return $xpath;
    }

    /**
     * Build an XPath condition that is true when the element's class
     * attribute contains the given class as a whole whitespace-separated
     * token.
     *
     * @param string $class Class name; inserted verbatim, so it must not
     *                      contain a double quote
     *
     * @return string XPath boolean expression
     */
    protected function xpc($class)
    {
        return 'contains('
            . 'concat(" ", normalize-space(@class), " "),'
            . '" ' . $class . ' "'
            . ')';
    }

    /**
     * Quote a string for use as a literal in an XPath 1.0 expression.
     *
     * XPath 1.0 literals have no escape mechanism, and HTML entity encoding
     * would alter the compared value (a "&" in a URL would never match the
     * decoded attribute in the DOM). The string is therefore wrapped in
     * whichever quote character it does not contain; if it contains both,
     * it is assembled with concat().
     *
     * @param string $str Raw string value
     *
     * @return string XPath expression evaluating to exactly $str
     */
    protected function xpq($str)
    {
        $str = (string) $str;

        if (strpos($str, '"') === false) {
            return '"' . $str . '"';
        }
        if (strpos($str, "'") === false) {
            return "'" . $str . "'";
        }

        // Both quote types occur: emit the pieces between double quotes as
        // double-quoted literals, joined by a single-quoted '"' literal.
        // At least one double quote exists, so concat() gets >= 3 arguments.
        return 'concat("' . implode('", \'"\', "', explode('"', $str)) . '")';
    }
}