+// XPath helper used for the queries against the parsed document.
+$dx = new \DOMXPath($doc);
+
+// If the head contains a <base> element with an href attribute, the first
+// one is used: $base is replaced by the result of resolving that href
+// against the current $base.
+$xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
+if ($xbase !== null) {
+    $href = $xbase->attributes->getNamedItem('href')->textContent;
+    $base = $base->resolve($href);
+}
+
+// Honor the robots meta tag: stop with exit code 0 when the page asks not
+// to be indexed.
+//
+// - The name attribute is matched case-insensitively (XPath 1.0 has no
+//   lower-case(), hence the translate() call).
+// - Every matching <meta> element in <head> is inspected, not only the first.
+// - Directive values are compared case-insensitively after trimming.
+// - "none" is treated like "noindex" since it is shorthand for
+//   "noindex, nofollow".
+$metas = $dx->evaluate(
+    '/html/head/meta[translate(@name, "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
+    . ' "abcdefghijklmnopqrstuvwxyz") = "robots" and @content]'
+);
+foreach ($metas as $meta) {
+    $robots = $meta->attributes->getNamedItem('content')->textContent;
+    // The content attribute holds a comma-separated list of directives
+    foreach (explode(',', $robots) as $value) {
+        $value = strtolower(trim($value));
+        if ($value === 'noindex' || $value === 'none') {
+            echo "URL does not want to be indexed: $url\n";
+            exit(0);
+        }
+    }
+}
+
+//remove script, style and nav elements from the document
+// (removeTags() is defined elsewhere; tags are processed in this order)
+foreach (['script', 'style', 'nav'] as $tagName) {
+    removeTags($doc, $tagName);
+}
+
+// Pick the node that is used as content context, in order of preference:
+//  1. the first element carrying the microformats class "e-content"
+//  2. the element with ID "content"
+//  3. the document's <body>
+//FIXME: follow meta refresh, no body
+// example: https://www.gnu.org/software/coreutils/
+$xpElems = $dx->query(
+    "//*[contains(concat(' ', normalize-space(@class), ' '), ' e-content ')]"
+);
+$contentById = $doc->getElementById('content');
+
+if ($xpElems->length > 0) {
+    //use microformats content if it exists
+    $xpContext = $xpElems->item(0);
+} elseif ($contentById !== null) {
+    //otherwise fall back to the element with ID "content"
+    $xpContext = $contentById;
+} else {
+    //default content: <body>
+    $xpContext = $doc->getElementsByTagName('body')->item(0);
+}