// MIME types that run() is willing to index.
// The MIME type string is the array key; the visible entry maps it to true.
6 static $supportedTypes = array(
7 'application/xhtml+xml' => true,
/**
 * Fill the search document of a retrieved HTML page.
 *
 * Reads the HTTP response, the document and the URL from $retrieved,
 * parses the response body as HTML and copies URL data, author, title,
 * headlines, text, keywords, dates and language onto the document.
 *
 * @param Retrieved $retrieved Fetch result; its httpRes, esDoc and url
 *                             properties are read, and esDoc is assigned
 *                             when it was null.
 */
11 public function run(Retrieved $retrieved)
13 $res = $retrieved->httpRes;
14 $esDoc = $retrieved->esDoc;
15 $url = $retrieved->url;
// keep only the media type, dropping parameters such as "; charset=..."
17 $mimetype = explode(';', $res->getHeader('content-type'))[0];
// $supportedTypes is keyed by MIME type, so the key has to be tested.
// in_array() searches the values (true) with loose comparison, which
// matches nearly every non-empty string and let all types through.
18 if (!isset(static::$supportedTypes[$mimetype])) {
19 Log::info("MIME type not supported for indexing: $mimetype");
// no document yet: start from the base document for this URL and
// store it back on the retrieval result
23 if ($esDoc === null) {
24 $esDoc = Helper::baseDoc($url);
25 $retrieved->esDoc = $esDoc;
28 //FIXME: update index only if changed since last index time
29 //FIXME: extract base url from html
30 //FIXME: check if effective url needs updating
// base URL starts out as the page URL
32 $base = new \Net_URL2($url);
34 //FIXME: MIME type switch
35 $doc = new \DOMDocument();
36 //@ to hide parse warning messages in invalid html
37 @$doc->loadHTML($res->getBody());
38 $dx = new \DOMXPath($doc);
// first <base href="..."> in the head; its href is resolved against
// the current base URL
40 $xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
42 $base = $base->resolve(
43 $xbase->attributes->getNamedItem('href')->textContent
// robots meta tag: a "noindex" entry in the comma-separated content
// marks the document as not findable
47 $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
50 $robots = $meta->attributes->getNamedItem('content')->textContent;
51 foreach (explode(',', $robots) as $value) {
52 if (trim($value) == 'noindex') {
53 $esDoc->status->findable = false;
// strip script, style and nav elements before any text is read
// from the DOM
60 $this->removeTags($doc, 'script');
61 $this->removeTags($doc, 'style');
62 $this->removeTags($doc, 'nav');
64 //default content: <body>
65 $xpContext = $doc->getElementsByTagName('body')->item(0);
66 //FIXME: follow meta refresh, no body
67 // example: https://www.gnu.org/software/coreutils/
69 //use microformats content if it exists
70 $xpElems = $dx->query(
71 "//*[contains(concat(' ', normalize-space(@class), ' '), ' e-content ')]"
73 if ($xpElems->length) {
74 $xpContext = $xpElems->item(0);
75 } else if ($doc->getElementById('content')) {
76 //if there is an element with ID "content", we'll use this
77 $xpContext = $doc->getElementById('content');
// URL-derived and fixed document fields
81 $esDoc->schemalessUrl = Helper::noSchema($url);
82 $esDoc->type = 'html';
84 $esDoc->mimetype = $mimetype;
85 $esDoc->domain = parse_url($url, PHP_URL_HOST);
87 //$esDoc->source = 'FIXME';
88 //$esDoc->sourcetitle = 'FIXME';
// author: name from <meta name="author">, URL from <link rel="author">
90 $esDoc->author = new \stdClass();
92 $arXpElems = $dx->query('/html/head/meta[@name="author" and @content]');
93 if ($arXpElems->length) {
94 $esDoc->author->name = trim(
95 $arXpElems->item(0)->attributes->getNamedItem('content')->textContent
98 $arXpElems = $dx->query('/html/head/link[@rel="author" and @href]');
99 if ($arXpElems->length) {
100 $esDoc->author->url = trim(
102 $arXpElems->item(0)->attributes->getNamedItem('href')->textContent
// page title from the first <title> in the head
108 $arXpElems = $dx->query('/html/head/title');
109 if ($arXpElems->length) {
110 $esDoc->title = trim(
111 $arXpElems->item(0)->textContent
// one array per headline level (h1..h6), taken from the chosen
// content element only
115 foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $headlinetype) {
116 $esDoc->$headlinetype = array();
117 foreach ($xpContext->getElementsByTagName($headlinetype) as $xheadline) {
119 $esDoc->$headlinetype,
120 trim($xheadline->textContent)
125 //FIXME: split paragraphs
126 //FIXME: insert space after br
// text of the chosen content element, stored as a single array entry
127 $esDoc->text = array();
128 $esDoc->text[] = trim(
130 array("\r\n", "\n", "\r", ' '),
132 $xpContext->textContent
// keywords meta tags become tags; using the trimmed keyword as array
// key drops duplicates
138 foreach ($dx->query('/html/head/meta[@name="keywords" and @content]') as $xkeywords) {
139 $keywords = $xkeywords->attributes->getNamedItem('content')->textContent;
140 foreach (explode(',', $keywords) as $keyword) {
141 $tags[trim($keyword)] = true;
144 $esDoc->tags = array_keys($tags);
// creation date from the DC.date.created meta tag
147 $arXpdates = $dx->query('/html/head/meta[@name="DC.date.created" and @content]');
148 if ($arXpdates->length) {
149 $esDoc->status->crdate = gmdate(
152 $arXpdates->item(0)->attributes->getNamedItem('content')->textContent
156 //FIXME: keep creation date from database, or use modified date if we
157 // do not have it there
// modification date sources seen here: DC.date.modified meta tag,
// the Last-Modified HTTP header, and the current time
159 $arXpdates = $dx->query('/html/head/meta[@name="DC.date.modified" and @content]');
160 if ($arXpdates->length) {
161 $esDoc->status->modate = gmdate(
164 $arXpdates->item(0)->attributes->getNamedItem('content')->textContent
168 $lm = $res->getHeader('last-modified');
170 $esDoc->status->modate = gmdate('c', strtotime($lm));
172 //use current time since we don't have any other data
173 $esDoc->status->modate = gmdate('c');
176 $esDoc->status->findable = true;
// language: first two characters of the root element's lang
// attribute, lowercased
179 //there may be "en-US" and "de-DE"
180 $xlang = $doc->documentElement->attributes->getNamedItem('lang');
182 $esDoc->language = strtolower(substr($xlang->textContent, 0, 2));
184 //FIXME: fallback, autodetection
185 //FIXME: check noindex
187 //var_dump($esDoc);die();
/**
 * Remove elements of one tag name from a DOM document.
 *
 * @param object $doc Document queried through getElementsbyTagName();
 *                    run() passes a \DOMDocument
 * @param string $tag Tag name to look up, e.g. "script"
 */
192 function removeTags($doc, $tag) {
// walk every element in $doc that has the requested tag name
194 foreach ($doc->getElementsbyTagName($tag) as $elem) {
// detach each element held in $elems from its parent node
// NOTE(review): $elems is presumably filled by the loop above - confirm
197 foreach ($elems as $elem) {
198 $elem->parentNode->removeChild($elem);