massively improve crawl speed by ditching "exists" queries
[phinde.git] / src / phinde / Elasticsearch.php
index 316f3da8684a79fcf24875224a12d22b9381b163..1732bbb824cbaf8b072379a844f808af06926b72 100644 (file)
@@ -10,33 +10,15 @@ class Elasticsearch
         $this->baseUrl = $baseUrl;
     }
 
-    /**
-     * @link https://www.elastic.co/guide/en/elasticsearch/guide/current/_finding_exact_values.html
-     */
     public function isKnown($url)
     {
         $r = new Elasticsearch_Request(
-            $this->baseUrl . 'document/_search/exists',
-            \HTTP_Request2::METHOD_GET
+            $this->baseUrl . 'document/' . rawurlencode($url),
+            \HTTP_Request2::METHOD_HEAD
         );
         $r->allow404 = true;
-        $r->setBody(
-            json_encode(
-                array(
-                    'query' => array(
-                        'filtered' => array(
-                            'filter' => array(
-                                'term' => array(
-                                    'url' => $url
-                                )
-                            )
-                        )
-                    )
-                )
-            )
-        );
-        $res = json_decode($r->send()->getBody());
-        return $res->exists;
+        $res = $r->send();
+        return $res->getStatus() == 200;
     }
 
     public function get($url)
@@ -68,8 +50,79 @@ class Elasticsearch
         $r->send();
     }
 
-    public function search($query, $filters, $site, $page, $perPage)
+    public function search($query, $filters, $site, $page, $perPage, $sort)
     {
+        if (preg_match('#nick:([^ ]*)#', $query, $matches)) {
+            $authorName = $matches[1];
+            $query = str_replace(
+                'nick:' . $authorName,
+                'author.name:' . $authorName,
+                $query
+            );
+        }
+
+        $qMust = array();//query parts for the MUST section
+
+        //modification date filters
+        if (preg_match('#after:([^ ]+)#', $query, $matches)) {
+            $dateAfter = $matches[1];
+            $query      = trim(str_replace($matches[0], '', $query));
+            $qMust[]    = array(
+                'range' => array(
+                    'modate' => array(
+                        'gt' => $dateAfter . '||/d',
+                    )
+                )
+            );
+        }
+        if (preg_match('#before:([^ ]+)#', $query, $matches)) {
+            $dateBefore = $matches[1];
+            $query      = trim(str_replace($matches[0], '', $query));
+            $qMust[]    = array(
+                'range' => array(
+                    'modate' => array(
+                        'lt' => $dateBefore . '||/d',
+                    )
+                )
+            );
+        }
+        if (preg_match('#date:([^ ]+)#', $query, $matches)) {
+            $dateExact = $matches[1];
+            $query      = trim(str_replace($matches[0], '', $query));
+            $qMust[]    = array(
+                'range' => array(
+                    'modate' => array(
+                        'gte' => $dateExact . '||/d',
+                        'lte' => $dateExact . '||/d',
+                    )
+                )
+            );
+        }
+
+        $qMust[] = array(
+            'query_string' => array(
+                'default_field' => '_all',
+                'default_operator' => 'AND',
+                'query' => $query
+            )
+        );
+        $qMust[] = array(
+            'term' => array(
+                'status' => 'indexed'
+            )
+        );
+
+        if ($sort == 'date') {
+            $sortCfg = array('modate' => array('order' => 'desc'));
+        } else {
+            $sortCfg = array();
+        }
+
+        $contentMatchSize = 100;
+        if ($GLOBALS['phinde']['showFullContent']) {
+            $contentMatchSize = 999999;
+        }
+
         $r = new Elasticsearch_Request(
             $this->baseUrl . 'document/_search',
             \HTTP_Request2::METHOD_GET
@@ -83,20 +136,7 @@ class Elasticsearch
             ),
             'query' => array(
                 'bool' => array(
-                    'must' => array(
-                        array(
-                            'query_string' => array(
-                                'default_field' => '_all',
-                                'default_operator' => 'AND',
-                                'query' => $query
-                            )
-                        ),
-                        array(
-                            'term' => array(
-                                'status' => 'indexed'
-                            )
-                        ),
-                    )
+                    'must' => $qMust
                 )
             ),
             'highlight' => array(
@@ -115,6 +155,8 @@ class Elasticsearch
                     'text' => array(
                         'require_field_match' => false,
                         'number_of_fragments' => 1,
+                        'fragment_size' => $contentMatchSize,
+                        'no_match_size' => $contentMatchSize,
                     ),
                 )
             ),
@@ -142,9 +184,7 @@ class Elasticsearch
             ),
             'from' => $page * $perPage,
             'size' => $perPage,
-            'sort' => array(
-                //array('modate' => array('order' => 'desc'))
-            )
+            'sort' => $sortCfg,
         );
         foreach ($filters as $type => $value) {
             $doc['query']['bool']['must'][] = array(