Do not break when searching for slashes
[phinde.git] / src / phinde / Elasticsearch.php
1 <?php
2 namespace phinde;
3
/**
 * Small HTTP client for the Elasticsearch index that stores crawled documents.
 *
 * Every method builds an Elasticsearch_Request against $baseUrl and sends it.
 * Documents are addressed by the SHA-256 hash of their URL.
 */
class Elasticsearch
{
    /**
     * URL prefix of the index. Request paths such as "document/..." are
     * appended directly, without adding a separator.
     */
    protected $baseUrl;

    /**
     * @param string $baseUrl URL prefix that request paths get appended to
     */
    public function __construct($baseUrl)
    {
        $this->baseUrl = $baseUrl;
    }

    /**
     * Calculate the document ID for a URL.
     *
     * @param string $url URL of the document
     *
     * @return string Hex-encoded SHA-256 hash of the URL
     */
    public static function getDocId($url)
    {
        return hash('sha256', $url);
    }

    /**
     * Build the request URL of the document that belongs to the given URL.
     *
     * @param string $url URL of the document
     *
     * @return string Base URL + "document/" + document ID
     */
    protected function getDocumentUrl($url)
    {
        return $this->baseUrl . 'document/' . static::getDocId($url);
    }

    /**
     * Check if a document for the URL exists in the index.
     *
     * Sends a HEAD request; a 404 response is allowed.
     *
     * @param string $url URL of the document
     *
     * @return boolean True if the response status is 200
     */
    public function isKnown($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_HEAD
        );
        $r->allow404 = true;
        $res = $r->send();
        return $res->getStatus() == 200;
    }

    /**
     * Fetch the stored document for the URL.
     *
     * @param string $url URL of the document
     *
     * @return object|null The "_source" part of the response, or null when
     *                     the response status is not 200 or the body does
     *                     not contain a "_source" value
     */
    public function get($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_GET
        );
        $r->allow404 = true;
        $res = $r->send();
        if ($res->getStatus() != 200) {
            return null;
        }
        $d = json_decode($res->getBody());
        //json_decode() gives null for unparseable bodies; do not
        // dereference that
        if (!is_object($d) || !isset($d->_source)) {
            return null;
        }
        return $d->_source;
    }

    /**
     * Store a placeholder document for the URL that is neither processed
     * nor findable yet.
     *
     * @param string $url URL of the document
     *
     * @return void
     */
    public function markQueued($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_PUT
        );
        $doc = (object) array(
            'url' => $url,
            'status' => (object) array(
                'processed' => null,
                'findable'  => false,
            )
        );
        $r->setBody(json_encode($doc));
        $r->send();
    }

    /**
     * Read document count and store size from the "_stats/docs,store"
     * endpoint.
     *
     * @return array Keys "documents" and "size" (bytes). A value is null
     *               when it is missing in the response.
     */
    public function getIndexStatus()
    {
        $r = new Elasticsearch_Request(
            $this->baseUrl . '_stats/docs,store',
            \HTTP_Request2::METHOD_GET
        );
        $res = $r->send();
        $data = json_decode($res->getBody());
        //isset() copes with $data being null or lacking parts of the path
        $total = isset($data->_all->total) ? $data->_all->total : null;
        return array(
            'documents' => isset($total->docs->count)
                ? $total->docs->count : null,
            'size'      => isset($total->store->size_in_bytes)
                ? $total->store->size_in_bytes : null,
        );
    }

    /**
     * Run a search against the "document/_search" endpoint.
     *
     * Special query keywords:
     * - "nick:NAME" is rewritten to "author.name:NAME"
     * - "after:DATE", "before:DATE" and "date:DATE" are removed from the
     *   query and turned into range conditions on "status.modate"
     *
     * Only documents with "status.findable" = true are matched.
     *
     * @param string  $query   Query string as entered by the user
     * @param array   $filters Field name => value pairs; each one is added
     *                         as a "term" condition
     * @param string  $site    Prefix the "schemalessUrl" field has to start
     *                         with; empty string for no restriction
     * @param integer $page    Page index; multiplied with $perPage to get
     *                         the offset of the first hit (0 = first page)
     * @param integer $perPage Number of hits to return
     * @param string  $sort    "date" to order by "status.modate" descending.
     *                         When empty, the "defaultSort" setting in
     *                         $GLOBALS['phinde'] is used.
     *
     * @return mixed JSON-decoded response body
     */
    public function search($query, $filters, $site, $page, $perPage, $sort)
    {
        if (preg_match_all('#nick:([^ ]*)#', $query, $matches)) {
            foreach ($matches[1] as $authorName) {
                $query = str_replace(
                    'nick:' . $authorName,
                    'author.name:' . $authorName,
                    $query
                );
            }
        }

        $qMust = array();//query parts for the MUST section

        //modification date filters: keyword => range operators
        $dateKeywords = array(
            'after'  => array('gt'),
            'before' => array('lt'),
            'date'   => array('gte', 'lte'),
        );
        foreach ($dateKeywords as $keyword => $operators) {
            list($query, $dateFilter) = $this->extractDateFilter(
                $query, $keyword, $operators
            );
            if ($dateFilter !== null) {
                $qMust[] = $dateFilter;
            }
        }

        if (strpos($query, '/') !== false && strpos($query, '"') === false) {
            //add quotes when there is a slash and no quotes
            // https://stackoverflow.com/questions/31963643/escaping-forward-slashes-in-elasticsearch
            $query = '"' . $query . '"';
        }
        $qMust[] = array(
            'query_string' => array(
                'default_field' => '_all',
                'default_operator' => 'AND',
                'query' => $query
            )
        );
        $qMust[] = array(
            'term' => array(
                'status.findable' => true
            )
        );
        foreach ($filters as $type => $value) {
            $qMust[] = array(
                'term' => array(
                    $type => $value
                )
            );
        }
        if ($site != '') {
            $qMust[] = array(
                'prefix' => array(
                    'schemalessUrl' => array(
                        'value' => $site
                    )
                )
            );
        }

        //settings may be missing; treat unset keys like empty values
        $config = isset($GLOBALS['phinde']) ? $GLOBALS['phinde'] : array();

        if ($sort == ''
            && isset($config['defaultSort'])
            && $config['defaultSort'] == 'date'
        ) {
            $sort = 'date';
        }
        if ($sort == 'date') {
            $sortCfg = array('status.modate' => array('order' => 'desc'));
        } else {
            $sortCfg = array();
        }

        $contentMatchSize = 100;
        if (!empty($config['showFullContent'])) {
            $contentMatchSize = 999999;
        }

        $doc = array(
            '_source' => array(
                'url',
                'title',
                'author',
                'status.modate',
            ),
            'query' => array(
                'bool' => array(
                    'must' => $qMust
                )
            ),
            'highlight' => array(
                'pre_tags' => array('<em class="hl">'),
                'order' => 'score',
                'encoder' => 'html',
                'fields' => array(
                    'title' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 0,
                    ),
                    'url' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 0,
                    ),
                    'text' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 1,
                        'fragment_size' => $contentMatchSize,
                        'no_match_size' => $contentMatchSize,
                    ),
                )
            ),
            'aggregations' => array(
                'tags' => array(
                    'terms' => array(
                        'field' => 'tags'
                    )
                ),
                'language' => array(
                    'terms' => array(
                        'field' => 'language'
                    )
                ),
                'domain' => array(
                    'terms' => array(
                        'field' => 'domain'
                    )
                ),
                'type' => array(
                    'terms' => array(
                        'field' => 'type'
                    )
                )
            ),
            'from' => $page * $perPage,
            'size' => $perPage,
            'sort' => $sortCfg,
        );

        $r = new Elasticsearch_Request(
            $this->baseUrl . 'document/_search',
            \HTTP_Request2::METHOD_GET
        );
        $r->setBody(json_encode($doc));
        $res = $r->send();
        return json_decode($res->getBody());
    }

    /**
     * Pull a "KEYWORD:DATE" token out of the query and convert it into a
     * range condition on "status.modate".
     *
     * Only the first token found determines the date; all occurrences of
     * that exact token are removed from the query, which is trimmed
     * afterwards.
     *
     * @param string $query     Query string to inspect
     * @param string $keyword   Keyword in front of the colon, e.g. "after"
     * @param array  $operators Range operators ("gt", "lte", ...) that all
     *                          get the date, rounded to the day, as value
     *
     * @return array Two values: the remaining query string, and the range
     *               condition array or null when the keyword was not found
     */
    protected function extractDateFilter($query, $keyword, array $operators)
    {
        if (!preg_match('#' . $keyword . ':([^ ]+)#', $query, $matches)) {
            return array($query, null);
        }

        $range = array();
        foreach ($operators as $operator) {
            $range[$operator] = $matches[1] . '||/d';
        }
        return array(
            trim(str_replace($matches[0], '', $query)),
            array(
                'range' => array(
                    'status.modate' => $range
                )
            ),
        );
    }
}
246 ?>