c24f6cd60b690f49baf32539d35c69d2be08071d
[phinde.git] / src / phinde / Elasticsearch.php
1 <?php
2 namespace phinde;
3
/**
 * Small client for the Elasticsearch index that holds the crawled documents.
 *
 * Every request is sent through Elasticsearch_Request (resolved in the
 * current namespace) against the base URL passed to the constructor.
 */
class Elasticsearch
{
    /**
     * URL that request paths such as "document/<id>" are appended to.
     * Paths are concatenated without adding a separator, so the value
     * has to end with a slash.
     *
     * @var string
     */
    protected $baseUrl;

    /**
     * @param string $baseUrl URL the request paths get appended to;
     *                        must end with a slash
     */
    public function __construct($baseUrl)
    {
        $this->baseUrl = $baseUrl;
    }

    /**
     * Calculate the document ID that is used for the given URL.
     *
     * @param string $url URL of the document
     *
     * @return string Hex-encoded SHA-256 hash of the URL
     */
    public static function getDocId($url)
    {
        return hash('sha256', $url);
    }

    /**
     * Build the request URL of the document that belongs to the given URL.
     *
     * @param string $url URL of the document
     *
     * @return string Base URL + "document/" + document ID
     */
    protected function getDocUrl($url)
    {
        return $this->baseUrl . 'document/' . static::getDocId($url);
    }

    /**
     * Check if a document for the given URL exists in the index.
     *
     * @param string $url URL of the document
     *
     * @return boolean True if the HEAD request is answered with status 200
     */
    public function isKnown($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocUrl($url),
            \HTTP_Request2::METHOD_HEAD
        );
        // NOTE(review): presumably stops send() from treating a 404 as an
        // error - confirm in Elasticsearch_Request
        $r->allow404 = true;
        $res = $r->send();
        return $res->getStatus() == 200;
    }

    /**
     * Fetch the stored document for the given URL.
     *
     * @param string $url URL of the document
     *
     * @return object|null The "_source" part of the response, or null when
     *                     the status is not 200 or the response body does
     *                     not contain a "_source"
     */
    public function get($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocUrl($url),
            \HTTP_Request2::METHOD_GET
        );
        $r->allow404 = true;
        $res = $r->send();
        if ($res->getStatus() != 200) {
            return null;
        }

        $d = json_decode($res->getBody());
        if (!is_object($d) || !isset($d->_source)) {
            //undecodable or unexpected body: nothing usable to return
            return null;
        }
        return $d->_source;
    }

    /**
     * Store a placeholder document for an URL that has not been
     * processed yet. The placeholder is not findable by search().
     *
     * @param string $url URL of the document
     *
     * @return void
     */
    public function markQueued($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocUrl($url),
            \HTTP_Request2::METHOD_PUT
        );
        $doc = (object) array(
            'url' => $url,
            'status' => (object) array(
                'processed' => null,
                'findable'  => false,
            )
        );
        $r->setBody(json_encode($doc));
        $r->send();
    }

    /**
     * Fetch document count and storage size from the "_stats" endpoint.
     *
     * @return array Array with keys "documents" (_all.total.docs.count)
     *               and "size" (_all.total.store.size_in_bytes)
     */
    public function getIndexStatus()
    {
        $r = new Elasticsearch_Request(
            $this->baseUrl . '_stats/docs,store',
            \HTTP_Request2::METHOD_GET
        );
        $res = $r->send();
        $data = json_decode($res->getBody());
        return array(
            'documents' => $data->_all->total->docs->count,
            'size'      => $data->_all->total->store->size_in_bytes,
        );
    }

    /**
     * Run a search request against the document index.
     *
     * The query may contain these special keywords:
     * - "nick:NAME"   is rewritten to "author.name:NAME"
     * - "after:DATE"  only documents modified after DATE
     * - "before:DATE" only documents modified before DATE
     * - "date:DATE"   only documents modified on DATE
     * The date keywords are removed from the query and converted into
     * range filters. Only documents with "status.findable" = true match.
     *
     * @param string  $query   User query in query_string syntax
     * @param array   $filters Field name => value pairs; each pair is
     *                         added as "term" condition
     * @param string  $site    Prefix the "schemalessUrl" field has to
     *                         begin with; empty for no restriction
     * @param integer $page    Page number; multiplied with $perPage to get
     *                         the result offset, so 0 is the first page
     * @param integer $perPage Number of results per page
     * @param string  $sort    "date" to order by modification date
     *                         (newest first). When empty, the configured
     *                         $GLOBALS['phinde']['defaultSort'] is used.
     *
     * @return mixed JSON-decoded response body
     */
    public function search($query, $filters, $site, $page, $perPage, $sort)
    {
        if (preg_match_all('#nick:([^ ]*)#', $query, $matches)) {
            foreach ($matches[1] as $authorName) {
                $query = str_replace(
                    'nick:' . $authorName,
                    'author.name:' . $authorName,
                    $query
                );
            }
        }

        //query parts for the MUST section, beginning with the date filters
        list($query, $qMust) = $this->extractDateFilters($query);

        $qMust[] = array(
            'query_string' => array(
                'default_field' => '_all',
                'default_operator' => 'AND',
                'query' => $query
            )
        );
        $qMust[] = array(
            'term' => array(
                'status.findable' => true
            )
        );
        foreach ($filters as $type => $value) {
            $qMust[] = array(
                'term' => array(
                    $type => $value
                )
            );
        }
        if ($site != '') {
            $qMust[] = array(
                'prefix' => array(
                    'schemalessUrl' => array(
                        'value' => $site
                    )
                )
            );
        }

        //configuration values may be missing; do not raise notices then
        $defaultSort = isset($GLOBALS['phinde']['defaultSort'])
            ? $GLOBALS['phinde']['defaultSort']
            : null;
        if ($sort == '' && $defaultSort == 'date') {
            $sort = 'date';
        }
        if ($sort == 'date') {
            $sortCfg = array('status.modate' => array('order' => 'desc'));
        } else {
            $sortCfg = array();
        }

        $contentMatchSize = 100;
        if (!empty($GLOBALS['phinde']['showFullContent'])) {
            //large enough that the text fragment is not cut off
            $contentMatchSize = 999999;
        }

        $r = new Elasticsearch_Request(
            $this->baseUrl . 'document/_search',
            \HTTP_Request2::METHOD_GET
        );
        $doc = array(
            '_source' => array(
                'url',
                'title',
                'author',
                'status.modate',
            ),
            'query' => array(
                'bool' => array(
                    'must' => $qMust
                )
            ),
            'highlight' => array(
                'pre_tags' => array('<em class="hl">'),
                'order' => 'score',
                'encoder' => 'html',
                'fields' => array(
                    'title' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 0,
                    ),
                    'url' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 0,
                    ),
                    'text' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 1,
                        'fragment_size' => $contentMatchSize,
                        'no_match_size' => $contentMatchSize,
                    ),
                )
            ),
            'aggregations' => array(
                'tags' => array(
                    'terms' => array(
                        'field' => 'tags'
                    )
                ),
                'language' => array(
                    'terms' => array(
                        'field' => 'language'
                    )
                ),
                'domain' => array(
                    'terms' => array(
                        'field' => 'domain'
                    )
                ),
                'type' => array(
                    'terms' => array(
                        'field' => 'type'
                    )
                )
            ),
            'from' => $page * $perPage,
            'size' => $perPage,
            'sort' => $sortCfg,
        );

        $r->setBody(json_encode($doc));
        $res = $r->send();
        return json_decode($res->getBody());
    }

    /**
     * Cut the "after:", "before:" and "date:" keywords out of the query
     * and convert them into range conditions on the modification date.
     *
     * The conditions use "status.modate", the same field search() selects
     * and sorts by.
     *
     * @param string $query User query
     *
     * @return array Two values: the query without the date keywords, and
     *               the list of range conditions (may be empty)
     */
    protected function extractDateFilters($query)
    {
        //keyword => range operators that get the keyword's date
        $keywords = array(
            'after'  => array('gt'),
            'before' => array('lt'),
            'date'   => array('gte', 'lte'),
        );

        $conditions = array();
        foreach ($keywords as $keyword => $operators) {
            if (!preg_match('#' . $keyword . ':([^ ]+)#', $query, $matches)) {
                continue;
            }
            $query = trim(str_replace($matches[0], '', $query));

            $range = array();
            foreach ($operators as $operator) {
                //"||/d" is Elasticsearch date math: round to the day
                $range[$operator] = $matches[1] . '||/d';
            }
            $conditions[] = array(
                'range' => array(
                    'status.modate' => $range
                )
            );
        }

        return array($query, $conditions);
    }
}
241 ?>