6c90480475d152a361ea0eb645dfb842265310a7
[phinde.git] / src / phinde / Elasticsearch.php
1 <?php
2 namespace phinde;
3
/**
 * Small HTTP client for the Elasticsearch index holding the crawled documents.
 *
 * Every request is sent through Elasticsearch_Request. Documents are stored
 * below "document/" and are addressed by the SHA-256 hash of their URL.
 */
class Elasticsearch
{
    /**
     * Base URL of the index. Request paths such as "document/..." are
     * appended without any separator, so the value has to end with a slash.
     *
     * @var string
     */
    protected $baseUrl;

    /**
     * @param string $baseUrl Base URL of the index, including trailing slash
     */
    public function __construct($baseUrl)
    {
        $this->baseUrl = $baseUrl;
    }

    /**
     * Calculate the document ID that is used for the given URL.
     *
     * @param string $url Document URL
     *
     * @return string Hex-encoded SHA-256 hash of the URL
     */
    public static function getDocId($url)
    {
        return hash('sha256', $url);
    }

    /**
     * Build the request URL of the document that belongs to the given URL.
     *
     * @param string $url Document URL
     *
     * @return string Base URL + "document/" + document ID
     */
    protected function getDocumentUrl($url)
    {
        return $this->baseUrl . 'document/' . static::getDocId($url);
    }

    /**
     * Build a "range" query clause on the document modification date.
     *
     * Uses the same "status.modate" field as the sort configuration and the
     * "_source" list in search(), so filtering and sorting look at the same
     * data. NOTE(review): the index mapping is not visible from this class -
     * confirm that the modification date lives at "status.modate".
     *
     * @param string $date      Date as given by the user
     * @param array  $operators Range operators to apply ("gt", "lt", "gte",
     *                          "lte"); each one gets the same date value
     *
     * @return array Query clause for the "must" section
     */
    protected static function buildDateRange($date, array $operators)
    {
        $bounds = array();
        foreach ($operators as $operator) {
            // "||/d" is Elasticsearch date math that rounds to the day
            $bounds[$operator] = $date . '||/d';
        }

        return array(
            'range' => array(
                'status.modate' => $bounds
            )
        );
    }

    /**
     * Check if a document for the given URL exists in the index.
     *
     * @param string $url Document URL
     *
     * @return boolean True if the HEAD request was answered with status 200
     */
    public function isKnown($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_HEAD
        );
        // presumably lets send() accept a 404 instead of treating it as an
        // error; a missing document is an expected outcome here
        $r->allow404 = true;
        $res = $r->send();
        return (int) $res->getStatus() === 200;
    }

    /**
     * Fetch the stored document for the given URL.
     *
     * @param string $url Document URL
     *
     * @return object|null The "_source" part of the response, or null when
     *                     the response status is not 200 or the body does
     *                     not contain a decodable "_source"
     */
    public function get($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_GET
        );
        $r->allow404 = true;
        $res = $r->send();
        if ((int) $res->getStatus() !== 200) {
            return null;
        }

        // json_decode() returns null for malformed bodies; do not try to
        // read a property from a non-object in that case
        $d = json_decode($res->getBody());
        if (!is_object($d) || !isset($d->_source)) {
            return null;
        }
        return $d->_source;
    }

    /**
     * Store a placeholder document for a URL that has not been processed.
     *
     * The document only contains the URL and a status with "processed" set
     * to null and "findable" set to false, which keeps it out of search()
     * results (those require status.findable = true).
     *
     * @param string $url Document URL
     *
     * @return void
     */
    public function markQueued($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_PUT
        );
        $doc = (object) array(
            'url' => $url,
            'status' => (object) array(
                'processed' => null,
                'findable'  => false,
            )
        );
        $r->setBody(json_encode($doc));
        $r->send();
    }

    /**
     * Read document count and storage size from the "_stats" endpoint.
     *
     * @return array Keys "documents" (document count) and "size" (store size
     *               in bytes). A value is null when the response does not
     *               contain it.
     */
    public function getIndexStatus()
    {
        $r = new Elasticsearch_Request(
            $this->baseUrl . '_stats/docs,store',
            \HTTP_Request2::METHOD_GET
        );
        $res = $r->send();
        $data = json_decode($res->getBody());

        // isset() on a nested property chain is safe even when $data or an
        // intermediate level is missing
        $total = isset($data->_all->total) ? $data->_all->total : null;
        return array(
            'documents' => isset($total->docs->count)
                ? $total->docs->count : null,
            'size'      => isset($total->store->size_in_bytes)
                ? $total->store->size_in_bytes : null,
        );
    }

    /**
     * Run a full text search for findable documents.
     *
     * The query string understands some keywords that are handled here:
     * - "nick:NAME" is rewritten to "author.name:NAME"
     * - "after:DATE", "before:DATE" and "date:DATE" are removed from the
     *   query and turned into range filters on the modification date
     *
     * The size of the highlighted text fragment depends on the global
     * setting $GLOBALS['phinde']['showFullContent'].
     *
     * @param string  $query   User query in query_string syntax
     * @param array   $filters Term filters; key is the field name,
     *                         value the required term
     * @param string  $site    URL prefix matched against "schemalessUrl";
     *                         empty string for no restriction
     * @param integer $page    Result page; multiplied with $perPage to get
     *                         the offset of the first result
     * @param integer $perPage Number of results per page
     * @param string  $sort    "date" sorts by modification date, newest
     *                         first. Any other value sends an empty sort
     *                         configuration.
     *
     * @return mixed JSON-decoded search response
     */
    public function search($query, $filters, $site, $page, $perPage, $sort)
    {
        if (preg_match_all('#nick:([^ ]*)#', $query, $matches)) {
            foreach ($matches[1] as $authorName) {
                $query = str_replace(
                    'nick:' . $authorName,
                    'author.name:' . $authorName,
                    $query
                );
            }
        }

        $qMust = array();//query parts for the MUST section

        //modification date filters: keyword => range operators
        $dateKeywords = array(
            'after'  => array('gt'),
            'before' => array('lt'),
            'date'   => array('gte', 'lte'),
        );
        foreach ($dateKeywords as $keyword => $operators) {
            if (preg_match('#' . $keyword . ':([^ ]+)#', $query, $matches)) {
                //the keyword must not reach the query_string query
                $query   = trim(str_replace($matches[0], '', $query));
                $qMust[] = static::buildDateRange($matches[1], $operators);
            }
        }

        $qMust[] = array(
            'query_string' => array(
                'default_field' => '_all',
                'default_operator' => 'AND',
                'query' => $query
            )
        );
        $qMust[] = array(
            'term' => array(
                'status.findable' => true
            )
        );
        foreach ($filters as $type => $value) {
            $qMust[] = array(
                'term' => array(
                    $type => $value
                )
            );
        }
        if ($site != '') {
            $qMust[] = array(
                'prefix' => array(
                    'schemalessUrl' => array(
                        'value' => $site
                    )
                )
            );
        }

        if ($sort == 'date') {
            $sortCfg = array('status.modate' => array('order' => 'desc'));
        } else {
            $sortCfg = array();
        }

        //!empty() so that a missing configuration key does not raise a notice
        $contentMatchSize = 100;
        if (!empty($GLOBALS['phinde']['showFullContent'])) {
            $contentMatchSize = 999999;
        }

        //number_of_fragments = 0: highlight the whole field content
        $wholeField = array(
            'require_field_match' => false,
            'number_of_fragments' => 0,
        );

        //one terms aggregation per facet, named like its field
        $aggregations = array();
        foreach (array('tags', 'language', 'domain', 'type') as $field) {
            $aggregations[$field] = array(
                'terms' => array(
                    'field' => $field
                )
            );
        }

        $doc = array(
            '_source' => array(
                'url',
                'title',
                'author',
                'status.modate',
            ),
            'query' => array(
                'bool' => array(
                    'must' => $qMust
                )
            ),
            'highlight' => array(
                'pre_tags' => array('<em class="hl">'),
                'order' => 'score',
                'encoder' => 'html',
                'fields' => array(
                    'title' => $wholeField,
                    'url'   => $wholeField,
                    'text'  => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 1,
                        'fragment_size' => $contentMatchSize,
                        'no_match_size' => $contentMatchSize,
                    ),
                )
            ),
            'aggregations' => $aggregations,
            'from' => $page * $perPage,
            'size' => $perPage,
            'sort' => $sortCfg,
        );

        $r = new Elasticsearch_Request(
            $this->baseUrl . 'document/_search',
            \HTTP_Request2::METHOD_GET
        );
        $r->setBody(json_encode($doc));
        $res = $r->send();
        return json_decode($res->getBody());
    }
}
238 ?>