Big patch merging crawling+indexing into one command, new json document structure
[phinde.git] / src / phinde / Elasticsearch.php
1 <?php
2 namespace phinde;
3
/**
 * Small client for the Elasticsearch HTTP API.
 *
 * All requests go to paths below the base URL given to the constructor:
 * single documents live at "document/<id>", searches are sent to
 * "document/_search".
 */
class Elasticsearch
{
    /**
     * URL prefix that the request paths ("document/...") are appended to.
     * It is concatenated as-is, so it needs to end with a slash.
     *
     * @var string
     */
    protected $baseUrl;

    /**
     * @param string $baseUrl URL prefix for all requests
     */
    public function __construct($baseUrl)
    {
        $this->baseUrl = $baseUrl;
    }

    /**
     * Calculate the document ID for an URL.
     *
     * @param string $url URL of the document
     *
     * @return string SHA-256 hash of the URL as lowercase hex string
     */
    public static function getDocId($url)
    {
        return hash('sha256', $url);
    }

    /**
     * Check if a document for the given URL exists.
     *
     * Sends a HEAD request; a 404 response is allowed and means "unknown".
     *
     * @param string $url URL of the document
     *
     * @return boolean True if the server answered with status 200
     */
    public function isKnown($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_HEAD
        );
        $r->allow404 = true;
        $res = $r->send();
        return $res->getStatus() == 200;
    }

    /**
     * Fetch the stored document for the given URL.
     *
     * @param string $url URL of the document
     *
     * @return object|null The "_source" part of the response, or NULL when
     *                     the status is not 200 or the response body does
     *                     not contain a "_source" property
     */
    public function get($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_GET
        );
        $r->allow404 = true;
        $res = $r->send();
        if ($res->getStatus() != 200) {
            return null;
        }
        $d = json_decode($res->getBody());
        //json_decode() gives NULL for broken JSON; do not dereference it
        if (!is_object($d) || !isset($d->_source)) {
            return null;
        }
        return $d->_source;
    }

    /**
     * Store a placeholder document for an URL that has not been processed
     * yet: "status.processed" is null and "status.findable" is false.
     *
     * The document is sent with PUT to the document's own URL.
     *
     * @param string $url URL of the document
     *
     * @return void
     */
    public function markQueued($url)
    {
        $r = new Elasticsearch_Request(
            $this->getDocumentUrl($url),
            \HTTP_Request2::METHOD_PUT
        );
        $doc = (object) array(
            'url' => $url,
            'status' => (object) array(
                'processed' => null,
                'findable'  => false,
            )
        );
        $r->setBody(json_encode($doc));
        $r->send();
    }

    /**
     * Run a search and return the decoded response.
     *
     * Special keywords in the query string:
     * - "nick:NAME" is rewritten to "author.name:NAME"
     * - "after:DATE", "before:DATE" and "date:DATE" are removed from the
     *   query text and turned into range filters on "status.modate".
     *   Only the first occurrence of each keyword is handled.
     *
     * Only documents with "status.findable" = true are matched.
     *
     * @param string  $query   Search text entered by the user
     * @param array   $filters Term filters; key is the field name,
     *                         value the term that has to match
     * @param string  $site    Prefix that "schemalessUrl" has to begin
     *                         with. Ignored when empty.
     * @param integer $page    Zero-based page number
     * @param integer $perPage Number of results per page
     * @param string  $sort    "date" to order by "status.modate"
     *                         descending; anything else sends an empty
     *                         sort configuration
     *
     * @return mixed Response body as returned by json_decode()
     */
    public function search($query, $filters, $site, $page, $perPage, $sort)
    {
        if (preg_match_all('#nick:([^ ]*)#', $query, $matches)) {
            foreach ($matches[1] as $authorName) {
                $query = str_replace(
                    'nick:' . $authorName,
                    'author.name:' . $authorName,
                    $query
                );
            }
        }

        $qMust = array();//query parts for the MUST section

        //modification date filters: keyword => range operators
        $dateKeywords = array(
            'after'  => array('gt'),
            'before' => array('lt'),
            'date'   => array('gte', 'lte'),
        );
        foreach ($dateKeywords as $keyword => $operators) {
            list($query, $dateFilter) = $this->extractDateFilter(
                $query, $keyword, $operators
            );
            if ($dateFilter !== null) {
                $qMust[] = $dateFilter;
            }
        }

        $qMust[] = array(
            'query_string' => array(
                'default_field' => '_all',
                'default_operator' => 'AND',
                'query' => $query
            )
        );
        $qMust[] = array(
            'term' => array(
                'status.findable' => true
            )
        );

        if ($sort == 'date') {
            $sortCfg = array('status.modate' => array('order' => 'desc'));
        } else {
            $sortCfg = array();
        }

        //size of the text excerpt; practically unlimited when the
        // full content shall be shown.
        //!empty() keeps this quiet when the setting is not configured.
        $contentMatchSize = 100;
        if (!empty($GLOBALS['phinde']['showFullContent'])) {
            $contentMatchSize = 999999;
        }

        $r = new Elasticsearch_Request(
            $this->baseUrl . 'document/_search',
            \HTTP_Request2::METHOD_GET
        );
        $doc = array(
            '_source' => array(
                'url',
                'title',
                'author',
                'status.modate',
            ),
            'query' => array(
                'bool' => array(
                    'must' => $qMust
                )
            ),
            'highlight' => array(
                'pre_tags' => array('<em class="hl">'),
                'order' => 'score',
                'encoder' => 'html',
                'fields' => array(
                    'title' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 0,
                    ),
                    'url' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 0,
                    ),
                    'text' => array(
                        'require_field_match' => false,
                        'number_of_fragments' => 1,
                        'fragment_size' => $contentMatchSize,
                        'no_match_size' => $contentMatchSize,
                    ),
                )
            ),
            'aggregations' => array(
                'tags' => array(
                    'terms' => array(
                        'field' => 'tags'
                    )
                ),
                'language' => array(
                    'terms' => array(
                        'field' => 'language'
                    )
                ),
                'domain' => array(
                    'terms' => array(
                        'field' => 'domain'
                    )
                ),
                'type' => array(
                    'terms' => array(
                        'field' => 'type'
                    )
                )
            ),
            'from' => $page * $perPage,
            'size' => $perPage,
            'sort' => $sortCfg,
        );
        foreach ($filters as $type => $value) {
            $doc['query']['bool']['must'][] = array(
                'term' => array(
                    $type => $value
                )
            );
        }
        if ($site != '') {
            $doc['query']['bool']['must'][] = array(
                'prefix' => array(
                    'schemalessUrl' => array(
                        'value' => $site
                    )
                )
            );
        }

        $r->setBody(json_encode($doc));
        $res = $r->send();
        return json_decode($res->getBody());
    }

    /**
     * Build the request URL for the document that belongs to $url.
     *
     * @param string $url URL of the document
     *
     * @return string Base URL + "document/" + document ID
     */
    protected function getDocumentUrl($url)
    {
        return $this->baseUrl . 'document/' . static::getDocId($url);
    }

    /**
     * Cut the first "$keyword:DATE" token out of the query and build the
     * matching range filter on the "status.modate" field.
     *
     * "status.modate" is the field the search also sorts by and returns.
     * The date gets "||/d" appended before it is used as range limit.
     *
     * @param string $query     Search text
     * @param string $keyword   Keyword without colon, e.g. "after"
     * @param array  $operators Range operators that get the date as value,
     *                          e.g. array('gte', 'lte')
     *
     * @return array Two values: the query with the token removed and
     *               trimmed, and the range filter array. When the keyword
     *               is not in the query, the query is returned unmodified
     *               and the filter is NULL.
     */
    protected function extractDateFilter($query, $keyword, array $operators)
    {
        if (!preg_match('#' . $keyword . ':([^ ]+)#', $query, $matches)) {
            return array($query, null);
        }

        $range = array();
        foreach ($operators as $operator) {
            $range[$operator] = $matches[1] . '||/d';
        }

        return array(
            trim(str_replace($matches[0], '', $query)),
            array(
                'range' => array(
                    'status.modate' => $range
                )
            ),
        );
    }
}
224 ?>