*/
public function fetch($url, $actions, $force = false)
{
+ $url = Helper::rewriteUrl($url);
+
$esDoc = $this->es->get($url);
if (isset($esDoc->status->location)
&& $esDoc->status->location != ''
) {
//TODO: what if location redirects change?
$url = $esDoc->status->location;
+ $url = Helper::rewriteUrl($url);
$esDoc = $this->es->get($url);
}
$types = array();
foreach ($actions as $action) {
- $types = array_merge($action::$supportedTypes);
+ $types = array_merge($types, array_keys($action::$supportedTypes));
}
$types = array_unique($types);
$res = $req->send();
if ($res->getStatus() === 304) {
//not modified since last time, so don't crawl again
- echo "Not modified since last fetch\n";
+ Log::info("Not modified since last fetch");
return false;
} else if ($res->getStatus() !== 200) {
throw new \Exception(
);
}
- $effUrl = $res->getEffectiveUrl();
+ $effUrl = Helper::removeAnchor($res->getEffectiveUrl());
+ $effUrl = Helper::rewriteUrl($effUrl);
if ($effUrl != $url) {
$this->storeRedirect($url, $effUrl);
$url = $effUrl;
protected function storeRedirect($url, $target)
{
// Persist a redirect marker for $url: a document whose status->location
// holds $target. fetch() checks status->location on the stored document
// and, when it is non-empty, continues with that URL instead.
//
// $url    - address that was requested
// $target - address the request finally resolved to
//
// NOTE(review): the explicit "$esDoc->url = $url" assignment is dropped in
// favour of Helper::baseDoc($url); presumably baseDoc() fills in the url
// field itself - confirm against Helper.
- $esDoc = new \stdClass();
+ $esDoc = Helper::baseDoc($url);
// Assigns a fresh status object (any status set earlier on $esDoc is
// replaced). storeDoc() adds status->processed to it before sending.
// NOTE(review): 'findable' => false presumably keeps redirect stubs out of
// search results - verify against the query code.
$esDoc->status = (object) array(
- 'location' => $target
+ 'location' => $target,
+ 'findable' => false,
);
- $esDoc->url = $url;
// The document is stored under the original URL, not under the target.
$this->storeDoc($url, $esDoc);
}
public function storeDoc($url, $esDoc)
{
- echo "Store $url\n";
+ Log::info("Store $url");
$esDoc->status->processed = gmdate('c');
$r = new Elasticsearch_Request(
$GLOBALS['phinde']['elasticsearch'] . 'document/'