$GLOBALS['phinde']['queuePrefix'] . 'phinde_process',
function(\GearmanJob $job) {
$data = unserialize($job->workload());
- echo "-- Processing " . $data['url']
+ Log::info(
+ "-- Processing " . $data['url']
. ' (' . implode(',', $data['actions']) . ')'
- . "\n";
+ );
passthru(
'./process.php ' . escapeshellarg($data['url'])
. ' ' . implode(' ', $data['actions'])
$gmworker->addFunction(
$GLOBALS['phinde']['queuePrefix'] . 'phinde_quit',
function(\GearmanJob $job) {
- echo "Got exit job\n";
+ Log::info('Got exit job');
$job->sendComplete('');
exit(0);
}
while ($gmworker->work()) {
if ($gmworker->returnCode() != GEARMAN_SUCCESS) {
- echo 'Error running job: ' . $gmworker->returnCode() . "\n";
+ Log::error('Error running job: ' . $gmworker->returnCode());
break;
}
}
$urlObj = new \Net_URL2($url);
$url = $urlObj->getNormalizedURL();
if (!Helper::isUrlAllowed($url)) {
- echo "Domain is not allowed; not crawling\n";
+ Log::error("Domain is not allowed; not crawling");
exit(2);
}
$update = false;
foreach ($actions as $key => $action) {
- echo "step: $key\n";
+ Log::info("step: $key");
$update |= $action->run($retrieved);
}
//FIXME: update index if it exists already
$fetcher->storeDoc($retrieved->url, $retrieved->esDoc);
} else {
- echo "Not updating\n";
+ Log::info("Not updating");
}
} catch (\Exception $e) {
- echo $e->getMessage() . "\n";
+ Log::error($e->getMessage());
exit(10);
}
?>
\ No newline at end of file
$json = file_get_contents(__DIR__ . '/../data/elasticsearch-mapping.json');
if (json_decode($json) === null) {
- echo "Error: Schema JSON is broken\n";
+ Log::error("Error: Schema JSON is broken");
chdir(__DIR__ . '/../');
passthru('json_pp -t null < data/elasticsearch-mapping.json');
exit(1);
'subscriptions' => array(
'http://www.example.org/feed',
),
+ //verbose output: when true, Log::info() messages are printed
+ 'debug' => true,
//time in seconds after which URLs may be re-indexed
'refreshtime' => 86400,
//if directly linked URLs shall be indexed, even if they are
{
$mimetype = explode(';', $res->getHeader('content-type'))[0];
if (!isset(static::$supportedTypes[$mimetype])) {
- echo "MIME type not supported for indexing: $mimetype\n";
+ Log::info("MIME type not supported for crawling: $mimetype");
return array();
}
protected function showLinks($linkInfos)
{
foreach ($linkInfos as $linkInfo) {
- echo $linkInfo->url . "\n";
+ Log::msg($linkInfo->url); // the URL is emitted for every link
if ($linkInfo->title) {
- echo ' title: ' . $linkInfo->title . "\n";
- echo ' source: ' . $linkInfo->source . "\n";
- echo ' known: ' . intval($linkInfo->known)
+ Log::msg(' title: ' . $linkInfo->title); // detail lines are only emitted for links with a truthy title
+ Log::msg(' source: ' . $linkInfo->source);
+ Log::msg(
+ ' known: ' . intval($linkInfo->known) // the three flags are rendered as integers via intval()
. ', crawl: ' . intval($linkInfo->crawl)
- . ', index: ' . intval($linkInfo->index) . "\n";
+ . ', index: ' . intval($linkInfo->index)
+ );
}
}
}
$res = $req->send();
if ($res->getStatus() === 304) {
//not modified since last time, so don't crawl again
- echo "Not modified since last fetch\n";
+ Log::info("Not modified since last fetch");
return false;
} else if ($res->getStatus() !== 200) {
throw new \Exception(
public function storeDoc($url, $esDoc)
{
- echo "Store $url\n";
+ Log::info("Store $url");
$esDoc->status->processed = gmdate('c');
$r = new Elasticsearch_Request(
$GLOBALS['phinde']['elasticsearch'] . 'document/'
$mimetype = explode(';', $res->getHeader('content-type'))[0];
if (!in_array($mimetype, static::$supportedTypes)) {
- echo "MIME type not supported for indexing: $mimetype\n";
+ Log::info("MIME type not supported for indexing: $mimetype");
return false;
}
--- /dev/null
+<?php
+namespace phinde;
+
+/**
+ * Minimal console logger: every message is written to standard output,
+ * followed by a newline.
+ */
+class Log
+{
+    /**
+     * Output an error message. Errors are always shown.
+     *
+     * @param string $msg Message text without trailing newline
+     *
+     * @return void
+     */
+    public static function error($msg)
+    {
+        static::log($msg);
+    }
+
+    /**
+     * Output an informational message, but only when the "debug"
+     * setting in $GLOBALS['phinde'] is enabled.
+     *
+     * @param string $msg Message text without trailing newline
+     *
+     * @return void
+     */
+    public static function info($msg)
+    {
+        // empty() keeps the truthiness check but does not raise a notice
+        // when the "debug" key is missing from an older configuration.
+        if (!empty($GLOBALS['phinde']['debug'])) {
+            static::log($msg);
+        }
+    }
+
+    /**
+     * Output a regular message that is the actual result of a command
+     * (e.g. a link listing). Always shown, independent of the debug setting.
+     *
+     * @param string $msg Message text without trailing newline
+     *
+     * @return void
+     */
+    public static function msg($msg)
+    {
+        static::log($msg);
+    }
+
+    /**
+     * Write the message and a trailing newline to standard output.
+     *
+     * @param string $msg Message text without trailing newline
+     *
+     * @return void
+     */
+    public static function log($msg)
+    {
+        echo $msg . "\n";
+    }
+}
+?>
public function addToProcessList($linkUrl, $actions)
{
- echo "Queuing for processing: $linkUrl"
+ Log::info(
+ "Queuing for processing: $linkUrl"
. ' (' . implode(',', $actions) . ')'
- . "\n";
+ );
+
$this->gmclient->doBackground(
$GLOBALS['phinde']['queuePrefix'] . 'phinde_process',
serialize(
)
);
if ($this->gmclient->returnCode() != GEARMAN_SUCCESS) {
- echo 'Error queueing URL processing for '
+ Log::error(
+ 'Error queueing URL processing for '
. $linkUrl . "\n"
- . 'Error code: ' . $this->gmclient->returnCode() . "\n";
+ . 'Error code: ' . $this->gmclient->returnCode()
+ );
exit(2);
}
}