*/
protected $showLinksOnly = false;
- static $supportedIndexTypes = array(
+ static $supportedTypes = array(
'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
'text/html' => '\\phinde\\LinkExtractor\\Html',
$this->queue = new Queue();
}
- public function crawl($url)
+ /**
+ * Extract the links from an already retrieved document, annotate them
+ * via filterLinks() and then either only show them or put them into
+ * the queue, depending on $this->showLinksOnly.
+ *
+ * @param Retrieved $retrieved Retrieval result; its httpRes property is
+ * handed to extractLinks()
+ *
+ * @return boolean False when the links were only shown,
+ * true when they were passed to enqueue()
+ */
+ public function run(Retrieved $retrieved)
{
- $res = $this->fetch($url);
- if ($res === false) {
- return;
- }
-
- $linkInfos = $this->extractLinks($res);
+ $linkInfos = $this->extractLinks($retrieved->httpRes);
+ $linkInfos = $this->filterLinks($linkInfos);
if ($this->showLinksOnly) {
$this->showLinks($linkInfos);
+ return false;
} else {
$this->enqueue($linkInfos);
+ return true;
}
}
- protected function fetch($url)
- {
- $existingDoc = $this->es->get($url);
-
- $req = new HttpRequest($url);
- $req->setHeader(
- 'accept',
- implode(',', array_keys(static::$supportedIndexTypes))
- );
- if ($existingDoc) {
- $nMoDate = strtotime($existingDoc->modate);
- $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate));
- }
-
- $res = $req->send();
- if ($res->getStatus() === 304) {
- //not modified since last time, so don't crawl again
- return false;
- } else if ($res->getStatus() !== 200) {
- throw new \Exception(
- "Response code is not 200 but "
- . $res->getStatus() . ", stopping"
- );
- }
- return $res;
- }
-
+ /**
+ * Extract the links from a HTTP response.
+ *
+ * The extractor class is looked up in static::$supportedTypes by the
+ * MIME type found in the response's content-type header.
+ *
+ * @param \HTTP_Request2_Response $res HTTP response to extract links from
+ *
+ * @return array Whatever the extractor's extract() method returns,
+ * or an empty array when the MIME type is not supported
+ */
protected function extractLinks(\HTTP_Request2_Response $res)
{
- $mimetype = explode(';', $res->getHeader('content-type'))[0];
- if (!isset(static::$supportedIndexTypes[$mimetype])) {
- echo "MIME type not supported for indexing: $mimetype\n";
+ // MIME types are case-insensitive and may be followed by whitespace
+ // before the ";" parameter separator, so normalize before the lookup.
+ // The string cast keeps explode() happy when the header is missing.
+ $mimetype = strtolower(
+ trim(explode(';', (string) $res->getHeader('content-type'))[0])
+ );
+ if (!isset(static::$supportedTypes[$mimetype])) {
+ Log::info("MIME type not supported for crawling: $mimetype");
return array();
}
- $class = static::$supportedIndexTypes[$mimetype];
+ $class = static::$supportedTypes[$mimetype];
$extractor = new $class();
return $extractor->extract($res);
}
- protected function enqueue($linkInfos)
+ /**
+ * Annotate each link with the flags that decide what happens to it.
+ *
+ * No link is dropped here; every link info object gets three
+ * properties set:
+ * - known: result of $this->es->isKnown() for the URL
+ * - crawl: URL is allowed and matches no crawl blacklist pattern
+ * - index: URL is allowed, or indexing of non-allowed URLs is enabled
+ *
+ * @param array $linkInfos Link info objects with at least an url property
+ *
+ * @return array The same link info objects with the flags set
+ */
+ protected function filterLinks($linkInfos)
{
+ $filteredLinkInfos = array();
foreach ($linkInfos as $linkInfo) {
- if ($this->es->isKnown($linkInfo->url)) {
- continue;
- }
$allowed = Helper::isUrlAllowed($linkInfo->url);
$crawl = $allowed;
$index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
- if ($crawl || $index) {
- $this->es->markQueued($linkInfo->url);
+ // empty() instead of count(): the blacklist option may be unset
+ if ($crawl && !empty($GLOBALS['phinde']['crawlBlacklist'])) {
+ foreach ($GLOBALS['phinde']['crawlBlacklist'] as $bl) {
+ // patterns are regex bodies, wrapped in "#" delimiters here
+ if (preg_match('#' . $bl . '#', $linkInfo->url)) {
+ $crawl = false;
+ // one blacklist hit is enough, skip remaining patterns
+ break;
+ }
+ }
}
- if ($index) {
- $this->queue->addToIndex(
- $linkInfo->url, $linkInfo->title, $linkInfo->source
- );
+
+ $linkInfo->known = $this->es->isKnown($linkInfo->url);
+ $linkInfo->crawl = $crawl;
+ $linkInfo->index = $index;
+ $filteredLinkInfos[] = $linkInfo;
+ }
+ return $filteredLinkInfos;
+ }
+
+ /**
+ * Put all not yet known links that shall be crawled and/or indexed
+ * into the process queue and mark them as queued.
+ *
+ * @param array $linkInfos Link info objects carrying the known, crawl
+ * and index properties
+ *
+ * @return void
+ */
+ protected function enqueue($linkInfos)
+ {
+ foreach ($linkInfos as $linkInfo) {
+ if ($linkInfo->known) {
+ continue;
}
- if ($allowed) {
- $this->queue->addToCrawl($linkInfo->url);
+ if ($linkInfo->crawl || $linkInfo->index) {
+ $this->es->markQueued($linkInfo->url);
+ // a single queue entry carries all actions for the URL
+ $actions = array();
+ if ($linkInfo->index) {
+ $actions[] = 'index';
+ }
+ if ($linkInfo->crawl) {
+ $actions[] = 'crawl';
+ }
+ $this->queue->addToProcessList(
+ $linkInfo->url, $actions
+ );
}
}
}
+ /**
+ * Log the given links instead of queueing them.
+ *
+ * For each link the URL is logged, followed by title and source when
+ * a title is available, and the known/crawl/index flags.
+ *
+ * @param array $linkInfos Link info objects carrying the known, crawl
+ * and index properties
+ *
+ * @return void
+ */
protected function showLinks($linkInfos)
{
foreach ($linkInfos as $linkInfo) {
- echo $linkInfo->url . "\n";
+ Log::msg($linkInfo->url);
if ($linkInfo->title) {
- echo ' title: ' . $linkInfo->title . "\n";
- echo ' source: ' . $linkInfo->source . "\n";
+ Log::msg(' title: ' . $linkInfo->title);
+ Log::msg(' source: ' . $linkInfo->source);
}
+ // the flags do not depend on the title, so report them for every link
+ Log::msg(
+ ' known: ' . intval($linkInfo->known)
+ . ', crawl: ' . intval($linkInfo->crawl)
+ . ', index: ' . intval($linkInfo->index)
+ );
}
}