protected $es;
protected $queue;
+ /**
+ * Whether extracted links should only be shown instead of being queued
+ */
+ protected $showLinksOnly = false;
+
static $supportedIndexTypes = array(
'application/atom+xml' => '\\phinde\\LinkExtractor\\Atom',
'application/xhtml+xml' => '\\phinde\\LinkExtractor\\Html',
public function crawl($url)
{
- $res = $this->fetch($url);
+ // fetch() hands back false when the server answered 304 Not Modified,
+ // so there is nothing new to extract for this URL.
+ $res = $this->fetch($url);
+ if (false === $res) {
+ return;
+ }
+
$linkInfos = $this->extractLinks($res);
- $this->enqueue($linkInfos);
+ // Regular mode: hand the links to the queue and stop here.
+ if (!$this->showLinksOnly) {
+ $this->enqueue($linkInfos);
+ return;
+ }
+ // Show-only mode: print the links without queueing anything.
+ $this->showLinks($linkInfos);
}
protected function fetch($url)
{
+ $existingDoc = $this->es->get($url);
+
$req = new HttpRequest($url);
+ $req->setHeader(
+ 'accept',
+ implode(',', array_keys(static::$supportedIndexTypes))
+ );
+ if ($existingDoc && isset($existingDoc->modate)) {
+ $nMoDate = strtotime($existingDoc->modate);
+ $req->setHeader('If-Modified-Since: ' . date('r', $nMoDate));
+ }
+
$res = $req->send();
- if ($res->getStatus() !== 200) {
+ if ($res->getStatus() === 304) {
+ //not modified since last time, so don't crawl again
+ return false;
+ } else if ($res->getStatus() !== 200) {
throw new \Exception(
"Response code is not 200 but "
. $res->getStatus() . ", stopping"
if ($this->es->isKnown($linkInfo->url)) {
continue;
}
- $this->es->markQueued($linkInfo->url);
- $this->queue->addToIndex(
- $linkInfo->url, $linkInfo->title, $linkInfo->source
- );
- if (Helper::isUrlAllowed($linkInfo->url)) {
+ $allowed = Helper::isUrlAllowed($linkInfo->url);
+ $crawl = $allowed;
+ $index = $GLOBALS['phinde']['indexNonAllowed'] || $allowed;
+
+ if ($crawl || $index) {
+ $this->es->markQueued($linkInfo->url);
+ }
+ if ($index) {
+ $this->queue->addToIndex(
+ $linkInfo->url, $linkInfo->title, $linkInfo->source
+ );
+ }
+ if ($allowed) {
$this->queue->addToCrawl($linkInfo->url);
}
}
}
+
+ /**
+ * Print the extracted links to standard output instead of queueing them.
+ *
+ * Every link's URL is written on its own line. The title and source
+ * lines follow only when the link's title is truthy.
+ *
+ * @param array|\Traversable $linkInfos Link objects exposing the
+ *                                      url, title and source properties
+ *
+ * @return void
+ */
+ protected function showLinks($linkInfos)
+ {
+ foreach ($linkInfos as $linkInfo) {
+ // Collect the lines for this link first, then emit them in one go.
+ $lines = array($linkInfo->url);
+ if ($linkInfo->title) {
+ $lines[] = ' title: ' . $linkInfo->title;
+ $lines[] = ' source: ' . $linkInfo->source;
+ }
+ echo implode("\n", $lines) . "\n";
+ }
+ }
+
+ /**
+ * Choose whether crawl() prints the extracted links or queues them.
+ *
+ * @param bool $showLinksOnly When truthy, crawl() calls showLinks()
+ *                            instead of enqueue()
+ *
+ * @return void
+ */
+ public function setShowLinksOnly($showLinksOnly)
+ {
+ $this->showLinksOnly = $showLinksOnly;
+ }
}
?>