5 set_include_path(__DIR__ . '/../src/' . PATH_SEPARATOR . get_include_path());
6 require_once __DIR__ . '/../data/config.php';
7 require_once 'HTTP/Request2.php';
8 require_once 'Elasticsearch.php';
9 require_once 'Elasticsearch/Request.php';
10 require_once 'Net/URL2.php';
11 require_once 'functions.php';
// Main crawl script: checks one page URL and walks its <a> links, queuing
// each new link for indexing.
// NOTE(review): the embedded original line numbers skip values, so some
// statements (e.g. the assignments of $url and $res, closing brackets and
// the bodies of several branches) are not visible in this excerpt.
// MIME types accepted for crawling; compared against the response's
// Content-Type further down.
13 $supportedCrawlTypes = array(
14 'text/html', 'application/xhtml+xml'
// Message for a missing URL; the condition guarding it is not visible here.
19 echo "No URL given\n";
// Elasticsearch client built from $GLOBALS['phinde']['elasticsearch'];
// used below through isKnown() and markQueued().
23 $es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
// Refuse page URLs that isUrlAllowed() rejects (helper is defined outside
// this excerpt).
26 if (!isUrlAllowed($url)) {
27 echo "Domain is not allowed; not crawling\n";
// HTTP request object for the page to crawl.
32 $req = new \HTTP_Request2($url);
33 //FIXME: send supported mime types in header
// Only a status of exactly 200 is accepted; per the message, anything else
// stops the crawl.
35 if ($res->getStatus() !== 200) {
36 echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n";
// Keep only the part of the Content-Type header before the first ';',
// i.e. drop parameters such as a charset.
39 $mimetype = explode(';', $res->getHeader('content-type'))[0];
40 if (!in_array($mimetype, $supportedCrawlTypes)) {
41 echo "MIME type not supported for crawling: $mimetype\n";
45 //FIXME: mime type switch for cdata
// NOTE(review): loadHTMLFile() is handed $url rather than the body of $res,
// so the page appears to be fetched a second time - confirm.
46 $doc = new \DOMDocument();
47 //@ to hide parse warning messages in invalid html
48 @$doc->loadHTMLFile($url);
50 //FIXME: extract base url from html
// Relative links are resolved against the page URL itself.
51 $base = new \Net_URL2($url);
// Select every <a> element in the document.
53 $xpath = new \DOMXPath($doc);
54 $links = $xpath->evaluate('//a');
55 //FIXME: link rel, img, video
// Link URLs already handled during this run, stored as array keys.
57 $alreadySeen = array();
59 foreach ($links as $link) {
// The link's text content is used as its title.
60 $linkTitle = $link->textContent;
// Scan the element's attributes for the href value.
62 foreach ($link->attributes as $attribute) {
63 if ($attribute->name == 'href') {
64 $href = $attribute->textContent;
// Empty hrefs and pure fragment links ("#...") are singled out here; the
// branch body is not visible (presumably the link is skipped - confirm).
// NOTE(review): the curly-brace offset syntax $href{0} was deprecated in
// PHP 7.4 and removed in PHP 8.0; $href[0] is the supported form.
67 if ($href == '' || $href{0} == '#') {
// Resolve the href against the base URL, drop the fragment and turn the
// result into a string for use as key and job argument.
72 $linkUrlObj = $base->resolve($href);
73 $linkUrlObj->setFragment(false);
74 $linkUrl = (string) $linkUrlObj;
// Duplicate check against URLs already seen; branch body not visible.
75 if (isset($alreadySeen[$linkUrl])) {
// Dispatch on the URL scheme; the case labels are not visible here.
79 switch ($linkUrlObj->getScheme()) {
// Check whether Elasticsearch already knows this URL; branch body not
// visible.
87 if ($es->isKnown($linkUrl)) {
91 //FIXME: check target type
92 //FIXME: check nofollow
93 //var_dump($linkTitle, $linkUrl);
// Mark the URL as queued in Elasticsearch, then queue it for indexing
// together with its link text and the URL of the page it was found on.
94 $es->markQueued($linkUrl);
95 addToIndex($linkUrl, $linkTitle, $url);
// Extra step for links that isUrlAllowed() accepts; the body is not
// visible (presumably a call to addToCrawl() - confirm).
96 if (isUrlAllowed($linkUrl)) {
// Remember the URL so the isset() check above matches it next time.
99 $alreadySeen[$linkUrl] = true;
/**
 * Queue a background Gearman job for indexing a linked URL.
 *
 * A new GearmanClient is created on every call and the job server at
 * 127.0.0.1 is registered with it. The job is submitted with
 * doBackground(), and a message is printed when the client's return code
 * is not GEARMAN_SUCCESS.
 *
 * NOTE(review): the job name and the way the payload is built are not
 * visible in this excerpt (embedded line numbers skip); only the 'title'
 * and 'source' payload entries can be seen.
 *
 * @param string $linkUrl    URL to index; echoed in the status message
 * @param string $linkTitle  Sent as the 'title' entry of the job payload
 * @param string $sourceUrl  Sent as the 'source' entry of the job payload
 */
102 function addToIndex($linkUrl, $linkTitle, $sourceUrl)
104 echo "Queuing for indexing: $linkUrl\n";
105 $gmclient = new \GearmanClient();
106 $gmclient->addServer('127.0.0.1');
107 $gmclient->doBackground(
112 'title' => $linkTitle,
113 'source' => $sourceUrl
// A failed submission is only reported on standard output.
117 if ($gmclient->returnCode() != GEARMAN_SUCCESS) {
118 echo 'Error queueing URL indexing for '
120 . 'Error code: ' . $gmclient->returnCode() . "\n";
/**
 * Queue a background Gearman job for crawling a URL.
 *
 * A new GearmanClient is created on every call and the job server at
 * 127.0.0.1 is registered with it. The job is submitted with
 * doBackground(), and a message is printed when the client's return code
 * is not GEARMAN_SUCCESS.
 *
 * NOTE(review): the doBackground() arguments and the end of the function
 * are not visible in this excerpt (embedded line numbers skip), so the job
 * name and payload cannot be documented from here.
 *
 * @param string $linkUrl URL to crawl; echoed in the status message
 */
125 function addToCrawl($linkUrl)
127 echo "Queuing for crawling: $linkUrl\n";
128 $gmclient = new \GearmanClient();
129 $gmclient->addServer('127.0.0.1');
130 $gmclient->doBackground(
// A failed submission is only reported on standard output.
138 if ($gmclient->returnCode() != GEARMAN_SUCCESS) {
139 echo 'Error queueing URL crawling for '
141 . 'Error code: ' . $gmclient->returnCode() . "\n";