// Crawler entry script: requests one URL, parses the document for <a> links
// and hands each link that passes the filters below to addToIndex() and,
// when the link's URL is allowed, to the crawl queue.
// NOTE: every line of this listing carries its original line number as a
// prefix; gaps in that numbering are lines that are not part of this listing.
4 require_once __DIR__ . '/../src/init.php';
// MIME types (without parameters) that are accepted for link extraction.
6 $supportedCrawlTypes = array(
7 'text/html', 'application/xhtml+xml'
12 echo "No URL given\n";
// Elasticsearch wrapper configured from the global phinde settings; used
// below for isKnown() / markQueued().
16 $es = new Elasticsearch($GLOBALS['phinde']['elasticsearch']);
// The start URL itself has to pass the allow check before anything is fetched.
19 if (!Helper::isUrlAllowed($url)) {
20 echo "Domain is not allowed; not crawling\n";
25 $req = new \HTTP_Request2($url);
26 //FIXME: send supported mime types in header
// Any status other than exactly 200 is reported.
28 if ($res->getStatus() !== 200) {
29 echo "Response code is not 200 but " . $res->getStatus() . ", stopping\n";
// Keep only the part of Content-Type before the first ';' (drops parameters
// such as a charset) and compare it against the supported list.
32 $mimetype = explode(';', $res->getHeader('content-type'))[0];
33 if (!in_array($mimetype, $supportedCrawlTypes)) {
34 echo "MIME type not supported for crawling: $mimetype\n";
38 //FIXME: mime type switch for cdata
39 $doc = new \DOMDocument();
40 //@ to hide parse warning messages in invalid html
// NOTE(review): this loads the document from $url again instead of parsing
// the response that was inspected above - confirm the second fetch is intended.
41 @$doc->loadHTMLFile($url);
43 //FIXME: extract base url from html
// Relative hrefs are resolved against the page URL itself.
44 $base = new \Net_URL2($url);
46 $xpath = new \DOMXPath($doc);
47 $links = $xpath->evaluate('//a');
48 //FIXME: link rel, img, video
// URLs already handled during this run, keyed by the normalized link URL.
50 $alreadySeen = array();
52 foreach ($links as $link) {
53 $linkTitle = $link->textContent;
// Look up the href attribute of the current <a> element.
55 foreach ($link->attributes as $attribute) {
56 if ($attribute->name == 'href') {
57 $href = $attribute->textContent;
// Empty hrefs and pure same-page references ("#...") are singled out here.
// Uses $href[0]: the curly-brace offset form $href{0} is deprecated since
// PHP 7.4 and is a parse error from PHP 8.0 on.
60 if ($href == '' || $href[0] == '#') {
// Normalize: make the link absolute, drop its fragment, and use the string
// form as the de-duplication key.
65 $linkUrlObj = $base->resolve($href);
66 $linkUrlObj->setFragment(false);
67 $linkUrl = (string) $linkUrlObj;
68 if (isset($alreadySeen[$linkUrl])) {
// Links are filtered by URL scheme (the accepted cases are not part of this listing).
72 switch ($linkUrlObj->getScheme()) {
// Ask Elasticsearch whether this URL is already known.
80 if ($es->isKnown($linkUrl)) {
84 //FIXME: check target type
85 //FIXME: check nofollow
86 //var_dump($linkTitle, $linkUrl);
87 $es->markQueued($linkUrl);
88 addToIndex($linkUrl, $linkTitle, $url);
// Same allow check as for the start URL above, called through Helper for
// consistency (no free function isUrlAllowed() is visible in this file).
89 if (Helper::isUrlAllowed($linkUrl)) {
92 $alreadySeen[$linkUrl] = true;
/**
 * Submit a background Gearman job for a discovered link and report the
 * outcome on standard output.
 *
 * Only part of the job payload is visible in this listing: it contains at
 * least the entries 'title' and 'source'. The job name and any further
 * payload entries are on lines that are not shown here.
 *
 * @param string $linkUrl   URL of the link; printed in the status messages
 * @param string $linkTitle Value stored under the payload key 'title'
 * @param string $sourceUrl Value stored under the payload key 'source'
 */
95 function addToIndex($linkUrl, $linkTitle, $sourceUrl)
97 echo "Queuing for indexing: $linkUrl\n";
// A new client is created for every call; the job server address is fixed
// to the local host.
98 $gmclient = new \GearmanClient();
99 $gmclient->addServer('127.0.0.1');
// doBackground() submits the job without waiting for its result.
100 $gmclient->doBackground(
105 'title' => $linkTitle,
106 'source' => $sourceUrl
// Anything other than GEARMAN_SUCCESS is reported together with the numeric
// return code.
110 if ($gmclient->returnCode() != GEARMAN_SUCCESS) {
111 echo 'Error queueing URL indexing for '
113 . 'Error code: ' . $gmclient->returnCode() . "\n";
/**
 * Submit a background Gearman job for a link URL and report the outcome on
 * standard output.
 *
 * The job name and payload passed to doBackground() are on lines that are
 * not part of this listing.
 *
 * @param string $linkUrl URL of the link; printed in the status messages
 */
118 function addToCrawl($linkUrl)
120 echo "Queuing for crawling: $linkUrl\n";
// A new client is created for every call; the job server address is fixed
// to the local host.
121 $gmclient = new \GearmanClient();
122 $gmclient->addServer('127.0.0.1');
// doBackground() submits the job without waiting for its result.
123 $gmclient->doBackground(
// Anything other than GEARMAN_SUCCESS is reported together with the numeric
// return code.
131 if ($gmclient->returnCode() != GEARMAN_SUCCESS) {
132 echo 'Error queueing URL crawling for '
134 . 'Error code: ' . $gmclient->returnCode() . "\n";