5 * Fetches entries that need an update and extracts their links
7 class Feed_UpdateEntries
/**
 * Stores the shared dependencies this updater works with.
 *
 * The complete container is kept in addition to the database and logger
 * shortcuts because sqlNeedsUpdate() reads $this->deps->options.
 *
 * @param Dependencies $deps Container providing ->db and ->log
 */
public function __construct(Dependencies $deps)
{
    $this->deps = $deps;
    $this->db   = $deps->db;
    $this->log  = $deps->log;
}
/**
 * Updates every feed entry matched by sqlNeedsUpdate().
 *
 * Each selected feedentries row is handed to updateEntry(); the number of
 * processed rows is written to the log afterwards.
 *
 * @return void
 */
public function updateAll()
{
    $this->log->info('Updating feed entries..');

    $sql = 'SELECT * FROM feedentries'
        . ' WHERE ' . $this->sqlNeedsUpdate();
    $rows = $this->db->query($sql);

    $processed = 0;
    while ($row = $rows->fetch(\PDO::FETCH_OBJ)) {
        $processed++;
        $this->updateEntry($row);
    }

    $this->log->info('Finished updating %d entries.', $processed);
}
/**
 * Updates only the feed entries identified by the given IDs or URLs.
 *
 * Numeric values are matched against fe_id, everything else against fe_url.
 * Entries are still filtered through sqlNeedsUpdate().
 *
 * @param array $urlOrIds Entry IDs and/or entry URLs
 *
 * @return void
 */
public function updateSome($urlOrIds)
{
    $options = array();
    foreach ($urlOrIds as $urlOrId) {
        if (is_numeric($urlOrId)) {
            $options[] = 'fe_id = ' . intval($urlOrId);
        } else {
            $options[] = 'fe_url = ' . $this->db->quote($urlOrId);
        }
    }

    $this->log->info('Updating %d feed entries..', count($options));

    $items = 0;
    //an empty list would produce " AND ()", which is not valid SQL,
    // so the query is only run when there is something to select
    if (count($options) > 0) {
        $res = $this->db->query(
            'SELECT * FROM feedentries'
            . ' WHERE ' . $this->sqlNeedsUpdate()
            . ' AND (' . implode(' OR ', $options) . ')'
        );

        while ($entryRow = $res->fetch(\PDO::FETCH_OBJ)) {
            ++$items;
            $this->updateEntry($entryRow);
        }
    }

    $this->log->info('Finished updating %d entries.', $items);
}
/**
 * Processes a single feed entry row: builds an HTTP request for
 * $entryRow->fe_url, inspects the response status and, for a usable
 * response, extracts the entry's URLs, stores them and marks the entry
 * as updated.
 *
 * NOTE(review): the embedded line numbers skip values, so parts of this
 * method (braces, the call that produces $res, any early returns) are not
 * visible here - confirm details against the complete file.
 *
 * @param object $entryRow Row from feedentries; fe_id, fe_url and fe_updated
 *                         are read
 */
60 protected function updateEntry($entryRow)
63 'Updating feed entry #%d: %s', $entryRow->fe_id, $entryRow->fe_url
//request the entry URL, identifying ourselves with a fixed user agent
66 $req = new \HTTP_Request2($entryRow->fe_url);
67 $req->setHeader('User-Agent', 'stapibas');
//media types with quality values, XHTML/XML ranked above HTML
// (presumably the value of an Accept header - the call itself is not visible)
70 'application/xhtml+xml; q=1'
71 . ', application/xml; q=0.9'
73 . ', text/html; q=0.5'
//only entries that carry a real fe_updated timestamp get the date below;
// the all-zero value is skipped
77 if ($entryRow->fe_updated != '0000-00-00 00:00:00') {
//stored timestamp formatted as an RFC 2822 date in GMT
// (presumably for a conditional request header - TODO confirm)
80 gmdate('r', strtotime($entryRow->fe_updated))
//304: clear the needs-update flag and log that nothing changed
85 if ($res->getStatus() == 304) {
87 $this->setNoUpdate($entryRow);
88 $this->log->info('Not modified');
//status / 100 truncated to an integer is 2 exactly for 200-299
92 if (intval($res->getStatus() / 100) != 2) {
93 //no 2xx is an error for us
94 $this->log->err('Error fetching feed entry URL');
//extract the links from the response, sync them with the database and
// record the update on the entry row
98 $urls = $this->extractUrls($entryRow, $res);
99 $this->updateUrls($entryRow, $urls);
100 $this->setUpdated($entryRow, $res);
/**
 * Synchronises the feedentryurls rows of one feed entry with the list of
 * URLs found for it: unknown URLs are inserted, known but inactive ones
 * are updated, and rows whose URL is no longer in the list are updated too.
 * A summary with the counters is logged at the end.
 *
 * NOTE(review): the embedded line numbers skip values; the statements that
 * execute the SQL, the remaining column assignments and the counter
 * increments are not visible here - confirm against the complete file.
 *
 * @param object $entryRow Row from feedentries; fe_id is read
 * @param array  $urls     URLs found for the entry (duplicates allowed)
 */
103 protected function updateUrls($entryRow, $urls)
//load all URL rows already stored for this entry
105 $res = $this->db->query(
106 'SELECT * FROM feedentryurls'
107 . ' WHERE feu_fe_id = ' . $this->db->quote($entryRow->fe_id)
//index the existing rows by URL so they can be looked up with isset()
110 while ($urlRow = $res->fetch(\PDO::FETCH_OBJ)) {
111 $urlRows[$urlRow->feu_url] = $urlRow;
//handle each distinct URL only once
114 $urls = array_unique($urls);
//counters for the summary log line; $items is the number of distinct URLs
116 $new = $updated = $deleted = 0;
117 $items = count($urls);
119 foreach ($urls as $url) {
120 if (!isset($urlRows[$url])) {
121 //URL is not known - insert it
123 'INSERT INTO feedentryurls SET'
124 . ' feu_fe_id = ' . $this->db->quote($entryRow->fe_id)
125 . ', feu_url = ' . $this->db->quote($url)
128 . ', feu_updated = NOW()'
131 } else if ($urlRows[$url]->feu_active == 0) {
132 //URL is known already, but was once deleted and is back now
134 'UPDATE feedentryurls SET'
136 . ', feu_updated = NOW()'
137 . ' WHERE feu_id = ' . $this->db->quote($urlRows[$url]->feu_id)
//handled: remove it so it is not treated as vanished below
140 unset($urlRows[$url]);
142 //already known, all fine
143 unset($urlRows[$url]);
//whatever is still in $urlRows was not matched by any URL in $urls
147 //these URLs are in DB but not on the page anymore
148 foreach ($urlRows as $urlRow) {
151 'UPDATE feedentryurls SET'
153 . ', feu_updated = NOW()'
154 . ' WHERE feu_id = ' . $this->db->quote($urlRow->feu_id)
//format string and arguments of the summary log message
158 'Feed entry #%d: %d new, %d updated, %d deleted of %d URLs',
159 $entryRow->fe_id, $new, $updated, $deleted, $items
/**
 * Parses the response body into a DOM document, selects the link elements
 * of the h-entry/hentry via XPath and resolves each href against the
 * entry URL.
 *
 * NOTE(review): the embedded line numbers skip values; parts of the XPath
 * expression, the collection of the resolved URLs and the return statement
 * are not visible here. The caller passes the result on as a list of URLs.
 *
 * @param object                  $entryRow Row from feedentries; fe_url is read
 * @param \HTTP_Request2_Response $res      Response containing the entry document
 */
163 protected function extractUrls($entryRow, \HTTP_Request2_Response $res)
165 $doc = new \DOMDocument();
//media type is the part of the content-type header before the first ";"
// NOTE(review): the value is neither trimmed nor lower-cased before the
// comparison below
166 $typeParts = explode(';', $res->getHeader('content-type'));
167 $type = $typeParts[0];
//XML media types are parsed with loadXML(); loadHTML() is presumably the
// fallback branch for everything else
168 if ($type == 'application/xhtml+xml'
169 || $type == 'application/xml'
170 || $type == 'text/xml'
172 $doc->loadXML($res->getBody());
174 $doc->loadHTML($res->getBody());
//prefix "h" maps to the XHTML namespace so "h:a" can match namespaced anchors
177 $xpath = new \DOMXPath($doc);
178 $xpath->registerNamespace('h', 'http://www.w3.org/1999/xhtml');
179 // all links in e-content AND u-in-reply-to links
//first path: anchors with a non-fragment href inside the entry's content element
180 $query = '//*[' . $this->xpc('h-entry') . ' or ' . $this->xpc('hentry') . ']'
181 . '//*[' . $this->xpc('e-content') . ' or ' . $this->xpc('entry-content') . ']'
182 . '//*[(self::a or self::h:a) and @href and not(starts-with(@href, "#"))]'
//second path: anchors inside the entry that carry the u-in-reply-to class
184 . '//*[' . $this->xpc('h-entry') . ' or ' . $this->xpc('hentry') . ']'
186 . '(self::a or self::h:a) and @href and not(starts-with(@href, "#"))'
187 . 'and ' . $this->xpc('u-in-reply-to')
190 $links = $xpath->query($query);
191 $this->log->info('%d links found', $links->length);
//hrefs are resolved relative to the entry's own URL
193 $entryUrl = new \Net_URL2($entryRow->fe_url);
194 //FIXME: base URL in html code
197 foreach ($links as $link) {
198 $url = (string)$entryUrl->resolve(
199 $link->attributes->getNamedItem('href')->nodeValue
201 $this->log->info('URL in entry: ' . $url);
/**
 * Builds an XPath condition that checks for a CSS class name.
 *
 * The class attribute is whitespace-normalised and padded with spaces, so
 * the name only matches as a complete, space-separated token.
 *
 * @param string $class Class name to look for
 *
 * @return string XPath contains() expression
 */
protected function xpc($class)
{
    $paddedClassAttribute = 'concat(" ", normalize-space(@class), " "),';
    $paddedClassName      = '" ' . $class . ' "';

    return 'contains(' . $paddedClassAttribute . $paddedClassName . ')';
}
/**
 * Clears the fe_needs_update flag of a feed entry without touching
 * any other column.
 *
 * @param object $entryRow Row from feedentries; fe_id is read
 *
 * @return void
 */
protected function setNoUpdate($entryRow)
{
    $entryId = $this->db->quote($entryRow->fe_id);
    $sql     = 'UPDATE feedentries SET fe_needs_update = 0'
        . ' WHERE fe_id = ' . $entryId;

    $this->db->exec($sql);
}
/**
 * Marks a feed entry as fetched.
 *
 * Clears fe_needs_update and stores the time from the response's
 * Last-Modified header in fe_updated, formatted as "Y-m-d H:i:s" in GMT.
 * When the header is missing or cannot be parsed, the current time is
 * stored instead: strtotime() returns false in that case, and formatting
 * false would write "1970-01-01 00:00:00" into the row.
 *
 * @param object                  $entryRow Row from feedentries; fe_id is read
 * @param \HTTP_Request2_Response $res      Response of the entry request
 *
 * @return void
 */
protected function setUpdated($entryRow, \HTTP_Request2_Response $res)
{
    $lastModified = $res->getHeader('last-modified');
    $timestamp    = is_string($lastModified)
        ? strtotime($lastModified)
        : false;
    if ($timestamp === false) {
        //no usable Last-Modified header: fall back to "now"
        $timestamp = time();
    }

    $this->db->exec(
        'UPDATE feedentries'
        . ' SET fe_needs_update = 0'
        . ', fe_updated = ' . $this->db->quote(
            gmdate('Y-m-d H:i:s', $timestamp)
        )
        . ' WHERE fe_id = ' . $this->db->quote($entryRow->fe_id)
    );
}
/**
 * Returns the SQL condition that selects the feed entries to process.
 *
 * With the "force" option enabled the condition is always true, so every
 * entry is selected. Otherwise only entries flagged with fe_needs_update
 * or carrying the all-zero fe_updated timestamp match.
 *
 * @return string SQL expression for use after WHERE (with leading space)
 */
protected function sqlNeedsUpdate()
{
    //!empty() keeps the check silent when the option was never set
    if (!empty($this->deps->options['force'])) {
        return ' 1';
    }

    return ' (fe_needs_update = 1 OR fe_updated = "0000-00-00 00:00:00")';
}