115 lines
3.4 KiB
PHP
115 lines
3.4 KiB
PHP
|
<?php
|
||
|
// Remember when this run began; the elapsed time is logged before the script ends.
$starttime = new DateTime();

// Report every error level except E_WARNING.
error_reporting(E_ALL & ~E_WARNING);

// Load the application bootstrap that lives next to this script's parent directory.
// NOTE(review): presumably this is where CRAWLER_LOG_FILE, CRAWLER_VERSION and the
// feeds / feedItems classes come from -- confirm against init.inc.php.
require __DIR__ . '/../includes/init.inc.php';
|
||
|
|
||
|
// Truncate logfile: once the crawler log has grown past $maxloglen bytes, keep
// only its newest part (at most $maxloglen bytes, starting on a line boundary).
$maxloglen = 1000000; // = 1MB
$fs = filesize(CRAWLER_LOG_FILE);
// filesize() yields false when the file is missing or unreadable; nothing to do then.
if ($fs !== false && $fs > $maxloglen) {
    $fh = fopen(CRAWLER_LOG_FILE, 'r+');
    // fopen() returns false on failure. Passing that on to fseek()/fread() is a
    // fatal TypeError on PHP 8, so skip the truncation instead of aborting the run.
    if ($fh !== false) {
        // Jump to the start of the last $maxloglen bytes ...
        fseek($fh, $fs - $maxloglen);
        // ... and drop the (most likely partial) line we landed in.
        fgets($fh);
        $buf = fread($fh, $maxloglen);
        // Only rewrite the file when the tail was actually read; otherwise the
        // ftruncate() below would wipe the log with nothing to put back.
        if ($buf !== false) {
            ftruncate($fh, 0);
            rewind($fh);
            fwrite($fh, $buf);
        }
        fclose($fh);
    }
}
|
||
|
|
||
|
/**
 * Appends one timestamped entry to the crawler log file.
 *
 * @param string $line Message text; it is prefixed with "[Y-m-d H:i:s] " (current
 *                     time) and terminated with a newline.
 */
function logline($line) {
    $stamp = date('[Y-m-d H:i:s] ');
    $entry = $stamp . $line . "\n";
    // Message type 3 makes error_log() append the text to the given file.
    error_log($entry, 3, CRAWLER_LOG_FILE);
}
|
||
|
|
||
|
// Ask the feeds class which feeds are due for a refresh.
$feeds = feeds::getRefreshList();

// Nothing due: log that, log how long the run took, and stop successfully.
if (empty($feeds)) {
    logline('Nothing to update.');
    // DateInterval::format('%s') only yields the 0-59 seconds *component* of the
    // interval, so compute the total elapsed seconds from the timestamps instead.
    logline('Execution time: ' . (time() - $starttime->getTimestamp()) . ' seconds');
    exit(0);
}
|
||
|
|
||
|
// One cURL handle, configured once here and reused for every feed request.
$c = curl_init();

// Option table; applied below in exactly this order.
$curlSettings = array(
    // Identify the crawler and point site owners to the bot info page.
    CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; FeedizerBot/' . CRAWLER_VERSION . '; +http://feedizer.tigris.fanir.de/page/bot)',
    // Hand the response back as a string instead of printing it.
    CURLOPT_RETURNTRANSFER => true,
    // 10 seconds to establish the connection, 30 seconds for the whole transfer.
    CURLOPT_CONNECTTIMEOUT => 10,
    CURLOPT_TIMEOUT => 30,
    // Abort when slower than 1 kbyte/s for more than 10 seconds.
    CURLOPT_LOW_SPEED_LIMIT => 1000,
    CURLOPT_LOW_SPEED_TIME => 10,
    // Cap the average receive speed at 2 Mbyte/s.
    CURLOPT_MAX_RECV_SPEED_LARGE => 2000000,
    // Follow Location headers, but at most 2 redirects.
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS => 2,
);

// Set the options one by one so a rejected option does not stop the later ones.
foreach ($curlSettings as $curlOpt => $curlVal) {
    curl_setopt($c, $curlOpt, $curlVal);
}
unset($curlSettings, $curlOpt, $curlVal);
|
||
|
|
||
|
/**
 * array_walk() callback that downloads one feed's page with the shared cURL handle.
 *
 * @param array $val Feed entry, taken by reference: its 'uri' element is requested
 *                   and the curl_exec() result is stored in its 'new_html' element
 *                   (curl_exec() returns false when the transfer fails).
 * @param mixed $key Array key passed by array_walk(); not used.
 */
function fetch(&$val, $key) {
    // The handle is configured once at file level and shared by all requests.
    global $c;

    $target = $val['uri'];
    curl_setopt($c, CURLOPT_URL, $target);

    $response = curl_exec($c);
    $val['new_html'] = $response;
}
|
||
|
|
||
|
// Download phase: fetch() stores each feed's response in its 'new_html' element.
// All downloads finish before any result is handed on.
array_walk($feeds, 'fetch');

// Store phase: pass every download to feedItems::newItem(), log the outcome per
// feed slug, and schedule the feed's next refresh.
foreach ($feeds as $feed) {
    $status = feedItems::newItem($feed['id'], $feed['new_html']);

    // Loose comparisons on purpose: they match what a switch statement does.
    if ($status == 0) {
        $suffix = "\tupdated";
    } elseif ($status == 1) {
        $suffix = "\tunchanged";
    } else {
        // Any other return value is reported as a failure.
        $suffix = "\tFAILED!";
    }
    logline($feed['slug'] . $suffix);

    // The next refresh is scheduled regardless of the outcome above.
    feeds::updateNextRefresh($feed['id']);
}
|
||
|
|
||
|
|
||
|
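/* #VAR2: disabled alternative that fetches all feeds in parallel via curl_multi (unfinished: the results are only var_dump()ed)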
|
||
|
$c = array();
|
||
|
$cm = curl_multi_init();
|
||
|
$cm_running = null;
|
||
|
|
||
|
$cnt = 0;
|
||
|
foreach ($feeds as $i => $feed) {
|
||
|
$c[$i] = curl_init($feed['uri']);
|
||
|
|
||
|
// duh.
|
||
|
curl_setopt($c[$i], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; FeedizerBot/' . CRAWLER_VERSION . '; +http://feedizer.tigris.fanir.de/page/bot)');
|
||
|
// return the transfer instead of print()ing
|
||
|
curl_setopt($c[$i], CURLOPT_RETURNTRANSFER, true);
|
||
|
// 10 sec connection-timeout, 30 sec total timeout
|
||
|
curl_setopt($c[$i], CURLOPT_CONNECTTIMEOUT, 10);
|
||
|
curl_setopt($c[$i], CURLOPT_TIMEOUT, 30);
|
||
|
// Abort if slower than 1 kbyte/s for more than 10 seconds
|
||
|
curl_setopt($c[$i], CURLOPT_LOW_SPEED_LIMIT, 1000);
|
||
|
curl_setopt($c[$i], CURLOPT_LOW_SPEED_TIME, 10);
|
||
|
// max average receive speed of 2 Mbyte/s
|
||
|
curl_setopt($c[$i], CURLOPT_MAX_RECV_SPEED_LARGE, 2000000);
|
||
|
// Follow max. 2 redirects (Location-header)
|
||
|
curl_setopt($c[$i], CURLOPT_FOLLOWLOCATION, true);
|
||
|
curl_setopt($c[$i], CURLOPT_MAXREDIRS, 2);
|
||
|
|
||
|
curl_multi_add_handle($cm, $c[$i]);
|
||
|
}
|
||
|
|
||
|
do {
|
||
|
curl_multi_exec($cm, $cm_running);
|
||
|
curl_multi_select($cm);
|
||
|
do {
|
||
|
$cm_info = curl_multi_info_read($cm, $cm_queuelen);
|
||
|
var_dump($cm_info);
|
||
|
} while ($cm_queuelen > 0);
|
||
|
} while ($cm_running > 0);
|
||
|
*/
|
||
|
|
||
|
// Log the total run time in seconds. DateInterval::format('%s') would only give
// the 0-59 seconds component (a 75 second run would be logged as 15), so the
// elapsed time is computed from the timestamps instead.
logline('Execution time: ' . (time() - $starttime->getTimestamp()));
|