initial commit
This commit is contained in:
commit
1cc4bf3572
254 changed files with 63622 additions and 0 deletions
htdocs/scripts
114
htdocs/scripts/crawler.php
Normal file
114
htdocs/scripts/crawler.php
Normal file
|
@ -0,0 +1,114 @@
|
|||
<?php
|
||||
// Remember when the crawler started so the run time can be logged later.
$starttime = new DateTime();

// Report everything except warnings.
error_reporting(E_ALL & ~E_WARNING);

require(dirname(__FILE__) . '/../includes/init.inc.php');

// Truncate logfile: once it has grown beyond $maxloglen bytes, keep only
// the newest part of it (at most $maxloglen bytes, starting on a line
// boundary).
$maxloglen = 1000000; // = 1MB
$fs = filesize(CRAWLER_LOG_FILE);
// filesize() yields false when the file cannot be stat'ed; nothing to trim then.
if ($fs !== false && $fs > $maxloglen) {
    $fh = fopen(CRAWLER_LOG_FILE, 'r+');
    // Warnings are suppressed above, so a failed fopen() must be checked
    // explicitly instead of handing false to the stream functions below.
    if ($fh !== false) {
        // Jump to the start of the tail that should survive ...
        fseek($fh, $fs - $maxloglen);
        // ... and drop the (most likely partial) line the seek landed in.
        fgets($fh);
        $buf = fread($fh, $maxloglen);
        // Only rewrite the file when the tail was actually read; otherwise
        // truncating would throw away the whole log.
        if ($buf !== false) {
            ftruncate($fh, 0);
            rewind($fh);
            fwrite($fh, $buf);
        }
        fclose($fh);
    }
}
|
||||
|
||||
/**
 * Appends one timestamped line to the crawler log file.
 *
 * The line is prefixed with the current date and time in the form
 * "[Y-m-d H:i:s] " and terminated with a newline.
 *
 * @param string $line Message text to log.
 */
function logline($line) {
    $stamp = date('[Y-m-d H:i:s] ');
    $entry = $stamp . $line . "\n";

    // Message type 3 makes error_log() append to the given destination file.
    error_log($entry, 3, CRAWLER_LOG_FILE);
}
|
||||
|
||||
// Feeds to process in this run, as returned by feeds::getRefreshList().
$feeds = feeds::getRefreshList();

if (empty($feeds)) {
    logline('Nothing to update.');

    // DateInterval::format('%s') only yields the seconds *component* (0-59),
    // so the total number of elapsed seconds is summed from all parts.
    $elapsed = $starttime->diff(new DateTime());
    $elapsedSeconds = $elapsed->days * 86400
        + $elapsed->h * 3600
        + $elapsed->i * 60
        + $elapsed->s;
    logline('Execution time: ' . $elapsedSeconds . ' seconds');

    exit(0);
}
|
||||
|
||||
// Shared cURL handle; fetch() accesses it as the global $c.
$c = curl_init();

// Option => value table, applied one by one in the order listed.
$curlOptions = array(
    // Identify the crawler to the remote server.
    CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; FeedizerBot/' . CRAWLER_VERSION . '; +http://feedizer.tigris.fanir.de/page/bot)',
    // Return the transfer instead of print()ing it.
    CURLOPT_RETURNTRANSFER => true,
    // 10 sec connection timeout, 30 sec total timeout.
    CURLOPT_CONNECTTIMEOUT => 10,
    CURLOPT_TIMEOUT => 30,
    // Abort if slower than 1 kbyte/s for more than 10 seconds.
    CURLOPT_LOW_SPEED_LIMIT => 1000,
    CURLOPT_LOW_SPEED_TIME => 10,
    // Max average receive speed of 2 Mbyte/s.
    CURLOPT_MAX_RECV_SPEED_LARGE => 2000000,
    // Follow max. 2 redirects (Location header).
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS => 2,
);

// Individual curl_setopt() calls, so one rejected option does not stop
// the remaining ones from being applied.
foreach ($curlOptions as $curlOption => $curlValue) {
    curl_setopt($c, $curlOption, $curlValue);
}
unset($curlOptions, $curlOption, $curlValue);
|
||||
|
||||
/**
 * array_walk() callback: downloads one feed's page with the shared cURL handle.
 *
 * Reads the address from $val['uri'] and stores the result of curl_exec()
 * in $val['new_html'] (curl_exec() returns false when the transfer fails).
 *
 * @param array $val Feed entry, modified in place.
 * @param mixed $key Key of the entry; not used.
 */
function fetch(&$val, $key) {
    // The handle is configured once at script level and lives in the global scope.
    $handle = $GLOBALS['c'];

    curl_setopt($handle, CURLOPT_URL, $val['uri']);
    $response = curl_exec($handle);

    $val['new_html'] = $response;
}
|
||||
|
||||
// Download all feeds first; fetch() puts each response into 'new_html'.
array_walk($feeds, 'fetch');

// Then hand every response to feedItems::newItem(), log the outcome and
// let feeds::updateNextRefresh() reschedule the feed.
foreach ($feeds as $feed) {
    $result = feedItems::newItem($feed['id'], $feed['new_html']);

    // Loose comparisons on purpose: they match values the same way
    // switch/case does.
    if ($result == 0) {
        $outcome = "\tupdated";
    } elseif ($result == 1) {
        $outcome = "\tunchanged";
    } else {
        $outcome = "\tFAILED!";
    }
    logline($feed['slug'] . $outcome);

    feeds::updateNextRefresh($feed['id']);
}
|
||||
|
||||
|
||||
/* #VAR2
|
||||
$c = array();
|
||||
$cm = curl_multi_init();
|
||||
$cm_running = null;
|
||||
|
||||
$cnt = 0;
|
||||
foreach ($feeds as $i => $feed) {
|
||||
$c[$i] = curl_init($feed['uri']);
|
||||
|
||||
// duh.
|
||||
curl_setopt($c[$i], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; FeedizerBot/' . CRAWLER_VERSION . '; +http://feedizer.tigris.fanir.de/page/bot)');
|
||||
// return the transfer instead of print()ing
|
||||
curl_setopt($c[$i], CURLOPT_RETURNTRANSFER, true);
|
||||
// 10 sec connection-timeout, 30 sec total timeout
|
||||
curl_setopt($c[$i], CURLOPT_CONNECTTIMEOUT, 10);
|
||||
curl_setopt($c[$i], CURLOPT_TIMEOUT, 30);
|
||||
// Abort if slower than 1 kbyte/s for more than 10 seconds
|
||||
curl_setopt($c[$i], CURLOPT_LOW_SPEED_LIMIT, 1000);
|
||||
curl_setopt($c[$i], CURLOPT_LOW_SPEED_TIME, 10);
|
||||
// max average receive speed of 2 Mbyte/s
|
||||
curl_setopt($c[$i], CURLOPT_MAX_RECV_SPEED_LARGE, 2000000);
|
||||
// Follow max. 2 redirects (Location-header)
|
||||
curl_setopt($c[$i], CURLOPT_FOLLOWLOCATION, true);
|
||||
curl_setopt($c[$i], CURLOPT_MAXREDIRS, 2);
|
||||
|
||||
curl_multi_add_handle($cm, $c[$i]);
|
||||
}
|
||||
|
||||
do {
|
||||
curl_multi_exec($cm, $cm_running);
|
||||
curl_multi_select($cm);
|
||||
do {
|
||||
$cm_info = curl_multi_info_read($cm, $cm_queuelen);
|
||||
var_dump($cm_info);
|
||||
} while ($cm_queuelen > 0);
|
||||
} while ($cm_running > 0);
|
||||
*/
|
||||
|
||||
// Log the total run time in seconds. DateInterval::format('%s') would only
// give the seconds component (0-59) and under-report any run of a minute
// or longer, so the total is summed from all parts of the interval.
$elapsed = $starttime->diff(new DateTime());
$elapsedSeconds = $elapsed->days * 86400
    + $elapsed->h * 3600
    + $elapsed->i * 60
    + $elapsed->s;
logline('Execution time: ' . $elapsedSeconds);
|
Loading…
Add table
Add a link
Reference in a new issue