Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 132 lines (110 sloc) 3.58 KB
#! /usr/bin/perl
use strict;
use warnings;
# Short host name of this machine; it selects the per-host paths below.
# The script exits with status 1 if the hostname command fails.
our $hostname=`/bin/hostname -s`;
exit 1 if $?;
chomp($hostname);

# Database file that the checks query through sqlite3().
our $DB="/project/pbackup_$hostname/db/pbackup.db";

# Number of failed checks; incremented by fail().
my $fail_count=0;
# Print the line "OK <message>" for a check that passed.
#   $message - text describing the passed check
sub ok {
    my ($message)=@_;
    print "OK $message\n";
}
# Print the line "FAIL <message>" for a check that did not pass and
# count it in the file-level $fail_count.
#   $message - text describing the failed check
sub fail {
    my ($message)=@_;
    print "FAIL $message\n";
    $fail_count++;
}
# Run one SQL statement through the sqlite3 command line tool and return
# everything it printed on stdout as a single string.
#   $db  - database file handed to sqlite3
#   $sql - SQL text handed to sqlite3
# Exit status 5 from sqlite3 is treated as "try again": the call is
# repeated once per second, and the sub dies with "database locked timeout"
# after the 60th unsuccessful attempt. Any other failure dies at once.
sub sqlite3 {
    my ($db,$sql)=@_;
    my $attempts=0;
    for (;;) {
        open my $pipe,'-|','sqlite3',$db,$sql or die "$!\n";
        my @output=<$pipe>;
        # close() waits for the child and stores its status in $?
        close $pipe;
        return join('',@output) if $?==0;
        die "sqlite3 failed\n" if ($?>>8)!=5;
        die "database locked timeout\n" if ++$attempts>=60;
        warn "(sleep and retry)\n";
        sleep 1;
    }
}
# Count the pbackup worker processes found in the output of "ps -Aoargs"
# and report whether enough of each kind are running: at least one
# "expire", at least one "balance" and at least three "do_jobs".
sub check_processes {
    my %running=(expire=>0,balance=>0,do_jobs=>0);
    open my $ps,'-|','ps -Aoargs' or die "$!\n";
    while (my $args=<$ps>) {
        # only command lines of the form ".../perl ...pbackup <subcommand>" count
        $args=~m{^\S*/perl \S*pbackup (balance|do_jobs|expire)} or next;
        $running{$1}++;
    }
    close $ps;
    if ($running{expire}>=1) {
        ok("expire running once");
    }
    else {
        fail("expire not running");
    }
    if ($running{balance}>=1) {
        ok("balance running once");
    }
    else {
        fail("balance not running");
    }
    if ($running{do_jobs}>=3) {
        ok("backup running three times");
    }
    else {
        fail("backup not running three times (no: $running{do_jobs})");
    }
}
# Return the space available on the filesystem holding $path, as printed in
# the "Available" column of df -k (1 KiB blocks).
#   $path - file or directory on the filesystem to inspect
# Returns undef (after a warning) when df prints no data line.
# Dies when the df command cannot be started.
sub df {
    my ($path)=@_;
    # -P requests the POSIX output format, which keeps each filesystem on a
    # single line; without it a long device name can push the numbers onto
    # the following line. "--" keeps a path starting with "-" from being
    # read as an option.
    open my $pipe,'-|','df','-kP','--',$path or die "$0: $!\n";
    my $header=<$pipe>;
    my $data=<$pipe>;
    # read whatever is left so df can finish writing before the pipe closes
    my @ignored=<$pipe>;
    close $pipe;
    unless (defined $data) {
        warn "$0: no df output for $path\n";
        return;
    }
    chomp $data;
    my ($device,$blocks,$used,$avail)=split " ",$data;
    return $avail;
}
# Check the free space on every volume matching
# /project/pbackup_<hostname>/data/C*.
#
# df() reports 1 KiB blocks, so dividing by 1024 three times gives TiB;
# all thresholds and printed values are therefore labelled TB.
# Three conditions are reported:
#   - each volume has at least 2 TB free
#   - at least one volume has more than 5 TB free
#   - all volumes together have more than 20 TB free
sub check_space {
    my @volumes=glob("/project/pbackup_$hostname/data/C*");
    my $total_free=0;
    my $largest_free=0;
    for my $volume (@volumes) {
        my $avail_kb=df("$volume/.");
        # df() may return undef when it gets no usable output; count that
        # as no free space rather than doing arithmetic on undef
        $avail_kb=0 unless defined $avail_kb;
        my $free_tb=$avail_kb/1024/1024/1024;
        my $shown=sprintf('%5.2f',$free_tb);
        if ($free_tb>=2) {
            ok("$volume over 2 TB ($shown TB)");
        }
        else {
            fail("$volume below 2 TB ($shown TB)");
        }
        $largest_free=$free_tb if $free_tb>$largest_free;
        $total_free+=$free_tb;
    }
    my $shown=sprintf('%5.2f',$largest_free);
    if ($largest_free>5) {
        ok("more than 5 TB free on a volume (max $shown TB free)");
    }
    else {
        fail("no volume has over 5 TB free (max: $shown TB free)");
    }
    $shown=sprintf('%5.2f',$total_free);
    if ($total_free>20) {
        ok("total free space over 20 TB ($shown TB)");
    }
    else {
        fail("total free space below 20 TB ($shown TB)");
    }
}
# Count the rows of table "job" with job_enabled=1 and job_ok=0 and report
# a pass while that count is below 25.
sub check_failed_jobs {
    my $count=sqlite3($DB,'select count(*) from job where job_enabled=1 and job_ok=0');
    chomp $count;
    if ($count<25) {
        ok("less then 25 active jobs failed ($count failed)");
    }
    else {
        fail("over 25 active jobs failed ($count failed)");
    }
}
# Check the most recent "stat" row of one named job.
#   $job_name       - value of job.job_name to look up
#   $expected_files - the last run must have transferred more files than this
# Reports whether the last run started within the past 36 hours and whether
# it transferred more than $expected_files files. A job without any stat
# row is reported as a single failure.
sub check_job {
    my ($job_name,$expected_files) = @_;
    # SQL string literals take single quotes. Double quotes denote
    # identifiers and are only accepted as strings through a legacy SQLite
    # quirk that newer sqlite3 shells disable by default. Embedded single
    # quotes are doubled so the name cannot end the literal early.
    (my $quoted_name=$job_name)=~s/'/''/g;
    my $line=sqlite3($DB,qq{select stat_started,stat_files_transferred from stat where stat_job_id in (select job_id from job where job_name='$quoted_name') order by stat_started desc limit 1});
    chomp($line);
    if ($line eq '') {
        # no row came back: there is nothing to compute an age or a file count from
        fail("backup $job_name has no recorded run");
        return;
    }
    my ($started,$files_transferred)=split /\|/,$line;
    my $hours=(time()-$started)/60/60;
    my $shown=sprintf('%5.2f',$hours);
    if ($hours<=36) {
        ok("backup $job_name is not older than 36 hours ($shown hours)");
    }
    else {
        fail("backup $job_name is older than 36 hours ($shown hours)");
    }
    if ($files_transferred>$expected_files) {
        ok("last backup $job_name transferred over $expected_files files ($files_transferred files)");
    }
    else {
        fail("last backup $job_name transferred less than $expected_files files ($files_transferred files)");
    }
}
# Read every row of table "upid" and report each entry whose upid_since
# timestamp lies more than 4 hours in the past. When no such entry exists
# a single pass line is printed instead.
sub check_progress {
    my $rows=sqlite3($DB,'SELECT upid_pid,upid_text,upid_since FROM upid');
    my $overdue=0;
    for my $row (split "\n",$rows) {
        chomp($row);
        # sqlite3 separates the selected columns with "|"
        my ($pid,$text,$since)=split /\|/,$row;
        next unless (time()-$since)/60/60>4;
        fail("single job running for over 4 hours : $text since ".localtime($since));
        $overdue++;
    }
    $overdue or ok('no jobs running for over 4 hours');
}
# Run all checks in a fixed order; each one prints its own OK/FAIL lines.
check_processes();
check_space();
# Job checks that only apply on particular hosts.
if ($hostname eq 'void') {
    check_job('cfdl_imapspool_2',1800);
}
if ($hostname eq 'null') {
    check_job('prj_github',20);
}
check_failed_jobs();
check_progress();
# NOTE(review): $fail_count is incremented by fail() but never read in the
# visible file, so the exit status does not reflect failed checks - confirm
# whether that is intended.