diff --git a/bin/quickcheck.pl b/bin/quickcheck.pl index ef03b59..0b46c90 100755 --- a/bin/quickcheck.pl +++ b/bin/quickcheck.pl @@ -2,147 +2,143 @@ use strict; use warnings; -our $hostname=`/bin/hostname -s`;$? and exit 1;chomp($hostname); -our $DB="/project/pbackup_$hostname/db/pbackup.db"; +our $hostname = `/bin/hostname -s` ; $? and exit 1 ; chomp($hostname); +our $DB = "/project/pbackup_$hostname/db/pbackup.db"; - - -my $fail_count=0; +my $fail_count = 0; sub ok { - print "OK $_[0]\n"; + print "OK $_[0]\n"; } sub fail { - print "FAIL $_[0]\n"; - $fail_count++; + print "FAIL $_[0]\n"; + $fail_count++; } sub sqlite3 { - my ($db,$sql)=@_; - my $retry=0; - while (1) { - open P,'-|','sqlite3',$db,$sql or die "$!\n";; - my $ret=join('',
); - close P; - $? or return $ret; - $?>>8==5 or die "sqlite3 failed\n"; - ++$retry<60 or die "database locked timeout\n"; - warn "(sleep and retry)\n"; - sleep 1; - } + my ($db, $sql) = @_; + my $retry = 0; + while (1) { + open P, '-|','sqlite3', $db, $sql or die "$!\n"; + my $ret = join('',
); + close P; + $? or return $ret; + $? >> 8 == 5 or die "sqlite3 failed\n"; + ++$retry < 60 or die "database locked timeout\n"; + warn "(sleep and retry)\n"; + sleep 1; + } } sub check_processes { - my ($expire,$balance,$backup)=(0,0,0); - open P,'-|','ps -Aoargs' or die "$!\n"; - while (
) { - if (m"^\S*/perl \S*pbackup (balance|do_jobs|expire)") { - $1 eq 'balance' and $balance++; - $1 eq 'expire' and $expire++; - $1 eq 'do_jobs' and $backup++; - } - } - close P; - - $expire>=1 ? ok("expire running once") : fail("expire not running"); - $balance>=1 ? ok("balance running once") : fail("balance not running"); - $backup>=2 ? ok("backup running two times") : fail ("backup not running two times (no: $backup)"); + my ($expire, $balance, $backup) = (0, 0, 0); + open P, '-|','ps -Aoargs' or die "$!\n"; + while (
) { + if (m"^\S*/perl \S*pbackup (balance|do_jobs|expire)") { + $1 eq 'balance' and $balance++; + $1 eq 'expire' and $expire++; + $1 eq 'do_jobs' and $backup++; + } + } + close P; + + $expire >= 1 ? ok("expire running once") : fail("expire not running"); + $balance >= 1 ? ok("balance running once") : fail("balance not running"); + $backup >= 2 ? ok("backup running two times") : fail ("backup not running two times (no: $backup)"); } sub df { - my ($path)=@_; - my $pid=open P,'-|'; - defined $pid or die "$0: $!\n"; - unless ($pid) { - exec 'df','-k',$path; - die "$0: $!\n"; - } - my $l; - $l=
; - $l=
; - chomp $l; - my ($device,$blocks,$used,$avail,$perc,$ppath)=split " ",$l; - 1 while ($l=readline(*P)); - close P; - return $avail; + my ($path) = @_; + my $pid = open P, '-|'; + defined $pid or die "$0: $!\n"; + unless ($pid) { + exec 'df', '-k', $path; + die "$0: $!\n"; + } + my $l; + $l=
; + $l=
; + chomp $l; + my ($device, $blocks, $used, $avail, $perc, $ppath) = split " ", $l; + 1 while ($l=readline(*P)); + close P; + return $avail; } - sub check_space { - my @vol=; - - my $sum=0; - my $max_free_vol=0; - for my $vol (@vol) { - my $df=df("$vol/.")/1024/1024/1024; - my $dfp=sprintf('%5.2f',$df); - $df>=2 ? ok ("$vol over 2 TB ($dfp TB)") : fail ("$vol below 2 TB ($dfp TB)"); - $df>$max_free_vol and $max_free_vol=$df; - $sum+=$df; - } - my $dfp=sprintf('%5.2f',$max_free_vol); - $max_free_vol>5 ? ok ("more than 5 TB free on a volume (max $dfp TB free)") : fail ("no volume has over 5 TB free (max: $dfp TB free)"); - $dfp=sprintf('%5.2f',$sum); - $sum>20 ? ok ("total free space over 20 TB ($dfp TB)") : fail ("total free space below 20 TB ($dfp TB)"); + my @vol = ; + + my $sum = 0; + my $max_free_vol = 0; + for my $vol (@vol) { + my $df = df("$vol/.") / 1024 / 1024 / 1024; + my $dfp = sprintf('%5.2f', $df); + $df >= 2 ? ok ("$vol over 2 TB ($dfp TB)") : fail ("$vol below 2 TB ($dfp TB)"); + $df > $max_free_vol and $max_free_vol = $df; + $sum += $df; + } + my $dfp = sprintf('%5.2f', $max_free_vol); + $max_free_vol > 5 ? ok ("more than 5 TB free on a volume (max $dfp TB free)") : fail ("no volume has over 5 TB free (max: $dfp TB free)"); + $dfp = sprintf('%5.2f', $sum); + $sum > 20 ? ok ("total free space over 20 TB ($dfp TB)") : fail ("total free space below 20 TB ($dfp TB)"); } sub check_failed_jobs { - my $failed=sqlite3($DB,'select count(*) from job where job_enabled=1 and job_ok=0 and job_name not like "sys_%"'); - chomp($failed); - $failed<25 ? ok("less then 25 active jobs failed ($failed failed)") : fail("over 25 active jobs failed ($failed failed)"); + my $failed = sqlite3($DB,'select count(*) from job where job_enabled=1 and job_ok=0 and job_name not like "sys_%"'); + chomp($failed); + $failed < 25 ? ok("less then 25 active jobs failed ($failed failed)") : fail("over 25 active jobs failed ($failed failed)"); } sub check_job { - my ($job_name,$expected_files) = @_; + my ($job_name, $expected_files) = @_; - my $line=sqlite3($DB,qq'select stat_started,stat_files_transferred from stat where stat_job_id in (select job_id from job where job_name="$job_name") order by stat_started desc limit 1'); - chomp($line); - my ($started,$files_transferred)=split '\|',$line; + my $line = sqlite3($DB,qq'select stat_started,stat_files_transferred from stat where stat_job_id in (select job_id from job where job_name="$job_name") order by stat_started desc limit 1'); + chomp($line); + my ($started, $files_transferred) = split '\|', $line; - my $hours=(time-$started)/60/60; - my $p=sprintf('%5.2f',$hours); - $hours<=36 ? ok("backup $job_name is not older than 36 hours ($p hours)") : fail("backup $job_name is older than 36 hours ($p hours)"); - $files_transferred>=$expected_files ? ok ("last backup $job_name transferred over $expected_files files ($files_transferred files)") : fail ("last backup $job_name transferred less than $expected_files files ($files_transferred files)"); + my $hours = (time - $started) / 60 / 60; + my $p = sprintf('%5.2f', $hours); + $hours <= 36 ? ok("backup $job_name is not older than 36 hours ($p hours)") : fail("backup $job_name is older than 36 hours ($p hours)"); + $files_transferred >= $expected_files ? ok ("last backup $job_name transferred over $expected_files files ($files_transferred files)") : fail ("last backup $job_name transferred less than $expected_files files ($files_transferred files)"); } - sub check_progress { - my $lines=sqlite3($DB,'SELECT upid_pid,upid_text,upid_since FROM upid'); - my $fail; - for my $line (split "\n",$lines) { - chomp($line); - my ($pid,$text,$since)=split '\|',$line; - $text =~ /idle, waiting/ and next; - my $hours=(time - $since) / 60 / 60; - if ($hours > 10) { - if ($text =~ /^BACKUP I/) { - fail("INCREMENTAL job running for over 10 hours: $text since" . localtime($since)); - } elsif ($text =~ /^EXPIRE/) { - fail("EXPIRE job running for over 10 hours: $text since" . localtime($since)); - } - if ($hours > 72) { - if ($text =~ /^BACKUP F/) { - fail("FULL job running for over 3 days $text since" . localtime($since)); - } elsif ($text =~ /^BALANCE/) { - fail ("BALANCE job running for over 3 days : $text since ".localtime($since)); - } elsif ($text =~ /^BACKUP %/) { - fail("REFRESH job running for over 3 days $text since" . localtime($since)); - } - } - } - } - $fail or ok('all known jobs in their time limits'); + my $lines = sqlite3($DB, 'SELECT upid_pid,upid_text,upid_since FROM upid'); + my $fail; + for my $line (split "\n", $lines) { + chomp($line); + my ($pid, $text, $since) = split '\|', $line; + $text =~ /idle, waiting/ and next; + my $hours = (time - $since) / 60 / 60; + if ($hours > 10) { + if ($text =~ /^BACKUP I/) { + fail("INCREMENTAL job running for over 10 hours: $text since" . localtime($since)); + } elsif ($text =~ /^EXPIRE/) { + fail("EXPIRE job running for over 10 hours: $text since" . localtime($since)); + } + if ($hours > 72) { + if ($text =~ /^BACKUP F/) { + fail("FULL job running for over 3 days $text since" . localtime($since)); + } elsif ($text =~ /^BALANCE/) { + fail ("BALANCE job running for over 3 days : $text since ".localtime($since)); + } elsif ($text =~ /^BACKUP %/) { + fail("REFRESH job running for over 3 days $text since" . localtime($since)); + } + } + } + } + $fail or ok('all known jobs in their time limits'); } check_processes(); check_space(); -check_job('cfdl_imapspool_2',1600) if $hostname eq 'done'; -check_job('home_vingron',0) if $hostname eq 'gone'; -check_job('sys_void',10) if $hostname eq 'null'; -check_job('sys_null',10) if $hostname eq 'void'; -check_job('sys_gone',10) if $hostname eq 'done'; -check_job('sys_done',10) if $hostname eq 'gone'; +check_job('cfdl_imapspool_2', 1600) if $hostname eq 'done'; +check_job('home_vingron', 0) if $hostname eq 'gone'; +check_job('sys_void', 10) if $hostname eq 'null'; +check_job('sys_null', 10) if $hostname eq 'void'; +check_job('sys_gone', 10) if $hostname eq 'done'; +check_job('sys_done', 10) if $hostname eq 'gone'; check_failed_jobs(); check_progress();