Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
pbackup/bin/quickcheck.pl
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
132 lines (110 sloc)
3.58 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl
use strict;
use warnings;

# Short host name as printed by `hostname -s`; it is interpolated into the
# per-host project paths below. The script exits with status 1 (silently)
# when the hostname command reports failure.
our $hostname = `/bin/hostname -s`;
exit 1 if $?;
chomp $hostname;

# SQLite database file that the job/stat/upid checks query.
our $DB = "/project/pbackup_$hostname/db/pbackup.db";

# Number of FAIL lines printed so far; incremented by fail().
my $fail_count = 0;
# Report a passed check: prints "OK <message>" on standard output.
#   $_[0] - human-readable description of the check result
sub ok {
    my ($message) = @_;
    print 'OK ' . $message . "\n";
}
# Report a failed check: prints "FAIL <message>" on standard output and
# bumps the file-level failure counter.
#   $_[0] - human-readable description of the check result
sub fail {
    my ($message) = @_;
    print 'FAIL ' . $message . "\n";
    return $fail_count++;
}
# Run one SQL statement through the sqlite3 command-line program.
#   $db  - path of the database file
#   $sql - statement text; handed to sqlite3 as a single argument
#          (list-form pipe open, so no shell is involved)
# Returns everything sqlite3 wrote to standard output as one string.
# An exit status of 5 is retried once per second; the sub dies when
#   - sqlite3 cannot be started,
#   - sqlite3 ends with any other non-zero wait status, or
#   - the status is still 5 on the 60th attempt.
sub sqlite3 {
    my ($db, $sql) = @_;
    my $attempts = 0;
    while (1) {
        # Lexical handle: a bareword handle would be a package global
        # shared with every other sub that opens a pipe.
        open my $pipe, '-|', 'sqlite3', $db, $sql or die "$!\n";
        my $output = join('', <$pipe>);
        close $pipe;    # sets $? to the wait status of sqlite3

        return $output unless $?;
        ($? >> 8) == 5 or die "sqlite3 failed\n";
        ++$attempts < 60 or die "database locked timeout\n";
        warn "(sleep and retry)\n";
        sleep 1;
    }
}
# Check that the pbackup worker processes are present in the process list.
# Scans the command lines printed by `ps -Aoargs` for
# ".../perl ...pbackup <mode>" and counts each of the three modes.
# Reports OK/FAIL for: expire >= 1, balance >= 1, do_jobs >= 3.
# Dies if ps cannot be started.
sub check_processes {
    my %running = (expire => 0, balance => 0, do_jobs => 0);

    # List-form open runs ps directly; the lexical handle and named line
    # variable avoid touching the package-global handle and $_.
    open my $ps, '-|', 'ps', '-Aoargs' or die "$!\n";
    while (my $command_line = <$ps>) {
        if ($command_line =~ m"^\S*/perl \S*pbackup (balance|do_jobs|expire)") {
            $running{$1}++;
        }
    }
    close $ps;

    my $backup = $running{do_jobs};
    $running{expire}  >= 1 ? ok("expire running once")  : fail("expire not running");
    $running{balance} >= 1 ? ok("balance running once") : fail("balance not running");
    $backup >= 3
        ? ok("backup running three times")
        : fail("backup not running three times (no: $backup)");
}
# Return the free space of the file system holding $path, in 1 KiB blocks.
#   $path - any path on the file system of interest
# Runs `df -kP`: -k selects 1024-byte units and -P selects the POSIX output
# format, which prints each file system on exactly one line. Without -P,
# df may wrap a long device name onto its own line, so the "Available"
# column would no longer be field 4 of the second output line.
# Returns the "Available" column of the first data line. If df produced no
# usable data line, a warning is emitted and 0 is returned.
# Dies if df cannot be started.
sub df {
    my ($path) = @_;

    open my $pipe, '-|', 'df', '-kP', $path or die "$0: $!\n";
    my @lines = <$pipe>;    # read everything so df never sees a closed pipe
    close $pipe;            # exit status is covered by the validation below

    # $lines[0] is the column header; $lines[1] is the data line.
    my $data = defined $lines[1] ? $lines[1] : '';
    my (undef, undef, undef, $avail) = split ' ', $data;

    unless (defined $avail && $avail =~ /\A\d+\z/) {
        warn "$0: cannot determine free space for $path\n";
        return 0;
    }
    return $avail;
}
# Check free space on every backup volume /project/pbackup_<host>/data/C*.
# Reports OK/FAIL for:
#   - each volume having at least 2 TB free,
#   - the emptiest volume having more than 5 TB free,
#   - the free space summed over all volumes exceeding 20 TB.
# df() is invoked with `df -k`, i.e. 1 KiB units, so dividing by 1024 three
# times yields TiB. The messages therefore say "TB"; labelling these same
# numbers "GB" would understate them by a factor of 1024.
sub check_space {
    my @volumes = glob("/project/pbackup_$hostname/data/C*");
    my $total_tb   = 0;
    my $largest_tb = 0;

    for my $volume (@volumes) {
        my $free_tb = df("$volume/.") / 1024 / 1024 / 1024;    # KiB -> TiB
        my $shown   = sprintf('%5.2f', $free_tb);
        $free_tb >= 2
            ? ok("$volume over 2 TB ($shown TB)")
            : fail("$volume below 2 TB ($shown TB)");
        $largest_tb = $free_tb if $free_tb > $largest_tb;
        $total_tb += $free_tb;
    }

    my $shown = sprintf('%5.2f', $largest_tb);
    $largest_tb > 5
        ? ok("more than 5 TB free on a volume (max $shown TB free)")
        : fail("no volume has over 5 TB free (max: $shown TB free)");

    $shown = sprintf('%5.2f', $total_tb);
    $total_tb > 20
        ? ok("total free space over 20 TB ($shown TB)")
        : fail("total free space below 20 TB ($shown TB)");
}
# Check how many enabled jobs are currently marked as not ok.
# Counts rows of table "job" with job_enabled=1 and job_ok=0 and reports
# OK when the count is below 25, FAIL otherwise.
sub check_failed_jobs {
    my $query = 'select count(*) from job where job_enabled=1 and job_ok=0';
    my $count = sqlite3($DB, $query);
    chomp $count;

    if ($count < 25) {
        ok("less then 25 active jobs failed ($count failed)");
    }
    else {
        fail("over 25 active jobs failed ($count failed)");
    }
}
# Check the most recent run of one named job.
#   $job_name       - value of job.job_name to look up
#   $expected_files - the last run must have transferred more files than this
# Reads the newest row of table "stat" for the job and reports OK/FAIL for:
#   - the run having started at most 36 hours ago,
#   - stat_files_transferred being greater than $expected_files.
# If the query returns no usable row, a single FAIL is reported instead.
sub check_job {
    my ($job_name, $expected_files) = @_;

    # Compare against a single-quoted SQL string literal (with embedded
    # quotes doubled). A double-quoted token is an identifier in SQL;
    # SQLite only falls back to treating it as a string when no such
    # column exists, and can be built without that fallback.
    (my $sql_name = $job_name) =~ s/'/''/g;
    my $line = sqlite3($DB,
        "select stat_started,stat_files_transferred from stat where stat_job_id in (select job_id from job where job_name='$sql_name') order by stat_started desc limit 1");
    chomp($line);

    my ($started, $files_transferred) = split /\|/, $line;
    unless (defined $started && defined $files_transferred) {
        # No row (or an incomplete one): the age and file-count arithmetic
        # below would run on undefined values.
        fail("no usable stat record found for backup $job_name");
        return;
    }

    my $hours = (time - $started) / 60 / 60;
    my $shown = sprintf('%5.2f', $hours);
    $hours <= 36
        ? ok("backup $job_name is not older than 36 hours ($shown hours)")
        : fail("backup $job_name is older than 36 hours ($shown hours)");
    $files_transferred > $expected_files
        ? ok("last backup $job_name transferred over $expected_files files ($files_transferred files)")
        : fail("last backup $job_name transferred less than $expected_files files ($files_transferred files)");
}
# Check that no entry in table "upid" has been running for over 4 hours.
# Each row is "pid|text|since", where since is compared against time().
# Prints one FAIL per overdue row, or a single OK when there is none.
sub check_progress {
    my $rows    = sqlite3($DB, 'SELECT upid_pid,upid_text,upid_since FROM upid');
    my $overdue = 0;

    for my $row (split "\n", $rows) {
        chomp($row);
        my ($pid, $text, $since) = split '\|', $row;
        my $running_hours = (time - $since) / 60 / 60;
        next unless $running_hours > 4;

        fail("single job running for over 4 hours : $text since " . localtime($since));
        $overdue++;
    }

    ok('no jobs running for over 4 hours') unless $overdue;
}
# Run all checks in a fixed order. Hosts listed in %job_for_host get one
# extra check_job() call with a job name and a minimum file count.
my %job_for_host = (
    void => ['cfdl_imapspool_2', 1800],
    null => ['prj_github',       20],
);

check_processes();
check_space();
check_job(@{ $job_for_host{$hostname} }) if exists $job_for_host{$hostname};
check_failed_jobs();
check_progress();