mourningdove/cgi-bin/DW/Controller/Admin/StatusCheck.pm

295 lines
8.5 KiB
Perl
Raw Normal View History

2026-05-24 01:03:05 +00:00
#!/usr/bin/perl
#
# Authors:
# Mark Smith <mark@dreamwidth.org>
# Afuna <coder.dw@afunamatata.com>
#
# Copyright (c) 2014 by Dreamwidth Studios, LLC.
#
# This program is free software; you may redistribute it and/or modify it under
# the same terms as Perl itself. For a copy of the license, please reference
# 'perldoc perlartistic' or 'perldoc perlgpl'.
package DW::Controller::Admin::StatusCheck;
use strict;
use DW::Controller;
use DW::Routing;
use DW::Template;
use DW::Controller::Admin;
use IO::Socket::INET ();
=head1 NAME
DW::Controller::Admin::StatusCheck - Checks the status of various services
=cut
DW::Routing->register_string( "/admin/healthy", \&healthy_handler, format => 'plain' );
DW::Controller::Admin->register_admin_page(
'/',
path => 'healthy',
ml_scope => '/admin/healthy.tt',
);
DW::Routing->register_string( "/admin/theschwartz", \&theschwartz_handler );
DW::Controller::Admin->register_admin_page(
'/',
path => 'theschwartz',
ml_scope => '/admin/theschwartz.tt',
privs => ['siteadmin:theschwartz']
);
# Returns some healthy-or-not statistics on the site. Intended to be used by
# remote monitoring services and the like. This is supposed to be very
# lightweight, not designed to replace Nagios monitoring in any way.
# Printing as plain text to avoid having to parse HTML cruft
sub healthy_handler {
my ($opts) = @_;
my ( $ok, $rv ) = controller( anonymous => 1 );
return $rv unless $ok;
my $r = $rv->{r};
my ( @pass, @fail );
# check 1) verify databases reachable
my $dbh = LJ::get_db_writer();
if ($dbh) {
my $time = $dbh->selectrow_array('SELECT UNIX_TIMESTAMP()');
if ( !$time || $dbh->err ) {
push @fail, "global writer test query failed";
}
else {
push @pass, "global writer";
}
}
else {
push @fail, "global writer unreachable";
}
# step 2) check all clusters
foreach my $cid (@LJ::CLUSTERS) {
my $dbcm = LJ::get_cluster_master($cid);
if ($dbcm) {
my $time = $dbcm->selectrow_array('SELECT UNIX_TIMESTAMP()');
if ( !$time || $dbcm->err ) {
push @fail, "cluster $cid writer test query failed";
}
else {
push @pass, "cluster $cid writer";
}
}
else {
push @fail, "cluster $cid writer unreachable";
}
}
# verify connectivity to all memcache machines
foreach my $memc (@LJ::MEMCACHE_SERVERS) {
my $sock = IO::Socket::INET->new( PeerAddr => $memc, Timeout => 1 );
if ($sock) {
push @pass, "memcache $memc";
}
else {
push @fail, "memcache $memc";
}
}
# check each mogilefs server
foreach my $mog ( @{ $LJ::MOGILEFS_CONFIG{hosts} || [] } ) {
my $sock = IO::Socket::INET->new( PeerAddr => $mog, Timeout => 1 );
if ($sock) {
push @pass, "mogilefsd $mog";
}
else {
push @fail, "mogilefsd $mog";
}
}
# check each gearman server
foreach my $gm (@LJ::GEARMAN_SERVERS) {
my $sock = IO::Socket::INET->new( PeerAddr => $gm, Timeout => 1 );
if ($sock) {
push @pass, "gearman $gm";
}
else {
push @fail, "gearman $gm";
}
}
# and each Perlbal
foreach my $pb ( values %LJ::PERLBAL_SERVERS ) {
my $sock = IO::Socket::INET->new( PeerAddr => $pb, Timeout => 1 );
if ($sock) {
push @pass, "perlbal $pb";
}
else {
push @fail, "perlbal $pb";
}
}
if ( !LJ::theschwartz() ) {
# no schwartz
}
elsif (scalar( grep { defined $_->{role} } @LJ::THESCHWARTZ_DBS ) > 0
|| scalar(@LJ::THESCHWARTZ_DBS) > 1 )
{
# cannot test, leaving off
}
else {
my $sid = 0;
foreach my $db (@LJ::THESCHWARTZ_DBS) {
my $s_db = DBI->connect( $db->{dsn}, $db->{user}, $db->{pass} );
if ($s_db) {
my $time = $s_db->selectrow_array(
"DESCRIBE " . ( $db->{prefix} ? $db->{prefix} . "_job" : "job" ) );
if ( !$time || $s_db->err ) {
push @fail, "schwartz $sid";
}
else {
push @pass, "schwartz $sid";
}
}
else {
push @fail, "schwartz $sid unreachable";
}
$sid++;
}
}
my $out = '';
if (@fail) {
$out = "status=fail\n\nfailures:\n";
$out .= join( "\n", map { " $_" } @fail ) . "\n";
}
else {
$out = "status=ok\n";
}
if (@pass) {
$out .= "\nokay:\n";
$out .= join( "\n", map { " $_" } @pass ) . "\n";
}
$r->print($out);
return $r->OK;
}
sub theschwartz_handler {
my ($opts) = @_;
my ( $ok, $rv ) = controller( privcheck => ['siteadmin:theschwartz'] );
return $rv unless $ok;
# of course, if TheSchwartz is off...
my $sch = LJ::theschwartz();
return error_ml("/admin/theschwartz.tt.error.noschwartz") unless $sch;
# okay, this is really hacky, and I apologize in advance for inflicting this
# on the codebase. but we have no way of really getting into the database used
# by TheSchwartz without this manual hackery... also, this requires that we not
# be using roled TheSchwartz, or multiple (undefined results)
#
# FIXME: this can be so much better.
return error_ml("/admin/theschwartz.tt.error.config")
if scalar( grep { defined $_->{role} } @LJ::THESCHWARTZ_DBS ) > 0
|| scalar(@LJ::THESCHWARTZ_DBS) > 1;
# do manual connection
my $db = $LJ::THESCHWARTZ_DBS[0];
my $dbr = DBI->connect( $db->{dsn}, $db->{user}, $db->{pass} );
return error_ml("/admin/theschwartz.tt.error.manual") unless $dbr;
# gather status on jobs in the queue
my $job = ( $db->{prefix} || "" ) . "job";
my $funcmap = ( $db->{prefix} || "" ) . "funcmap";
my $jobs = $dbr->selectall_arrayref(
qq{SELECT j.jobid, f.funcname,
FROM_UNIXTIME(j.insert_time), j.insert_time,
FROM_UNIXTIME(j.run_after), j.run_after,
FROM_UNIXTIME(j.grabbed_until), j.grabbed_until,
j.priority
FROM $job j, $funcmap f
WHERE f.funcid = j.funcid
ORDER BY j.insert_time}
);
return error_ml( '/admin/theschwartz.tt.error.jobs', { error => $dbr->errstr } )
if $dbr->err;
# now get the actual data
my @queue;
if ( $jobs && @$jobs ) {
foreach my $job (@$jobs) {
my ( $jid, $fn, $it, $r_it, $ra, $r_ra, $gu, $r_gu, $pr ) = @$job;
my $ago_it = LJ::diff_ago_text($r_it);
my $ago_ra = LJ::diff_ago_text($r_ra);
my $state;
if ( !$r_ra && !$r_gu ) {
$state = 'queued';
}
elsif ($r_gu) {
if ($r_ra) {
$state = 'retrying';
}
else {
$state = 'running';
}
}
elsif ( $r_ra && !$r_gu ) {
if ( $r_ra < time ) {
$state = 'failed at least once, will retry very soon';
}
else {
$state = 'failed at least once, will retry in ' . $ago_ra;
$state =~ s/\s?ago\s?//; # heh
}
}
else {
$state = 'UNKNOWN';
}
$pr ||= 'undefined';
push @queue,
{
jid => $jid,
it => $it,
ago_it => $ago_it,
fn => $fn,
state => $state,
priority => $pr
};
}
}
# gather some status on the last 100 errors.
my $error = ( $db->{prefix} || "" ) . "error";
my $errs = $dbr->selectall_arrayref(
qq{SELECT e.jobid, FROM_UNIXTIME(e.error_time), f.funcname, e.message
FROM $error e, $funcmap f
WHERE f.funcid = e.funcid
ORDER BY e.error_time DESC
LIMIT 100}
);
return error_ml( '/admin/theschwartz.tt.error.recent', { error => $dbr->errstr } )
if $dbr->err;
return DW::Template->render_template(
"admin/theschwartz.tt",
{
queue => \@queue,
recent_errors => $errs,
}
);
}
1;