torque_install/torque/contrib/blcr/restart_script.in

#! /usr/bin/perl
# @configure_input@
################################################################################
#
# Usage: restart_script <session_id> <job_id> <user_id> <group_id> <checkpoint_dir> <checkpoint_name>
#
# This script is invoked by pbs_mom to restart a job.
#
################################################################################
use strict;
use Sys::Syslog;

#
# These variables should be initialized by configure.
#
my $blcr_bindir   = "@BLCR_BINDIR@";
my $torque_bindir = "@bindir@";
my $mom_priv      = "@PBS_SERVER_HOME@/mom_priv";

# Customize the following variables
# Customize the PATH to include location of blcr checkpoint commands
# (e.g. cr_restart), torque client commands (e.g. qalter),
# and pbs_mom checkpoint/restart scripts (e.g. restart_script,run_on_success)
# Every non-empty directory is appended to PATH, in the order listed.
foreach my $extraDir ($blcr_bindir, $torque_bindir, $mom_priv)
{
    next unless $extraDir;
    $ENV{PATH} .= ":$extraDir";
}

# Logging threshold: 0 = none, 1 = fail, 2 = info, 3 = debug
my $logLevel = 3;

logPrint(2, "Invoked: $0 " . join(' ', @ARGV) . "\n");

# Exactly six positional arguments are required; anything else is reported
# through logDie together with the usage text.
my $usage =
  "Usage: $0 <session_id> <job_id> <user_id> <group_id> <checkpoint_dir> <checkpoint_name>\n";
logDie(1, $usage) unless @ARGV == 6;

my ($sessionId, $jobId, $userId, $groupId, $checkpointDir, $checkpointName) = @ARGV;

# Set of cr_restart hooks for flagging the job with various error conditions.
# Each one is a qalter command line that stores the outcome of the restart in
# the job's checkpoint_restart_status attribute.
my $runOnSuccessCmd = "qalter -W checkpoint_restart_status=\"Successfully restarted job from checkpoint\" $jobId";
my $runOnFailTempCmd = "qalter -W checkpoint_restart_status=\"Temporary failure restarting job from checkpoint\" $jobId";
my $runOnFailPermCmd = "qalter -W checkpoint_restart_status=\"Permanent failure restarting job from checkpoint\" $jobId";
my $runOnFailArgsCmd = "qalter -W checkpoint_restart_status=\"Argument failure restarting job from checkpoint\" $jobId";
my $runOnFailEnvCmd = "qalter -W checkpoint_restart_status=\"Environment failure restarting job from checkpoint\" $jobId";
my $runOnFailureCmd = "qalter -W checkpoint_restart_status=\"General failure restarting job from checkpoint\" $jobId";

# Drop privileges to the job owner.  The group ids are switched before the
# user ids, because changing groups normally requires the privileges that
# are given up with the uid.
my $gid = getgrnam($groupId);
logDie(1, "Unable to resolve group id ($groupId)\n") unless defined $gid;
$( = $gid;
$) = $gid;
# NOTE(review): assigning a single gid to $) changes the effective gid only;
# the supplementary group list is left as inherited.  Confirm that pbs_mom
# has already set it appropriately, or assign "$gid $gid" to clear it.
#
# $( and $) read back as space-separated lists whose first element is the
# real / effective gid.  Both must match, otherwise the drop did not take.
my ($realGid)      = split ' ', $(;
my ($effectiveGid) = split ' ', $);
logDie(1, "Unable to set gid: $gid")
  unless $realGid == $gid && $effectiveGid == $gid;

my $uid = getpwnam($userId);
logDie(1, "Unable to resolve user id ($userId)\n") unless defined $uid;
$< = $uid;
$> = $uid;
# Verify the real AND the effective uid; checking only $< would miss a
# failed change of the effective uid.
logDie(1, "Unable to set uid: $uid") unless $< == $uid && $> == $uid;

# Change to the checkpoint directory; cr_restart is run from there with
# $checkpointName as its argument.  The chdir must happen regardless of the
# configured log level, so it carries no statement modifier.
chdir $checkpointDir
  or logDie(1, "Unable to cd to checkpoint dir ($checkpointDir): $!\n");

# Assemble the cr_restart command line.  Each hook is passed single-quoted
# because the hook commands themselves contain double quotes.
my $cmd = "cr_restart";
$cmd .= " --run-on-success='$runOnSuccessCmd'" if $runOnSuccessCmd;
$cmd .= " --run-on-fail-perm='$runOnFailPermCmd'" if $runOnFailPermCmd;
$cmd .= " --run-on-fail-temp='$runOnFailTempCmd'" if $runOnFailTempCmd;
$cmd .= " --run-on-fail-args='$runOnFailArgsCmd'" if $runOnFailArgsCmd;
$cmd .= " --run-on-fail-env='$runOnFailEnvCmd'" if $runOnFailEnvCmd;
$cmd .= " --run-on-failure='$runOnFailureCmd'" if $runOnFailureCmd;
$cmd .= " $checkpointName";
my $restartOutput = `$cmd 2>&1`;
# NOTE(review): $? >> 8 is 0 when the command was terminated by a signal, so
# that case is treated as success below and the script exits 0 -- confirm
# this is what pbs_mom expects.
my $restartRc = $? >> 8;
if ($restartRc)
{
    # logPrint filters on $logLevel itself, so no extra level test is needed.
    logPrint(1, "Subcommand ($cmd) failed with rc=$restartRc:\n$restartOutput");

    # Check whether this was a cr_restart error
    # We need to do this before updating the error with qalter
    my $statCmd    = "qstat -as $jobId";
    my $statOutput = `$statCmd 2>&1`;
    my $statRc     = $? >> 8;
    if ($statRc)
    {
        logPrint(1, "Subcommand ($statCmd) failed with rc=$statRc:\n$statOutput");
    }
    else
    {
        logPrint(3, "Subcommand ($statCmd) yielded rc=$statRc:\n$statOutput");
    }

    # A "failure restarting job" line in the qstat output is one of the
    # statuses written by the run-on-fail hooks above.
    if ($statOutput =~ m/^(.*failure restarting job.*)$/m)
    {
        my $comment = "$1: $restartOutput";

        # Attach restart error message to job
        # Before attaching message we first replace newlines with spaces
        $comment =~ s/\n/ /g;
        # Perl automatically enables a set of special security checks,
        # called taint mode, when it detects its program running with
        # differing real and effective user or group IDs.
        # We must therefore untaint the output.
        if ($comment =~ m/(.*)/) { $comment = $1; }
        # The comment is embedded in a shell double-quoted string below.
        # Backslash-escape the characters that stay special inside double
        # quotes (\ " $ `) so that arbitrary cr_restart output can neither
        # break the quoting nor trigger command substitution.
        (my $shellComment = $comment) =~ s/([\\"\$`])/\\$1/g;
        # Build qalter command
        my $alterCmd    = "qalter -W comment=\"\\\"$shellComment\\\"\" $jobId";
        my $alterOutput = `$alterCmd 2>&1`;
        my $alterRc     = $? >> 8;
        if ($alterRc)
        {
            logPrint(1, "Subcommand ($alterCmd) failed with rc=$alterRc:\n$alterOutput");
        }
        else
        {
            logPrint(3, "Subcommand ($alterCmd) yielded rc=$alterRc:\n$alterOutput");
        }
    }
}
else
{
    logPrint(3, "Subcommand ($cmd) yielded rc=$restartRc:\n$restartOutput");
}

# Exit with exit code of cr_restart command
exit $restartRc;


################################################################################
# logPrint($level, $message)
# Write a message to syslog when $level does not exceed $logLevel
################################################################################
sub logPrint
{
    # $level   - numeric level of this message (1 = fail, 2 = info, 3 = debug);
    #            it is also the index into @severity below.
    # $message - text to log.
    my ($level, $message) = @_;
    my @severity = ('none', 'warning', 'info', 'debug');

    # Messages above the configured threshold are dropped.
    return if $logLevel < $level;

    openlog('restart_script', '', 'user');
    # syslog() interprets its second argument as a printf-style format.
    # The message may contain command output or command-line arguments, so
    # it is passed as data through a literal '%s' rather than as the format.
    syslog($severity[$level], '%s', $message);
    closelog();
}


################################################################################
# logDie($level, $message)
# Write a message (to syslog) and die
################################################################################
sub logDie
{
    # First argument: level handed on to logPrint; second: the message text.
    my $level   = shift;
    my $message = shift;

    # Record the message first, then abort the script with the same text.
    logPrint($level, $message);
    die $message;
}