#!/bin/sh
#
# pbs_mom       This script will start and stop the PBS Mom
#
# chkconfig: 345 95 05
# description: TORQUE/PBS is a versatile batch system for SMPs and clusters
#
#### BEGIN INIT INFO
# Provides: pbs_mom
# Required-Start: $local_fs $network $syslog
# Required-Stop: $local_fs $network $syslog
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Start up the pbs_mom daemon
# Description: pbs_mom is part of a batch scheduler
#              This service starts up the compute node.
### END INIT INFO
#
# Usage: pbs_mom {start|stop|restart|condrestart|try-restart|reload|status|purge}
#
# The script is interpreted by /bin/sh, so only POSIX constructs are used
# (no "==" inside [ ], no brace expansion, no "&>" redirection).

ulimit -n 32768

# Source the library functions (provides daemon, success and failure)
. /etc/rc.d/init.d/functions

# NOTE: customize these variables as needed
SBIN_PATH=@sbindir@
PBS_DAEMON="$SBIN_PATH/pbs_mom"
PBS_HOME=@PBS_HOME@
PBS_ARGS=""
SUBSYS_LOCK="/var/lock/subsys/pbs_mom"

# Site-specific overrides for any of the variables above
if [ -f /etc/sysconfig/pbs_mom ]; then
    . /etc/sysconfig/pbs_mom
fi

# Computed after the sysconfig file so that an overridden PBS_HOME is honored
MOM_LOCK="$PBS_HOME/mom_priv/mom.lock"

# NOTE(review): $previous is presumably exported by init during runlevel
# changes and unset for manual invocations -- confirm for the target distro.
if [ -z "$previous" ]; then
    # being run manually, don't disturb jobs
    args="$args -p"
else
    args="$args -q"
fi

# pidof_pbs_mom
#   Echo the PID of the master pbs_mom process and return 0 if such a process
#   is running.  Otherwise echo nothing and return 1.
#   Globals: MOM_LOCK (read); lockfile_info, parent_mom_pid, fd, fd_info (set)
pidof_pbs_mom() {
    # device:inode of the lock file; failure means the file does not exist
    lockfile_info=$(stat -Lc "%d:%i" "$MOM_LOCK" 2>/dev/null) || return 1

    # now we are sure the $MOM_LOCK file exists
    parent_mom_pid=$(cat "$MOM_LOCK" 2>/dev/null)

    # An empty or non-numeric value cannot be a PID; bail out before it is
    # interpolated into a regular expression and a /proc path.
    case "$parent_mom_pid" in
        '' | *[!0-9]*) return 1 ;;
    esac

    # the recorded PID must belong to a process named pbs_mom
    pidof pbs_mom | grep -E "(^| )${parent_mom_pid}( |$)" >/dev/null 2>&1 \
        || return 1

    # The parent pbs_mom should hold the $MOM_LOCK file open (the lock is
    # typically acquired early in the pbs_mom process), so look for an open
    # file descriptor that refers to the same device:inode as the lock file.
    for fd in /proc/"${parent_mom_pid}"/fd/*; do
        fd_info=$(stat -Lc "%d:%i" "$fd" 2>/dev/null)
        if [ "x${fd_info}" = "x${lockfile_info}" ]; then
            echo "$parent_mom_pid"
            return 0
        fi
    done
    return 1
}

# kill_pbs_mom
#   Try to terminate the master pbs_mom process if such a process is running.
#   Returns 0 when no master process is found, when the process has gone
#   away, or when "momctl -s" reports success; returns 1 if five attempts,
#   one second apart, did not achieve that.
#   Globals: SBIN_PATH (read); pid, retval, i (set)
kill_pbs_mom() {
    pid=$(pidof_pbs_mom) || return 0

    retval=1
    for i in 1 2 3 4 5; do
        # kill -0 only probes for existence; no signal is delivered
        kill -0 "$pid" >/dev/null 2>&1 || return 0
        "$SBIN_PATH/momctl" -s && return 0
        sleep 1
    done
    return $retval
}

# how were we called
case "$1" in
    start)
        echo -n "Starting TORQUE Mom: "
        # check if pbs_mom is already running
        stat "$SUBSYS_LOCK" >/dev/null 2>&1
        lock_present=$?
        if pid=$(pidof_pbs_mom); then
            if [ "$lock_present" -eq 0 ]; then
                echo -n "pbs_mom already running (pid $pid)"
            else
                # daemon is up but the lock is missing: recreate it
                touch "$SUBSYS_LOCK" && echo -n "pbs_mom running (pid $pid)"
            fi
            RET=$?
            if [ "$RET" -eq 0 ]; then
                success
                echo
                exit 0
            fi
        fi

        # ulimit -c unlimited  # Uncomment this to preserve core files
        # $args and $PBS_ARGS are intentionally unquoted: each may hold
        # several whitespace-separated options.
        daemon "$PBS_DAEMON" $args -d "$PBS_HOME" $PBS_ARGS
        RET=$?
        # only mark the subsystem as running if the daemon really started
        [ "$RET" -eq 0 ] && touch "$SUBSYS_LOCK"
        echo
        ;;
    purge)
        [ -f "$SUBSYS_LOCK" ] && "$0" stop
        echo -n "Starting TORQUE Mom with purge: "
        daemon "$PBS_DAEMON" -r
        RET=$?
        [ "$RET" -eq 0 ] && touch "$SUBSYS_LOCK"
        echo
        ;;
    stop)
        echo -n "Shutting down TORQUE Mom: "
        # check if pbs_mom is running
        if ! pid=$(pidof_pbs_mom); then
            # nothing to stop; drop a stale lock so that "status" does not
            # keep reporting "dead but subsys locked"
            rm -f "$SUBSYS_LOCK"
            echo -n "pbs_mom already stopped"
            success
            echo
            exit 0
        fi
        kill_pbs_mom
        RET=$?
        if [ "$RET" -eq 0 ]; then
            success "shutdown"
            # the lock is released only once the daemon is known to be gone
            rm -f "$SUBSYS_LOCK"
        else
            failure "shutdown"
        fi
        echo
        ;;
    status)
        stat "$SUBSYS_LOCK" >/dev/null 2>&1
        lock_present=$?
        pid=$(pidof_pbs_mom) || pid=-1

        # Return codes per Linux Standard Base (LSB) Core Specification 3.1
        if [ "$pid" -eq -1 ]; then
            if [ "$lock_present" -eq 0 ]; then
                echo -n "pbs_mom dead but subsys locked"
                failure
                echo
                exit 2
            fi
            echo -n "pbs_mom already stopped"
            success
            echo
            exit 3
        fi
        if [ "$lock_present" -eq 0 ]; then
            echo -n "pbs_mom already running"
            success
        else
            echo -n "pbs_mom running but subsys not locked"
            failure
        fi
        echo
        exit 0
        ;;
    restart)
        "$0" stop
        sleep 1
        "$0" start
        RET=$?
        ;;
    condrestart|try-restart)
        # restart only if the service is currently running
        "$0" status || exit 0
        "$0" restart
        RET=$?
        ;;
    reload)
        echo -n "Re-reading TORQUE Mom config file: "
        if pid=$(pidof_pbs_mom); then
            kill -HUP "$pid"
            RET=$?
            if [ "$RET" -eq 0 ]; then
                success "HUP"
            else
                failure "HUP"
            fi
        else
            failure "HUP"
            # LSB exit status 7: program is not running
            RET=7
        fi
        echo
        ;;
    *)
        echo "Usage: pbs_mom {start|stop|restart|reload|status|purge}"
        exit 1
        ;;
esac
exit "$RET"