ruoyunbai 2bb9621e30 1
2021-09-29 21:06:16 +08:00

179 lines
4.2 KiB
Bash

#!/bin/sh
#
# pbs_mom This script will start and stop the PBS Mom
#
# chkconfig: 345 95 05
# description: TORQUE/PBS is a versatile batch system for SMPs and clusters
#
### BEGIN INIT INFO
# Provides: pbs_mom
# Required-Start: $local_fs $network $syslog
# Required-Stop: $local_fs $network $syslog
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Start up the pbs_mom daemon
# Description: pbs_mom is part of a batch scheduler
# This service starts up the compute node.
### END INIT INFO
# Raise the open-file-descriptor limit for this shell and whatever it starts.
ulimit -n 32768

# Load the distribution's init-script helper library.
. /etc/rc.d/init.d/functions

# NOTE: customize these variables as needed.
# Installation paths (the @...@ tokens are filled in when the script is built).
SBIN_PATH=@sbindir@
PBS_HOME=@PBS_HOME@

# Daemon binary, extra daemon arguments, and the subsystem lock file.
PBS_DAEMON="$SBIN_PATH/pbs_mom"
PBS_ARGS=""
SUBSYS_LOCK="/var/lock/subsys/pbs_mom"

# Optional site overrides; anything set above may be replaced here.
[ -f /etc/sysconfig/pbs_mom ] && . /etc/sysconfig/pbs_mom

# Derived after the overrides so that a customized PBS_HOME is honoured.
MOM_LOCK="$PBS_HOME/mom_priv/mom.lock"

# Pick the start-up flag appended to $args depending on whether $previous
# is set in the environment.
if [ -n "$previous" ]; then
    args="$args -q"
else
    # being run manually, don't disturb jobs
    args="$args -p"
fi
pidof_pbs_mom() {
    # Echo the PID of the master pbs_mom process and return 0 if such a
    # process is running. Otherwise echo nothing and return 1.
    #
    # A PID is accepted only when all of the following hold:
    #   1. it is the value stored in $MOM_LOCK,
    #   2. `pidof pbs_mom` lists it,
    #   3. that process has an open file descriptor on the $MOM_LOCK file
    #      itself (same device:inode pair).
    #
    # Globals read:    MOM_LOCK
    # Globals written: lockfile_info, parent_mom_pid, fd_path, fd_info
    #                  (plain POSIX sh has no function-local variables)

    # Identify the lock file by device:inode; failure means it does not exist.
    # $MOM_LOCK is quoted so a PBS_HOME containing whitespace still works.
    lockfile_info=$(stat -Lc "%d:%i" "$MOM_LOCK" 2>/dev/null) || return 1

    # now we are sure the $MOM_LOCK file exists
    parent_mom_pid=$(cat "$MOM_LOCK" 2>/dev/null)

    # An empty or non-numeric value cannot name a process; stop before it is
    # interpolated into a regular expression or a /proc path.
    case "$parent_mom_pid" in
        '' | *[!0-9]*) return 1 ;;
    esac

    # The recorded PID must belong to a process named pbs_mom.
    pidof pbs_mom \
        | grep -E -e "(^| )${parent_mom_pid}( |$)" >/dev/null 2>&1 \
        || return 1

    # the parent pbs_mom should have a lock on the $MOM_LOCK file; look for
    # an open descriptor of that process that resolves to the same file.
    # The shell glob replaces parsing the output of `ls`; an unmatched glob
    # stays literal, stat fails on it, and the comparison is simply false.
    for fd_path in /proc/"${parent_mom_pid}"/fd/*; do
        fd_info=$(stat -Lc "%d:%i" "$fd_path" 2>/dev/null)
        if [ "$fd_info" = "$lockfile_info" ]; then
            echo "$parent_mom_pid"
            return 0
        fi
    done
    return 1
}
kill_pbs_mom() {
    # Try to terminate the master pbs_mom process, if one is running, by
    # invoking `momctl -s`.
    #
    # Returns 0 when no master pbs_mom is found, when the process is already
    # gone, or as soon as one `momctl -s` call succeeds. Returns 1 when five
    # attempts, one second apart, have all failed.
    #
    # Globals read:    SBIN_PATH
    # Globals written: pid, attempts
    pid=$(pidof_pbs_mom) || return 0

    # A counted while-loop is used instead of `for i in {1..5}`: brace
    # expansion is not part of POSIX sh, and this script runs as /bin/sh.
    attempts=0
    while [ "$attempts" -lt 5 ]; do
        # `>/dev/null 2>&1` rather than `&>`: a POSIX shell parses `&>` as
        # "run in background" followed by a separate redirection.
        kill -0 "$pid" >/dev/null 2>&1 || return 0
        "$SBIN_PATH"/momctl -s && return 0
        sleep 1
        attempts=$((attempts + 1))
    done
    return 1
}
# how were we called
#
# Dispatch on the requested action. Every arm either exits on its own or
# leaves the exit status in RET for the final `exit $RET`.
case "$1" in
    start)
        echo -n "Starting TORQUE Mom: "
        # check if pbs_mom is already running
        stat "$SUBSYS_LOCK" >/dev/null 2>&1
        lock_present=$?
        pid=$(pidof_pbs_mom)
        if [ $? -eq 0 ]; then
            if [ $lock_present -eq 0 ]; then
                echo -n "pbs_mom already running (pid $pid)"
            else
                # Daemon is up but the subsys lock is missing: recreate it.
                touch "$SUBSYS_LOCK" && echo -n "pbs_mom running (pid $pid)"
            fi
            RET=$?
            [ $RET -eq 0 ] && success && echo && exit 0
        fi
        # ulimit -c unlimited # Uncomment this to preserve core files
        # $args and $PBS_ARGS are deliberately unquoted: each may carry
        # several space-separated options.
        daemon $PBS_DAEMON $args -d $PBS_HOME $PBS_ARGS
        RET=$?
        # Only claim the subsystem lock when the daemon actually started.
        [ $RET -eq 0 ] && touch "$SUBSYS_LOCK"
        echo
        ;;
    purge)
        # Stop a locked instance first, then start the daemon with -r.
        [ -f "$SUBSYS_LOCK" ] && "$0" stop
        echo -n "Starting TORQUE Mom with purge: "
        daemon $PBS_DAEMON -r
        RET=$?
        [ $RET -eq 0 ] && touch "$SUBSYS_LOCK"
        echo
        ;;
    stop)
        echo -n "Shutting down TORQUE Mom: "
        # check if pbs_mom is running
        pid=$(pidof_pbs_mom)
        if [ $? -ne 0 ]; then
            # Nothing to stop; clear any stale subsys lock so that a later
            # `status` does not report "dead but subsys locked".
            rm -f "$SUBSYS_LOCK"
            echo -n "pbs_mom already stopped"
            success
            echo
            exit 0
        fi
        kill_pbs_mom
        RET=$?
        if [ $RET -eq 0 ]; then
            success "shutdown"
        else
            failure "shutdown"
        fi
        echo
        rm -f "$SUBSYS_LOCK"
        ;;
    status)
        stat "$SUBSYS_LOCK" >/dev/null 2>&1
        lock_present=$?
        pid=$(pidof_pbs_mom)
        if [ $? -ne 0 ]; then
            pid=-1
        fi
        # Return codes per Linux Standard Base (LSB) Core Specification 3.1
        if [ "$pid" -eq -1 ] && [ $lock_present -eq 0 ]; then
            echo -n "pbs_mom dead but subsys locked"
            failure
            echo
            exit 2
        elif [ "$pid" -eq -1 ]; then
            echo -n "pbs_mom already stopped"
            success
            echo
            exit 3
        elif [ $lock_present -eq 0 ]; then
            echo -n "pbs_mom already running"
            success
            echo
            exit 0
        else
            echo -n "pbs_mom running but subsys not locked"
            failure
            echo
            exit 0
        fi
        ;;
    restart)
        "$0" stop
        sleep 1
        "$0" start
        RET=$?
        ;;
    condrestart|try-restart)
        # Restart only if `status` reports the daemon as running.
        "$0" status || exit 0
        "$0" restart
        RET=$?
        ;;
    reload)
        echo -n "Re-reading TORQUE Mom config file: "
        pid=$(pidof_pbs_mom)
        if [ $? -eq 0 ]; then
            kill -HUP "$pid"
            RET=$?
            if [ $RET -eq 0 ]; then
                success "HUP"
            else
                failure "HUP"
            fi
        else
            failure "HUP"
            # LSB exit status 7: program is not running. Without this the
            # script would exit 0 although nothing was reloaded.
            RET=7
        fi
        echo
        ;;
    *)
        echo "Usage: pbs_mom {start|stop|restart|reload|status|purge}"
        exit 1
        ;;
esac
exit $RET