560 lines
18 KiB
Perl
560 lines
18 KiB
Perl
#!/usr/bin/perl
|
|
# *****************************************************************************
|
|
#
|
|
# Copyright 2011 Zuse Institute Berlin
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
#
|
|
# Please send comments to kallies@zib.de
|
|
#
|
|
# *****************************************************************************
|
|
# Purpose: - called from /etc/init.d/pbs_mom during start actions.
|
|
# - creates /var/spool/torque/mom_priv/mom.layout
|
|
# - creates/modifies /dev/cpuset/torque
|
|
# Prereq: - hwloc >= 1.1, http://www.open-mpi.org/projects/hwloc/
|
|
# - Sys::Hwloc >= 0.09, http://search.cpan.org/~bka/
|
|
# Install: Install this script on each UV rack
|
|
# /opt/torque/Scripts/mom_gencfg root:root -rwxr-xr-x
|
|
# Config: Set MOM_GENCFG=/opt/torque/Scripts/mom_gencfg
|
|
# in /etc/init.d/pbs_mom for UV, execute $MOM_GENCFG before
|
|
# starting the pbs_mom daemon.
|
|
# MOM_GENCFG can be overridden in /etc/sysconfig/pbs_mom.
|
|
# *****************************************************************************
|
|
# $Id: mom_gencfg,v 1.1.2.1 2011/01/17 10:12:46 acountin Exp $
|
|
# *****************************************************************************
|
|
|
|
#
|
|
# *** Instructions for use ***
|
|
#
|
|
# 1. Install hwloc - see contrib/hwloc_install.sh. This should already be done since
|
|
# TORQUE needs hwloc for its cpuset implementation starting in 4.0
|
|
# 2. Install Sys::Hwloc from CPAN
|
|
# 3. Set $PBS_HOME to the proper value if not already set
|
|
# 4. Update the variables in the section 'Config Definitions' Especially update firstNodeId
|
|
# and nodesPerBoard if desired.
|
|
# firstNodeId should be set above 0 if you have a root cpuset that you wish to exclude
|
|
# nodesPerBoard is the number of numa nodes per board. Each node is defined in the
|
|
# directory /sys/devices/system/node, in a subdirectory node<node index>
|
|
# 5. Backup your current file, just in case a variable is set incorrectly or neglected
|
|
# 6. Run this script and enjoy the layout file
|
|
#
|
|
#
|
|
|
|
|
|
use strict;
|
|
|
|
use lib qw(
|
|
/usr/lib/perl5
|
|
/usr/lib/perl5/site_perl
|
|
);
|
|
|
|
use Sys::Hostname;
|
|
use File::Basename;
|
|
use Getopt::Long qw(:config no_ignore_case);
|
|
use autouse 'Pod::Usage' => qw(pod2usage);
|
|
use Sys::Hwloc 0.09;
|
|
|
|
my $progName = basename($0);
|
|
my $hostName = hostname();
|
|
|
|
$SIG{__DIE__} = \&xDie;
|
|
|
|
# ==============================================================================
|
|
# Setup needed before init
|
|
# ==============================================================================
|
|
|
|
BEGIN: {
|
|
die "This script needs at least hwloc-1.1\n" unless HWLOC_XSAPI_VERSION() >= 0x00010100;
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Config definitions
|
|
# ==============================================================================
|
|
|
|
my $hostNames = undef; # hostname pattern to be run on, undef to skip test
|
|
my $cpusetFsName = '/dev/cpuset'; # the name of the cpuset file system
|
|
my $cpusetBaseName = '/torque'; # the name of the parent cpuset of a job's cpuset
|
|
my $mkdirCmd = '/bin/mkdir'; # the path to the mkdir command
|
|
my $catCmd = '/bin/cat'; # the path to the cat command
|
|
my $echoCmd = '/bin/echo'; # the path to the echo command
|
|
my $momCfgDir = 'mom_priv'; # the directory where MOM configs are stored
|
|
my $momLayoutFile = 'mom.layout'; # the name of the MOM layout file
|
|
my $firstNodeId = 0; # ID of 1st NUMA node to be used by Torque (start with 0)
|
|
my $lastNodeId = undef; # ID of last NUMA node to be used (undef means last available)
|
|
my $nodesPerBoard = 1; # number of NUMA nodes per nodeboard
|
|
my %cpusetConf = (
|
|
cpus => undef, # undef means auto-generate
|
|
mems => undef, # undef means auto-generate
|
|
cpu_exclusive => 1, #
|
|
mem_exclusive => 1, #
|
|
);
|
|
my %options = (
|
|
-doLayout => 1, # generate mom.layout
|
|
-withCpus => 1, # include cpus in mom.layout
|
|
-withMems => 1, # include mems in mom.layout
|
|
-doCpuset => 1, # generate/modify /torque cpuset
|
|
-withSmt => 1, # include logical processors running on the same core
|
|
-verbose => undef, # be verbose to STDERR
|
|
-dryRun => undef, # no actions, just tell what would be done
|
|
);
|
|
|
|
# ==============================================================================
|
|
# Command line options
|
|
# ==============================================================================
|
|
|
|
GetOptions(
|
|
"layout!" => \$options{-doLayout},
|
|
"cpus!" => \$options{-withCpus},
|
|
"mems!" => \$options{-withMems},
|
|
"smt!" => \$options{-withSmt},
|
|
"cpuset!" => \$options{-doCpuset},
|
|
"dry-run!" => \$options{-dryRun},
|
|
"verbose!" => \$options{-verbose},
|
|
"help|?" => sub { usage(0) },
|
|
"man" => sub { manPage() },
|
|
) or usage(2);
|
|
|
|
if($options{-dryRun}) {
|
|
$options{-verbose} = 1 unless defined $options{-verbose};
|
|
xDebug(">>> DryRunDryRunDryRunDryRunDryRun <<<");
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Quick exit if not wanted on this host, or if no work to do
|
|
# ==============================================================================
|
|
|
|
#if(defined $hostNames) {
|
|
# unless($hostName =~ /$hostNames/) {
|
|
# xDebug("--- Don't run on $hostName ---");
|
|
# exit 0;
|
|
# }
|
|
#}
|
|
|
|
exit 0 unless ($options{-doLayout} || $options{-doCpuset});
|
|
|
|
# ==============================================================================
|
|
# See if PBS_HOME is set, and if $PBS_HOME/mom_priv exists.
|
|
# If not, we are probably not called correctly, thus die.
|
|
# See if cpusets are configured. If not, die.
|
|
# ==============================================================================
|
|
|
|
die "\$PBS_HOME not set\n" unless (exists $ENV{PBS_HOME} && $ENV{PBS_HOME});
|
|
die "PBS_HOME=$ENV{PBS_HOME} does not exist\n" unless -d $ENV{PBS_HOME};
|
|
$momCfgDir = "$ENV{PBS_HOME}/${momCfgDir}";
|
|
die "MOM config dir $momCfgDir does not exist\n" unless -d $momCfgDir;
|
|
$momLayoutFile = "${momCfgDir}/${momLayoutFile}";
|
|
die "this system does not support cpusets\n" unless -d $cpusetFsName;
|
|
|
|
# ==============================================================================
|
|
# Figure out system topology, collect wanted node objects
|
|
# ==============================================================================
|
|
|
|
my $topology = Sys::Hwloc::Topology->init;
|
|
die "Failed to init topology\n" unless defined $topology;
|
|
$topology->set_flags(HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);
|
|
die("Failed to load topology\n") if $topology->load;
|
|
|
|
# ==============================================================================
|
|
# Collect nodesets of wanted NUMA nodes per nodeBoard
|
|
# ==============================================================================
|
|
|
|
my @nodeBoards = ();
|
|
my $nodeObj = undef;
|
|
my $nNodes = 0;
|
|
while($nodeObj = $topology->get_next_obj_by_type(HWLOC_OBJ_NODE, $nodeObj)) {
|
|
my $nodeId = $nodeObj->logical_index;
|
|
next if $nodeId < $firstNodeId;
|
|
last if (defined $lastNodeId && $nodeId > $lastNodeId);
|
|
if($nNodes) {
|
|
$nodeBoards[$#nodeBoards]->{nodeset}->or($nodeObj->nodeset);
|
|
} else {
|
|
push @nodeBoards, {
|
|
cpuset => Sys::Hwloc::Bitmap->new,
|
|
nodeset => $nodeObj->nodeset->dup,
|
|
};
|
|
}
|
|
$nNodes++;
|
|
$nNodes = 0 if $nNodes >= $nodesPerBoard;
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Assemble cpusets per nodeBoard
|
|
# ==============================================================================
|
|
|
|
foreach my $nodeBoard (@nodeBoards) {
|
|
$topology->cpuset_from_nodeset_strict($nodeBoard->{cpuset}, $nodeBoard->{nodeset});
|
|
next if $options{-withSmt};
|
|
my $core = undef;
|
|
while($core = $topology->get_next_obj_inside_cpuset_by_type($nodeBoard->{cpuset}, HWLOC_OBJ_CORE, $core)) {
|
|
my $j = 1;
|
|
while (my $pu = $topology->get_obj_inside_cpuset_by_type($core->cpuset, HWLOC_OBJ_PU, $j++)) {
|
|
$nodeBoard->{cpuset}->andnot($pu->cpuset);
|
|
}
|
|
}
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Generate mom.layout
|
|
# ==============================================================================
|
|
|
|
if($options{-doLayout}) {
|
|
|
|
xDebug("--- Generating $momLayoutFile ---");
|
|
if(! $options{-dryRun}) {
|
|
open(FILE, "> $momLayoutFile") or die "failed to open $momLayoutFile: $!\n";
|
|
}
|
|
foreach my $nodeBoard (@nodeBoards) {
|
|
my $line = sprintf("nodes=%s", $nodeBoard->{nodeset}->sprintf_list);
|
|
$line .= sprintf(" cpus=%s", $nodeBoard->{cpuset}->sprintf_list) if $options{-withCpus};
|
|
$line .= sprintf(" mems=%s", $nodeBoard->{nodeset}->sprintf_list) if $options{-withMems};
|
|
xDebug(" $line");
|
|
print FILE "$line\n" unless $options{-dryRun};
|
|
}
|
|
close(FILE) unless $options{-dryRun};
|
|
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Create/modify torque cpuset
|
|
# ==============================================================================
|
|
|
|
if($options{-doCpuset}) {
|
|
|
|
# Create it if it is not there
|
|
my $cpusetPath = "${cpusetFsName}${cpusetBaseName}";
|
|
if(! -d $cpusetPath) {
|
|
xDebug("--- Creating $cpusetPath ---");
|
|
my $rc = execCmd($mkdirCmd,1,$cpusetPath);
|
|
die "Failed to create $cpusetPath\n" unless defined $rc;
|
|
}
|
|
|
|
# Read content
|
|
xDebug("--- Reading $cpusetPath ---");
|
|
my $cpusetData = readCpuset($cpusetPath);
|
|
die "Failed to read $cpusetPath\n" unless defined $cpusetData;
|
|
|
|
# Assemble changes
|
|
my %cpusetMod = ();
|
|
foreach my $key (keys %cpusetConf) {
|
|
next unless exists $cpusetData->{$key};
|
|
my $val = $cpusetConf{$key};
|
|
CASE: {
|
|
$key eq 'cpus' && do {
|
|
if(! defined $val) {
|
|
my $cpuset = Sys::Hwloc::Bitmap->new;
|
|
foreach my $nodeBoard (@nodeBoards) {
|
|
$cpuset->or($nodeBoard->{cpuset});
|
|
}
|
|
$val = $cpuset->sprintf_list;
|
|
$cpuset->free;
|
|
}
|
|
last CASE;
|
|
};
|
|
$key eq 'mems' && do {
|
|
if(! defined $val) {
|
|
my $nodeset = Sys::Hwloc::Bitmap->new;
|
|
foreach my $nodeBoard (@nodeBoards) {
|
|
$nodeset->or($nodeBoard->{nodeset});
|
|
}
|
|
$val = $nodeset->sprintf_list;
|
|
$nodeset->free;
|
|
}
|
|
last CASE;
|
|
};
|
|
}
|
|
next unless defined $val;
|
|
if(
|
|
(! defined $cpusetData->{$key}) ||
|
|
(defined $cpusetData->{$key} && $cpusetData->{$key} ne $val)
|
|
) {
|
|
$cpusetMod{$key} = $val;
|
|
}
|
|
}
|
|
|
|
# Write changes, if any. Don't abort on error, but warn if changes not done
|
|
if(%cpusetMod) {
|
|
xDebug("--- Modifying $cpusetPath ---");
|
|
if($options{-dryRun}) {
|
|
while(my ($key, $val) = each %cpusetMod) {
|
|
xDebug(sprintf(" = cpuset %s: %-25s %s", $cpusetPath, $key, $val));
|
|
}
|
|
} else {
|
|
while(my ($key, $val) = each %cpusetMod) {
|
|
my $out = execCmd($echoCmd, 0, "$val > ${cpusetPath}/$key");
|
|
}
|
|
if($options{-verbose}) {
|
|
$cpusetData = readCpuset($cpusetPath);
|
|
die "Failed to read $cpusetPath\n" unless defined $cpusetData;
|
|
while(my ($key, $val) = each %cpusetMod) {
|
|
xDebug(sprintf(" %s cpuset %s: %-25s %s", $val eq $cpusetData->{$key} ? '=' : '-', $cpusetPath, $key, $val));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
# ==============================================================================
|
|
# All done
|
|
# ==============================================================================
|
|
|
|
$topology->destroy;
|
|
|
|
exit 0;
|
|
|
|
# #############################################################################
|
|
|
|
# ==============================================================================
|
|
# Read cpuset data into a hash, return 0 on error, 1 on success
|
|
# ==============================================================================
|
|
|
|
sub readCpuset {
|
|
my $cpusetPath = shift;
|
|
my $cpusetData = {};
|
|
|
|
# Check if cpuset exists
|
|
unless(-d $cpusetPath) {
|
|
xDebug("ERROR: Cpuset $cpusetPath does not exist.");
|
|
return undef;
|
|
}
|
|
|
|
# Read content of cpuset
|
|
foreach my $key (qw(
|
|
cpu_exclusive
|
|
cpus
|
|
mem_exclusive
|
|
mem_hardwall
|
|
memory_migrate
|
|
memory_pressure
|
|
memory_spread_page
|
|
memory_spread_slab
|
|
mems
|
|
notify_on_release
|
|
sched_load_balance
|
|
sched_relax_domain_level
|
|
)) {
|
|
my $f = "${cpusetPath}/$key";
|
|
next unless -e $f;
|
|
my $rc = execCmd($catCmd,0,$f);
|
|
return undef unless defined $rc; # Command failed
|
|
my $val = undef;
|
|
if(@{$rc}) {
|
|
CASE: {
|
|
$key eq 'tasks' && do { $val = join(",", @{$rc}); last CASE };
|
|
$val = $rc->[0];
|
|
}
|
|
}
|
|
xDebug(sprintf(" cpuset %s: %-25s %s", $cpusetPath, $key, defined $val ? $val : "NO DATA"));
|
|
$cpusetData->{$key} = $val;
|
|
}
|
|
|
|
return $cpusetData;
|
|
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Execute a command with args.
|
|
# Returns arrayref with chomped output on success.
|
|
# On command failure, print error msg and return undef.
|
|
# ==============================================================================
|
|
|
|
sub execCmd {
|
|
my $cmdBase = shift;
|
|
my $verbose = shift;
|
|
my @cmdArgs = @_;
|
|
|
|
if(! $cmdBase) {
|
|
xDebug("ERROR execCmd: need \$cmdBase.");
|
|
return undef;
|
|
}
|
|
|
|
# --
|
|
# Check if cmdBase is executable
|
|
# --
|
|
|
|
if(! -x $cmdBase) {
|
|
xDebug("ERROR: File \"$cmdBase\" does not exist or is not executable.");
|
|
return undef;
|
|
}
|
|
|
|
# --
|
|
# Execute
|
|
# --
|
|
|
|
my $cmd = $cmdBase;
|
|
$cmd .= (" " . join(" ", @cmdArgs)) if @cmdArgs;
|
|
xDebug(" About to execute \"$cmd\"") if $verbose;
|
|
open(CMD, "$cmd 2>&1 |") or do {
|
|
xDebug("ERROR: Failed to execute \"$cmd\": $!");
|
|
return undef;
|
|
};
|
|
|
|
my @cmdOut = (<CMD>);
|
|
chomp @cmdOut;
|
|
|
|
close(CMD);
|
|
my $rc = $? >> 8;
|
|
if($rc) {
|
|
xDebug("ERROR: Command \"$cmd\" returned rc = $rc");
|
|
if(@cmdOut) {
|
|
xDebug(join("\n", map { " $_" } grep { /\S/ } $#cmdOut < 3 ? @cmdOut : (@cmdOut[0..2], "...")));
|
|
}
|
|
return undef;
|
|
}
|
|
|
|
# --
|
|
# Return output
|
|
# --
|
|
|
|
return \@cmdOut;
|
|
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Usage message
|
|
# ==============================================================================
|
|
|
|
sub usage {
|
|
my $code = shift || 0;
|
|
pod2usage(
|
|
-verbose => 0,
|
|
-exitval => "NOEXIT",
|
|
);
|
|
exit $code;
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Man page
|
|
# ==============================================================================
|
|
|
|
sub manPage {
|
|
if ($< == 0) { # Cannot invoke perldoc as root
|
|
my $id = eval { getpwnam("nobody") };
|
|
$id = eval { getpwnam("nouser") } unless defined $id;
|
|
$id = -2 unless defined $id;
|
|
$< = $id;
|
|
}
|
|
$> = $<; # Disengage setuid
|
|
$ENV{PATH} = "/bin:/usr/bin"; # Untaint PATH
|
|
delete @ENV{ 'IFS', 'CDPATH', 'ENV', 'BASH_ENV' };
|
|
if ($0 =~ /^([-\/\w\.]+)$/) {
|
|
$0 = $1; # Untaint $0
|
|
} else {
|
|
die "Illegal characters were found in \$0 ($0)\n";
|
|
}
|
|
pod2usage(
|
|
-verbose => 2,
|
|
-exitval => 0,
|
|
);
|
|
}
|
|
|
|
# ==============================================================================
|
|
# Verbose printing
|
|
# ==============================================================================
|
|
|
|
sub xDebug {
|
|
return unless $options{-verbose};
|
|
my $msg = join("", @_);
|
|
if($msg) {
|
|
foreach(split("\n", $msg)) {
|
|
print STDERR "$progName - $_\n"
|
|
}
|
|
} else {
|
|
print STDERR "$progName - something to debug\n";
|
|
}
|
|
}
|
|
|
|
sub xDie {
|
|
die "$progName - ", @_;
|
|
}
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
mom_gencfg - Create mom.layout and /dev/cpuset/torque, designed to be called from /etc/init.d/pbs_mom
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
mom_gencfg --help|-?|--man
|
|
|
|
mom_gencfg -(no)layout -(no)cpus -(no)mems -(no)cpuset -(no)smt -(no)dry-run -(no)verbose
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This script creates /var/spool/torque/mom_priv/mom.layout and creates/modifies /dev/cpuset/torque
|
|
for a pbs_mom that is compiled with --enable-numa-support.
|
|
|
|
The basic configuration like number and offset of NUMA node IDs per nodeboard,
|
|
cpuset settings, and defaults of command line options is hardcoded in the script.
|
|
|
|
The script checks if I<PBS_HOME> is set in the environment. Usually this should point to
|
|
/var/spool/torque.
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 4
|
|
|
|
=item B<-(no)layout>
|
|
|
|
Create the mom.layout file or not.
|
|
|
|
=item B<-(no)cpus>
|
|
|
|
mom.layout contains cpu IDs per nodeboard or not.
|
|
|
|
=item B<-(no)mems>
|
|
|
|
mom.layout contains memory node IDs per nodeboard or not.
|
|
|
|
=item B<-(no)cpuset>
|
|
|
|
Create/modify /dev/cpuset/torque or not.
|
|
|
|
=item B<-(no)smt>
|
|
|
|
The I<cpus> entry in mom.layout and in /dev/cpuset/torque contain additional
|
|
logical processors running on the same core or not.
|
|
|
|
=item B<-(no)dry-run>
|
|
|
|
If B<-dry-run> is given, show what would have been done. Switches B<-verbose> on, unless B<-noverbose> was given.
|
|
|
|
=item B<-(no)verbose>
|
|
|
|
Verbose printing to STDERR.
|
|
|
|
=item B<-man>
|
|
|
|
Prints this man page.
|
|
|
|
=item B<-help|-?>
|
|
|
|
Prints synopsis.
|
|
|
|
=back
|
|
|
|
=head1 AUTHOR
|
|
|
|
Bernd Kallies, E<lt>kallies@zib.deE<gt>
|
|
|
|
=head1 COPYRIGHT AND LICENSE
|
|
|
|
Copyright (C) 2011 Zuse Institute Berlin
|
|
|
|
This library is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation.
|
|
|
|
=cut
|