ruoyunbai 2bb9621e30 1
2021-09-29 21:06:16 +08:00

560 lines
18 KiB
Perl

#!/usr/bin/perl
# *****************************************************************************
#
# Copyright 2011 Zuse Institute Berlin
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Please send comments to kallies@zib.de
#
# *****************************************************************************
# Purpose: - called from /etc/init.d/pbs_mom during start actions.
# - creates /var/spool/torque/mom_priv/mom.layout
# - creates/modifies /dev/cpuset/torque
# Prereq: - hwloc >= 1.1, http://www.open-mpi.org/projects/hwloc/
# - Sys::Hwloc >= 0.09, http://search.cpan.org/~bka/
# Install: Install this script on each UV rack
# /opt/torque/Scripts/mom_gencfg root:root -rwxr-xr-x
# Config: Set MOM_GENCFG=/opt/torque/Scripts/mom_gencfg
# in /etc/init.d/pbs_mom for UV, execute $MOM_GENCFG before
# starting the pbs_mom daemon.
# MOM_GENCFG can be overridden in /etc/sysconfig/pbs_mom.
# *****************************************************************************
# $Id: mom_gencfg,v 1.1.2.1 2011/01/17 10:12:46 acountin Exp $
# *****************************************************************************
#
# *** Instructions for use ***
#
# 1. Install hwloc - see contrib/hwloc_install.sh. This should already be done since
# TORQUE needs hwloc for its cpuset implementation starting in 4.0
# 2. Install Sys::Hwloc from CPAN
# 3. Set $PBS_HOME to the proper value if not already set
# 4. Update the variables in the section 'Config definitions'. In particular, update
# firstNodeId and nodesPerBoard if desired.
# firstNodeId should be set above 0 if you have a root cpuset that you wish to exclude
# nodesPerBoard is the number of numa nodes per board. Each node is defined in the
# directory /sys/devices/system/node, in a subdirectory node<node index>
# 5. Backup your current file, just in case a variable is set incorrectly or neglected
# 6. Run this script and enjoy the layout file
#
#
use strict;
use lib qw(
/usr/lib/perl5
/usr/lib/perl5/site_perl
);
use Sys::Hostname;
use File::Basename;
use Getopt::Long qw(:config no_ignore_case);
use autouse 'Pod::Usage' => qw(pod2usage);
use Sys::Hwloc 0.09;
# Script name without its directory part; used as prefix of all messages.
my $progName = basename($0);
# Name of the local host (only referenced by the optional host name test).
my $hostName = hostname();
# Route every die() through xDie so that the message carries the script name.
$SIG{__DIE__} = \&xDie;
# ==============================================================================
# Setup needed before init
# ==============================================================================
# NOTE: "BEGIN:" followed by a colon is an ordinary statement label, not a
# compile-phase BEGIN block. The version check therefore runs at run time,
# after the die handler above has been installed.
BEGIN: {
    if (HWLOC_XSAPI_VERSION() < 0x00010100) {
        die "This script needs at least hwloc-1.1\n";
    }
}
# ==============================================================================
# Config definitions
# ==============================================================================
# --- Site settings ------------------------------------------------------------
my $hostNames      = undef;          # pattern of host names to run on, undef skips the test
my $cpusetFsName   = '/dev/cpuset';  # where the cpuset file system lives
my $cpusetBaseName = '/torque';      # parent cpuset of all job cpusets, relative to $cpusetFsName
my $mkdirCmd       = '/bin/mkdir';   # full path of the mkdir command
my $catCmd         = '/bin/cat';     # full path of the cat command
my $echoCmd        = '/bin/echo';    # full path of the echo command
my $momCfgDir      = 'mom_priv';     # MOM config directory, relative to $PBS_HOME
my $momLayoutFile  = 'mom.layout';   # name of the MOM layout file inside $momCfgDir
my $firstNodeId    = 0;              # first NUMA node ID Torque may use (counting from 0)
my $lastNodeId     = undef;          # last NUMA node ID Torque may use, undef = last available
my $nodesPerBoard  = 1;              # how many NUMA nodes form one nodeboard

# --- Wanted content of the torque cpuset --------------------------------------
# A value of undef for cpus/mems means "derive it from the topology".
my %cpusetConf = (
    'cpus'          => undef,
    'mems'          => undef,
    'cpu_exclusive' => 1,
    'mem_exclusive' => 1,
);

# --- Defaults of the command line switches ------------------------------------
my %options = (
    '-doLayout' => 1,      # write mom.layout
    '-withCpus' => 1,      # put cpus=... into mom.layout
    '-withMems' => 1,      # put mems=... into mom.layout
    '-doCpuset' => 1,      # create/modify the /torque cpuset
    '-withSmt'  => 1,      # keep all logical processors of a core
    '-verbose'  => undef,  # print progress messages to STDERR
    '-dryRun'   => undef,  # only report what would be done
);
# ==============================================================================
# Command line options
# ==============================================================================
# Every switch is negatable (--layout / --nolayout, ...). A parse error prints
# the synopsis and terminates with exit code 2.
unless (
    GetOptions(
        'layout!'  => \$options{-doLayout},
        'cpus!'    => \$options{-withCpus},
        'mems!'    => \$options{-withMems},
        'smt!'     => \$options{-withSmt},
        'cpuset!'  => \$options{-doCpuset},
        'dry-run!' => \$options{-dryRun},
        'verbose!' => \$options{-verbose},
        'help|?'   => sub { usage(0) },
        'man'      => sub { manPage() },
    )
) {
    usage(2);
}

# A dry run is verbose by default; an explicit --noverbose (defined but false)
# is respected.
if ($options{-dryRun}) {
    if (!defined $options{-verbose}) {
        $options{-verbose} = 1;
    }
    xDebug(">>> DryRunDryRunDryRunDryRunDryRun <<<");
}
# ==============================================================================
# Quick exit if not wanted on this host, or if no work to do
# ==============================================================================
#if(defined $hostNames) {
# unless($hostName =~ /$hostNames/) {
# xDebug("--- Don't run on $hostName ---");
# exit 0;
# }
#}
# Neither the layout file nor the cpuset was requested: nothing to do.
if (!$options{-doLayout} && !$options{-doCpuset}) {
    exit 0;
}
# ==============================================================================
# See if PBS_HOME is set, and if $PBS_HOME/mom_priv exists.
# If not, we are probably not called correctly, thus die.
# See if cpusets are configured. If not, die.
# ==============================================================================
# PBS_HOME must be set to a non-empty value and name an existing directory.
if (!(exists $ENV{PBS_HOME} && $ENV{PBS_HOME})) {
    die "\$PBS_HOME not set\n";
}
if (!-d $ENV{PBS_HOME}) {
    die "PBS_HOME=$ENV{PBS_HOME} does not exist\n";
}

# From here on $momCfgDir and $momLayoutFile hold absolute paths.
$momCfgDir = join('/', $ENV{PBS_HOME}, $momCfgDir);
if (!-d $momCfgDir) {
    die "MOM config dir $momCfgDir does not exist\n";
}
$momLayoutFile = join('/', $momCfgDir, $momLayoutFile);

# The cpuset file system directory must be present.
if (!-d $cpusetFsName) {
    die "this system does not support cpusets\n";
}
# ==============================================================================
# Figure out system topology, collect wanted node objects
# ==============================================================================
# Allocate the hwloc topology context, request the whole-system view, and load
# it. init yields undef on failure; load yields a true value on failure.
my $topology = Sys::Hwloc::Topology->init;
if (!defined $topology) {
    die "Failed to init topology\n";
}
$topology->set_flags(HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);
if ($topology->load) {
    die("Failed to load topology\n");
}
# ==============================================================================
# Collect nodesets of wanted NUMA nodes per nodeBoard
# ==============================================================================
# Each element of @nodeBoards is a hash with two bitmaps:
#   nodeset - the NUMA nodes grouped into this nodeboard
#   cpuset  - starts empty here and is filled in the next section
my @nodeBoards = ();
my $nodeObj    = undef;    # iteration cursor over the NUMA node objects
my $nNodes     = 0;        # nodes already placed on the board being filled
while ($nodeObj = $topology->get_next_obj_by_type(HWLOC_OBJ_NODE, $nodeObj)) {
    my $nodeId = $nodeObj->logical_index;

    # Stay inside the configured [firstNodeId, lastNodeId] window.
    if ($nodeId < $firstNodeId) {
        next;
    }
    if (defined $lastNodeId && $nodeId > $lastNodeId) {
        last;
    }

    if ($nNodes == 0) {
        # First node of a new board: open a new board entry.
        push @nodeBoards, {
            cpuset  => Sys::Hwloc::Bitmap->new,
            nodeset => $nodeObj->nodeset->dup,
        };
    }
    else {
        # Further node of the current board: merge it into the last entry.
        $nodeBoards[-1]->{nodeset}->or($nodeObj->nodeset);
    }

    # The board is complete once it holds $nodesPerBoard nodes.
    $nNodes += 1;
    if ($nNodes >= $nodesPerBoard) {
        $nNodes = 0;
    }
}
# ==============================================================================
# Assemble cpusets per nodeBoard
# ==============================================================================
foreach my $board (@nodeBoards) {
    # Fill the board's cpuset from its nodeset.
    $topology->cpuset_from_nodeset_strict($board->{cpuset}, $board->{nodeset});

    # Without SMT, drop every PU of a core except the one at index 0.
    unless ($options{-withSmt}) {
        my $coreObj = undef;
        while ($coreObj = $topology->get_next_obj_inside_cpuset_by_type($board->{cpuset}, HWLOC_OBJ_CORE, $coreObj)) {
            my $puIndex = 1;
            while (1) {
                my $puObj = $topology->get_obj_inside_cpuset_by_type($coreObj->cpuset, HWLOC_OBJ_PU, $puIndex);
                $puIndex++;
                last unless $puObj;
                $board->{cpuset}->andnot($puObj->cpuset);
            }
        }
    }
}
# ==============================================================================
# Generate mom.layout
# ==============================================================================
# Write one line per nodeboard to the MOM layout file:
#   nodes=<node list> [cpus=<cpu list>] [mems=<node list>]
# In dry-run mode the lines are only reported through xDebug and the file is
# not touched. Any failure to open, write, or close the file is fatal, so that
# a truncated layout file does not go unnoticed.
if ($options{-doLayout}) {
    xDebug("--- Generating $momLayoutFile ---");

    # Lexical handle and 3-arg open: the path is derived from $PBS_HOME and
    # must never be interpreted as an open mode.
    my $layoutFh = undef;
    if (!$options{-dryRun}) {
        open($layoutFh, '>', $momLayoutFile)
            or die "failed to open $momLayoutFile: $!\n";
    }

    foreach my $nodeBoard (@nodeBoards) {
        my $line = sprintf("nodes=%s", $nodeBoard->{nodeset}->sprintf_list);
        $line .= sprintf(" cpus=%s", $nodeBoard->{cpuset}->sprintf_list) if $options{-withCpus};
        $line .= sprintf(" mems=%s", $nodeBoard->{nodeset}->sprintf_list) if $options{-withMems};
        xDebug(" $line");
        next unless defined $layoutFh;
        print {$layoutFh} "$line\n"
            or die "failed to write $momLayoutFile: $!\n";
    }

    # Buffered write errors surface at close time, so check it as well.
    if (defined $layoutFh) {
        close($layoutFh)
            or die "failed to close $momLayoutFile: $!\n";
    }
}
# ==============================================================================
# Create/modify torque cpuset
# ==============================================================================
# Make sure the torque cpuset exists and carries the values wanted by
# %cpusetConf. Only keys whose current value differs are written. Failing to
# create or read the cpuset is fatal; a failing write only produces a warning.
# A dry run reports the pending changes and performs no modification at all.
if ($options{-doCpuset}) {
    my $cpusetPath = "${cpusetFsName}${cpusetBaseName}";
    my $cpusetData = undef;

    # Create it if it is not there
    if (! -d $cpusetPath) {
        xDebug("--- Creating $cpusetPath ---");
        if ($options{-dryRun}) {
            # A dry run must not create the directory, and there is nothing on
            # disk to read. Model the missing cpuset as having every
            # configurable key present but unset, so that all wanted values
            # show up as pending changes below.
            my %unset = ();
            @unset{ keys %cpusetConf } = ();
            $cpusetData = \%unset;
        }
        else {
            my $rc = execCmd($mkdirCmd, 1, $cpusetPath);
            die "Failed to create $cpusetPath\n" unless defined $rc;
        }
    }

    # Read content
    if (!defined $cpusetData) {
        xDebug("--- Reading $cpusetPath ---");
        $cpusetData = readCpuset($cpusetPath);
        die "Failed to read $cpusetPath\n" unless defined $cpusetData;
    }

    # Assemble changes. For cpus/mems a config value of undef means: use the
    # union of the corresponding bitmap over all nodeboards.
    my %autoSource = (cpus => 'cpuset', mems => 'nodeset');
    my %cpusetMod  = ();
    foreach my $key (keys %cpusetConf) {
        next unless exists $cpusetData->{$key};
        my $val = $cpusetConf{$key};
        if (!defined $val && exists $autoSource{$key}) {
            my $union = Sys::Hwloc::Bitmap->new;
            foreach my $nodeBoard (@nodeBoards) {
                $union->or($nodeBoard->{ $autoSource{$key} });
            }
            $val = $union->sprintf_list;
            $union->free;
        }
        next unless defined $val;
        my $current = $cpusetData->{$key};
        if (!defined $current || $current ne $val) {
            $cpusetMod{$key} = $val;
        }
    }

    # Write changes, if any. Don't abort on error, but warn if changes not done
    if (%cpusetMod) {
        xDebug("--- Modifying $cpusetPath ---");
        if ($options{-dryRun}) {
            while (my ($key, $val) = each %cpusetMod) {
                xDebug(sprintf(" = cpuset %s: %-25s %s", $cpusetPath, $key, $val));
            }
        }
        else {
            while (my ($key, $val) = each %cpusetMod) {
                # The redirection is interpreted by the shell that execCmd
                # starts for the command line.
                my $out = execCmd($echoCmd, 0, "$val > ${cpusetPath}/$key");
                warn "$progName - WARNING: failed to set $key to $val in $cpusetPath\n"
                    unless defined $out;
            }
            if ($options{-verbose}) {
                # Read back and mark every key with '=' (value took effect)
                # or '-' (value differs from what was written).
                $cpusetData = readCpuset($cpusetPath);
                die "Failed to read $cpusetPath\n" unless defined $cpusetData;
                while (my ($key, $val) = each %cpusetMod) {
                    xDebug(sprintf(" %s cpuset %s: %-25s %s", $val eq $cpusetData->{$key} ? '=' : '-', $cpusetPath, $key, $val));
                }
            }
        }
    }
}
# ==============================================================================
# All done
# ==============================================================================
# Release the hwloc topology context, then report success to the caller.
$topology->destroy();
exit(0);
# #############################################################################
# ==============================================================================
# Read cpuset data into a hash, return undef on error, a hashref on success
# ==============================================================================
sub readCpuset {
    # Read the control files of one cpuset directory.
    #   $cpusetPath - directory of the cpuset
    # Returns a hashref mapping each control file that exists to the first line
    # of its content (undef if the file produced no output). Returns undef if
    # the directory is missing or if reading any file fails.
    my $cpusetPath = shift;

    if (! -d $cpusetPath) {
        xDebug("ERROR: Cpuset $cpusetPath does not exist.");
        return undef;
    }

    # Control files of interest; files that do not exist are skipped.
    my @controlFiles = qw(
        cpu_exclusive
        cpus
        mem_exclusive
        mem_hardwall
        memory_migrate
        memory_pressure
        memory_spread_page
        memory_spread_slab
        mems
        notify_on_release
        sched_load_balance
        sched_relax_domain_level
    );

    my %content = ();
    foreach my $name (@controlFiles) {
        my $file = "${cpusetPath}/$name";
        -e $file or next;

        my $lines = execCmd($catCmd, 0, $file);
        defined $lines or return undef;    # Command failed

        # Only the first output line is kept.
        my $first = @{$lines} ? $lines->[0] : undef;
        xDebug(sprintf(" cpuset %s: %-25s %s", $cpusetPath, $name, defined $first ? $first : "NO DATA"));
        $content{$name} = $first;
    }
    return \%content;
}
# ==============================================================================
# Execute a command with args.
# Returns arrayref with chomped output on success.
# On command failure, print error msg and return undef.
# ==============================================================================
sub execCmd {
    # Run an external command through the shell and capture its output.
    #   $cmdBase - full path of the executable (must pass the -x test)
    #   $verbose - if true, announce the command line through xDebug
    #   @cmdArgs - arguments, appended to the command line separated by blanks.
    #              They are NOT quoted: callers rely on the shell to interpret
    #              them (e.g. output redirection), so never pass untrusted text.
    # Returns an arrayref with the chomped output lines (stdout and stderr
    # merged) on success. Returns undef, after reporting through xDebug, if
    # the command cannot be started, exits with a non-zero code, is terminated
    # by a signal, or cannot be waited for.
    my $cmdBase = shift;
    my $verbose = shift;
    my @cmdArgs = @_;
    if (! $cmdBase) {
        xDebug("ERROR execCmd: need \$cmdBase.");
        return undef;
    }
    # --
    # Check if cmdBase is executable
    # --
    if (! -x $cmdBase) {
        xDebug("ERROR: File \"$cmdBase\" does not exist or is not executable.");
        return undef;
    }
    # --
    # Execute
    # --
    my $cmd = join(" ", $cmdBase, @cmdArgs);
    xDebug(" About to execute \"$cmd\"") if $verbose;
    # Lexical handle and explicit pipe mode. The single command string still
    # goes through the shell because of the "2>&1" redirection.
    open(my $cmdFh, '-|', "$cmd 2>&1") or do {
        xDebug("ERROR: Failed to execute \"$cmd\": $!");
        return undef;
    };
    my @cmdOut = (<$cmdFh>);
    chomp @cmdOut;
    close($cmdFh);
    # Evaluate the complete wait status. Looking only at the exit code
    # ($? >> 8) would report a command that died from a signal as success.
    my $status = $?;
    if ($status) {
        if ($status == -1) {
            xDebug("ERROR: Command \"$cmd\" could not be waited for: $!");
        }
        elsif ($status & 127) {
            xDebug("ERROR: Command \"$cmd\" was terminated by signal " . ($status & 127));
        }
        else {
            my $rc = $status >> 8;
            xDebug("ERROR: Command \"$cmd\" returned rc = $rc");
        }
        if (@cmdOut) {
            # Show at most the first three output lines.
            xDebug(join("\n", map { " $_" } grep { /\S/ } $#cmdOut < 3 ? @cmdOut : (@cmdOut[0..2], "...")));
        }
        return undef;
    }
    # --
    # Return output
    # --
    return \@cmdOut;
}
# ==============================================================================
# Usage message
# ==============================================================================
sub usage {
    # Print the synopsis taken from the POD of this script and terminate.
    #   $code - exit code of the script (default 0)
    # The synopsis goes to STDOUT when it was asked for (exit code below 2)
    # and to STDERR when it is the reaction to a usage error. With the
    # "NOEXIT" exit value pod2usage would otherwise always pick STDOUT.
    my $code = shift || 0;
    pod2usage(
        -verbose => 0,
        -exitval => "NOEXIT",
        -output  => ($code < 2 ? \*STDOUT : \*STDERR),
    );
    exit $code;
}
# ==============================================================================
# Man page
# ==============================================================================
sub manPage {
# Show the complete manual page (pod2usage at verbosity 2) and exit with
# status 0. Takes no arguments. Before doing so, root privileges are dropped
# and the environment is sanitized; the order of these steps matters.
# Dies if $0 contains characters outside of [-/\w.].
if ($< == 0) { # Cannot invoke perldoc as root
# Pick an unprivileged uid: user "nobody", else "nouser", else -2 as fallback.
my $id = eval { getpwnam("nobody") };
$id = eval { getpwnam("nouser") } unless defined $id;
$id = -2 unless defined $id;
# Change the real uid first ...
$< = $id;
}
# ... then make the effective uid follow the real uid.
$> = $<; # Disengage setuid
$ENV{PATH} = "/bin:/usr/bin"; # Untaint PATH
# Remove environment variables that influence how a shell behaves.
delete @ENV{ 'IFS', 'CDPATH', 'ENV', 'BASH_ENV' };
# Accept the script path only if it consists of harmless characters.
if ($0 =~ /^([-\/\w\.]+)$/) {
$0 = $1; # Untaint $0
} else {
die "Illegal characters were found in \$0 ($0)\n";
}
# NOTE(review): presumably pod2usage hands the page to perldoc at this
# verbosity, which is why the privileges are dropped above - confirm.
pod2usage(
-verbose => 2,
-exitval => 0,
);
}
# ==============================================================================
# Verbose printing
# ==============================================================================
sub xDebug {
    # Print a progress message to STDERR, but only in verbose mode.
    # All arguments are concatenated; every line of the resulting text is
    # printed separately, prefixed with the script name. If the text is empty
    # (or otherwise false), a placeholder message is printed instead.
    return unless $options{-verbose};
    my $text = join("", @_);
    if (!$text) {
        print STDERR "$progName - something to debug\n";
    }
    else {
        foreach my $line (split("\n", $text)) {
            print STDERR "$progName - $line\n";
        }
    }
}
sub xDie {
    # Handler installed in $SIG{__DIE__}: re-raise the error with the script
    # name in front of the original message.
    my @message = @_;
    die join("", "$progName - ", @message);
}
__END__
=head1 NAME
mom_gencfg - Create mom.layout and /dev/cpuset/torque, designed to be called from /etc/init.d/pbs_mom
=head1 SYNOPSIS
mom_gencfg --help|-?|--man
mom_gencfg -(no)layout -(no)cpus -(no)mems -(no)cpuset -(no)smt -(no)dry-run -(no)verbose
=head1 DESCRIPTION
This script creates /var/spool/torque/mom_priv/mom.layout and creates/modifies /dev/cpuset/torque
for a pbs_mom that is compiled with --enable-numa-support.
The basic configuration like number and offset of NUMA node IDs per nodeboard,
cpuset settings, and defaults of command line options is hardcoded in the script.
The script checks if I<PBS_HOME> is set in the environment. Usually this should point to
/var/spool/torque.
=head1 OPTIONS
=over 4
=item B<-(no)layout>
Create the mom.layout file or not.
=item B<-(no)cpus>
mom.layout contains cpu IDs per nodeboard or not.
=item B<-(no)mems>
mom.layout contains memory node IDs per nodeboard or not.
=item B<-(no)cpuset>
Create/modify /dev/cpuset/torque or not.
=item B<-(no)smt>
The I<cpus> entry in mom.layout and in /dev/cpuset/torque contain additional
logical processors running on the same core or not.
=item B<-(no)dry-run>
If B<-dry-run> is given, show what would have been done. Switches B<-verbose> on, unless B<-noverbose> was given.
=item B<-(no)verbose>
Verbose printing to STDERR.
=item B<-man>
Prints this man page.
=item B<-help|-?>
Prints synopsis.
=back
=head1 AUTHOR
Bernd Kallies, E<lt>kallies@zib.deE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2011 Zuse Institute Berlin
This library is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation.
=cut