--- Begin Message ---
- To: Debian Bug Tracking System <submit@bugs.debian.org>
- Subject: unblock: slurm-llnl/16.05.9-1
- From: Mehdi Dogguy <mehdi@debian.org>
- Date: Thu, 27 Apr 2017 00:43:29 +0200
- Message-id: <149324660920.9926.8498200408620641936.reportbug@athena>
Package: release.debian.org
Severity: normal
User: release.debian.org@packages.debian.org
Usertags: unblock
Slurm 16.05.9-1 was uploaded to Unstable a while ago and is a bug
fix release. The diff is large, but it contains many fixes (see the summary
in upstream's NEWS file), and Slurm minor releases have always been
considered safe. Besides, Slurm 16.05.9-1 has stayed in Unstable for a
while now without issues.
Can you please consider unblocking slurm-llnl?
-- System Information:
Debian Release: 9.0
APT prefers testing
APT policy: (990, 'testing'), (500, 'unstable'), (1, 'experimental')
Architecture: amd64
(x86_64)
Foreign Architectures: i386
Kernel: Linux 4.9.0-2-amd64 (SMP w/4 CPU cores)
Locale: LANG=en_US.UTF-8, LC_CTYPE=en_US.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)
diff -Nru slurm-llnl-16.05.8/debian/changelog slurm-llnl-16.05.9/debian/changelog
--- slurm-llnl-16.05.8/debian/changelog 2017-01-07 02:40:23.000000000 +0100
+++ slurm-llnl-16.05.9/debian/changelog 2017-02-03 09:50:02.000000000 +0100
@@ -1,3 +1,10 @@
+slurm-llnl (16.05.9-1) unstable; urgency=medium
+
+ * New upstream release
+ * Overrides spelling-error-in-binary false positives
+
+ -- Gennaro Oliva <oliva.g@na.icar.cnr.it> Fri, 03 Feb 2017 09:50:02 +0100
+
slurm-llnl (16.05.8-1) unstable; urgency=medium
* New upstream release
diff -Nru slurm-llnl-16.05.8/debian/libslurm30.lintian-overrides slurm-llnl-16.05.9/debian/libslurm30.lintian-overrides
--- slurm-llnl-16.05.8/debian/libslurm30.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/libslurm30.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -12,3 +12,4 @@
# This happens because because slurm_job_preempt_mode is contained in
# /usr/sbin/slurmctld and will never be referenced when running sinfo.
hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/debian/libslurmdb30.lintian-overrides slurm-llnl-16.05.9/debian/libslurmdb30.lintian-overrides
--- slurm-llnl-16.05.8/debian/libslurmdb30.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/libslurmdb30.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -12,3 +12,4 @@
# This happens because because slurm_job_preempt_mode is contained in
# /usr/sbin/slurmctld and will never be referenced when running sinfo.
hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/debian/slurm-client-emulator.lintian-overrides slurm-llnl-16.05.9/debian/slurm-client-emulator.lintian-overrides
--- slurm-llnl-16.05.8/debian/slurm-client-emulator.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/slurm-client-emulator.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -1 +1,2 @@
slurm-client-emulator: hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/debian/slurm-client.lintian-overrides slurm-llnl-16.05.9/debian/slurm-client.lintian-overrides
--- slurm-llnl-16.05.8/debian/slurm-client.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/slurm-client.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -1,3 +1,4 @@
slurm-client: manpage-has-errors-from-man
slurm-client: conflicts-with-version
slurm-client: hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/debian/slurmctld.lintian-overrides slurm-llnl-16.05.9/debian/slurmctld.lintian-overrides
--- slurm-llnl-16.05.8/debian/slurmctld.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/slurmctld.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -1,2 +1,3 @@
slurmctld: possible-documentation-but-no-doc-base-registration
slurmctld: hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/debian/slurmdbd.lintian-overrides slurm-llnl-16.05.9/debian/slurmdbd.lintian-overrides
--- slurm-llnl-16.05.8/debian/slurmdbd.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/slurmdbd.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -1 +1,2 @@
slurmdbd: hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/debian/slurmd.lintian-overrides slurm-llnl-16.05.9/debian/slurmd.lintian-overrides
--- slurm-llnl-16.05.8/debian/slurmd.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/slurmd.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -1 +1,2 @@
slurmd: hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/debian/slurm-wlm-emulator.lintian-overrides slurm-llnl-16.05.9/debian/slurm-wlm-emulator.lintian-overrides
--- slurm-llnl-16.05.8/debian/slurm-wlm-emulator.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/slurm-wlm-emulator.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -1 +1,2 @@
slurm-wlm-emulator: hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/debian/sview.lintian-overrides slurm-llnl-16.05.9/debian/sview.lintian-overrides
--- slurm-llnl-16.05.8/debian/sview.lintian-overrides 2017-01-04 23:42:58.000000000 +0100
+++ slurm-llnl-16.05.9/debian/sview.lintian-overrides 2017-02-02 09:41:24.000000000 +0100
@@ -1 +1,2 @@
sview: hardening-no-bindnow
+spelling-error-in-binary
diff -Nru slurm-llnl-16.05.8/doc/html/prolog_epilog.shtml slurm-llnl-16.05.9/doc/html/prolog_epilog.shtml
--- slurm-llnl-16.05.8/doc/html/prolog_epilog.shtml 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/doc/html/prolog_epilog.shtml 2017-01-31 20:56:34.000000000 +0100
@@ -130,7 +130,7 @@
</tr>
</tbody></table>
</center>
-
+<br>
<p>This second table below identifies what prologs and epilogs are available for job
step allocations, when and where they run.</p>
diff -Nru slurm-llnl-16.05.8/doc/html/publications.shtml slurm-llnl-16.05.9/doc/html/publications.shtml
--- slurm-llnl-16.05.8/doc/html/publications.shtml 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/doc/html/publications.shtml 2017-01-31 20:56:34.000000000 +0100
@@ -305,6 +305,8 @@
Yiannis Georgiou and David Glesser (Bull),
Krzysztof Rzadca (University of Warsaw),
Denis Trystram (University Grenoble-Alpes)</li>
+
+<li><a href="SUG14/data_movement.pdf">High Performance Data movement between Lustre and Enterprise storage systems</a>
Aamir Rashid (Terascala)</li>
<li><a href="SUG14/remote_gpu.pdf">Extending Slurm with Support for Remote GPU Virtualization</a>
@@ -775,6 +777,6 @@
Learning Chef: Compute Cluter with Slurm</a>
A Slurm Cookbook by Adam DeConinck</p>
-<p style="text-align:center;">Last modified 29 November 2016</p>
+<p style="text-align:center;">Last modified 12 January 2017</p>
<!--#include virtual="footer.txt"-->
diff -Nru slurm-llnl-16.05.8/doc/html/reset.css slurm-llnl-16.05.9/doc/html/reset.css
--- slurm-llnl-16.05.8/doc/html/reset.css 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/doc/html/reset.css 2017-01-31 20:56:34.000000000 +0100
@@ -6,7 +6,7 @@
b, u, i, center,
ol, ul, li,
fieldset, form, label, legend,
-table, caption, tbody, tfoot, thead, tr, th, td,
+caption, tbody, tfoot, thead, th,
article, aside, canvas, details, embed,
figure, figcaption, footer, header, hgroup,
menu, nav, output, ruby, section, summary,
@@ -44,6 +44,5 @@
}
table {
- border-collapse: collapse;
border-spacing: 0;
}
diff -Nru slurm-llnl-16.05.8/doc/html/style.css slurm-llnl-16.05.9/doc/html/style.css
--- slurm-llnl-16.05.8/doc/html/style.css 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/doc/html/style.css 2017-01-31 20:56:34.000000000 +0100
@@ -23,7 +23,6 @@
.container {
margin: 0 auto;
padding: 0 18px;
- max-width: 1400px;
}
.container--main {
@@ -661,6 +660,7 @@
@media screen and (min-width: 32em) {
.container {
padding: 0 36px;
+ max-width: 100%;
}
}
@@ -673,6 +673,7 @@
.container {
padding: 0 48px;
+ max-width: 90%;
}
.container--main {
@@ -732,7 +733,7 @@
}
.content .container {
- padding: 0 8% 0 8%;
+ padding: 0 0 0 100px;
margin: 0;
}
@@ -772,6 +773,9 @@
/* Extra Large Size */
@media screen and (min-width: 78em) {
-
+ .container {
+ padding: 0 48px;
+ max-width: 90%;
+ }
}
diff -Nru slurm-llnl-16.05.8/META slurm-llnl-16.05.9/META
--- slurm-llnl-16.05.8/META 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/META 2017-01-31 20:56:34.000000000 +0100
@@ -7,8 +7,8 @@
Name: slurm
Major: 16
Minor: 05
- Micro: 8
- Version: 16.05.8
+ Micro: 9
+ Version: 16.05.9
Release: 1
# Include leading zero for all pre-releases
diff -Nru slurm-llnl-16.05.8/NEWS slurm-llnl-16.05.9/NEWS
--- slurm-llnl-16.05.8/NEWS 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/NEWS 2017-01-31 20:56:34.000000000 +0100
@@ -1,6 +1,46 @@
This file describes changes in recent versions of Slurm. It primarily
documents those changes that are of interest to users and administrators.
+* Changes in Slurm 16.05.9
+==========================
+ -- Fix parsing of SBCAST_COMPRESS environment variable in sbcast.
+ -- Change some debug messages to errors in task/cgroup plugin.
+ -- backfill scheduler: Stop trying to determine expected start time for a job
+ after 2 seconds of wall time. This can happen if there are many running jobs
+ and a pending job can not be started soon.
+ -- Improve performance of cr_sort_part_rows() in cons_res plugin.
+ -- CRAY - Fix dealock issue when updating accounting in the slurmctld and
+ scheduling a Datawarp job.
+ -- Correct the job state accounting information for jobs requeued due to burst
+ buffer errors.
+ -- burst_buffer/cray - Avoid "pre_run" operation if not using buffer (i.e.
+ just creating or deleting a persistent burst buffer).
+ -- Fix slurm.spec file support for BlueGene builds.
+ -- Fix missing TRES read lock in acct_policy_job_runnable_pre_select() code.
+ -- Fix debug2 message printing value using wrong array index in
+ _qos_job_runnable_post_select().
+ -- Prevent job timeout on node power up.
+ -- MYSQL - Fix minor memory leak when querying steps and the sql fails.
+ -- Make it so sacctmgr accepts column headers like MaxTRESPU and not MaxTRESP.
+ -- Only look at SLURM_STEP_KILLED_MSG_NODE_ID on startup, to avoid race
+ condition later when looking at a steps env.
+ -- Make backfill scheduler behave like regular scheduler in respect to
+ 'assoc_limit_stop'.
+ -- Allow a lower version client command to talk to a higher version contoller
+ using the multi-cluster options (e.g. squeue -M<clsuter>).
+ -- slurmctld/agent race condition fix: Prevent job launch while PrologSlurmctld
+ daemon is running or node boot in progress.
+ -- MYSQL - Fix a few other minor memory leaks when uncommon failures occur.
+ -- burst_buffer/cray - Fix race condition that could cause multiple batch job
+ launch requests resulting in drained nodes.
+ -- Correct logic to purge old reservations.
+ -- Fix DBD cache restore from previous versions.
+ -- Fix to logic for getting expected start time of existing job ID with
+ explicit begin time that is in the past.
+ -- Clear job's reason of "BeginTime" in a more timely fashion and/or prevents
+ them from being stuck in a PENDING state.
+ -- Make sure acct policy limits imposed on a job are correct after requeue.
+
* Changes in Slurm 16.05.8
==========================
-- Remove StoragePass from being printed out in the slurmdbd log at debug2
diff -Nru slurm-llnl-16.05.8/slurm.spec slurm-llnl-16.05.9/slurm.spec
--- slurm-llnl-16.05.8/slurm.spec 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/slurm.spec 2017-01-31 20:56:34.000000000 +0100
@@ -564,7 +564,6 @@
rm -f $RPM_BUILD_ROOT/%{_libdir}/slurm/auth_none.so
%endif
%if ! %{slurm_with bluegene}
-rm -f $RPM_BUILD_ROOT/%{_libdir}/slurm/job_submit_cnode.so
rm -f $RPM_BUILD_ROOT/%{_libdir}/slurm/libsched_if.so
rm -f $RPM_BUILD_ROOT/%{_libdir}/slurm/libsched_if64.so
rm -f $RPM_BUILD_ROOT/%{_libdir}/slurm/runjob_plugin.so
@@ -877,7 +876,6 @@
%{_sbindir}/slurm_epilog
%{_sbindir}/slurm_prolog
%{_sbindir}/sfree
-%{_libdir}/slurm/job_submit_cnode.so
%config %{_sysconfdir}/bluegene.conf.example
%endif
#############################################################################
diff -Nru slurm-llnl-16.05.8/src/common/slurmdbd_defs.c slurm-llnl-16.05.9/src/common/slurmdbd_defs.c
--- slurm-llnl-16.05.8/src/common/slurmdbd_defs.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/common/slurmdbd_defs.c 2017-01-31 20:56:34.000000000 +0100
@@ -2348,21 +2348,16 @@
need to set it back to 0 */
set_buf_offset(buffer, 0);
safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
- if (remaining_buf(buffer))
- goto unpack_error;
debug3("Version string in dbd_state header is %s", ver_str);
+ unpack_error:
free_buf(buffer);
buffer = NULL;
- unpack_error:
if (ver_str) {
- char curr_ver_str[10];
- snprintf(curr_ver_str, sizeof(curr_ver_str),
- "VER%d", SLURM_PROTOCOL_VERSION);
- if (!xstrcmp(ver_str, curr_ver_str))
- rpc_version = SLURM_PROTOCOL_VERSION;
+ /* get the version after VER */
+ rpc_version = slurm_atoul(ver_str + 3);
+ xfree(ver_str);
}
- xfree(ver_str);
while (1) {
/* If the buffer was not the VER%d string it
was an actual message so we don't want to
diff -Nru slurm-llnl-16.05.8/src/common/slurmdb_pack.c slurm-llnl-16.05.9/src/common/slurmdb_pack.c
--- slurm-llnl-16.05.8/src/common/slurmdb_pack.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/common/slurmdb_pack.c 2017-01-31 20:56:34.000000000 +0100
@@ -809,6 +809,8 @@
goto unpack_error;
safe_unpack16(&object_ptr->rpc_version, buffer);
+ object_ptr->rpc_version = MIN(SLURM_PROTOCOL_VERSION,
+ object_ptr->rpc_version);
safe_unpackstr_xmalloc(&object_ptr->tres_str,
&uint32_tmp, buffer);
} else if (rpc_version >= SLURM_MIN_PROTOCOL_VERSION) {
diff -Nru slurm-llnl-16.05.8/src/plugins/accounting_storage/mysql/as_mysql_convert.c slurm-llnl-16.05.9/src/plugins/accounting_storage/mysql/as_mysql_convert.c
--- slurm-llnl-16.05.8/src/plugins/accounting_storage/mysql/as_mysql_convert.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/accounting_storage/mysql/as_mysql_convert.c 2017-01-31 20:56:34.000000000 +0100
@@ -804,6 +804,7 @@
error("No grp_cpus col name in assoc_table "
"for cluster %s, this should never happen",
cluster_name);
+ mysql_free_result(result);
continue;
}
@@ -899,6 +900,7 @@
if (!(row = mysql_fetch_row(result)) || !row[0] || !row[0][0]) {
error("No count col name for cluster %s, "
"this should never happen", cluster_name);
+ mysql_free_result(result);
continue;
}
diff -Nru slurm-llnl-16.05.8/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c slurm-llnl-16.05.9/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c
--- slurm-llnl-16.05.8/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c 2017-01-31 20:56:34.000000000 +0100
@@ -502,6 +502,7 @@
local_cluster_list = setup_cluster_list_with_inx(
mysql_conn, job_cond, (void **)&curr_cluster);
if (!local_cluster_list) {
+ mysql_free_result(result);
rc = SLURM_ERROR;
goto end_it;
}
@@ -785,6 +786,7 @@
mysql_conn, query, 0))) {
xfree(query);
rc = SLURM_ERROR;
+ mysql_free_result(result);
goto end_it;
}
xfree(query);
diff -Nru slurm-llnl-16.05.8/src/plugins/accounting_storage/mysql/as_mysql_resource.c slurm-llnl-16.05.9/src/plugins/accounting_storage/mysql/as_mysql_resource.c
--- slurm-llnl-16.05.8/src/plugins/accounting_storage/mysql/as_mysql_resource.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/accounting_storage/mysql/as_mysql_resource.c 2017-01-31 20:56:34.000000000 +0100
@@ -312,6 +312,7 @@
if (!(row = mysql_fetch_row(result))) {
error("Resource id %u is not known on the system", res_id);
+ mysql_free_result(result);
return percent_used;
}
@@ -383,6 +384,7 @@
if (!(row = mysql_fetch_row(result))) {
error("Resource id %u is not known on the system", res->id);
+ mysql_free_result(result);
return SLURM_ERROR;
}
@@ -1100,6 +1102,8 @@
if (!query_clusters && !vals) {
xfree(clus_vals);
+ if (result)
+ mysql_free_result(result);
errno = SLURM_NO_CHANGE_IN_DATA;
error("Nothing to change");
return NULL;
diff -Nru slurm-llnl-16.05.8/src/plugins/accounting_storage/mysql/as_mysql_rollup.c slurm-llnl-16.05.9/src/plugins/accounting_storage/mysql/as_mysql_rollup.c
--- slurm-llnl-16.05.8/src/plugins/accounting_storage/mysql/as_mysql_rollup.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/accounting_storage/mysql/as_mysql_rollup.c 2017-01-31 20:56:34.000000000 +0100
@@ -1199,6 +1199,7 @@
mysql_conn,
query, 0))) {
rc = SLURM_ERROR;
+ mysql_free_result(result);
goto end_it;
}
xfree(query);
diff -Nru slurm-llnl-16.05.8/src/plugins/burst_buffer/cray/burst_buffer_cray.c slurm-llnl-16.05.9/src/plugins/burst_buffer/cray/burst_buffer_cray.c
--- slurm-llnl-16.05.8/src/plugins/burst_buffer/cray/burst_buffer_cray.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/burst_buffer/cray/burst_buffer_cray.c 2017-01-31 20:56:34.000000000 +0100
@@ -1205,8 +1205,8 @@
num_instances = 0; /* Redundant, but fixes CLANG bug */
}
sessions = _bb_get_sessions(&num_sessions, &bb_state, timeout);
- slurm_mutex_lock(&bb_state.bb_mutex);
assoc_mgr_lock(&assoc_locks);
+ slurm_mutex_lock(&bb_state.bb_mutex);
bb_state.last_load_time = time(NULL);
for (i = 0; i < num_sessions; i++) {
if (!init_config) {
@@ -1251,8 +1251,8 @@
if (bb_alloc->job_id == 0)
bb_post_persist_create(NULL, bb_alloc, &bb_state);
}
- assoc_mgr_unlock(&assoc_locks);
slurm_mutex_unlock(&bb_state.bb_mutex);
+ assoc_mgr_unlock(&assoc_locks);
_bb_free_sessions(sessions, num_sessions);
_bb_free_instances(instances, num_instances);
@@ -3383,6 +3383,7 @@
/* Run "paths" function, get DataWarp environment variables */
if (_have_dw_cmd_opts(bb_job)) {
+ /* Setup "paths" operation */
if (bb_state.bb_config.validate_timeout)
timeout = bb_state.bb_config.validate_timeout * 1000;
else
@@ -3422,48 +3423,52 @@
}
xfree(resp_msg);
_free_script_argv(script_argv);
- }
- pre_run_argv = xmalloc(sizeof(char *) * 10);
- pre_run_argv[0] = xstrdup("dw_wlm_cli");
- pre_run_argv[1] = xstrdup("--function");
- pre_run_argv[2] = xstrdup("pre_run");
- pre_run_argv[3] = xstrdup("--token");
- xstrfmtcat(pre_run_argv[4], "%u", job_ptr->job_id);
- pre_run_argv[5] = xstrdup("--job");
- xstrfmtcat(pre_run_argv[6], "%s/script", job_dir);
- if (client_nodes_file_nid) {
+ /* Setup "pre_run" operation */
+ pre_run_argv = xmalloc(sizeof(char *) * 10);
+ pre_run_argv[0] = xstrdup("dw_wlm_cli");
+ pre_run_argv[1] = xstrdup("--function");
+ pre_run_argv[2] = xstrdup("pre_run");
+ pre_run_argv[3] = xstrdup("--token");
+ xstrfmtcat(pre_run_argv[4], "%u", job_ptr->job_id);
+ pre_run_argv[5] = xstrdup("--job");
+ xstrfmtcat(pre_run_argv[6], "%s/script", job_dir);
+ if (client_nodes_file_nid) {
#if defined(HAVE_NATIVE_CRAY)
- pre_run_argv[7] = xstrdup("--nidlistfile");
+ pre_run_argv[7] = xstrdup("--nidlistfile");
#else
- pre_run_argv[7] = xstrdup("--nodehostnamefile");
+ pre_run_argv[7] = xstrdup("--nodehostnamefile");
#endif
- pre_run_argv[8] = xstrdup(client_nodes_file_nid);
- }
- pre_run_args = xmalloc(sizeof(pre_run_args_t));
- pre_run_args->args = pre_run_argv;
- pre_run_args->job_id = job_ptr->job_id;
- pre_run_args->timeout = bb_state.bb_config.other_timeout;
- pre_run_args->user_id = job_ptr->user_id;
- if (job_ptr->details) /* Prevent launch until "pre_run" completes */
- job_ptr->details->prolog_running++;
-
- slurm_attr_init(&pre_run_attr);
- if (pthread_attr_setdetachstate(&pre_run_attr, PTHREAD_CREATE_DETACHED))
- error("pthread_attr_setdetachstate error %m");
- while (pthread_create(&pre_run_tid, &pre_run_attr, _start_pre_run,
- pre_run_args)) {
- if (errno != EAGAIN) {
- error("%s: pthread_create: %m", __func__);
- _start_pre_run(pre_run_argv); /* Do in-line */
- break;
+ pre_run_argv[8] = xstrdup(client_nodes_file_nid);
}
- usleep(100000);
- }
- slurm_attr_destroy(&pre_run_attr);
+ pre_run_args = xmalloc(sizeof(pre_run_args_t));
+ pre_run_args->args = pre_run_argv;
+ pre_run_args->job_id = job_ptr->job_id;
+ pre_run_args->timeout = bb_state.bb_config.other_timeout;
+ pre_run_args->user_id = job_ptr->user_id;
+ if (job_ptr->details) { /* Defer launch until completion */
+ job_ptr->details->prolog_running++;
+ job_ptr->job_state |= JOB_CONFIGURING;
+ }
+
+ slurm_attr_init(&pre_run_attr);
+ if (pthread_attr_setdetachstate(&pre_run_attr,
+ PTHREAD_CREATE_DETACHED))
+ error("pthread_attr_setdetachstate error %m");
+ while (pthread_create(&pre_run_tid, &pre_run_attr,
+ _start_pre_run, pre_run_args)) {
+ if (errno != EAGAIN) {
+ error("%s: pthread_create: %m", __func__);
+ _start_pre_run(pre_run_argv); /* Do in-line */
+ break;
+ }
+ usleep(100000);
+ }
+ slurm_attr_destroy(&pre_run_attr);
+}
- xfree(job_dir);
xfree(client_nodes_file_nid);
+ xfree(job_dir);
return rc;
}
@@ -3472,7 +3477,6 @@
{
last_job_update = time(NULL);
job_ptr->end_time = last_job_update;
- job_ptr->job_state = JOB_PENDING | JOB_COMPLETING;
if (hold_job)
job_ptr->priority = 0;
build_cg_bitmap(job_ptr);
@@ -3480,7 +3484,11 @@
job_ptr->state_reason = FAIL_BURST_BUFFER_OP;
xfree(job_ptr->state_desc);
job_ptr->state_desc = xstrdup("Burst buffer pre_run error");
- job_completion_logger(job_ptr, false);
+
+ job_ptr->job_state = JOB_REQUEUE;
+ job_completion_logger(job_ptr, true);
+ job_ptr->job_state = JOB_PENDING | JOB_COMPLETING;
+
deallocate_nodes(job_ptr, false, false, false);
}
@@ -4066,6 +4074,7 @@
error("%s: unable to find job record for job %u",
__func__, create_args->job_id);
}
+ assoc_mgr_lock(&assoc_locks);
slurm_mutex_lock(&bb_state.bb_mutex);
_reset_buf_state(create_args->user_id, create_args->job_id,
create_args->name, BB_STATE_ALLOCATED,
@@ -4074,7 +4083,6 @@
create_args->user_id);
bb_alloc->size = create_args->size;
bb_alloc->pool = xstrdup(create_args->pool);
- assoc_mgr_lock(&assoc_locks);
if (job_ptr) {
bb_alloc->account = xstrdup(job_ptr->account);
if (job_ptr->assoc_ptr) {
@@ -4120,8 +4128,8 @@
}
(void) bb_post_persist_create(job_ptr, bb_alloc, &bb_state);
bb_state.last_update_time = time(NULL);
- assoc_mgr_unlock(&assoc_locks);
slurm_mutex_unlock(&bb_state.bb_mutex);
+ assoc_mgr_unlock(&assoc_locks);
unlock_slurmctld(job_write_lock);
}
xfree(resp_msg);
@@ -4204,6 +4212,9 @@
assoc_mgr_lock_t assoc_locks =
{ READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK,
NO_LOCK, NO_LOCK, NO_LOCK };
+ /* assoc_mgr needs locking to call bb_post_persist_delete */
+ if (bb_alloc)
+ assoc_mgr_lock(&assoc_locks);
slurm_mutex_lock(&bb_state.bb_mutex);
_reset_buf_state(destroy_args->user_id, destroy_args->job_id,
destroy_args->name, BB_STATE_DELETED, 0);
@@ -4216,14 +4227,14 @@
bb_limit_rem(bb_alloc->user_id, bb_alloc->size,
bb_alloc->pool, &bb_state);
- assoc_mgr_lock(&assoc_locks);
(void) bb_post_persist_delete(bb_alloc, &bb_state);
- assoc_mgr_unlock(&assoc_locks);
(void) bb_free_alloc_rec(&bb_state, bb_alloc);
}
bb_state.last_update_time = time(NULL);
slurm_mutex_unlock(&bb_state.bb_mutex);
+ if (bb_alloc)
+ assoc_mgr_unlock(&assoc_locks);
}
xfree(resp_msg);
_free_create_args(destroy_args);
diff -Nru slurm-llnl-16.05.8/src/plugins/checkpoint/blcr/checkpoint_blcr.c slurm-llnl-16.05.9/src/plugins/checkpoint/blcr/checkpoint_blcr.c
--- slurm-llnl-16.05.8/src/plugins/checkpoint/blcr/checkpoint_blcr.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/checkpoint/blcr/checkpoint_blcr.c 2017-01-31 20:56:34.000000000 +0100
@@ -422,30 +422,29 @@
*/
/* set LD_PRELOAD for batch script shell */
- //if (job->batch) {
- old_env = getenvp(job->env, "LD_PRELOAD");
- if (old_env) {
- /* search and replace all libcr_run and libcr_omit
- * the old env value is messed up --
- * it will be replaced */
- while ((ptr = strtok_r(old_env, " :", &save_ptr))) {
- old_env = NULL;
- if (!ptr)
- break;
- if (!xstrncmp(ptr, "libcr_run.so", 12) ||
- !xstrncmp(ptr, "libcr_omit.so", 13))
- continue;
- xstrcat(new_env, ptr);
- xstrcat(new_env, ":");
- }
+ old_env = getenvp(job->env, "LD_PRELOAD");
+ if (old_env) {
+ /* search and replace all libcr_run and libcr_omit
+ * the old env value is messed up --
+ * it will be replaced */
+ while ((ptr = strtok_r(old_env, " :", &save_ptr))) {
+ old_env = NULL;
+ if (!ptr)
+ break;
+ if (!xstrncmp(ptr, "libcr_run.so", 12) ||
+ !xstrncmp(ptr, "libcr_omit.so", 13))
+ continue;
+ xstrcat(new_env, ptr);
+ xstrcat(new_env, ":");
}
- ptr = xstrdup("libcr_run.so");
- if (new_env)
- xstrfmtcat(ptr, ":%s", new_env);
- setenvf(&job->env, "LD_PRELOAD", ptr);
- xfree(new_env);
- xfree(ptr);
- //}
+ }
+ ptr = xstrdup("libcr_run.so");
+ if (new_env)
+ xstrfmtcat(ptr, ":%s", new_env);
+ setenvf(&job->env, "LD_PRELOAD", ptr);
+ xfree(new_env);
+ xfree(ptr);
+
return SLURM_SUCCESS;
}
diff -Nru slurm-llnl-16.05.8/src/plugins/sched/backfill/backfill.c slurm-llnl-16.05.9/src/plugins/sched/backfill/backfill.c
--- slurm-llnl-16.05.8/src/plugins/sched/backfill/backfill.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/sched/backfill/backfill.c 2017-01-31 20:56:34.000000000 +0100
@@ -135,6 +135,7 @@
static int max_backfill_job_per_user = 0;
static int max_backfill_jobs_start = 0;
static bool backfill_continue = false;
+static bool assoc_limit_stop = false;
static int defer_rpc_cnt = 0;
static int sched_timeout = SCHED_TIMEOUT;
static int yield_sleep = YIELD_SLEEP;
@@ -639,6 +640,13 @@
backfill_continue = false;
}
+ if (sched_params && (strstr(sched_params, "assoc_limit_stop"))) {
+ assoc_limit_stop = true;
+ } else {
+ assoc_limit_stop = false;
+ }
+
+
if (sched_params &&
(tmp_ptr = strstr(sched_params, "bf_yield_interval="))) {
sched_timeout = atoi(tmp_ptr + 18);
@@ -1127,7 +1135,8 @@
}
if (!acct_policy_job_runnable_state(job_ptr) &&
- !acct_policy_job_runnable_pre_select(job_ptr))
+ (!assoc_limit_stop ||
+ !acct_policy_job_runnable_pre_select(job_ptr)))
continue;
job_no_reserve = 0;
@@ -1847,8 +1856,19 @@
power_g_job_start(job_ptr);
if (job_ptr->batch_flag == 0)
srun_allocate(job_ptr->job_id);
- else if ((job_ptr->details == NULL) ||
- (job_ptr->details->prolog_running == 0))
+ else if (
+#ifdef HAVE_BG
+ /* On a bluegene system we need to run the
+ * prolog while the job is CONFIGURING so this
+ * can't work off the CONFIGURING flag as done
+ * elsewhere.
+ */
+ !job_ptr->details ||
+ !job_ptr->details->prolog_running
+#else
+ !IS_JOB_CONFIGURING(job_ptr)
+#endif
+ )
launch_job(job_ptr);
slurmctld_diag_stats.backfilled_jobs++;
slurmctld_diag_stats.last_backfilled_jobs++;
diff -Nru slurm-llnl-16.05.8/src/plugins/select/cons_res/select_cons_res.c slurm-llnl-16.05.9/src/plugins/select/cons_res/select_cons_res.c
--- slurm-llnl-16.05.8/src/plugins/select/cons_res/select_cons_res.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/select/cons_res/select_cons_res.c 2017-01-31 20:56:34.000000000 +0100
@@ -544,21 +544,24 @@
/* sort the rows of a partition from "most allocated" to "least allocated" */
extern void cr_sort_part_rows(struct part_res_record *p_ptr)
{
- uint32_t i, j, a, b;
+ uint32_t i, j, b;
+ uint32_t a[p_ptr->num_rows];
if (!p_ptr->row)
return;
for (i = 0; i < p_ptr->num_rows; i++) {
if (p_ptr->row[i].row_bitmap)
- a = bit_set_count(p_ptr->row[i].row_bitmap);
+ a[i] = bit_set_count(p_ptr->row[i].row_bitmap);
else
- a = 0;
+ a[i] = 0;
+ }
+ for (i = 0; i < p_ptr->num_rows; i++) {
for (j = i+1; j < p_ptr->num_rows; j++) {
- if (!p_ptr->row[j].row_bitmap)
- continue;
- b = bit_set_count(p_ptr->row[j].row_bitmap);
- if (b > a) {
+ if (a[j] > a[i]) {
+ b = a[j];
+ a[j] = a[i];
+ a[i] = b;
_swap_rows(&(p_ptr->row[i]), &(p_ptr->row[j]));
}
}
@@ -1878,9 +1881,7 @@
((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) {
int time_window = 30;
bool more_jobs = true;
- bool timed_out = false;
DEF_TIMERS;
-
list_sort(cr_job_list, _cr_job_list_sort);
START_TIMER;
job_iterator = list_iterator_create(cr_job_list);
@@ -1908,14 +1909,6 @@
last_job_ptr = tmp_job_ptr;
_rm_job_from_res(future_part, future_usage,
tmp_job_ptr, 0);
- if (timed_out) {
- /* After timeout, remove ALL remaining
- * jobs and test if the pending job can
- * start, rather than executing the slow
- * cr_job_test() operation after
- * removing every 200 jobs */
- continue;
- }
if (rm_job_cnt++ > 200)
break;
next_job_ptr = list_peek_next(job_iterator);
@@ -1949,12 +1942,9 @@
}
break;
}
- /* After 1 second of iterating over groups of running
- * jobs, simulate the termination of all remaining jobs
- * in order to determine if pending job can ever run */
END_TIMER;
- if (DELTA_TIMER >= 1000000)
- timed_out = true;
+ if (DELTA_TIMER >= 2000000)
+ break; /* Quit after 2 seconds wall time */
}
list_iterator_destroy(job_iterator);
}
diff -Nru slurm-llnl-16.05.8/src/plugins/task/cgroup/task_cgroup_memory.c slurm-llnl-16.05.9/src/plugins/task/cgroup/task_cgroup_memory.c
--- slurm-llnl-16.05.8/src/plugins/task/cgroup/task_cgroup_memory.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/plugins/task/cgroup/task_cgroup_memory.c 2017-01-31 20:56:34.000000000 +0100
@@ -458,6 +458,7 @@
goto error;
}
+ fstatus = SLURM_SUCCESS;
error:
xcgroup_unlock(&memory_cg);
xcgroup_destroy(&memory_cg);
diff -Nru slurm-llnl-16.05.8/src/sacctmgr/common.c slurm-llnl-16.05.9/src/sacctmgr/common.c
--- slurm-llnl-16.05.8/src/sacctmgr/common.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/sacctmgr/common.c 2017-01-31 20:56:34.000000000 +0100
@@ -405,8 +405,10 @@
field->name = xstrdup("MaxCPUsPU");
field->len = 9;
field->print_routine = print_fields_uint;
- } else if (!strncasecmp("MaxTRESPerJob",
- object, MAX(command_len, 7))) {
+ } else if (!strncasecmp("MaxTRES",
+ object, MAX(command_len, 7)) ||
+ !strncasecmp("MaxTRESPerJob",
+ object, MAX(command_len, 11))) {
field->type = PRINT_MAXT;
field->name = xstrdup("MaxTRES");
field->len = 13;
@@ -452,7 +454,9 @@
field->len = 13;
field->print_routine = sacctmgr_print_tres;
} else if (!strncasecmp("MaxTRESPerUser", object,
- MAX(command_len, 11))) {
+ MAX(command_len, 11)) ||
+ !strncasecmp("MaxTRESPU", object,
+ MAX(command_len, 9))) {
field->type = PRINT_MAXTU;
field->name = xstrdup("MaxTRESPU");
field->len = 13;
@@ -473,9 +477,9 @@
field->len = 9;
field->print_routine = print_fields_uint;
} else if (!strncasecmp("MaxJobsPerUser", object,
- MAX(command_len, 8)) ||
+ MAX(command_len, 11)) ||
!strncasecmp("MaxJobsPU", object,
- MAX(command_len, 8))) {
+ MAX(command_len, 9))) {
field->type = PRINT_MAXJ; /* used same as MaxJobs */
field->name = xstrdup("MaxJobsPU");
field->len = 9;
diff -Nru slurm-llnl-16.05.8/src/sbcast/opts.c slurm-llnl-16.05.9/src/sbcast/opts.c
--- slurm-llnl-16.05.8/src/sbcast/opts.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/sbcast/opts.c 2017-01-31 20:56:34.000000000 +0100
@@ -94,7 +94,7 @@
{NULL, 0, 0, 0}
};
- if (getenv("SBCAST_COMPRESS"))
+ if ((env_val = getenv("SBCAST_COMPRESS")))
params.compress = parse_compress_type(env_val);
if ( ( env_val = getenv("SBCAST_FANOUT") ) )
params.fanout = atoi(env_val);
diff -Nru slurm-llnl-16.05.8/src/slurmctld/acct_policy.c slurm-llnl-16.05.9/src/slurmctld/acct_policy.c
--- slurm-llnl-16.05.8/src/slurmctld/acct_policy.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/acct_policy.c 2017-01-31 20:56:34.000000000 +0100
@@ -1765,7 +1765,7 @@
qos_ptr->name,
assoc_mgr_tres_name_array[tres_pos],
qos_ptr->grp_tres_mins_ctld[tres_pos],
- tres_usage_mins[i]);
+ tres_usage_mins[tres_pos]);
rc = false;
goto end_it;
break;
@@ -2741,7 +2741,7 @@
* parent or not
*/
assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK,
- NO_LOCK, NO_LOCK, NO_LOCK };
+ READ_LOCK, NO_LOCK, NO_LOCK };
/* check to see if we are enforcing associations */
if (!accounting_enforce)
diff -Nru slurm-llnl-16.05.8/src/slurmctld/agent.c slurm-llnl-16.05.9/src/slurmctld/agent.c
--- slurm-llnl-16.05.8/src/slurmctld/agent.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/agent.c 2017-01-31 20:56:34.000000000 +0100
@@ -174,7 +174,12 @@
char *message;
} mail_info_t;
-static void _sig_handler(int dummy);
+typedef struct retry_args {
+ bool mail_too; /* Time to wait between retries */
+ int min_wait; /* Send pending email too */
+} retry_args_t;
+
+static void *_agent_retry(void *arg);
static int _batch_launch_defer(queued_request_t *queued_req_ptr);
static inline int _comm_err(char *node_name, slurm_msg_type_t msg_type);
static void _list_delete_retry(void *retry_entry);
@@ -185,8 +190,9 @@
int no_resp_cnt, int retry_cnt);
static void _purge_agent_args(agent_arg_t *agent_arg_ptr);
static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count);
-static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr,
- int *count, int *spot);
+static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr,
+ int *count, int *spot);
+static void _sig_handler(int dummy);
static void _spawn_retry_agent(agent_arg_t * agent_arg_ptr);
static void *_thread_per_group_rpc(void *args);
static int _valid_agent_arg(agent_arg_t *agent_arg_ptr);
@@ -1261,17 +1267,41 @@
}
/*
- * agent_retry - Agent for retrying pending RPCs. One pending request is
+ * agent_retry - Spawn agent for retrying pending RPCs. One pending request is
* issued if it has been pending for at least min_wait seconds
* IN min_wait - Minimum wait time between re-issue of a pending RPC
* IN mail_too - Send pending email too, note this performed using a
* fork/waitpid, so it can take longer than just creating a pthread
* to send RPCs
- * RET count of queued requests remaining
*/
-extern int agent_retry (int min_wait, bool mail_too)
+extern void agent_retry(int min_wait, bool mail_too)
{
- int list_size = 0, rc;
+ pthread_attr_t thread_attr;
+ pthread_t thread_id = (pthread_t) 0;
+ retry_args_t *retry_args_ptr;
+
+ retry_args_ptr = xmalloc(sizeof(struct retry_args));
+ retry_args_ptr->mail_too = mail_too;
+ retry_args_ptr->min_wait = min_wait;
+
+ slurm_attr_init(&thread_attr);
+ if (pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_DETACHED))
+ error("pthread_attr_setdetachstate error %m");
+ if (pthread_create(&thread_id, &thread_attr, _agent_retry,
+ (void *) retry_args_ptr)) {
+ error("pthread_create error %m");
+ xfree(retry_args_ptr);
+ }
+ slurm_attr_destroy(&thread_attr);
+}
+
+/* Do the work requested by agent_retry (retry pending RPCs).
+ * This is a separate thread so the job records can be locked */
+static void *_agent_retry(void *arg)
+{
+ retry_args_t *retry_args_ptr = (retry_args_t *) arg;
+ bool mail_too;
+ int min_wait, rc;
time_t now = time(NULL);
queued_request_t *queued_req_ptr = NULL;
agent_arg_t *agent_arg_ptr = NULL;
@@ -1279,17 +1309,26 @@
pthread_t thread_mail = 0;
pthread_attr_t attr_mail;
mail_info_t *mi = NULL;
+ /* Write lock on jobs */
+ slurmctld_lock_t job_write_lock =
+ { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
+
+ mail_too = retry_args_ptr->mail_too;
+ min_wait = retry_args_ptr->min_wait;
+ xfree(arg);
+ lock_slurmctld(job_write_lock);
slurm_mutex_lock(&retry_mutex);
if (retry_list) {
static time_t last_msg_time = (time_t) 0;
- uint32_t msg_type[5] = {0, 0, 0, 0, 0}, i = 0;
+ uint32_t msg_type[5] = {0, 0, 0, 0, 0};
+ int i = 0, list_size;
list_size = list_count(retry_list);
if ((list_size > 100) &&
(difftime(now, last_msg_time) > 300)) {
/* Note sizable backlog of work */
info("slurmctld: agent retry_list size is %d",
- list_size);
+ list_size);
retry_iter = list_iterator_create(retry_list);
while ((queued_req_ptr = (queued_request_t *)
list_next(retry_iter))) {
@@ -1311,13 +1350,13 @@
/* too much work already */
slurm_mutex_unlock(&agent_cnt_mutex);
slurm_mutex_unlock(&retry_mutex);
- return list_size;
+ unlock_slurmctld(job_write_lock);
+ return NULL;
}
slurm_mutex_unlock(&agent_cnt_mutex);
if (retry_list) {
/* first try to find a new (never tried) record */
-
retry_iter = list_iterator_create(retry_list);
while ((queued_req_ptr = (queued_request_t *)
list_next(retry_iter))) {
@@ -1327,14 +1366,12 @@
agent_arg_ptr);
xfree(queued_req_ptr);
list_remove(retry_iter);
- list_size--;
continue;
}
if (rc > 0)
continue;
if (queued_req_ptr->last_attempt == 0) {
list_remove(retry_iter);
- list_size--;
break;
}
}
@@ -1356,7 +1393,6 @@
agent_arg_ptr);
xfree(queued_req_ptr);
list_remove(retry_iter);
- list_size--;
continue;
}
if (rc > 0)
@@ -1364,13 +1400,13 @@
age = difftime(now, queued_req_ptr->last_attempt);
if (age > min_wait) {
list_remove(retry_iter);
- list_size--;
break;
}
}
list_iterator_destroy(retry_iter);
}
slurm_mutex_unlock(&retry_mutex);
+ unlock_slurmctld(job_write_lock);
if (queued_req_ptr) {
agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
@@ -1406,7 +1442,7 @@
slurm_mutex_unlock(&agent_cnt_mutex);
}
- return list_size;
+ return NULL;
}
/*
@@ -1823,7 +1859,7 @@
agent_arg_t *agent_arg_ptr;
batch_job_launch_msg_t *launch_msg_ptr;
time_t now = time(NULL);
- struct job_record *job_ptr;
+ struct job_record *job_ptr;
int nodes_ready = 0, tmp = 0;
agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
@@ -1845,6 +1881,9 @@
return -1; /* job cancelled while waiting */
}
+ if (job_ptr->details && job_ptr->details->prolog_running)
+ return 1;
+
if (job_ptr->wait_all_nodes) {
(void) job_node_ready(launch_msg_ptr->job_id, &tmp);
if (tmp == (READY_JOB_STATE | READY_NODE_STATE)) {
@@ -1853,9 +1892,6 @@
!xstrcmp(launch_msg_ptr->alias_list, "TBD")) {
/* Update launch RPC with correct node
* aliases */
- struct job_record *job_ptr;
- job_ptr = find_job_record(launch_msg_ptr->
- job_id);
xfree(launch_msg_ptr->alias_list);
launch_msg_ptr->alias_list = xstrdup(job_ptr->
alias_list);
@@ -1887,7 +1923,8 @@
}
if (nodes_ready) {
- job_config_fini(job_ptr);
+ if (IS_JOB_CONFIGURING(job_ptr))
+ job_config_fini(job_ptr);
queued_req_ptr->last_attempt = (time_t) 0;
return 0;
}
diff -Nru slurm-llnl-16.05.8/src/slurmctld/agent.h slurm-llnl-16.05.9/src/slurmctld/agent.h
--- slurm-llnl-16.05.8/src/slurmctld/agent.h 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/agent.h 2017-01-31 20:56:34.000000000 +0100
@@ -85,11 +85,10 @@
* issued if it has been pending for at least min_wait seconds
* IN min_wait - Minimum wait time between re-issue of a pending RPC
* IN mail_too - Send pending email too, note this performed using a
- * fork/waitpid, so it can take longer than just creating
- * a pthread to send RPCs
- * RET count of queued requests remaining
+ * fork/waitpid, so it can take longer than just creating a pthread
+ * to send RPCs
*/
-extern int agent_retry (int min_wait, bool mail_too);
+extern void agent_retry(int min_wait, bool mail_too);
/* agent_purge - purge all pending RPC requests */
extern void agent_purge (void);
diff -Nru slurm-llnl-16.05.8/src/slurmctld/job_mgr.c slurm-llnl-16.05.9/src/slurmctld/job_mgr.c
--- slurm-llnl-16.05.8/src/slurmctld/job_mgr.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/job_mgr.c 2017-01-31 20:56:34.000000000 +0100
@@ -7470,6 +7470,7 @@
{
time_t now = time(NULL);
+ last_job_update = now;
job_ptr->job_state &= (~JOB_CONFIGURING);
job_ptr->tot_sus_time = difftime(now, job_ptr->start_time);
if ((job_ptr->time_limit != INFINITE) && (job_ptr->tot_sus_time != 0)) {
@@ -7486,9 +7487,20 @@
if (bit_overlap(job_ptr->node_bitmap, power_node_bitmap))
return false;
- if (job_ptr->wait_all_nodes &&
- ((select_g_job_ready(job_ptr) & READY_NODE_STATE) == 0))
- return false;
+ if (job_ptr->wait_all_nodes) {
+ /* Make sure all nodes ready to start job */
+ if ((select_g_job_ready(job_ptr) & READY_NODE_STATE) == 0)
+ return false;
+ } else if (job_ptr->batch_flag) {
+ /* Make first node is ready to start batch job */
+ int i_first = bit_ffs(job_ptr->node_bitmap);
+ struct node_record *node_ptr = node_record_table_ptr + i_first;
+ if ((i_first != -1) &&
+ (IS_NODE_POWER_SAVE(node_ptr) ||
+ IS_NODE_POWER_UP(node_ptr))) {
+ return false;
+ }
+ }
return true;
}
@@ -12846,6 +12858,7 @@
if (job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") &&
(prolog == 0) && job_ptr->node_bitmap &&
(bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
+ last_job_update = time(NULL);
job_ptr->job_state &= (~JOB_CONFIGURING);
set_job_alias_list(job_ptr);
}
@@ -14373,6 +14386,8 @@
* to add it again. */
acct_policy_add_job_submit(job_ptr);
+ acct_policy_update_pending_job(job_ptr);
+
if (state & JOB_SPECIAL_EXIT) {
job_ptr->job_state |= JOB_SPECIAL_EXIT;
job_ptr->state_reason = WAIT_HELD_USER;
diff -Nru slurm-llnl-16.05.8/src/slurmctld/job_scheduler.c slurm-llnl-16.05.9/src/slurmctld/job_scheduler.c
--- slurm-llnl-16.05.8/src/slurmctld/job_scheduler.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/job_scheduler.c 2017-01-31 20:56:34.000000000 +0100
@@ -574,6 +574,7 @@
ListIterator job_iterator;
slurmctld_lock_t job_write_lock =
{ READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
+ time_t now = time(NULL);
#ifdef HAVE_BG
static uint16_t cpus_per_node = 0;
if (!cpus_per_node)
@@ -591,7 +592,8 @@
continue;
if (part_ptr == NULL)
continue;
- if ((job_ptr->details == NULL) || job_ptr->details->begin_time)
+ if ((job_ptr->details == NULL) ||
+ (job_ptr->details->begin_time > now))
continue;
if ((part_ptr->state_up & PARTITION_SCHED) == 0)
continue;
@@ -863,8 +865,20 @@
info("sched: Allocate JobId=%u Partition=%s NodeList=%s #CPUs=%u",
job_ptr->job_id, job_ptr->part_ptr->name,
job_ptr->nodes, job_ptr->total_cpus);
- if ((job_ptr->details->prolog_running == 0) &&
- ((job_ptr->bit_flags & NODE_REBOOT) == 0)) {
+
+ if (
+#ifdef HAVE_BG
+ /* On a bluegene system we need to run the
+ * prolog while the job is CONFIGURING so this
+ * can't work off the CONFIGURING flag as done
+ * elsewhere.
+ */
+ !job_ptr->details->prolog_running &&
+ !(job_ptr->bit_flags & NODE_REBOOT)
+#else
+ !IS_JOB_CONFIGURING(job_ptr)
+#endif
+ ) {
launch_msg = build_launch_job_msg(job_ptr,
msg->protocol_version);
}
@@ -1842,10 +1856,20 @@
#endif
if (job_ptr->batch_flag == 0)
srun_allocate(job_ptr->job_id);
- else if ((job_ptr->details->prolog_running == 0) &&
- ((job_ptr->bit_flags & NODE_REBOOT) == 0)) {
+ else if (
+#ifdef HAVE_BG
+ /* On a bluegene system we need to run the
+ * prolog while the job is CONFIGURING so this
+ * can't work off the CONFIGURING flag as done
+ * elsewhere.
+ */
+ !job_ptr->details->prolog_running &&
+ !(job_ptr->bit_flags & NODE_REBOOT)
+#else
+ !IS_JOB_CONFIGURING(job_ptr)
+#endif
+ )
launch_job(job_ptr);
- }
rebuild_job_part_list(job_ptr);
job_cnt++;
if (is_job_array_head &&
@@ -3181,7 +3205,8 @@
}
/* Enforce reservation: access control, time and nodes */
- if (job_ptr->details->begin_time)
+ if (job_ptr->details->begin_time &&
+ (job_ptr->details->begin_time > now))
start_res = job_ptr->details->begin_time;
else
start_res = now;
@@ -3753,10 +3778,10 @@
return errno;
}
- if (job_ptr->details)
+ if (job_ptr->details) {
job_ptr->details->prolog_running++;
-
- job_ptr->job_state |= JOB_CONFIGURING;
+ job_ptr->job_state |= JOB_CONFIGURING;
+ }
slurm_attr_init(&thread_attr_prolog);
pthread_attr_setdetachstate(&thread_attr_prolog,
diff -Nru slurm-llnl-16.05.8/src/slurmctld/node_mgr.c slurm-llnl-16.05.9/src/slurmctld/node_mgr.c
--- slurm-llnl-16.05.8/src/slurmctld/node_mgr.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/node_mgr.c 2017-01-31 20:56:34.000000000 +0100
@@ -3346,9 +3346,11 @@
char *host_str = NULL;
hostlist_t no_resp_hostlist = NULL;
- for (i=0; i<node_record_count; i++) {
+ for (i = 0; i < node_record_count; i++) {
node_ptr = &node_record_table_ptr[i];
- if (!node_ptr->not_responding)
+ if (!node_ptr->not_responding ||
+ IS_NODE_POWER_SAVE(node_ptr) ||
+ IS_NODE_POWER_UP(node_ptr))
continue;
if (no_resp_hostlist) {
(void) hostlist_push_host(no_resp_hostlist,
diff -Nru slurm-llnl-16.05.8/src/slurmctld/proc_req.c slurm-llnl-16.05.9/src/slurmctld/proc_req.c
--- slurm-llnl-16.05.8/src/slurmctld/proc_req.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/proc_req.c 2017-01-31 20:56:34.000000000 +0100
@@ -3381,8 +3381,20 @@
_throttle_fini(&active_rpc_cnt);
goto fini;
}
- if (job_ptr->details &&
- job_ptr->details->prolog_running) {
+
+ if (
+#ifdef HAVE_BG
+ /* On a bluegene system we need to run the
+ * prolog while the job is CONFIGURING so this
+ * can't work off the CONFIGURING flag as done
+ * elsewhere.
+ */
+ job_ptr->details &&
+ job_ptr->details->prolog_running
+#else
+ IS_JOB_CONFIGURING(job_ptr)
+#endif
+ ) {
slurm_send_rc_msg(msg, EAGAIN);
unlock_slurmctld(job_write_lock);
_throttle_fini(&active_rpc_cnt);
diff -Nru slurm-llnl-16.05.8/src/slurmctld/reservation.c slurm-llnl-16.05.9/src/slurmctld/reservation.c
--- slurm-llnl-16.05.8/src/slurmctld/reservation.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/reservation.c 2017-01-31 20:56:34.000000000 +0100
@@ -5400,7 +5400,6 @@
}
_advance_resv_time(resv_ptr);
if ((resv_ptr->job_run_cnt == 0) &&
- (resv_ptr->flags_set_node == false) &&
((resv_ptr->flags & RESERVE_FLAG_DAILY ) == 0) &&
((resv_ptr->flags & RESERVE_FLAG_WEEKLY) == 0)) {
if (resv_ptr->job_pend_cnt) {
diff -Nru slurm-llnl-16.05.8/src/slurmctld/step_mgr.c slurm-llnl-16.05.9/src/slurmctld/step_mgr.c
--- slurm-llnl-16.05.8/src/slurmctld/step_mgr.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmctld/step_mgr.c 2017-01-31 20:56:34.000000000 +0100
@@ -981,11 +981,12 @@
return NULL;
}
}
- if (job_ptr->details
- && job_ptr->details->prolog_running == 0) {
+ if (IS_JOB_CONFIGURING(job_ptr)) {
info("%s: Configuration for job %u is complete",
__func__, job_ptr->job_id);
job_config_fini(job_ptr);
+ if (job_ptr->bit_flags & NODE_REBOOT)
+ job_validate_mem(job_ptr);
}
}
diff -Nru slurm-llnl-16.05.8/src/slurmd/common/xcgroup.c slurm-llnl-16.05.9/src/slurmd/common/xcgroup.c
--- slurm-llnl-16.05.8/src/slurmd/common/xcgroup.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmd/common/xcgroup.c 2017-01-31 20:56:34.000000000 +0100
@@ -449,7 +449,6 @@
char* file_path;
uid_t uid;
gid_t gid;
- int create_only;
uint32_t notify;
/* init variables based on input cgroup */
@@ -457,7 +456,6 @@
file_path = cg->path;
uid = cg->uid;
gid = cg->gid;
- create_only = 0;
notify = cg->notify;
/* save current mask and apply working one */
@@ -465,20 +463,23 @@
omask = umask(cmask);
/* build cgroup */
- if (mkdir(file_path, 0755)) {
- if (create_only || errno != EEXIST) {
- debug2("%s: unable to create cgroup '%s' : %m",
- __func__, file_path);
+ if (mkdir(file_path, 0755)) {
+ if (errno != EEXIST) {
+ error("%s: unable to create cgroup '%s' : %m",
+ __func__, file_path);
umask(omask);
return fstatus;
+ } else {
+ debug("%s: cgroup '%s' already exists",
+ __func__, file_path);
}
}
umask(omask);
/* change cgroup ownership as requested */
if (chown(file_path, uid, gid)) {
- debug2("%s: unable to chown %d:%d cgroup '%s' : %m",
- __func__, uid, gid, file_path);
+ error("%s: unable to chown %d:%d cgroup '%s' : %m",
+ __func__, uid, gid, file_path);
return fstatus;
}
diff -Nru slurm-llnl-16.05.8/src/slurmd/slurmstepd/req.c slurm-llnl-16.05.9/src/slurmd/slurmstepd/req.c
--- slurm-llnl-16.05.8/src/slurmd/slurmstepd/req.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmd/slurmstepd/req.c 2017-01-31 20:56:34.000000000 +0100
@@ -128,6 +128,7 @@
static pthread_mutex_t message_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t message_cond = PTHREAD_COND_INITIALIZER;
static int message_connections;
+static int msg_target_node_id = 0;
/*
* Returns true if "uid" is a "slurm authorized user" - i.e. uid == 0
@@ -739,8 +740,6 @@
int errnum = 0;
int sig;
static int msg_sent = 0;
- char *ptr = NULL;
- int target_node_id = 0;
stepd_step_task_info_t *task;
uint32_t i;
uint32_t flag;
@@ -792,11 +791,8 @@
}
}
- ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID");
- if (ptr)
- target_node_id = atoi(ptr);
if ((job->stepid != SLURM_EXTERN_CONT) &&
- (job->nodeid == target_node_id) && (msg_sent == 0) &&
+ (job->nodeid == msg_target_node_id) && (msg_sent == 0) &&
(job->state < SLURMSTEPD_STEP_ENDING)) {
time_t now = time(NULL);
char entity[24], time_str[24];
@@ -1818,3 +1814,10 @@
}
}
}
+
+extern void set_msg_node_id(stepd_step_rec_t *job)
+{
+ char *ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID");
+ if (ptr)
+ msg_target_node_id = atoi(ptr);
+}
diff -Nru slurm-llnl-16.05.8/src/slurmd/slurmstepd/req.h slurm-llnl-16.05.9/src/slurmd/slurmstepd/req.h
--- slurm-llnl-16.05.8/src/slurmd/slurmstepd/req.h 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmd/slurmstepd/req.h 2017-01-31 20:56:34.000000000 +0100
@@ -46,4 +46,6 @@
/* Delay until a job is resumed */
extern void wait_for_resumed(uint16_t msg_type);
+extern void set_msg_node_id(stepd_step_rec_t *job);
+
#endif /* _STEP_REQ_H */
diff -Nru slurm-llnl-16.05.8/src/slurmd/slurmstepd/slurmstepd.c slurm-llnl-16.05.9/src/slurmd/slurmstepd/slurmstepd.c
--- slurm-llnl-16.05.8/src/slurmd/slurmstepd/slurmstepd.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/slurmd/slurmstepd/slurmstepd.c 2017-01-31 20:56:34.000000000 +0100
@@ -581,6 +581,8 @@
env_array_overwrite(&job->env,"SLURM_TOPOLOGY_ADDR_PATTERN",
conf->node_topo_pattern);
+ set_msg_node_id(job);
+
return job;
}
diff -Nru slurm-llnl-16.05.8/src/srun/libsrun/allocate.c slurm-llnl-16.05.9/src/srun/libsrun/allocate.c
--- slurm-llnl-16.05.8/src/srun/libsrun/allocate.c 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/src/srun/libsrun/allocate.c 2017-01-31 20:56:34.000000000 +0100
@@ -877,6 +877,7 @@
j->power_flags = opt.power_flags;
if (opt.mcs_label)
j->mcs_label = opt.mcs_label;
+ j->wait_all_nodes = 1;
return j;
}
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test10.13 slurm-llnl-16.05.9/testsuite/expect/test10.13
--- slurm-llnl-16.05.8/testsuite/expect/test10.13 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test10.13 2017-01-31 20:56:34.000000000 +0100
@@ -44,6 +44,11 @@
print_header $test_id
+if { [test_bluegene] == 0 } {
+ send_user "\nWARNING: This test is only compatible with bluegene systems\n"
+ exit 0
+}
+
if {[file exists $smap] == 0} {
send_user "\nWARNING: smap not installed\n"
exit 0
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test10.5 slurm-llnl-16.05.9/testsuite/expect/test10.5
--- slurm-llnl-16.05.8/testsuite/expect/test10.5 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test10.5 2017-01-31 20:56:34.000000000 +0100
@@ -42,6 +42,11 @@
print_header $test_id
+if { [test_bluegene] == 0 } {
+ send_user "\nWARNING: This test is only compatible with bluegene systems\n"
+ exit 0
+}
+
if {[file exists $smap] == 0} {
send_user "\nWARNING: smap not installed\n"
exit 0
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test1.14 slurm-llnl-16.05.9/testsuite/expect/test1.14
--- slurm-llnl-16.05.8/testsuite/expect/test1.14 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test1.14 2017-01-31 20:56:34.000000000 +0100
@@ -39,7 +39,6 @@
set file_out "test$test_id.output"
set file_out2 "test$test_id.output2"
set job_id 0
-set sleep_secs 10
print_header $test_id
@@ -64,10 +63,15 @@
exec $bin_rm -f $file_in $file_in2 $file_out $file_out2
make_bash_script $file_in "
echo tasks_per_node=\$SLURM_TASKS_PER_NODE
+ if \[ \$SLURM_TASKS_PER_NODE -gt 32 \]; then
+ sleep_secs=45
+ else
+ sleep_secs=10
+ fi
inx=0
while \[ \$inx -lt \$SLURM_TASKS_PER_NODE \]
do
- $srun --exclusive -n1 $bin_sleep $sleep_secs &
+ $srun --exclusive -n1 $bin_sleep \$sleep_secs &
inx=\$((inx+1))
done
$bin_sleep 4
@@ -81,7 +85,7 @@
#
# Spawn a job via sbatch
#
-spawn $sbatch -N1 -t1 --gres=craynetwork:0 --output=$file_out $file_in
+spawn $sbatch -N1 -t2 --gres=craynetwork:0 --output=$file_out $file_in
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
@@ -149,25 +153,30 @@
# Delete left-over input script
# Build another input script file
# Run one more step than allocated CPUs with immediate option and make aborts
-# The "sleep 4" is meant to insure the earlier job steps start first
+# The "sleep" is meant to insure the earlier job steps start first
#
exec $bin_rm -f $file_in $file_out
make_bash_script $file_in "
inx=0
+ if \[ \$SLURM_TASKS_PER_NODE -gt 32 \]; then
+ sleep_secs=45
+ else
+ sleep_secs=10
+ fi
while \[ \$inx -lt \$SLURM_TASKS_PER_NODE \]
do
- $srun --exclusive -n1 $bin_sleep $sleep_secs &
+ $srun --exclusive -n1 --mem=0 $bin_sleep \$sleep_secs &
inx=\$((inx+1))
done
$bin_sleep 4
- $srun -v --exclusive -n1 --immediate $file_in2 &
+ $srun -v --exclusive -n1 --mem=0 --immediate $file_in2 &
wait
"
#
# Spawn a job via sbatch
#
-spawn $sbatch -N1 -t1 --gres=craynetwork:0 --output=$file_out2 $file_in
+spawn $sbatch -N1 -t2 --gres=craynetwork:0 --output=$file_out2 $file_in
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test12.2 slurm-llnl-16.05.9/testsuite/expect/test12.2
--- slurm-llnl-16.05.8/testsuite/expect/test12.2 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test12.2 2017-01-31 20:56:34.000000000 +0100
@@ -186,7 +186,7 @@
# Compute error in MB
set diff_io [expr $max_disk_write - $max_disk_read]
set error_io [expr abs($diff_io)]
- if { $error_io > 0.05 } {
+ if { $error_io > 0.3 } {
send_user "\nFAILURE: written file size does not match read size "
send_user "file_size:$mb_file_size MB "
send_user "max_disk_write:$max_disk_write MB "
@@ -196,7 +196,7 @@
set diff_io [expr $ave_disk_write - $ave_disk_read]
set error_io [expr abs($diff_io)]
- if { $error_io > 0.05 } {
+ if { $error_io > 0.3 } {
send_user "\nFAILURE: average written file size "
send_user "does not match average read size "
send_user "file_size:$mb_file_size MB "
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test14.10 slurm-llnl-16.05.9/testsuite/expect/test14.10
--- slurm-llnl-16.05.8/testsuite/expect/test14.10 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test14.10 2017-01-31 20:56:34.000000000 +0100
@@ -91,10 +91,10 @@
$srun -N1 -n1 -w$node2 ./$file_in2
echo -n \"Checking node 1: \"
-$srun -N1 -n1 -w$node2 ls /tmp/$node2/test$test_id\_file
+$srun -Q -N1 -n1 -w$node2 ls /tmp/$node2/test$test_id\_file
echo -n \"Checking node 0: \"
-$srun -N1 -n1 -w$node1 ls /tmp/$node1/test$test_id\_file
+$srun -Q -N1 -n1 -w$node1 ls /tmp/$node1/test$test_id\_file
$srun $bin_rm -f /tmp/$node1/test$test_id\_file
$srun $bin_rm -fr /tmp/$node1
@@ -107,6 +107,7 @@
"
# Make allocations
+set timeout $max_job_delay
set matches 0
spawn $salloc -N2 -w$hostlist -t1 ./$file_in1
expect {
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test1.52 slurm-llnl-16.05.9/testsuite/expect/test1.52
--- slurm-llnl-16.05.8/testsuite/expect/test1.52 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test1.52 2017-01-31 20:56:34.000000000 +0100
@@ -37,7 +37,7 @@
set exit_code 0
set num_nodes 2
set num_tasks 2
-set node_count 0
+set idle_nodes 0
set max_nodes 0
set task_count 0
set hostfile "test$test_id.hostfile"
@@ -71,10 +71,6 @@
set max_nodes 999999
exp_continue
}
- -re "TotalNodes=($number)" {
- set node_count $expect_out(1,string)
- exp_continue
- }
timeout {
send_user "\nFAILURE: scontrol not responding\n"
exit 1
@@ -83,8 +79,14 @@
wait
}
}
-if { ($node_count < 3) || ($max_nodes < 3) } {
- send_user "WARNING: system must have at least 3 nodes to run this test on. $node_count $max_nodes\n"
+
+set idle_nodes [available_nodes $def_part idle]
+if { ($idle_nodes < 3) || ($max_nodes < 3) } {
+ if { $max_nodes == 999999 } {
+ send_user "WARNING: partition $def_part must have at least 3 idle nodes and MaxNodes >= 3 to run this test on. IDLE:$idle_nodes MaxNodes:UNLIMITED\n"
+ } else {
+ send_user "WARNING: partition $def_part must have at least 3 idle nodes and MaxNodes >= 3 to run this test on. IDLE:$idle_nodes MaxNodes:$max_nodes\n"
+ }
exit $exit_code
}
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test15.22 slurm-llnl-16.05.9/testsuite/expect/test15.22
--- slurm-llnl-16.05.8/testsuite/expect/test15.22 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test15.22 2017-01-31 20:56:34.000000000 +0100
@@ -131,9 +131,15 @@
#
# Submit job explicitly to a non-default partition
#
-set job_id 0
+set job_id 0
+set legit_failure 0
set salloc_pid [spawn $salloc --partition=$other_part_name -t1 $bin_sleep 1]
expect {
+ -re "Required node not available" {
+ set legit_failure 1
+ exec $bin_kill -INT $salloc_pid
+ exp_continue
+ }
-re "Granted job allocation ($number)" {
set job_id $expect_out(1,string)
exp_continue
@@ -151,7 +157,9 @@
}
}
# Confirm the job's partition
-if {$job_id == 0} {
+if {$legit_failure == 1} {
+ send_user "\nWARNING: partition '$other_part_name' is not usable\n"
+} elseif {$job_id == 0} {
send_user "\nFAILURE: batch submit failure\n"
set exit_code 1
} else {
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test1.63 slurm-llnl-16.05.9/testsuite/expect/test1.63
--- slurm-llnl-16.05.8/testsuite/expect/test1.63 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test1.63 2017-01-31 20:56:34.000000000 +0100
@@ -72,6 +72,7 @@
}
-re "Hello World!" {
incr match_run
+ sleep 0.1
exec $bin_kill -INT $srun_pid
exp_continue
}
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test17.34 slurm-llnl-16.05.9/testsuite/expect/test17.34
--- slurm-llnl-16.05.8/testsuite/expect/test17.34 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test17.34 2017-01-31 20:56:34.000000000 +0100
@@ -62,8 +62,14 @@
} else {
set task_limit 1
}
+
+ set ntasks [expr abs($task_limit + $task)]
+ if {$ntasks == 0} {
+ set ntasks 1
+ }
+
set error_chk 0
- spawn $sbatch -t1 -w$node -S$core_spec -n[expr abs($task_limit + $task)] -o$file_out $spec_in
+ spawn $sbatch -t1 -w$node -S$core_spec -n$ntasks -o$file_out $spec_in
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
@@ -156,6 +162,19 @@
print_header $test_id
+set select_type [test_select_type]
+if {![string compare $select_type "linear"] || ![string compare $select_type "serial"]} {
+ send_user "\nWARNING: This test is incompatible with select/$select_type\n"
+ exit 0
+} elseif {![string compare $select_type "cray"] && ![test_select_type_params "other_cons_res"]} {
+ send_user "\nWARNING: This test is incompatible with select/linear\n"
+ exit 0
+}
+if {[test_select_type_params "CR_SOCKET"]} {
+ send_user "\nWARNING: This test is incompatible with CR_SOCKET allocations\n"
+ exit 0
+}
+
log_user 0
set allow_spec 0
spawn $scontrol show config
@@ -178,16 +197,6 @@
exit $exit_code
}
-set select_type [test_select_type]
-if {![string compare $select_type "linear"]} {
- send_user "\nWARNING: This test is incompatible with select/$select_type\n"
- exit 0
-}
-if {[test_select_type_params "CR_SOCKET"]} {
- send_user "\nWARNING: This test is incompatible with CR_SOCKET allocations\n"
- exit 0
-}
-
# Remove any vestigial files
exec $bin_rm -f $file_in $file_out $spec_in
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test17.39 slurm-llnl-16.05.9/testsuite/expect/test17.39
--- slurm-llnl-16.05.8/testsuite/expect/test17.39 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test17.39 2017-01-31 20:56:34.000000000 +0100
@@ -35,8 +35,6 @@
set slow_id 0
set fast_id 0
set dep_id 0
-set slow_job "test$test_id\_slow_sc"
-set fast_job "test$test_id\_fast_sc"
set exit_code 0
print_header $test_id
@@ -56,9 +54,6 @@
}
}
-make_bash_script $slow_job "sleep 120"
-make_bash_script $fast_job "sleep 30"
-
proc check_state {id state} {
global squeue exit_code
@@ -85,14 +80,8 @@
}
}
-if {[test_select_type_params "MEMORY"]} {
- set job_mem 10
-} else {
- set job_mem 1
-}
-
# Submit job 1 of 3
-spawn $sbatch -t3 -o/dev/null --mem=${job_mem} $slow_job
+spawn $sbatch -t3 -o /dev/null --wrap "sleep 120"
expect {
-re "Submitted batch job ($number)" {
set slow_id $expect_out(1,string)
@@ -112,7 +101,7 @@
}
# Submit job 2 of 3
-spawn $sbatch -t3 -o/dev/null --mem=${job_mem} $fast_job
+spawn $sbatch -t3 -o /dev/null --wrap "sleep 30"
expect {
-re "Node count specification invalid" {
send_user "\nWARNING: can't test with less than two nodes\n"
@@ -136,7 +125,7 @@
}
# Submit dependency job, 3 of 3
-spawn $sbatch --dependency=afterok:$slow_id?afterok:$fast_id -o/dev/null --mem=${job_mem} $slow_job
+spawn $sbatch --dependency=afterok:$slow_id?afterok:$fast_id -o /dev/null --wrap "sleep 120"
expect {
-re "Submitted batch job ($number)" {
set dep_id $expect_out(1,string)
@@ -197,7 +186,6 @@
cancel_job $dep_id
if {$exit_code == 0} {
- exec $bin_rm -f $slow_job $fast_job
send_user "\nSUCCESS\n"
}
exit $exit_code
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test1.74 slurm-llnl-16.05.9/testsuite/expect/test1.74
--- slurm-llnl-16.05.8/testsuite/expect/test1.74 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test1.74 2017-01-31 20:56:34.000000000 +0100
@@ -65,6 +65,10 @@
send_user "\nWARNING: This test is incompatible with serial systems\n"
exit $exit_code
}
+if {[string compare [check_accounting_admin_level] "Administrator"]} {
+ send_user "\nThis test can't be run without being an Accounting administrator.\n"
+ exit $exit_code
+}
spawn $bin_id -u -n
expect {
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test17.40 slurm-llnl-16.05.9/testsuite/expect/test17.40
--- slurm-llnl-16.05.8/testsuite/expect/test17.40 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test17.40 2017-01-31 20:56:34.000000000 +0100
@@ -156,6 +156,19 @@
print_header $test_id
+set select_type [test_select_type]
+if {![string compare $select_type "linear"] || ![string compare $select_type "serial"]} {
+ send_user "\nWARNING: This test is incompatible with select/$select_type\n"
+ exit 0
+} elseif {![string compare $select_type "cray"] && ![test_select_type_params "other_cons_res"]} {
+ send_user "\nWARNING: This test is incompatible with select/linear\n"
+ exit 0
+}
+if {[test_select_type_params "CR_SOCKET"]} {
+ send_user "\nWARNING: This test is incompatible with CR_SOCKET allocations\n"
+ exit 0
+}
+
log_user 0
set allow_spec 0
spawn $scontrol show config
@@ -178,17 +191,6 @@
exit $exit_code
}
-set select_type [test_select_type]
-if {![string compare $select_type "linear"] || ![string compare $select_type "serial"]} {
- send_user "\nWARNING: This test is incompatible with select/$select_type\n"
- exit 0
-}
-
-if {[test_select_type_params "CR_SOCKET"]} {
- send_user "\nWARNING: This test is incompatible with CR_SOCKET allocations\n"
- exit 0
-}
-
# Remove any vestigial files
exec $bin_rm -f $file_in $file_out $spec_in
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test21.36 slurm-llnl-16.05.9/testsuite/expect/test21.36
--- slurm-llnl-16.05.8/testsuite/expect/test21.36 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test21.36 2017-01-31 20:56:34.000000000 +0100
@@ -81,6 +81,10 @@
send_user "\nThis test can't be run without AccountStorageType=slurmdbd\n"
exit 0
}
+if {[string compare [check_accounting_admin_level] "Administrator"]} {
+ send_user "\nThis test can't be run without being an Accounting administrator.\n"
+ exit 0
+}
# Remove pre-existing items
cleanup
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test2.8 slurm-llnl-16.05.9/testsuite/expect/test2.8
--- slurm-llnl-16.05.8/testsuite/expect/test2.8 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test2.8 2017-01-31 20:56:34.000000000 +0100
@@ -35,7 +35,6 @@
set test_id "2.8"
set exit_code 0
-set file_in "test$test_id.input"
set is_bluegene 0
set job_id1 0
set job_id2 0
@@ -65,26 +64,10 @@
set step_id 0
}
-if {[test_select_type_params "MEMORY"]} {
- set job_mem 20
- set step_mem 10
-} else {
- set job_mem 1
- set step_mem 1
-}
-
-#
-# Build input script file
-#
-make_bash_script $file_in "
- $srun --mem=${step_mem} $bin_sleep 60 &
- $srun --mem=${step_mem} $bin_sleep 60
-"
-
#
# Submit a couple jobs so we have something to work with
#
-set sbatch_pid [spawn $sbatch --output=/dev/null --error=/dev/null -t5 --mem=${job_mem} $file_in]
+set sbatch_pid [spawn $sbatch --output=/dev/null --error=/dev/null -t5 --wrap "$srun $bin_sleep 60"]
expect {
-re "Submitted batch job ($number)" {
set job_id1 $expect_out(1,string)
@@ -104,7 +87,7 @@
exit 1
}
-set sbatch_pid [spawn $sbatch --output=/dev/null --error=/dev/null -t5 --mem=${job_mem} $file_in]
+set sbatch_pid [spawn $sbatch --output=/dev/null --error=/dev/null -t5 --wrap "$srun $bin_sleep 60"]
expect {
-re "Submitted batch job ($number)" {
set job_id2 $expect_out(1,string)
@@ -126,8 +109,6 @@
exit 1
}
-exec $bin_rm -f $file_in
-
if {[wait_for_job $job_id1 "RUNNING"] != 0} {
send_user "\nFAILURE: waiting for job $job_id1 to start\n"
cancel_job $job_id1
@@ -451,4 +432,3 @@
send_user "\nSUCCESS\n"
}
exit $exit_code
-
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test28.7 slurm-llnl-16.05.9/testsuite/expect/test28.7
--- slurm-llnl-16.05.8/testsuite/expect/test28.7 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test28.7 2017-01-31 20:56:34.000000000 +0100
@@ -33,7 +33,6 @@
set test_id "28.7"
set exit_code 0
set array_size 3
-set script "test$test_id\.bash"
set top_array_task_id [expr $array_size - 1]
print_header $test_id
@@ -43,20 +42,11 @@
exit 0
}
-if {[test_select_type_params "MEMORY"]} {
- set job_mem 10
-} else {
- set job_mem 1
-}
-
-exec $bin_rm -f $script
-make_bash_script $script "sleep \$(( ( RANDOM % 10 ) + 1 ))"
-
#
# Submit a job array for first dependency test
#
set job_id 0
-spawn $sbatch -N1 -t1 -a 0-$top_array_task_id -o /dev/null -e /dev/null --mem=${job_mem} $script
+spawn $sbatch -N1 -t1 -a 0-$top_array_task_id -o /dev/null -e /dev/null --wrap "sleep \$(( ( RANDOM % 10 ) + 1 ))"
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
@@ -80,7 +70,7 @@
#
set timeout $max_job_delay
set match_job_state 0
-set srun_pid [spawn $srun -t1 --dependency=afterany:$job_id\_$top_array_task_id --mem=${job_mem} $scontrol show job $job_id\_$top_array_task_id]
+set srun_pid [spawn $srun -t1 --dependency=afterany:$job_id\_$top_array_task_id $scontrol show job $job_id\_$top_array_task_id]
expect {
-re "JobState=COMPLETED|COMPLETING" {
set match_job_state 1
@@ -108,7 +98,7 @@
# Submit a job array for second dependency test
#
set job_id 0
-spawn $sbatch -N1 -t1 -a 0-[expr $array_size - 1] -o /dev/null -e /dev/null --mem=${job_mem} $script
+spawn $sbatch -N1 -t1 -a 0-[expr $array_size - 1] -o /dev/null -e /dev/null --wrap "sleep \$(( ( RANDOM % 10 ) + 1 ))"
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
@@ -132,7 +122,7 @@
#
set timeout $max_job_delay
set match_job_state 0
-set srun_pid [spawn $srun -t1 --dependency=afterany:$job_id --mem=${job_mem} $scontrol show job $job_id]
+set srun_pid [spawn $srun -t1 --dependency=afterany:$job_id $scontrol show job $job_id]
expect {
-re "JobState=COMPLETED|COMPLETING" {
incr match_job_state
@@ -154,7 +144,6 @@
cancel_job $job_id
if {$exit_code == 0} {
- exec $bin_rm -f $script
send_user "\nSUCCESS\n"
}
exit $exit_code
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test3.15 slurm-llnl-16.05.9/testsuite/expect/test3.15
--- slurm-llnl-16.05.8/testsuite/expect/test3.15 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test3.15 2017-01-31 20:56:34.000000000 +0100
@@ -32,7 +32,6 @@
set test_id "3.15"
set exit_code 0
-set script_name "test$test_id.bash"
set license_name "test$test_id"
set resv_name "resv$test_id"
set user_name ""
@@ -57,10 +56,9 @@
}
proc submit_job { license_count } {
- global script_name bin_sleep license_name sbatch number exit_code job_mem
+ global bin_sleep license_name sbatch number exit_code
set job_id 0
- make_bash_script $script_name "$bin_sleep 300"
- spawn $sbatch -n1 -t1 -o /dev/null -L $license_name:$license_count --mem=${job_mem} $script_name
+ spawn $sbatch -n1 -t1 -o /dev/null -L $license_name:$license_count --wrap "$bin_sleep 300"
expect {
-re "Submitted batch job ($number)" {
set job_id $expect_out(1,string)
@@ -187,12 +185,6 @@
exit $exit_code
}
-if {[test_select_type_params "MEMORY"]} {
- set job_mem 10
-} else {
- set job_mem 1
-}
-
spawn $bin_id -un
expect {
-re "($alpha_numeric_under)" {
@@ -378,7 +370,7 @@
reconfigure
if {$exit_code == 0} {
- exec $bin_rm -f $cwd/slurm.conf.orig $script_name
+ exec $bin_rm -f $cwd/slurm.conf.orig
send_user "\nSUCCESS\n"
} else {
send_user "\nFAILURE\n"
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test35.2 slurm-llnl-16.05.9/testsuite/expect/test35.2
--- slurm-llnl-16.05.8/testsuite/expect/test35.2 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test35.2 2017-01-31 20:56:34.000000000 +0100
@@ -42,6 +42,28 @@
set script_use "test$test_id.use.bash"
set tmp_file "test$test_id"
+#
+# get my uid and clear any vestigial triggers
+#
+set uid -1
+spawn $bin_id -u
+expect {
+ -re "($number)" {
+ set uid $expect_out(1,string)
+ exp_continue
+ }
+ eof {
+ wait
+ }
+}
+if {$uid == -1} {
+ send_user "\nCan't get my uid\n"
+ exit 1
+} elseif {$uid == 0} {
+ send_user "\nWARNING: Can't run this test as user root\n"
+ exit 0
+}
+
proc find_bb_jobid { fname bb_jobid } {
global bin_cat
@@ -142,6 +164,9 @@
set exit_code 1
}
+# Wait for purge of buffer to complete
+sleep 10
+
set found 0
spawn $scontrol show burst
expect {
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test5.9 slurm-llnl-16.05.9/testsuite/expect/test5.9
--- slurm-llnl-16.05.8/testsuite/expect/test5.9 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test5.9 2017-01-31 20:56:34.000000000 +0100
@@ -98,6 +98,10 @@
send_user "\nWARNING: This test is incompatible with serial systems\n"
exit $exit_code
}
+if {[string compare [check_accounting_admin_level] "Administrator"]} {
+ send_user "\nThis test can't be run without being an Accounting administrator.\n"
+ exit $exit_code
+}
set available [available_nodes $partition idle]
if {$available < 2} {
send_user "\nWARNING: not enough nodes currently available ($available avail, 2 needed)\n"
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test7.11 slurm-llnl-16.05.9/testsuite/expect/test7.11
--- slurm-llnl-16.05.8/testsuite/expect/test7.11 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test7.11 2017-01-31 20:56:34.000000000 +0100
@@ -69,10 +69,12 @@
#
log_user 0
set config_dir ""
+set ctld_slurm_ver ""
spawn $scontrol show config
expect {
- -re "SLURM_CONF.*= (/.*)/slurm.conf.*SLURM_VERSION" {
+ -re "SLURM_CONF.*= (.*)/slurm.conf.*SLURM_VERSION *= ($float)" {
set config_dir $expect_out(1,string)
+ set ctld_slurm_ver $expect_out(2,string)
exp_continue
}
eof {
@@ -84,6 +86,27 @@
send_user "\nFAILURE: Could not locate slurm.conf directory\n"
exit 1
}
+
+log_user 0
+set loc_slurm_ver ""
+spawn $scontrol -V
+expect {
+ -re "slurm ($float)" {
+ set loc_slurm_ver $expect_out(1,string)
+ exp_continue
+ }
+ eof {
+ wait
+ }
+}
+log_user 1
+
+if {[string compare $ctld_slurm_ver $loc_slurm_ver]} {
+ send_user "\nWARNING: slurmctld ($ctld_slurm_ver) and local Slurm ($loc_slurm_ver) versions are not the same, can not continue.\n"
+ exit 0
+}
+
+
set spank_conf_file ${config_dir}/plugstack.conf
exec $bin_rm -f $orig_spank_conf $new_spank_conf $file_out $spank_out
if {[file exists $spank_conf_file]} {
@@ -120,10 +143,6 @@
}
}
-# Allow enough time for configuration file in NFS to be propagated
-# to all nodes of cluster
-exec sleep 60
-
#
# Test of srun help message
#
diff -Nru slurm-llnl-16.05.8/testsuite/expect/test7.13 slurm-llnl-16.05.9/testsuite/expect/test7.13
--- slurm-llnl-16.05.8/testsuite/expect/test7.13 2017-01-04 22:11:51.000000000 +0100
+++ slurm-llnl-16.05.9/testsuite/expect/test7.13 2017-01-31 20:56:34.000000000 +0100
@@ -166,7 +166,7 @@
}
}
if {$matches != 4} {
- send_user "\nFAILURE: sacct of $job_id failed ($matches != 5)\n"
+ send_user "\nFAILURE: sacct of $job_id failed ($matches != 4)\n"
exit 1
}
}
--- End Message ---