# Memory registration and cache monitors

High-performance applications use Remote Direct Memory Access (RDMA), in which the networking stack reads and writes data directly into application memory, bypassing normal operating system protection mechanisms.
To orchestrate the transfer of data from application space to the NIC, memory ranges are “registered” with an operation that translates virtual addresses into physical addresses and pins the relevant pages. HPE Slingshot uses Libfabric as the basis for its communication libraries, and Libfabric manages the registration of RDMA memory regions.
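
Registration is exposed through Libfabric's memory registration API. The sketch below is a minimal illustration rather than Slingshot-specific code: it assumes a fabric domain has already been opened with `fi_domain()`, and the helper name, buffer, length, and access flags are placeholders.

```c
#include <stdint.h>
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Register a buffer for RDMA. Assumes `domain` was opened earlier with
 * fi_domain(); returns the registration handle through *mr. */
static int register_buffer(struct fid_domain *domain, void *buf,
                           size_t len, struct fid_mr **mr)
{
    /* Illustrative access flags: allow local and remote reads/writes
     * against this region. */
    uint64_t access = FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE;

    /* fi_mr_reg() translates and pins the pages backing [buf, buf+len)
     * and produces a memory region object usable in data transfers. */
    int ret = fi_mr_reg(domain, buf, len, access,
                        0 /* offset */, 0 /* requested key */,
                        0 /* flags */, mr, NULL /* context */);
    if (ret)
        fprintf(stderr, "fi_mr_reg: %s\n", fi_strerror(-ret));
    return ret;
}
```

When the memory registration cache described below is enabled, a later transfer that touches the same range can reuse this work instead of repeating the translation and pinning.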

On top of its RDMA transfer capabilities, Libfabric implements a “memory registration cache.” This functionality provides an additional layer of performance by allowing the RDMA hardware to reuse previous registrations and avoid the associated computational cost.
This is especially effective for transfers which repeatedly use the same pieces of memory and for applications which have a bounded number of transfer locations.

Caching of memory registrations (and the concomitant lazy deregistration) is not a panacea: applications which do not reuse memory locations for data transfer can needlessly incur cache management overhead and experience degraded performance.
Similarly, applications which attempt to register and use large numbers of memory regions may exceed the (finite) capacity of the cache and inadvertently encounter cache thrashing. In these cases, the appropriate remedy may be to either disable the cache or increase its maximum size to match the application’s I/O patterns. Each of these remedies is available via Libfabric environment variables.
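
For reference, the generic Libfabric controls for these remedies are the `FI_MR_CACHE_MONITOR`, `FI_MR_CACHE_MAX_COUNT`, and `FI_MR_CACHE_MAX_SIZE` environment variables documented in the upstream `fi_mr` man page (`fi_info -e` lists the environment variables a given installation recognizes). They are normally exported in the job environment; the sketch below shows the programmatic equivalent, with an illustrative helper name and placeholder values that are assumptions, not HPE recommendations.

```c
#define _POSIX_C_SOURCE 200112L
#include <stdlib.h>

/* Sketch: tune (or disable) the Libfabric memory registration cache via
 * environment variables. These must be set before the first Libfabric
 * call (for example fi_getinfo()) so the library sees them during
 * initialization. The values below are placeholders. */
static void configure_mr_cache(int disable_cache)
{
    if (disable_cache) {
        /* For workloads that never reuse registered memory ranges,
         * turn the cache off entirely. */
        setenv("FI_MR_CACHE_MONITOR", "disabled", 1);
        return;
    }

    /* Otherwise raise the cache limits to match an application that
     * registers many regions (example values only). */
    setenv("FI_MR_CACHE_MAX_COUNT", "16384", 1);       /* max cached registrations */
    setenv("FI_MR_CACHE_MAX_SIZE", "17179869184", 1);  /* max cached bytes (16 GiB) */
}
```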

The memory registration cache requires an associated entity to keep the cached data accurate. The memory registration cache monitor watches the memory map for changes that require it to invalidate the corresponding cached memory registration entries. There are several different techniques which can be used to detect changes to the memory map, and a given technique may or may not be effective for a specific application and its memory allocation strategies. Libfabric therefore supports several memory registration cache monitors.

Effective monitoring of memory map changes is crucial for proper memory registration cache functionality. Failure to detect changes can result in data transfer to physical memory which is now mapped elsewhere in the process, or even mapped into another process. The result is corruption.
The corruption can manifest as various execution failures: hangs due to smashed state, slow or stalled execution from excessive retries because values being watched are never updated, or outright failure of the data transfer.

The traditional memory monitors provided with Libfabric are `userfaultfd` and `memhooks`.

- **userfaultfd:** A Linux kernel service which gives user-space applications notifications about memory mapping changes via a well-known file descriptor. `userfaultfd` operates at the page level and allows applications to monitor changes to all writable pages within the process’s virtual address space. The descriptor is queried, and address ranges which match the `userfaultfd` events are purged from the memory registration cache. (A sketch of this kernel interface appears after this list.)
- **memhooks:** A user-space subsystem distributed as part of Libfabric which traps library calls for memory allocation and deallocation within an application. When an application allocates memory, `memhooks` tracks it; when deallocation occurs, `memhooks` informs the memory registration cache to purge any corresponding entries. `memhooks` monitors memory which has been dynamically allocated during the execution of the application.
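
To make the `userfaultfd` style of monitoring concrete, the sketch below shows the underlying kernel interface only: obtaining the descriptor, opting into unmap/remove/remap events, and reading one event. It is an illustration of the kernel API, not Libfabric's actual monitor implementation; a real monitor would also register the address ranges it cares about and service the descriptor from a polling loop.

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    /* Obtain the userfaultfd descriptor. */
    int uffd = (int)syscall(SYS_userfaultfd, O_CLOEXEC);
    if (uffd < 0) {
        perror("userfaultfd");
        return 1;
    }

    /* Negotiate the API version and ask for events about unmapped,
     * removed, and remapped ranges (the changes a registration cache
     * cares about). */
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_EVENT_UNMAP |
                    UFFD_FEATURE_EVENT_REMOVE |
                    UFFD_FEATURE_EVENT_REMAP,
    };
    if (ioctl(uffd, UFFDIO_API, &api) < 0) {
        perror("UFFDIO_API");
        return 1;
    }

    /* A real monitor would register address ranges of interest with
     * UFFDIO_REGISTER and service the descriptor from a polling loop;
     * here we show the shape of one blocking read of an event. */
    struct uffd_msg msg;
    ssize_t n = read(uffd, &msg, sizeof(msg));
    if (n == (ssize_t)sizeof(msg) && msg.event == UFFD_EVENT_UNMAP) {
        /* The registration cache would purge entries overlapping
         * this address range. */
        printf("unmap event: [0x%llx, 0x%llx)\n",
               (unsigned long long)msg.arg.remove.start,
               (unsigned long long)msg.arg.remove.end);
    }

    close(uffd);
    return 0;
}
```

Because the events arrive on a file descriptor after the fact, the notification is inherently asynchronous, which is the root of the reuse-of-the-same-address-range limitation discussed below.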

Each of the traditional monitors has advantages and disadvantages. `memhooks` is synchronous with the application but cannot monitor stack or static allocations, and its ability to monitor effectively can depend on load order, linker directives, and other application-specific attributes which affect the trapping mechanisms.
`userfaultfd` can monitor any page-aligned writable memory but cannot provide synchronous notification of memory changes; this means that allocating, freeing, and then reallocating the same address range cannot be resolved reliably and is error prone.
It also cannot monitor memory that is not page-aligned, which is common in some HPC applications (specifically SHMEM).

By default, HPE Slingshot uses the `memhooks` monitor unless set otherwise with the appropriate Libfabric environment variable.
HPE also advises selecting `userfaultfd` for applications that use the NCCL or RCCL collectives libraries, because they can hang at scale under `memhooks`.

To overcome many of the limitations described previously and to avoid the need for per-application configuration, HPE introduced `kdreg2` as a third memory registration cache monitor. `kdreg2` is provided as a Linux kernel module and uses an open-source licensing model.
As of the date of this note, it ships in the HPE Slingshot Host Software distribution as an optional installation. (Future releases may install it by default, and HPE expects the HPE Slingshot NIC Libfabric provider eventually to select `kdreg2` by default instead of `memhooks`.)

`kdreg2` uses kernel mechanisms to monitor mapping changes and provides synchronous notification to the memory registration cache. It can report changes at the byte level to any memory within the application’s virtual address space. Unlike `memhooks`, it can monitor stack and heap memory, and unlike `userfaultfd`, it provides synchronous notification of changes and can monitor partial pages.

HPE knows of no cases where `kdreg2` fails to detect memory mapping changes and thereby allows misdirected RDMA transfers.
On the contrary, one of the primary goals of `kdreg2` is to extend the performance advantages of memory registration caching to applications that fail with both `memhooks` and `userfaultfd`.
`kdreg2` has been successfully deployed, with improved performance, for some weather forecasting codes which would otherwise fail under the traditional monitors.

There can be performance differences between the memory registration cache monitors, as described above. HPE has not characterized a broad range of applications to quantify the performance differences among `kdreg2`, `memhooks`, and `userfaultfd`.
In all cases, however, successful execution with the memory registration cache is substantially faster than execution without caching.
`kdreg2` is HPE’s solution to allow more applications to enjoy the benefits of memory registration caching.

The size of the memory registration cache is one of the most important parameters affecting performance when caching is employed. Since the primary users of Libfabric are communication libraries such as SHMEM and MPI, users may not be aware of the presence of Libfabric, nor of its configuration via environment variables.
The default values are set relatively low to accommodate development systems and non-supercomputer environments.
HPE communication libraries, such as Cray MPI, increase the size of the cache by default.

In summary, `kdreg2` is available as an additional memory registration cache monitor that can enable applications that otherwise could not use memory registration caching to achieve its performance advantage. For sites that run a mix of HPC applications under the default `memhooks` while setting NCCL and RCCL applications to `userfaultfd`, making `kdreg2` the default may simplify operations by eliminating this per-application setting.

Hewlett Packard Enterprise (HPE) recommends that users do the following:

1. Install `kdreg2` in the host images. See "Install `kdreg2`" in the _HPE Slingshot Host Software Installation and Configuration Guide_.
2. Set the default memory monitor to `kdreg2` using the Libfabric environment variable (see the sketch after this list).
3. Increase the Libfabric environment variables for memory registration cache size if indicated, especially for applications that are not using Cray MPI.
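
As a concrete illustration of recommendations 2 and 3, the sketch below sets the generic Libfabric monitor-selection and cache-size variables programmatically before Libfabric is initialized. The variable names come from the upstream Libfabric `fi_mr` documentation; whether `kdreg2` is an accepted monitor name depends on the installed Libfabric version and on `kdreg2` being present in the host image, and the size value is a placeholder, so verify both against your site's documentation. In practice these variables are usually exported in a job script or system-wide profile rather than set in application code.

```c
#define _POSIX_C_SOURCE 200112L
#include <stdlib.h>

/* Sketch: make kdreg2 the memory registration cache monitor and raise
 * the cache entry limit. Must run before the first Libfabric call;
 * accepted monitor names (for example "memhooks", "uffd", "kdreg2",
 * "disabled") depend on the installed Libfabric version. */
static void apply_recommended_mr_settings(void)
{
    setenv("FI_MR_CACHE_MONITOR", "kdreg2", 1);   /* recommendation 2 */
    setenv("FI_MR_CACHE_MAX_COUNT", "16384", 1);  /* recommendation 3, example value */
}
```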