diff --git a/gpu_multi_process_run.sh b/gpu_multi_process_run.sh new file mode 100644 index 0000000..1726bc1 --- /dev/null +++ b/gpu_multi_process_run.sh @@ -0,0 +1,152 @@ +#! /bin/bash +set -e +set -u +set -o pipefail + +: "${NNODES:?Must set NNODES}" +: "${NODE_RANK:?Must set NODE_RANK}" +: "${JAX_COORDINATOR_PORT:?Must set JAX_COORDINATOR_PORT}" +: "${JAX_COORDINATOR_ADDRESS:?Must set JAX_COORDINATOR_ADDRESS}" +: "${GPUS_PER_NODE:?Must set GPUS_PER_NODE}" +: "${COMMAND:?Must set COMMAND}" + + +export GPUS_PER_NODE=$GPUS_PER_NODE +export JAX_COORDINATOR_PORT=$JAX_COORDINATOR_PORT +export JAX_COORDINATOR_ADDRESS=$JAX_COORDINATOR_ADDRESS + +set_nccl_gpudirect_tcpx_specific_configuration() { + if [[ "$USE_GPUDIRECT" == "tcpx" ]]; then + echo "Using GPUDirect-TCPX" + export NCCL_CROSS_NIC=0 + export NCCL_ALGO=Ring + export NCCL_PROTO=Simple + export NCCL_DEBUG=INFO + export NCCL_NET_GDR_LEVEL=PIX + export NCCL_P2P_PXN_LEVEL=0 + export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV,TUNING,NET,VERSION + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/tcpx/lib64" + export NCCL_GPUDIRECTTCPX_FORCE_ACK=0 + export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 + export NCCL_DYNAMIC_CHUNK_SIZE=524288 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_P2P_PCI_CHUNKSIZE=524288 + export NCCL_P2P_NVL_CHUNKSIZE=1048576 + export NCCL_NSOCKS_PERTHREAD=4 + export NCCL_SOCKET_NTHREADS=1 + export NCCL_MAX_NCHANNELS=12 + export NCCL_MIN_NCHANNELS=12 + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000 + export NCCL_SOCKET_IFNAME=eth0 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,124-139;eth2:22-35,124-139;eth3:74-87,178-191;eth4:74-87,178-191" + export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 + export NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 + export NCCL_NVLS_ENABLE=0 + elif [[ "$USE_GPUDIRECT" == "fastrak" ]]; then + echo "Using GPUDirect-TCPFasTrak" + export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV,TUNING,NET,VERSION + export NCCL_DEBUG=INFO + export NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=0 + export LD_LIBRARY_PATH="/usr/local/fastrak/lib64:${LD_LIBRARY_PATH}" + export NCCL_FASTRAK_CTRL_DEV=eth0 + export NCCL_FASTRAK_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8 + export NCCL_SOCKET_IFNAME=eth0 + export NCCL_CROSS_NIC=0 + export NCCL_ALGO=Ring + export NCCL_PROTO=Simple + export NCCL_MAX_NCHANNELS=16 + export NCCL_MIN_NCHANNELS=16 + export NCCL_SOCKET_NTHREADS=4 + export NCCL_DYNAMIC_CHUNK_SIZE=524288 + export NCCL_DYNAMIC_CHUNK_SIZE=524288 + export NCCL_P2P_NET_CHUNKSIZE=524288 + export NCCL_P2P_PCI_CHUNKSIZE=524288 + export NCCL_P2P_NVL_CHUNKSIZE=1048576 + export NCCL_FASTRAK_NUM_FLOWS=8 + export NCCL_FASTRAK_FLOWS_PER_GROUP=2 + export NCCL_BUFFSIZE=4194304 + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export NCCL_NET_GDR_LEVEL=PIX + else + echo "NOT using GPUDirect" + fi +} + +echo "LD_LIBRARY_PATH ${LD_LIBRARY_PATH}" + +set_nccl_gpudirect_tcpx_specific_configuration + +wait_all_success_or_exit() { + # https://www.baeldung.com/linux/background-process-get-exit-code + local pids=("$@") + while [[ ${#pids[@]} -ne 0 ]]; do + all_success="true" + for pid in "${pids[@]}"; do + code=$(non_blocking_wait "$pid") + if [[ $code -ne 127 ]]; then + if [[ $code -ne 0 ]]; then + echo "PID $pid failed with exit code $code" + exit "$code" + fi + else + all_success="false" + fi + done + if [[ $all_success == "true" ]]; then + echo "All pids succeeded" + break + fi + sleep 5 + done +} +non_blocking_wait() { + # https://www.baeldung.com/linux/background-process-get-exit-code + local pid=$1 + local code=127 # special code to indicate not-finished + if [[ ! -d "/proc/$pid" ]]; then + wait "$pid" + code=$? + fi + echo $code +} + +resolve_coordinator_ip() { + local lookup_attempt=1 + local max_coordinator_lookups=500 + local coordinator_found=false + local coordinator_ip_address="" + + echo "Coordinator Address $JAX_COORDINATOR_ADDRESS" + + while [[ "$coordinator_found" = false && $lookup_attempt -le $max_coordinator_lookups ]]; do + coordinator_ip_address=$(nslookup "$JAX_COORDINATOR_ADDRESS" 2>/dev/null | awk '/^Address: / { print $2 }' | head -n 1) + if [[ -n "$coordinator_ip_address" ]]; then + coordinator_found=true + echo "Coordinator IP address: $coordinator_ip_address" + export JAX_COORDINATOR_IP=$coordinator_ip_address + return 0 + else + echo "Failed to recognize coordinator address $JAX_COORDINATOR_ADDRESS on attempt $lookup_attempt, retrying..." + ((lookup_attempt++)) + sleep 1 + fi + done + + if [[ "$coordinator_found" = false ]]; then + echo "Failed to resolve coordinator address after $max_coordinator_lookups attempts." + return 1 + fi +} + +# Resolving coordinator IP +set +e +resolve_coordinator_ip +set -e + +PIDS=() +${COMMAND} & +PID=$! +PIDS+=($PID) + +wait_all_success_or_exit "${PIDS[@]}"