Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gpu_multi_process_run.sh #98

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions gpu_multi_process_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/bin/bash
#
# Runs ${COMMAND} as a background process on this node and waits for it.
#
# Required environment (the script aborts with "Must set <NAME>" otherwise):
#   NNODES, NODE_RANK, JAX_COORDINATOR_PORT, JAX_COORDINATOR_ADDRESS,
#   GPUS_PER_NODE, COMMAND
#
# Strict mode: stop on errors, on unset variables, and on a failure in any
# stage of a pipeline.
set -euo pipefail

# ${VAR:?msg} terminates the script when VAR is unset or empty.
: "${NNODES:?Must set NNODES}"
: "${NODE_RANK:?Must set NODE_RANK}"
: "${JAX_COORDINATOR_PORT:?Must set JAX_COORDINATOR_PORT}"
: "${JAX_COORDINATOR_ADDRESS:?Must set JAX_COORDINATOR_ADDRESS}"
: "${GPUS_PER_NODE:?Must set GPUS_PER_NODE}"
: "${COMMAND:?Must set COMMAND}"

# Mark these for export so child processes inherit them even when the caller
# supplied them as plain (non-exported) shell variables.
export GPUS_PER_NODE JAX_COORDINATOR_PORT JAX_COORDINATOR_ADDRESS

#######################################
# Export the NCCL environment for the GPUDirect transport selected by
# USE_GPUDIRECT.
# Globals:
#   USE_GPUDIRECT    (read, optional) "tcpx" or "fastrak". Any other value,
#                    including unset or empty, leaves the environment alone.
#   LD_LIBRARY_PATH  (read, exported) gains the transport's lib64 directory;
#                    may be unset or empty on entry.
#   NCCL_*, CUDA_VISIBLE_DEVICES (exported) per-transport settings below.
# Outputs:
#   One line on stdout naming the selected mode.
#######################################
set_nccl_gpudirect_tcpx_specific_configuration() {
  # Default to empty so an unset USE_GPUDIRECT reaches the fallback branch
  # instead of aborting the script under `set -u`.
  local mode="${USE_GPUDIRECT:-}"

  case "$mode" in
    tcpx)
      echo "Using GPUDirect-TCPX"
      export NCCL_CROSS_NIC=0
      export NCCL_ALGO=Ring
      export NCCL_PROTO=Simple
      export NCCL_DEBUG=INFO
      export NCCL_NET_GDR_LEVEL=PIX
      export NCCL_P2P_PXN_LEVEL=0
      export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV,TUNING,NET,VERSION
      # Append the TCPX directory. The ${VAR:+...} form avoids both an
      # "unbound variable" error and a stray leading ':' (which the loader
      # would treat as the current directory) when the path starts out empty.
      export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/tcpx/lib64"
      export NCCL_GPUDIRECTTCPX_FORCE_ACK=0
      export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000
      export NCCL_DYNAMIC_CHUNK_SIZE=524288
      export NCCL_P2P_NET_CHUNKSIZE=524288
      export NCCL_P2P_PCI_CHUNKSIZE=524288
      export NCCL_P2P_NVL_CHUNKSIZE=1048576
      export NCCL_NSOCKS_PERTHREAD=4
      export NCCL_SOCKET_NTHREADS=1
      export NCCL_MAX_NCHANNELS=12
      export NCCL_MIN_NCHANNELS=12
      export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000
      export NCCL_SOCKET_IFNAME=eth0
      export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
      export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,124-139;eth2:22-35,124-139;eth3:74-87,178-191;eth4:74-87,178-191"
      export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4
      export NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0
      export NCCL_NVLS_ENABLE=0
      ;;
    fastrak)
      echo "Using GPUDirect-TCPFasTrak"
      export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV,TUNING,NET,VERSION
      export NCCL_DEBUG=INFO
      export NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=0
      # Prepend the FasTrak directory; same empty-path handling as above.
      export LD_LIBRARY_PATH="/usr/local/fastrak/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
      export NCCL_FASTRAK_CTRL_DEV=eth0
      export NCCL_FASTRAK_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
      export NCCL_SOCKET_IFNAME=eth0
      export NCCL_CROSS_NIC=0
      export NCCL_ALGO=Ring
      export NCCL_PROTO=Simple
      export NCCL_MAX_NCHANNELS=16
      export NCCL_MIN_NCHANNELS=16
      export NCCL_SOCKET_NTHREADS=4
      export NCCL_DYNAMIC_CHUNK_SIZE=524288
      export NCCL_P2P_NET_CHUNKSIZE=524288
      export NCCL_P2P_PCI_CHUNKSIZE=524288
      export NCCL_P2P_NVL_CHUNKSIZE=1048576
      export NCCL_FASTRAK_NUM_FLOWS=8
      export NCCL_FASTRAK_FLOWS_PER_GROUP=2
      export NCCL_BUFFSIZE=4194304
      export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
      export NCCL_NET_GDR_LEVEL=PIX
      ;;
    *)
      echo "NOT using GPUDirect"
      ;;
  esac
}

# Log the library path inherited from the caller. The `:-` default keeps this
# line from aborting the script under `set -u` when LD_LIBRARY_PATH is unset.
echo "LD_LIBRARY_PATH ${LD_LIBRARY_PATH:-}"

# Export the NCCL settings for the transport selected by USE_GPUDIRECT.
set_nccl_gpudirect_tcpx_specific_configuration

#######################################
# Poll a set of background children until all have exited successfully, and
# terminate the script as soon as one of them fails.
# Based on https://www.baeldung.com/linux/background-process-get-exit-code
# Arguments:
#   $@ - PIDs of background jobs started by the calling shell.
# Outputs:
#   "All pids succeeded" once every PID has exited with status 0, or
#   "PID <pid> failed with exit code <code>" for the first failure seen.
# Returns:
#   0 when every PID succeeded (or none were given). On a failure the shell
#   exits (not returns) with the failing child's status.
#######################################
wait_all_success_or_exit() {
  local pending=("$@")
  local still_running
  local pid
  local code

  while [[ ${#pending[@]} -ne 0 ]]; do
    still_running=()
    for pid in "${pending[@]}"; do
      # A /proc entry means the child has not been reaped yet; check it again
      # on the next pass.
      if [[ -d "/proc/$pid" ]]; then
        still_running+=("$pid")
        continue
      fi
      # Collect the status in the current shell, which owns the job. The
      # "still running" state is tracked separately, so a child that really
      # exits with 127 (e.g. command not found) is reported as a failure
      # rather than being polled forever. `|| code=$?` keeps errexit from
      # firing before the status is captured.
      code=0
      wait "$pid" || code=$?
      if [[ $code -ne 0 ]]; then
        echo "PID $pid failed with exit code $code"
        exit "$code"
      fi
    done
    if [[ ${#still_running[@]} -eq 0 ]]; then
      echo "All pids succeeded"
      break
    fi
    # Only unfinished PIDs are polled again: a PID that has already been
    # waited on is forgotten by the shell and must not be waited on twice.
    pending=("${still_running[@]}")
    sleep 5
  done
}
#######################################
# Report the exit status of a background child without blocking.
# Based on https://www.baeldung.com/linux/background-process-get-exit-code
# Arguments:
#   $1 - PID to inspect.
# Outputs:
#   Writes one integer to stdout: 127 while /proc/<pid> still exists,
#   otherwise the status returned by `wait <pid>`.
# NOTE: 127 is also what `wait` returns for an unknown PID and what a child
# reports when its command was not found, so callers cannot tell those cases
# apart from "still running".
#######################################
non_blocking_wait() {
  local pid=$1
  local code=127 # special code to indicate not-finished
  if [[ ! -d "/proc/$pid" ]]; then
    # Capture the status through `||` so a non-zero result does not trip
    # errexit before it can be printed when this is called directly.
    code=0
    wait "$pid" || code=$?
  fi
  echo "$code"
}

#######################################
# Look up JAX_COORDINATOR_ADDRESS with nslookup, retrying once per second,
# and export the first address found as JAX_COORDINATOR_IP.
# Globals:
#   JAX_COORDINATOR_ADDRESS (read), JAX_COORDINATOR_IP (exported on success)
# Outputs:
#   Progress and result messages on stdout.
# Returns:
#   0 once an address is found, 1 after 500 unsuccessful attempts.
#######################################
resolve_coordinator_ip() {
  local max_tries=500
  local try
  local resolved_ip=""

  echo "Coordinator Address $JAX_COORDINATOR_ADDRESS"

  for (( try = 1; try <= max_tries; try++ )); do
    # Take the second field of the first line that begins with "Address: ".
    resolved_ip=$(nslookup "$JAX_COORDINATOR_ADDRESS" 2>/dev/null | awk '/^Address: / { print $2 }' | head -n 1)

    if [[ -z "$resolved_ip" ]]; then
      echo "Failed to recognize coordinator address $JAX_COORDINATOR_ADDRESS on attempt $try, retrying..."
      sleep 1
      continue
    fi

    echo "Coordinator IP address: $resolved_ip"
    export JAX_COORDINATOR_IP=$resolved_ip
    return 0
  done

  # Reaching this point means every attempt came back empty.
  echo "Failed to resolve coordinator address after $max_tries attempts."
  return 1
}

# Resolve the coordinator IP. The lookup's failure status is deliberately
# discarded here, so the launch below proceeds even when no address was found.
resolve_coordinator_ip || true

# COMMAND is expanded unquoted on purpose: the shell word-splits it into the
# program name and its arguments before starting it in the background.
${COMMAND} &
PID=$!
PIDS=("$PID")

# Wait for the launched process and propagate a failure as the exit status.
wait_all_success_or_exit "${PIDS[@]}"
Loading