#!/bin/bash
#
# Slurm batch script: runs ./peptide/new_coupling.py on a single GPU of the
# dgx-b200 partition.
#
# Usage:   sbatch <this-script>   (submit from the repository root)
#
# Notes:
#   * All relative paths (outs/, ./peptide/...) are resolved against the
#     directory the job is submitted from.
#   * The outs/ directory must exist before submission; Slurm does not create
#     missing directories for --output/--error.
#   * The #SBATCH directives must stay ahead of the first executable command,
#     otherwise sbatch stops parsing them.
#
#SBATCH --job-name=new_coupling          # Name of the job
#SBATCH --output=outs/new_coupling.out   # Stdout file (fixed name, reused by every run)
#SBATCH --error=outs/new_coupling.err    # Stderr file (fixed name, reused by every run)
#SBATCH --partition=dgx-b200             # Queue to submit to
#SBATCH --ntasks=1                       # Number of tasks (usually one per process)
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --ntasks-per-node=8              # Only an upper bound: --ntasks=1 takes precedence
#SBATCH --mem-per-gpu=128G
#SBATCH --cpus-per-gpu=8

# Size the OpenMP thread pool to the CPUs actually granted to the job.
# Slurm exports SLURM_CPUS_PER_GPU when --cpus-per-gpu is given; the fallback
# mirrors the directive above so the script also works outside of sbatch.
# (A larger hard-coded value would oversubscribe the 8 allocated cores.)
export OMP_NUM_THREADS="${SLURM_CPUS_PER_GPU:-8}"

# Uncomment for verbose NCCL diagnostics in the job's log output.
# export NCCL_DEBUG=INFO

# NCCL / UCX communication settings.
# NOTE(review): these are tuned for this cluster's fabric (HCA names, bond0
# interface); with --gpus=1 they are presumably only relevant if the job is
# scaled up -- confirm before changing.
export NCCL_NVLS_ENABLE=1
export NCCL_IB_ADAPTIVE_ROUTING=1
export NCCL_IB_SL=1
export NCCL_IB_QPS_PER_CONNECTION=2
export NCCL_IB_SPLIT_DATA_ON_QPS=0
export NCCL_IB_HCA=mlx5_15,mlx5_10,mlx5_14,mlx5_13,mlx5_8,mlx5_7,mlx5_9,mlx5_4
export NCCL_SOCKET_IFNAME=bond0
export NCCL_ALGO=RING
export UCX_TLS=rc

# Launch the job. This is the last command, so its exit status becomes the
# exit status of the batch job.
python ./peptide/new_coupling.py \
  --checkpoint ./peptide/ckpt/PepReDi_v2.pt \
  --version 3 \
  --gen_steps 16