#!/bin/bash
#
# SLURM batch job: launch ./peptide/rectify_train.py on one GPU of the
# dgx-b200 partition.
#
# Usage: sbatch <this script>, submitted from the repository root -- every path
# below is relative to the submission directory.
# The outs/ directory must already exist at submission time: SLURM opens the
# --output/--error files itself, before any line of this script runs.
#
#SBATCH --job-name=rectify_train          # Name of the job
#SBATCH --output=outs/rectify_train.out   # Stdout goes to outs/rectify_train.out
#SBATCH --error=outs/rectify_train.err    # Stderr goes to outs/rectify_train.err
#SBATCH --partition=dgx-b200              # Queue to submit to
#SBATCH --ntasks=1                        # Number of tasks (usually one per process)
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --ntasks-per-node=8               # NOTE(review): with --ntasks=1 this only acts as an upper bound -- confirm it is intended
#SBATCH --mem-per-gpu=128G
#SBATCH --cpus-per-gpu=8

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline, so the job does not report success after
# a broken launch.
set -euo pipefail

# NOTE(review): the job requests 8 CPUs per GPU and a single GPU, so 64 OpenMP
# threads will presumably oversubscribe the allocation -- confirm this value.
export OMP_NUM_THREADS=64

# NCCL / UCX communication tuning.
# NOTE(review): this job requests a single GPU, so most of these settings look
# like they are carried over from a multi-GPU configuration -- confirm.
# export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=1
export NCCL_IB_ADAPTIVE_ROUTING=1
export NCCL_IB_SL=1
export NCCL_IB_QPS_PER_CONNECTION=2
export NCCL_IB_SPLIT_DATA_ON_QPS=0
export NCCL_IB_HCA=mlx5_15,mlx5_10,mlx5_14,mlx5_13,mlx5_8,mlx5_7,mlx5_9,mlx5_4
export NCCL_SOCKET_IFNAME=bond0
export NCCL_ALGO=RING
export UCX_TLS=rc

# Train and validation splits share one root, so the two paths cannot drift
# apart through a typo in either of them.
readonly DATASET_ROOT=./peptide/rectified_datasets/v3
readonly CHECKPOINT_DIR=./peptide/ckpt

# Arguments are kept in an array so each group can carry a comment and every
# value is passed to the trainer as exactly one word.
train_args=(
  # Data
  --train_dataset_path "${DATASET_ROOT}/train"
  --val_dataset_path "${DATASET_ROOT}/validation"
  --version 3

  # Model
  --model_dim 512
  --n_heads 8
  --n_layers 6
  --vocab_size 24
  --seq_len 100

  # Optimisation
  --epochs 50
  --learning_rate 1e-4
  --weight_decay 2e-5
  --label_smoothing 0.0

  # Checkpointing
  --checkpoint_dir "${CHECKPOINT_DIR}"
  --tc_batches 20
  --tc_k_samples 50
  # NOTE(review): resumes from a checkpoint named "v2" while --version is 3;
  # presumably an intentional warm start -- confirm.
  --resume_from_checkpoint "${CHECKPOINT_DIR}/PepReDi_v2.pt"
)

python ./peptide/rectify_train.py "${train_args[@]}"