AReUReDi / scripts /peptide_rectify_train.sh
Tong Chen
add files
d2693e0
#!/bin/bash
# Slurm batch script that launches ./peptide/rectify_train.py as a single task
# on one GPU of the dgx-b200 partition.
#
# All paths below are relative, so the job must start in the repository root
# (by default Slurm starts the job in the directory sbatch was invoked from).
# The outs/ directory must already exist at submission time; Slurm does not
# create missing output directories.
#
#SBATCH --job-name=rectify_train # Name of the job
#SBATCH --output=outs/rectify_train.out # Stdout goes to outs/rectify_train.out
#SBATCH --error=outs/rectify_train.err # Stderr goes to outs/rectify_train.err
#SBATCH --partition=dgx-b200 # Queue to submit to
#SBATCH --ntasks=1 # Number of tasks (usually one per process)
#SBATCH --nodes=1
#SBATCH --gpus=1
# NOTE: because --ntasks=1 is given above, --ntasks-per-node only acts as an
# upper bound on tasks per node; a single task is still launched.
#SBATCH --ntasks-per-node=8
#SBATCH --mem-per-gpu=128G
#SBATCH --cpus-per-gpu=8

# Fail on references to unset variables so a misspelled name below cannot
# silently expand to an empty path.
set -u

# Size the OpenMP thread pool to the CPUs actually allocated to the job
# (--cpus-per-gpu=8 with one GPU). A larger hard-coded value such as 64 would
# oversubscribe the 8 allocated cores. Falls back to 8 when run outside Slurm.
export OMP_NUM_THREADS="${SLURM_CPUS_PER_GPU:-8}"

# NCCL / UCX transport settings. The HCA list and interface name are specific
# to the cluster fabric.
# NOTE(review): with a single GPU these are presumably not exercised; confirm
# before relying on them.
# export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=1
export NCCL_IB_ADAPTIVE_ROUTING=1
export NCCL_IB_SL=1
export NCCL_IB_QPS_PER_CONNECTION=2
export NCCL_IB_SPLIT_DATA_ON_QPS=0
export NCCL_IB_HCA=mlx5_15,mlx5_10,mlx5_14,mlx5_13,mlx5_8,mlx5_7,mlx5_9,mlx5_4
export NCCL_SOCKET_IFNAME=bond0
export NCCL_ALGO=RING
export UCX_TLS=rc

# Both dataset splits live under one root; deriving them from a single
# variable keeps the train and validation paths from drifting apart.
data_root=./peptide/rectified_datasets/v3
ckpt_dir=./peptide/ckpt

# Command-line arguments forwarded verbatim to rectify_train.py; their
# meaning is defined by that script.
# NOTE(review): --version 3 resumes from a v2 checkpoint (PepReDi_v2.pt) —
# presumably intentional; confirm.
train_args=(
  --train_dataset_path "${data_root}/train"
  --val_dataset_path "${data_root}/validation"
  --version 3
  --model_dim 512
  --n_heads 8
  --n_layers 6
  --vocab_size 24
  --seq_len 100
  --epochs 50
  --learning_rate 1e-4
  --weight_decay 2e-5
  --label_smoothing 0.0
  --checkpoint_dir "${ckpt_dir}"
  --tc_batches 20
  --tc_k_samples 50
  --resume_from_checkpoint "${ckpt_dir}/PepReDi_v2.pt"
)

# Last command in the script, so its exit status becomes the job's status.
python ./peptide/rectify_train.py "${train_args[@]}"