Weisen Pan 4ec0a23e73 Edge Federated Learning for Improved Training Efficiency
Change-Id: Ic4e43992e1674946cb69e0221659b0261259196c
2024-09-18 18:39:43 -07:00

61 lines
2.4 KiB
Bash

# -*- coding: utf-8 -*-
# @Author: Weisen Pan
# Bring the "module" command into this (non-login) shell session
source /etc/profile.d/modules.sh
# Load the toolchain and GPU/communication stack, in order:
#   gcc 11.2.0        - compiler
#   openmpi 4.1.3     - distributed computing
#   cuda 11.5.2       - GPU acceleration
#   cudnn 8.3.3       - deep learning primitives
#   nccl 2.11.4-1     - multi-GPU communication
#   python 3.10.4     - interpreter
for mod in \
  gcc/11.2.0 \
  openmpi/4.1.3 \
  cuda/11.5/11.5.2 \
  cudnn/8.3/8.3.3 \
  nccl/2.11/2.11.4-1 \
  python/3.10/3.10.4; do
  module load "$mod"
done
# Activate the Python virtual environment for PyTorch 1.11 + Horovod
source ~/venv/pytorch1.11+horovod/bin/activate
# Create and clean the log directory for this job.
# ${JOB_NAME:?} / ${JOB_ID:?} abort the script with a message if the
# scheduler did not export them — otherwise the later "rm -rf" could
# expand to the bare records/_ path and delete the wrong directory.
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME:?}_${JOB_ID:?}"
# Remove any existing log directory to avoid conflicts
rm -rf -- "${LOG_PATH}"
# Create a fresh log directory for the current job
mkdir -p -- "${LOG_PATH}"
# Prepare the local (node-scratch) dataset storage
DATA_PATH="${SGE_LOCALDIR:?}/${JOB_ID}/"
# Copy the dataset to node-local storage to improve I/O performance
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_PATH}"
# Change to the working directory of the federated training scripts;
# abort instead of launching the training step from the wrong directory
cd EdgeFLite || exit 1
# Execute the federated training process with the specified configuration.
# NOTE(fix): the original put "# comments" after each trailing "\" — in
# bash, "\ " escapes the SPACE (not the newline), so the command ended
# after the first flag and every following "--flag \" line ran as a
# separate, failing command. All flag explanations now live up here:
#   --is_fed=1              enable federated training mode
#   --fixed_cluster=0       do not fix clusters between rounds
#   --split_factor=1        model split factor
#   --num_clusters=20       number of clusters in federated training
#   --num_selected=20       clusters selected per round
#   --arch                  wide ResNet-50_2 architecture
#   --dataset               Pill Base dataset
#   --num_classes=98        number of classes in the dataset
#   --is_single_branch=0    enable multi-branch training
#   --is_amp=0              disable automatic mixed precision
#   --num_rounds=350        federated communication rounds
#   --fed_epochs=1          local epochs per federated round
#   --batch_size=32         training batch size
#   --crop_size=224         image crop size
#   --spid                  unique session ID used for logging
#   --data                  path to the locally staged dataset
python run_gkt.py \
  --is_fed=1 \
  --fixed_cluster=0 \
  --split_factor=1 \
  --num_clusters=20 \
  --num_selected=20 \
  --arch="wide_resnetsl50_2" \
  --dataset="pill_base" \
  --num_classes=98 \
  --is_single_branch=0 \
  --is_amp=0 \
  --num_rounds=350 \
  --fed_epochs=1 \
  --batch_size=32 \
  --crop_size=224 \
  --spid="FGKT_W502_20c_350r" \
  --data="${DATA_PATH}"