#!/usr/bin/env bash
# File: use-case-and-architecture/EdgeFLite/scripts/EdgeFLite_R110_100c_650r.sh
# Author: Weisen Pan (commit 4ec0a23e73 — "Edge Federated Learning for Improved Training Efficiency")
# Change-Id: Ic4e43992e1674946cb69e0221659b0261259196c
# Date: 2024-09-18 18:39:43 -07:00
# -*- coding: utf-8 -*-
# @Author: Weisen Pan
# Environment setup
# Bring the toolchain required by the training job into this shell via the
# Environment Modules system, then activate the Python virtual environment
# that carries the project's package set (PyTorch 1.11 + Horovod).
source /etc/profile.d/modules.sh # Make the `module` command available in this non-login shell
# Toolchain stack: compiler, MPI, CUDA libraries, and the Python runtime.
# Loaded one at a time, in the same order as before, since later modules may
# depend on environment set up by earlier ones.
for toolchain_module in \
    gcc/11.2.0 \
    openmpi/4.1.3 \
    cuda/11.5/11.5.2 \
    cudnn/8.3/8.3.3 \
    nccl/2.11/2.11.4-1 \
    python/3.10/3.10.4; do
  module load "${toolchain_module}"
done
# Activate the virtual environment containing the required Python packages
source ~/venv/pytorch1.11+horovod/bin/activate
# Configure log directory
# Creates the per-job log directory. JOB_NAME and JOB_ID are expected from the
# scheduler (SGE) environment; ${VAR:?} aborts with a clear message if either
# is missing instead of silently producing a mangled path like ".../records/_".
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME:?JOB_NAME must be set}_${JOB_ID:?JOB_ID must be set}"
mkdir -p "${LOG_PATH}" # Quoted: protects against spaces/globs in the expansion (SC2086)
# Prepare dataset directory
# Stages the CIFAR-100 dataset onto node-local scratch space for fast I/O.
TEMP_DATA_PATH="${SGE_LOCALDIR:?SGE_LOCALDIR must be set}/${JOB_ID}/" # Node-local scratch path for this job
# Abort if staging fails — otherwise training would start against missing data.
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${TEMP_DATA_PATH}" \
  || { echo "ERROR: failed to stage dataset into ${TEMP_DATA_PATH}" >&2; exit 1; }
# Change to project directory
# `cd` is checked: running the trainer from the wrong directory would fail in
# confusing ways (relative paths, missing train_EdgeFLite.py).
cd EdgeFLite || { echo "ERROR: cannot cd into EdgeFLite" >&2; exit 1; }
# Execute training script
# BUG FIX: the original placed comments after the trailing backslash
# ("--flag \ # comment"). In bash, "\ " escapes the space (not the newline),
# so the "#" terminated the command: python received only --is_fed=1 and every
# following "--flag" line ran as a separate, failing command. Comments now live
# here, and each continuation line ends in a bare backslash.
#
# Flag reference:
#   --is_fed=1           enable federated learning mode
#   --fixed_cluster=0    do not use a fixed cluster configuration
#   --split_factor=4     data split factor for federated learning
#   --num_clusters=25    number of clusters
#   --num_selected=25    clusters selected per round (all 25)
#   --arch               model architecture: resnet_model_110sl
#   --dataset            CIFAR-100
#   --num_classes=100    output classes (100 for CIFAR-100)
#   --is_single_branch=0 multi-branch training mode
#   --is_amp=0           automatic mixed precision disabled
#   --num_rounds=650     total federated rounds
#   --fed_epochs=1       local epochs per round
#   --spid               session/process ID for this job
#   --data               dataset location (node-local staging directory)
python train_EdgeFLite.py \
  --is_fed=1 \
  --fixed_cluster=0 \
  --split_factor=4 \
  --num_clusters=25 \
  --num_selected=25 \
  --arch="resnet_model_110sl" \
  --dataset="cifar100" \
  --num_classes=100 \
  --is_single_branch=0 \
  --is_amp=0 \
  --num_rounds=650 \
  --fed_epochs=1 \
  --spid="EdgeFLite_R110_100c_650r" \
  --data="${TEMP_DATA_PATH}"