# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Load necessary modules
# Sets up the toolchain for the job. The profile script is sourced first;
# it is expected to provide the `module` command used on the lines below.
source /etc/profile.d/modules.sh   # Module environment configuration
module load gcc/11.2.0             # GCC 11.2.0
module load openmpi/4.1.3          # OpenMPI 4.1.3
module load cuda/11.5/11.5.2       # CUDA 11.5.2
module load cudnn/8.3/8.3.3        # cuDNN 8.3.3
module load nccl/2.11/2.11.4-1     # NCCL 2.11.4-1
module load python/3.10/3.10.4     # Python 3.10.4

# Activate virtual environment
# Sources the activation script of the virtual environment under
# ~/venv/pytorch1.11+horovod so that later commands run inside it.
source ~/venv/pytorch1.11+horovod/bin/activate

# Configure log directory
# Builds a per-job records directory name from JOB_NAME and JOB_ID and
# creates it. The path is quoted so that a value containing whitespace or
# glob characters still yields exactly one directory.
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
mkdir -p "${LOG_PATH}"   # No error if the directory already exists

# Prepare dataset directory
# Copies the CIFAR-100 data into a per-job path under SGE_LOCALDIR.
# SGE_LOCALDIR must be set: with an empty value the destination would
# collapse to "//" and the copy would target the filesystem root.
TEMP_DATA_PATH="${SGE_LOCALDIR:?SGE_LOCALDIR is not set}/${JOB_ID}/"

# The destination is deliberately not pre-created, so cp decides the
# resulting layout exactly as before. Abort if the copy fails rather than
# start training without data.
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${TEMP_DATA_PATH}" || {
  echo "error: failed to copy dataset to ${TEMP_DATA_PATH}" >&2
  exit 1
}

# Change to project directory
# Moves into the EdgeFLite directory (relative to the current working
# directory). Stop here if that fails, so the training command below is
# never launched from the wrong location.
cd EdgeFLite || {
  echo "error: cannot change to directory EdgeFLite" >&2
  exit 1
}

# Execute training script
# Runs train_EdgeFLite.py with the configuration below. The arguments live
# in an array so that each flag can carry its own comment: a trailing
# "\ # comment" does not continue the line (the backslash escapes the
# following space, not the newline), which would end the command after the
# first flag and run every later flag as a separate command.
train_args=(
  --is_fed=1                          # Enable federated learning mode
  --fixed_cluster=0                   # Do not use a fixed cluster configuration
  --split_factor=4                    # Data split factor for federated learning
  --num_clusters=25                   # Number of clusters
  --num_selected=25                   # Select all 25 clusters for training
  --arch="resnet_model_110sl"         # Model architecture
  --dataset="cifar100"                # Dataset: CIFAR-100
  --num_classes=100                   # Output classes (100 for CIFAR-100)
  --is_single_branch=0                # Multi-branch mode for model training
  --is_amp=0                          # Automatic mixed precision (AMP) disabled
  --num_rounds=650                    # Total number of federated rounds
  --fed_epochs=1                      # Local epochs per round
  --spid="EdgeFLite_R110_100c_650r"   # Session/process ID for this job
  --data="${TEMP_DATA_PATH}"          # Dataset location (temporary directory)
)

python train_EdgeFLite.py "${train_args[@]}"