Skip to content

Commit

Permalink
SmartSim-Scaling ReadMe additions, plotting edits, reading data edits, and app edits (#39)

Browse files Browse the repository at this point in the history

This merge organizes the current README layout, provides new performance results, removes the setting of the model and script in the application, refactors the current plotting code, and adds two additional requirements.

[ committed by @amandarichardsonn ]
[ reviewed by @billschereriii @ashao ]
  • Loading branch information
al-rigazzi committed Nov 19, 2023
1 parent 9e610b1 commit c8ff9ec
Show file tree
Hide file tree
Showing 78 changed files with 3,391 additions and 3,193 deletions.
356 changes: 356 additions & 0 deletions PERFORMANCE.md

Large diffs are not rendered by default.

802 changes: 191 additions & 611 deletions README.md

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion batch_scripts/run_aggregation_python_fs_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#SBATCH -t 24:00:00

cd ..
module load slurm
python driver.py aggregation_scaling_python_fs --exp_name='aggregation-scaling-py-fs-batch' \
--client_nodes=[60] \
--clients_per_node=[48] \
Expand Down
3 changes: 1 addition & 2 deletions batch_scripts/run_aggregation_python_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
#SBATCH -N 93
#SBATCH --exclusive
#SBATCH -t 24:00:00

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
module load slurm
python driver.py aggregation_scaling_python --exp_name='aggregation-scaling-py-batch' \
--client_nodes=[60] \
--clients_per_node=[48] \
Expand Down
13 changes: 8 additions & 5 deletions batch_scripts/run_aggregation_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
#SBATCH -t 12:00:00
#SBATCH -C SK48
#SBATCH --oversubscribe

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
module load slurm
python driver.py aggregation_scaling --client_nodes=[60] \
python driver.py aggregation_scaling --exp_name='aggregation-scaling-batch' \
--client_nodes=[60] \
--clients_per_node=[48] \
--db_nodes=[16,32] \
--db_nodes=[16] \
--db_cpus=32 --net_ifname=ipogif0 \
--run_db_as_batch=False \
--tensors_per_dataset=[1,4]
--tensors_per_dataset=[4] \
--tensor_bytes=[1024000] \
--iterations=20 \
--tensors_per_dataset=[4]

16 changes: 5 additions & 11 deletions batch_scripts/run_inference_colo_slurm.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
#!/bin/bash

#SBATCH -N 1
#SBATCH -N 16
#SBATCH -C "P100*16"
#SBATCH --exclusive
#SBATCH -p allgriz
#SBATCH -t 1:00:00

module load cudatoolkit/11.7 cudnn PrgEnv-intel
source ~/pyenvs/smartsim-dev/bin/activate

#SBATCH -t 10:00:00
echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
python driver.py inference_colocated --clients_per_node=[12,24,36,60,96] \
--nodes=[1] --db_tpq=[2] \
--db_cpus=[12] --pin_app_cpus=[True] \
--net_type="uds" --node_feature='{}' --languages=['fortran','cpp']
python driver.py inference_colocated --nodes=[4, 8, 12, 16]
12 changes: 6 additions & 6 deletions batch_scripts/run_inference_standard_slurm.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/bin/bash

#SBATCH -N 60
#SBATCH -N 116
#SBATCH -C "[P100*16&SK48*100]"
#SBATCH --exclusive
#SBATCH -t 10:00:00

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
module load slurm
python driver.py inference_standard --client_nodes=[20,40,60] \
--db_nodes=[4,8,16] --db_tpq=[1,2,4] \
--db_cpus=[8,16]
python driver.py inference_standard --client_nodes=[25, 50, 75, 100] \
--db_nodes=[4, 8, 16] --db_tpq=[1] \
--db_cpus=[8]
2 changes: 1 addition & 1 deletion batch_scripts/run_throughput_pbs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#PBS -o throughput.out
#PBS -N smartsim-throughput
#PBS -V

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
PYTHON=/lus/snx11242/spartee/miniconda/envs/0.4.0/bin/python
cd $PBS_O_WORKDIR/../
$PYTHON driver.py throughput_standard --client_nodes=[128,256,512] \
Expand Down
5 changes: 2 additions & 3 deletions batch_scripts/run_throughput_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
#SBATCH -t 10:00:00
#SBATCH -C SK48
#SBATCH --oversubscribe

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
module load slurm
python driver.py throughput_standard --client_nodes=[60] \
--clients_per_node=[48] \
--db_nodes=[32] \
--db_cpus=32 --net_ifname=ipogif0 \
--db_cpus=[32] --net_ifname=ipogif0 \
--run_db_as_batch=False

11 changes: 10 additions & 1 deletion cpp-data-aggregation/aggregation_consumer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ void run_aggregation_consumer(std::ofstream& timing_file,
// Allocate arrays to hold timings
std::vector<double> get_list_times;

// Allocate arrays to hold timings
std::vector<double> poll_list_times;

// Retrieve the number of iterations to run
int iterations = get_iterations();
log_data(context, LLDebug, "Running with iterations: " + std::to_string(iterations));
Expand All @@ -59,6 +62,7 @@ void run_aggregation_consumer(std::ofstream& timing_file,
log_data(context, LLInfo, "Consuming list " + std::to_string(i));
}

double poll_list_start = MPI_Wtime();
// Have rank 0 check that the aggregation list is full
if(rank == 0) {
bool list_is_ready = client.poll_list_length(list_name,
Expand All @@ -73,7 +77,10 @@ void run_aggregation_consumer(std::ofstream& timing_file,
throw std::runtime_error(list_size_error);
}
}

double poll_list_end = MPI_Wtime();
log_data(context, LLDebug, "poll_list completed");
delta_t = poll_list_end - poll_list_start;
poll_list_times.push_back(delta_t);
// Have all ranks wait until the aggregation list is full
MPI_Barrier(MPI_COMM_WORLD);

Expand Down Expand Up @@ -104,6 +111,8 @@ void run_aggregation_consumer(std::ofstream& timing_file,
for (int i = 0; i < iterations; i++) {
timing_file << rank << "," << "get_list" << ","
<< get_list_times[i] << "\n";
timing_file << rank << "," << "poll_list" << ","
<< poll_list_times[i] << "\n";
}

// Write loop time to file
Expand Down
110 changes: 12 additions & 98 deletions cpp-inference/inference_scaling_imagenet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,19 @@ void run_mnist(const std::string& model_name,
int num_devices = get_num_devices();
bool use_multigpu = (0 == device.compare("GPU")) && num_devices > 1;
bool should_set = get_set_flag();

std::string model_key = "resnet_model";
bool poll_model_code = client.poll_model(model_key, 100, 100);
if (!poll_model_code) {
log_error(context, LLInfo, "SR Error finding model");
}

std::string script_key = "resnet_script";
bool poll_script_code = client.poll_key(script_key, 100, 100);
if (!poll_script_code) {
log_error(context, LLInfo, "SR Error finding script");
}

// setting up string to debug set vars
std::string program_vars = "Running rank with vars should_set: ";
program_vars += std::to_string(should_set) + " - num_device: ";
Expand All @@ -108,102 +119,6 @@ void run_mnist(const std::string& model_name,
program_vars += std::to_string(is_colocated) + " - cluster: " + std::to_string(cluster);
log_data(context, LLDebug, program_vars);

if (should_set) {
log_data(context, LLDebug, "Entered should_set code block");
int batch_size = get_batch_size();
int n_clients = get_client_count();
std::string should_set_vars = "Running rank with batch_size: ";
should_set_vars += std::to_string(batch_size) + " and n_clients: ";
should_set_vars += std::to_string(n_clients);
log_data(context, LLDebug, should_set_vars);
if (!is_colocated && rank == 0) {
log_data(context, LLDebug, "Setting script/model for Standard test");

std::cout<<"Setting Resnet Model from scaling app" << std::endl;
log_data(context, LLInfo, "Setting Resnet Model from scaling app");

std::cout<<"Setting with batch_size: " << std::to_string(batch_size) << std::endl;
log_data(context, LLInfo, "Setting with batch_size: " + std::to_string(batch_size));

std::cout<<"Setting on device: " << device << std::endl;
log_data(context, LLInfo, "Setting on device: " + device);

std::cout<<"Setting on " << std::to_string(num_devices) << " devices" <<std::endl << std::flush;
log_data(context, LLInfo, "Setting on " + std::to_string(num_devices) + " devices");

std::string model_filename = "./resnet50." + device + ".pt";

if (use_multigpu) {
client.set_model_from_file_multigpu(model_key, model_filename, "TORCH", 0, num_devices, batch_size);
std::string std_model_use_multigpu_vars = "Use_multigpu - model_key:" + model_key;
std_model_use_multigpu_vars += " model_filename:";
std_model_use_multigpu_vars += model_filename + " num_devices:" + std::to_string(num_devices);
std_model_use_multigpu_vars += " batch_size:";
std_model_use_multigpu_vars += std::to_string(batch_size);
log_data(context, LLDebug, std_model_use_multigpu_vars);
client.set_script_from_file_multigpu(script_key, "./data_processing_script.txt", 0, num_devices);
std::string std_script_use_multigpu_vars = "Use_multigpu - script_key:";
std_script_use_multigpu_vars += script_key + " script_filename:";
std_script_use_multigpu_vars += "./data_processing_script.txt num_devices:";
std_script_use_multigpu_vars += std::to_string(num_devices);
}
else {
client.set_model_from_file(model_key, model_filename, "TORCH", device, batch_size);
std::string std_model_vars = "Not multigpu - model_key:" + model_key;
std_model_vars += " model_filename:" + model_filename + " device:";
std_model_vars += device + " batch_size:" + std::to_string(batch_size);
log_data(context, LLDebug, std_model_vars);
client.set_script_from_file(script_key, device, "./data_processing_script.txt");
std::string std_script_vars = "Not multigpu - script_key:" + script_key;
std_script_vars += " script_filename:./data_processing_script.txt device:" ;
std_script_vars += device;
log_data(context, LLDebug, std_script_vars);
}
}
if(is_colocated && rank % n_clients == 0) {
log_data(context, LLDebug, "Setting script/model for Colocated test");

std::cout<<"Setting Resnet Model from scaling app" << std::endl;
log_data(context, LLInfo, "Setting Resnet Model from scaling app");

std::cout<<"Setting with batch_size: " << std::to_string(batch_size) << std::endl;
log_data(context, LLInfo, "Setting with batch_size: " + std::to_string(batch_size));

std::cout<<"Setting on device: " << device << std::endl;
log_data(context, LLInfo, "Setting on device: " + device);

std::cout<<"Setting on " << std::to_string(num_devices) << " devices" <<std::endl << std::flush;
log_data(context, LLInfo, "Setting on " + std::to_string(num_devices) + " devices");

std::string model_filename = "./resnet50." + device + ".pt";

if (use_multigpu) {
client.set_model_from_file_multigpu(model_key, model_filename, "TORCH", 0, num_devices, batch_size);
std::string colo_model_use_multigpu_vars = "Use_multigpu=True - model_key:";
colo_model_use_multigpu_vars + model_key + " model_filename:" + model_filename;
colo_model_use_multigpu_vars += " num_devices:" + std::to_string(num_devices);
colo_model_use_multigpu_vars += " batch_size:" + std::to_string(batch_size);
log_data(context, LLDebug, colo_model_use_multigpu_vars);
client.set_script_from_file_multigpu(script_key, "./data_processing_script.txt", 0, num_devices);
std::string colo_script_use_multigpu_vars = "Use_multigpu=True - script_key:";
colo_model_use_multigpu_vars += script_key + " script_filename:";
colo_script_use_multigpu_vars += "./data_processing_script.txt num_devices:";
colo_model_use_multigpu_vars += std::to_string(num_devices);
log_data(context, LLDebug, colo_script_use_multigpu_vars);
}
else {
client.set_model_from_file(model_key, model_filename, "TORCH", device, batch_size);
std::string colo_model_vars = "Use_multigpu=False - model_key:" + model_key + " model_filename:";
colo_model_vars += model_filename + " device:" + device + " batch_size:";
colo_model_vars += std::to_string(batch_size);
log_data(context, LLDebug, colo_model_vars);
client.set_script_from_file(script_key, device, "./data_processing_script.txt");
std::string colo_script_vars = "Use_multigpu=False - script_key: " + script_key;
colo_script_vars += " script_filename:./data_processing_script.txt device:" + device;
log_data(context, LLDebug, colo_script_vars);
}
}
}
int iterations = get_iterations();
log_data(context, LLDebug, "Running with iterations: " + std::to_string(iterations));
MPI_Barrier(MPI_COMM_WORLD);
Expand Down Expand Up @@ -241,7 +156,6 @@ void run_mnist(const std::string& model_name,
client.put_tensor(in_key, array, {224, 224, 3},
SRTensorTypeFloat,
SRMemLayoutNested);
double put_tensor_end = MPI_Wtime();
log_data(context, LLDebug, "put_tensor completed");
if (use_multigpu) {
client.run_script_multigpu(script_key, "pre_process_3ch", {in_key}, {script_out_key}, rank, 0, num_devices);
Expand Down Expand Up @@ -277,10 +191,10 @@ void run_mnist(const std::string& model_name,

// Begin the actual iteration loop
log_data(context, LLDebug, "Iteration loop starting...");
MPI_Barrier(MPI_COMM_WORLD);
double loop_start = MPI_Wtime();
for (int i = 0; i < iterations + 1; i++) {
log_data(context, LLDebug, "Running iteration: " + std::to_string(i));
MPI_Barrier(MPI_COMM_WORLD);
std::string in_key = "resnet_input_rank_" + std::to_string(rank) + "_" + std::to_string(i);
std::string script_out_key = "resnet_processed_input_rank_" + std::to_string(rank) + "_" + std::to_string(i);
std::string out_key = "resnet_output_rank_" + std::to_string(rank) + "_" + std::to_string(i);
Expand Down
3 changes: 2 additions & 1 deletion driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from driverthroughput.main import Throughput
from driverdataaggregation.main import DataAggregation
from driverprocessresults.main import ProcessResults
from driverprocessresults.scaling_plotter import PlotResults

class SmartSimScalingTests(Inference, Throughput, DataAggregation, ProcessResults):
class SmartSimScalingTests(Inference, Throughput, DataAggregation, ProcessResults, PlotResults):
...

if __name__ == "__main__":
Expand Down
Loading

0 comments on commit c8ff9ec

Please sign in to comment.