Skip to content

Commit

Permalink
SmartSim-Scaling ReadMe additions, plotting edits, reading data edits, and app edits (#39)

Browse files Browse the repository at this point in the history

This merge organizes the current README layout, provides new performance results, removes the setting of the model and script in the application, refactors the current plotting code, and adds two additional requirements.

[ committed by @amandarichardsonn ]
[ reviewed by @billschereriii @ashao ]
  • Loading branch information
al-rigazzi committed Nov 19, 2023
1 parent 9e610b1 commit c8ff9ec
Show file tree
Hide file tree
Showing 78 changed files with 3,391 additions and 3,193 deletions.
356 changes: 356 additions & 0 deletions PERFORMANCE.md

Large diffs are not rendered by default.

802 changes: 191 additions & 611 deletions README.md

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion batch_scripts/run_aggregation_python_fs_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#SBATCH -t 24:00:00

cd ..
module load slurm
python driver.py aggregation_scaling_python_fs --exp_name='aggregation-scaling-py-fs-batch' \
--client_nodes=[60] \
--clients_per_node=[48] \
Expand Down
3 changes: 1 addition & 2 deletions batch_scripts/run_aggregation_python_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
#SBATCH -N 93
#SBATCH --exclusive
#SBATCH -t 24:00:00

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
module load slurm
python driver.py aggregation_scaling_python --exp_name='aggregation-scaling-py-batch' \
--client_nodes=[60] \
--clients_per_node=[48] \
Expand Down
13 changes: 8 additions & 5 deletions batch_scripts/run_aggregation_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
#SBATCH -t 12:00:00
#SBATCH -C SK48
#SBATCH --oversubscribe

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
module load slurm
python driver.py aggregation_scaling --client_nodes=[60] \
python driver.py aggregation_scaling --exp_name='aggregation-scaling-batch' \
--client_nodes=[60] \
--clients_per_node=[48] \
--db_nodes=[16,32] \
--db_nodes=[16] \
--db_cpus=32 --net_ifname=ipogif0 \
--run_db_as_batch=False \
--tensors_per_dataset=[1,4]
--tensors_per_dataset=[4] \
--tensor_bytes=[1024000] \
--iterations=20 \
--tensors_per_dataset=[4]

16 changes: 5 additions & 11 deletions batch_scripts/run_inference_colo_slurm.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
#!/bin/bash

#SBATCH -N 1
#SBATCH -N 16
#SBATCH -C "P100*16"
#SBATCH --exclusive
#SBATCH -p allgriz
#SBATCH -t 1:00:00

module load cudatoolkit/11.7 cudnn PrgEnv-intel
source ~/pyenvs/smartsim-dev/bin/activate

#SBATCH -t 10:00:00
echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
python driver.py inference_colocated --clients_per_node=[12,24,36,60,96] \
--nodes=[1] --db_tpq=[2] \
--db_cpus=[12] --pin_app_cpus=[True] \
--net_type="uds" --node_feature='{}' --languages=['fortran','cpp']
python driver.py inference_colocated --nodes=[4, 8, 12, 16]
12 changes: 6 additions & 6 deletions batch_scripts/run_inference_standard_slurm.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/bin/bash

#SBATCH -N 60
#SBATCH -N 116
#SBATCH -C "[P100*16&SK48*100]"
#SBATCH --exclusive
#SBATCH -t 10:00:00

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
module load slurm
python driver.py inference_standard --client_nodes=[20,40,60] \
--db_nodes=[4,8,16] --db_tpq=[1,2,4] \
--db_cpus=[8,16]
python driver.py inference_standard --client_nodes=[25, 50, 75, 100] \
--db_nodes=[4, 8, 16] --db_tpq=[1] \
--db_cpus=[8]
2 changes: 1 addition & 1 deletion batch_scripts/run_throughput_pbs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#PBS -o throughput.out
#PBS -N smartsim-throughput
#PBS -V

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
PYTHON=/lus/snx11242/spartee/miniconda/envs/0.4.0/bin/python
cd $PBS_O_WORKDIR/../
$PYTHON driver.py throughput_standard --client_nodes=[128,256,512] \
Expand Down
5 changes: 2 additions & 3 deletions batch_scripts/run_throughput_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
#SBATCH -t 10:00:00
#SBATCH -C SK48
#SBATCH --oversubscribe

echo "Note: The flag net_ifname should be replaced with the appropriate value on the target system"
cd ..
module load slurm
python driver.py throughput_standard --client_nodes=[60] \
--clients_per_node=[48] \
--db_nodes=[32] \
--db_cpus=32 --net_ifname=ipogif0 \
--db_cpus=[32] --net_ifname=ipogif0 \
--run_db_as_batch=False

11 changes: 10 additions & 1 deletion cpp-data-aggregation/aggregation_consumer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ void run_aggregation_consumer(std::ofstream& timing_file,
// Allocate arrays to hold timings
std::vector<double> get_list_times;

// Allocate arrays to hold timings
std::vector<double> poll_list_times;

// Retrieve the number of iterations to run
int iterations = get_iterations();
log_data(context, LLDebug, "Running with iterations: " + std::to_string(iterations));
Expand All @@ -59,6 +62,7 @@ void run_aggregation_consumer(std::ofstream& timing_file,
log_data(context, LLInfo, "Consuming list " + std::to_string(i));
}

double poll_list_start = MPI_Wtime();
// Have rank 0 check that the aggregation list is full
if(rank == 0) {
bool list_is_ready = client.poll_list_length(list_name,
Expand All @@ -73,7 +77,10 @@ void run_aggregation_consumer(std::ofstream& timing_file,
throw std::runtime_error(list_size_error);
}
}

double poll_list_end = MPI_Wtime();
log_data(context, LLDebug, "poll_list completed");
delta_t = poll_list_end - poll_list_start;
poll_list_times.push_back(delta_t);
// Have all ranks wait until the aggregation list is full
MPI_Barrier(MPI_COMM_WORLD);

Expand Down Expand Up @@ -104,6 +111,8 @@ void run_aggregation_consumer(std::ofstream& timing_file,
for (int i = 0; i < iterations; i++) {
timing_file << rank << "," << "get_list" << ","
<< get_list_times[i] << "\n";
timing_file << rank << "," << "poll_list" << ","
<< poll_list_times[i] << "\n";
}

// Write loop time to file
Expand Down
110 changes: 12 additions & 98 deletions cpp-inference/inference_scaling_imagenet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,19 @@ void run_mnist(const std::string& model_name,
int num_devices = get_num_devices();
bool use_multigpu = (0 == device.compare("GPU")) && num_devices > 1;
bool should_set = get_set_flag();

std::string model_key = "resnet_model";
bool poll_model_code = client.poll_model(model_key, 100, 100);
if (!poll_model_code) {
log_error(context, LLInfo, "SR Error finding model");
}

std::string script_key = "resnet_script";
bool poll_script_code = client.poll_key(script_key, 100, 100);
if (!poll_script_code) {
log_error(context, LLInfo, "SR Error finding script");
}

// setting up string to debug set vars
std::string program_vars = "Running rank with vars should_set: ";
program_vars += std::to_string(should_set) + " - num_device: ";
Expand All @@ -108,102 +119,6 @@ void run_mnist(const std::string& model_name,
program_vars += std::to_string(is_colocated) + " - cluster: " + std::to_string(cluster);
log_data(context, LLDebug, program_vars);

if (should_set) {
log_data(context, LLDebug, "Entered should_set code block");
int batch_size = get_batch_size();
int n_clients = get_client_count();
std::string should_set_vars = "Running rank with batch_size: ";
should_set_vars += std::to_string(batch_size) + " and n_clients: ";
should_set_vars += std::to_string(n_clients);
log_data(context, LLDebug, should_set_vars);
if (!is_colocated && rank == 0) {
log_data(context, LLDebug, "Setting script/model for Standard test");

std::cout<<"Setting Resnet Model from scaling app" << std::endl;
log_data(context, LLInfo, "Setting Resnet Model from scaling app");

std::cout<<"Setting with batch_size: " << std::to_string(batch_size) << std::endl;
log_data(context, LLInfo, "Setting with batch_size: " + std::to_string(batch_size));

std::cout<<"Setting on device: " << device << std::endl;
log_data(context, LLInfo, "Setting on device: " + device);

std::cout<<"Setting on " << std::to_string(num_devices) << " devices" <<std::endl << std::flush;
log_data(context, LLInfo, "Setting on " + std::to_string(num_devices) + " devices");

std::string model_filename = "./resnet50." + device + ".pt";

if (use_multigpu) {
client.set_model_from_file_multigpu(model_key, model_filename, "TORCH", 0, num_devices, batch_size);
std::string std_model_use_multigpu_vars = "Use_multigpu - model_key:" + model_key;
std_model_use_multigpu_vars += " model_filename:";
std_model_use_multigpu_vars += model_filename + " num_devices:" + std::to_string(num_devices);
std_model_use_multigpu_vars += " batch_size:";
std_model_use_multigpu_vars += std::to_string(batch_size);
log_data(context, LLDebug, std_model_use_multigpu_vars);
client.set_script_from_file_multigpu(script_key, "./data_processing_script.txt", 0, num_devices);
std::string std_script_use_multigpu_vars = "Use_multigpu - script_key:";
std_script_use_multigpu_vars += script_key + " script_filename:";
std_script_use_multigpu_vars += "./data_processing_script.txt num_devices:";
std_script_use_multigpu_vars += std::to_string(num_devices);
}
else {
client.set_model_from_file(model_key, model_filename, "TORCH", device, batch_size);
std::string std_model_vars = "Not multigpu - model_key:" + model_key;
std_model_vars += " model_filename:" + model_filename + " device:";
std_model_vars += device + " batch_size:" + std::to_string(batch_size);
log_data(context, LLDebug, std_model_vars);
client.set_script_from_file(script_key, device, "./data_processing_script.txt");
std::string std_script_vars = "Not multigpu - script_key:" + script_key;
std_script_vars += " script_filename:./data_processing_script.txt device:" ;
std_script_vars += device;
log_data(context, LLDebug, std_script_vars);
}
}
if(is_colocated && rank % n_clients == 0) {
log_data(context, LLDebug, "Setting script/model for Colocated test");

std::cout<<"Setting Resnet Model from scaling app" << std::endl;
log_data(context, LLInfo, "Setting Resnet Model from scaling app");

std::cout<<"Setting with batch_size: " << std::to_string(batch_size) << std::endl;
log_data(context, LLInfo, "Setting with batch_size: " + std::to_string(batch_size));

std::cout<<"Setting on device: " << device << std::endl;
log_data(context, LLInfo, "Setting on device: " + device);

std::cout<<"Setting on " << std::to_string(num_devices) << " devices" <<std::endl << std::flush;
log_data(context, LLInfo, "Setting on " + std::to_string(num_devices) + " devices");

std::string model_filename = "./resnet50." + device + ".pt";

if (use_multigpu) {
client.set_model_from_file_multigpu(model_key, model_filename, "TORCH", 0, num_devices, batch_size);
std::string colo_model_use_multigpu_vars = "Use_multigpu=True - model_key:";
colo_model_use_multigpu_vars + model_key + " model_filename:" + model_filename;
colo_model_use_multigpu_vars += " num_devices:" + std::to_string(num_devices);
colo_model_use_multigpu_vars += " batch_size:" + std::to_string(batch_size);
log_data(context, LLDebug, colo_model_use_multigpu_vars);
client.set_script_from_file_multigpu(script_key, "./data_processing_script.txt", 0, num_devices);
std::string colo_script_use_multigpu_vars = "Use_multigpu=True - script_key:";
colo_model_use_multigpu_vars += script_key + " script_filename:";
colo_script_use_multigpu_vars += "./data_processing_script.txt num_devices:";
colo_model_use_multigpu_vars += std::to_string(num_devices);
log_data(context, LLDebug, colo_script_use_multigpu_vars);
}
else {
client.set_model_from_file(model_key, model_filename, "TORCH", device, batch_size);
std::string colo_model_vars = "Use_multigpu=False - model_key:" + model_key + " model_filename:";
colo_model_vars += model_filename + " device:" + device + " batch_size:";
colo_model_vars += std::to_string(batch_size);
log_data(context, LLDebug, colo_model_vars);
client.set_script_from_file(script_key, device, "./data_processing_script.txt");
std::string colo_script_vars = "Use_multigpu=False - script_key: " + script_key;
colo_script_vars += " script_filename:./data_processing_script.txt device:" + device;
log_data(context, LLDebug, colo_script_vars);
}
}
}
int iterations = get_iterations();
log_data(context, LLDebug, "Running with iterations: " + std::to_string(iterations));
MPI_Barrier(MPI_COMM_WORLD);
Expand Down Expand Up @@ -241,7 +156,6 @@ void run_mnist(const std::string& model_name,
client.put_tensor(in_key, array, {224, 224, 3},
SRTensorTypeFloat,
SRMemLayoutNested);
double put_tensor_end = MPI_Wtime();
log_data(context, LLDebug, "put_tensor completed");
if (use_multigpu) {
client.run_script_multigpu(script_key, "pre_process_3ch", {in_key}, {script_out_key}, rank, 0, num_devices);
Expand Down Expand Up @@ -277,10 +191,10 @@ void run_mnist(const std::string& model_name,

// Begin the actual iteration loop
log_data(context, LLDebug, "Iteration loop starting...");
MPI_Barrier(MPI_COMM_WORLD);
double loop_start = MPI_Wtime();
for (int i = 0; i < iterations + 1; i++) {
log_data(context, LLDebug, "Running iteration: " + std::to_string(i));
MPI_Barrier(MPI_COMM_WORLD);
std::string in_key = "resnet_input_rank_" + std::to_string(rank) + "_" + std::to_string(i);
std::string script_out_key = "resnet_processed_input_rank_" + std::to_string(rank) + "_" + std::to_string(i);
std::string out_key = "resnet_output_rank_" + std::to_string(rank) + "_" + std::to_string(i);
Expand Down
3 changes: 2 additions & 1 deletion driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from driverthroughput.main import Throughput
from driverdataaggregation.main import DataAggregation
from driverprocessresults.main import ProcessResults
from driverprocessresults.scaling_plotter import PlotResults

class SmartSimScalingTests(Inference, Throughput, DataAggregation, ProcessResults):
class SmartSimScalingTests(Inference, Throughput, DataAggregation, ProcessResults, PlotResults):
...

if __name__ == "__main__":
Expand Down
Loading

0 comments on commit c8ff9ec

Please sign in to comment.