ROCm · danpetreamd · Sep 17, 2025 · Sep 17, 2025 · Sep 17, 2025 · Sep 18, 2025
@@ -1,3 +1,3 @@
 # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
-ARG BASE_DOCKER=rocm/pytorch
+ARG BASE_DOCKER=docker.io/rocm/pytorch
 FROM $BASE_DOCKER
@@ -21,7 +21,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/jax-training:maxtext-v25.7-jax060
+ARG BASE_DOCKER=docker.io/rocm/jax-training:maxtext-v25.7-jax060
 FROM $BASE_DOCKER
 
 USER root

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/megatron-lm:v25.8_py310
+ARG BASE_DOCKER=docker.io/rocm/megatron-lm:v25.7_py310
 FROM $BASE_DOCKER
 
 USER root

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/megatron-lm:v25.8_py310
+ARG BASE_DOCKER=docker.io/rocm/megatron-lm:v25.7_py310
 FROM $BASE_DOCKER
 
 USER root

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/pytorch-training:v25.8
+ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.8
 FROM $BASE_DOCKER
 
 USER root

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
 FROM $BASE_DOCKER
 USER root
 ENV WORKSPACE_DIR=/workspace

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
 FROM $BASE_DOCKER
 
 USER root

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
 FROM $BASE_DOCKER
 
 USER root
@@ -63,7 +63,9 @@ RUN apt-get install -y netcat-traditional
 RUN apt-get install -y locales
 RUN locale-gen en_US.UTF-8
 
+# The model requires numpy < 2.0.0, so replace the preinstalled numpy with a pinned 1.x release.
+RUN pip uninstall -y numpy
+RUN pip install numpy==1.26.4
+
 # record configuration for posterity
 RUN pip3 list
-
-
@@ -1,5 +1,5 @@
 # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
 FROM $BASE_DOCKER
 
 ARG work_dir=/hunyuanvideo

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
 FROM $BASE_DOCKER
 
 USER root

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
 FROM $BASE_DOCKER
 
 USER root

@@ -1,6 +1,6 @@
 # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
 # PyTorch 2.7.0a0+git3a58512
-ARG BASE_DOCKER=rocm/pytorch-training:v25.5
+ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5
 FROM $BASE_DOCKER
 
 WORKDIR /workspace

@@ -1,5 +1,5 @@
 # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
-ARG BASE_DOCKER=rocm/pytorch-training:v25.5
+ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5
 FROM $BASE_DOCKER
 
 

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=lmsysorg/sglang:v0.4.5-rocm630
+ARG BASE_DOCKER=docker.io/lmsysorg/sglang:v0.4.5-rocm630
 
 FROM $BASE_DOCKER
 

@@ -1,5 +1,5 @@
 # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
-ARG BASE_DOCKER=rocm/pytorch-training:v25.5
+ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5
 FROM $BASE_DOCKER
 
 USER root

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
+ARG BASE_DOCKER=docker.io/rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
 FROM $BASE_DOCKER
 
 USER root

@@ -25,7 +25,7 @@
 #
 #################################################################################
 
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
 FROM $BASE_DOCKER
 USER root
 ENV WORKSPACE_DIR=/workspace
@@ -89,4 +89,3 @@ RUN cd $WORKSPACE_DIR \
 
 # Display installed packages for verification
 RUN pip list
-
@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=rocm/pytorch-training:v25.8
+ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.7
 FROM $BASE_DOCKER
 
 USER root

@@ -24,7 +24,7 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
+ARG BASE_DOCKER=docker.io/lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
 FROM $BASE_DOCKER
 
 ARG GPU_ARCH=gfx942

@@ -1 +1 @@
-git+https://github.com/ROCm/madengine.git@main
+git+https://github.com/danpetreamd/madengine.git@aac_tweaks
@@ -26,8 +26,8 @@
 #################################################################################
 """MAD: Model Automation and Dashboarding
 
-The script builds the Docker image, runs the Docker container, executes training or inference of the LLMs on the container, 
-and logs the performance metrics. 
+The script builds the Docker image, runs the Docker container, executes training or inference of the LLMs on the container,
+and logs the performance metrics.
 
 The script takes the following arguments:
     --tags: tags to run model.
@@ -109,6 +109,7 @@
 from utils import get_base_docker, get_base_docker_sha
 from utils import get_perf_metric, update_dict
 from utils import update_perf_csv
+from utils import get_amdsmi_path, get_nvidiasmi_path
 from utils import Console, Docker, Timeout, RunDetails
 from version import __version__
 from logger import get_logger
@@ -180,12 +181,12 @@ def run_model(
         console: Console
     ) -> bool:
     """Run the model application.
-    
+
     Args:
         model_info (dict): The model information
         args (argparse.Namespace): The input arguments
         console (Console): The console object
-    
+
     Returns:
         bool: The status of the run (return code: True for success, False for failure)
     """
@@ -195,7 +196,7 @@ def run_model(
     keep_alive = args.keep_alive
     keep_model_dir = args.keep_model_dir
     log_level = args.log_level
-    output = args.output    
+    output = args.output
 
     log_file = f"logs/{model_name}.live.log"
     # Ensure the log file exists and is empty: create it if it is missing, otherwise truncate it.
@@ -225,7 +226,7 @@ def run_model(
     run_details.machine_name = get_host_name()
     run_details.host_os = get_host_os()
     run_details.gpu_architecture = get_system_gpu_arch()
-    run_details.n_gpus = get_system_gpus()    
+    run_details.n_gpus = get_system_gpus()
     run_details.pipeline = os.environ.get('pipeline')
 
     # Parse the model dictionary
@@ -337,7 +338,7 @@ def run_model(
     docker_opts += get_env_docker_args(run_envs)
 
     docker_opts += get_gpu_docker_args()
-    # docker_opts += get_cpu_docker_args()        
+    # docker_opts += get_cpu_docker_args()
 
     mount_data_paths = []
     docker_opts += get_mount_docker_args(mount_data_paths)
@@ -352,8 +353,8 @@ def run_model(
         )
 
         docker = Docker(
-            image=model_docker_image, 
-            container_name=model_docker_container, 
+            image=model_docker_image,
+            container_name=model_docker_container,
             docker_opts=docker_opts,
             keep_alive=keep_alive,
             console=console
@@ -363,9 +364,11 @@ def run_model(
 
         # Echo GPU information
         if re.search("nvidia", dockerfile_gpu_suffix):
-            docker.sh('/usr/bin/nvidia-smi || true')
+            nvidiasmi_path = get_nvidiasmi_path()
+            docker.sh(f'{nvidiasmi_path} || true')
         elif re.search("amd", dockerfile_gpu_suffix):
-            docker.sh('/opt/rocm/bin/rocm-smi || true')
+            amdsmi_path = get_amdsmi_path()
+            docker.sh(f'{amdsmi_path} || true')
         else:
             logger.error("No GPU information available")
             raise ValueError("Unknown GPU type")
@@ -388,7 +391,7 @@ def run_model(
         # echo git commit
         run_details.git_commit = docker.sh(f"cd {model_dir} && git rev-parse HEAD")
         logger.info(f"MODEL GIT COMMIT is {run_details.git_commit}")
-        
+
         if model_url:
             docker.sh(f"cd {model_dir} && git submodule update --init --recursive")
 
@@ -433,7 +436,7 @@ def run_model(
             # Clean up the instance of docker
             del docker
             sys.exit(1)
-        
+
         test_duration = time.time() - test_start_time
         logger.info(f"Test duration: {test_duration} seconds")
 
@@ -473,9 +476,9 @@ def run_model(
                 run_details.performance = multiple_results
                 run_details.generate_json("common_info.json", multiple_results=True)
                 update_perf_csv(
-                    multiple_results=model["multiple_results"], 
-                    perf_csv=output, 
-                    model_name=run_details.model, 
+                    multiple_results=model["multiple_results"],
+                    perf_csv=output,
+                    model_name=run_details.model,
                     common_info="common_info.json"
                 )
             else:
@@ -496,24 +499,24 @@ def run_model(
         else:
             run_details.generate_json("perf_entry.json")
             update_perf_csv(exception_result="perf_entry.json", perf_csv=output)
-        
+
     except Exception as e:
         logger.error(f"Failed to write the run details to the output file: {e}")
 
     # Clean up the instance of docker
     del docker
 
-    return_code = True if run_details.status == 'SUCCESS' else False    
+    return_code = True if run_details.status == 'SUCCESS' else False
 
     return return_code
 
 
 def main() -> bool:
     """Main function to run the MAD application.
-    
+
     Returns:
         bool: The status of the run (return code: True for success, False for failure)
-        
+
     Raises:
         ValueError: If the GPU type is unknown
     """
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-git+https://github.com/ROCm/madengine.git@main
		+git+https://github.com/danpetreamd/madengine.git@aac_tweaks