Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/dummy.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
ARG BASE_DOCKER=rocm/pytorch
ARG BASE_DOCKER=docker.io/rocm/pytorch
FROM $BASE_DOCKER
2 changes: 1 addition & 1 deletion docker/jax_maxtext.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/jax-training:maxtext-v25.7-jax060
ARG BASE_DOCKER=docker.io/rocm/jax-training:maxtext-v25.7-jax060
FROM $BASE_DOCKER

USER root
Expand Down
2 changes: 1 addition & 1 deletion docker/megatron_train.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/megatron-lm:v25.8_py310
ARG BASE_DOCKER=docker.io/rocm/megatron-lm:v25.7_py310
FROM $BASE_DOCKER

USER root
Expand Down
2 changes: 1 addition & 1 deletion docker/primus_megatron_train.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/megatron-lm:v25.8_py310
ARG BASE_DOCKER=docker.io/rocm/megatron-lm:v25.7_py310
FROM $BASE_DOCKER

USER root
Expand Down
2 changes: 1 addition & 1 deletion docker/primus_pytorch_train.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/pytorch-training:v25.8
ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.8
FROM $BASE_DOCKER

USER root
Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_chai1_inference.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/pytorch:latest
ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
FROM $BASE_DOCKER
USER root
ENV WORKSPACE_DIR=/workspace
Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_clip_inference.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/pytorch:latest
ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
FROM $BASE_DOCKER

USER root
Expand Down
8 changes: 5 additions & 3 deletions docker/pyt_huggingface.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/pytorch:latest
ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
FROM $BASE_DOCKER

USER root
Expand Down Expand Up @@ -63,7 +63,9 @@ RUN apt-get install -y netcat-traditional
RUN apt-get install -y locales
RUN locale-gen en_US.UTF-8

# the model complains about numpy version, it requires <2.0.0
RUN pip uninstall -y numpy
RUN pip install numpy==1.26.4

# record configuration for posterity
RUN pip3 list


2 changes: 1 addition & 1 deletion docker/pyt_hy_video.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
ARG BASE_DOCKER=rocm/pytorch:latest
ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
FROM $BASE_DOCKER

ARG work_dir=/hunyuanvideo
Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_janus_pro_inference.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/pytorch:latest
ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
FROM $BASE_DOCKER

USER root
Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_mochi_inference.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/pytorch:latest
ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
FROM $BASE_DOCKER

USER root
Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_mpt30b_training.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
# PyTorch 2.7.0a0+git3a58512
ARG BASE_DOCKER=rocm/pytorch-training:v25.5
ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5
FROM $BASE_DOCKER

WORKDIR /workspace
Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_ncf_training.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
ARG BASE_DOCKER=rocm/pytorch-training:v25.5
ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5
FROM $BASE_DOCKER


Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_sglang.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=lmsysorg/sglang:v0.4.5-rocm630
ARG BASE_DOCKER=docker.io/lmsysorg/sglang:v0.4.5-rocm630

FROM $BASE_DOCKER

Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_training_huggingface.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
ARG BASE_DOCKER=rocm/pytorch-training:v25.5
ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5
FROM $BASE_DOCKER

USER root
Expand Down
2 changes: 1 addition & 1 deletion docker/pyt_vllm.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
ARG BASE_DOCKER=docker.io/rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
FROM $BASE_DOCKER

USER root
Expand Down
3 changes: 1 addition & 2 deletions docker/pyt_wan2.1_inference.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#
#################################################################################

ARG BASE_DOCKER=rocm/pytorch:latest
ARG BASE_DOCKER=docker.io/rocm/pytorch:latest
FROM $BASE_DOCKER
USER root
ENV WORKSPACE_DIR=/workspace
Expand Down Expand Up @@ -89,4 +89,3 @@ RUN cd $WORKSPACE_DIR \

# Display installed packages for verification
RUN pip list

2 changes: 1 addition & 1 deletion docker/pytorch_train.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=rocm/pytorch-training:v25.8
ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.7
FROM $BASE_DOCKER

USER root
Expand Down
2 changes: 1 addition & 1 deletion docker/sglang_disagg_inference.ubuntu.amd.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# SOFTWARE.
#
#################################################################################
ARG BASE_DOCKER=lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
ARG BASE_DOCKER=docker.io/lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
FROM $BASE_DOCKER

ARG GPU_ARCH=gfx942
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
git+https://github.com/ROCm/madengine.git@main
git+https://github.com/danpetreamd/madengine.git@aac_tweaks
43 changes: 23 additions & 20 deletions tools/run_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
#################################################################################
"""MAD: Model Automation and Dashboarding

The script builds the Docker image, runs the Docker container, executes training or inference of the LLMs on the container,
and logs the performance metrics.
The script builds the Docker image, runs the Docker container, executes training or inference of the LLMs on the container,
and logs the performance metrics.

The script takes the following arguments:
--tags: tags to run model.
Expand Down Expand Up @@ -109,6 +109,7 @@
from utils import get_base_docker, get_base_docker_sha
from utils import get_perf_metric, update_dict
from utils import update_perf_csv
from utils import get_amdsmi_path, get_nvidiasmi_path
from utils import Console, Docker, Timeout, RunDetails
from version import __version__
from logger import get_logger
Expand Down Expand Up @@ -180,12 +181,12 @@ def run_model(
console: Console
) -> bool:
"""Run the model application.

Args:
model_info (dict): The model information
args (argparse.Namespace): The input arguments
console (Console): The console object

Returns:
bool: The status of the run (return code: True for success, False for failure)
"""
Expand All @@ -195,7 +196,7 @@ def run_model(
keep_alive = args.keep_alive
keep_model_dir = args.keep_model_dir
log_level = args.log_level
output = args.output
output = args.output

log_file = f"logs/{model_name}.live.log"
# Check whether the log file exists in the directory: create it if it does not, otherwise empty it.
Expand Down Expand Up @@ -225,7 +226,7 @@ def run_model(
run_details.machine_name = get_host_name()
run_details.host_os = get_host_os()
run_details.gpu_architecture = get_system_gpu_arch()
run_details.n_gpus = get_system_gpus()
run_details.n_gpus = get_system_gpus()
run_details.pipeline = os.environ.get('pipeline')

# Parse the model dictionary
Expand Down Expand Up @@ -337,7 +338,7 @@ def run_model(
docker_opts += get_env_docker_args(run_envs)

docker_opts += get_gpu_docker_args()
# docker_opts += get_cpu_docker_args()
# docker_opts += get_cpu_docker_args()

mount_data_paths = []
docker_opts += get_mount_docker_args(mount_data_paths)
Expand All @@ -352,8 +353,8 @@ def run_model(
)

docker = Docker(
image=model_docker_image,
container_name=model_docker_container,
image=model_docker_image,
container_name=model_docker_container,
docker_opts=docker_opts,
keep_alive=keep_alive,
console=console
Expand All @@ -363,9 +364,11 @@ def run_model(

# Echo GPU information
if re.search("nvidia", dockerfile_gpu_suffix):
docker.sh('/usr/bin/nvidia-smi || true')
nvidiasmi_path = get_nvidiasmi_path()
docker.sh(f'{nvidiasmi_path} || true')
elif re.search("amd", dockerfile_gpu_suffix):
docker.sh('/opt/rocm/bin/rocm-smi || true')
amdsmi_path = get_amdsmi_path()
docker.sh(f'{amdsmi_path} || true')
else:
logger.error("No GPU information available")
raise ValueError("Unknown GPU type")
Expand All @@ -388,7 +391,7 @@ def run_model(
# echo git commit
run_details.git_commit = docker.sh(f"cd {model_dir} && git rev-parse HEAD")
logger.info(f"MODEL GIT COMMIT is {run_details.git_commit}")

if model_url:
docker.sh(f"cd {model_dir} && git submodule update --init --recursive")

Expand Down Expand Up @@ -433,7 +436,7 @@ def run_model(
# Clean up the instance of docker
del docker
sys.exit(1)

test_duration = time.time() - test_start_time
logger.info(f"Test duration: {test_duration} seconds")

Expand Down Expand Up @@ -473,9 +476,9 @@ def run_model(
run_details.performance = multiple_results
run_details.generate_json("common_info.json", multiple_results=True)
update_perf_csv(
multiple_results=model["multiple_results"],
perf_csv=output,
model_name=run_details.model,
multiple_results=model["multiple_results"],
perf_csv=output,
model_name=run_details.model,
common_info="common_info.json"
)
else:
Expand All @@ -496,24 +499,24 @@ def run_model(
else:
run_details.generate_json("perf_entry.json")
update_perf_csv(exception_result="perf_entry.json", perf_csv=output)

except Exception as e:
logger.error(f"Failed to write the run details to the output file: {e}")

# Clean up the instance of docker
del docker

return_code = True if run_details.status == 'SUCCESS' else False
return_code = True if run_details.status == 'SUCCESS' else False

return return_code


def main() -> bool:
"""Main function to run the MAD application.

Returns:
bool: The status of the run (return code: True for success, False for failure)

Raises:
ValueError: If the GPU type is unknown
"""
Expand Down
Loading