Skip to content

add new notebook read data from HF #91

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,9 @@
"source": [
"import os\n",
"\n",
"os.environ[\"mlflow_uri\"] = \"<MLFlow Tracking server ARN>\"\n",
"os.environ[\"mlflow_experiment_name\"] = \"deepseek-r1-distill-llama-8b-sft\""
"os.environ[\"mlflow_uri\"] = \"\"\n",
"os.environ[\"mlflow_experiment_name\"] = \"deepseek-r1-distill-llama-8b-sft\"\n",
"os.environ[\"hf_token\"] = \"\""
]
},
{
Expand Down Expand Up @@ -138,8 +139,8 @@
"metadata": {},
"outputs": [],
"source": [
"db_name = \"<enter your db name>\"\n",
"table = \"sqad\""
"#db_name = \"<enter your db name>\"\n",
"#table = \"sqad\""
]
},
{
Expand All @@ -151,8 +152,8 @@
},
"outputs": [],
"source": [
"%%sql project.athena\n",
"SELECT * FROM \"<enter your db name>\".\"sqad\";"
"#%%sql project.athena\n",
"#SELECT * FROM \"<enter your db name>\".\"sqad\";"
]
},
{
Expand All @@ -165,11 +166,11 @@
"outputs": [],
"source": [
"\n",
"import pandas as pd\n",
"#import pandas as pd\n",
"\n",
"df = _.to_pandas()\n",
"#df = _.to_pandas()\n",
"\n",
"print(\"Number of rows:\", len(df))\n"
"#print(\"Number of rows:\", len(df))\n"
]
},
{
Expand All @@ -179,14 +180,45 @@
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"#from sklearn.model_selection import train_test_split\n",
"\n",
"df = df[:1000]\n",
"#df = df[:1000]\n",
"\n",
"train, test = train_test_split(df, test_size=0.1, random_state=42)\n",
"#train, test = train_test_split(df, test_size=0.1, random_state=42)\n",
"\n",
"print(\"Number of train elements: \", len(train))\n",
"print(\"Number of test elements: \", len(test))"
"#print(\"Number of train elements: \", len(train))\n",
"#print(\"Number of test elements: \", len(test))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60fecae8-83b7-482f-9b44-5c1118906651",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66d13e1c-ef21-47c4-acfe-8773a6d20e05",
"metadata": {},
"outputs": [],
"source": [
"train = load_dataset(\"rajpurkar/squad\", split=\"train\")\n",
"test = load_dataset(\"rajpurkar/squad\", split=\"validation\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e9a41a8-2187-4613-ac19-a594f58c5e50",
"metadata": {},
"outputs": [],
"source": [
"train"
]
},
{
Expand Down Expand Up @@ -229,7 +261,7 @@
"def template_dataset(sample):\n",
" sample[\"text\"] = prompt_template.format(context=sample[\"context\"],\n",
" question=sample[\"question\"],\n",
" answer=sample[\"answer\"])\n",
" answer=sample[\"answers\"])\n",
" return sample"
]
},
Expand Down Expand Up @@ -257,8 +289,8 @@
"source": [
"from datasets import Dataset, DatasetDict\n",
"\n",
"train_dataset = Dataset.from_pandas(train)\n",
"test_dataset = Dataset.from_pandas(test)\n",
"train_dataset = train\n",
"test_dataset = test\n",
"\n",
"dataset = DatasetDict({\"train\": train_dataset, \"test\": test_dataset})\n",
"\n",
Expand Down Expand Up @@ -377,9 +409,10 @@
"%%bash\n",
"\n",
"cat > ./args.yaml <<EOF\n",
"model_id: \"deepseek-ai/DeepSeek-R1-Distill-Llama-8B\" # Hugging Face model id\n",
"mlflow_uri: \"${mlflow_uri}\"\n",
"mlflow_experiment_name: \"${mlflow_experiment_name}\"\n",
"hf_token: \"${hf_token}\"\n",
"model_id: \"meta-llama/Llama-3.1-8B\" # Hugging Face model id\n",
"#mlflow_uri: \"${mlflow_uri}\"\n",
"#mlflow_experiment_name: \"${mlflow_experiment_name}\"\n",
"# sagemaker specific parameters\n",
"output_dir: \"/opt/ml/model\" # path to where SageMaker will upload the model \n",
"train_dataset_path: \"/opt/ml/input/data/train/\" # path to where FSx saves train dataset\n",
Expand All @@ -390,8 +423,8 @@
"lora_dropout: 0.1 \n",
"learning_rate: 2e-4 # learning rate scheduler\n",
"num_train_epochs: 1 # number of training epochs\n",
"per_device_train_batch_size: 2 # batch size per device during training\n",
"per_device_eval_batch_size: 1 # batch size for evaluation\n",
"per_device_train_batch_size: 8 # batch size per device during training\n",
"per_device_eval_batch_size: 2 # batch size for evaluation\n",
"gradient_accumulation_steps: 2 # number of steps before performing a backward/update pass\n",
"gradient_checkpointing: true # use gradient checkpointing\n",
"bf16: true # use bfloat16 precision\n",
Expand Down Expand Up @@ -490,7 +523,7 @@
"metadata": {},
"outputs": [],
"source": [
"instance_type = \"ml.g5.12xlarge\" # Override the instance type if you want to get a different container version\n",
"instance_type = \"ml.g6.24xlarge\" #\"ml.p4d.24xlarge\" #\"ml.g6.48xlarge\" # Override the instance type if you want to get a different container version\n",
"\n",
"instance_type"
]
Expand Down Expand Up @@ -1132,7 +1165,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
"version": "3.12.9"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ class ScriptArguments:
metadata={"help": "Path to the test dataset"}
)

hf_token: str = field(
default="",
metadata={"help": "Hugging Face API token"}
)

def init_distributed():
# Initialize the process group
Expand Down Expand Up @@ -170,7 +174,7 @@ def train(script_args, training_args, train_ds, test_ds):
accelerator = Accelerator()

if script_args.token is not None:
os.environ.update({"HF_TOKEN": script_args.token})
os.environ.update({"HF_TOKEN": script_args.hf_token})
accelerator.wait_for_everyone()

# Download model based on training setup (single or multi-node)
Expand Down Expand Up @@ -381,7 +385,16 @@ def train(script_args, training_args, train_ds, test_ds):
parser = TrlParser((ScriptArguments, TrainingArguments))
script_args, training_args = parser.parse_args_and_config()

set_custom_env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
#set_custom_env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
custom_env: Dict[str, str] = {"HF_DATASETS_TRUST_REMOTE_CODE": "TRUE",
"HF_HUB_ENABLE_HF_TRANSFER": "1",
"HF_TOKEN": script_args.hf_token,
# "FSDP_CPU_RAM_EFFICIENT_LOADING": "1",
# "ACCELERATE_USE_FSDP": "1",
# "WANDB_API_KEY": script_args.wandb_token,
# "WANDB_DIR" : "/opt/ml/output",
# "CUDA_VISIBLE_DEVICES": str(torch.cuda.device_count())
}

if script_args.mlflow_uri is not None and script_args.mlflow_experiment_name is not None and \
script_args.mlflow_uri != "" and script_args.mlflow_experiment_name != "":
Expand Down