Skip to content

add new notebook read data from HF #91

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,9 @@
"source": [
"import os\n",
"\n",
"os.environ[\"mlflow_uri\"] = \"<MLFlow Tracking server ARN>\"\n",
"os.environ[\"mlflow_experiment_name\"] = \"deepseek-r1-distill-llama-8b-sft\""
"os.environ[\"mlflow_uri\"] = \"\"\n",
"os.environ[\"mlflow_experiment_name\"] = \"deepseek-r1-distill-llama-8b-sft\"\n",
"os.environ[\"hf_token\"] = \"\""
]
},
{
Expand Down Expand Up @@ -138,8 +139,8 @@
"metadata": {},
"outputs": [],
"source": [
"db_name = \"<enter your db name>\"\n",
"table = \"sqad\""
"#db_name = \"<enter your db name>\"\n",
"#table = \"sqad\""
]
},
{
Expand All @@ -151,8 +152,8 @@
},
"outputs": [],
"source": [
"%%sql project.athena\n",
"SELECT * FROM \"<enter your db name>\".\"sqad\";"
"#%%sql project.athena\n",
"#SELECT * FROM \"<enter your db name>\".\"sqad\";"
]
},
{
Expand All @@ -165,11 +166,11 @@
"outputs": [],
"source": [
"\n",
"import pandas as pd\n",
"#import pandas as pd\n",
"\n",
"df = _.to_pandas()\n",
"#df = _.to_pandas()\n",
"\n",
"print(\"Number of rows:\", len(df))\n"
"#print(\"Number of rows:\", len(df))\n"
]
},
{
Expand All @@ -179,14 +180,45 @@
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"#from sklearn.model_selection import train_test_split\n",
"\n",
"df = df[:1000]\n",
"#df = df[:1000]\n",
"\n",
"train, test = train_test_split(df, test_size=0.1, random_state=42)\n",
"#train, test = train_test_split(df, test_size=0.1, random_state=42)\n",
"\n",
"print(\"Number of train elements: \", len(train))\n",
"print(\"Number of test elements: \", len(test))"
"#print(\"Number of train elements: \", len(train))\n",
"#print(\"Number of test elements: \", len(test))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60fecae8-83b7-482f-9b44-5c1118906651",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66d13e1c-ef21-47c4-acfe-8773a6d20e05",
"metadata": {},
"outputs": [],
"source": [
"train = load_dataset(\"rajpurkar/squad\", split=\"train\")\n",
"test = load_dataset(\"rajpurkar/squad\", split=\"validation\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e9a41a8-2187-4613-ac19-a594f58c5e50",
"metadata": {},
"outputs": [],
"source": [
"train"
]
},
{
Expand Down Expand Up @@ -229,7 +261,7 @@
"def template_dataset(sample):\n",
" sample[\"text\"] = prompt_template.format(context=sample[\"context\"],\n",
" question=sample[\"question\"],\n",
" answer=sample[\"answer\"])\n",
" answer=sample[\"answers\"])\n",
" return sample"
]
},
Expand Down Expand Up @@ -257,8 +289,8 @@
"source": [
"from datasets import Dataset, DatasetDict\n",
"\n",
"train_dataset = Dataset.from_pandas(train)\n",
"test_dataset = Dataset.from_pandas(test)\n",
"train_dataset = train\n",
"test_dataset = test\n",
"\n",
"dataset = DatasetDict({\"train\": train_dataset, \"test\": test_dataset})\n",
"\n",
Expand Down Expand Up @@ -377,9 +409,10 @@
"%%bash\n",
"\n",
"cat > ./args.yaml <<EOF\n",
"model_id: \"deepseek-ai/DeepSeek-R1-Distill-Llama-8B\" # Hugging Face model id\n",
"mlflow_uri: \"${mlflow_uri}\"\n",
"mlflow_experiment_name: \"${mlflow_experiment_name}\"\n",
"hf_token: \"${hf_token}\"\n",
"model_id: \"meta-llama/Llama-3.1-8B\" # Hugging Face model id\n",
"#mlflow_uri: \"${mlflow_uri}\"\n",
"#mlflow_experiment_name: \"${mlflow_experiment_name}\"\n",
"# sagemaker specific parameters\n",
"output_dir: \"/opt/ml/model\" # path to where SageMaker will upload the model \n",
"train_dataset_path: \"/opt/ml/input/data/train/\" # path to where FSx saves train dataset\n",
Expand All @@ -390,8 +423,8 @@
"lora_dropout: 0.1 \n",
"learning_rate: 2e-4 # learning rate scheduler\n",
"num_train_epochs: 1 # number of training epochs\n",
"per_device_train_batch_size: 2 # batch size per device during training\n",
"per_device_eval_batch_size: 1 # batch size for evaluation\n",
"per_device_train_batch_size: 8 # batch size per device during training\n",
"per_device_eval_batch_size: 2 # batch size for evaluation\n",
"gradient_accumulation_steps: 2 # number of steps before performing a backward/update pass\n",
"gradient_checkpointing: true # use gradient checkpointing\n",
"bf16: true # use bfloat16 precision\n",
Expand Down Expand Up @@ -490,7 +523,7 @@
"metadata": {},
"outputs": [],
"source": [
"instance_type = \"ml.g5.12xlarge\" # Override the instance type if you want to get a different container version\n",
"instance_type = \"ml.g6.24xlarge\" #\"ml.p4d.24xlarge\" #\"ml.g6.48xlarge\" # Override the instance type if you want to get a different container version\n",
"\n",
"instance_type"
]
Expand Down Expand Up @@ -1132,7 +1165,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
"version": "3.12.9"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ class ScriptArguments:
metadata={"help": "Path to the test dataset"}
)

hf_token: str = field(
default="",
metadata={"help": "Hugging Face API token"}
)

def init_distributed():
# Initialize the process group
Expand Down Expand Up @@ -170,7 +174,7 @@ def train(script_args, training_args, train_ds, test_ds):
accelerator = Accelerator()

if script_args.token is not None:
os.environ.update({"HF_TOKEN": script_args.token})
os.environ.update({"HF_TOKEN": script_args.hf_token})
accelerator.wait_for_everyone()

# Download model based on training setup (single or multi-node)
Expand Down Expand Up @@ -381,7 +385,16 @@ def train(script_args, training_args, train_ds, test_ds):
parser = TrlParser((ScriptArguments, TrainingArguments))
script_args, training_args = parser.parse_args_and_config()

set_custom_env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
#set_custom_env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
custom_env: Dict[str, str] = {"HF_DATASETS_TRUST_REMOTE_CODE": "TRUE",
"HF_HUB_ENABLE_HF_TRANSFER": "1",
"HF_TOKEN": script_args.hf_token,
# "FSDP_CPU_RAM_EFFICIENT_LOADING": "1",
# "ACCELERATE_USE_FSDP": "1",
# "WANDB_API_KEY": script_args.wandb_token,
# "WANDB_DIR" : "/opt/ml/output",
# "CUDA_VISIBLE_DEVICES": str(torch.cuda.device_count())
}

if script_args.mlflow_uri is not None and script_args.mlflow_experiment_name is not None and \
script_args.mlflow_uri != "" and script_args.mlflow_experiment_name != "":
Expand Down