adding better documentation throughout
rdiehlmartinez committed Dec 2, 2024
1 parent 0b81315 commit f2827c9
Showing 8 changed files with 574 additions and 227 deletions.
16 changes: 8 additions & 8 deletions config.py
@@ -1,5 +1,5 @@
"""
Welcome to the Pico Config File!
The Pico Config File!
This is where you can specify the hyperparameters for the Pico model, the dataset, the training
process, evaluation yada yada.
@@ -19,16 +19,12 @@

 VOCAB_SIZE = 50304
 MAX_SEQ_LEN = 2048
-BATCH_SIZE = 2
+BATCH_SIZE = 1024
 GRADIENT_ACCUMULATION_STEPS = (
-    1  # NOTE: Play with this to make the batch size fit in memory.
+    128  # NOTE: Play with this to make the batch size fit in memory.
 )
-# BATCH_SIZE = 1024
-# GRADIENT_ACCUMULATION_STEPS = (
-#     128  # NOTE: Play with this to make the batch size fit in memory.
-# )

-# N.B. The effective batch size is BATCH_SIZE // GRADIENT_ACCUMULATION_STEPS.
+# NOTE! The effective batch size is BATCH_SIZE // GRADIENT_ACCUMULATION_STEPS.

 ########################################################
 #
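
Working through the new values in that note: with BATCH_SIZE = 1024 and GRADIENT_ACCUMULATION_STEPS = 128, each forward/backward pass sees 1024 // 128 = 8 sequences, and 128 such passes are accumulated before each optimizer step. A minimal sketch of how the two knobs interact (illustration only, not code from this commit):

# Illustration only: how the two config knobs above interact.
BATCH_SIZE = 1024
GRADIENT_ACCUMULATION_STEPS = 128

# Each forward/backward pass only needs to fit this many sequences in memory:
micro_batch_size = BATCH_SIZE // GRADIENT_ACCUMULATION_STEPS
assert micro_batch_size == 8

def accumulated_loss(micro_batch_losses):
    """Scale each micro-batch loss so the summed gradients match one
    optimizer step taken on the full batch."""
    return sum(loss / GRADIENT_ACCUMULATION_STEPS for loss in micro_batch_losses)

# 128 micro-batches with loss 2.0 behave like one full-batch step with loss 2.0:
print(accumulated_loss([2.0] * GRADIENT_ACCUMULATION_STEPS))  # 2.0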
@@ -177,6 +173,10 @@ class EvaluationConfig:
     # Path to load a checkpoint from a local path
     checkpoint_path: Optional[str] = None

+    # HuggingFace Hub Configs - set to None to not push to HuggingFace Hub
+    # Should be in the format of <username or org>/<repo_name>, e.g. pico-lm/pico-7b
+    save_checkpoint_repo_id: Optional[str] = "pico-lm/demo"
+
     # Evaluation metrics to compute: by default, we compute the perplexity of the model
     evaluation_metrics: List[str] = field(default_factory=lambda: ["paloma"])

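The new save_checkpoint_repo_id field implies a push-to-Hub step elsewhere in the commit that this truncated diff does not show. A hedged sketch of how a field like this is typically consumed; the helper name and checkpoint layout are assumptions, while the huggingface_hub calls are the library's real API:

# Sketch only: one plausible consumer of save_checkpoint_repo_id.
# maybe_push_checkpoint is a hypothetical helper, not this repo's API.
from typing import Optional

from huggingface_hub import HfApi

def maybe_push_checkpoint(checkpoint_dir: str, repo_id: Optional[str]) -> None:
    if repo_id is None:  # per the config comment: None disables Hub pushes
        return
    api = HfApi()  # picks up HF_TOKEN from the environment
    api.create_repo(repo_id=repo_id, exist_ok=True)
    api.upload_folder(folder_path=checkpoint_dir, repo_id=repo_id)

# maybe_push_checkpoint("checkpoints/step_1000", "pico-lm/demo")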
19 changes: 17 additions & 2 deletions model.py
@@ -1,6 +1,7 @@
"""
Beep Boop - this is the Pico Model: a lightweight transformer-based language model. Pico uses a
a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
The Pico Model: a lightweight transformer-based language model.
Pico uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
Everything is written with a modular design for easy modification and experimentation.
@@ -514,6 +515,20 @@ def forward(
 #
 ########################################################

+"""
+HuggingFace wrapper for the Pico model.
+Wait why do we need a wrapper? Aren't we just using the Pico class directly? Good question!
+Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
+wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
+Pico model as well as the model wrapped in this HuggingFace class.
+This also lets you do cool things like:
+`model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
+"""


 class PicoHFConfig(PretrainedConfig):
     """HuggingFace config for Pico model."""
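Spelling out the docstring's AutoModelForCausalLM example as a runnable sketch. The trust_remote_code=True flag is an assumption: custom classes like PicoHFConfig usually need it unless they are registered locally, and this diff does not show how the wrapper is registered.

# Sketch: loading a saved Pico checkpoint through the HuggingFace wrapper.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "path/to/checkpoint",    # a local checkpoint dir or a Hub repo id
    trust_remote_code=True,  # assumed necessary for the custom Pico classes
)
model.eval()  # e.g. before computing perplexity-style evaluation metrics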
45 changes: 38 additions & 7 deletions setup.sh
@@ -2,46 +2,76 @@
 # This script sets up the project by installing dependencies, checking for a poetry environment,
 # and installing pre-commit hooks.

+# Add color and formatting variables at the top
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+BOLD='\033[1m'
+
+# Function for section headers
+print_section() {
+    echo -e "\n${BOLD}${BLUE}=== $1 ===${NC}\n"
+}
+
+# Function for success messages
+print_success() {
+    echo -e "${GREEN}$1${NC}"
+}
+
+# Function for warnings
+print_warning() {
+    echo -e "${YELLOW}$1${NC}"
+}
+
+# Initialize and update git submodules
+print_section "Git Submodules"
+echo "Initializing git submodules..."
+git submodule update --init --recursive
+
 # ---- ENVIRONMENT VARIABLES ---- #
 # Source .env file if it exists
+print_section "Environment Variables"
 if [ -f .env ]; then
-    echo "Loading environment variables from .env..."
+    print_success "Loading environment variables from .env..."
     source .env
 else
-    echo "Warning: No .env file found. You might need to create one with HF_TOKEN and WANDB_API_KEY"
-    echo "Example .env contents:"
+    print_warning "No .env file found. You might need to create one with HF_TOKEN and WANDB_API_KEY"
+    echo -e "${YELLOW}Example .env contents:${NC}"
     echo "export HF_TOKEN=your_huggingface_token"
     echo "export WANDB_API_KEY=your_wandb_key"
 fi

 # ---- EVALUATION SETUP ---- #
 # Clone Paloma dataset if credentials are provided and directory doesn't exist
+print_section "Evaluation Setup"
 if [ ! -d "lib/paloma" ]; then
     if [ ! -z "$HF_TOKEN" ]; then
         echo "Cloning Paloma evaluation dataset..."
         git clone https://oauth2:${HF_TOKEN}@huggingface.co/datasets/allenai/paloma lib/paloma
+        print_success "Paloma dataset cloned successfully"
     else
-        echo "Skipping Paloma dataset clone. To clone, provide HuggingFace credentials"
+        print_warning "Skipping Paloma dataset clone. To clone, provide HuggingFace credentials"
     fi
 else
-    echo "Paloma dataset already exists, skipping clone"
+    print_success "Paloma dataset already exists, skipping clone"
 fi

 # Create environment for running evaluation inside of lib/olmo_eval
 # skip if already exists
 if [ ! -d "lib/olmo-eval/env" ]; then
+    print_section "OLMo Eval Setup"
     cd lib/olmo-eval
     echo "Creating virtual environment..."
     virtualenv env
     source env/bin/activate
     pip install -e .
     deactivate
     cd ../../
+    print_success "OLMo eval environment setup complete"
 else
-    echo "olmo-eval environment already exists, skipping setup"
+    print_success "olmo-eval environment already exists, skipping setup"
 fi

 # ---- POETRY ENVIRONMENT SETUP ---- #
@@ -54,10 +84,11 @@ check_poetry_env() {
     fi
 }

-# Check if poetry environment exists
+print_section "Poetry Environment Setup"
 if ! check_poetry_env; then
     echo "No poetry environment found. Initializing..."
     poetry install --with dev --no-root
+    print_success "Poetry environment created successfully"
 fi

 # Check if we're already in a Poetry shell
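setup.sh authenticates the Paloma clone by embedding HF_TOKEN in the git remote URL. For reference, the same gated dataset can be fetched from Python with huggingface_hub; this is a hedged alternative, not what the script actually does:

# Alternative to the token-in-URL git clone in setup.sh (sketch only).
import os

from huggingface_hub import snapshot_download

token = os.environ.get("HF_TOKEN")
if token:
    # Downloads the gated Paloma dataset into the same lib/paloma location.
    snapshot_download(
        repo_id="allenai/paloma",
        repo_type="dataset",
        local_dir="lib/paloma",
        token=token,
    )
else:
    print("Skipping Paloma download: set HF_TOKEN first.")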
(Diffs for the remaining 5 changed files are not shown here.)
