adding better documentation throughout
rdiehlmartinez committed Dec 2, 2024
1 parent 0b81315 commit f2827c9
Showing 8 changed files with 574 additions and 227 deletions.
16 changes: 8 additions & 8 deletions config.py
@@ -1,5 +1,5 @@
"""
Welcome to the Pico Config File!
The Pico Config File!
This is where you can specify the hyperparameters for the Pico model, the dataset, the training
process, evaluation yada yada.
@@ -19,16 +19,12 @@

 VOCAB_SIZE = 50304
 MAX_SEQ_LEN = 2048
-BATCH_SIZE = 2
+BATCH_SIZE = 1024
 GRADIENT_ACCUMULATION_STEPS = (
-    1  # NOTE: Play with this to make the batch size fit in memory.
+    128  # NOTE: Play with this to make the batch size fit in memory.
 )
-# BATCH_SIZE = 1024
-# GRADIENT_ACCUMULATION_STEPS = (
-#     128  # NOTE: Play with this to make the batch size fit in memory.
-# )

-# N.B. The effective batch size is BATCH_SIZE // GRADIENT_ACCUMULATION_STEPS.
+# NOTE! The effective batch size is BATCH_SIZE // GRADIENT_ACCUMULATION_STEPS.

 ########################################################
 #
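
Working through the new values in that note: with BATCH_SIZE = 1024 and GRADIENT_ACCUMULATION_STEPS = 128, each forward/backward pass sees 1024 // 128 = 8 sequences, and 128 such passes are accumulated before each optimizer step. A minimal sketch of how the two knobs interact (illustration only, not code from this commit):

# Illustration only: how the two config knobs above interact.
BATCH_SIZE = 1024
GRADIENT_ACCUMULATION_STEPS = 128

# Each forward/backward pass only needs to fit this many sequences in memory:
micro_batch_size = BATCH_SIZE // GRADIENT_ACCUMULATION_STEPS
assert micro_batch_size == 8

def accumulated_loss(micro_batch_losses):
    """Scale each micro-batch loss so the summed gradients match one
    optimizer step taken on the full batch."""
    return sum(loss / GRADIENT_ACCUMULATION_STEPS for loss in micro_batch_losses)

# 128 micro-batches with loss 2.0 behave like one full-batch step with loss 2.0:
print(accumulated_loss([2.0] * GRADIENT_ACCUMULATION_STEPS))  # 2.0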
@@ -177,6 +173,10 @@ class EvaluationConfig:
     # Path to load a checkpoint from a local path
     checkpoint_path: Optional[str] = None

+    # HuggingFace Hub Configs - set to None to not push to HuggingFace Hub
+    # Should be in the format of <username or org>/<repo_name>, e.g. pico-lm/pico-7b
+    save_checkpoint_repo_id: Optional[str] = "pico-lm/demo"
+
     # Evaluation metrics to compute: by default, we compute the perplexity of the model
     evaluation_metrics: List[str] = field(default_factory=lambda: ["paloma"])

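The new save_checkpoint_repo_id field implies a push-to-Hub step elsewhere in the commit that this truncated diff does not show. A hedged sketch of how a field like this is typically consumed; the helper name and checkpoint layout are assumptions, while the huggingface_hub calls are the library's real API:

# Sketch only: one plausible consumer of save_checkpoint_repo_id.
# maybe_push_checkpoint is a hypothetical helper, not this repo's API.
from typing import Optional

from huggingface_hub import HfApi

def maybe_push_checkpoint(checkpoint_dir: str, repo_id: Optional[str]) -> None:
    if repo_id is None:  # per the config comment: None disables Hub pushes
        return
    api = HfApi()  # picks up HF_TOKEN from the environment
    api.create_repo(repo_id=repo_id, exist_ok=True)
    api.upload_folder(folder_path=checkpoint_dir, repo_id=repo_id)

# maybe_push_checkpoint("checkpoints/step_1000", "pico-lm/demo")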
19 changes: 17 additions & 2 deletions model.py
@@ -1,6 +1,7 @@
"""
Beep Boop - this is the Pico Model: a lightweight transformer-based language model. Pico uses a
a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
The Pico Model: a lightweight transformer-based language model.
Pico uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
Everything is written with a modular design for easy modification and experimentation.
@@ -514,6 +515,20 @@ def forward(
 #
 ########################################################

+"""
+HuggingFace wrapper for the Pico model.
+Wait why do we need a wrapper? Aren't we just using the Pico class directly? Good question!
+Many evaluation frameworks require a model be setup as a HuggingFace model, so we provide a simple
+wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
+Pico model as well as the model wrapped in this HuggingFace class.
+This also lets you do cool things like:
+`model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
+"""


 class PicoHFConfig(PretrainedConfig):
     """HuggingFace config for Pico model."""
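Spelling out the docstring's AutoModelForCausalLM example as a runnable sketch. The trust_remote_code=True flag is an assumption: custom classes like PicoHFConfig usually need it unless they are registered locally, and this diff does not show how the wrapper is registered.

# Sketch: loading a saved Pico checkpoint through the HuggingFace wrapper.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "path/to/checkpoint",    # a local checkpoint dir or a Hub repo id
    trust_remote_code=True,  # assumed necessary for the custom Pico classes
)
model.eval()  # e.g. before computing perplexity-style evaluation metrics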
45 changes: 38 additions & 7 deletions setup.sh
@@ -2,46 +2,76 @@
 # This script sets up the project by installing dependencies, checking for a poetry environment,
 # and installing pre-commit hooks.

+# Add color and formatting variables at the top
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+BOLD='\033[1m'
+
+# Function for section headers
+print_section() {
+    echo -e "\n${BOLD}${BLUE}=== $1 ===${NC}\n"
+}
+
+# Function for success messages
+print_success() {
+    echo -e "${GREEN}$1${NC}"
+}
+
+# Function for warnings
+print_warning() {
+    echo -e "${YELLOW}$1${NC}"
+}
+
+# Initialize and update git submodules
+print_section "Git Submodules"
+echo "Initializing git submodules..."
+git submodule update --init --recursive
+
 # ---- ENVIRONMENT VARIABLES ---- #
 # Source .env file if it exists
+print_section "Environment Variables"
 if [ -f .env ]; then
-    echo "Loading environment variables from .env..."
+    print_success "Loading environment variables from .env..."
     source .env
 else
-    echo "Warning: No .env file found. You might need to create one with HF_TOKEN and WANDB_API_KEY"
-    echo "Example .env contents:"
+    print_warning "No .env file found. You might need to create one with HF_TOKEN and WANDB_API_KEY"
+    echo -e "${YELLOW}Example .env contents:${NC}"
     echo "export HF_TOKEN=your_huggingface_token"
     echo "export WANDB_API_KEY=your_wandb_key"
 fi

 # ---- EVALUATION SETUP ---- #
 # Clone Paloma dataset if credentials are provided and directory doesn't exist
+print_section "Evaluation Setup"
 if [ ! -d "lib/paloma" ]; then
     if [ ! -z "$HF_TOKEN" ]; then
         echo "Cloning Paloma evaluation dataset..."
         git clone https://oauth2:${HF_TOKEN}@huggingface.co/datasets/allenai/paloma lib/paloma
+        print_success "Paloma dataset cloned successfully"
     else
-        echo "Skipping Paloma dataset clone. To clone, provide HuggingFace credentials"
+        print_warning "Skipping Paloma dataset clone. To clone, provide HuggingFace credentials"
     fi
 else
-    echo "Paloma dataset already exists, skipping clone"
+    print_success "Paloma dataset already exists, skipping clone"
 fi

 # Create environment for running evaluation inside of lib/olmo_eval
 # skip if already exists
 if [ ! -d "lib/olmo-eval/env" ]; then
+    print_section "OLMo Eval Setup"
     cd lib/olmo-eval
     echo "Creating virtual environment..."
     virtualenv env
     source env/bin/activate
     pip install -e .
     deactivate
     cd ../../
+    print_success "OLMo eval environment setup complete"
 else
-    echo "olmo-eval environment already exists, skipping setup"
+    print_success "olmo-eval environment already exists, skipping setup"
 fi

 # ---- POETRY ENVIRONMENT SETUP ---- #
@@ -54,10 +84,11 @@ check_poetry_env() {
     fi
 }

-# Check if poetry environment exists
+print_section "Poetry Environment Setup"
 if ! check_poetry_env; then
     echo "No poetry environment found. Initializing..."
     poetry install --with dev --no-root
+    print_success "Poetry environment created successfully"
 fi

 # Check if we're already in a Poetry shell
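setup.sh authenticates the Paloma clone by embedding HF_TOKEN in the git remote URL. For reference, the same gated dataset can be fetched from Python with huggingface_hub; this is a hedged alternative, not what the script actually does:

# Alternative to the token-in-URL git clone in setup.sh (sketch only).
import os

from huggingface_hub import snapshot_download

token = os.environ.get("HF_TOKEN")
if token:
    # Downloads the gated Paloma dataset into the same lib/paloma location.
    snapshot_download(
        repo_id="allenai/paloma",
        repo_type="dataset",
        local_dir="lib/paloma",
        token=token,
    )
else:
    print("Skipping Paloma download: set HF_TOKEN first.")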
(Diffs for the remaining 5 changed files are not shown here.)
