Skip to content

Commit 470ca1b

Browse files
committed
Add week02 materials
1 parent 732a6ff commit 470ca1b

19 files changed

+693
-1
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ __This branch corresponds to the ongoing 2024 course. If you want to see full ma
77
- [__Week 1:__](./week01_intro) __Introduction__
88
- Lecture: Course overview and organizational details. Core concepts of the GPU architecture and CUDA API.
99
- Seminar: CUDA operations in PyTorch. Introduction to benchmarking.
10-
- __Week 2:__ __Experiment tracking, model and data versioning, testing DL code in Python__
10+
- [__Week 2:__](./week02_management_and_testing) __Experiment tracking, model and data versioning, testing DL code in Python__
11+
- Lecture: Experiment management basics and pipeline versioning. Configuring Python applications. Intro to regular and property-based testing.
12+
- Seminar: Example DVC+Weights & Biases project walkthrough. Intro to testing with pytest.
1113
- __Week 3:__ __Training optimizations, profiling DL code__
1214
- __Week 4:__ __Basics of distributed ML__
1315
- __Week 5:__ __Data-parallel training and All-Reduce__
+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Week 2: Experiment tracking and testing
2+
3+
* Lecture: [slides](./lecture.pdf)
4+
* Seminar: see the [example_project](./example_project) directory
5+
* Homework: see [homework/README.md](homework/README.md)
6+
7+
## Further reading
8+
* Tools for experiment tracking: [Aim](https://github.com/aimhubio/aim), [Comet](https://www.comet.ml/site/), [Neptune](https://neptune.ai/), [Sacred](https://github.com/IDSIA/sacred), [Weights and Biases](https://wandb.ai/), [ClearML](https://clear.ml/)
9+
* [DVC](https://dvc.org/) and [Pachyderm](https://www.pachyderm.com/) for artifact versioning
10+
* [Hydra documentation](https://hydra.cc/docs/intro/)
11+
* [Unittest](https://docs.python.org/3/library/unittest.html) built-in module
12+
* [Doctest](https://docs.python.org/3/library/doctest.html) built-in module (useful for testing docstrings!)
13+
* [Pytest](https://github.com/pytest-dev/pytest/) repository
14+
* Pytest plugins: [pytest-xdist](https://pypi.org/project/pytest-xdist/) for parallel execution, [pytest-cov](https://pytest-cov.readthedocs.io/en/latest/readme.html) for coverage reports.
15+
* [Hypothesis quick start guide](https://hypothesis.readthedocs.io/en/latest/quickstart.html) and [integration with pytest](https://hypothesis.readthedocs.io/en/latest/details.html#the-hypothesis-pytest-plugin)
16+
* [Full Stack Deep Learning "Troubleshooting & Testing" lecture](https://fullstackdeeplearning.com/course/2022/lecture-3-troubleshooting-and-testing/#4-resources)
17+
* [Made With ML MLOps Course, "Testing Machine Learning Systems: Code, Data and Models"](https://madewithml.com/courses/mlops/testing/)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import json
2+
from argparse import ArgumentParser
3+
4+
import torch
5+
import torchvision.transforms as transforms
6+
from torchvision.datasets import CIFAR10
7+
from torchvision.models import resnet18
8+
9+
from hparams import config
10+
11+
12+
def main(args):
    """Evaluate the saved ResNet-18 on the CIFAR-10 test split.

    Loads the weights from ``model.pt``, runs the model over the dataset
    stored under ``CIFAR10/test`` and writes ``{"accuracy": <float>}`` to
    ``final_metrics.json``.

    Args:
        args: Parsed command-line namespace. Currently unused.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
        # Same preprocessing as train.py, so the final metric is measured at
        # the input resolution the model was trained and monitored on.
        transforms.Resize((224, 224)),
    ])

    test_dataset = CIFAR10(
        root='CIFAR10/test',
        train=False,
        transform=transform,
        download=False,
    )

    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=config["batch_size"],
    )

    # Fall back to CPU so the metrics stage also runs on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = resnet18(pretrained=False, num_classes=10)
    # map_location lets a checkpoint written from a GPU run load on CPU as well.
    model.load_state_dict(torch.load("model.pt", map_location=device))
    model.to(device)
    # Evaluation mode: normalization layers must use their stored running
    # statistics instead of the statistics of each test batch.
    model.eval()

    correct = 0

    with torch.inference_mode():
        for test_images, test_labels in test_loader:
            test_images = test_images.to(device)
            test_labels = test_labels.to(device)

            outputs = model(test_images)
            preds = torch.argmax(outputs, 1)
            # Accumulate as a Python int so the final ratio is an exact float.
            correct += (preds == test_labels).sum().item()

    accuracy = correct / len(test_dataset)

    with open("final_metrics.json", "w+") as f:
        json.dump({"accuracy": accuracy}, f)
        print("\n", file=f)
49+
50+
51+
if __name__ == "__main__":
    # No options are declared yet; parsing still provides --help and rejects
    # unexpected arguments before the evaluation starts.
    main(ArgumentParser().parse_args())
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# DVC pipeline: download data -> train -> compute final metrics.
stages:
  prepare_data:
    cmd: python prepare_data.py
    deps:
      - prepare_data.py
    outs:
      - CIFAR10
  train:
    cmd: python train.py
    deps:
      - CIFAR10
      - hparams.py
      - train.py
    outs:
      - model.pt
  compute_metrics:
    cmd: python compute_metrics.py
    deps:
      - CIFAR10
      - compute_metrics.py
      # compute_metrics.py imports `config` from hparams.py (batch size), so
      # the stage must be invalidated when the hyperparameters change.
      - hparams.py
      - model.pt
    metrics:
      - final_metrics.json:
          cache: false
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Hyperparameters of the example project; imported as `config` by the
# training and metrics scripts.
config = {
    "batch_size": 64,
    "learning_rate": 1e-5,
    "weight_decay": 0.01,
    "epochs": 2,
    "zero_init_residual": False,
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from torchvision.datasets import CIFAR10
2+
3+
if __name__ == "__main__":
    # Download CIFAR-10 into both roots; the other scripts open them with
    # download=False and therefore expect the files to be present already.
    for dataset_root in ("CIFAR10/train", "CIFAR10/test"):
        CIFAR10(dataset_root, download=True)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import torch
2+
import pytest
3+
4+
from train import compute_accuracy
5+
6+
def test_arange_elems():
    """The last element of ``arange(0, 10)`` must be 9."""
    values = torch.arange(0, 10, dtype=torch.float)
    expected_last = torch.tensor([9]).float()
    assert torch.allclose(values[-1], expected_last)
9+
10+
def test_div_zero():
    """Dividing a long tensor of ones by a long tensor of zeros is non-finite."""
    denominator = torch.zeros(1, dtype=torch.long)
    numerator = torch.ones(1, dtype=torch.long)

    quotient = numerator / denominator
    assert not torch.isfinite(quotient)
15+
16+
17+
def test_div_zero_python():
    """Plain Python division by zero must raise ``ZeroDivisionError``."""
    one, zero = 1, 0
    with pytest.raises(ZeroDivisionError):
        one / zero
20+
21+
def test_accuracy():
    """Check ``compute_accuracy`` on a perfect match and on a half match."""
    # Identical random predictions and targets: every element matches.
    random_preds = torch.randint(0, 2, size=(100,))
    assert compute_accuracy(random_preds, random_preds.clone()) == 1.0

    # Only the first three of six elements agree.
    half_right_preds = torch.tensor([1, 2, 3, 0, 0, 0])
    half_right_targets = torch.tensor([1, 2, 3, 4, 5, 6])
    assert compute_accuracy(half_right_preds, half_right_targets) == 0.5
31+
32+
@pytest.mark.parametrize(
    "preds,targets,result",
    [
        (torch.tensor([1, 2, 3]), torch.tensor([1, 2, 3]), 1.0),
        (torch.tensor([1, 2, 3]), torch.tensor([0, 0, 0]), 0.0),
        (torch.tensor([1, 2, 3]), torch.tensor([1, 2, 0]), 2 / 3),
    ],
)
def test_accuracy_parametrized(preds, targets, result):
    """Compare ``compute_accuracy`` with the expected value for each case."""
    actual = compute_accuracy(preds, targets)
    expected = torch.tensor([result])
    # Absolute tolerance only: 2/3 is not exactly representable in float32.
    assert torch.allclose(actual, expected, rtol=0, atol=1e-5)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import torch
2+
import torch.nn as nn
3+
import torchvision.transforms as transforms
4+
import wandb
5+
from torchvision.datasets import CIFAR10
6+
from torchvision.models import resnet18
7+
from tqdm import tqdm, trange
8+
9+
from hparams import config
10+
11+
# NOTE: this runs at import time, not only when the file is executed as a
# script, so any `from train import ...` also calls wandb.init.
wandb.init(config=config, project="effdl_example", name="baseline")
12+
13+
def compute_accuracy(preds, targets):
    """Return the fraction of predictions that equal their targets.

    Args:
        preds: Tensor of predicted class indices.
        targets: Tensor of reference class indices, comparable element-wise
            with ``preds``.

    Returns:
        A zero-dimensional float tensor in ``[0, 1]``.
    """
    # mean(), not sum(): accuracy is the share of matches, not their count.
    return (targets == preds).float().mean()
16+
17+
18+
def main():
    """Train ResNet-18 on CIFAR-10 and log metrics to Weights & Biases.

    Every 100 training iterations the whole test split is evaluated and the
    test accuracy is logged together with the current training loss. After
    the last epoch the weights are written to ``model.pt`` and the W&B run
    id to ``run_id.txt``.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
        transforms.Resize((224, 224)),
    ])

    train_dataset = CIFAR10(
        root='CIFAR10/train',
        train=True,
        transform=transform,
        download=False,
    )

    test_dataset = CIFAR10(
        root='CIFAR10/test',
        train=False,
        transform=transform,
        download=False,
    )

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=config["batch_size"],
        shuffle=True,
    )

    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=config["batch_size"],
    )

    # Fall back to CPU so the script does not crash on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = resnet18(pretrained=False, num_classes=10, zero_init_residual=config["zero_init_residual"])
    model.to(device)
    wandb.watch(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])

    for epoch in trange(config["epochs"]):
        for i, (images, labels) in enumerate(tqdm(train_loader)):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if i % 100 == 0:
                all_preds = []
                all_labels = []

                # Evaluate in eval mode so normalization layers use their
                # running statistics and test batches cannot influence them.
                model.eval()
                with torch.inference_mode():
                    for test_images, test_labels in test_loader:
                        test_images = test_images.to(device)
                        test_labels = test_labels.to(device)

                        test_outputs = model(test_images)
                        preds = torch.argmax(test_outputs, 1)

                        all_preds.append(preds)
                        all_labels.append(test_labels)
                # Back to training mode for the next optimization step.
                model.train()

                accuracy = compute_accuracy(torch.cat(all_preds), torch.cat(all_labels))

                # Log plain Python numbers rather than tensors; the loss
                # tensor would otherwise keep its autograd history attached.
                metrics = {'test_acc': float(accuracy), 'train_loss': loss.item()}
                # The step counts training samples seen so far.
                wandb.log(metrics, step=epoch * len(train_dataset) + (i + 1) * config["batch_size"])

    torch.save(model.state_dict(), "model.pt")

    with open("run_id.txt", "w+") as f:
        print(wandb.run.id, file=f)
88+
89+
90+
# Start training only when the file is executed as a script.
if __name__ == "__main__":
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Week 2 home assignment
2+
3+
This assignment consists of 4 parts: you can earn the full amount of points by completing the first two and either of
4+
tasks 3 and 4 (or both of them for bonus points).
5+
However, completing tasks 3 or 4 without the first two will not give you any points.
6+
7+
# Problem statement
8+
You are given a small codebase that should train an **unconditional** [Denoising Diffusion Probabilistic Model](https://arxiv.org/abs/2006.11239)
9+
on the CIFAR-10 dataset.
10+
However, this project contains several bugs of different severity, and even some of the tests are written incorrectly.
11+
A correct implementation will achieve *somewhat* decent results after training for 100 epochs (~2 hours on an average GPU),
12+
but you should not expect much in terms of quality.
13+
In this homework, we are going to have a deeper look at the training pipeline, try to fix any errors we find and make
14+
the code more reliable and reproducible.
15+
16+
# Task 1 (6.5 points)
17+
Implement *correct* tests for the training pipeline.
18+
Specifically, have a look at the current [tests](./tests) folder: it contains several files with tests,
19+
some of which fail, fail sometimes or are plainly incorrect.
20+
Your task is to identify the bugs and make the test suite pass deterministically: this will involve changes
21+
both to `modeling` and to `tests`, as some parts of the testing code need to be modified as well.
22+
23+
In your report, please tell us how you found the bugs in all parts of the code.
24+
You can find the original implementation of DDPM that we use in this assignment, but giving it as an explanation for
25+
your fixes will give you no points.
26+
Obviously, "solving" the assignment by removing all tests or having unreasonably high thresholds will not earn
27+
you a good grade either.
28+
29+
After that, implement the `test_training` function in `test_pipeline.py` that runs an integration test for the
30+
entire training procedure with different hyperparameters and expects different outcomes.
31+
This test should increase the coverage of the `modeling.training` file (measured by [pytest-cov](https://github.com/pytest-dev/pytest-cov)) to **>80%**.
32+
33+
Importantly, you should ensure that your test code running the actual model can run both on CPU and GPU.
34+
Since training on CPU even for 1 epoch might take too long, you need to implement training on a subset of data.
35+
36+
37+
# Task 2 (1.5 points)
38+
Implement logging of the metrics and artifacts during training with [Weights and Biases](https://wandb.ai/site).
39+
You should log the following values:
40+
* Training loss and the learning rate
41+
* All training hyperparameters (including batch size, number of epochs etc., as well as all model and diffusion hyperparameters)
42+
* Inputs to the model (1 batch is enough) and samples from it after each epoch
43+
44+
However, you should **NOT** log the training code for the model.
45+
46+
Logging the hyperparameters and metrics will likely involve some refactoring of the original codebase.
47+
You can either place the necessary hyperparameters in a config file or simply have them as constants/argparse defaults
48+
defined somewhere reasonable in the training code.
49+
50+
After finishing this task, train the model for at least 100 epochs with default hyperparameters and attach the link to
51+
your W&B project containing this run to the final report.
52+
53+
# Task 3 (2 points)
54+
Improve the configuration process of this pipeline using the [Hydra](https://hydra.cc/) library.
55+
You should create a config that allows adjusting at least the following attributes:
56+
* Peak learning rate and optimizer momentum
57+
* Optimizer (Adam by default, at least SGD should be supported)
58+
* Training batch size and the number of epochs
59+
* Number of workers in the dataloader
60+
* Existence of random flip augmentations
61+
62+
Demonstrate that your integration works by running at least three *complete* runs (fewer than 100 epochs is OK)
63+
with hyperparameters changed via the config file.
64+
From these runs, it should be evident that changing hyperparameters affects the training procedure.
65+
Here, you should log the config using [run.log_artifact](https://docs.wandb.ai/ref/python/run#log_artifact)
66+
and show that this changes the hyperparameters of the run in W&B.
67+
68+
# Task 4 (2 points)
69+
Make the pipeline reproducible using [Data Version Control](https://dvc.org/).
70+
You should end up with a `dvc.yaml` that represents two stages of your experiment with corresponding inputs and outputs:
71+
getting the data (yes, you need to refactor that part of the code) and training the model itself.
72+
Also, you should specify the relevant code and configuration as dependencies of the corresponding pipeline stages.
73+
Lastly, after running your code, you should have a `dvc.lock` that stores hashes of all artifacts in your pipeline.
74+
Submit both `dvc.yaml` and `dvc.lock` as parts of your solution.
75+
76+
Importantly, modifying any of the relevant modules or hyperparameters should trigger an invalidation of the
77+
corresponding pipeline stages: that is, `dvc repro` should do nothing if and only if `dvc.lock` is consistent with
78+
hashes of all dependencies in the pipeline.
79+
80+
If you have also done the Hydra configuration assignment, make sure to check out [this guide](https://dvc.org/doc/user-guide/experiment-management/hydra-composition)
81+
on integrating Hydra with DVC experiment management.
82+
83+
# Submission format
84+
When submitting this assignment, you should attach a .zip archive that contains:
85+
- The source code with all your fixes and improvements
86+
- A Markdown/PDF report in the root of the project folder that:
87+
1. Details the changes you made to the original code (we will run `diff` and see if everything is explained)
88+
2. Tells how to run the modified code (i.e., which command line arguments you have added and how to use them)
89+
3. Describes your process of fixing and adding new tests for Task 1 and reports the test coverage
90+
4. Gives a link to the Weights and Biases project with all necessary logs for tasks 2 and 3
91+
- If you solved Tasks 3 or 4, please ensure that the archived project contains the corresponding configuration/lock files as well.
92+
- An updated `requirements.txt` file, if your solution requires new dependencies such as `wandb`, `hydra-core` or `dvc`.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import torch
2+
from torch.utils.data import DataLoader
3+
from torchvision import transforms
4+
from torchvision.datasets import CIFAR10
5+
6+
from modeling.diffusion import DiffusionModel
7+
from modeling.training import generate_samples, train_epoch
8+
from modeling.unet import UnetModel
9+
10+
11+
def main(device: str, num_epochs: int = 100):
    """Train the diffusion model on CIFAR-10, saving samples after each epoch.

    Args:
        device: Device string that the model is moved to and that is passed
            to the training and sampling helpers.
        num_epochs: Number of passes over the training set.
    """
    eps_model = UnetModel(3, 3, hidden_size=128)
    ddpm = DiffusionModel(eps_model=eps_model, betas=(1e-4, 0.02), num_timesteps=1000)
    ddpm.to(device)

    channel_mean = (0.5, 0.5, 0.5)
    channel_std = (0.5, 0.5, 0.5)
    train_transforms = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(channel_mean, channel_std)]
    )

    dataset = CIFAR10("cifar10", train=True, download=True, transform=train_transforms)
    dataloader = DataLoader(dataset, batch_size=128, num_workers=4, shuffle=True)

    optim = torch.optim.Adam(ddpm.parameters(), lr=1e-5)

    # NOTE(review): nothing here creates the "samples" directory; confirm that
    # generate_samples does, or that the directory exists before running.
    for epoch_idx in range(num_epochs):
        train_epoch(ddpm, dataloader, optim, device)
        generate_samples(ddpm, device, f"samples/{epoch_idx:02d}.png")
36+
37+
38+
if __name__ == "__main__":
    # Use the GPU when one is visible to PyTorch, otherwise train on CPU.
    if torch.cuda.is_available():
        main(device="cuda")
    else:
        main(device="cpu")

week02_management_and_testing/homework/modeling/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)