Commit 75030c2
Add history to the checkpoint state
1 parent ec20447 commit 75030c2

6 files changed: 139 additions, 61 deletions

wirecell/dnn/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+from . import data, models, io, apps
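
This one-line __init__ is what lets the reworked CLI below reach the submodules as attributes after a single top-level import. A minimal check, assuming the wirecell package is installed:

    from wirecell import dnn
    print(dnn.io, dnn.apps, dnn.models, dnn.data)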

wirecell/dnn/__main__.py

Lines changed: 99 additions & 35 deletions

@@ -7,6 +7,9 @@
 from wirecell.util.paths import unglob, listify
 
 
+from wirecell import dnn
+
+
 @context("dnn")
 def cli(ctx):
     '''
@@ -41,68 +44,130 @@ def cli(ctx):
               help="File name providing the initial model state dict (def=None - construct fresh)")
 @click.option("-s", "--save", default=None,
               help="File name to save model state dict after training (def=None - results not saved)")
-@click.argument("files", nargs=-1)
+@click.option("--eval-files", multiple=True, type=str, # fixme: remove this in favor of a single file set and a train/eval partitioning
+              help="File path or globs as comma separated list to use for evaluation dataset")
+@click.argument("train_files", nargs=-1)
 @click.pass_context
 def train(ctx, config, epochs, batch, device, cache, debug_torch,
           checkpoint_save, checkpoint_modulus,
-          name, load, save, files):
+          name, load, save, eval_files, train_files):
     '''
     Train a model.
     '''
-    if not files:
-        raise click.BadArgumentUsage("no files given")
+    if not train_files:
+        raise click.BadArgumentUsage("no training files given")
+    train_files = unglob(listify(train_files))
+    log.info(f'training files: {train_files}')
 
     if device == 'gpu': device = 'cuda'
     log.info(f'using device {device}')
 
     if debug_torch:
         torch.autograd.set_detect_anomaly(True)
 
-    # fixme: make choice of dataset optional
-    import wirecell.dnn.apps
-    from wirecell.dnn import io
-
-    app = getattr(wirecell.dnn.apps, name)
+    app = getattr(dnn.apps, name)
 
     net = app.Network()
     opt = app.Optimizer(net.parameters())
+    crit = app.Criterion()
+    trainer = app.Trainer(net, opt, crit, device=device)
 
-    par = dict(epoch=0, loss=0)
-
+    history = dict()
     if load:
         if not Path(load).exists():
             raise click.FileError(load, 'warning: DNN module load file does not exist')
-        par = io.load_checkpoint(load, net, opt)
-
-    tot_epoch = par["epoch"]
-    del par
+        history = dnn.io.load_checkpoint(load, net, opt)
 
-    ds = app.Dataset(files, cache=cache)
-    nsamples = len(ds)
-    if nsamples == 0:
-        raise click.BadArgumentUsage(f'no samples from {len(files)} files')
+    train_ds = app.Dataset(train_files, cache=cache)
+    ntrain = len(train_ds)
+    if ntrain == 0:
+        raise click.BadArgumentUsage(f'no samples from {len(train_files)} files')
 
     from torch.utils.data import DataLoader
-    dl = DataLoader(ds, batch_size=batch, shuffle=True, pin_memory=True)
+    train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True, pin_memory=True)
 
-    trainer = app.Trainer(net, device=device)
-
-    checkpoint=2 # fixme make configurable
-    for epoch in range(epochs):
-        losslist = trainer.epoch(dl)
-        loss = sum(losslist)
-        log.debug(f'epoch {tot_epoch} loss {loss}')
+    neval = 0
+    eval_dl = None
+    if eval_files:
+        eval_files = unglob(listify(eval_files, delim=","))
+        log.info(f'eval files: {eval_files}')
+        eval_ds = app.Dataset(eval_files, cache=cache)
+        neval = len(eval_ds)
eval_dl = DataLoader(train_ds, batch_size=batch, shuffle=False, pin_memory=True)
97+
else:
98+
log.info("no eval files")
99+
100+
# History
101+
run_history = history.get("runs", dict())
102+
this_run_number = 0
103+
if run_history:
104+
this_run_number = max(run_history.keys()) + 1
105+
this_run = dict(
106+
run = this_run_number,
107+
train_files = train_files,
108+
ntrain = ntrain,
109+
eval_files = eval_files or [],
110+
neval = neval,
111+
nepochs = epochs,
112+
batch = batch,
113+
device = device,
114+
cache = cache,
115+
name = name,
116+
load = load,
117+
)
118+
run_history[this_run_number] = this_run
119+
120+
epoch_history = history.get("epochs", dict())
121+
first_epoch_number = 0
122+
if epoch_history:
123+
first_epoch_number = max(epoch_history.keys()) + 1
124+
125+
def saveit(path):
126+
if not path:
127+
return
128+
dnn.io.save_checkpoint(path, net, opt, runs=run_history, epochs=epoch_history)
129+
130+
for this_epoch_number in range(first_epoch_number, first_epoch_number + epochs):
131+
train_losses = trainer.epoch(train_dl)
132+
train_loss = sum(train_losses)/ntrain
133+
134+
eval_losses = []
135+
eval_loss = 0
136+
if eval_dl:
137+
eval_losses = trainer.evaluate(eval_dl)
138+
eval_loss = sum(eval_losses) / neval
139+
140+
this_epoch = dict(
141+
run=this_run_number,
142+
epoch=this_epoch_number,
143+
train_losses=train_losses,
144+
train_loss=train_loss,
145+
eval_losses=eval_losses,
146+
eval_loss=eval_loss)
147+
epoch_history[this_epoch_number] = this_epoch
148+
149+
log.info(f'run: {this_run_number} epoch: {this_epoch_number} loss: {train_loss} eval: {eval_loss}')
95150

96151
if checkpoint_save:
97-
if tot_epoch%checkpoint_modulus == 0:
98-
cpath = checkpoint_save.format(epoch=tot_epoch)
99-
io.save_checkpoint(cpath, net, opt,
100-
epoch=tot_epoch, loss=loss)
101-
tot_epoch += 1
152+
if this_epoch_number % checkpoint_modulus == 0:
153+
parms = dict(this_run, **this_epoch)
154+
cpath = checkpoint_save.format(**parms)
155+
saveit(cpath)
156+
saveit(save)
102157

103-
if save:
104-
io.save_checkpoint(save, net, opt, epoch=tot_epoch, loss=loss)
105158

+@cli.command('dump')
+@click.argument("checkpoint")
+@click.pass_context
+def dump(ctx, checkpoint):
+    '''
+    Dump info about a checkpoint file.
+    '''
+    state = dnn.io.load_checkpoint_raw(checkpoint)
+    for rnum, robj in state.get("runs",{}).items():
+        print('run: {run} ntrain: {ntrain} neval: {neval}'.format(**robj))
+    for enum, eobj in state.get("epochs",{}).items():
+        print('run: {run} epoch: {epoch} train: {train_loss} eval: {eval_loss}'.format(**eobj))
 
 @cli.command('extract')
 @click.option("-o", "--output", default='samples.npz',
@@ -120,7 +185,6 @@ def extract(ctx, output, sample, datapaths):
     samples = map(int,listify(*sample, delim=","))
 
     # fixme: make choice of dataset optional
-    from wirecell.dnn.apps import dnnroi as app
     ds = app.Dataset(datapaths)
 
     log.info(f'dataset has {len(ds)} entries from {len(datapaths)} data paths')
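
A checkpoint now carries two integer-keyed history tables: "runs", with one record per train invocation, and "epochs", with one record per epoch accumulated across invocations. Every key of the merged run and epoch records is available to the --checkpoint-save template. A sketch of the expansion; the template file name here is illustrative, not from the commit:

    # Mimic how train() names periodic checkpoints: merge the run record
    # and the epoch record into one namespace for str.format().
    this_run = dict(run=3, ntrain=1000, neval=100, batch=4, device="cpu")
    this_epoch = dict(run=3, epoch=42, train_loss=0.12, eval_loss=0.34)
    parms = dict(this_run, **this_epoch)
    print("check-r{run}-e{epoch}.pt".format(**parms))  # check-r3-e42.pt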

wirecell/dnn/apps/dnnroi/__init__.py

Lines changed: 4 additions & 3 deletions

@@ -1,12 +1,13 @@
 #!/usr/bin/env python
+from torch import optim
 
-
+## The "app" API
 from .model import Network
 from .data import Dataset
-from .train import Classifier as Trainer
+from wirecell.dnn.train import Classifier as Trainer
+from torch.nn import BCELoss as Criterion
 
 
-from torch import optim
 def Optimizer(params):
     return optim.SGD(params, lr=0.1, momentum=0.9, weight_decay=0.0005)
 
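The "app" API comment names the contract the train command now leans on: a module under wirecell.dnn.apps must expose Network, Dataset, Trainer, Criterion and Optimizer. The sketch below restates the wiring from __main__.py above, using the dnnroi app:

    from wirecell import dnn

    app = getattr(dnn.apps, "dnnroi")  # chosen by the train command's name option
    net = app.Network()
    opt = app.Optimizer(net.parameters())
    crit = app.Criterion()
    trainer = app.Trainer(net, opt, crit, device="cpu")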
wirecell/dnn/apps/dnnroi/train.py

Lines changed: 0 additions & 9 deletions
This file was deleted.

wirecell/dnn/io.py

Lines changed: 6 additions & 1 deletion

@@ -20,14 +20,19 @@ def save_checkpoint(path, model, optimizer, **kwds):
     torch.save(kwds, path)
 
 
+def load_checkpoint_raw(path):
+    return torch.load(path, weights_only=True)
+
+
 def load_checkpoint(path, model, optimizer):
     '''
     Load a checkpoint.
 
     The model and optimizer state dicts are updated and a dict of any additional
     parameters is returned.
     '''
-    cp = torch.load(path, weights_only=True)
+    cp = load_checkpoint_raw(path)
     model.load_state_dict(cp.pop("model_state_dict"))
     optimizer.load_state_dict(cp.pop("optimizer_state_dict"))
     return cp
+
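
load_checkpoint_raw() exposes the saved dict without needing model or optimizer instances, which is what the new dump command uses to print the history tables. A short sketch, with "ckpt.pt" a placeholder path:

    from wirecell.dnn import io

    state = io.load_checkpoint_raw("ckpt.pt")
    for enum, eobj in state.get("epochs", {}).items():
        print("run: {run} epoch: {epoch} train: {train_loss} eval: {eval_loss}".format(**eobj))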

wirecell/dnn/train.py

Lines changed: 29 additions & 13 deletions

@@ -19,21 +19,45 @@
     - optimizer.step()
 
 '''
-from torch import optim
+from torch import optim, no_grad
 import torch.nn as nn
 
 def dump(name, data):
     # print(f'{name:20s}: {data.shape} {data.dtype} {data.device}')
     return
 
 class Classifier:
-    def __init__(self, net, device='cpu', optclass = optim.SGD, **optkwds):
+    def __init__(self, net, optimizer, criterion = nn.BCELoss(), device='cpu'):
         net.to(device)
         self._device = device
         self.net = net # model
-        self.optimizer = optclass(net.parameters(), **optkwds)
+        self.optimizer = optimizer
+        self.criterion = criterion
 
-    def epoch(self, data, criterion=nn.BCELoss(), retain_graph=False):
+    def loss(self, features, labels):
+
+        features = features.to(self._device)
+        dump('features', features)
+        labels = labels.to(self._device)
+        dump('labels', labels)
+
+        prediction = self.net(features)
+        dump('prediction', prediction)
+
+        loss = self.criterion(prediction, labels)
+        return loss
+
+    def evaluate(self, data):
+        losses = list()
+        with no_grad():
+            for features, labels in data:
+                loss = self.loss(features, labels)
+                loss = loss.item()
+                losses.append(loss)
+        return losses
+
+
+    def epoch(self, data, retain_graph=False):
         '''
         Train over the batches of the data, return list of losses at each batch.
         '''
@@ -42,15 +66,7 @@ def epoch(self, data, criterion=nn.BCELoss(), retain_graph=False):
         epoch_losses = list()
         for features, labels in data:
 
-            features = features.to(self._device)
-            dump('features', features)
-            labels = labels.to(self._device)
-            dump('labels', labels)
-
-            prediction = self.net(features)
-            dump('prediction', prediction)
-
-            loss = criterion(prediction, labels)
+            loss = self.loss(features, labels)
 
             loss.backward(retain_graph=retain_graph)
             self.optimizer.step()
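
The refactored Classifier takes its optimizer and criterion from the caller, factors the per-batch forward pass into loss(), and shares it between the training epoch() and the new gradient-free evaluate(). A self-contained toy example of driving it directly; the network and data are invented for illustration:

    import torch
    from torch import nn, optim
    from torch.utils.data import DataLoader, TensorDataset
    from wirecell.dnn.train import Classifier

    # Toy binary classifier and random data shaped for nn.BCELoss.
    net = nn.Sequential(nn.Linear(4, 1), nn.Sigmoid())
    ds = TensorDataset(torch.rand(16, 4), torch.rand(16, 1).round())
    dl = DataLoader(ds, batch_size=4)

    trainer = Classifier(net, optim.SGD(net.parameters(), lr=0.1), nn.BCELoss())
    train_losses = trainer.epoch(dl)    # one pass with backprop and steps
    eval_losses = trainer.evaluate(dl)  # per-batch losses under no_grad
    print(sum(eval_losses) / len(ds))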

0 commit comments