
Commit 4af4195

refactor: fix MyPy type issues, refactor data loader and clean up debugging code
1 parent 5c2466a commit 4af4195

File tree

6 files changed (+41, -123 lines)

examples/multi_gpu/gather.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

flair/distributed_utils.py

Lines changed: 1 addition & 4 deletions
@@ -2,17 +2,15 @@
 import os
 import random
 from multiprocessing.connection import Connection
-from typing import Any, Callable, Collection, Iterable, TypeVar
+from typing import Callable, Collection, Iterable, TypeVar

-import numpy as np
 import torch
 import torch.multiprocessing as mp
 from torch.distributed import destroy_process_group, init_process_group
 from torch.utils.data import Dataset

 import flair
 from flair.data import Corpus, _len_dataset
-from flair.training_utils import print_execution_time

 log = logging.getLogger("flair")

@@ -64,7 +62,6 @@ def is_main_process() -> bool:
     return True


-@print_execution_time
 def validate_corpus_same_each_process(corpus: Corpus) -> None:
     """Catches most cases in which a corpus is not the same on each process. However, there is no guarantee for two
     reasons: 1) It uses a sample for speed 2) It compares strings to avoid requiring the datasets to be serializable

flair/models/pairwise_regression_model.py

Lines changed: 9 additions & 15 deletions
@@ -299,18 +299,12 @@ def evaluate(
         if not isinstance(data_points, Dataset):
             data_points = FlairDatapointDataset(data_points)

-        if multi_gpu:
-            distributed_sampler: DistributedSampler = DistributedSampler(
-                data_points, shuffle=False
-            )
-            data_loader = DataLoader(
-                data_points,
-                batch_size=mini_batch_size,
-                shuffle=False,
-                sampler=distributed_sampler,
-            )
-        else:
-            data_loader = DataLoader(data_points, batch_size=mini_batch_size)
+        data_loader = DataLoader(
+            data_points,
+            batch_size=mini_batch_size,
+            shuffle=False,
+            sampler=DistributedSampler(data_points, shuffle=False) if multi_gpu else None,
+        )

         with torch.no_grad():
             eval_loss = torch.zeros(1, device=flair.device)
@@ -327,15 +321,15 @@ def evaluate(
                 if isinstance(batch, Sentence):
                     batch = [batch]

-                loss, num, scores = self._forward_loss_and_scores(batch, return_scores=True)
+                loss, num, scores_forward = self._forward_loss_and_scores(batch, return_scores=True)

                 true_values = []
                 for sentence in batch:
                     total_count += 1
                     for label in sentence.get_labels(gold_label_type):
                         true_values.append(float(label.value))

-                results = scores.cpu().tolist()
+                results = scores_forward.cpu().tolist()

                 eval_loss += loss

@@ -389,7 +383,7 @@ def evaluate(
            )

         else:  # if it's not the main process, just set a dummy Result
-            result = Result(0., "", {}, {'loss': 0.0})
+            result = Result(0.0, "", {}, {"loss": 0.0})

         if multi_gpu:
             result = broadcast_value(result, src=0)
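
The refactor works because DataLoader treats sampler=None as "use the default sampler", so the single-GPU and multi-GPU branches collapse into one call. A minimal standalone sketch of the pattern (the function name is illustrative, not flair's; DistributedSampler additionally requires an initialized process group when multi_gpu is true):

from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler

def make_eval_loader(data: Dataset, batch_size: int, multi_gpu: bool) -> DataLoader:
    # sampler=None falls back to DataLoader's default SequentialSampler;
    # DistributedSampler(shuffle=False) keeps evaluation order deterministic
    # while partitioning the dataset across ranks.
    return DataLoader(
        data,
        batch_size=batch_size,
        shuffle=False,
        sampler=DistributedSampler(data, shuffle=False) if multi_gpu else None,
    )

One caveat: DistributedSampler pads the dataset so every rank receives the same number of samples, which can evaluate a few datapoints twice unless results are deduplicated downstream.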

flair/models/text_regression_model.py

Lines changed: 9 additions & 15 deletions
@@ -152,18 +152,12 @@ def evaluate(
         if not isinstance(data_points, Dataset):
             data_points = FlairDatapointDataset(data_points)

-        if multi_gpu:
-            distributed_sampler: DistributedSampler = DistributedSampler(
-                data_points, shuffle=False
-            )
-            data_loader = DataLoader(
-                data_points,
-                batch_size=mini_batch_size,
-                shuffle=False,
-                sampler=distributed_sampler,
-            )
-        else:
-            data_loader = DataLoader(data_points, batch_size=mini_batch_size)
+        data_loader = DataLoader(
+            data_points,
+            batch_size=mini_batch_size,
+            shuffle=False,
+            sampler=DistributedSampler(data_points, shuffle=False) if multi_gpu else None,
+        )

         with torch.no_grad():
             eval_loss = torch.zeros(1, device=flair.device)
@@ -176,15 +170,15 @@ def evaluate(
                 if isinstance(batch, Sentence):
                     batch = [batch]

-                scores, loss = self.forward_labels_and_loss(batch)
+                scores_forward, loss = self.forward_labels_and_loss(batch)

                 true_values = []
                 for sentence in batch:
                     total_count += 1
                     for label in sentence.get_labels(gold_label_type):
                         true_values.append(float(label.value))

-                results = scores[:, 0].cpu().tolist()
+                results = scores_forward[:, 0].cpu().tolist()

                 eval_loss += loss

@@ -239,7 +233,7 @@ def evaluate(
            )

         else:  # if it's not the main process, just set a dummy Result
-            result = Result(0., "", {}, {'loss': 0.0})
+            result = Result(0.0, "", {}, {"loss": 0.0})

         if multi_gpu:
             result = broadcast_value(result, src=0)
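
The scores → scores_forward rename here and in the file above is presumably what clears the MyPy complaints named in the commit message: without --allow-redefinition, MyPy pins a variable to the first type assigned to it, so reusing scores for a value of another type elsewhere in evaluate gets flagged. A minimal, torch-free sketch of that error class (hypothetical names):

def compute_scores() -> float:
    return 0.5

scores = compute_scores()          # mypy infers: float
# scores = [scores]                # error: Incompatible types in assignment
#                                  # (expression has type "list[float]", variable has type "float")

scores_forward = compute_scores()  # distinct names keep distinct types
results = [scores_forward]         # both assignments now type-check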

flair/nn/model.py

Lines changed: 21 additions & 45 deletions
@@ -5,7 +5,6 @@
 from abc import ABC, abstractmethod
 from collections import Counter
 from pathlib import Path
-from time import time
 from typing import Any, Optional, Union

 import torch.nn
@@ -19,8 +18,14 @@
 from flair.class_utils import get_non_abstract_subclasses
 from flair.data import DT, DT2, Corpus, Dictionary, Sentence, _iter_dataset
 from flair.datasets import DataLoader, FlairDatapointDataset
-from flair.distributed_utils import aggregate, aggregate_tensor_sum, broadcast_value, flatten_dicts, is_main_process, \
-    merge_sets
+from flair.distributed_utils import (
+    aggregate,
+    aggregate_tensor_sum,
+    broadcast_value,
+    flatten_dicts,
+    is_main_process,
+    merge_sets,
+)
 from flair.embeddings import Embeddings
 from flair.embeddings.base import load_embeddings
 from flair.file_utils import Tqdm, load_torch_state
@@ -274,8 +279,6 @@ def evaluate(
         multi_gpu: bool = False,
         **kwargs,
     ) -> Result:
-        t0 = time()
-        print('running custom evaluate..')
         exclude_labels = exclude_labels if exclude_labels is not None else []

         import numpy as np
@@ -302,25 +305,15 @@ def evaluate(
         all_true_values = {}
         all_predicted_values = {}

-        if multi_gpu:
-            distributed_sampler: DistributedSampler = DistributedSampler(
-                data_points, shuffle=False
-            )
-            loader = DataLoader(
-                data_points,
-                batch_size=mini_batch_size,
-                shuffle=False,
-                sampler=distributed_sampler,
-            )
-            rank = torch.distributed.get_rank()
-            print('rank =', rank)
-        else:
-            loader = DataLoader(data_points, batch_size=mini_batch_size)
-            rank = 0
+        loader = DataLoader(
+            data_points,
+            batch_size=mini_batch_size,
+            shuffle=False,
+            sampler=DistributedSampler(data_points, shuffle=False) if multi_gpu else None,
+        )
+        rank = torch.distributed.get_rank() if multi_gpu else 0

         sentence_id = 0
-        t1 = time()
-        print('time1', t1 - t0)
         for batch in Tqdm.tqdm(loader, disable=not is_main_process()):
             # remove any previously predicted labels
             for datapoint in batch:
@@ -381,19 +374,14 @@ def evaluate(
             if out_path:
                 lines.extend(self._print_predictions(batch, gold_label_type))

-        t2 = time()
-        print('time2', t2 - t1)
-        print('eval losssss', type(eval_loss), eval_loss)
         if multi_gpu:
             all_spans = aggregate(all_spans, merge_sets)
             all_true_values = aggregate(all_true_values, flatten_dicts)
             all_predicted_values = aggregate(all_predicted_values, flatten_dicts)
             average_over = aggregate(average_over, sum)
             eval_loss = aggregate(eval_loss, aggregate_tensor_sum)
-            print('eval loss =', eval_loss)
-            print('len all', len(all_spans), len(all_true_values), len(all_predicted_values), sep='\t')

-        result = Result(0., "", {}, {'loss': 0.0})
+        result = Result(0.0, "", {}, {"loss": 0.0})
         if is_main_process():

             # convert true and predicted values to two span-aligned lists
@@ -475,10 +463,8 @@ def evaluate(
                 target_names.append(label_name)
                 labels.append(evaluation_label_dictionary.get_idx_for_item(label_name))

-            #print(f"{len(data_points)}\t{len(y_true_save)}\n{len(y_true)}\t{len(y_pred)}\t{len(target_names)}\t{len(labels)}")
-
             # there is at least one gold label or one prediction (default)
-            if len(all_true_values) + len(all_predicted_values) > 1:
+            if is_main_process() and len(all_true_values) + len(all_predicted_values) > 1:
                 classification_report = sklearn.metrics.classification_report(
                     y_true,
                     y_pred,
@@ -512,9 +498,9 @@ def evaluate(
                     if metric_key != "support":
                         classification_report_dict["micro avg"][metric_key] = classification_report_dict["accuracy"]
                     else:
-                        classification_report_dict["micro avg"][metric_key] = classification_report_dict["macro avg"][
-                            "support"
-                        ]
+                        classification_report_dict["micro avg"][metric_key] = classification_report_dict[
+                            "macro avg"
+                        ]["support"]

                 detailed_result = (
                     "\nResults:"
@@ -536,14 +522,7 @@ def evaluate(
             if average_over > 0:
                 eval_loss /= average_over
             scores["loss"] = eval_loss.item()
-            print('scores', scores)
-
-            print('classification report')
-            print(classification_report_dict['micro avg'])

-            t3 = time()
-            print('time3', t3 - t2)
-            print('total time', t3 - t0)
             result = Result(
                 main_score=classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]],
                 detailed_results=detailed_result,
@@ -559,7 +538,7 @@ def evaluate(
                 f"- And no predictions were made!\n"
                 "Double check your corpus (if the test split has labels), and how you initialize the ModelTrainer!"
             )
-
+
             result = Result(
                 main_score=0.0,
                 detailed_results=error_text,
@@ -572,9 +551,6 @@ def evaluate(

         return result

-        # final_value
-        # return final_value
-
     @abstractmethod
     def predict(
         self,
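
The dummy Result(0.0, "", {}, {"loss": 0.0}) assigned on non-main ranks keeps evaluate's control flow symmetric: every process builds a Result, rank 0 builds the real one, and broadcast_value overwrites the dummies. Conceptually, a helper of this shape can be built on torch.distributed.broadcast_object_list (a sketch of the idea, not flair's actual implementation):

import torch.distributed as dist

def broadcast_value(value, src: int = 0):
    # Every rank passes a one-element list; broadcast_object_list replaces
    # the entries on all ranks with the copy held by `src`.
    holder = [value]
    dist.broadcast_object_list(holder, src=src)
    return holder[0]

The symmetry matters because collective operations block until every rank participates; if only the main process reached the broadcast, the job would hang.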

flair/training_utils.py

Lines changed: 1 addition & 23 deletions
@@ -1,14 +1,12 @@
-import functools
 import logging
 import pathlib
 import random
-import time
 from collections import defaultdict
 from enum import Enum
 from functools import reduce
 from math import inf
 from pathlib import Path
-from typing import Callable, Literal, NamedTuple, Optional, Union
+from typing import Literal, NamedTuple, Optional, Union

 from numpy import ndarray
 from scipy.stats import pearsonr, spearmanr
@@ -516,23 +514,3 @@ def create_labeled_sentence_from_entity_offsets(
     token_entities = [entity for entity in token_entities if entity.end_token_idx <= token_limit]

     return create_labeled_sentence_from_tokens(tokens, token_entities)
-
-
-def print_execution_time(func: Callable) -> Callable:
-    """
-    Decorator that prints the execution time of the decorated function.
-
-    :param func: The function to be decorated.
-    :return: The wrapped function with execution time printing.
-    """
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        start_time = time.perf_counter()  # Start the timer
-        result = func(*args, **kwargs)  # Execute the function
-        end_time = time.perf_counter()  # End the timer
-        elapsed_time = end_time - start_time
-        print(f"Function '{func.__name__}' executed in {elapsed_time:.4f} seconds.")
-        return result
-
-    return wrapper
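
With print_execution_time deleted, ad-hoc timing no longer lives in library code. Should similar measurements be needed again during debugging, a context manager is a less invasive option than decorating library functions; a minimal sketch (not part of this commit, names are illustrative):

import time
from contextlib import contextmanager

@contextmanager
def timed(label: str):
    # Print the wall-clock duration of any block, without touching the
    # functions being measured.
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f"{label} took {time.perf_counter() - start:.4f}s")

# usage:
# with timed("corpus validation"):
#     validate_corpus_same_each_process(corpus)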
