why norm free nets becoming numerically unstable faster sometimes? #527

mobassir94 · 2021-03-28T07:20:49Z

mobassir94
Mar 28, 2021

i am trying the code below for vinbig kaggle challenge :

!pip install timm
import pandas as pd
import numpy as np

import time
from datetime import datetime
from datetime import timedelta

import os
import random
import copy

import cv2

from tqdm import tqdm
import timm

import sklearn
import warnings
import joblib
from glob import glob

from skimage import io
from sklearn import metrics
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import GroupKFold, StratifiedKFold, ShuffleSplit

import pydicom

from scipy.ndimage.interpolation import zoom
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F

from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.cuda.amp import autocast, GradScaler
from torch.nn.modules.loss import _WeightedLoss

import torchvision
from torchvision import transforms


""" Adaptive Gradient Clipping

An impl of AGC, as per (https://arxiv.org/abs/2102.06171):

@article{brock2021high,
  author={Andrew Brock and Soham De and Samuel L. Smith and Karen Simonyan},
  title={High-Performance Large-Scale Image Recognition Without Normalization},
  journal={arXiv preprint arXiv:},
  year={2021}
}

Code references:
  * Official JAX impl (paper authors): https://github.com/deepmind/deepmind-research/tree/master/nfnets
  * Phil Wang's PyTorch gist: https://gist.github.com/lucidrains/0d6560077edac419ab5d3aa29e674d5c

Hacked together by / Copyright 2021 Ross Wightman
"""
import torch


def unitwise_norm(x, norm_type=2.0):
    if x.ndim <= 1:
        return x.norm(norm_type)
    else:
        # works for nn.ConvNd and nn,Linear where output dim is first in the kernel/weight tensor
        # might need special cases for other weights (possibly MHA) where this may not be true
        return x.norm(norm_type, dim=tuple(range(1, x.ndim)), keepdim=True)


def adaptive_clip_grad(parameters, clip_factor=0.01, eps=1e-3, norm_type=2.0):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    for p in parameters:
        if p.grad is None:
            continue
        p_data = p.detach()
        g_data = p.grad.detach()
        max_norm = unitwise_norm(p_data, norm_type=norm_type).clamp_(min=eps).mul_(clip_factor)
        grad_norm = unitwise_norm(g_data, norm_type=norm_type)
        clipped_grad = g_data * (max_norm / grad_norm.clamp(min=1e-6))
        new_grads = torch.where(grad_norm < max_norm, g_data, clipped_grad)
        p.grad.detach().copy_(new_grads)


from dataclasses import dataclass, field

@dataclass
class config:
    n_fold = 10
    train_fold = [0]
    seed = 2568
    notebook_name = 'dm_nfnet_f0' #nf_resnet50 
    model_name = 'dm_nfnet_f0'
    image_size = 512  
    epochs = 10
    tta = 7
    train_batch_size = 8
    valid_batch_size = 8
    valid_batch_size_tta = 8

    num_workers = 4
    device = 'cuda:0'
    
    debug = False
    
    train_csv = '../input/vinbigdata-chest-xray-abnormalities-detection/train.csv'
    train_image_path = '../input/vinbigdata-image/train_768'
    model_weight_path =  './'


train = pd.read_csv( config.train_csv )


train.class_id.value_counts()

train_normal = train.groupby("image_id")["class_id"].agg(lambda s: (s == 14).sum()).reset_index().rename({"class_id": "num_normal_annotations"}, axis=1)

train.query("image_id == '000434271f63a053c4128a0ba6352c7f'")

num_normal_anno_counts = train_normal["num_normal_annotations"].value_counts()
num_normal_anno_counts.plot(kind="bar")
plt.title("The number of 'No finding' annotations in each image")


train_normal['label'] = 0
train_normal.loc[train_normal['num_normal_annotations'] > 0, 'label'] = 1

train_normal.label.value_counts()

if config.debug == True:
    train_normal = train_normal.sample(n=500, random_state=0)
    train_normal.reset_index(drop=True, inplace=True)

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

def get_img( path, image_id ):
    path_name = os.path.join( path, image_id + '.jpg' )
    image = cv2.imread(path_name)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    #image = cv2.cvtColor( image, cv2.COLOR_BGR2RGB )
    
    return image


def rand_bbox(size, lam):
    W = size[0]
    H = size[1]
    cut_rat = np.sqrt(1. - lam)
    cut_w = np.int(W * cut_rat)
    cut_h = np.int(H * cut_rat)

    # uniform
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2


class VinBigDataTwoClassDataset(Dataset):
    def __init__(self, df, 
                 image_path,
                 transforms=None, 
                 output_label=True, 
                 do_cutmix=False,
                 cutmix_params={
                     'alpha': 1,
                 },
                 do_mixup=False
                ):
        
        super().__init__()
        self.df = df.reset_index(drop=True).copy()
        self.image_path = image_path
        self.transforms = transforms
        self.do_cutmix = do_cutmix
        self.cutmix_params = cutmix_params
        self.do_mixup = do_mixup
        
        self.output_label = output_label
        
        if output_label == True:
            self.labels = self.df['label'].values
            

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx: int):
        
        # get labels
        if self.output_label:
            target = self.labels[idx]
          
        img = get_img( self.image_path, self.df.loc[idx]['image_id'] )
        img = np.stack([img, img, img])
        img = img.transpose(1,2,0)

        if self.transforms:
            img = self.transforms(image=img)['image']
        
        
        if self.do_cutmix and np.random.uniform(0., 1., size=1)[0] < 0.6:
            #print(img.sum(), img.shape)
            with torch.no_grad():
                cmix_ix = np.random.choice(self.df.index, size=1)[0]
                cmix_img = get_img( self.df.loc[cmix_ix]['path'] )
                
                if self.transforms:
                    cmix_img = self.transforms(image=cmix_img)['image']
                    
                lam = np.clip(np.random.beta(self.cutmix_params['alpha'], self.cutmix_params['alpha']),0.3,0.4)
                bbx1, bby1, bbx2, bby2 = rand_bbox((CFG['image_size'], CFG['image_size']), lam)

                img[:, bbx1:bbx2, bby1:bby2] = cmix_img[:, bbx1:bbx2, bby1:bby2]

                rate = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (CFG['image_size'] * CFG['image_size']))
                target = rate*target + (1.-rate)*self.labels[cmix_ix]
                      
        
        elif self.do_mixup and np.random.uniform(0., 1., size=1)[0] < 0.5:
                mixup_ix = np.random.choice(self.df.index, size=1)[0]
                mixup_img = get_img( self.df.loc[mixup_ix]['path'] )
                
                if self.transforms:
                    mixup_img = self.transforms(image=mixup_img)['image']
                    
                rate = np.clip(np.random.beta(1, 1), 0.6, 0.7 )
                img = rate*img + (1.-rate)*mixup_img
                
                target = rate*target + (1.-rate)*self.labels[mixup_ix]      
            
        #img = (img - img.min())/(img.max() - img.min())
        
        if self.output_label == True:
            return img, target
        else:
            return img


from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop, JpegCompression,
    RandomBrightness, RandomContrast, GaussianBlur,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)

from albumentations.pytorch import ToTensorV2

def get_train_transforms():
    return Compose([
            #RandomResizedCrop(CFG['image_size'], CFG['image_size']),
            #Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            #VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5, shift_limit=0.0625, scale_limit=0.1, rotate_limit=10),
        
            #OneOf([RandomBrightness(limit=0.1, p=1), RandomContrast(limit=0.1, p=1)]),
            #OneOf([MotionBlur(), MedianBlur(), GaussianBlur()], p=0.5),
        
            #HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
            RandomBrightnessContrast(brightness_limit=(-0.1,0.1), contrast_limit=(-0.1, 0.1), p=0.5),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
            CoarseDropout(p=0.2),
            Cutout(p=0.5),
            ToTensorV2(p=1.0),
        ], p=1.)
        
def get_valid_transforms():
    return Compose([
            #CenterCrop(CFG['image_size'], CFG['image_size'], p=1.),
            #Resize(CFG['image_size'], CFG['image_size']),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
            ToTensorV2(p=1.0),
        ], p=1.)


def get_inference_transforms():
    return Compose([
            #RandomResizedCrop(CFG['image_size'], CFG['image_size']),
            #Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            #VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5, shift_limit=0.0625, scale_limit=0.1, rotate_limit=10),
        
            #OneOf([RandomBrightness(limit=0.1, p=1), RandomContrast(limit=0.1, p=1)]),
            #OneOf([MotionBlur(), MedianBlur(), GaussianBlur()], p=0.5),
        
            #HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5),
            RandomBrightnessContrast(brightness_limit=(-0.1,0.1), contrast_limit=(-0.1, 0.1), p=0.5),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
            ToTensorV2(p=1.0),
        ], p=1.)

class VinBigDataTwoClassModel(nn.Module):
    def __init__(self, model_name, n_class, pretrained=False, drop_rate=0):
        super().__init__()
        
        self.model = timm.create_model(
            model_name, 
            pretrained=pretrained
        )
        
        n_features = self.model.head.fc.in_features
        
        if drop_rate > 0.01:
            self.model.head.fc = nn.Sequential(
                nn.Dropout(drop_rate),
                nn.Linear(n_features, n_class, bias=True)
            )  
            
        else:
            self.model.head.fc = nn.Linear(n_features, n_class)

    def forward(self, x):
        x = self.model(x)
        return x

class SchedulerOneCycleLR_Batch:
    def __init__(self, optimizer, min_lr: float, max_lr: float, t0: int, total_epochs: int, n_batch: int):
        self.optimizer = optimizer
        self.total_epochs = total_epochs
        self.t0      = t0
        self.min_lr  = min_lr
        self.max_lr  = max_lr
        self.n_batch = n_batch
        self.count   = 0
        
        self.t0_batchs    = t0 * n_batch
        self.total_batchs = total_epochs * n_batch
        
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = self.min_lr       

    def step(self):
        
        self.count = self.count + 1
        
        if self.count >= self.total_batchs:
            self.count = self.total_batchs - 1
        
        if self.count <= self.t0_batchs:
            r = ( np.cos( self.count / self.t0_batchs * np.pi ) + 1 ) / 2
            lr = self.max_lr - r * ( self.max_lr - self.min_lr )
        else:
            r = ( np.cos( ( self.count - self.t0_batchs ) / ( self.total_batchs - self.t0_batchs - 1 ) * np.pi ) + 1 ) / 2
            lr = self.min_lr + r * ( self.max_lr - self.min_lr )

        for param_group in self.optimizer.param_groups:
            param_group["lr"] = lr

    def lr(self) -> float:
        return self.optimizer.param_groups[0]["lr"]  

total_epochs = 15
n_batch = 10

model = torch.nn.Linear(2, 1)
optimizer = torch.optim.Adam( model.parameters(), lr=1.0e-4 )
scheduler = SchedulerOneCycleLR_Batch( optimizer, min_lr=1.0e-6, max_lr=1.0e-4, t0=2, total_epochs=total_epochs, n_batch=n_batch) 

x = []
for e in range(total_epochs):
    for _ in range(n_batch):
        lr = scheduler.lr()
        #print( e, lr )
        x.append(lr)

        scheduler.step()

plt.plot( x )
plt.show()

del model, optimizer, scheduler


def prepare_dataloader(df, trn_idx, val_idx):
    
    train_ = df.loc[trn_idx,:].reset_index(drop=True)
    valid_ = df.loc[val_idx,:].reset_index(drop=True)
        
    # train
    train_dataset = VinBigDataTwoClassDataset( 
        train_, 
        image_path=config.train_image_path, 
        transforms=get_train_transforms(), 
        output_label=True, 
        do_cutmix=False, 
        do_mixup=False
    )
    
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.train_batch_size,
        pin_memory=False,
        drop_last=True,
        shuffle=True,        
        num_workers=config.num_workers,
    )
    
    
    # valid
    valid_dataset = VinBigDataTwoClassDataset( 
        valid_, 
        image_path=config.train_image_path, 
        transforms=get_valid_transforms(), 
        output_label=True
    )
    
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, 
        batch_size=config.valid_batch_size,
        num_workers=config.num_workers,
        shuffle=False,
        pin_memory=False,
    )
    
    return train_loader, valid_loader


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


from torch.cuda import amp

def train_one_epoch(epoch, model, loss_fn, optimizer, scaler, train_loader, device, scheduler=None, schd_batch_update=False):
    
    losses = AverageMeter()
    scores = AverageMeter()    
    model.train()

    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    
    for step, (image, labels) in pbar:
        image = image.to(device).float()
        labels = labels.unsqueeze(1)
        labels = labels.to(device).float()

        optimizer.zero_grad() 
        
        with amp.autocast(enabled=True):
            output = model(image)
            batch_size = labels.size(0)
            loss = loss_fn( output, labels )
            
        scaler.scale(loss).backward()
        
        
        adaptive_clip_grad(model.parameters(), clip_factor=0.01, eps=1e-3, norm_type=2.0)
        
        scaler.step(optimizer)
        scaler.update()
        
        out = torch.sigmoid(output)
        outputs = out.cpu().detach().numpy()
        targets = labels.cpu().detach().numpy()        
            
        if scheduler is not None and schd_batch_update:
            scheduler.step()

        outputs = np.where(outputs > 0.5, 1.0, 0.0) 
        targets = np.where(targets > 0.5, 1.0, 0.0) 
        auc = (outputs==targets).mean()

        #auc = sklearn.metrics.roc_auc_score(targets, outputs)
        losses.update(loss.item(), batch_size)
        scores.update(auc.item(), batch_size)

        description = f'epoch {epoch}'
        pbar.set_description( description )
        pbar.set_postfix(loss=loss.item(), auc=auc.item(), stage='train')            
        
    if scheduler is not None and not schd_batch_update:
        scheduler.step()
        
    return losses.avg, scores.avg

def valid_one_epoch(epoch, model, loss_fn, val_loader, device):
    
    losses = AverageMeter()
    scores = AverageMeter()    
    model.eval()

    pbar = tqdm(enumerate(val_loader), total=len(val_loader))
    
    for step, (image, labels) in pbar:
        image = image.to(device).float()
        labels = labels.unsqueeze(1)
        labels = labels.to(device).float()
        batch_size = labels.size(0)
        
        output = model(image) 
        
        loss = loss_fn( output, labels )
        out = torch.sigmoid(output)
        outputs = out.cpu().detach().numpy()
        targets = labels.cpu().detach().numpy()

        outputs = np.where(outputs > 0.5, 1.0, 0.0) 
        targets = np.where(targets > 0.5, 1.0, 0.0) 
        auc = (outputs==targets).mean()

        #auc = sklearn.metrics.roc_auc_score(targets, outputs)
        losses.update(loss.item(), batch_size)
        scores.update(auc.item(), batch_size)

        description = f'epoch {epoch}'
        pbar.set_description( description )            
        pbar.set_postfix(loss=loss.item(), auc=auc.item(), stage = 'valid')
    
    print('validation accuracy = {:.8f}'.format(scores.avg))

    return losses.avg, scores.avg


def print_history_list( history_list ):

    for history in history_list:
        print( 'fold:', history['fold'] )
        print( pd.DataFrame( history['history'] ) )
        print( '*' * 65 )
        
        print( 'best_epoch: {}'.format( history['best_epoch'] ) )
        print( 'best_valid_acc: {:.8f}'.format( history['best_valid_accuracy'] ) )
        print( 'time: {}'.format( history['time'] ) )
        print( '*' * 65 )


seed_everything( config.seed )

folds = list( ShuffleSplit( n_splits=config.n_fold, test_size=0.08, random_state=config.seed ).split( train_normal.label.values ) )
#folds = StratifiedKFold(n_splits=config.n_fold, shuffle=True, random_state=config.seed).split(np.arange(train.shape[0]), train.label.values)


from torch.nn.parameter import Parameter
def gem(x, p=3, eps=1e-6):
    return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM,self).__init__()
        self.p = Parameter(torch.ones(1)*p)
        self.eps = eps
    def forward(self, x):
        return gem(x, p=self.p, eps=self.eps)       
    def __repr__(self):
        return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + ', ' + 'eps=' + str(self.eps) + ')'

history_list = []

for fold, (trn_idx, val_idx) in enumerate(folds):

    if fold not in config.train_fold:
        continue

    print('Training with {} started'.format(fold))
    start_time = time.time()
    
    print(len(trn_idx), len(val_idx))
    train_loader, val_loader = prepare_dataloader( train_normal, trn_idx, val_idx )    
    
    device = torch.device(config.device)
    
    model = VinBigDataTwoClassModel(
        model_name=config.model_name,
        n_class=1, 
        pretrained=True,
        drop_rate=0
    ).to(device)
    
    #model.load_state_dict(torch.load('../input/vinbigdata-01-01-2class-eff-b4-train/01-01-eff-b4_fold_0_best', map_location=lambda storage, loc: storage), strict=True)
    
    #model.avg_pool = GeM()
    
    

    learning_rate_max = 1e-5
    learning_rate_min = 1e-6
    
    scaler = GradScaler()   
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_max, weight_decay=1e-6)    
    scheduler = SchedulerOneCycleLR_Batch( optimizer, min_lr=learning_rate_min, max_lr=learning_rate_max, t0=2, total_epochs=config.epochs, n_batch=len(train_loader))
    
    loss_fn = nn.BCEWithLogitsLoss().to(device)
    best_valid_accuracy = 0.0
    best_epoch = 0
    history = []

    
    for epoch in range( config.epochs ):
        
        lr = optimizer.param_groups[0]['lr']   
        state = ''
        train_loss, train_accuracy = train_one_epoch( epoch, model, loss_fn, optimizer, scaler, train_loader, device, scheduler=scheduler, schd_batch_update=True )

        with torch.no_grad():
            valid_loss, valid_accuracy = valid_one_epoch( epoch, model, loss_fn, val_loader, device )

        if valid_accuracy > best_valid_accuracy - 0.0005:
            best_valid_accuracy = valid_accuracy
            best_epoch = epoch
            best_weights = copy.deepcopy( model.state_dict() )  
            state = 'update'
            torch.save(model.state_dict(),'{}_fold_{}_best'.format( config.notebook_name, fold ))
            
        history.append( {
            'train_loss': train_loss,
            'train_acc': train_accuracy,
            'valid_loss': valid_loss, 
            'valid_acc': valid_accuracy,   
            'lr': lr,
            'state': state
        } )
            
    model_weight_file = os.path.join( config.model_weight_path, '{}_fold_{}_last.pth'.format( config.notebook_name, fold ) )
    torch.save( model.state_dict(), model_weight_file )  
         
    t = time.time() - start_time 
    t = str(timedelta(seconds=t))
    
    history_list.append( { 
        'fold': fold, 
        'history': history, 
        'best_epoch': best_epoch, 
        'best_valid_accuracy': best_valid_accuracy, 
        'time': t 
    } )  
    
    print( 'fold: {}, best_epoch:{}, best_valid_accuracy:{:.8f}, time:{}'.format( fold, best_epoch, best_valid_accuracy, t ) )

if i use any other model rather than norm free nets then it works fine,no issue,,,but when i use norm free nets i get this trainig log :

 Training with 0 started
13800 1200
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f0-604f9c3a.pth" to /root/.cache/torch/hub/checkpoints/dm_nfnet_f0-604f9c3a.pth
epoch 0: 100%|██████████| 1725/1725 [34:11<00:00,  1.19s/it, auc=0.75, loss=0.263, stage=train]
epoch 0: 100%|██████████| 150/150 [00:56<00:00,  2.66it/s, auc=1, loss=0.00129, stage=valid]
validation accuracy = 0.91083333
epoch 1: 100%|██████████| 1725/1725 [34:27<00:00,  1.20s/it, auc=0.625, loss=0.61, stage=train]
epoch 1: 100%|██████████| 150/150 [00:55<00:00,  2.70it/s, auc=1, loss=1.9e-5, stage=valid]
validation accuracy = 0.93000000
epoch 2: 100%|██████████| 1725/1725 [33:45<00:00,  1.17s/it, auc=0.25, loss=nan, stage=train]
  0%|          | 0/150 [00:00<?, ?it/s]/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:22: RuntimeWarning: invalid value encountered in greater
epoch 2: 100%|██████████| 150/150 [00:55<00:00,  2.69it/s, auc=0.25, loss=nan, stage=valid]
validation accuracy = 0.28583333
epoch 3: 100%|██████████| 1725/1725 [33:37<00:00,  1.17s/it, auc=0.375, loss=nan, stage=train]
epoch 3: 100%|██████████| 150/150 [00:55<00:00,  2.68it/s, auc=0.25, loss=nan, stage=valid]
validation accuracy = 0.28583333
epoch 4: 100%|██████████| 1725/1725 [33:40<00:00,  1.17s/it, auc=0.375, loss=nan, stage=train]
epoch 4: 100%|██████████| 150/150 [00:55<00:00,  2.71it/s, auc=0.25, loss=nan, stage=valid]
validation accuracy = 0.28583333
epoch 5: 100%|██████████| 1725/1725 [33:36<00:00,  1.17s/it, auc=0.25, loss=nan, stage=train]
epoch 5: 100%|██████████| 150/150 [00:55<00:00,  2.71it/s, auc=0.25, loss=nan, stage=valid]
validation accuracy = 0.28583333
epoch 6: 100%|██████████| 1725/1725 [33:39<00:00,  1.17s/it, auc=0.375, loss=nan, stage=train]
epoch 6: 100%|██████████| 150/150 [00:56<00:00,  2.67it/s, auc=0.25, loss=nan, stage=valid]
validation accuracy = 0.28583333
epoch 7: 100%|██████████| 1725/1725 [33:40<00:00,  1.17s/it, auc=0.5, loss=nan, stage=train]
epoch 7: 100%|██████████| 150/150 [00:55<00:00,  2.72it/s, auc=0.25, loss=nan, stage=valid]
validation accuracy = 0.28583333
epoch 8: 100%|██████████| 1725/1725 [33:43<00:00,  1.17s/it, auc=0.25, loss=nan, stage=train]
epoch 8: 100%|██████████| 150/150 [00:55<00:00,  2.70it/s, auc=0.25, loss=nan, stage=valid]
validation accuracy = 0.28583333
epoch 9: 100%|██████████| 1725/1725 [33:38<00:00,  1.17s/it, auc=0.125, loss=nan, stage=train]
epoch 9: 100%|██████████| 150/150 [00:55<00:00,  2.71it/s, auc=0.25, loss=nan, stage=valid]
validation accuracy = 0.28583333
fold: 0, best_epoch:1, best_valid_accuracy:0.93000000, time:5:47:34.064576

how do i solve this issue?

if you check some of the comments of this discussion : https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/220268

you can see many other people getting NaN like me while using nfnets only

rwightman · 2021-03-28T18:32:25Z

rwightman
Mar 28, 2021
Maintainer

You're getting NaN because they are normalization free and much more fussy than nets w/ BN.

I find it very impressive that they work as well as they do through careful analysis and additions to a ResNet architecture and training recipe. But they are delicate, you can't just throw whatever you want at them. If you want to deviate from the hparams in the paper (ie by using adam), you'll have to experiment and do some hparam sweeps to find out what degree of gradient clipping etc is needed, what LR, etc. You also need to look at your data and ensure it's normalized/standardized appropriately, etc.

Also, fix your grad clipping, you aren't doing it correctly when using AMP, it's done correctly in timm utils.

0 replies

mobassir94 · 2021-03-29T06:18:13Z

mobassir94
Mar 29, 2021
Author

@rwightman i tried to use this grad clipping :https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/agc.py

from timm utils,,i couldn't understand this comment sorry "Also, fix your grad clipping, you aren't doing it correctly when using AMP, it's done correctly in timm utils."

1 reply

SarthakYadav Jun 5, 2021

By fix your grad clipping, @rwightman probably meant that you're applying AGC clipping to the entire model whereas according to the paper it should not be applied to the final linear layer.

timm.utils correctly filters the final linear layer from the list of parameters passed on to AGC clipping.

bishshoy · 2022-05-30T10:33:00Z

bishshoy
May 30, 2022

Hi,
Can anyone give me a recipe for training nf_resnet26 or nf_resnet50 on ImageNet. I have tried setting the hparams according to what is mentioned in the paper and I did not get accuracy mentioned in the paper.
Thanks.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

why norm free nets becoming numerically unstable faster sometimes? #527

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 3 comments 1 reply

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Uh oh!

why norm free nets becoming numerically unstable faster sometimes? #527

Uh oh!

Uh oh!

mobassir94 Mar 28, 2021

Replies: 3 comments · 1 reply

Uh oh!

rwightman Mar 28, 2021 Maintainer

Uh oh!

mobassir94 Mar 29, 2021 Author

Uh oh!

SarthakYadav Jun 5, 2021

Uh oh!

bishshoy May 30, 2022

mobassir94
Mar 28, 2021

Replies: 3 comments 1 reply

rwightman
Mar 28, 2021
Maintainer

mobassir94
Mar 29, 2021
Author

bishshoy
May 30, 2022