
Commit 6c376d6

akolesnikoff, lucasb-eyer, and xiaohuazhai committed
Add pretrained ViT-S/16 models (90, 150 and 300 epochs) + misc updates
Co-authored-by: Lucas Beyer <[email protected]>
Co-authored-by: Xiaohua Zhai <[email protected]>
1 parent 8ca9d84 commit 6c376d6

4 files changed, +57 -18 lines changed

Diff for: README.md

+7 -2

````diff
@@ -181,11 +181,16 @@ gcloud alpha compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=0 --command "rm
 If you want to integrate other public or custom datasets, i.e. imagenet2012,
 please follow [the official guideline](https://www.tensorflow.org/datasets/catalog/overview).
 
+## Pre-trained models
+
+For the full list of pre-trained models check out the `load` function defined in
+the same module as the model code. And for example config on how to use these
+models, see `configs/transfer.py`.
+
 ## Run the transfer script on TPU VMs
 
 The following command line fine-tunes a pre-trained `vit-i21k-augreg-b/32` model
-on `cifar10` dataset. Please check `transfer.py` directly for more supported
-datasets and models.
+on `cifar10` dataset.
 
 ```
 gcloud alpha compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=all --command "TFDS_DATA_DIR=gs://$GS_BUCKET_NAME/tensorflow_datasets bash big_vision/run_tpu.sh big_vision.train --config big_vision/configs/transfer.py:model=vit-i21k-augreg-b/32,dataset=cifar10,crop=resmall_crop --workdir gs://$GS_BUCKET_NAME/big_vision/workdir/`date '+%m-%d_%H%M'` --config.lr=0.03"
````

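The new README section points at the `load` function in `big_vision/models/vit.py` for the list of checkpoints. Below is a minimal, hypothetical sketch (not part of this commit) of restoring one of the new ViT-S/16 checkpoints through that function; the shortcut names appear in the vit.py diff further down, `init_params` is assumed to be the random-init parameter tree of a matching S/16 model, and the `dont_load` patterns mirror `configs/transfer.py`:

```python
# Hypothetical usage sketch, not from the repo. Assumes big_vision is on
# PYTHONPATH and the gs://big_vision bucket is readable.
from big_vision.models import vit


def restore_s16_backbone(init_params):
  """init_params: random-init parameter tree of a matching ViT-S/16 model."""
  return vit.load(
      init_params=init_params,
      init_file='i1k-s16-300ep',  # shortcut -> gs://big_vision/vit_s16_i1k_300ep.npz
      model_cfg=dict(variant='S/16', pool_type='gap', posemb='sincos2d', rep_size=True),
      dont_load=('head/kernel', 'head/bias'),  # keep the freshly initialized classifier head
  )
```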
Diff for: big_vision/configs/load_and_eval.py

+25 -0

````diff
@@ -95,6 +95,31 @@ def bit_paper(config):
   )
 
 
+def vit_i1k(config):
+  # We could omit init_{shapes,types} if we wanted, as they are the default.
+  config.init_shapes = [(1, 224, 224, 3)]
+  config.init_types = ['float32']
+  config.num_classes = 1000
+
+  config.model_name = 'vit'
+  config.model_init = ''  # Will be set in sweep.
+  config.model = dict(variant='S/16', pool_type='gap', posemb='sincos2d',
+                      rep_size=True)
+
+  config.evals = [
+      ('fewshot', 'fewshot_lsr'),
+      ('val', 'classification'),
+  ]
+  config.fewshot = get_fewshot_lsr()
+  config.val = dict(
+      dataset='imagenet2012',
+      split='validation',
+      pp_fn='decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000, key="label", key_result="labels")|keep("image", "labels")',
+      loss_name='softmax_xent',
+      cache_final=False,  # Only run once, on low-mem machine.
+  )
+
+
 def vit_i21k(config):
   # We could omit init_{shapes,types} if we wanted, as they are the default.
   config.init_shapes = [(1, 224, 224, 3)]
````

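The `pp_fn` string in `config.val` is a pipe-separated chain of big_vision preprocessing ops. Purely as an illustration of what those named ops do to a single example, here is a rough plain-TensorFlow equivalent (a sketch, not the repo's pp op registry; details are approximate):

```python
# Illustrative translation of the pp_fn ops above into plain TensorFlow.
import tensorflow as tf


def illustrate_pp(example):
  img = tf.io.decode_jpeg(example['image'], channels=3)     # decode
  hw = tf.cast(tf.shape(img)[:2], tf.float32)
  scale = 256.0 / tf.reduce_min(hw)                         # resize_small(256): short side -> 256
  new_hw = tf.cast(tf.round(hw * scale), tf.int32)
  img = tf.image.resize(img, new_hw)
  img = tf.image.resize_with_crop_or_pad(img, 224, 224)     # central_crop(224)
  img = img / 127.5 - 1.0                                    # value_range(-1, 1)
  labels = tf.one_hot(example['label'], 1000)                # onehot(1000, key="label", key_result="labels")
  return {'image': img, 'labels': labels}                    # keep("image", "labels")
```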
Diff for: big_vision/configs/transfer.py

+6 -1

````diff
@@ -42,10 +42,15 @@ def _set_model(config, model):
   config.model_load = dict(dont_load=['head/kernel', 'head/bias'])
 
   if model == 'vit-i21k-augreg-b/32':
-    # Load "recommented" upstream B/32 from https://arxiv.org/abs/2106.10270
+    # Load "recommended" upstream B/32 from https://arxiv.org/abs/2106.10270
     config.model_name = 'vit'
     config.model_init = 'howto-i21k-B/32'
     config.model = dict(variant='B/32', pool_type='tok')
+  elif model == 'vit-s16':
+    config.model_name = 'vit'
+    config.model_init = 'i1k-s16-300ep'
+    config.model = dict(variant='S/16', pool_type='gap', posemb='sincos2d',
+                        rep_size=True)
   else:
     raise ValueError(f'Unknown model: {model}, please define customized model.')
````

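Note that, unlike the B/32 branch, the new `vit-s16` entry selects `posemb='sincos2d'`, i.e. fixed 2D sine-cosine position embeddings rather than a learned table. A NumPy sketch of the standard construction, for intuition only and not necessarily identical to the repo's implementation:

```python
import numpy as np


def posemb_sincos_2d(h, w, width, temperature=10_000.0):
  """Fixed 2D sin-cos position embeddings for an h x w patch grid."""
  assert width % 4 == 0, 'width must be a multiple of 4'
  y, x = np.mgrid[:h, :w]
  omega = np.arange(width // 4) / (width // 4 - 1)
  omega = 1.0 / (temperature ** omega)
  y = y.reshape(-1, 1) * omega.reshape(1, -1)
  x = x.reshape(-1, 1) * omega.reshape(1, -1)
  # One (h*w, width) table, with no parameters to learn or checkpoint.
  return np.concatenate([np.sin(x), np.cos(x), np.sin(y), np.cos(y)], axis=1)
```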
Diff for: big_vision/models/vit.py

+19 -15

````diff
@@ -290,12 +290,13 @@ def fix_old_checkpoints(params):
   # This means a B/32@224px would have 7x7+1 posembs. This is useless and clumsy
   # so we changed to add posemb then concat [cls]. We can recover the old
   # checkpoint by manually summing [cls] token and its posemb entry.
-  pe = params["pos_embedding"]
-  if int(np.sqrt(pe.shape[1])) ** 2 + 1 == int(pe.shape[1]):
-    logging.info("ViT: Loading and fixing combined cls+posemb")
-    pe_cls, params["pos_embedding"] = pe[:, :1], pe[:, 1:]
-    if "cls" in params:
-      params["cls"] += pe_cls
+  if "pos_embedding" in params:
+    pe = params["pos_embedding"]
+    if int(np.sqrt(pe.shape[1])) ** 2 + 1 == int(pe.shape[1]):
+      logging.info("ViT: Loading and fixing combined cls+posemb")
+      pe_cls, params["pos_embedding"] = pe[:, :1], pe[:, 1:]
+      if "cls" in params:
+        params["cls"] += pe_cls
 
   # MAP-head variants during ViT-G development had it inlined:
   if "probe" in params:
@@ -308,8 +309,10 @@ def fix_old_checkpoints(params):
 def load(init_params, init_file, model_cfg, dont_load=()):  # pylint: disable=invalid-name because we had to CamelCase above.
   """Load init from checkpoint, both old model and this one. +Hi-res posemb."""
 
+  del model_cfg
   # Shortcut names for some canonical paper checkpoints:
   init_file = {
+      # pylint: disable=line-too-long
       # pylint: disable=line-too-long
       # Recommended models from https://arxiv.org/abs/2106.10270
       # Many more models at https://github.com/google-research/vision_transformer
@@ -320,24 +323,25 @@ def load(init_params, init_file, model_cfg, dont_load=()): # pylint: disable=in
       "howto-i21k-B/16": "gs://vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz",
       "howto-i21k-B/8": "gs://vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0.npz",
       "howto-i21k-L/16": "gs://vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0.npz",
+
+      # Better plain vit-s16 baselines from https://arxiv.org/abs/2205.01580
+      "i1k-s16-90ep": "gs://big_vision/vit_s16_i1k_90ep.npz",
+      "i1k-s16-150ep": "gs://big_vision/vit_s16_i1k_150ep.npz",
+      "i1k-s16-300ep": "gs://big_vision/vit_s16_i1k_300ep.npz",
+      # pylint: disable=line-too-long
       # pylint: enable=line-too-long
   }.get(init_file, init_file)
   restored_params = utils.load_params(None, init_file)
 
-  # The following allows implementing both fine-tuning head variants from
-  # (internal link)
-  # depending on the value of `rep_size` in the fine-tuning job.
-  if model_cfg.get("rep_size", False) in (None, False):
-    restored_params.pop("pre_logits", None)
-
   fix_old_checkpoints(restored_params)
 
   # possibly use the random init for some of the params (such as, the head).
   restored_params = common.merge_params(restored_params, init_params, dont_load)
 
   # resample posemb if needed.
-  restored_params["pos_embedding"] = resample_posemb(
-      old=restored_params["pos_embedding"],
-      new=init_params["pos_embedding"])
+  if "pos_embedding" in init_params:
+    restored_params["pos_embedding"] = resample_posemb(
+        old=restored_params["pos_embedding"],
+        new=init_params["pos_embedding"])
 
   return restored_params
````

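Both new `"pos_embedding" in ...` checks guard against parameter trees that simply have no position-embedding table, as is the case for models using fixed `sincos2d` embeddings such as the new S/16 configs; with nothing stored, there is nothing to split off or resample. For reference, the usual resampling trick that a `resample_posemb`-style function applies when a learned table is present looks roughly like this (a sketch, not the repo's implementation):

```python
# Sketch of the standard posemb-resampling trick: reshape the learned table to
# its 2D grid, resize bilinearly, and flatten back to a sequence of tokens.
import jax.image


def resize_posemb(old, new_hw):
  """old: (1, gs*gs, width) learned table; new_hw: (new_h, new_w) target grid."""
  _, num, width = old.shape
  gs = int(num ** 0.5)  # assume a square source grid
  grid = old.reshape(gs, gs, width)
  grid = jax.image.resize(grid, (*new_hw, width), method='linear')
  return grid.reshape(1, new_hw[0] * new_hw[1], width)
```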