Merge pull request NVIDIA#636 from NVIDIA/gh/release

nv-kkudrynski · web-flow · commit 280e75c63edb · 2020-08-05T20:55:02.000+02:00
[VAE/TF] Updating for Ampere
diff --git a/TensorFlow/Recommendation/VAE-CF/Dockerfile b/TensorFlow/Recommendation/VAE-CF/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.11-tf1-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
 FROM ${FROM_IMAGE_NAME}
 
 ADD requirements.txt .
diff --git a/TensorFlow/Recommendation/VAE-CF/README.md b/TensorFlow/Recommendation/VAE-CF/README.md
diff --git a/TensorFlow/Recommendation/VAE-CF/main.py b/TensorFlow/Recommendation/VAE-CF/main.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,14 +15,21 @@
 # limitations under the License.
 
 import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
 from functools import partial
 import json
 import logging
 from argparse import ArgumentParser
+
 import tensorflow as tf
+tf.logging.set_verbosity(tf.logging.ERROR)
+
 import numpy as np
 import horovod.tensorflow as hvd
+from mpi4py import MPI
 import dllogger
+import time
 
 from vae.utils.round import round_8
 from vae.metrics.recall import recall
@@ -32,18 +39,16 @@
 
 def main():
     hvd.init()
+    mpi_comm = MPI.COMM_WORLD
 
     parser = ArgumentParser(description="Train a Variational Autoencoder for Collaborative Filtering in TensorFlow")
     parser.add_argument('--train', action='store_true',
                         help='Run training of VAE')
     parser.add_argument('--test', action='store_true',
                         help='Run validation of VAE')
-    parser.add_argument('--inference', action='store_true',
-                        help='Run inference on a single random example.'
-                        'This can also be used to measure the latency for a batch size of 1')
     parser.add_argument('--inference_benchmark', action='store_true',
-                        help='Benchmark the inference throughput on a very large batch size')
-    parser.add_argument('--use_tf_amp', action='store_true',
+                        help='Measure inference latency and throughput on a variety of batch sizes')
+    parser.add_argument('--amp', action='store_true', default=False,
                         help='Enable Automatic Mixed Precision')
     parser.add_argument('--epochs', type=int, default=400,
                         help='Number of epochs to train')
@@ -85,6 +90,7 @@ def main():
                         default=None,
                         help='Path for saving a checkpoint after the training')
     args = parser.parse_args()
+    args.world_size = hvd.size()
 
     if args.batch_size_train % hvd.size() != 0:
         raise ValueError('Global batch size should be a multiple of the number of workers')
@@ -101,16 +107,27 @@ def main():
         dllogger.init(backends=[])
         logger.setLevel(logging.ERROR)
 
-    dllogger.log(data=vars(args), step='PARAMETER')
+    if args.seed is None:
+        if hvd.rank() == 0:
+            seed = int(time.time())
+        else:
+            seed = None
 
-    np.random.seed(args.seed)
-    tf.set_random_seed(args.seed)
+        seed = mpi_comm.bcast(seed, root=0)
+    else:
+        seed = args.seed
+
+    tf.random.set_random_seed(seed)
+    np.random.seed(seed)
+    args.seed = seed
+
+    dllogger.log(data=vars(args), step='PARAMETER')
 
     # Suppress TF warnings
     os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 
     # set AMP
-    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.use_tf_amp else '0'
+    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.amp else '0'
 
     # load dataset
     (train_data,
@@ -159,21 +176,36 @@ def main():
     elif args.test and hvd.size() > 1:
         print("Testing is not supported with horovod multigpu yet")
 
-    if args.inference_benchmark and hvd.size() <= 1:
-        # use the train data to get accurate throughput numbers for inference
-        # the test and validation sets are too small to measure this accurately
-        # vae.inference_benchmark()
-        _ = vae.test(test_data_input=train_data,
-                     test_data_true=train_data, metrics={})
-        
-
     elif args.test and hvd.size() > 1:
         print("Testing is not supported with horovod multigpu yet")
 
-    if args.inference:
-        input_data = np.random.randint(low=0, high=10000, size=10)
-        recommendations = vae.query(input_data=input_data)
-        print('Recommended item indices: ', recommendations)
+    if args.inference_benchmark:
+        items_per_user = 10
+        item_indices = np.random.randint(low=0, high=10000, size=items_per_user)
+        user_indices = np.zeros(len(item_indices))
+        indices = np.stack([user_indices, item_indices], axis=1)
+
+        num_batches = 200
+        latencies = []
+        for i in range(num_batches):
+            start_time = time.time()
+            _ = vae.query(indices=indices)
+            end_time = time.time()
+
+            if i < 10:
+                #warmup steps
+                continue
+
+            latencies.append(end_time - start_time)
+
+        result_data = {}
+        result_data[f'batch_1_mean_throughput'] = 1 / np.mean(latencies)
+        result_data[f'batch_1_mean_latency'] = np.mean(latencies)
+        result_data[f'batch_1_p90_latency'] = np.percentile(latencies, 90)
+        result_data[f'batch_1_p95_latency'] = np.percentile(latencies, 95)
+        result_data[f'batch_1_p99_latency'] = np.percentile(latencies, 99)
+
+        dllogger.log(data=result_data, step=tuple())
 
     vae.close_session()
     dllogger.flush()
diff --git a/TensorFlow/Recommendation/VAE-CF/vae/load/downloaders.py b/TensorFlow/Recommendation/VAE-CF/vae/load/downloaders.py
diff --git a/TensorFlow/Recommendation/VAE-CF/vae/load/preprocessing.py b/TensorFlow/Recommendation/VAE-CF/vae/load/preprocessing.py
@@ -23,7 +23,6 @@
 import numpy as np
 from scipy.sparse import load_npz, csr_matrix
 
-from vae.load.downloaders import download_movielens
 import logging
 import json
 
@@ -69,7 +68,7 @@ def save_id_mappings(cache_dir, show2id, profile2id):
             json.dump(d, f, indent=4)
 
 
-def load_and_parse_ML_20M(data_dir, threshold=4):
+def load_and_parse_ML_20M(data_dir, threshold=4, parse=True):
     """
     Original way of processing ml-20m dataset from VAE for CF paper
 	Copyright [2018] [Dawen Liang, Rahul G. Krishnan, Matthew D. Hoffman, and Tony Jebara]
@@ -98,11 +97,14 @@ def load_and_parse_ML_20M(data_dir, threshold=4):
                 load_npz(test_data_true_file), \
                 load_npz(test_data_test_file),
 
+    if not parse:
+        raise ValueError('Dataset not preprocessed. Please run python3 prepare_dataset.py first.')
+
     LOG.info("Parsing movielens.")
 
     source_file = os.path.join(data_dir, "ml-20m/extracted/ml-20m", "ratings.csv")
     if not glob(source_file):
-        download_movielens(data_dir=data_dir)
+        raise ValueError('Dataset not downloaded. Please download the ML-20m dataset from https://grouplens.org/datasets/movielens/20m/, unzip it and put it in ', source_file)
 
     raw_data = pd.read_csv(source_file)
     raw_data.drop('timestamp', axis=1, inplace=True)
diff --git a/TensorFlow/Recommendation/VAE-CF/vae/models/train.py b/TensorFlow/Recommendation/VAE-CF/vae/models/train.py
@@ -340,27 +340,21 @@ def test(
         # Therefore we're using the nan-aware mean from numpy to ignore users with no items to be predicted. 
         return {name: np.nanmean(scores) for name, scores in metrics_scores.items()}
 
-    def query(self, input_data: np.ndarray):
+    def query(self, indices: np.ndarray):
         """
         inference for batch size 1
 
-        :param input_data:
+        :param indices: array of [user index, item index] rows
         :return:
         """
-        query_start = time.time()
-        indices = np.stack([np.zeros(len(input_data)), input_data], axis=1)
-        values = np.ones(shape=(1, len(input_data)))
+        values = np.ones(shape=(1, len(indices)))
         values = normalize(values)
         values = values.reshape(-1)
 
-        sess_run_start = time.time()
         res = self.session.run(
             self.top_k_query,
             feed_dict={self.inputs_query: (indices,
                                            values)})
-        query_end_time = time.time()
-        LOG.info('query time: {}'.format(query_end_time - query_start))
-        LOG.info('sess run time: {}'.format(query_end_time - sess_run_start))
         return res
 
     def _increment_global_step(self):