Commit 280e75c

Merge pull request NVIDIA#636 from NVIDIA/gh/release
[VAE/TF] Updating for Ampere
2 parents: 386dd8e + 22f354e

File tree

6 files changed: +304 -275 lines

TensorFlow/Recommendation/VAE-CF/Dockerfile (+1 -1)

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.11-tf1-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
 FROM ${FROM_IMAGE_NAME}
 
 ADD requirements.txt .
```

TensorFlow/Recommendation/VAE-CF/README.md (+242 -145)

Large diffs are not rendered by default.

TensorFlow/Recommendation/VAE-CF/main.py (+54 -22)

```diff
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,14 +15,21 @@
 # limitations under the License.
 
 import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
 from functools import partial
 import json
 import logging
 from argparse import ArgumentParser
+
 import tensorflow as tf
+tf.logging.set_verbosity(tf.logging.ERROR)
+
 import numpy as np
 import horovod.tensorflow as hvd
+from mpi4py import MPI
 import dllogger
+import time
 
 from vae.utils.round import round_8
 from vae.metrics.recall import recall
```
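A note on the new logging lines: `TF_CPP_MIN_LOG_LEVEL` only takes effect if it is set before TensorFlow is first imported, which is why the diff places it immediately after `import os`. A minimal sketch of the pattern (TF 1.x API, as used in this container):

```python
# Minimal sketch of the log-silencing pattern above. The environment variable
# must be set before TensorFlow is imported to have any effect.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'    # suppress C++-side INFO/WARNING/ERROR logs

import tensorflow as tf                     # TF 1.x, as in this repository
tf.logging.set_verbosity(tf.logging.ERROR)  # silence the Python-side logger as well
```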
```diff
@@ -32,18 +39,16 @@
 
 def main():
     hvd.init()
+    mpi_comm = MPI.COMM_WORLD
 
     parser = ArgumentParser(description="Train a Variational Autoencoder for Collaborative Filtering in TensorFlow")
     parser.add_argument('--train', action='store_true',
                         help='Run training of VAE')
     parser.add_argument('--test', action='store_true',
                         help='Run validation of VAE')
-    parser.add_argument('--inference', action='store_true',
-                        help='Run inference on a single random example.'
-                             'This can also be used to measure the latency for a batch size of 1')
     parser.add_argument('--inference_benchmark', action='store_true',
-                        help='Benchmark the inference throughput on a very large batch size')
-    parser.add_argument('--use_tf_amp', action='store_true',
+                        help='Measure inference latency and throughput on a variety of batch sizes')
+    parser.add_argument('--amp', action='store_true', default=False,
                         help='Enable Automatic Mixed Precision')
     parser.add_argument('--epochs', type=int, default=400,
                         help='Number of epochs to train')
```
```diff
@@ -85,6 +90,7 @@ def main():
                         default=None,
                         help='Path for saving a checkpoint after the training')
     args = parser.parse_args()
+    args.world_size = hvd.size()
 
     if args.batch_size_train % hvd.size() != 0:
         raise ValueError('Global batch size should be a multiple of the number of workers')
```
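The divisibility check exists because the global batch is sharded evenly across Horovod workers. A hedged sketch of the arithmetic (the numbers are illustrative, not from the repository):

```python
# Illustration of the constraint enforced above: each Horovod worker
# processes global_batch // world_size rows per step, so the division
# must be exact or rows would be silently dropped or duplicated.
global_batch = 24576
world_size = 8                             # e.g. hvd.size() on an 8-GPU node
assert global_batch % world_size == 0, \
    'Global batch size should be a multiple of the number of workers'
local_batch = global_batch // world_size   # 3072 rows per GPU per step
```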
```diff
@@ -101,16 +107,27 @@
         dllogger.init(backends=[])
         logger.setLevel(logging.ERROR)
 
-    dllogger.log(data=vars(args), step='PARAMETER')
+    if args.seed is None:
+        if hvd.rank() == 0:
+            seed = int(time.time())
+        else:
+            seed = None
 
-    np.random.seed(args.seed)
-    tf.set_random_seed(args.seed)
+        seed = mpi_comm.bcast(seed, root=0)
+    else:
+        seed = args.seed
+
+    tf.random.set_random_seed(seed)
+    np.random.seed(seed)
+    args.seed = seed
+
+    dllogger.log(data=vars(args), step='PARAMETER')
 
     # Suppress TF warnings
     os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 
     # set AMP
-    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.use_tf_amp else '0'
+    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.amp else '0'
 
     # load dataset
     (train_data,
```
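The new seed block makes multi-GPU runs reproducible without requiring --seed: rank 0 draws a seed from the wall clock and broadcasts it over MPI, so every worker seeds NumPy and TensorFlow identically. A standalone sketch of the pattern (assumes mpi4py and an MPI launcher such as mpirun):

```python
# Sketch of the rank-0 seed broadcast used above; run under an MPI launcher.
import time
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
# Only rank 0 consults the clock; all other ranks receive its value.
seed = int(time.time()) if comm.Get_rank() == 0 else None
seed = comm.bcast(seed, root=0)   # identical on every rank after this line
np.random.seed(seed)
```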
```diff
@@ -159,21 +176,36 @@
     elif args.test and hvd.size() > 1:
         print("Testing is not supported with horovod multigpu yet")
 
-    if args.inference_benchmark and hvd.size() <= 1:
-        # use the train data to get accurate throughput numbers for inference
-        # the test and validation sets are too small to measure this accurately
-        # vae.inference_benchmark()
-        _ = vae.test(test_data_input=train_data,
-                     test_data_true=train_data, metrics={})
-
     elif args.test and hvd.size() > 1:
         print("Testing is not supported with horovod multigpu yet")
 
-    if args.inference:
-        input_data = np.random.randint(low=0, high=10000, size=10)
-        recommendations = vae.query(input_data=input_data)
-        print('Recommended item indices: ', recommendations)
+    if args.inference_benchmark:
+        items_per_user = 10
+        item_indices = np.random.randint(low=0, high=10000, size=items_per_user)
+        user_indices = np.zeros(len(item_indices))
+        indices = np.stack([user_indices, item_indices], axis=1)
+
+        num_batches = 200
+        latencies = []
+        for i in range(num_batches):
+            start_time = time.time()
+            _ = vae.query(indices=indices)
+            end_time = time.time()
+
+            if i < 10:
+                # warmup steps
+                continue
+
+            latencies.append(end_time - start_time)
+
+        result_data = {}
+        result_data['batch_1_mean_throughput'] = 1 / np.mean(latencies)
+        result_data['batch_1_mean_latency'] = np.mean(latencies)
+        result_data['batch_1_p90_latency'] = np.percentile(latencies, 90)
+        result_data['batch_1_p95_latency'] = np.percentile(latencies, 95)
+        result_data['batch_1_p99_latency'] = np.percentile(latencies, 99)
+
+        dllogger.log(data=result_data, step=tuple())
 
     vae.close_session()
     dllogger.flush()
```
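The rewritten --inference_benchmark path times 200 query() calls, discards the first 10 as warmup, and reports mean throughput plus p90/p95/p99 latencies. The reduction step in isolation, with synthetic timings standing in for the measured ones:

```python
# Standalone sketch of the benchmark summary above, on synthetic latencies.
import numpy as np

latencies = np.random.uniform(0.008, 0.012, size=190)  # 200 batches minus 10 warmup
result_data = {
    'batch_1_mean_throughput': 1 / np.mean(latencies),  # batches per second
    'batch_1_mean_latency': np.mean(latencies),         # seconds per batch
    'batch_1_p90_latency': np.percentile(latencies, 90),
    'batch_1_p95_latency': np.percentile(latencies, 95),
    'batch_1_p99_latency': np.percentile(latencies, 99),
}
print(result_data)
```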

TensorFlow/Recommendation/VAE-CF/vae/load/downloaders.py (-96)

This file was deleted.

TensorFlow/Recommendation/VAE-CF/vae/load/preprocessing.py (+5 -3)

```diff
@@ -23,7 +23,6 @@
 import numpy as np
 from scipy.sparse import load_npz, csr_matrix
 
-from vae.load.downloaders import download_movielens
 import logging
 import json
 
@@ -69,7 +68,7 @@ def save_id_mappings(cache_dir, show2id, profile2id):
         json.dump(d, f, indent=4)
 
 
-def load_and_parse_ML_20M(data_dir, threshold=4):
+def load_and_parse_ML_20M(data_dir, threshold=4, parse=True):
     """
     Original way of processing ml-20m dataset from VAE for CF paper
     Copyright [2018] [Dawen Liang, Rahul G. Krishnan, Matthew D. Hoffman, and Tony Jebara]
@@ -98,11 +97,14 @@ def load_and_parse_ML_20M(data_dir, threshold=4):
                load_npz(test_data_true_file), \
                load_npz(test_data_test_file),
 
+    if not parse:
+        raise ValueError('Dataset not preprocessed. Please run python3 prepare_dataset.py first.')
+
     LOG.info("Parsing movielens.")
 
     source_file = os.path.join(data_dir, "ml-20m/extracted/ml-20m", "ratings.csv")
     if not glob(source_file):
-        download_movielens(data_dir=data_dir)
+        raise ValueError('Dataset not downloaded. Please download the ML-20m dataset from https://grouplens.org/datasets/movielens/20m/, unzip it and put it in ' + source_file)
 
     raw_data = pd.read_csv(source_file)
     raw_data.drop('timestamp', axis=1, inplace=True)
```
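With downloaders.py removed (see above), the loader no longer fetches ML-20M on demand; it now fails fast and tells the user to download and preprocess the data manually. The expected check, sketched with a hypothetical data_dir:

```python
# Sketch of the new fail-fast behavior: the raw ratings file must exist
# before load_and_parse_ML_20M() will parse anything.
import os

data_dir = '/data'   # hypothetical cache directory, not from the repository
source_file = os.path.join(data_dir, 'ml-20m/extracted/ml-20m', 'ratings.csv')
if not os.path.exists(source_file):
    raise ValueError('Dataset not downloaded. Please download the ML-20m dataset '
                     'from https://grouplens.org/datasets/movielens/20m/, '
                     'unzip it and put it in ' + source_file)
```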

TensorFlow/Recommendation/VAE-CF/vae/models/train.py (+2 -8)

```diff
@@ -340,27 +340,21 @@ def test(
         # Therefore we're using the nan-aware mean from numpy to ignore users with no items to be predicted.
         return {name: np.nanmean(scores) for name, scores in metrics_scores.items()}
 
-    def query(self, input_data: np.ndarray):
+    def query(self, indices: np.ndarray):
         """
         inference for batch size 1
 
         :param input_data:
         :return:
         """
-        query_start = time.time()
-        indices = np.stack([np.zeros(len(input_data)), input_data], axis=1)
-        values = np.ones(shape=(1, len(input_data)))
+        values = np.ones(shape=(1, len(indices)))
         values = normalize(values)
         values = values.reshape(-1)
 
-        sess_run_start = time.time()
         res = self.session.run(
             self.top_k_query,
             feed_dict={self.inputs_query: (indices,
                                            values)})
-        query_end_time = time.time()
-        LOG.info('query time: {}'.format(query_end_time - query_start))
-        LOG.info('sess run time: {}'.format(query_end_time - sess_run_start))
         return res
 
     def _increment_global_step(self):
```
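query() now expects a prebuilt (user, item) index array rather than a flat list of item ids, and the per-call timing has moved out into the benchmark loop in main.py. A hedged sketch of calling the new signature (vae stands for an already-constructed, trained model object):

```python
# Hypothetical call of the new query() signature: each row of `indices`
# is a (user, item) pair; a single user (index 0) matches batch size 1.
import numpy as np

item_indices = np.random.randint(low=0, high=10000, size=10)
user_indices = np.zeros(len(item_indices))
indices = np.stack([user_indices, item_indices], axis=1)

recommendations = vae.query(indices=indices)  # vae: trained model instance
```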
