From 1c9d195aa0e4d4a1c3b29b12b5c4eed6c22483d2 Mon Sep 17 00:00:00 2001 From: andabi Date: Thu, 19 Apr 2018 13:58:44 +0900 Subject: [PATCH] update: eval1, eval2 --- LICENSE | 19 ++++++ convert.py | 114 +++++++++++++++++-------------- data_load.py | 3 +- eval1.py | 85 ++++++++++-------------- eval2.py | 89 ++++++++++++------------- hparams/default.yaml | 13 ++-- hparams/hparams.yaml | 66 +++++------------- models.py | 144 ++++++++++++++++++++++------------------ notes/MoL.md | 18 +++-- tensorpack_extension.py | 29 ++++++++ train1.py | 2 + train2.py | 18 ++--- utils.py | 4 +- 13 files changed, 324 insertions(+), 280 deletions(-) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..28f18c23 --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +The MIT License (MIT) Copyright (c) 2016 Igor Babuschkin + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/convert.py b/convert.py index 0f650a58..d394a971 100644 --- a/convert.py +++ b/convert.py @@ -17,51 +17,32 @@ from tensorpack.predict.config import PredictConfig from tensorpack.tfutils.sessinit import SaverRestore from tensorpack.tfutils.sessinit import ChainInit - - -def convert(model, mfccs, spec, mel, ckpt1=None, ckpt2=None): - session_inits = [] - if ckpt2: - session_inits.append(SaverRestore(ckpt2)) - if ckpt1: - session_inits.append(SaverRestore(ckpt1, ignore=['global_step'])) - - pred_conf = PredictConfig( - model=model, - input_names=get_eval_input_names(), - output_names=get_eval_output_names(), - session_init=ChainInit(session_inits)) - predict_spec = OfflinePredictor(pred_conf) - - pred_spec, y_spec, ppgs = predict_spec(mfccs, spec, mel) - - return pred_spec, y_spec, ppgs - - -def get_eval_input_names(): - return ['x_mfccs', 'y_spec', 'y_mel'] - - -def get_eval_output_names(): - return ['pred_spec', 'y_spec', 'net1/ppgs'] - - -def do_convert(args, logdir1, logdir2): - - # Load graph - model = Net2(batch_size=hp.convert.batch_size) - - df = Net2DataFlow(hp.convert.data_path, hp.convert.batch_size) - - # samples - mfccs, spec, mel = df().get_data().next() - - ckpt1 = tf.train.latest_checkpoint(logdir1) - ckpt2 = '{}/{}'.format(logdir2, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2) - - pred_spec, y_spec, ppgs = convert(model, mfccs, spec, mel, ckpt1, ckpt2) - print(np.max(pred_spec)) - print(np.min(pred_spec)) +from tensorpack.callbacks.base import Callback + + +# class ConvertCallback(Callback): +# def __init__(self, logdir, test_per_epoch=1): +# self.df = Net2DataFlow(hp.convert.data_path, hp.convert.batch_size) +# self.logdir = logdir +# self.test_per_epoch = test_per_epoch +# +# def _setup_graph(self): +# self.predictor = self.trainer.get_predictor( +# get_eval_input_names(), +# get_eval_output_names()) +# +# def _trigger_epoch(self): +# if self.epoch_num % self.test_per_epoch == 0: +# audio, y_audio, _ = 
convert(self.predictor, self.df) +# # self.trainer.monitors.put_scalar('eval/accuracy', acc) +# +# # Write the result +# # tf.summary.audio('A', y_audio, hp.default.sr, max_outputs=hp.convert.batch_size) +# # tf.summary.audio('B', audio, hp.default.sr, max_outputs=hp.convert.batch_size) + + +def convert(predictor, df): + pred_spec, y_spec, ppgs = predictor(next(df().get_data())) # Denormalization pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db) @@ -85,10 +66,43 @@ def do_convert(args, logdir1, logdir2): audio = inv_preemphasis(audio, coeff=hp.default.preemphasis) y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis) - if hp.convert.one_full_wav: - # Concatenate to a wav - y_audio = np.reshape(y_audio, (1, y_audio.size), order='C') - audio = np.reshape(audio, (1, audio.size), order='C') + # if hp.convert.one_full_wav: + # # Concatenate to a wav + # y_audio = np.reshape(y_audio, (1, y_audio.size), order='C') + # audio = np.reshape(audio, (1, audio.size), order='C') + + return audio, y_audio, ppgs + + +def get_eval_input_names(): + return ['x_mfccs', 'y_spec', 'y_mel'] + + +def get_eval_output_names(): + return ['pred_spec', 'y_spec', 'ppgs'] + + +def do_convert(args, logdir1, logdir2): + # Load graph + model = Net2() + + df = Net2DataFlow(hp.convert.data_path, hp.convert.batch_size) + + ckpt1 = tf.train.latest_checkpoint(logdir1) + ckpt2 = '{}/{}'.format(logdir2, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2) + session_inits = [] + if ckpt2: + session_inits.append(SaverRestore(ckpt2)) + if ckpt1: + session_inits.append(SaverRestore(ckpt1, ignore=['global_step'])) + pred_conf = PredictConfig( + model=model, + input_names=get_eval_input_names(), + output_names=get_eval_output_names(), + session_init=ChainInit(session_inits)) + predictor = OfflinePredictor(pred_conf) + + audio, y_audio, ppgs = convert(predictor, df) # Write the result tf.summary.audio('A', y_audio, hp.default.sr, max_outputs=hp.convert.batch_size)
@@ -137,4 +151,4 @@ def get_arguments(): e = datetime.datetime.now() diff = e - s - print("Done. elapsed time:{}s".format(diff.seconds)) \ No newline at end of file + print("Done. elapsed time:{}s".format(diff.seconds)) diff --git a/data_load.py b/data_load.py index 18df877a..f0ef76e0 100644 --- a/data_load.py +++ b/data_load.py @@ -22,8 +22,7 @@ def __init__(self, data_path, batch_size): def __call__(self, n_prefetch=1000, n_thread=1): df = self - if self.batch_size > 1: - df = BatchData(df, self.batch_size) + df = BatchData(df, self.batch_size) df = PrefetchData(df, n_prefetch, n_thread) return df diff --git a/eval1.py b/eval1.py index c0516269..a08348ef 100644 --- a/eval1.py +++ b/eval1.py @@ -7,65 +7,54 @@ import tensorflow as tf -from data_load import get_batch, phns, load_vocab +from data_load import Net1DataFlow, phns, load_vocab from hparam import hparam as hp -from models import Model +from models import Net1 from utils import plot_confusion_matrix +from tensorpack.predict.config import PredictConfig +from tensorpack.predict.base import OfflinePredictor +from tensorpack.tfutils.sessinit import SaverRestore -def eval(logdir, writer, queue=False): - # Load graph - model = Model(mode="test1", batch_size=hp.test1.batch_size, queue=queue) - - # Accuracy - acc_op = model.acc_net1() +def get_eval_input_names(): + return ['x_mfccs', 'y_ppgs'] - # Loss - loss_op = model.loss_net1() - # confusion matrix - y_ppg_1d = tf.reshape(model.y_ppg, shape=(tf.size(model.y_ppg),)) - pred_ppg_1d = tf.reshape(model.pred_ppg, shape=(tf.size(model.pred_ppg),)) +def get_eval_output_names(): + return ['net1/eval/y_ppg_1d', 'net1/eval/pred_ppg_1d', 'net1/eval/summ_loss', 'net1/eval/summ_acc'] - # Summary - tf.summary.scalar('net1/eval/acc', acc_op) - tf.summary.scalar('net1/eval/loss', loss_op) - summ_op = tf.summary.merge_all() - session_conf = tf.ConfigProto( - allow_soft_placement=True, - device_count={'CPU': 1, 'GPU': 0}, - ) - with tf.Session(config=session_conf) as sess: - 
coord = tf.train.Coordinator() - threads = tf.train.start_queue_runners(coord=coord) +def eval(logdir): + # Load graph + model = Net1() - # Load trained model - sess.run(tf.global_variables_initializer()) - model.load(sess, 'train1', logdir=logdir) + # dataflow + df = Net1DataFlow(hp.test1.data_path, hp.test1.batch_size) - if queue: - summ, acc, loss, y_ppg_1d, pred_ppg_1d = sess.run([summ_op, acc_op, loss_op, y_ppg_1d, pred_ppg_1d]) - else: - mfcc, ppg = get_batch(model.mode, model.batch_size) - summ, acc, loss, y_ppg_1d, pred_ppg_1d = sess.run([summ_op, acc_op, loss_op, y_ppg_1d, pred_ppg_1d], - feed_dict={model.x_mfcc: mfcc, model.y_ppg: ppg}) + ckpt = tf.train.latest_checkpoint(logdir) - # plot confusion matrix - _, idx2phn = load_vocab() - y_ppg_1d = [idx2phn[i] for i in y_ppg_1d] - pred_ppg_1d = [idx2phn[i] for i in pred_ppg_1d] - cm_summ = plot_confusion_matrix(y_ppg_1d, pred_ppg_1d, phns) + pred_conf = PredictConfig( + model=model, + input_names=get_eval_input_names(), + output_names=get_eval_output_names()) + if ckpt: + pred_conf.session_init = SaverRestore(ckpt) + predictor = OfflinePredictor(pred_conf) - writer.add_summary(summ) - writer.add_summary(cm_summ) + x_mfccs, y_ppgs = next(df().get_data()) + y_ppg_1d, pred_ppg_1d, summ_loss, summ_acc = predictor(x_mfccs, y_ppgs) - print("acc:", acc) - print("loss:", loss) - print('\n') + # plot confusion matrix + _, idx2phn = load_vocab() + y_ppg_1d = [idx2phn[i] for i in y_ppg_1d] + pred_ppg_1d = [idx2phn[i] for i in pred_ppg_1d] + summ_cm = plot_confusion_matrix(y_ppg_1d, pred_ppg_1d, phns) - coord.request_stop() - coord.join(threads) + writer = tf.summary.FileWriter(logdir) + writer.add_summary(summ_loss) + writer.add_summary(summ_acc) + writer.add_summary(summ_cm) + writer.close() def get_arguments(): @@ -79,8 +68,6 @@ def get_arguments(): args = get_arguments() hp.set_hparam_yaml(args.case) logdir = '{}/train1'.format(hp.logdir) - writer = tf.summary.FileWriter(logdir) - eval(logdir=logdir, writer=writer) 
- writer.close() + eval(logdir=logdir) - print("Done") + print("Done") \ No newline at end of file diff --git a/eval2.py b/eval2.py index e5aeaef0..165c2d1d 100644 --- a/eval2.py +++ b/eval2.py @@ -4,68 +4,67 @@ from __future__ import print_function import tensorflow as tf -from data_load import get_batch -from models import Model +from models import Net2 import argparse from hparam import hparam as hp +from tensorpack.predict.base import OfflinePredictor +from tensorpack.predict.config import PredictConfig +from tensorpack.tfutils.sessinit import SaverRestore +from tensorpack.tfutils.sessinit import ChainInit +from data_load import Net2DataFlow -def eval(logdir, writer, queue=True): +def get_eval_input_names(): + return ['x_mfccs', 'y_spec'] - # Load graph - model = Model(mode="test2", batch_size=hp.test2.batch_size, queue=queue) - - # Loss - loss_op = model.loss_net2() - - # Summary - summ_op = summaries(loss_op) - - session_conf = tf.ConfigProto( - allow_soft_placement=True, - device_count={'CPU': 1, 'GPU': 0}, - ) - with tf.Session(config=session_conf) as sess: - # Load trained model - sess.run(tf.global_variables_initializer()) - model.load(sess, 'test2', logdir=logdir) - - coord = tf.train.Coordinator() - threads = tf.train.start_queue_runners(coord=coord) - - if queue: - summ, loss = sess.run([summ_op, loss_op]) - else: - mfcc, spec, mel = get_batch(model.mode, model.batch_size) - summ, loss = sess.run([summ_op, loss_op], feed_dict={model.x_mfcc: mfcc, model.y_spec: spec, model.y_mel: mel}) - writer.add_summary(summ) +def get_eval_output_names(): + return ['net2/eval/summ_loss'] - coord.request_stop() - coord.join(threads) - print("loss:", loss) - - -def summaries(loss): - tf.summary.scalar('net2/eval/loss', loss) - return tf.summary.merge_all() +def eval(logdir1, logdir2): + # Load graph + model = Net2() + + # dataflow + df = Net2DataFlow(hp.test2.data_path, hp.test2.batch_size) + + ckpt1 = tf.train.latest_checkpoint(logdir1) + ckpt2 = 
tf.train.latest_checkpoint(logdir2) + session_inits = [] + if ckpt2: + session_inits.append(SaverRestore(ckpt2)) + if ckpt1: + session_inits.append(SaverRestore(ckpt1, ignore=['global_step'])) + pred_conf = PredictConfig( + model=model, + input_names=get_eval_input_names(), + output_names=get_eval_output_names(), + session_init=ChainInit(session_inits)) + predictor = OfflinePredictor(pred_conf) + + x_mfccs, y_spec, _ = next(df().get_data()) + summ_loss, = predictor(x_mfccs, y_spec) + + writer = tf.summary.FileWriter(logdir2) + writer.add_summary(summ_loss) + writer.close() def get_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('case', type=str, help='experiment case name') + parser.add_argument('case1', type=str, help='experiment case name of train1') + parser.add_argument('case2', type=str, help='experiment case name of train2') arguments = parser.parse_args() return arguments if __name__ == '__main__': args = get_arguments() - hp.set_hparam_yaml(args.case) - logdir = '{}/train2'.format(hp.logdir) + hp.set_hparam_yaml(args.case2) + logdir_train1 = '{}/{}/train1'.format(hp.logdir_path, args.case1) + logdir_train2 = '{}/{}/train2'.format(hp.logdir_path, args.case2) - writer = tf.summary.FileWriter(logdir) - eval(logdir=logdir, writer=writer) - writer.close() + eval(logdir1=logdir_train1, logdir2=logdir_train2) - print("Done") + print("Done") \ No newline at end of file diff --git a/hparams/default.yaml b/hparams/default.yaml index 0b646a76..792ed346 100644 --- a/hparams/default.yaml +++ b/hparams/default.yaml @@ -62,16 +62,17 @@ train2: # train batch_size: 32 - lr: 0.0001 + lr: 0.0003 lr_cyclic_margin: 0. lr_cyclic_steps: 5000 clip_value_max: 3. clip_value_min: -3. 
clip_norm: 10 - mol_step: 0.001 - num_epochs: 100000 - steps_per_epoch: 10 + mol_step: 0.003 + num_epochs: 10000 + steps_per_epoch: 100 save_per_epoch: 50 + test_per_epoch: 1 num_gpu: 4 --- test1: @@ -94,5 +95,5 @@ convert: # convert one_full_wav: False - batch_size: 3 - emphasis_magnitude: 1.5 \ No newline at end of file + batch_size: 1 + emphasis_magnitude: 1.2 \ No newline at end of file diff --git a/hparams/hparams.yaml b/hparams/hparams.yaml index 72e604d5..cc620de9 100644 --- a/hparams/hparams.yaml +++ b/hparams/hparams.yaml @@ -6,69 +6,35 @@ default: # lr: 0.0001 steps_per_epoch: 1 --- -mix_1: +mol/lj: train2: - num_mixtures: 1 ---- -mix_2: - train2: - num_mixtures: 2 ---- -mix_10: - train2: - num_mixtures: 10 ---- -mix_5_256: - train2: - hidden_units: 256 - num_mixtures: 10 ---- -mol_iu: - train2: - data_path: '/data/private/vc/datasets/IU/*_split/*.wav' - clip_value_max: 3. - clip_value_min: -3. - clip_norm: 50 - dropout_rate: 0.1 - lr_cyclic_margin: 0. - test2: - data_path: '/data/private/vc/datasets/IU/*_split/*.wav' + data_path: '/data/public/rw/LJSpeech-1.0_processed/*.wav' convert: - data_path: '/data/private/vc/datasets/test/2018.wav' - one_full_wav: True batch_size: 3 emphasis_magnitude: 1.3 --- -mol_1_iu: +mol/lj/2: train2: - data_path: '/data/private/vc/datasets/IU/*_split/*.wav' - clip_value_max: 3. - clip_value_min: -3. - clip_norm: 50 - dropout_rate: 0.1 - lr_cyclic_margin: 0. - n_mixtures: 1 - test2: - data_path: '/data/private/vc/datasets/IU/*_split/*.wav' + data_path: '/data/public/rw/LJSpeech-1.0_processed/*.wav' + hidden_units: 512 # alias: E + num_banks: 8 + num_highway_blocks: 16 + norm_type: 'ins' # a normalizer function. 
value: bn, ln, ins, or None + dropout_rate: 0 convert: - data_path: '/data/private/vc/datasets/test/2018.wav' - one_full_wav: True batch_size: 3 emphasis_magnitude: 1.3 --- -mol_log_iu: +mol/arctic: + convert: + batch_size: 3 + lr: 0.0001 +--- +mol/iu: train2: data_path: '/data/private/vc/datasets/IU/*_split/*.wav' - clip_value_max: 3. - clip_value_min: -3. - clip_norm: 50 - dropout_rate: 0.1 - lr_cyclic_margin: 0. lr: 0.0001 - test2: - data_path: '/data/private/vc/datasets/IU/*_split/*.wav' convert: data_path: '/data/private/vc/datasets/test/2018.wav' - one_full_wav: True - batch_size: 3 + batch_size: 1 emphasis_magnitude: 1.3 \ No newline at end of file diff --git a/models.py b/models.py index 808a57e3..b93477bc 100644 --- a/models.py +++ b/models.py @@ -2,7 +2,6 @@ # !/usr/bin/env python import tensorflow as tf -from tensorflow.contrib import distributions from tensorpack.graph_builder.model_desc import ModelDesc, InputDesc from tensorpack.tfutils import ( get_current_tower_context, optimizer, gradproc) @@ -12,7 +11,6 @@ from data_load import phns from hparam import hparam as hp from modules import prenet, cbhg, normalize -import sys class Net1(ModelDesc): @@ -25,13 +23,24 @@ def _get_inputs(self): def _build_graph(self, inputs): self.x_mfccs, self.y_ppgs = inputs + is_training = get_current_tower_context().is_training with tf.variable_scope('net1'): - self.ppgs, self.preds, self.logits = self.network(self.x_mfccs, get_current_tower_context().is_training) + self.ppgs, self.preds, self.logits = self.network(self.x_mfccs, is_training) self.cost = self.loss() + acc = self.acc() # summaries tf.summary.scalar('net1/train/loss', self.cost) - tf.summary.scalar('net1/train/acc', self.acc()) + tf.summary.scalar('net1/train/acc', acc) + + if not is_training: + # summaries + tf.summary.scalar('net1/eval/summ_loss', self.cost) + tf.summary.scalar('net1/eval/summ_acc', acc) + + # for confusion matrix + tf.reshape(self.y_ppgs, shape=(tf.size(self.y_ppgs),), 
name='net1/eval/y_ppg_1d') + tf.reshape(self.preds, shape=(tf.size(self.preds),), name='net1/eval/pred_ppg_1d') def _get_optimizer(self): lr = tf.get_variable('learning_rate', initializer=hp.train1.lr, trainable=False) @@ -73,14 +82,13 @@ def acc(self): class Net2(ModelDesc): - def __init__(self, batch_size): - self.net1 = Net1() - self.batch_size = batch_size def _get_inputs(self): - return [InputDesc(tf.float32, (self.batch_size, None, hp.default.n_mfcc), 'x_mfccs'), - InputDesc(tf.float32, (self.batch_size, None, hp.default.n_fft // 2 + 1), 'y_spec'), - InputDesc(tf.float32, (self.batch_size, None, hp.default.n_mels), 'y_mel'), ] + n_timesteps = (hp.default.duration * hp.default.sr) // hp.default.hop_length + 1 + + return [InputDesc(tf.float32, (None, n_timesteps, hp.default.n_mfcc), 'x_mfccs'), + InputDesc(tf.float32, (None, n_timesteps, hp.default.n_fft // 2 + 1), 'y_spec'), + InputDesc(tf.float32, (None, n_timesteps, hp.default.n_mels), 'y_mel'), ] def _build_graph(self, inputs): self.x_mfcc, self.y_spec, self.y_mel = inputs @@ -88,29 +96,35 @@ def _build_graph(self, inputs): is_training = get_current_tower_context().is_training # build net1 + self.net1 = Net1() with tf.variable_scope('net1'): self.ppgs, _, _ = self.net1.network(self.x_mfcc, is_training) + self.ppgs = tf.identity(self.ppgs, name='ppgs') + # build net2 with tf.variable_scope('net2'): - self.pred_spec_mu, self.pred_spec_logvar, self.pred_spec_phi = self.network(self.ppgs, is_training) + self.mu, self.log_var, self.log_pi = self.network(self.ppgs, is_training) self.cost = self.loss() - # build for conversion phase - self.convert() - # summaries tf.summary.scalar('net2/train/loss', self.cost) - # tf.summary.scalar('net2/train/lr', lr) - tf.summary.histogram('net2/train/mu', self.pred_spec_mu) - tf.summary.histogram('net2/train/var', tf.exp(self.pred_spec_logvar)) - tf.summary.histogram('net2/train/phi', self.pred_spec_phi) + tf.summary.histogram('net2/train/mu', self.mu) + 
tf.summary.histogram('net2/train/var', tf.exp(self.log_var)) + tf.summary.histogram('net2/train/pi', tf.exp(self.log_pi)) + + if not is_training: + tf.summary.scalar('net2/eval/summ_loss', self.cost) + + # build for conversion phase + self.convert() def _get_optimizer(self): gradprocs = [ tensorpack_extension.FilterGradientVariables('.*net2.*', verbose=False), - gradproc.MapGradient(lambda grad: tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max)), + gradproc.MapGradient( + lambda grad: tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max)), gradproc.GlobalNormClip(hp.train2.clip_norm), # gradproc.PrintGradient(), # gradproc.CheckGradient(), @@ -137,54 +151,58 @@ def network(self, ppgs, is_training): # CBHG2: linear-scale out = tf.layers.dense(pred_mel, hp.train2.hidden_units // 2) # (N, T, n_mels) out = cbhg(out, hp.train2.num_banks, hp.train2.hidden_units // 2, - hp.train2.num_highway_blocks, hp.train2.norm_type, is_training, - scope="cbhg_linear") + hp.train2.num_highway_blocks, hp.train2.norm_type, is_training, scope="cbhg_linear") - batch_size, _, num_bins = self.y_spec.get_shape().as_list() - num_units = num_bins * hp.train2.n_mixtures - out = tf.layers.dense(out, num_units * 3, bias_initializer=tf.random_uniform_initializer(minval=-3., maxval=3.)) + _, n_timesteps, n_bins = self.y_spec.get_shape().as_list() + n_units = n_bins * hp.train2.n_mixtures + out = tf.layers.dense(out, n_units * 3, bias_initializer=tf.random_uniform_initializer(minval=-3., maxval=3.)) - mu = tf.nn.sigmoid(out[..., :num_units]) - mu = tf.reshape(mu, shape=(batch_size, -1, num_bins, hp.train2.n_mixtures)) # (N, T, 1+hp.n_fft//2, n_mixtures) + mu = tf.nn.sigmoid(out[..., :n_units]) + mu = tf.reshape(mu, shape=(-1, n_timesteps, n_bins, hp.train2.n_mixtures)) # (N, T, 1+hp.n_fft//2, n_mixtures) - logvar = tf.clip_by_value(out[..., num_units: 2 * num_units], clip_value_min=-7, clip_value_max=7) - logvar = tf.reshape(logvar, shape=(batch_size, -1, 
num_bins, hp.train2.n_mixtures)) # (N, T, 1+hp.n_fft//2, n_mixtures) + log_var = tf.maximum(out[..., n_units: 2 * n_units], -7.0) + log_var = tf.reshape(log_var, + shape=(-1, n_timesteps, n_bins, hp.train2.n_mixtures)) # (N, T, 1+hp.n_fft//2, n_mixtures) - # normalize to prevent softmax output to be NaN. - pi = tf.reshape(out[..., 2 * num_units: 3 * num_units], shape=(batch_size, -1, num_bins, hp.train2.n_mixtures)) # (N, T, 1+hp.n_fft//2, n_mixtures) - pi = normalize(pi, type='ins', is_training=get_current_tower_context().is_training, scope='normalize_phi') - pi = tf.nn.softmax(pi) + log_pi = tf.reshape(out[..., 2 * n_units: 3 * n_units], + shape=(-1, n_timesteps, n_bins, hp.train2.n_mixtures)) # (N, T, 1+hp.n_fft//2, n_mixtures) + log_pi = normalize(log_pi, type='ins', is_training=get_current_tower_context().is_training, scope='normalize_pi') + log_pi = tf.nn.log_softmax(log_pi) - return mu, logvar, pi + return mu, log_var, log_pi def loss(self): - # negative log likelihood - logistic_dists = [] - for i in range(hp.train2.n_mixtures): - mu = self.pred_spec_mu[..., i] - logvar = self.pred_spec_logvar[..., i] - logistic_dist = distributions.Logistic(mu, tf.exp(logvar)) - logistic_dists.append(logistic_dist) - cat = distributions.Categorical(probs=self.pred_spec_phi) - mixture_dist = distributions.Mixture(cat=cat, components=logistic_dists) - cdf_pos = mixture_dist.cdf(value=self.y_spec + hp.train2.mol_step) - cdf_neg = mixture_dist.cdf(value=self.y_spec - hp.train2.mol_step) - # FIXME: why minus prob? 
- prob = cdf_pos - cdf_neg - prob /= hp.train2.mol_step * 2 - prob = tf.maximum(prob, sys.float_info.epsilon) - - tf.summary.histogram('net2/train/cdf_pos', cdf_pos) - tf.summary.histogram('net2/train/cdf_neg', cdf_neg) - tf.summary.scalar('net2/train/min_cdf_pos', tf.reduce_min(cdf_pos)) - tf.summary.scalar('net2/train/min_cdf_neg', tf.reduce_min(cdf_neg)) - tf.summary.histogram('net2/train/prob', prob) - tf.summary.scalar('net2/prob_min', tf.reduce_min(prob)) - - loss_mle = -tf.reduce_mean(tf.log(prob)) - - mean = tf.reduce_sum(self.pred_spec_mu * self.pred_spec_phi, axis=-1, keep_dims=True) - loss_mix = tf.reduce_sum(self.pred_spec_phi * tf.squared_difference(self.pred_spec_mu, mean), axis=-1) + y = tf.expand_dims(self.y_spec, axis=-1) + y = tf.concat([y]*hp.train2.n_mixtures, axis=-1) + + centered_x = y - self.mu + inv_stdv = tf.exp(-self.log_var) + plus_in = inv_stdv * (centered_x + hp.train2.mol_step) + min_in = inv_stdv * (centered_x - hp.train2.mol_step) + cdf_plus = tf.sigmoid(plus_in) + cdf_min = tf.sigmoid(min_in) + + # log probability for edge case + log_cdf_plus = plus_in - tf.nn.softplus(plus_in) + log_one_minus_cdf_min = -tf.nn.softplus(min_in) + + # probability for all other cases + cdf_delta = cdf_plus - cdf_min + + log_prob = tf.where(y < 0.001, log_cdf_plus, tf.where(y > 0.999, log_one_minus_cdf_min, tf.log(tf.maximum(cdf_delta, 1e-12)))) + + tf.summary.histogram('net2/train/prob', tf.exp(log_prob)) + + log_prob = log_prob + self.log_pi + + tf.summary.histogram('net2/prob_max', tf.reduce_max(tf.exp(log_prob), axis=-1)) + + log_prob = tf.reduce_logsumexp(log_prob, axis=-1) + + loss_mle = -tf.reduce_mean(log_prob) + + mean = tf.reduce_sum(self.mu * self.log_pi, axis=-1, keep_dims=True) + loss_mix = tf.reduce_sum(self.log_pi * tf.squared_difference(self.mu, mean), axis=-1) loss_mix = -tf.reduce_mean(loss_mix) lamb = 0 @@ -196,6 +214,6 @@ def loss(self): return loss def convert(self): - argmax = tf.one_hot(tf.argmax(self.pred_spec_phi, axis=-1), 
hp.train2.n_mixtures) - pred_spec = tf.reduce_sum(self.pred_spec_mu * argmax, axis=-1, name='pred_spec') - return pred_spec \ No newline at end of file + argmax = tf.one_hot(tf.argmax(self.log_pi, axis=-1), hp.train2.n_mixtures) + pred_spec = tf.reduce_sum(self.mu * argmax, axis=-1, name='pred_spec') + return pred_spec diff --git a/notes/MoL.md b/notes/MoL.md index 23f77ed7..088f89ef 100644 --- a/notes/MoL.md +++ b/notes/MoL.md @@ -6,9 +6,19 @@ loss가 한계에 다다르면 더 이상 떨어지지 않음. variance도 같이 학습해야 한다. 처음에 1로 고정하고 mu만 학습했더니 분포의 표현력이 좋지 않아 높은 likelihood를 얻을 수 없었다. -log(prob + e)에서 e의 값의 크기가 학습되는 likelihood에 영향을 미친다. -더 작은 e일수록 더 큰 likelihood를 얻는다. e를 없애고도 학습할 수 있는 방법 도입해야 함. +inference시 dominant dist.를 고름. 따라서 학습시에도 여러 분포 중에 나온 값 중 최대를 maximize하는 식으로 함. +mixture를 하나의 분포로 보기보다는 여러 개의 서로 다른 single dist.가 있는 것으로 보는게 맞음. +* logsumexp -logsumexp +edge 케이스 -pixel cnn++을 참고하자 \ No newline at end of file +pi softmax 앞에 normalization 중요하다. + +--- +documentation +- why mol? + - the importance on multi modal modeling in generation task. +- experiments on single and multi modal + - step-by-step explanation on code + - tips + - result \ No newline at end of file diff --git a/tensorpack_extension.py b/tensorpack_extension.py index 22126ac9..aa94ad5a 100644 --- a/tensorpack_extension.py +++ b/tensorpack_extension.py @@ -4,8 +4,37 @@ import re from tensorpack.utils import logger from tensorpack.tfutils.gradproc import GradientProcessor +from tensorpack.callbacks.monitor import JSONWriter +import tensorflow as tf +# class AudioWriter(TrainingMonitor): +# """ +# Write summaries to TensorFlow event file. +# """ +# def __new__(cls): +# if logger.get_logger_dir(): +# return super(TFEventWriter, cls).__new__(cls) +# else: +# logger.warn("logger directory was not set. 
Ignore TFEventWriter.") +# return NoOpMonitor() +# +# def _setup_graph(self): +# self._writer = tf.summary.FileWriter(logger.get_logger_dir(), graph=tf.get_default_graph()) +# +# def process_summary(self, summary): +# self._writer.add_summary(summary, self.global_step) +# +# def process_event(self, evt): +# self._writer.add_event(evt) +# +# def _trigger(self): # flush every epoch +# self._writer.flush() +# +# def _after_train(self): +# self._writer.close() +# + class FilterGradientVariables(GradientProcessor): """ Skip the update of certain variables and print a warning. diff --git a/train1.py b/train1.py index e0dd8eef..273fea80 100644 --- a/train1.py +++ b/train1.py @@ -28,6 +28,8 @@ def train(args, logdir): # dataflow df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size) + ckpt1 = tf.train.latest_checkpoint(logdir1) + # set logger for event and model saver logger.set_logger_dir(logdir) diff --git a/train2.py b/train2.py index 3b32a28d..9780d897 100644 --- a/train2.py +++ b/train2.py @@ -4,32 +4,28 @@ from __future__ import print_function import argparse -import math +import os import tensorflow as tf from tensorpack.callbacks.saver import ModelSaver -from tensorpack.graph_builder.distributed import DataParallelBuilder -from tensorpack.graph_builder.utils import LeastLoadedDeviceSetter from tensorpack.input_source.input_source import QueueInput -from tensorpack.input_source.input_source import StagingInput from tensorpack.tfutils.sessinit import ChainInit from tensorpack.tfutils.sessinit import SaverRestore -from tensorpack.tfutils.tower import TowerFuncWrapper from tensorpack.train.interface import TrainConfig from tensorpack.train.interface import launch_train_with_config -from tensorpack.train.tower import TowerTrainer from tensorpack.train.trainers import SyncMultiGPUTrainerReplicated from tensorpack.utils import logger +from convert import ConvertCallback from data_load import Net2DataFlow from hparam import hparam as hp from models import Net2 
-import os +from utils import remove_all_files def train(args, logdir1, logdir2): # model - model = Net2(batch_size=hp.train2.batch_size) + model = Net2() # dataflow df = Net2DataFlow(hp.train2.data_path, hp.train2.batch_size) @@ -57,7 +53,7 @@ def train(args, logdir1, logdir2): callbacks=[ # TODO save on prefix net2 ModelSaver(checkpoint_dir=logdir2), - # TODO EvalCallback() + # ConvertCallback(logdir2, hp.train2.test_per_epoch), ], max_epoch=hp.train2.num_epochs, steps_per_epoch=hp.train2.steps_per_epoch, @@ -84,6 +80,7 @@ def get_arguments(): parser.add_argument('case2', type=str, help='experiment case name of train2') parser.add_argument('-ckpt', help='checkpoint to load model.') parser.add_argument('-gpu', help='comma separated list of GPU(s) to use.') + parser.add_argument('-r', action='store_true', help='start training from the beginning.') arguments = parser.parse_args() return arguments @@ -94,6 +91,9 @@ def get_arguments(): logdir_train1 = '{}/{}/train1'.format(hp.logdir_path, args.case1) logdir_train2 = '{}/{}/train2'.format(hp.logdir_path, args.case2) + if args.r: + remove_all_files(logdir_train2) + print('case1: {}, case2: {}, logdir1: {}, logdir2: {}'.format(args.case1, args.case2, logdir_train1, logdir_train2)) train(args, logdir1=logdir_train1, logdir2=logdir_train2) diff --git a/utils.py b/utils.py index b8a4d110..32dd2961 100644 --- a/utils.py +++ b/utils.py @@ -24,8 +24,8 @@ def split_path(path): return basepath, filename, extension -def remove_all_files(prefix): - files = glob.glob(prefix + '*') +def remove_all_files(path): + files = glob.glob('{}/*'.format(path)) for f in files: os.remove(f)