# Configuration file
[acoustic_network_params]
# Number of layers
# Recommended value : 5 to 9 (see https://arxiv.org/pdf/1609.05935v2.pdf)
num_layers : 5
# Number of LSTM cells per layer
# Recommended value : 1024 (see https://arxiv.org/pdf/1609.05935v2.pdf)
hidden_size : 1024
# Dropout values during training
# These values define the probability of keeping a cell active during training
# Recommended values are 0.8 for input (dropping 20%) and 0.5 for output (dropping 50%)
# http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf
dropout_input_keep_prob : 0.8
dropout_output_keep_prob : 0.5
# Number of training examples in one batch
# You should find the greatest value (probably between 2 and 50) that keeps your graphics
# card fully used (check that your GPU usage never drops to 0%, for example)
batch_size : 10
# Number of mini-batches inside a batch (the gradient update happens only between batches)
# Recommended value : batch_size x mini_batch_size = 32 or more (see https://arxiv.org/pdf/1609.05935v2.pdf)
mini_batch_size : 3
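# Note : with the values above, each gradient update aggregates
# batch_size x mini_batch_size = 10 x 3 = 30 examples, just below the recommended 32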
# Recommended learning_rate is 0.0003; higher values (0.001 for example) may give faster
# convergence at the beginning but usually end up with the error rate oscillating
learning_rate : 0.0003
lr_decay_factor : 0.33
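# Note : the decay is presumably multiplicative, e.g. one decay step would give
# 0.0003 x 0.33 = ~0.0001 (assumption : when the decay is applied depends on the training code)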
grad_clip : 1
# Signal processing mode, options are mfcc or fbank
# - mfcc is Mel Frequency Cepstral Coefficients (20-dim input)
# - fbank is mel filterbank (including delta and double-delta, 120-dim input)
# mfcc is essentially fbank with an extra DCT (discrete cosine transform) applied
# delta and double-delta represent the 'velocity' and 'acceleration' of the input signal
# Recommended value is fbank, which gives better training results
signal_processing : fbank
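# Note : the 120 fbank dims presumably break down as 40 mel filterbanks x 3
# (static + delta + double-delta); this split is an assumption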
# Audio language. Currently supported : english
language : english
# The probability that the internal state of the RNN will be reset to 0 at the end of a batch. For example :
# 1.0 means the RNN internal state will always be reset at the end of each batch
# 0.25 means there is a 25% chance that the RNN internal state will be reset at the end of each batch
rnn_state_reset_ratio : 0.25
[lm_network_params]
num_layers : 3
hidden_size : 34
dropout : 0.9
batch_size : 1
learning_rate : 1e-5
lr_decay_factor : 0.97
grad_clip : 5
[general]
# Whether to reuse the config settings if pre-existing ones are found in the checkpoint path
use_config_file_if_checkpoint_exists : True
# Frequency at which to save the model
steps_per_checkpoint : 100
# Frequency at which to evaluate on the test set. This must be a multiple of steps_per_checkpoint
steps_per_evaluation : 1000
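# (1000 is valid here : it is 10 x steps_per_checkpoint, so evaluation happens every 10th checkpoint)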
checkpoint_dir : data/checkpoints/
[training]
# Note : supported datasets are LibriSpeech, Shtooka, Vystadial_2013 or TEDLIUM
# Training dataset dirs (comma separated)
training_dataset_dirs : data/Shtooka/train, data/LibriSpeech/train, data/TEDLIUM_release2/train
# Training dataset cache file
training_filelist_cache : data/train_shtooka_Libri_TED.data
# Test dataset dirs (optional, comma separated)
test_dataset_dirs : data/LibriSpeech/test
# Fraction of the dataset used for training (optional)
# If the test and training datasets are not separated, a fraction of the training data can be held out as a test set
# Should not be provided if test_dataset_dirs is provided
#train_frac : 0.98
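# Example : train_frac : 0.98 would use 98% of the listed files for training and keep the remaining 2% for testing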
# max_input_seq_length is the maximum number of 0.01s chunks we allow in a single training (or test) example
# max_target_seq_length is the maximum number of letters we allow in a single training (or test) example output
# Both must be less than 65535 (values are stored as 16-bit integers for reduced memory consumption)
# Advice :
# - These two values need to be consistent with each other
# - These values depend on the dataset used for training
# - For reduced memory consumption it is possible to reduce batch_size
# Suggested values (max_input_seq_length / max_target_seq_length) :
# LibriSpeech or TEDLIUM training dataset : 3510 / 600
# Shtooka training dataset : 350 / 40
# Vystadial_2013 training dataset : 900 / 100
max_input_seq_length : 3510
max_target_seq_length : 600
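# Note : at 0.01s per chunk, max_input_seq_length = 3510 allows up to ~35 seconds of audio per example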
# Create a tensorboard file during training (set a directory, or leave blank to disable; the directory must already exist, it won't be created)
# Launch tensorboard in another terminal with : "tensorboard --logdir=data/tensorboard/"
tensorboard_dir : data/tensorboard
# Apply batch normalization to the data during training (True / False)
batch_normalization : False
# Order each dataset by file size, ascending. Allowed values :
# True : order ascending (improves compute time, but lower quality for the resulting RNN)
# False : shuffle the dataset (better quality)
# First_run_only : order ascending for the first epoch and shuffle for each new epoch (improves compute time on the
#                  first epoch, but lower quality for the resulting RNN)
dataset_size_ordering : False
[logging]
# Set a log file; if left empty, log messages will be output to the screen
log_file : data/full_data.log
# Set a log level : DEBUG, INFO, WARNING (default), ERROR or CRITICAL
log_level : INFO