copy some old data processing scripts
Showing 3 changed files with 241 additions and 0 deletions.
@@ -0,0 +1,143 @@
import collections

kaldi_base = "/scail/group/deeplearning/speech/awni/kaldi-stanford/kaldi-trunk/egs/swbd/s5b/"

# Symbols
laugh = '[laughter]'
noise = '[noise]'
voc_noise = '[vocalized-noise]'
space = '[space]'

# Spell out integers
integers = ['zero','one','two','three','four','five','six','seven','eight','nine']
def unique_tokens():
    """
    Reads swbd transcripts and stores unique tokens.
    """
    with open('data/train/text','r') as fid:
        lines = [l.strip().split()[1:] for l in fid.readlines()]

    tokens = collections.defaultdict(int)
    for i,line in enumerate(lines):
        for l in line:
            if l == laugh or l == noise or l == voc_noise:
                tokens[l] += 1
            else:
                for t in list(l):
                    if t == '_':
                        continue
                    try:
                        int(t)
                    except ValueError:
                        # Digits are ignored; count every other character
                        tokens[t] += 1
        tokens[space] += 1
    print "Parsed %d lines." % i

    fid = open('ctc-utils/chars.txt','w')
    for i,k in enumerate(tokens.keys()):
        fid.write(k + ' ' + str(i+1) + '\n')
    fid.close()

    return tokens
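For reference, the chars.txt file written above pairs each token with a 1-based integer id, one per line. A few illustrative entries (made up, since the actual characters and ids depend on the transcripts):

[noise] 1
[laughter] 2
a 3
b 4
[space] 5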
def tokenize(labels, file='data/train/text_ctc'):
    """
    Reads swbd transcripts and builds a mapping from swbd
    utterance key to list of integer labels.
    """

    with open(file,'r') as fid:
        lines = [l.strip().split() for l in fid.readlines()]
        data = dict((l[0],l[1:]) for l in lines)

    int_labels = [[labels[l] for l in list(i)] for i in integers]

    # for every utterance
    for k,line in data.iteritems():
        newline = []
        # for every word in transcription
        for i,word in enumerate(line):
            # for [noise] etc
            if word in labels.keys():
                newline.append(labels[word])
            else:
                # for every char in word
                for j,char in enumerate(list(word)):
                    try:
                        newline.append(labels[char])
                    except KeyError:
                        # Add spelled out integer followed by space
                        newline += int_labels[int(char)]
                        if j < len(list(word)) - 1:
                            newline.append(labels[space])

            # Add a space in between every word
            if i < len(line) - 1:
                newline.append(labels[space])

        data[k] = newline
    return data
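A minimal, self-contained sketch of the digit spelling step above. The label ids are invented here; the real ones come from ctc-utils/chars.txt:

# Hypothetical label ids; real ids are read from ctc-utils/chars.txt
labels = {'o': '1', 'n': '2', 'e': '3', '[space]': '4'}
integers = ['zero','one','two','three','four','five','six','seven','eight','nine']
# The digit '1' is expanded to the labels of its spelled-out form 'one'
spelled = [labels[c] for c in list(integers[1])]
print spelled   # -> ['1', '2', '3']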
def write_alis(utts, file=kaldi_base+'exp/train_ctc', numfiles=20):
    """
    Takes utterance to alignment mapping and splits it up
    into alignment files according to file structure of
    training set.
    """
    for f in range(1,numfiles+1):
        print "writing file %d..." % f
        with open(file+'/keys%d.txt'%f,'r') as fid:
            keys = [l.strip().split()[0] for l in fid.readlines()]

        with open(file+'/alis%d.txt'%f,'w') as fid:
            for k in keys:
                fid.write(k + " " + " ".join(utts[k]) + '\n')
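Each line of the resulting alisN.txt files is an utterance key followed by its space-separated label ids; an illustrative (made-up) line:

sw02001-A_000098-001156 7 12 4 12 9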
def load_labels():
    """
    Loads file with label to integer mapping. Use
    unique_tokens to create file.
    """
    with open('ctc-utils/chars.txt','r') as fid:
        labels = dict(tuple(l.strip().split()) for l in fid.readlines())
    return labels
def compute_bigrams():
    """
    Compute bigrams with smoothing. Save in bigrams.bin.
    """
    import cPickle as pickle
    import numpy as np
    fid_bg = open(kaldi_base+'exp/train_ctc/bigrams.bin','w')
    labels = load_labels()
    numLabels = len(labels.keys())
    bigrams = np.ones((numLabels,numLabels))
    numfiles = 384

    for f in range(1,numfiles+1):
        print "Reading alis %d." % f
        with open('exp/train_ctc/alis%d.txt'%f,'r') as fid:
            alis = [l.strip().split()[1:] for l in fid.readlines()]
        for v in alis:
            for i,j in zip(v[1:],v[:-1]):
                bigrams[int(i)-1,int(j)-1] += 1

    bigrams = bigrams/np.sum(bigrams,axis=0)
    pickle.dump(bigrams,fid_bg)
    fid_bg.close()

    return bigrams
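Since the bigram matrix is dumped with cPickle, it can be read back the same way. A minimal sketch, assuming the same path used above:

import cPickle as pickle
# Columns of the loaded matrix sum to 1 (normalized over axis 0 above)
with open(kaldi_base+'exp/train_ctc/bigrams.bin','r') as fid:
    bigrams = pickle.load(fid)
print bigrams.shape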
if __name__ == '__main__':
    # unique_tokens()
    labelset = load_labels()
    data = [('train',384),('dev',20)]

    for name,num in data:
        utts = tokenize(labelset, file=kaldi_base+'data/%s/text_ctc'%name)
        write_alis(utts, file=kaldi_base+'exp/%s_ctc'%name, numfiles=num)
@@ -0,0 +1,24 @@
#!/bin/bash

kaldi_dir=/scail/group/deeplearning/speech/awni/kaldi-stanford/kaldi-trunk/egs/swbd/s5b

. $kaldi_dir/path.sh ## Source the tools/utils

for x in 'dev' 'eval2000'
do
    dir=exp/${x}_ctc        # location to store nn data and models
    data=data/$x            # location of data
    train_dir=exp/train_ctc # location of the data used for training
    echo "writing" $data

    transform=$train_dir/$(readlink $train_dir/final.feature_transform)
    echo $transform

    mkdir $dir 2>/dev/null

    feats="ark:copy-feats scp:$data/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |"

    feat-write --utts-per-file=200 --feature-transform=$transform "$feats" $kaldi_dir/$dir/
done
@@ -0,0 +1,74 @@
#!/bin/bash

# feature config
norm_vars=false # normalize the FBANKs (CVN)
splice_lr=20    # temporal splicing
splice_step=1   # stepsize of the splicing (1 is no gap between frames, just like splice_feats does)

echo "$0 $@"    # Print the command line for logging

[ -f path.sh ] && . ./path.sh;

data=data/train_nodup
dir=exp/train_ctc
kaldi_dir=/scail/group/deeplearning/speech/awni/kaldi-stanford/kaldi-trunk/egs/swbd/s5b

echo "$0 [info]: Build Training Data"
printf "\t dir : $dir \n"
printf "\t Train-set : $data \n"

mkdir -p $dir/{log,nnet}

# shuffle the list
echo "Preparing train lists"
cat $data/feats.scp | $kaldi_dir/utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp

# print the list sizes
wc -l $dir/train.scp

# get feature dim
echo -n "Getting feature dim : "
feat_dim=$(feat-to-dim --print-args=false scp:$dir/train.scp -)
echo $feat_dim

# read the features
feats="ark:copy-feats scp:$dir/train.scp ark:- |"

# add per-speaker CMVN
echo "Will use CMVN statistics : $data/cmvn.scp"
[ ! -r $data/cmvn.scp ] && echo "Cannot find cmvn stats $data/cmvn.scp" && exit 1;
cmvn="scp:$data/cmvn.scp"
feats="$feats apply-cmvn --print-args=false --norm-vars=$norm_vars --utt2spk=ark:$data/utt2spk $cmvn ark:- ark:- |"
# keep track of norm_vars option
echo "$norm_vars" >$dir/norm_vars

# generate the splice transform
echo "Using splice +/- $splice_lr , step $splice_step"
feature_transform=$dir/tr_splice$splice_lr-$splice_step.nnet
$kaldi_dir/utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice_lr --splice-step=$splice_step > $feature_transform

# keep track of feat_type
echo $feat_type > $dir/feat_type

# renormalize the input to zero mean and unit variance
cmvn_g="$dir/cmvn_glob.mat"
echo "Renormalizing input features by : $cmvn_g"
compute-cmvn-stats --binary=false "$feats nnet-forward $feature_transform ark:- ark:- |" $cmvn_g 2>${cmvn_g}_log || exit 1
# convert the global cmvn stats to nnet format
cmvn-to-nnet --binary=false $cmvn_g $cmvn_g.nnet 2>$cmvn_g.nnet_log || exit 1;
# append matrix to feature_transform
{
    feature_transform_old=$feature_transform
    feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
    cp $feature_transform_old $feature_transform
    cat $cmvn_g.nnet >> $feature_transform
}

###### MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ######
(cd $dir; ln -s $(basename $feature_transform) final.feature_transform )

###### WRITE DATA ######
feat-write ${feature_transform:+ --feature-transform=$feature_transform} ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} --utts-per-file=500 "$feats" "$kaldi_dir/$dir/"

echo "Succeeded building data."
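For orientation, after this script runs exp/train_ctc would contain roughly the files below. This is a hedged sketch derived from the commands above; the archives produced by feat-write are omitted since their naming is up to that tool:

exp/train_ctc/
  log/  nnet/
  train.scp  norm_vars  feat_type
  tr_splice20-1.nnet
  tr_splice20-1_cmvn-g.nnet
  cmvn_glob.mat  cmvn_glob.mat.nnet
  final.feature_transform -> tr_splice20-1_cmvn-g.nnet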