Commit

copy some old data processing scripts
zxie committed Feb 23, 2016
1 parent c1cc1c6 commit 3d3bd9c
Showing 3 changed files with 241 additions and 0 deletions.
143 changes: 143 additions & 0 deletions util/swbd/write_alis.py
@@ -0,0 +1,143 @@
import collections

kaldi_base = "/scail/group/deeplearning/speech/awni/kaldi-stanford/kaldi-trunk/egs/swbd/s5b/"

# Symbols
laugh = '[laughter]'
noise = '[noise]'
voc_noise = '[vocalized-noise]'
space = '[space]'

# Spell out integers
integers = ['zero','one','two','three','four','five','six','seven','eight','nine']

def unique_tokens():
    """
    Reads swbd transcripts and stores unique tokens.
    """
    with open('data/train/text', 'r') as fid:
        lines = [l.strip().split()[1:] for l in fid.readlines()]

    tokens = collections.defaultdict(int)
    for i, line in enumerate(lines):
        for l in line:
            if l == laugh or l == noise or l == voc_noise:
                tokens[l] += 1
            else:
                for t in list(l):
                    if t == '_':
                        continue
                    try:
                        int(t)
                    except ValueError:
                        # Count the character only if it is not a digit
                        tokens[t] += 1
        tokens[space] += 1
    print "Parsed %d lines." % len(lines)

    fid = open('ctc-utils/chars.txt', 'w')
    for i, k in enumerate(tokens.keys()):
        fid.write(k + ' ' + str(i + 1) + '\n')
    fid.close()

    return tokens

def tokenize(labels, file='data/train/text_ctc'):
    """
    Reads swbd transcripts and builds a mapping from swbd utterance
    key to a list of integer labels.
    """
    with open(file, 'r') as fid:
        lines = [l.strip().split() for l in fid.readlines()]
    data = dict((l[0], l[1:]) for l in lines)

    # Spelled-out digits, e.g. int_labels[3] is the label sequence for "three"
    int_labels = [[labels[l] for l in list(i)] for i in integers]

    # for every utterance
    for k, line in data.iteritems():
        newline = []
        # for every word in the transcription
        for i, word in enumerate(line):
            # for [noise] etc.
            if word in labels.keys():
                newline.append(labels[word])
            else:
                # for every char in the word
                for j, char in enumerate(list(word)):
                    try:
                        newline.append(labels[char])
                    except KeyError:
                        # Digits are spelled out, followed by a space
                        newline += int_labels[int(char)]
                        if j < len(list(word)) - 1:
                            newline.append(labels[space])

            # Add a space in between every word
            if i < len(line) - 1:
                newline.append(labels[space])

        data[k] = newline
    return data
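# Illustrative note (not part of the original script), assuming chars.txt
# contains the lowercase letters and '[space]': for the word "b4", tokenize
# emits the labels for  b f o u r  -- the digit '4' is replaced by its
# spelled-out form via int_labels, and no trailing space is added since
# '4' is the last character of the word.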

def write_alis(utts, file=kaldi_base+'exp/train_ctc', numfiles=20):
    """
    Takes an utterance-to-alignment mapping and splits it up
    into alignment files according to the file structure of
    the training set.
    """
    for f in range(1, numfiles + 1):
        print "writing file %d..." % f
        with open(file + '/keys%d.txt' % f, 'r') as fid:
            keys = [l.strip().split()[0] for l in fid.readlines()]

        with open(file + '/alis%d.txt' % f, 'w') as fid:
            for k in keys:
                fid.write(k + " " + " ".join(utts[k]) + '\n')

def load_labels():
    """
    Loads the file with the label-to-integer mapping. Use
    unique_tokens to create the file.
    """
    with open('ctc-utils/chars.txt', 'r') as fid:
        labels = dict(tuple(l.strip().split()) for l in fid.readlines())
    return labels

def compute_bigrams():
    """
    Compute bigrams with add-one smoothing. Save in bigrams.bin.
    """
    import cPickle as pickle
    import numpy as np
    fid_bg = open(kaldi_base + 'exp/train_ctc/bigrams.bin', 'w')
    labels = load_labels()
    numLabels = len(labels.keys())
    # Start all counts at 1 for add-one smoothing
    bigrams = np.ones((numLabels, numLabels))
    numfiles = 384

    for f in range(1, numfiles + 1):
        print "Reading alis %d." % f
        with open('exp/train_ctc/alis%d.txt' % f, 'r') as fid:
            alis = [l.strip().split()[1:] for l in fid.readlines()]
        for v in alis:
            # Count transitions: row = current label, column = previous label
            for i, j in zip(v[1:], v[:-1]):
                bigrams[int(i) - 1, int(j) - 1] += 1

    # Normalize so each column sums to 1 (distribution over the next label)
    bigrams = bigrams / np.sum(bigrams, axis=0)
    pickle.dump(bigrams, fid_bg)
    fid_bg.close()

    return bigrams

if __name__ == '__main__':
    # unique_tokens()
    labelset = load_labels()
    data = [('train', 384), ('dev', 20)]

    for name, num in data:
        utts = tokenize(labelset, file=kaldi_base + 'data/%s/text_ctc' % name)
        write_alis(utts, file=kaldi_base + 'exp/%s_ctc' % name, numfiles=num)
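A minimal sketch (not part of this commit) of how the resulting bigrams.bin could be sanity-checked after running compute_bigrams, assuming the same kaldi_base path used above: since the counts are normalized over axis 0, every column of the smoothed matrix should sum to 1, i.e. column j holds an estimate of the probability of the next label given previous label j+1.

import cPickle as pickle
import numpy as np

kaldi_base = "/scail/group/deeplearning/speech/awni/kaldi-stanford/kaldi-trunk/egs/swbd/s5b/"

# Load the pickled bigram matrix written by compute_bigrams()
with open(kaldi_base + 'exp/train_ctc/bigrams.bin', 'r') as fid:
    bigrams = pickle.load(fid)

# Each column should be a proper conditional distribution over the next label
print bigrams.shape
print np.allclose(bigrams.sum(axis=0), 1.0)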



24 changes: 24 additions & 0 deletions util/swbd/write_dev_test.sh
@@ -0,0 +1,24 @@
#!/bin/bash

kaldi_dir=/scail/group/deeplearning/speech/awni/kaldi-stanford/kaldi-trunk/egs/swbd/s5b

. $kaldi_dir/path.sh ## Source the tools/utils

for x in 'dev' 'eval2000'
do
    dir=exp/${x}_ctc        # location to store nn data and models
    data=data/$x            # location of the data
    train_dir=exp/train_ctc # location of the data used for training
    echo "writing" $data

    transform=$train_dir/$(readlink $train_dir/final.feature_transform)
    echo $transform

    mkdir $dir 2>/dev/null

    feats="ark:copy-feats scp:$data/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |"

    feat-write --utts-per-file=200 --feature-transform=$transform "$feats" $kaldi_dir/$dir/
done


74 changes: 74 additions & 0 deletions util/swbd/write_feats.sh
@@ -0,0 +1,74 @@
#!/bin/bash

# feature config
norm_vars=false # normalize the FBANKs (CVN)
splice_lr=20 # temporal splicing
splice_step=1 # stepsize of the splicing (1 is no gap between frames, just like splice_feats does)

echo "$0 $@" # Print the command line for logging

[ -f path.sh ] && . ./path.sh;

data=data/train_nodup
dir=exp/train_ctc
kaldi_dir=/scail/group/deeplearning/speech/awni/kaldi-stanford/kaldi-trunk/egs/swbd/s5b

echo "$0 [info]: Build Training Data"
printf "\t dir : $dir \n"
printf "\t Train-set : $data \n"

mkdir -p $dir/{log,nnet}

# shuffle the list
echo "Preparing train lists"
cat $data/feats.scp | $kaldi_dir/utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp

# print the list sizes
wc -l $dir/train.scp

#get feature dim
echo -n "Getting feature dim : "
feat_dim=$(feat-to-dim --print-args=false scp:$dir/train.scp -)
echo $feat_dim

#read the features
feats="ark:copy-feats scp:$dir/train.scp ark:- |"

#add per-speaker CMVN
echo "Will use CMVN statistics : $data/cmvn.scp"
[ ! -r $data/cmvn.scp ] && echo "Cannot find cmvn stats $data/cmvn.scp" && exit 1;
cmvn="scp:$data/cmvn.scp"
feats="$feats apply-cmvn --print-args=false --norm-vars=$norm_vars --utt2spk=ark:$data/utt2spk $cmvn ark:- ark:- |"
# keep track of norm_vars option
echo "$norm_vars" >$dir/norm_vars

# Generate the splice transform
echo "Using splice +/- $splice_lr , step $splice_step"
feature_transform=$dir/tr_splice$splice_lr-$splice_step.nnet
$kaldi_dir/utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice_lr --splice-step=$splice_step > $feature_transform

# keep track of feat_type
echo $feat_type > $dir/feat_type

#renormalize the input to zero mean and unit variance
cmvn_g="$dir/cmvn_glob.mat"
echo "Renormalizing input features by : $cmvn_g"
compute-cmvn-stats --binary=false "$feats nnet-forward $feature_transform ark:- ark:- |" $cmvn_g 2>${cmvn_g}_log || exit 1
#convert the global cmvn stats to nnet format
cmvn-to-nnet --binary=false $cmvn_g $cmvn_g.nnet 2>$cmvn_g.nnet_log || exit 1;
#append matrix to feature_transform
{
    feature_transform_old=$feature_transform
    feature_transform=${feature_transform%.nnet}_cmvn-g.nnet
    cp $feature_transform_old $feature_transform
    cat $cmvn_g.nnet >> $feature_transform
}

###### MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ######
(cd $dir; ln -s $(basename $feature_transform) final.feature_transform )

###### WRITE DATA ######
feat-write ${feature_transform:+ --feature-transform=$feature_transform} ${use_gpu_id:+ --use-gpu-id=$use_gpu_id} --utts-per-file=500 "$feats" "$kaldi_dir/$dir/"

echo "Succeeded building data."
