Skip to content

Commit

Permalink
Fix simplify_arpabet bug
Browse files: browse the repository at this point in the history
  • Loading branch information
zsh01 committed Jun 23, 2021
1 parent fb61e71 commit 6e95fc1
Showing 1 changed file with 18 additions and 2 deletions.
20 changes: 18 additions & 2 deletions scoring/fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def score_cpc_quantizations(gt_alignments, quantized_outputs,
def score_cpc_quantizations_matching_sentpieces_with_phones(
gt_alignments, quantized_outputs, quantized_format='csv', shift=-1,
stride=10, subsample=1, simplify_arpabet=False,
per_ignore_short_blocks=1, print_sample=0):
per_ignore_short_blocks=1, print_sample=0, save_collapsed_phones=False):

csvs = {Path(f).stem: f for f in Path(gt_alignments).rglob('*.csv')}

Expand Down Expand Up @@ -253,7 +253,7 @@ def score_cpc_quantizations_matching_sentpieces_with_phones(
for (start, end, ph) in ali:
t[start:end] = ph

sps = load_alignments(csvs_sp[key], simplify_arpabet=simplify_arpabet)
sps = load_alignments(csvs_sp[key], simplify_arpabet=False)
for (start, end, sp) in sps:
for iv in t[start:end]:
piece2phone[sp][iv.data] += iv.end - iv.begin
Expand All @@ -273,6 +273,22 @@ def rle(s, counts=True):
else:
return [k for k, v in itertools.groupby(s)]

if save_collapsed_phones:
# Collapses repeated symbols in each piece, e.g., aaaaaFFFFggg --> aFg (standing for the regexp a+F+g+)
pieceRLE2phone = defaultdict(Counter)
with open('pieces2phones.txt', 'w') as f:
for pi, ph in piece2phone.items():
rle_ = ''.join(rle(pi, counts=False))
f.write(f'{pi} {rle_} {ph}\n')
pieceRLE2phone[rle_][ph] += 1

s = 0
for rle_, cnt in pieceRLE2phone.items():
print(cnt, cnt.most_common(1))
s += cnt.most_common(1)[0][1]
print('After collapsing:', len(pieceRLE2phone))
print('Collapsed properly:', s)

wers = []
# Map sentencepieces to phones
for idx, (key, gt_sp) in enumerate(data.items()):
Expand Down

0 comments on commit 6e95fc1

Please sign in to comment.