feat: clustering
gcie committed Jul 8, 2021
1 parent 71d398f commit e8db677
Showing 7 changed files with 183 additions and 24 deletions.
106 changes: 106 additions & 0 deletions cluster.py
@@ -0,0 +1,106 @@
import os
import pathlib
import random
import tqdm
from argparse import ArgumentError, ArgumentParser
import pickle

from collections import defaultdict
import numpy as np
import sentencepiece
from more_itertools import grouper

from simi import dataset
from simi import utils
from simi.vectorization import vectorize
from simi.clusterization import cluster_kmeans

def parseArgs():
    parser = ArgumentParser()

    parser.add_argument('segmentation', type=pathlib.Path,
                        help='Path to the segmentation')
    parser.add_argument('output', type=pathlib.Path,
                        help='Output path')
    parser.add_argument('--vocab_size', type=int, default=100,
                        help='Size of the output vocabulary, 100 by default')
    parser.add_argument('--word2vec_size', type=int, default=100,
                        help='Size of the word2vec vectors, 100 by default')
    parser.add_argument('--word2vec_path', type=pathlib.Path,
                        help='Path to the word2vec model; if not specified/empty, it will be computed')
    parser.add_argument('--kmeans_path', type=str,
                        help='Path to the kmeans model; if not specified/empty, it will be computed')
    parser.add_argument('--seed', type=int, default=290956,
                        help='Random seed')
    return parser.parse_args()


class LibriSpeechSegmentation(object):
    def __init__(self, path) -> None:
        self.data = defaultdict(list)
        self.vocab = set()
        super().__init__()
        # Each CSV row describes one segment: (t1, t2, label, kind).
        for root, _, files in os.walk(path):
            for file in files:
                if file.endswith('.csv'):
                    for line in open(os.path.join(root, file), 'r', encoding='utf8'):
                        t1, t2, q, kind = line.strip().split(',')
                        self.data[file[:-4]].append((t1, t2, q, kind))
                        self.vocab.add(q)

    def __getitem__(self, fname):
        return self.data[fname]

    def to_sentences(self):
        # One "sentence" per utterance: its sequence of segment labels.
        return list(list(q for _, _, q, _ in d) for _, d in self.data.items())

    def rename(self, word_map):
        # Replace every segment label with its mapped (clustered) label.
        for _, sample in self.data.items():
            for i in range(len(sample)):
                sample[i] = (sample[i][0], sample[i][1], word_map[sample[i][2]], sample[i][3])

    def save(self, path):
        if not os.path.exists(path):
            os.makedirs(path)
        for fname, sample in self.data.items():
            with open(path / (fname + '.csv'), 'w') as output:
                for x in sample:
                    output.write(','.join(map(str, x)) + '\n')

def run(args):
    print('Loading train segmentation...')
    segmentation = LibriSpeechSegmentation(args.segmentation)
    print(f'Vocabulary size of the segmentation: {len(segmentation.vocab)}')

    assert len(segmentation.vocab) > args.vocab_size, 'Segmentation vocab size must be greater than the output vocab size'

    word2vec_path = f'./tmp/word2vec/s{args.seed}' if args.word2vec_path is None else args.word2vec_path
    sentences = segmentation.to_sentences()
    encodings, weights, reconstruct, build_map = vectorize(sentences, word2vec_path, args.word2vec_size)

    kmeans_path = f'./tmp/kmeans/s{args.seed}_cosine' if args.kmeans_path is None else args.kmeans_path
    labels = cluster_kmeans(encodings, weights, kmeans_path, args.vocab_size, cosine=True)

    word_map = build_map(labels)
    segmentation.rename(word_map)
    segmentation.save(args.output)
    print('Done!')

class StubArgs(object):
    # Hard-coded arguments for interactive debugging.
    def __init__(self):
        self.seed = 290956
        self.segmentation = pathlib.Path('/pio/scratch/1/i290956/zs2021/simi/models/segmentations/train-clean-100_train-clean-100_vs1000_a1.0/viterbi_segmentation/')
        # self.test_seg = pathlib.Path('/pio/scratch/1/i290956/zs2021/simi/models/segmentations/train-clean-100_dev-clean_vs1000_a1.0/viterbi_segmentation/')
        # self.output = pathlib.Path('/pio/scratch/1/i290956/zs2021/simi/models/segmentations/train-clean-100_train-clean-100_vs1000_a1.0/viterbi_segmentation_clustered_100/')
        self.output = pathlib.Path('/pio/scratch/1/i290956/zs2021/simi/tmp/segmentation')
        self.vocab_size = 100
        self.word2vec_size = 100
        self.word2vec_path = None
        self.kmeans_path = None


if __name__ == "__main__":
    args = parseArgs()
    # args = StubArgs()
    run(args)
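`run` delegates the actual grouping to `simi.clusterization.cluster_kmeans`, which this commit does not touch. As rough orientation only, a cosine k-means of the kind the `cosine=True` flag and the `_cosine` path suffix suggest can be sketched as below; the scikit-learn `KMeans`, the pickle caching, and the name `cluster_kmeans_sketch` are assumptions, not the simi implementation:

```python
# Hypothetical sketch of the cluster_kmeans step (the real implementation lives
# in simi/clusterization.py and is not part of this diff).
import os
import pickle

import numpy as np
from sklearn.cluster import KMeans


def cluster_kmeans_sketch(encodings, weights, path, vocab_size, cosine=True):
    """Group word2vec vectors into vocab_size clusters; cache the model at path."""
    X = np.asarray(encodings, dtype=float)
    if cosine:
        # L2-normalizing the rows makes Euclidean k-means behave like
        # clustering by cosine similarity.
        X = X / np.linalg.norm(X, axis=1, keepdims=True)
    if os.path.exists(path):
        with open(path, 'rb') as f:           # reuse a previously fitted model
            kmeans = pickle.load(f)
    else:
        kmeans = KMeans(n_clusters=vocab_size, random_state=0)
        kmeans.fit(X, sample_weight=weights)  # weights ~ word frequencies
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(kmeans, f)
    return kmeans.predict(X)                  # one cluster id per vocabulary word
```

In `run`, the returned labels feed `build_map(labels)` to produce the word-to-cluster dictionary that `rename` then applies to the segmentation.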
22 changes: 22 additions & 0 deletions results/clustering_results.md
@@ -0,0 +1,22 @@
dataset | vocab size (pre) | PER (pre) sp | PER (pre) viterbi | vocab size (after) | PER (after) sp | PER (after) viterbi
:---: | :---: | :---: | :---: | :---: | :---: | :---:
train-clean-100 | 1000 | | 61.06% | 50 | | 64.71%
train-clean-100 | 1000 | | 61.06% | 100 | | ?
train-clean-100 | 1000 | | 61.06% | 150 | | 61.89%
train-clean-100 | 1000 | | 61.06% | 200 | | 61.34%
train-clean-100 | 1000 | | 61.06% | 250 | | 61.25%
train-clean-100 | 1000 | | 61.06% | 300 | | 61.23%
train-clean-100 | 1000 | | 61.06% | 400 | | 61.05%
train-clean-100 | 1000 (sp: 994) | 98.10% | 67.46% | | |
train-clean-100 | 2000 (sp: 1993) | 84.13% | 67.76% | | |
train-clean-100 | 5000 (sp: 4981) | 69.28% | 65.46% | | |
train-clean-100 | 10000 (sp: 9974) | 61.03% | 66.56% | | |
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 50 | | 76.87%
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 75 | | 74.29%
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 100 | | 74.00%
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 200 | | 68.76%
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 300 | | 66.17%
train-clean-100 | 50000 | | | | |
train-clean-100 | 100000 | 49.24% | | | |
train-clean-100 | 200000 | 49.15% | | | |
train-clean-100 | 500000 | 49.43% | | | |
8 changes: 8 additions & 0 deletions results/clustering_results_a10.0.md
@@ -0,0 +1,8 @@
dataset | vocab size (initial) | PER sp (initial) | PER viterbi (initial) | vocab size (reduced) | PER sp (reduced) | PER viterbi (reduced)
:---: | :---: | :---: | :---: | :---: | :---: | :---:
train-clean-100 | 1000 | 98.10% | 58.53% | | |
train-clean-100 | 2000 | 84.13% | 52.77% | | |
train-clean-100 | 5000 | 69.28% | 48.39% | | |
train-clean-100 | 10000 | 61.03% | 47.11% | | |
train-clean-100 | 20000 | 55.04% | 47.13% | | |
train-clean-100 | 50000 | 50.48% | 48.96% | | |
2 changes: 1 addition & 1 deletion scoring/simple_score.py
@@ -10,6 +10,6 @@
 
 quant_dir = (Path(quantized) / '..').resolve().name
 acc = score_cpc_quantizations_matching_sentpieces_with_phones(
-    gt, quantized, shift=shift)
+    gt, quantized, shift=shift, per_ignore_short_blocks=0, print_sample=1)
 
 print(f'PER: {acc:.2f} % ({quant_dir})')
17 changes: 17 additions & 0 deletions scripts/cluster.sh
@@ -0,0 +1,17 @@
#!/bin/bash

: ${VS:="10000 20000 50000 "}
: ${VVS:="50 75 100 200 300 400"}
: ${ALPHA:=10.0}

for VS_ in $VS; do
    for VVS_ in $VVS; do

        python cluster.py \
            /pio/scratch/1/i290956/zs2021/simi/models/segmentations_mpl100/train-clean-100_train-clean-100_vs${VS_}_a${ALPHA}/viterbi_segmentation/ \
            /pio/scratch/1/i290956/zs2021/simi/models/segmentations_mpl100/train-clean-100_train-clean-100_vs${VS_}_a${ALPHA}/viterbi_segmentation_clustered_${VVS_}/ \
            --word2vec_path /pio/scratch/1/i290956/zs2021/simi/models/word2vec_mpl100_sp/train-clean-100_vs${VS_}_a${ALPHA}_c${VVS_} \
            --kmeans_path /pio/scratch/1/i290956/zs2021/simi/models/kmeans_mpl100_sp/train-clean-100_vs${VS_}_a${ALPHA}_c${VVS_}_cosine \
            --vocab_size $VVS_ &
    done;
done;
36 changes: 17 additions & 19 deletions scripts/segment.sh
@@ -1,29 +1,27 @@
 #!/bin/bash
 
-TRAINSET=train-full-960
-TESTSET=dev-clean
+TRAINSET=train-clean-100
+TESTSET=train-clean-100
+: ${ALPHA:=10.0}
 
 # Space-delimited lists
-: ${ALPHA:="1.0 2.0 5.0 8.0 10.0 12.0 15.0 20.0"}
-: ${VS:="1000 "}
+: ${VS:="1000 2000 5000 10000 20000 50000"}
 
 echo "==================================================="
 echo " WARNING: Jobs will be run in parallel."
 echo " Be careful not to overload the system."
 echo "==================================================="
-sleep 5
+sleep 1
 
-for ALPH_ in $ALPHA; do
-    for VS_ in $VS ; do
-        python segment.py \
-            /pio/data/zerospeech2021/quantized/LibriSpeech/${TRAINSET}/quantized_outputs.txt \
-            /pio/data/zerospeech2021/quantized/LibriSpeech/${TESTSET}/quantized_outputs.txt \
-            $VS_ \
-            segmentations/${TRAINSET}_${TESTSET}_vs${VS_}_a${ALPH_}_test \
-            --sentencepiece_prefix=/pio/scratch/1/i290956/zs2021/simi/models/sentencepieces/${TRAINSET}_vs${VS_} \
-            --segmentation_output_format=csv \
-            --viterbi \
-            --alpha $ALPH_ \
-            --clusterings=/pio/scratch/1/i290956/zs2021/clusterings/LibriSpeech/${TESTSET} &
-    done
-done
+for VS_ in $VS ; do
+    python segment.py \
+        /pio/data/zerospeech2021/quantized/LibriSpeech/${TRAINSET}/quantized_outputs.txt \
+        /pio/data/zerospeech2021/quantized/LibriSpeech/${TESTSET}/quantized_outputs.txt \
+        $VS_ \
+        models/segmentations_mpl100/${TRAINSET}_${TESTSET}_vs${VS_}_a${ALPHA} \
+        --sentencepiece_prefix=/pio/scratch/1/i290956/zs2021/simi/models/sentencepieces_mpl100/${TRAINSET}_vs${VS_} \
+        --segmentation_output_format=csv \
+        --alpha=${ALPHA} \
+        --clusterings=/pio/scratch/1/i290956/zs2021/clusterings/LibriSpeech/${TESTSET} \
+        --viterbi &
+done ;
16 changes: 12 additions & 4 deletions simi/vectorization.py
@@ -23,7 +23,11 @@ def r(labels):
         nonlocal d, segmentation
         return [[labels[d[word]] for word in sentence] for sentence in segmentation]
 
-    return res, cnt, r
+    def m(labels):
+        nonlocal d
+        return { word: labels[d[word]] for word in d.keys() }
+
+    return res, cnt, r, m
 
 
 def find_closest_encodings(segmentation, w2v):
@@ -61,7 +65,11 @@ def r(labels):
         nonlocal d, segmentation
         return [[labels[d[word]] for word in sentence] for sentence in segmentation]
 
-    return res, cnt, r
+    def m(labels):
+        nonlocal d
+        return { word: labels[d[word]] for word in d.keys() }
+
+    return res, cnt, r, m
 
 
 def vectorize(data, path, size, train=True):
@@ -78,9 +86,9 @@ def vectorize(data, path, size, train=True):
             raise Exception(f"Tried to eval word2vec, but there is no model at {path}. Maybe set train=True?")
         # train word2vec
         print("Training word2vec model...", flush=True)
-        Word2Vec(sentences=data, min_count=1, size=size).save(path)
+        Word2Vec(sentences=data, min_count=1, size=size).save(str(path))
 
-    model = Word2Vec.load(path)
+    model = Word2Vec.load(str(path))
     if train:
         return encode_and_format(data, model.wv)
     else:
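The new `m` closure is what `cluster.py` unpacks as `build_map`: given one k-means label per vocabulary word, it returns a dict from each word to its cluster id, which `LibriSpeechSegmentation.rename` then applies; `r` (`reconstruct`) relabels whole utterances the same way. A toy, self-contained illustration of this contract, with all values invented:

```python
# Toy illustration of the (encodings, counts, reconstruct, build_map) contract
# returned by encode_and_format; all values here are invented.
import numpy as np

segmentation = [['ab', 'cd'], ['cd', 'ef']]  # utterances as sequences of unit labels
d = {'ab': 0, 'cd': 1, 'ef': 2}              # word -> row index into the encodings
res = np.random.rand(3, 100)                 # stand-in word2vec vectors (one row per word)
cnt = [1, 2, 1]                              # word frequencies, used as clustering weights

def build_map(labels):
    # mirrors m(): one cluster id per vocabulary word
    return {word: labels[d[word]] for word in d.keys()}

def reconstruct(labels):
    # mirrors r(): relabel every utterance with cluster ids
    return [[labels[d[word]] for word in sentence] for sentence in segmentation]

labels = [7, 7, 3]          # pretend k-means assigned these clusters
print(build_map(labels))    # {'ab': 7, 'cd': 7, 'ef': 3}
print(reconstruct(labels))  # [[7, 7], [7, 3]]
```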