feat: clustering
gcie committed Jul 8, 2021
1 parent 71d398f commit e8db677
Showing 7 changed files with 183 additions and 24 deletions.
106 changes: 106 additions & 0 deletions cluster.py
@@ -0,0 +1,106 @@
import os
import pathlib
import random
import tqdm
from argparse import ArgumentError, ArgumentParser
import pickle

from collections import defaultdict
import numpy as np
import sentencepiece
from more_itertools import grouper

from simi import dataset
from simi import utils
from simi.vectorization import vectorize
from simi.clusterization import cluster_kmeans

def parseArgs():
    parser = ArgumentParser()

    parser.add_argument('segmentation', type=pathlib.Path,
                        help='Path to the segmentation')
    parser.add_argument('output', type=pathlib.Path,
                        help='Output path')
    parser.add_argument('--vocab_size', type=int, default=100,
                        help='Size of the output vocabulary, 100 by default')
    parser.add_argument('--word2vec_size', type=int, default=100,
                        help='Size of the word2vec vectors, 100 by default')
    parser.add_argument('--word2vec_path', type=pathlib.Path,
                        help='Path to the word2vec model; if not specified/empty, it will be computed')
    parser.add_argument('--kmeans_path', type=str,
                        help='Path to the kmeans model; if not specified/empty, it will be computed')
    parser.add_argument('--seed', type=int, default=290956,
                        help='Random seed')
    return parser.parse_args()


class LibriSpeechSegmentation(object):
    def __init__(self, path) -> None:
        self.data = defaultdict(list)
        self.vocab = set()
        super().__init__()
        # Each CSV row describes one segment: (t1, t2, label, kind).
        for root, _, files in os.walk(path):
            for file in files:
                if file.endswith('.csv'):
                    for line in open(os.path.join(root, file), 'r', encoding='utf8'):
                        t1, t2, q, kind = line.strip().split(',')
                        self.data[file[:-4]].append((t1, t2, q, kind))
                        self.vocab.add(q)

    def __getitem__(self, fname):
        return self.data[fname]

    def to_sentences(self):
        # One "sentence" per utterance: its sequence of segment labels.
        return list(list(q for _, _, q, _ in d) for _, d in self.data.items())

    def rename(self, word_map):
        # Replace every segment label with its mapped (clustered) label.
        for _, sample in self.data.items():
            for i in range(len(sample)):
                sample[i] = (sample[i][0], sample[i][1], word_map[sample[i][2]], sample[i][3])

    def save(self, path):
        if not os.path.exists(path):
            os.makedirs(path)
        for fname, sample in self.data.items():
            with open(path / (fname + '.csv'), 'w') as output:
                for x in sample:
                    output.write(','.join(map(str, x)) + '\n')

def run(args):
    print('Loading train segmentation...')
    segmentation = LibriSpeechSegmentation(args.segmentation)
    print(f'Vocabulary size of the segmentation: {len(segmentation.vocab)}')

    assert len(segmentation.vocab) > args.vocab_size, 'Segmentation vocab size must be greater than the output vocab size'

    word2vec_path = f'./tmp/word2vec/s{args.seed}' if args.word2vec_path is None else args.word2vec_path
    sentences = segmentation.to_sentences()
    encodings, weights, reconstruct, build_map = vectorize(sentences, word2vec_path, args.word2vec_size)

    kmeans_path = f'./tmp/kmeans/s{args.seed}_cosine' if args.kmeans_path is None else args.kmeans_path
    labels = cluster_kmeans(encodings, weights, kmeans_path, args.vocab_size, cosine=True)

    word_map = build_map(labels)
    segmentation.rename(word_map)
    segmentation.save(args.output)
    print('Done!')

class StubArgs(object):
    # Hard-coded arguments for interactive debugging.
    def __init__(self):
        self.seed = 290956
        self.segmentation = pathlib.Path('/pio/scratch/1/i290956/zs2021/simi/models/segmentations/train-clean-100_train-clean-100_vs1000_a1.0/viterbi_segmentation/')
        # self.test_seg = pathlib.Path('/pio/scratch/1/i290956/zs2021/simi/models/segmentations/train-clean-100_dev-clean_vs1000_a1.0/viterbi_segmentation/')
        # self.output = pathlib.Path('/pio/scratch/1/i290956/zs2021/simi/models/segmentations/train-clean-100_train-clean-100_vs1000_a1.0/viterbi_segmentation_clustered_100/')
        self.output = pathlib.Path('/pio/scratch/1/i290956/zs2021/simi/tmp/segmentation')
        self.vocab_size = 100
        self.word2vec_size = 100
        self.word2vec_path = None
        self.kmeans_path = None


if __name__ == "__main__":
    args = parseArgs()
    # args = StubArgs()
    run(args)
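`run` delegates the actual grouping to `simi.clusterization.cluster_kmeans`, which this commit does not touch. As rough orientation only, a cosine k-means of the kind the `cosine=True` flag and the `_cosine` path suffix suggest can be sketched as below; the scikit-learn `KMeans`, the pickle caching, and the name `cluster_kmeans_sketch` are assumptions, not the simi implementation:

```python
# Hypothetical sketch of the cluster_kmeans step (the real implementation lives
# in simi/clusterization.py and is not part of this diff).
import os
import pickle

import numpy as np
from sklearn.cluster import KMeans


def cluster_kmeans_sketch(encodings, weights, path, vocab_size, cosine=True):
    """Group word2vec vectors into vocab_size clusters; cache the model at path."""
    X = np.asarray(encodings, dtype=float)
    if cosine:
        # L2-normalizing the rows makes Euclidean k-means behave like
        # clustering by cosine similarity.
        X = X / np.linalg.norm(X, axis=1, keepdims=True)
    if os.path.exists(path):
        with open(path, 'rb') as f:           # reuse a previously fitted model
            kmeans = pickle.load(f)
    else:
        kmeans = KMeans(n_clusters=vocab_size, random_state=0)
        kmeans.fit(X, sample_weight=weights)  # weights ~ word frequencies
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(kmeans, f)
    return kmeans.predict(X)                  # one cluster id per vocabulary word
```

In `run`, the returned labels feed `build_map(labels)` to produce the word-to-cluster dictionary that `rename` then applies to the segmentation.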
22 changes: 22 additions & 0 deletions results/clustering_results.md
@@ -0,0 +1,22 @@
dataset | vocab size (pre) | PER (pre) sp | PER (pre) viterbi | vocab size (after) | PER (after) sp | PER (after) viterbi
:---: | :---: | :---: | :---: | :---: | :---: | :---:
train-clean-100 | 1000 | | 61.06% | 50 | | 64.71%
train-clean-100 | 1000 | | 61.06% | 100 | | ?
train-clean-100 | 1000 | | 61.06% | 150 | | 61.89%
train-clean-100 | 1000 | | 61.06% | 200 | | 61.34%
train-clean-100 | 1000 | | 61.06% | 250 | | 61.25%
train-clean-100 | 1000 | | 61.06% | 300 | | 61.23%
train-clean-100 | 1000 | | 61.06% | 400 | | 61.05%
train-clean-100 | 1000 (sp: 994) | 98.10% | 67.46% | | |
train-clean-100 | 2000 (sp: 1993) | 84.13% | 67.76% | | |
train-clean-100 | 5000 (sp: 4981) | 69.28% | 65.46% | | |
train-clean-100 | 10000 (sp: 9974) | 61.03% | 66.56% | | |
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 50 | | 76.87%
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 75 | | 74.29%
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 100 | | 74.00%
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 200 | | 68.76%
train-clean-100 | 20000 (sp: 19946) | 55.04% | 68.99% | 300 | | 66.17%
train-clean-100 | 50000 | | | | |
train-clean-100 | 100000 | 49.24% | | | |
train-clean-100 | 200000 | 49.15% | | | |
train-clean-100 | 500000 | 49.43% | | | |
8 changes: 8 additions & 0 deletions results/clustering_results_a10.0.md
@@ -0,0 +1,8 @@
dataset | vocab size (initial) | PER sp (initial) | PER viterbi (initial) | vocab size (reduced) | PER sp (reduced) | PER viterbi (reduced)
:---: | :---: | :---: | :---: | :---: | :---: | :---:
train-clean-100 | 1000 | 98.10% | 58.53% | | |
train-clean-100 | 2000 | 84.13% | 52.77% | | |
train-clean-100 | 5000 | 69.28% | 48.39% | | |
train-clean-100 | 10000 | 61.03% | 47.11% | | |
train-clean-100 | 20000 | 55.04% | 47.13% | | |
train-clean-100 | 50000 | 50.48% | 48.96% | | |
2 changes: 1 addition & 1 deletion scoring/simple_score.py
@@ -10,6 +10,6 @@
 
 quant_dir = (Path(quantized) / '..').resolve().name
 acc = score_cpc_quantizations_matching_sentpieces_with_phones(
-    gt, quantized, shift=shift)
+    gt, quantized, shift=shift, per_ignore_short_blocks=0, print_sample=1)
 
 print(f'PER: {acc:.2f} % ({quant_dir})')
17 changes: 17 additions & 0 deletions scripts/cluster.sh
@@ -0,0 +1,17 @@
#!/bin/bash

: ${VS:="10000 20000 50000 "}
: ${VVS:="50 75 100 200 300 400"}
: ${ALPHA:=10.0}

for VS_ in $VS; do
    for VVS_ in $VVS; do

        python cluster.py \
            /pio/scratch/1/i290956/zs2021/simi/models/segmentations_mpl100/train-clean-100_train-clean-100_vs${VS_}_a${ALPHA}/viterbi_segmentation/ \
            /pio/scratch/1/i290956/zs2021/simi/models/segmentations_mpl100/train-clean-100_train-clean-100_vs${VS_}_a${ALPHA}/viterbi_segmentation_clustered_${VVS_}/ \
            --word2vec_path /pio/scratch/1/i290956/zs2021/simi/models/word2vec_mpl100_sp/train-clean-100_vs${VS_}_a${ALPHA}_c${VVS_} \
            --kmeans_path /pio/scratch/1/i290956/zs2021/simi/models/kmeans_mpl100_sp/train-clean-100_vs${VS_}_a${ALPHA}_c${VVS_}_cosine \
            --vocab_size $VVS_ &
    done;
done;
36 changes: 17 additions & 19 deletions scripts/segment.sh
@@ -1,29 +1,27 @@
 #!/bin/bash
 
-TRAINSET=train-full-960
-TESTSET=dev-clean
+TRAINSET=train-clean-100
+TESTSET=train-clean-100
+: ${ALPHA:=10.0}
 
 # Space-delimited lists
-: ${ALPHA:="1.0 2.0 5.0 8.0 10.0 12.0 15.0 20.0"}
-: ${VS:="1000 "}
+: ${VS:="1000 2000 5000 10000 20000 50000"}
 
 echo "==================================================="
 echo " WARNING: Jobs will be run in parallel."
 echo " Be careful not to overload the system."
 echo "==================================================="
-sleep 5
+sleep 1
 
-for ALPH_ in $ALPHA; do
-    for VS_ in $VS ; do
-        python segment.py \
-            /pio/data/zerospeech2021/quantized/LibriSpeech/${TRAINSET}/quantized_outputs.txt \
-            /pio/data/zerospeech2021/quantized/LibriSpeech/${TESTSET}/quantized_outputs.txt \
-            $VS_ \
-            segmentations/${TRAINSET}_${TESTSET}_vs${VS_}_a${ALPH_}_test \
-            --sentencepiece_prefix=/pio/scratch/1/i290956/zs2021/simi/models/sentencepieces/${TRAINSET}_vs${VS_} \
-            --segmentation_output_format=csv \
-            --viterbi \
-            --alpha $ALPH_ \
-            --clusterings=/pio/scratch/1/i290956/zs2021/clusterings/LibriSpeech/${TESTSET} &
-    done
-done
+for VS_ in $VS ; do
+    python segment.py \
+        /pio/data/zerospeech2021/quantized/LibriSpeech/${TRAINSET}/quantized_outputs.txt \
+        /pio/data/zerospeech2021/quantized/LibriSpeech/${TESTSET}/quantized_outputs.txt \
+        $VS_ \
+        models/segmentations_mpl100/${TRAINSET}_${TESTSET}_vs${VS_}_a${ALPHA} \
+        --sentencepiece_prefix=/pio/scratch/1/i290956/zs2021/simi/models/sentencepieces_mpl100/${TRAINSET}_vs${VS_} \
+        --segmentation_output_format=csv \
+        --alpha=${ALPHA} \
+        --clusterings=/pio/scratch/1/i290956/zs2021/clusterings/LibriSpeech/${TESTSET} \
+        --viterbi &
+done ;
16 changes: 12 additions & 4 deletions simi/vectorization.py
@@ -23,7 +23,11 @@ def r(labels):
         nonlocal d, segmentation
         return [[labels[d[word]] for word in sentence] for sentence in segmentation]
 
-    return res, cnt, r
+    def m(labels):
+        nonlocal d
+        return { word: labels[d[word]] for word in d.keys() }
+
+    return res, cnt, r, m
 
 
 def find_closest_encodings(segmentation, w2v):
@@ -61,7 +65,11 @@ def r(labels):
         nonlocal d, segmentation
         return [[labels[d[word]] for word in sentence] for sentence in segmentation]
 
-    return res, cnt, r
+    def m(labels):
+        nonlocal d
+        return { word: labels[d[word]] for word in d.keys() }
+
+    return res, cnt, r, m
 
 
 def vectorize(data, path, size, train=True):
@@ -78,9 +86,9 @@ def vectorize(data, path, size, train=True):
             raise Exception(f"Tried to eval word2vec, but there is no model at {path}. Maybe set train=True?")
         # train word2vec
         print("Training word2vec model...", flush=True)
-        Word2Vec(sentences=data, min_count=1, size=size).save(path)
+        Word2Vec(sentences=data, min_count=1, size=size).save(str(path))
 
-    model = Word2Vec.load(path)
+    model = Word2Vec.load(str(path))
     if train:
         return encode_and_format(data, model.wv)
     else:
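The new `m` closure is what `cluster.py` unpacks as `build_map`: given one k-means label per vocabulary word, it returns a dict from each word to its cluster id, which `LibriSpeechSegmentation.rename` then applies; `r` (`reconstruct`) relabels whole utterances the same way. A toy, self-contained illustration of this contract, with all values invented:

```python
# Toy illustration of the (encodings, counts, reconstruct, build_map) contract
# returned by encode_and_format; all values here are invented.
import numpy as np

segmentation = [['ab', 'cd'], ['cd', 'ef']]  # utterances as sequences of unit labels
d = {'ab': 0, 'cd': 1, 'ef': 2}              # word -> row index into the encodings
res = np.random.rand(3, 100)                 # stand-in word2vec vectors (one row per word)
cnt = [1, 2, 1]                              # word frequencies, used as clustering weights

def build_map(labels):
    # mirrors m(): one cluster id per vocabulary word
    return {word: labels[d[word]] for word in d.keys()}

def reconstruct(labels):
    # mirrors r(): relabel every utterance with cluster ids
    return [[labels[d[word]] for word in sentence] for sentence in segmentation]

labels = [7, 7, 3]          # pretend k-means assigned these clusters
print(build_map(labels))    # {'ab': 7, 'cd': 7, 'ef': 3}
print(reconstruct(labels))  # [[7, 7], [7, 3]]
```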