-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclusterization.py
140 lines (111 loc) · 5.43 KB
/
clusterization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import json
import os
import pathlib
from argparse import ArgumentParser
from pathlib import Path
from time import time
from urllib import parse
import numpy as np
import progressbar
import torch
from cpc.dataset import findAllSeqs
from cpc.feature_loader import buildFeature_batch
from simi.clusterization.utils_functions import loadClusterModule, loadCPCFeatureMaker, readArgs
def parseArgs(argv=None):
    """Parse the command-line arguments of the quantization script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When ``None`` (the default) ``argparse``
            reads ``sys.argv[1:]``, which is what the script entry point
            relies on.

    Returns:
        The ``argparse.Namespace`` holding ``clustering_checkpoint``,
        ``dataset`` and ``output`` (each a ``pathlib.Path``), ``file_ext``
        (str, defaults to ``'wav'``) and ``cuda`` (bool, defaults to
        ``False``).
    """
    parser = ArgumentParser()

    # Required positional arguments.
    parser.add_argument('clustering_checkpoint', type=pathlib.Path,
                        help='Path to the clustering checkpoint')
    parser.add_argument('dataset', type=pathlib.Path,
                        help='Path to the dataset, which is to be quantized')
    parser.add_argument('output', type=pathlib.Path,
                        help='Output path')

    # Optional switches.
    parser.add_argument('--file-ext', type=str, default='wav',
                        help='File extension of the audio files')
    parser.add_argument('--cuda', action='store_true',
                        help='Use CUDA')

    return parser.parse_args(argv)
def quantize_file(file_path, cpc_feature_function, clusterModule, args):
    """Run the clustering module on the CPC features of one audio file.

    Args:
        file_path: Path of the audio file; handed unchanged to
            ``cpc_feature_function``.
        cpc_feature_function: Callable that maps ``file_path`` to a tensor
            of CPC features.
        clusterModule: Callable clustering module. It must expose ``Ck``
            (a tensor whose last dimension is used as the per-vector
            feature size) and ``k`` (the number of columns of the result).
        args: Namespace with a boolean ``cuda`` attribute.

    Returns:
        A numpy array of shape ``(-1, clusterModule.k)`` holding the output
        of ``clusterModule`` for the file.
    """
    # Put the module on the GPU *before* the forward pass. Doing it afterwards
    # (as this function used to) cannot affect the current call.
    if args.cuda:
        clusterModule = clusterModule.cuda()

    # Get CPC features and place them on the same device as the centroids.
    cFeatures = cpc_feature_function(file_path)
    if clusterModule.Ck.is_cuda:
        cFeatures = cFeatures.cuda()

    # Features wider than the centroid dimension are split into consecutive
    # vectors of that dimension by this reshape.
    centroidDim = clusterModule.Ck.size(-1)
    cFeatures = cFeatures.view(1, -1, centroidDim)

    # The result is detached right away, so no autograd graph is needed.
    with torch.no_grad():
        clustered = clusterModule(cFeatures)

    return clustered.detach().cpu().numpy().reshape(-1, clusterModule.k)
def main(args):
    """Quantize every audio file of a dataset and save one ``.npy`` per file.

    Steps, in order:
      1. List the sequences of ``args.dataset`` with ``findAllSeqs``.
      2. Create the output directory if it does not exist.
      3. Locate and read the JSON arguments stored next to the clustering
         checkpoint.
      4. Load the clustering module and the CPC feature maker.
      5. For every sequence without an existing output file, call
         ``quantize_file`` and save the result with ``np.save``.

    Args:
        args: Namespace as returned by ``parseArgs`` (``clustering_checkpoint``,
            ``dataset``, ``output``, ``file_ext``, ``cuda``).

    Raises:
        AssertionError: If no sequence is found, the checkpoint path does not
            end in ``.pt``, or the CPC checkpoint referenced by the
            clustering arguments does not exist.
        FileNotFoundError: If no clustering arguments file is found next to
            the clustering checkpoint.
    """
    # e.g. '/pio/data/zerospeech2021/checkpoints/CPC-big-kmeans50/clustering_kmeans50/clustering_CPC_big_kmeans50.pt'
    pathClusteringCheckpoint = str(args.clustering_checkpoint)
    # e.g. '/pio/data/zerospeech2021/LibriSpeech/test-clean'
    pathDB = str(args.dataset)
    # e.g. '/pio/scratch/1/i290956/zs2021/clusterings/LibriSpeech/test-clean'
    pathOutputDir = str(args.output)

    seqNames, _ = findAllSeqs(pathDB, speaker_level=1,
                              extension=args.file_ext, loadCache=True)

    if not os.path.exists(pathOutputDir):
        print("")
        print(f"Creating the output directory at {pathOutputDir}")
        Path(pathOutputDir).mkdir(parents=True, exist_ok=True)

    assert len(seqNames) > 0, \
        "No file to be quantized!"

    # Locate the JSON file holding the arguments of the clustering run:
    # either "<checkpoint>_args.json" or "checkpoint_args.json" in the same
    # directory, in that order of preference.
    assert pathClusteringCheckpoint[-3:] == ".pt", \
        f"Clustering checkpoint {pathClusteringCheckpoint} must end in '.pt'"
    checkpointDir = os.path.dirname(pathClusteringCheckpoint)
    candidateConfigs = (
        pathClusteringCheckpoint[:-3] + "_args.json",
        os.path.join(checkpointDir, "checkpoint_args.json"),
    )
    pathConfig = next(
        (candidate for candidate in candidateConfigs if os.path.exists(candidate)),
        None)
    if pathConfig is None:
        # Previously this fell through to an UnboundLocalError on pathConfig.
        raise FileNotFoundError(
            "No clustering args file found; looked for: "
            + ", ".join(candidateConfigs))
    clustering_args = readArgs(pathConfig)
    print("")
    print(f"Clutering args:\n{json.dumps(vars(clustering_args), indent=4, sort_keys=True)}")
    print('-' * 50)

    # A relative CPC checkpoint path is resolved against the directory of the
    # clustering checkpoint.
    if not os.path.isabs(clustering_args.pathCheckpoint):
        clustering_args.pathCheckpoint = os.path.join(
            os.path.dirname(os.path.abspath(pathClusteringCheckpoint)),
            clustering_args.pathCheckpoint)
    assert os.path.exists(clustering_args.pathCheckpoint), \
        f"CPC path at {clustering_args.pathCheckpoint} does not exist!!"

    # Load ClusterModule
    print("")
    print(f"Loading ClusterModule at {pathClusteringCheckpoint}")
    clusterModule = loadClusterModule(pathClusteringCheckpoint)
    if args.cuda:
        clusterModule.cuda()
    print("ClusterModule loaded!")

    # Load the CPC feature maker.
    print("")
    print(f"Loading CPC FeatureMaker from {clustering_args.pathCheckpoint}")
    # Features are built with the batched implementation (buildFeature_batch
    # below), so keep_hidden is fixed to False here. The original author noted
    # that keeping the hidden state is only an option without batching.
    featureMaker = loadCPCFeatureMaker(
        clustering_args.pathCheckpoint,
        gru_level=vars(clustering_args).get('level_gru', None),
        get_encoded=clustering_args.encoder_layer,
        keep_hidden=False)
    if clustering_args.dimReduction is not None:
        # NOTE(review): `loadDimReduction` is not among the names imported at
        # the top of this file, so this branch raises NameError as written.
        # Import it from wherever the project defines it.
        dimRed = loadDimReduction(clustering_args.dimReduction, clustering_args.centroidLimits)
        featureMaker = torch.nn.Sequential(featureMaker, dimRed)
    if not clustering_args.train_mode:
        featureMaker.eval()
    if args.cuda:
        featureMaker.cuda()

    def cpc_feature_function(x):
        """Build the CPC features of file ``x`` with the loaded feature maker."""
        return buildFeature_batch(featureMaker, x, seqNorm=False, strict=True,
                                  maxSizeSeq=10240, batch_size=8)
    print("CPC FeatureMaker loaded!")

    # Quantization of files
    print("")
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, vals in enumerate(seqNames):
        bar.update(index)

        file_path = os.path.join(pathDB, vals[1])
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        outputPath = os.path.join(pathOutputDir, file_name + '.npy')

        # Create an empty placeholder before the (slow) quantization so an
        # already existing output is skipped. Exclusive-create mode makes the
        # "does it exist? then create it" step a single atomic operation.
        # NOTE(review): presumably the placeholder lets several runs share one
        # output directory - confirm.
        try:
            with open(outputPath, 'xb'):
                pass
        except FileExistsError:
            continue

        try:
            clustered_file = quantize_file(file_path, cpc_feature_function, clusterModule, args)
            np.save(outputPath, clustered_file)
        except BaseException:
            # Do not leave an empty/partial .npy behind: it would make the
            # next run skip this file as if it were already quantized.
            try:
                os.remove(outputPath)
            except OSError:
                pass  # best-effort cleanup; the original error matters more
            raise
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
if __name__ == "__main__":
    # Script entry point: parse the command line and run the quantization.
    main(parseArgs())