Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Submission of work done by Jarek #16

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion cpc/cpc_default_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,31 @@ def set_default_cpc_config(parser):
group.add_argument('--limitNegsInBatch', type=int, default=0,
help='Limit the number of different seqs from whithc neg samples are taken.')

group.add_argument('--smartpoolingLayer', type=int, default=4,
help='Which layers of the encoder should be replaced with smartpooling. Available layers: 3, 4, 5 (smart averaging)')
group.add_argument('--smartpoolingNoPadding', action='store_true',
help='No padding is added to encoder conv layer')
group.add_argument('--smartpoolingDimMlp', type=int, default=2048,
help='Dimension of the mlp responsible for assigning importance to frames.')
group.add_argument('--smartpoolingUseDifferences', action='store_true',
help='Whether to not use mlp for importance and use abs of differences of consecutive values')
group.add_argument('--smartpoolingTemperature', type=float, default=1e-5,
help='Temperature added to frame importance. Larger temperature means the importance is going to be smoother')
group.add_argument('--smartaveragingWindowSize', type=int, default=None,
help='How large the smart averaging window should be')
group.add_argument('--smartaveragingHardcodedWeights', action='store_true',
help='Make the MLP output some hardcoded averaging weights')
group.add_argument('--smartaveragingHardcodedWindowSize', type=int, default=None,
help='How large the smart averaging HARDCODED window should be')

group.add_argument('--smartpoolingInAR', action='store_true',
help='Put smart averaging in AR. So archtecture is encoder -> (smart averaging -> AR) instead of (encoder -> smart averaging) -> AR')
group.add_argument('--smartpoolingInARUnfreezeEpoch', type=int, default=None,
help='Which epoch to unfreeze the smartpooling in the AR. 0 means it is unfrozen from the start')
group.add_argument('--smartaveragingLossParameter', type=float, default=None,
help='The hyperparameter to scale the smart averaging loss. None means that the loss is not applied')
group.add_argument('--smartaveragingLossAverage', type=float, default=None,
help='Which value should the average aim towards')

group.add_argument('--negativeSamplingExt', type=int, default=128,
help='Number of negative samples to take.')
Expand Down Expand Up @@ -70,7 +95,7 @@ def set_default_cpc_config(parser):
choices=['reverse', 'none'],
help='Some variations on CPC.')
group.add_argument('--encoder_type', type=str,
choices=['cpc', 'mfcc', 'lfb'],
choices=['cpc', 'mfcc', 'lfb', 'smart'],
default='cpc',
help='Replace the encoder network by mfcc features '
'or learned filter banks')
Expand Down
24 changes: 16 additions & 8 deletions cpc/criterion/criterion.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,18 +269,22 @@ def forward(self, cFeature, encodedData, label, captureOptions=None):

class SpeakerCriterion(BaseCriterion):

def __init__(self, dimEncoder, nSpeakers, nLayers=1):
def __init__(self, dimEncoder, nSpeakers, nLayers=1, dimClassifier=None):

super(SpeakerCriterion, self).__init__()
# self.linearSpeakerClassifier = nn.Linear(
# dimEncoder, nSpeakers)
if nLayers == 1:
self.linearSpeakerClassifier = nn.Linear(dimEncoder, nSpeakers)
else:
outLayers = [nn.Linear(dimEncoder, nSpeakers)]
for l in range(nLayers - 1):
if dimClassifier is None:
dimClassifier = nSpeakers
outLayers = [nn.Linear(dimEncoder, dimClassifier)]
for l in range(0, nLayers - 2):
outLayers.append(nn.ReLU())
outLayers.append(nn.Linear(nSpeakers, nSpeakers))
outLayers.append(nn.Linear(dimClassifier, dimClassifier))
outLayers.append(nn.ReLU())
outLayers.append(nn.Linear(dimClassifier, nSpeakers))
self.linearSpeakerClassifier = nn.Sequential(*outLayers)
self.lossCriterion = nn.CrossEntropyLoss()
self.entropyCriterion = nn.LogSoftmax(dim=1)
Expand Down Expand Up @@ -325,16 +329,20 @@ def forward(self, cFeature, otherEncoded, label):
class PhoneCriterion(BaseCriterion):

def __init__(self, dimEncoder, nPhones, onEncoder,
nLayers=1):
nLayers=1, dimClassifier=None):

super(PhoneCriterion, self).__init__()
if nLayers == 1:
self.PhoneCriterionClassifier = nn.Linear(dimEncoder, nPhones)
else:
outLayers = [nn.Linear(dimEncoder, nPhones)]
for l in range(nLayers - 1):
if dimClassifier is None:
dimClassifier = nPhones
outLayers = [nn.Linear(dimEncoder, dimClassifier)]
for l in range(nLayers - 2):
outLayers.append(nn.ReLU())
outLayers.append(nn.Linear(nPhones, nPhones))
outLayers.append(nn.Linear(dimClassifier, dimClassifier))
outLayers.append(nn.ReLU())
outLayers.append(nn.Linear(dimClassifier, nPhones))
self.PhoneCriterionClassifier = nn.Sequential(*outLayers)

self.lossCriterion = nn.CrossEntropyLoss()
Expand Down
49 changes: 48 additions & 1 deletion cpc/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def __init__(self,
phoneLabelsDict,
nSpeakers,
nProcessLoader=50,
MAX_SIZE_LOADED=4000000000):
MAX_SIZE_LOADED=4000000000,
alternatePhoneLabelsDict=None):
"""
Args:
- path (string): path to the training dataset
Expand Down Expand Up @@ -427,6 +428,8 @@ def extractLength(couple):
speaker, locPath = couple
info = torchaudio.info(str(locPath))[0]
return info.length
#info = torchaudio.info(str(locPath))
#return info.num_frames


def findAllSeqs(dirName,
Expand Down Expand Up @@ -517,6 +520,50 @@ def parseSeqLabels(pathLabels):
maxPhone = max(maxPhone, max(output[data[0]]))
return output, maxPhone + 1

def parseSeqLabelsAlternate(pathLabels, phoneLabels):
    """Parse per-file phone-alignment CSVs into frame-level label sequences.

    Recursively walks ``pathLabels`` for ``.csv`` files whose stem matches a
    key of ``phoneLabels``. Each CSV row is ``start_time,end_time,label,mode``;
    rows whose mode is ``"words"`` are skipped. Each file's time intervals are
    rescaled onto the frame grid implied by ``len(phoneLabels[stem])``; frames
    not covered by any interval are labelled ``-1``.

    Args:
        pathLabels (str): root directory containing the alignment CSV files.
        phoneLabels (dict): mapping sequence name (filename without extension)
            -> per-frame labels; only its keys and per-file lengths are used.

    Returns:
        tuple: ``(output, phones)`` where ``output`` maps each sequence name to
        a list of integer phone ids (plus a ``"step"`` entry — 160 samples per
        frame in LibriSpeech) and ``phones`` maps phone string -> integer id.
    """
    csvFiles = [(filename, dirname)
                for dirname, _, files in os.walk(pathLabels, followlinks=True)
                for filename in files if filename.endswith(".csv")]

    output = {"step": 160}  # Step in LibriSpeech dataset is 160 samples
    phones = set()
    for filename, dirname in csvFiles:
        with open(os.path.join(dirname, filename), 'r') as f:
            seqName = filename[:-len(".csv")]
            # Only keep alignments for sequences we have frame labels for.
            if seqName not in phoneLabels:
                continue

            intervals = []
            for line in f:
                data = line.rstrip().split(",")
                # Word-level rows are ignored; only phone alignments are kept.
                if data[3] == "words":
                    continue
                intervals.append((float(data[0]), float(data[1]), data[2]))

            output[seqName] = intervals
            phones.update(phone for _, _, phone in intervals)

    # Deterministic phone -> id mapping (sorted for reproducibility across
    # filesystem-dependent os.walk orderings).
    phones = {phone: counter for counter, phone in enumerate(sorted(phones))}

    for seqName in output:
        if seqName == "step":
            continue

        intervals = output[seqName]
        total_frames = len(phoneLabels[seqName])
        # Frames not covered by any alignment interval stay labelled -1.
        output[seqName] = [-1] * total_frames

        if not intervals:
            # CSV contained only "words" rows (or was empty): there is no
            # last interval to read total_time from — avoid the IndexError
            # the original code would raise and leave all frames at -1.
            continue

        # Total duration is taken as the end time of the last interval;
        # NOTE(review): assumes intervals are listed in chronological order.
        total_time = intervals[-1][1]
        for start_time, end_time, phone in intervals:
            start_frame = int(start_time / total_time * total_frames)
            end_frame = int(end_time / total_time * total_frames)
            output[seqName][start_frame:end_frame] = \
                [phones[phone]] * (end_frame - start_frame)

    return output, phones


def filterSeqs(pathTxt, seqCouples, percentage=None, totalNum=None):
assert(percentage is None or totalNum is None)
Expand Down
10 changes: 8 additions & 2 deletions cpc/eval/linear_separability.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,10 @@ def parse_args(argv):
parser.add_argument('--max_size_loaded', type=int, default=4000000000,
help='Maximal amount of data (in byte) a dataset '
'can hold in memory at any given time')
parser.add_argument('--n_layers', type=int, default=1,
help='Number of layers in the criterion')
parser.add_argument('--dim_classifier', type=int, default=None,
help='The dimension inbetween embeddings and classifier. Different from dim_inter. Dim_classifier makes it (dim_embeddings x dim_classifier) x (dim_classifier x dim_classifier)^(n_layer-1) x (dim_classifier x n_speakers or n_phones)')
parser.add_argument("--model", type=str, default="cpc",
help="Pre-trained model architecture ('cpc' [default] or 'wav2vec2').")
parser.add_argument("--path_fairseq", type=str, default="/pio/scratch/1/i273233/fairseq",
Expand Down Expand Up @@ -407,7 +411,7 @@ def my_nullspace(At, rcond=None):
if not args.CTC:
print(f"Running phone separability with aligned phones")
criterion = cr.PhoneCriterion(dim_features,
n_phones, args.get_encoded)
n_phones, args.get_encoded, nLayers=args.n_layers, dimClassifier=args.dim_classifier)
else:
print(f"Running phone separability with CTC loss")
criterion = cr.CTCPhoneCriterion(dim_features,
Expand All @@ -418,10 +422,12 @@ def my_nullspace(At, rcond=None):
if args.mode == "speakers_factorized":
criterion = cr.SpeakerDoubleCriterion(dim_features, dim_inter, len(speakers))
else:
criterion = cr.SpeakerCriterion(dim_features, len(speakers))
criterion = cr.SpeakerCriterion(dim_features, len(speakers), nLayers=args.n_layers, dimClassifier=args.dim_classifier)
criterion.cuda()
criterion = torch.nn.DataParallel(criterion, device_ids=range(args.nGPU))

if args.mode != "phonemes_nullspace" and args.mode != "speakers_nullspace":
model.disableSmartaveragingLossParameter()
model.cuda()
model = torch.nn.DataParallel(model, device_ids=range(args.nGPU))

Expand Down
10 changes: 9 additions & 1 deletion cpc/feature_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ def getEncoder(args):
elif args.encoder_type == 'lfb':
from .model import LFBEnconder
return LFBEnconder(args.hiddenEncoder)
elif args.encoder_type == 'smart' and not args.smartpoolingInAR:
from .model import CPCSmartpoolEncoder
return CPCSmartpoolEncoder(args.hiddenEncoder, args.normMode, smartpoolingLayer=args.smartpoolingLayer, noPadding=args.smartpoolingNoPadding, dimMlp=args.smartpoolingDimMlp, useDifferences=args.smartpoolingUseDifferences, temperature=args.smartpoolingTemperature, smartaveragingHardcodedWeights=args.smartaveragingHardcodedWeights, smartaveragingWindowSize=args.smartaveragingWindowSize, smartaveragingLossParameter=args.smartaveragingLossParameter, smartaveragingHardcodedWindowSize=args.smartaveragingHardcodedWindowSize)
else:
from .model import CPCEncoder
return CPCEncoder(args.hiddenEncoder, args.normMode)
Expand All @@ -161,11 +164,13 @@ def getAR(args):
arNet = NoAr()
else:
from .model import CPCAR
smartpoolingConfig = (args.smartpoolingDimMlp, args.smartpoolingUseDifferences, args.smartpoolingTemperature, args.smartaveragingHardcodedWeights, args.smartaveragingWindowSize, args.smartaveragingLossParameter is not None, args.smartaveragingHardcodedWindowSize) if args.smartpoolingInAR else None
arNet = CPCAR(args.hiddenEncoder, args.hiddenGar,
args.samplingType == "sequential",
args.nLevelsGRU,
mode=args.arMode,
reverse=args.cpc_mode == "reverse")
reverse=args.cpc_mode == "reverse",
smartpoolingConfig=smartpoolingConfig)
return arNet


Expand Down Expand Up @@ -213,6 +218,9 @@ def loadModel(pathCheckpoints, loadStateDict=True, load_nullspace=False, updateC
else:
m_.load_state_dict(state_dict["best"], strict=False)

if locArgs.smartaveragingLossParameter is not None:
m_.smartaveragingLossParameter = locArgs.smartaveragingLossParameter
m_.smartpoolingInAR = locArgs.smartpoolingInAR

if not doLoad:
hiddenGar += locArgs.hiddenGar
Expand Down
Loading