Highly parallelizable Viterbi decoding for CPU or GPU compute. Below are time benchmarks of our method relative to librosa.sequence.viterbi. We use 1440 states and ~20 million timesteps over ~40k files for benchmarking.
| Method | Timesteps decoded per second |
|---|---|
| Librosa (1x cpu) | 208 |
| Librosa (16x cpu) | 1,382* |
| Proposed (1x cpu) | 171 |
| Proposed (16x cpu) | 2,240 |
| Proposed (1x a40 gpu, batch size 1) | 3,944,452 |
| Proposed (1x a40 gpu, batch size 512) | 692,160,422 |
*By default, librosa.sequence.viterbi uses one CPU thread. We use a Multiprocessing pool to parallelize.
git clone git@github.com:maxrmorrison/torbi
pip install torbi/
If you receive an error message regarding mismatched CUDA versions, change the torch version in pyproject.toml to your currently installed version of torch.
To perform evaluation of the accuracy and speed of decoding methods, install torbi with the additional evaluation dependencies.
pip install torbi[evaluate]
import torbi
import torch
# Time-varying categorical distribution to decode
observation = torch.tensor([
[0.25, 0.5, 0.25],
[0.25, 0.25, 0.5],
[0.33, 0.33, 0.33]
]).unsqueeze(dim=0)
# Transition probabilities between categories
transition = torch.tensor([
[0.5, 0.25, 0.25],
[0.33, 0.34, 0.33],
[0.25, 0.25, 0.5]
])
# Initial category probabilities
initial = torch.tensor([0.4, 0.35, 0.25])
# Find optimal path using CPU compute
torbi.from_probabilities(
observation,
transition=transition,
initial=initial,
log_probs=False)
# Find optimal path using GPU compute
torbi.from_probabilities(
observation,
transition=transition,
initial=initial,
log_probs=False,
gpu=0)
def from_probabilities(
observation: torch.Tensor,
batch_frames: Optional[torch.Tensor] = None,
transition: Optional[torch.Tensor] = None,
initial: Optional[torch.Tensor] = None,
log_probs: bool = False,
gpu: Optional[int] = None,
num_threads: Optional[int] = 1
) -> torch.Tensor:
"""Decode a time-varying categorical distribution
Arguments
observation
Time-varying categorical distribution
shape=(batch, frames, states)
batch_frames
Number of frames in each batch item; defaults to all
shape=(batch,)
transition
Categorical transition matrix; defaults to uniform
shape=(states, states)
initial
Categorical initial distribution; defaults to uniform
shape=(states,)
log_probs
Whether inputs are in (natural) log space
gpu
GPU index to use for decoding. Defaults to CPU.
num_threads
The number of threads to use for parallelized decoding
Returns
indices
The decoded bin indices
shape=(batch, frames)
"""
def from_file(
input_file: Union[str, os.PathLike],
transition_file: Optional[Union[str, os.PathLike]] = None,
initial_file: Optional[Union[str, os.PathLike]] = None,
log_probs: bool = False,
gpu: Optional[int] = None,
num_threads: Optional[int] = 1
) -> torch.Tensor:
"""Decode a time-varying categorical distribution file
Arguments
input_file
Time-varying categorical distribution file
shape=(frames, states)
transition_file
Categorical transition matrix file; defaults to uniform
shape=(states, states)
initial_file
Categorical initial distribution file; defaults to uniform
shape=(states,)
log_probs
Whether inputs are in (natural) log space
gpu
GPU index to use for decoding. Defaults to CPU.
num_threads
The number of threads to use for parallelized decoding
Returns
indices
The decoded bin indices
shape=(frames,)
"""
def from_file_to_file(
input_file: Union[str, os.PathLike],
output_file: Union[str, os.PathLike],
transition_file: Optional[Union[str, os.PathLike]] = None,
initial_file: Optional[Union[str, os.PathLike]] = None,
log_probs: bool = False,
gpu: Optional[int] = None,
num_threads: Optional[int] = None
) -> None:
"""Decode a time-varying categorical distribution file and save
Arguments
input_file
Time-varying categorical distribution file
shape=(frames, states)
output_file
File to save decoded indices
transition_file
Categorical transition matrix file; defaults to uniform
shape=(states, states)
initial_file
Categorical initial distribution file; defaults to uniform
shape=(states,)
log_probs
Whether inputs are in (natural) log space
gpu
GPU index to use for decoding. Defaults to CPU.
num_threads
The number of threads to use for parallelized decoding
"""
def from_files_to_files(
input_files: List[Union[str, os.PathLike]],
output_files: List[Union[str, os.PathLike]],
transition_file: Optional[Union[str, os.PathLike]] = None,
initial_file: Optional[Union[str, os.PathLike]] = None,
log_probs: bool = False,
gpu: Optional[int] = None,
num_threads: Optional[int] = None
) -> None:
"""Decode time-varying categorical distribution files and save
Arguments
input_files
Time-varying categorical distribution files
shape=(frames, states)
output_files
Files to save decoded indices
transition_file
Categorical transition matrix file; defaults to uniform
shape=(states, states)
initial_file
Categorical initial distribution file; defaults to uniform
shape=(states,)
log_probs
Whether inputs are in (natural) log space
gpu
GPU index to use for decoding. Defaults to CPU.
num_threads
The number of threads to use for parallelized decoding
"""
usage: python -m torbi
[-h]
--input_files INPUT_FILES [INPUT_FILES ...]
--output_files OUTPUT_FILES [OUTPUT_FILES ...]
[--transition_file TRANSITION_FILE]
[--initial_file INITIAL_FILE]
[--log_probs]
[--gpu GPU]
[--num_threads NUM_THREADS]
arguments:
--input_files INPUT_FILES [INPUT_FILES ...]
Time-varying categorical distribution files
--output_files OUTPUT_FILES [OUTPUT_FILES ...]
Files to save decoded indices
optional arguments:
-h, --help
show this help message and exit
--transition_file TRANSITION_FILE
Categorical transition matrix file; defaults to uniform
--initial_file INITIAL_FILE
Categorical initial distribution file; defaults to uniform
--log_probs
Whether inputs are in (natural) log space
--gpu GPU
GPU index to use for decoding. Defaults to CPU.
--num_threads NUM_THREADS
The number of threads to use for parallelized CPU decoding
python -m torbi.data.download
Downloads and decompresses the daps and vctk datasets used for evaluation.
python -m torbi.data.preprocess --gpu 0
Preprocess the dataset to prepare time-varying categorical distributions for
evaluation. The distributions are pitch posteriorgrams produced by the penn
pitch estimator.
python -m torbi.partition
Select all examples in dataset for evaluation.
python -m torbi.evaluate --config <config> --gpu <gpu>
Evaluates the accuracy and speed of decoding methods. <gpu> is the GPU index.
M. Morrison, C. Churchwell, N. Pruyne, and B. Pardo, "Fine-Grained and Interpretable Neural Speech Editing," Interspeech, September 2024.
@inproceedings{morrison2024fine,
title={Fine-Grained and Interpretable Neural Speech Editing},
author={Morrison, Max and Churchwell, Cameron and Pruyne, Nathan and Pardo, Bryan},
booktitle={Interspeech},
month={September},
year={2024}
}