Add additional audio attack methods for enhanced evaluation #57

Closed · wants to merge 1 commit
examples/attacks.py (265 additions & 1 deletion)
@@ -13,7 +13,7 @@

import julius
import torch

import torchaudio

def generate_pink_noise(length: int) -> torch.Tensor:
"""
@@ -294,3 +294,267 @@ def shush(
shush_tensor[:, :, :int(fraction*time)] = 0.0

return audio_effect_return(tensor=shush_tensor, mask=mask)

@staticmethod
def pitch_shift(
tensor: torch.Tensor,
sample_rate: int = 16000,
n_steps: float = 2.0,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Change the pitch of the audio signal by a given number of steps.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- sample_rate (int): Sample rate of the audio signal.
- n_steps (float): Number of pitch steps to shift (positive for higher pitch, negative for lower pitch).
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Pitch-shifted audio tensor.
"""
pitch_shifted = torchaudio.transforms.PitchShift(sample_rate, n_steps=n_steps)(tensor)
return audio_effect_return(tensor=pitch_shifted, mask=mask)

@staticmethod
def reverse(
tensor: torch.Tensor,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Reverse the audio signal.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Reversed audio tensor.
"""
reversed_tensor = torch.flip(tensor, dims=[-1])
return audio_effect_return(tensor=reversed_tensor, mask=mask)

@staticmethod
def clipping(
tensor: torch.Tensor,
clip_value: float = 0.5,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Clip the audio signal to a specific threshold value, distorting the signal.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- clip_value (float): Threshold for clipping the audio signal.
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Clipped audio tensor.
"""
clipped_tensor = torch.clamp(tensor, min=-clip_value, max=clip_value)
return audio_effect_return(tensor=clipped_tensor, mask=mask)


@staticmethod
def time_stretch(
tensor: torch.Tensor,
stretch_factor: float = 1.2,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Stretch the audio signal in time without changing its pitch.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- stretch_factor (float): Factor by which to stretch the audio (greater than 1 for slower, less than 1 for faster).
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Time-stretched audio tensor.
"""
# julius does not provide a time_stretch function; use torchaudio's phase-vocoder TimeStretch on a complex spectrogram instead.
n_fft, hop_length = 1024, 256
spec = torchaudio.transforms.Spectrogram(n_fft=n_fft, hop_length=hop_length, power=None)(tensor)
rate = 1.0 / stretch_factor  # TimeStretch rates above 1 speed the signal up, so invert the factor
stretched_spec = torchaudio.transforms.TimeStretch(hop_length=hop_length, n_freq=n_fft // 2 + 1)(spec, rate)
stretched_tensor = torchaudio.transforms.InverseSpectrogram(n_fft=n_fft, hop_length=hop_length)(stretched_spec)
return audio_effect_return(tensor=stretched_tensor, mask=mask)

@staticmethod
def tremolo(
tensor: torch.Tensor,
frequency: float = 5.0,
depth: float = 0.5,
sample_rate: int = 16000,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Apply a tremolo effect to the audio signal by modulating its amplitude.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- frequency (float): Frequency of the tremolo effect in Hz.
- depth (float): Depth of modulation (between 0 and 1).
- sample_rate (int): Sample rate of the audio signal.
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Audio tensor with tremolo effect applied.
"""
time = torch.arange(tensor.shape[-1], device=tensor.device) / sample_rate
# Gain oscillates between (1 - depth) and 1, so depth = 0 leaves the signal unchanged.
modulation = 1.0 - depth * 0.5 * (1.0 + torch.sin(2 * torch.pi * frequency * time))
tremolo_tensor = tensor * modulation.unsqueeze(0).unsqueeze(0)
return audio_effect_return(tensor=tremolo_tensor, mask=mask)

@staticmethod
def flanger(
tensor: torch.Tensor,
delay: float = 0.002,
depth: float = 0.002,
rate: float = 0.25,
sample_rate: int = 16000,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Apply a flanger effect to the audio signal by mixing a delayed version of the signal with itself.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- delay (float): Base delay time in seconds.
- depth (float): Depth of the delay modulation.
- rate (float): Rate of modulation in Hz.
- sample_rate (int): Sample rate of the audio signal.
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Audio tensor with flanger effect applied.
"""
time = torch.arange(tensor.shape[-1], device=tensor.device) / sample_rate
lfo = torch.sin(2 * torch.pi * rate * time) * depth + delay
delay_samples = (lfo * sample_rate).long()
# Read each sample from a time-varying delayed position rather than indexing by the raw LFO value itself.
delayed_index = (torch.arange(tensor.shape[-1], device=tensor.device) - delay_samples).clamp(min=0)
delayed_signal = tensor[..., delayed_index]
flanger_tensor = tensor + delayed_signal
return audio_effect_return(tensor=flanger_tensor, mask=mask)

@staticmethod
def distortion(
tensor: torch.Tensor,
gain: float = 20.0,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Apply a distortion effect by amplifying the audio signal and then soft-clipping it with tanh.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- gain (float): Gain factor for amplifying the signal.
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Distorted audio tensor.
"""
amplified = tensor * gain
distorted = torch.tanh(amplified)
return audio_effect_return(tensor=distorted, mask=mask)

@staticmethod
def bit_crusher(
tensor: torch.Tensor,
bit_depth: int = 8,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Apply a bit crusher effect by reducing the bit depth of the audio signal.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- bit_depth (int): Bit depth to reduce to (e.g., 8 bits).
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Audio tensor with reduced bit depth.
"""
# Use 2 ** (bit_depth - 1) quantization steps per polarity so the result matches bit_depth-bit signed audio.
scale = 2 ** (bit_depth - 1)
crushed_tensor = torch.round(tensor * scale) / scale
return audio_effect_return(tensor=crushed_tensor, mask=mask)

@staticmethod
def vocoder(
tensor: torch.Tensor,
modulation_frequency: float = 100.0,
sample_rate: int = 16000,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Apply a simple vocoder-style effect by modulating the audio signal with a sinusoidal carrier frequency.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- modulation_frequency (float): Frequency of the modulation in Hz.
- sample_rate (int): Sample rate of the audio signal.
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Vocoded audio tensor.
"""
time = torch.arange(tensor.shape[-1], device=tensor.device) / sample_rate
carrier = torch.sin(2 * torch.pi * modulation_frequency * time)
vocoded_tensor = tensor * carrier.unsqueeze(0).unsqueeze(0)
return audio_effect_return(tensor=vocoded_tensor, mask=mask)

@staticmethod
def ring_modulation(
tensor: torch.Tensor,
modulation_frequency: float = 30.0,
sample_rate: int = 16000,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Apply a ring modulation effect to the audio signal, creating a metallic sound.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- modulation_frequency (float): Frequency of the modulation in Hz.
- sample_rate (int): Sample rate of the audio signal.
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Ring-modulated audio tensor.
"""
time = torch.arange(tensor.shape[-1], device=tensor.device) / sample_rate
modulation = torch.sin(2 * torch.pi * modulation_frequency * time)
ring_modulated_tensor = tensor * modulation.unsqueeze(0).unsqueeze(0)
return audio_effect_return(tensor=ring_modulated_tensor, mask=mask)

@staticmethod
def granulate(
tensor: torch.Tensor,
grain_size: int = 512,
overlap: float = 0.5,
mask: tp.Optional[torch.Tensor] = None,
) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
"""
Apply a granulation effect by breaking the audio into small overlapping grains.

Parameters:
- tensor (torch.Tensor): Input audio tensor, assuming shape (batch_size, channels, time).
- grain_size (int): Size of each grain in samples.
- overlap (float): Overlap ratio between grains (0 to 1).
- mask (torch.Tensor): Optional mask tensor.

Returns:
- torch.Tensor: Granulated audio tensor.
"""
step_size = max(1, int(grain_size * (1 - overlap)))
grains = [tensor[..., i:i + grain_size] for i in range(0, max(tensor.shape[-1] - grain_size, 1), step_size)]
# Concatenating overlapping grains lengthens the signal, so trim back to the original duration to stay aligned with the optional mask.
granulated_tensor = torch.cat(grains, dim=-1)[..., :tensor.shape[-1]]
return audio_effect_return(tensor=granulated_tensor, mask=mask)
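
For reviewers who want to exercise the new effects end to end, here is a minimal evaluation sketch. It assumes the methods are added to the repository's attack-effects class (called `AudioEffects` below purely for illustration; the class name is not visible in this diff) and that `audio_effect_return` yields an `(audio, mask)` pair when a mask is supplied. Both are assumptions rather than facts confirmed by the diff.

import torch

# Hypothetical import path and class name; adjust to the actual module in examples/attacks.py.
from attacks import AudioEffects

# One second of dummy audio shaped (batch_size, channels, time) at 16 kHz.
waveform = torch.randn(1, 1, 16000)
mask = torch.ones_like(waveform)

# A few of the newly added attacks with representative parameters.
attacks = {
    "reverse": lambda x, m: AudioEffects.reverse(x, mask=m),
    "clipping": lambda x, m: AudioEffects.clipping(x, clip_value=0.25, mask=m),
    "tremolo": lambda x, m: AudioEffects.tremolo(x, frequency=4.0, depth=0.8, mask=m),
    "bit_crusher": lambda x, m: AudioEffects.bit_crusher(x, bit_depth=6, mask=m),
    "ring_modulation": lambda x, m: AudioEffects.ring_modulation(x, modulation_frequency=30.0, mask=m),
}

for name, attack in attacks.items():
    # Assumes (audio, mask) is returned whenever a mask is passed in.
    attacked, attacked_mask = attack(waveform, mask)
    print(f"{name}: output shape {tuple(attacked.shape)}")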