Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed tokenizer and audio processing logic #214

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c53d130
Update dependencies and fix language detection typo
1amageek Oct 3, 2024
2608340
Update AudioEncoder shape access and add tokenizer methods
1amageek Oct 3, 2024
b8029fb
Add `Sendable` conformance to several structs and enums
1amageek Oct 4, 2024
2af7d50
Refactor AudioProcessor to use actor model and async
1amageek Oct 5, 2024
dd1c4d5
Add Sendable conformance to various types and protocols
1amageek Oct 5, 2024
208893d
Update development team and package dependencies
1amageek Oct 5, 2024
9539e8b
Update package version and clean up code formatting
1amageek Oct 5, 2024
769dc29
Refactor audio energy calculations and buffer conversion
1amageek Oct 5, 2024
c8219d3
Refactor calculateRelativeEnergy method for clarity
1amageek Oct 5, 2024
2c0549c
Optimize audio buffer processing with vDSP_mmov
1amageek Oct 5, 2024
22aaa70
Refactor audio sample access methods in AudioProcessor
1amageek Oct 5, 2024
8ceaa0a
Remove unnecessary weak self references in closure
1amageek Oct 5, 2024
4d4233e
Refactor audio processing to use async/await methods
1amageek Oct 5, 2024
f9bcd1d
Use weak self in audio tap closure to prevent retain cycle
1amageek Oct 5, 2024
ea5d853
Log file name in error message for transcriber
1amageek Oct 5, 2024
5909d11
Refactor VADAudioChunker to a struct from a class
1amageek Oct 5, 2024
184b990
Refactor voice activity detection to use protocols
1amageek Oct 5, 2024
933b71b
Add audio converter initialization in resampling process
1amageek Oct 5, 2024
2dbb87f
Refactor AudioProcessor to use SampleRange type
1amageek Oct 6, 2024
368333f
Make AudioProcessing conform to Actor protocol
1amageek Oct 6, 2024
c41fb22
Refactor SegmentSeeker to improve readability and performance
1amageek Oct 6, 2024
621b1f3
Refactor SegmentSeeker to improve clarity and efficiency
1amageek Oct 6, 2024
f646268
Refactor SegmentSeeker to simplify alignment handling
1amageek Oct 6, 2024
db66166
Refactor SegmentSeeker to handle Float16 data type
1amageek Oct 6, 2024
28f34c3
Remove unnecessary comments in SegmentSeeker.swift
1amageek Oct 6, 2024
bb66ae1
Refactor SegmentSeeker for improved clarity and performance
1amageek Oct 6, 2024
8bfdd88
Refactor audio processor deinit and improve memory management
1amageek Oct 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactor audio energy calculations and buffer conversion
1amageek committed Oct 5, 2024
commit 769dc29fa03b2669d44b3ca4544a0da606e9f735
70 changes: 26 additions & 44 deletions Sources/WhisperKit/Core/Audio/AudioProcessor.swift
Original file line number Diff line number Diff line change
@@ -490,19 +490,21 @@ public actor AudioProcessor: @preconcurrency AudioProcessing {
var rmsEnergy: Float = 0.0
var minEnergy: Float = 0.0
var maxEnergy: Float = 0.0

// Calculate the root mean square of the signal
vDSP_rmsqv(signal, 1, &rmsEnergy, vDSP_Length(signal.count))

// Calculate the maximum sample value of the signal
vDSP_maxmgv(signal, 1, &maxEnergy, vDSP_Length(signal.count))

// Calculate the minimum sample value of the signal
vDSP_minmgv(signal, 1, &minEnergy, vDSP_Length(signal.count))

vDSP_maxv(signal, 1, &maxEnergy, vDSP_Length(signal.count))
vDSP_minv(signal, 1, &minEnergy, vDSP_Length(signal.count))
return (rmsEnergy, maxEnergy, minEnergy)
}

/// Computes how energetic `signal` is compared with a reference level, as a value in `0...1`.
///
/// Both the signal's average energy and the reference are converted to decibels
/// (`20 * log10`), and the signal level is handed to `rescale(value:min:max:)` with the
/// reference level as `min` and `0` as `max`. The outcome is then clamped to `0...1`.
///
/// - Parameters:
///   - signal: Samples whose average energy is obtained from `calculateAverageEnergy(of:)`.
///   - reference: Reference energy; anything below `1e-8` is raised to `1e-8` first.
/// - Returns: The rescaled decibel level of `signal`, limited to the range `0...1`.
public static func calculateRelativeEnergy(of signal: [Float], relativeTo reference: Float) -> Float {
    let signalDecibels = 20 * log10(calculateAverageEnergy(of: signal))

    // Floor the reference so the logarithm below never receives zero or a negative value.
    let flooredReference = max(1e-8, reference)
    let referenceDecibels = 20 * log10(flooredReference)

    let scaled = rescale(value: signalDecibels, min: referenceDecibels, max: 0)
    let cappedAtOne = min(scaled, 1)
    return max(0, cappedAtOne)
}

public static func calculateRelativeEnergy(of signal: [Float], relativeTo reference: Float?) -> Float {
let signalEnergy = calculateAverageEnergy(of: signal)

@@ -522,41 +524,13 @@ public actor AudioProcessor: @preconcurrency AudioProcessing {
return max(0, min(normalizedEnergy, 1))
}

public static func convertBufferToArray(buffer: AVAudioPCMBuffer, chunkSize: Int = 1024) -> [Float] {
public static func convertBufferToArray(buffer: AVAudioPCMBuffer) -> [Float] {
guard let channelData = buffer.floatChannelData else {
return []
}

let frameLength = Int(buffer.frameLength)
let startPointer = channelData[0]

var result: [Float] = []
result.reserveCapacity(frameLength) // Reserve the capacity to avoid multiple allocations

var currentFrame = 0
while currentFrame < frameLength {
let remainingFrames = frameLength - currentFrame
let currentChunkSize = min(chunkSize, remainingFrames)

var chunk = [Float](repeating: 0, count: currentChunkSize)

chunk.withUnsafeMutableBufferPointer { bufferPointer in
vDSP_mmov(
startPointer.advanced(by: currentFrame),
bufferPointer.baseAddress!,
vDSP_Length(currentChunkSize),
1,
vDSP_Length(currentChunkSize),
1
)
}

result.append(contentsOf: chunk)
currentFrame += currentChunkSize

memset(startPointer.advanced(by: currentFrame - currentChunkSize), 0, currentChunkSize * MemoryLayout<Float>.size)
}

let result = Array(UnsafeBufferPointer(start: startPointer, count: frameLength))
return result
}

@@ -691,15 +665,23 @@ public extension AudioProcessor {
/// We have a new buffer, process and store it.
/// NOTE: Assumes audio is 16khz mono
func processBuffer(_ buffer: [Float]) {
let bufferCount = buffer.count
let previousCount = audioSamples.count
audioSamples.reserveCapacity(previousCount + bufferCount)
audioSamples.append(contentsOf: buffer)

// Find the lowest average energy of the last 20 buffers ~2 seconds
let minAvgEnergy = self.audioEnergy.suffix(20).reduce(Float.infinity) { min($0, $1.avg) }
let relativeEnergy = Self.calculateRelativeEnergy(of: buffer, relativeTo: minAvgEnergy)
// Energy calculation
let recentAudioEnergy = self.audioEnergy.suffix(relativeEnergyWindow)
let minAvgEnergy: Float
if recentAudioEnergy.isEmpty {
minAvgEnergy = 1e-8 // デフォルトの最小エネルギー値
} else {
minAvgEnergy = recentAudioEnergy.reduce(Float.infinity) { min($0, $1.avg) }
}

// Update energy for buffers with valid data
let relativeEnergy = Self.calculateRelativeEnergy(of: buffer, relativeTo: minAvgEnergy)
let signalEnergy = Self.calculateEnergy(of: buffer)
let newEnergy = (relativeEnergy, signalEnergy.avg, signalEnergy.max, signalEnergy.min)
let newEnergy = (rel: relativeEnergy, avg: signalEnergy.avg, max: signalEnergy.max, min: signalEnergy.min)
self.audioEnergy.append(newEnergy)
// Call the callback with the new buffer
audioBufferCallback?(buffer)