Skip to content

Commit 042cc8d

Browse files
authored
Merge pull request #489 from classtranscribe/UpdateForWhisper
Update for whisper
2 parents 6a78953 + 905ab4c commit 042cc8d

25 files changed

+1573
-64
lines changed

ClassTranscribeDatabase/CommonUtils.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ public enum TaskType
2323
DownloadPlaylistInfo = 3,
2424
DownloadMedia = 4,
2525
ConvertMedia = 5,
26-
TranscribeVideo = 6,
26+
// TranscribeVideo = 6,
2727
ProcessVideo = 7,
2828
Aggregator = 8,
2929
GenerateVTTFile = 9,
@@ -39,7 +39,9 @@ public enum TaskType
3939
PythonCrawler = 19,
4040

4141
DescribeVideo = 20,
42-
DescribeImage = 21
42+
DescribeImage = 21,
43+
AzureTranscribeVideo = 22,
44+
LocalTranscribeVideo = 23
4345

4446
}
4547

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"sdk": {
3-
"version": "8.0.201"
3+
"version": "8.0"
44
}
55
}

ClassTranscribeServer/Controllers/PlaylistsController.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ public async Task<ActionResult<IEnumerable<PlaylistDTO>>> GetPlaylists2(string o
170170
JsonMetadata = m.JsonMetadata,
171171
CreatedAt = m.CreatedAt,
172172
SceneDetectReady = m.Video.HasSceneObjectData(),
173-
Ready = m.Video != null && "NoError" == m.Video.TranscriptionStatus ,
173+
Ready = m.Video != null && Video.TranscriptionStatusMessages.NOERROR == m.Video.TranscriptionStatus ,
174174
SourceType = m.SourceType,
175175
Duration = m.Video?.Duration,
176176
PublishStatus = m.PublishStatus,
@@ -265,7 +265,7 @@ public async Task<ActionResult<PlaylistDTO>> GetPlaylist(string id)
265265
PublishStatus = m.PublishStatus,
266266
Options = m.GetOptionsAsJson(),
267267
SceneDetectReady = m.Video != null && m.Video.HasSceneObjectData(),
268-
Ready = m.Video != null && "NoError" == m.Video.TranscriptionStatus ,
268+
Ready = m.Video != null && Video.TranscriptionStatusMessages.NOERROR == m.Video.TranscriptionStatus ,
269269
Video = m.Video == null ? null : new VideoDTO
270270
{
271271
Id = m.Video.Id,

ClassTranscribeServer/Utils/WakeDownloader.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ public void TranscribeVideo(string videoOrMediaId, bool deleteExisting)
104104
{
105105
JObject msg = new JObject
106106
{
107-
{ "Type", TaskType.TranscribeVideo.ToString() },
107+
{ "Type", TaskType.LocalTranscribeVideo.ToString() },
108108
{ "videoOrMediaId", videoOrMediaId },
109109
{ "DeleteExisting", deleteExisting }
110110
};

ClassTranscribeServer/global.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"sdk": {
3-
"version": "8.0.401"
3+
"version": "8.0"
44
}
55
}

PythonRpcServer/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
venv/

PythonRpcServer/requirements.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ wcwidth==0.2.13
3232

3333
# Not versioned
3434
numpy
35-
pytube # if not available, use the tar.gz package (see Dockerfile)
36-
35+
# pytube is no longer maintained and has been replaced by yt-dlp below (old note: if not available, use the tar.gz package; see Dockerfile)
36+
yt-dlp
37+
# yt-dlp is deliberately unpinned so that the latest release is always installed
3738

3839
# protobuf version 3.18.3 causes NotImplementedError("To be implemented") in PythonRpcServer/mediaprovider.py
3940
# Likely need to coordinate updating the C# version too

PythonRpcServer/server.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
from echo import EchoProvider
1313
from kaltura import KalturaProvider
1414
from mediaprovider import InvalidPlaylistInfoException
15+
from transcribe import transcribe_audio
16+
17+
import json
1518
import hasher
1619
import ffmpeg
1720
# import phrasehinter
@@ -41,6 +44,18 @@ def LogWorker(logId, worker):
4144

4245

4346
class PythonServerServicer(ct_pb2_grpc.PythonServerServicer):
47+
# TODO: transcribe into a JSON string built from the transcribed text
48+
# Make it return a JSON string
49+
# change name to TranscribeRPC
50+
# def CaptionRPC(self, request, context):
51+
# #See CaptionRequest
52+
# print( f"CaptionRPC({request.logId};{request.refId};{request.filePath};{request.phraseHints};{request.courseHints};{request.outputLanguages})")
53+
# kalturaprovider = KalturaProvider()
54+
# result = LogWorker(f"CaptionRPC({request.filePath})", lambda: kalturaprovider.getCaptions(request.refId))
55+
# return ct_pb2.JsonString(json = result)
56+
57+
58+
4459
def GetScenesRPC(self, request, context):
4560
raise NotImplementedError('Implementation now in pyapi')
4661
# res = scenedetector.find_scenes(request.filePath)
@@ -113,14 +128,31 @@ def ComputeFileHash(self, request, context):
113128
def GetMediaInfoRPC(self, request, context):
    """Run ffmpeg.getMediaInfo on request.filePath and wrap the result in a JsonString.

    The work is executed through LogWorker, labelled with the file path.
    """
    def probe():
        return ffmpeg.getMediaInfo(request.filePath)

    media_info = LogWorker(f"GetMediaInfo({request.filePath})", probe)
    return ct_pb2.JsonString(json=media_info)
131+
132+
133+
def TranscribeAudioRPC(self, request, context):
    """Transcribe request.filePath and return the result as a JsonString.

    The transcription runs through LogWorker. On any exception the gRPC
    status is set to INTERNAL with the error text as details, and a
    JsonString containing {"error": <message>} is returned.
    """
    print(f"TranscribeAudioRPC({request.logId};{request.filePath})")
    try:
        logging.info(f"Starting transcription for file: {request.filePath}")

        def run_transcription():
            return transcribe_audio(request.filePath, request.testing)

        worker_label = f"TranscribeAudioRPC({request.filePath})"
        transcription_result = LogWorker(worker_label, run_transcription)
        logging.info(f"Transcription completed successfully for: {request.filePath}")

        payload = json.dumps(transcription_result)
        return ct_pb2.JsonString(json=payload)
    except Exception as e:
        # Report the failure through the gRPC status as well as in the body.
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details(f"Transcription failed: {str(e)}")
        error_payload = json.dumps({"error": str(e)})
        return ct_pb2.JsonString(json=error_payload)
116148

117149
def serve():
118150
print("Python RPC Server Starting")
119151

120152
# Until we can ensure no timeouts on remote services, the default here is set to a conservative low number
121153
# This is to ensure we can still make progress even if every Python task tries to use all CPU cores.
122154
max_workers=int(os.getenv('NUM_PYTHON_WORKERS', 3))
123-
print(f"max_workers={max_workers}")
155+
print(f"max_workers={max_workers}. Starting up grpc server...")
124156

125157
server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))
126158

PythonRpcServer/transcribe.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import os
2+
import subprocess
3+
import json
4+
from time import perf_counter
5+
from ffmpy import FFmpeg
6+
import utils
7+
8+
# Path to the Whisper executable inside the container
9+
WHISPER_EXECUTABLE = os.environ.get('WHISPER_EXE','whisper') # Overridable via the WHISPER_EXE environment variable; defaults to the bare command name 'whisper'
10+
# Model file passed to Whisper with '-m'; overridable via the WHISPER_MODEL environment variable
MODEL = os.environ.get('WHISPER_MODEL','models/ggml-base.en.bin')
11+
12+
def convert_video_to_wav(input_filepath, offset=None):
    """Extract the audio of a media file into a temporary WAV file via ffmpy.

    The audio is written as mono, 16 kHz, signed 16-bit PCM, starting at
    `offset` (passed to ffmpeg's -ss; None means 0.0).

    Returns:
        A tuple (output_filepath, '.wav'); the output path comes from
        utils.getTmpFile().

    Raises:
        Any exception raised during conversion is printed and re-raised.
    """
    try:
        started = perf_counter()
        offset = 0.0 if offset is None else offset

        nthreads = utils.getMaxThreads()

        print(f"Converting video '{input_filepath}' to WAV with offset {offset} using {nthreads} thread(s).")
        output_filepath = utils.getTmpFile()
        ext = '.wav'

        global_opts = f"-hide_banner -loglevel error -nostats -threads {nthreads}"
        input_opts = {input_filepath: f'-ss {offset}'}
        output_opts = {output_filepath: '-c:a pcm_s16le -ac 1 -y -ar 16000 -f wav'}
        converter = FFmpeg(
            global_options=global_opts,
            inputs=input_opts,
            outputs=output_opts,
        )
        print(f"Starting conversion. Audio output will be saved in {output_filepath}")
        converter.run()

        elapsed = perf_counter() - started
        print(f"Conversion complete. Duration: {int(elapsed)} seconds")
        return output_filepath, ext
    except Exception as err:
        print("Exception during conversion:" + str(err))
        raise err
40+
41+
def transcribe_audio(media_filepath, testing=False):
    """Transcribe a media file with the Whisper executable and return its JSON output.

    Args:
        media_filepath: Path of the media file. A path that does not end in
            '.wav' is first converted with convert_video_to_wav(). The
            sentinel value 'TEST-transcribe_example_result' returns the
            canned result in 'transcribe_exampleffmp_result.json' instead.
        testing: When true, skip transcription and return the canned result
            stored at /PythonRpcServer/transcribe_hellohellohello.wav.json.

    Returns:
        The object parsed from the JSON file written by Whisper.

    Raises:
        FileNotFoundError: If the media file, or Whisper's JSON output file,
            does not exist.
        RuntimeError: If the Whisper process exits with a non-zero status.
    """
    if testing:
        json_output_path = "/PythonRpcServer/transcribe_hellohellohello.wav.json"
        with open(json_output_path, 'r') as json_file:
            transcription_result = json.load(json_file)

        # Print the transcription result (testing purpose)
        print("Transcription result:")
        print(json.dumps(transcription_result, indent=4))

        return transcription_result

    if media_filepath == 'TEST-transcribe_example_result':
        result_json_file = 'transcribe_exampleffmp_result.json'
        with open(result_json_file, 'r') as json_file:
            return json.load(json_file)

    # Ensure the media file exists
    if not os.path.exists(media_filepath):
        raise FileNotFoundError(f"Media file not found: {media_filepath}")

    # Convert to WAV if needed, remembering whether we own a temporary file
    wav_created = False
    if not media_filepath.endswith('.wav'):
        media_filepath, _ = convert_video_to_wav(media_filepath)
        wav_created = True

    # Path of the JSON file that Whisper writes next to its input
    json_output_path = f"{media_filepath}.json"

    try:
        # Remove stale output so a leftover file is never mistaken for a new result
        if os.path.exists(json_output_path):
            os.remove(json_output_path)

        whisper_command = [
            WHISPER_EXECUTABLE,     # Whisper executable
            '-ojf',                 # Output as JSON file
            '-f', media_filepath,   # Media file path
            '-m', MODEL,
        ]

        print("Running Whisper transcription inside the container...")
        result = subprocess.run(whisper_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0:
            # Decode leniently: undecodable bytes must not hide the real failure
            stderr_text = result.stderr.decode('utf-8', errors='replace')
            raise RuntimeError(f"Whisper failed with error:\n{stderr_text}")

        print(f"Checking for JSON output at: {json_output_path}")
        if not os.path.exists(json_output_path):
            raise FileNotFoundError(f"Expected JSON output file not found: {json_output_path}")

        with open(json_output_path, 'r') as json_file:
            transcription_result = json.load(json_file)

        # Print the transcription result (testing purpose)
        print("Transcription result:")
        print(json.dumps(transcription_result, indent=4))

        return transcription_result
    finally:
        # Clean up on failure as well as success, so failed transcriptions do
        # not leave temporary WAV/JSON files behind. Deletion is best-effort
        # so that a cleanup error never masks the original exception.
        if os.path.exists(json_output_path):
            _remove_file_best_effort(json_output_path, "JSON")
        if wav_created:
            _remove_file_best_effort(media_filepath, "WAV")


def _remove_file_best_effort(path, label):
    """Delete `path`, printing the outcome instead of raising on failure."""
    try:
        os.remove(path)
        print(f"Deleted the {label} file: {path}")
    except OSError as e:
        print(f"Error deleting {label} file: {str(e)}")
117+
118+
# Example usage: load the canned transcription result and pretty-print it
if __name__ == '__main__':
    # Location of the sample Whisper JSON output inside the container
    sample_json_path = "/PythonRpcServer/transcribe_hellohellohello.wav.json"
    with open(sample_json_path, 'r') as sample_file:
        sample_result = json.load(sample_file)

    pretty_result = json.dumps(sample_result, indent=4)
    print("Transcription Result:", pretty_result)
126+

0 commit comments

Comments
 (0)