# audio_video_sync.py
import torch
import torchaudio
import os
import subprocess
from datetime import datetime
import math
from PIL import Image
# import logging
import torchvision.transforms as transforms
class AudioVideoSync:
"""
ComfyUI custom node for synchronizing audio and video with configurable speed adjustments.
Supports both video files and image sequences as input, as well as audio files or AUDIO objects.
"""
def __init__(self):
"""Initialize the AudioVideoSync node."""
self.base_dir = "Bjornulf"
self.temp_dir = os.path.join(self.base_dir, "temp_frames")
self.sync_video_dir = os.path.join(self.base_dir, "sync_video")
self.sync_audio_dir = os.path.join(self.base_dir, "sync_audio")
# Create necessary directories
for directory in [self.temp_dir, self.sync_video_dir, self.sync_audio_dir]:
os.makedirs(directory, exist_ok=True)
@classmethod
def INPUT_TYPES(cls):
"""Define input parameters for the node."""
return {
"required": {
"max_speedup": ("FLOAT", {
"default": 1.5,
"min": 1.0,
"max": 10.0,
"step": 0.1
}),
"max_slowdown": ("FLOAT", {
"default": 0.5,
"min": 0.1,
"max": 1.0,
"step": 0.1
}),
},
"optional": {
"IMAGES": ("IMAGE",),
"AUDIO": ("AUDIO",),
"audio_path": ("STRING", {"default": "", "forceInput": True}),
"audio_duration": ("FLOAT", {
"default": 0.0,
"min": 0.0,
"max": 3600.0,
"step": 0.001
}),
"video_path": ("STRING", {
"default": "",
"forceInput": True
}),
"output_fps": ("FLOAT", {
"default": 30.0,
"min": 1.0,
"max": 120.0,
"step": 0.1
}),
}
}
RETURN_TYPES = ("IMAGE", "AUDIO", "STRING", "STRING", "FLOAT", "FLOAT", "FLOAT", "FLOAT", "INT")
RETURN_NAMES = ("sync_IMAGES", "sync_AUDIO", "sync_audio_path", "sync_video_path",
"input_video_duration", "sync_video_duration", "input_audio_duration", "sync_audio_duration",
"sync_video_frame_count")
FUNCTION = "sync_audio_video"
CATEGORY = "Bjornulf"
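    # The attributes above follow ComfyUI's node contract: RETURN_TYPES and
    # RETURN_NAMES declare the node's outputs, FUNCTION names the entry-point
    # method, and CATEGORY sets where the node appears in the menu.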
def generate_timestamp(self):
"""Generate a unique timestamp for file naming."""
return datetime.now().strftime("%Y%m%d_%H%M%S")
def validate_audio_input(self, audio):
"""Validate the audio input format."""
if not isinstance(audio, dict) or 'waveform' not in audio or 'sample_rate' not in audio:
raise ValueError("Expected audio input to be a dictionary with 'waveform' and 'sample_rate' keys")
def validate_speed_limits(self, max_speedup, max_slowdown):
"""Validate the speed limit parameters."""
if max_speedup < 1.0:
raise ValueError("max_speedup must be greater than or equal to 1.0")
if max_slowdown > 1.0:
raise ValueError("max_slowdown must be less than or equal to 1.0")
def get_audio_duration(self, audio):
"""Calculate audio duration from audio input."""
if isinstance(audio, dict) and 'waveform' in audio and 'sample_rate' in audio:
return audio['waveform'].shape[-1] / audio['sample_rate']
else:
raise ValueError("Invalid audio input format")
def ffprobe_run(self, cmd):
"""Run ffprobe command and return the output."""
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                text=True, check=True)  # fail fast if ffprobe errors out
        return result.stdout.strip()
def get_video_info(self, video_path):
"""Get video duration, fps, and frame count."""
duration = float(self.ffprobe_run([
'ffprobe', '-v', 'error',
'-show_entries', 'format=duration',
'-of', 'default=noprint_wrappers=1:nokey=1',
video_path
]))
fps_str = self.ffprobe_run([
'ffprobe', '-v', 'error',
'-select_streams', 'v:0',
'-show_entries', 'stream=r_frame_rate',
'-of', 'csv=p=0',
video_path
])
        num, _, den = fps_str.partition('/')
        fps = float(num) / float(den) if den else float(fps_str)  # e.g. '30000/1001' -> ~29.97 fps
frame_count = int(self.ffprobe_run([
'ffprobe', '-v', 'error',
'-count_packets',
'-select_streams', 'v:0',
'-show_entries', 'stream=nb_read_packets',
'-of', 'csv=p=0',
video_path
]))
return duration, fps, frame_count
def process_images_to_video(self, IMAGES, fps):
"""Convert image sequence to video."""
timestamp = self.generate_timestamp()
temp_dir = os.path.join(self.temp_dir, f"frames_{timestamp}")
os.makedirs(temp_dir, exist_ok=True)
# Save frames
frame_paths = []
for i, img in enumerate(IMAGES):
if isinstance(img, torch.Tensor):
if img.dim() == 4:
img = img.squeeze(0)
                img = (img.clamp(0, 1) * 255).byte().cpu().numpy()  # clamp guards against out-of-range floats
img = Image.fromarray(img)
frame_path = os.path.join(temp_dir, f"frame_{i:05d}.png")
img.save(frame_path)
frame_paths.append(frame_path)
# Create video
output_path = os.path.join(self.temp_dir, f"video_{timestamp}.mp4")
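        # libx264 with yuv420p keeps the output playable in common players;
        # CRF 19 is a visually near-lossless quality setting.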
subprocess.run([
'ffmpeg', '-y',
'-framerate', str(fps),
'-i', os.path.join(temp_dir, 'frame_%05d.png'),
'-c:v', 'libx264',
'-pix_fmt', 'yuv420p',
'-preset', 'medium',
'-crf', '19',
output_path
], check=True)
# Cleanup
for path in frame_paths:
os.remove(path)
os.rmdir(temp_dir)
return output_path
def adjust_video_speed(self, video_path, speed_factor, output_path):
"""Adjust video speed using ffmpeg."""
pts_speed = 1 / speed_factor
subprocess.run([
'ffmpeg', '-y',
'-i', video_path,
'-filter:v', f'setpts={pts_speed}*PTS',
'-an',
'-c:v', 'libx264',
'-preset', 'medium',
'-crf', '19',
output_path
], check=True)
def create_sync_video(self, video_path, original_duration, target_duration, max_speedup, max_slowdown):
"""Create synchronized version of the video."""
timestamp = self.generate_timestamp()
output_path = os.path.join(self.sync_video_dir, f"sync_video_{timestamp}.mp4")
if target_duration > original_duration:
speed_ratio = original_duration / target_duration
if speed_ratio >= max_slowdown:
# Slow down video within limits
self.adjust_video_speed(video_path, speed_ratio, output_path)
else:
# Repeat video if slowdown would exceed limit
repeat_count = math.ceil(target_duration / original_duration)
concat_file = os.path.join(self.sync_video_dir, f"concat_{timestamp}.txt")
with open(concat_file, 'w') as f:
for _ in range(repeat_count):
f.write(f"file '{os.path.abspath(video_path)}'\n")
subprocess.run([
'ffmpeg', '-y',
'-f', 'concat',
'-safe', '0',
'-i', concat_file,
'-c', 'copy',
output_path
], check=True)
os.remove(concat_file)
else:
speed_ratio = original_duration / target_duration
if abs(speed_ratio - 1.0) <= 0.1:
# Copy video if speed change is minimal
subprocess.run([
'ffmpeg', '-y',
'-i', video_path,
'-c', 'copy',
output_path
], check=True)
else:
# Speed up video within limits
speed = min(speed_ratio, max_speedup)
self.adjust_video_speed(video_path, speed, output_path)
return os.path.abspath(output_path)
def process_audio(self, audio_tensor, sample_rate, target_duration, original_duration,
max_speedup, max_slowdown):
"""Process audio to match video duration."""
# Ensure audio tensor has correct dimensions
if audio_tensor.dim() == 2:
audio_tensor = audio_tensor.unsqueeze(0)
elif audio_tensor.dim() == 1:
audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0)
current_duration = audio_tensor.shape[-1] / sample_rate
# Calculate synchronized video duration
if target_duration > original_duration:
speed_ratio = original_duration / target_duration
if speed_ratio >= max_slowdown:
sync_duration = target_duration
else:
sync_duration = math.ceil(target_duration / original_duration) * original_duration
else:
speed_ratio = original_duration / target_duration
if abs(speed_ratio - 1.0) <= 0.1:
sync_duration = original_duration
else:
speed = min(speed_ratio, max_speedup)
sync_duration = original_duration / speed
# Adjust audio length
if current_duration < sync_duration:
silence_samples = int((sync_duration - current_duration) * sample_rate)
            silence = torch.zeros(audio_tensor.shape[0], audio_tensor.shape[1], silence_samples,
                                  dtype=audio_tensor.dtype, device=audio_tensor.device)
processed_audio = torch.cat([audio_tensor, silence], dim=-1)
else:
required_samples = int(sync_duration * sample_rate)
processed_audio = audio_tensor[..., :required_samples]
return processed_audio, sync_duration
def save_audio(self, audio_tensor, sample_rate, target_duration, original_duration,
max_speedup, max_slowdown):
"""Save processed audio to file and return consistent AUDIO format."""
timestamp = self.generate_timestamp()
output_path = os.path.join(self.sync_audio_dir, f"sync_audio_{timestamp}.wav")
processed_audio, sync_duration = self.process_audio(
audio_tensor, sample_rate, target_duration, original_duration,
max_speedup, max_slowdown
)
# Save with proper format
torchaudio.save(output_path, processed_audio.squeeze(0), sample_rate)
        # Return consistent AUDIO format along with the path it was saved to
        return {
            'waveform': processed_audio,
            'sample_rate': sample_rate
        }, output_path
def load_audio_from_path(self, audio_path):
"""Load audio from file path and format it consistently with AUDIO input."""
waveform, sample_rate = torchaudio.load(audio_path)
# Ensure waveform has 3 dimensions (batch, channels, samples) like AUDIO input
if waveform.dim() == 2:
waveform = waveform.unsqueeze(0) # Add batch dimension
        # torchaudio.load normally returns float32 in [-1, 1]; convert and
        # rescale as a fallback if integer-range samples come through
        if waveform.dtype != torch.float32:
            waveform = waveform.float()
        if waveform.abs().max() > 1.0:
            waveform = waveform / 32768.0  # rescale from 16-bit integer range
return {'waveform': waveform, 'sample_rate': sample_rate}
def extract_frames(self, video_path):
"""Extract all frames of the video as a tensor."""
        # Use a timestamped directory so concurrent runs don't collide
        temp_dir = os.path.join(self.temp_dir, f"extract_{self.generate_timestamp()}")
        os.makedirs(temp_dir, exist_ok=True)
        # Extract frames using ffmpeg (-y overwrites any stale frames)
        subprocess.run([
            'ffmpeg', '-y', '-i', video_path,
            os.path.join(temp_dir, 'frame_%05d.png')
        ], check=True)
# Load frames and convert to tensor
frames = []
        frame_files = sorted(f for f in os.listdir(temp_dir) if f.endswith('.png'))
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Lambda(lambda x: x * 255) # Scale to 0-255 range
])
for frame_file in frame_files:
image = Image.open(os.path.join(temp_dir, frame_file))
frame_tensor = transform(image)
frames.append(frame_tensor)
# Stack frames into a single tensor
frames_tensor = torch.stack(frames)
# Ensure the tensor is in the correct format (B, C, H, W)
if frames_tensor.dim() == 3:
frames_tensor = frames_tensor.unsqueeze(0)
# Convert to uint8
frames_tensor = frames_tensor.byte()
# Clean up temporary directory
for frame_file in frame_files:
os.remove(os.path.join(temp_dir, frame_file))
os.rmdir(temp_dir)
return frames_tensor
def sync_audio_video(self, max_speedup=1.5, max_slowdown=0.5,
AUDIO=None, audio_path="", audio_duration=None,
video_path="", IMAGES=None, output_fps=30.0):
"""Main function to synchronize audio and video."""
self.validate_speed_limits(max_speedup, max_slowdown)
# Handle audio input
if AUDIO is None and not audio_path:
raise ValueError("Either AUDIO or audio_path must be provided")
if audio_path:
AUDIO = self.load_audio_from_path(audio_path)
self.validate_audio_input(AUDIO)
# Calculate audio duration if not provided
if audio_duration is None or audio_duration == 0.0:
audio_duration = self.get_audio_duration(AUDIO)
# logging.info(f"Audio duration: {audio_duration}")
# Process input source
if IMAGES is not None and len(IMAGES) > 0:
video_path = self.process_images_to_video(IMAGES, output_fps)
original_duration = len(IMAGES) / output_fps
video_fps = output_fps
original_frame_count = len(IMAGES)
elif video_path:
original_duration, video_fps, original_frame_count = self.get_video_info(video_path)
else:
raise ValueError("Either video_path or IMAGES must be provided")
# Create synchronized versions
sync_video_path = self.create_sync_video(
video_path, original_duration, audio_duration, max_speedup, max_slowdown
)
        # Process and save audio, getting a consistent AUDIO dict and its file path back
        sync_audio, sync_audio_path = self.save_audio(
            AUDIO['waveform'], AUDIO['sample_rate'], audio_duration,
            original_duration, max_speedup, max_slowdown
        )
# Get final properties
sync_video_duration, _, sync_frame_count = self.get_video_info(sync_video_path)
sync_audio_duration = sync_audio['waveform'].shape[-1] / sync_audio['sample_rate']
video_frames = self.extract_frames(sync_video_path)
# Convert video_frames to the format expected by ComfyUI
video_frames = video_frames.float() / 255.0
video_frames = video_frames.permute(0, 2, 3, 1)
return (
video_frames,
sync_audio, # Now returns consistent AUDIO format
sync_audio_path,
sync_video_path,
original_duration,
sync_video_duration,
audio_duration,
sync_audio_duration,
sync_frame_count
)
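
# --- Optional registration / local smoke test (illustrative sketch) ---
# ComfyUI discovers custom nodes through a NODE_CLASS_MAPPINGS dict; in many
# repos this mapping lives in the package __init__.py rather than here, and
# the key "Bjornulf_AudioVideoSync" below is a hypothetical name, not taken
# from the source.
NODE_CLASS_MAPPINGS = {
    "Bjornulf_AudioVideoSync": AudioVideoSync,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "Bjornulf_AudioVideoSync": "Audio/Video Sync",
}

if __name__ == "__main__":
    # Hypothetical standalone run outside ComfyUI; the paths are example
    # placeholders, and ffmpeg/ffprobe must be on PATH.
    node = AudioVideoSync()
    outputs = node.sync_audio_video(
        max_speedup=1.5,
        max_slowdown=0.5,
        audio_path="input/voice.wav",  # hypothetical example file
        video_path="input/clip.mp4",   # hypothetical example file
    )
    print("sync video:", outputs[3])
    print("sync audio:", outputs[2])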