Skip to content

Commit 0e4d8af

Browse files
committed
fix: audio frame skipping
1 parent 44df170 commit 0e4d8af

File tree

4 files changed

+26
-24
lines changed

4 files changed

+26
-24
lines changed

nodes/audio_utils/load_audio_tensor.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -26,21 +26,21 @@ def IS_CHANGED():
2626

2727
def execute(self, buffer_size):
2828
if self.sample_rate is None or self.buffer_samples is None:
29-
first_audio, sr = tensor_cache.audio_inputs.get(block=True)
30-
self.sample_rate = sr
31-
self.buffer_samples = int(sr * buffer_size / 1000)
32-
self.leftover = first_audio
29+
frame = tensor_cache.audio_inputs.get(block=True)
30+
self.sample_rate = frame.sample_rate
31+
self.buffer_samples = int(self.sample_rate * buffer_size / 1000)
32+
self.leftover = frame.side_data.input
3333

3434
if self.leftover.shape[0] < self.buffer_samples:
3535
chunks = [self.leftover] if self.leftover.size > 0 else []
3636
total_samples = self.leftover.shape[0]
3737

3838
while total_samples < self.buffer_samples:
39-
audio, sr = tensor_cache.audio_inputs.get(block=True)
40-
if sr != self.sample_rate:
39+
frame = tensor_cache.audio_inputs.get(block=True)
40+
if frame.sample_rate != self.sample_rate:
4141
raise ValueError("Sample rate mismatch")
42-
chunks.append(audio)
43-
total_samples += audio.shape[0]
42+
chunks.append(frame.side_data.input)
43+
total_samples += frame.side_data.input.shape[0]
4444

4545
merged_audio = np.concatenate(chunks, dtype=np.int16)
4646
buffered_audio = merged_audio[:self.buffer_samples]

nodes/tensor_utils/load_tensor.py

-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import time
21
from comfystream import tensor_cache
32

43

server/pipeline.py

+16-13
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,12 @@ async def warm_video(self):
2727
await self.client.get_video_output()
2828

2929
async def warm_audio(self):
30-
dummy_audio_inp = np.random.randint(-32768, 32767, int(48000 * 0.5), dtype=np.int16) # TODO: adds a lot of delay if it doesn't match the buffer size, is warmup needed?
30+
dummy_frame = av.AudioFrame()
31+
dummy_frame.side_data.input = np.random.randint(-32768, 32767, int(48000 * 0.5), dtype=np.int16) # TODO: adds a lot of delay if it doesn't match the buffer size, is warmup needed?
32+
dummy_frame.sample_rate = 48000
3133

3234
for _ in range(WARMUP_RUNS):
33-
self.client.put_audio_input((dummy_audio_inp, 48000))
35+
self.client.put_audio_input(dummy_frame)
3436
await self.client.get_audio_output()
3537

3638
async def set_prompts(self, prompts: Union[Dict[Any, Any], List[Dict[Any, Any]]]):
@@ -52,9 +54,10 @@ async def put_video_frame(self, frame: av.VideoFrame):
5254
await self.video_incoming_frames.put(frame)
5355

5456
async def put_audio_frame(self, frame: av.AudioFrame):
55-
inp_np = self.audio_preprocess(frame)
56-
self.client.put_audio_input((inp_np, frame.sample_rate))
57-
await self.audio_incoming_frames.put((frame.pts, frame.time_base, frame.samples, frame.sample_rate))
57+
frame.side_data.input = self.audio_preprocess(frame)
58+
frame.side_data.skipped = True
59+
self.client.put_audio_input(frame)
60+
await self.audio_incoming_frames.put(frame)
5861

5962
def video_preprocess(self, frame: av.VideoFrame) -> Union[torch.Tensor, np.ndarray]:
6063
frame_np = frame.to_ndarray(format="rgb24").astype(np.float32) / 255.0
@@ -85,18 +88,18 @@ async def get_processed_video_frame(self):
8588
return processed_frame
8689

8790
async def get_processed_audio_frame(self):
88-
# TODO: make it generic to support purely generative audio cases
89-
pts, time_base, samples, sample_rate = await self.audio_incoming_frames.get()
90-
if samples > len(self.processed_audio_buffer):
91+
# TODO: make it generic to support purely generative audio cases and also add frame skipping
92+
frame = await self.audio_incoming_frames.get()
93+
if frame.samples > len(self.processed_audio_buffer):
9194
out_tensor = await self.client.get_audio_output()
9295
self.processed_audio_buffer = np.concatenate([self.processed_audio_buffer, out_tensor])
93-
out_data = self.processed_audio_buffer[:samples]
94-
self.processed_audio_buffer = self.processed_audio_buffer[samples:]
96+
out_data = self.processed_audio_buffer[:frame.samples]
97+
self.processed_audio_buffer = self.processed_audio_buffer[frame.samples:]
9598

9699
processed_frame = self.audio_postprocess(out_data)
97-
processed_frame.pts = pts
98-
processed_frame.time_base = time_base
99-
processed_frame.sample_rate = sample_rate
100+
processed_frame.pts = frame.pts
101+
processed_frame.time_base = frame.time_base
102+
processed_frame.sample_rate = frame.sample_rate
100103

101104
return processed_frame
102105

src/comfystream/client.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ def put_video_input(self, frame):
4848
tensor_cache.image_inputs.get(block=True)
4949
tensor_cache.image_inputs.put(frame)
5050

51-
def put_audio_input(self, inp_tensor):
52-
tensor_cache.audio_inputs.put(inp_tensor)
51+
def put_audio_input(self, frame):
52+
tensor_cache.audio_inputs.put(frame)
5353

5454
async def get_video_output(self):
5555
return await tensor_cache.image_outputs.get()

0 commit comments

Comments (0)