 from typing import Any, Dict, Union, List
 from comfystream.client import ComfyStreamClient

-WARMUP_RUNS = 10
+WARMUP_RUNS = 5


 class Pipeline:
     def __init__(self, **kwargs):
         self.client = ComfyStreamClient(**kwargs, max_workers=5)  # TODO: hardcoded max workers, should it be configurable?

-        self.video_futures = asyncio.Queue()
-        self.audio_futures = asyncio.Queue()
+        self.video_incoming_frames = asyncio.Queue()
+        self.audio_incoming_frames = asyncio.Queue()
+
+        self.processed_audio_buffer = np.array([], dtype=np.int16)

     async def warm_video(self):
         dummy_video_inp = torch.randn(1, 512, 512, 3)

         for _ in range(WARMUP_RUNS):
-            image_out_fut = self.client.put_video_input(dummy_video_inp)
-            await image_out_fut
+            self.client.put_video_input(dummy_video_inp)
+            await self.client.get_video_output()

     async def warm_audio(self):
-        dummy_audio_inp = np.random.randint(-32768, 32767, 48 * 20, dtype=np.int16)  # TODO: might affect the workflow, due to buffering
+        dummy_audio_inp = np.random.randint(-32768, 32767, int(48000 * 0.5), dtype=np.int16)  # TODO: adds a lot of delay if it doesn't match the buffer size, is warmup needed?

-        futs = []
         for _ in range(WARMUP_RUNS):
-            audio_out_fut = self.client.put_audio_input(dummy_audio_inp)
-            futs.append(audio_out_fut)
-
-        await asyncio.gather(*futs)
+            self.client.put_audio_input((dummy_audio_inp, 48000))
+            await self.client.get_audio_output()

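For scale on the warmup change: the old dummy buffer was one 20 ms audio frame, while the new one is half a second of samples at 48 kHz, which is why the TODO worries about added delay when the size does not match the workflow's buffer size:

```python
old = 48 * 20           # 960 samples: one 20 ms frame at 48 kHz (48 samples per ms)
new = int(48000 * 0.5)  # 24000 samples: 0.5 s of audio at 48 kHz
```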
     async def set_prompts(self, prompts: Union[Dict[Any, Any], List[Dict[Any, Any]]]):
-        if isinstance(prompts, dict):
-            await self.client.set_prompts([prompts])
-        else:
+        if isinstance(prompts, list):
             await self.client.set_prompts(prompts)
+        else:
+            await self.client.set_prompts([prompts])

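The inverted `isinstance` check is equivalent for the two annotated types: lists pass through, and a single prompt dict gets wrapped. A minimal usage sketch; `single_prompt` is a hypothetical ComfyUI-style prompt graph, not something defined in this diff:

```python
# Inside an async context, both forms are accepted:
single_prompt = {"1": {"class_type": "LoadImage", "inputs": {"image": "example.png"}}}

await pipeline.set_prompts(single_prompt)    # dict: wrapped as [single_prompt]
await pipeline.set_prompts([single_prompt])  # list: forwarded unchanged
```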
     async def put_video_frame(self, frame: av.VideoFrame):
         inp_tensor = self.video_preprocess(frame)
-        out_future = self.client.put_video_input(inp_tensor)
-        await self.video_futures.put((out_future, frame.pts, frame.time_base))
+        self.client.put_video_input(inp_tensor)
+        await self.video_incoming_frames.put((frame.pts, frame.time_base))

     async def put_audio_frame(self, frame: av.AudioFrame):
-        inp_tensor = self.audio_preprocess(frame)
-        out_future = self.client.put_audio_input(inp_tensor)
-        await self.audio_futures.put((out_future, frame.pts, frame.time_base, frame.sample_rate))
+        inp_np = self.audio_preprocess(frame)
+        self.client.put_audio_input((inp_np, frame.sample_rate))
+        await self.audio_incoming_frames.put((frame.pts, frame.time_base, frame.samples, frame.sample_rate))

-    def video_preprocess(self, frame: av.VideoFrame) -> torch.Tensor:
+    def video_preprocess(self, frame: av.VideoFrame) -> Union[torch.Tensor, np.ndarray]:
         frame_np = frame.to_ndarray(format="rgb24").astype(np.float32) / 255.0
         return torch.from_numpy(frame_np).unsqueeze(0)

-    def audio_preprocess(self, frame: av.AudioFrame) -> torch.Tensor:
+    def audio_preprocess(self, frame: av.AudioFrame) -> Union[torch.Tensor, np.ndarray]:
         return frame.to_ndarray().ravel().reshape(-1, 2).mean(axis=1).astype(np.int16)
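`audio_preprocess` assumes interleaved stereo input: `ravel()` flattens the frame to `[L0, R0, L1, R1, ...]`, `reshape(-1, 2)` pairs each left/right sample, and `mean(axis=1)` averages the pair into one mono sample. A standalone sketch of that downmix:

```python
import numpy as np

stereo = np.array([100, 200, -100, 300], dtype=np.int16)  # [L0, R0, L1, R1]
mono = stereo.reshape(-1, 2).mean(axis=1).astype(np.int16)
print(mono)  # [150 100]: each output sample is the average of an L/R pair
```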

-    def video_postprocess(self, output: torch.Tensor) -> av.VideoFrame:
+    def video_postprocess(self, output: Union[torch.Tensor, np.ndarray]) -> av.VideoFrame:
         return av.VideoFrame.from_ndarray(
             (output * 255.0).clamp(0, 255).to(dtype=torch.uint8).squeeze(0).cpu().numpy()
         )

-    def audio_postprocess(self, output: torch.Tensor) -> av.AudioFrame:
-        return av.AudioFrame.from_ndarray(output.reshape(1, -1), layout="mono")
+    def audio_postprocess(self, output: Union[torch.Tensor, np.ndarray]) -> av.AudioFrame:
+        return av.AudioFrame.from_ndarray(np.repeat(output, 2).reshape(1, -1))

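With the explicit `layout="mono"` dropped, `av.AudioFrame.from_ndarray` falls back to its defaults (packed `s16`, stereo layout in current PyAV), so the mono buffer has to be expanded to interleaved stereo first; that is what the `np.repeat` does. The shape math in isolation:

```python
import numpy as np

mono = np.array([150, 100], dtype=np.int16)
stereo = np.repeat(mono, 2)     # [150, 150, 100, 100]: each sample duplicated into an L/R pair
packed = stereo.reshape(1, -1)  # shape (1, 4): one packed plane, as from_ndarray expects for s16
```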
     async def get_processed_video_frame(self):
-        out_fut, pts, time_base = await self.video_futures.get()
-        frame = self.video_postprocess(await out_fut)
-        frame.pts = pts
-        frame.time_base = time_base
-        return frame
+        # TODO: make it generic to support purely generative video cases
+        pts, time_base = await self.video_incoming_frames.get()
+        out_tensor = await self.client.get_video_output()
+
+        processed_frame = self.video_postprocess(out_tensor)
+        processed_frame.pts = pts
+        processed_frame.time_base = time_base
+
+        return processed_frame

     async def get_processed_audio_frame(self):
-        out_fut, pts, time_base, sample_rate = await self.audio_futures.get()
-        frame = self.audio_postprocess(await out_fut)
-        frame.pts = pts
-        frame.time_base = time_base
-        frame.sample_rate = sample_rate
-        return frame
+        # TODO: make it generic to support purely generative audio cases
+        pts, time_base, samples, sample_rate = await self.audio_incoming_frames.get()
+        if samples > len(self.processed_audio_buffer):
+            out_tensor = await self.client.get_audio_output()
+            self.processed_audio_buffer = np.concatenate([self.processed_audio_buffer, out_tensor])
+        out_data = self.processed_audio_buffer[:samples]
+        self.processed_audio_buffer = self.processed_audio_buffer[samples:]
+
+        processed_frame = self.audio_postprocess(out_data)
+        processed_frame.pts = pts
+        processed_frame.time_base = time_base
+        processed_frame.sample_rate = sample_rate
+
+        return processed_frame

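The new `processed_audio_buffer` decouples the client's output chunk size from the outgoing frame size: each call slices exactly `samples` values off the front of the buffer and leaves the remainder for the next frame. A synchronous sketch of that FIFO slicing, with a plain list standing in for the client (all names illustrative); note it uses `while` where the diff uses `if`, which matters only if a single client output can be shorter than `samples`:

```python
import numpy as np

buffer = np.array([], dtype=np.int16)
client_outputs = [np.zeros(480, dtype=np.int16) for _ in range(4)]  # fake processed chunks

def next_frame(samples: int) -> np.ndarray:
    global buffer
    while samples > len(buffer) and client_outputs:  # refill until a full frame is available
        buffer = np.concatenate([buffer, client_outputs.pop(0)])
    out, buffer = buffer[:samples], buffer[samples:]
    return out

print([len(next_frame(320)) for _ in range(3)])  # [320, 320, 320]
```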
     async def get_nodes_info(self) -> Dict[str, Any]:
         """Get information about all nodes in the current prompt including metadata."""
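Taken together, the new flow is strictly put-then-get: producers enqueue preprocessed inputs plus timing metadata, and consumers pair the client's next output with that metadata. A hedged end-to-end sketch, assuming the `ComfyStreamClient` kwargs and prompt format are unchanged by this diff (`my_prompt` and the file name are placeholders):

```python
import asyncio
import av

async def main():
    my_prompt = {}  # placeholder: a ComfyUI-style prompt graph would go here
    pipeline = Pipeline()  # kwargs are forwarded to ComfyStreamClient
    await pipeline.set_prompts(my_prompt)
    await pipeline.warm_video()

    container = av.open("input.mp4")  # placeholder input
    for frame in container.decode(video=0):
        await pipeline.put_video_frame(frame)
        processed = await pipeline.get_processed_video_frame()
        # processed carries the original pts/time_base for downstream muxing

asyncio.run(main())
```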