
Commit f182502

fix: merge conflicts
2 parents: 8613ea5 + 8a6b528

20 files changed: +926 -288 lines

README.md (+28-7)

````diff
@@ -8,6 +8,7 @@ This repo also includes a WebRTC server and UI that uses comfystream to support
 - [Install package](#install-package)
 - [Custom Nodes](#custom-nodes)
 - [Usage](#usage)
+- [Run tests](#run-tests)
 - [Run server](#run-server)
 - [Run UI](#run-ui)
 - [Limitations](#limitations)
@@ -48,24 +49,38 @@ pip install git+https://github.com/yondonfu/comfystream.git
 
 ## Custom Nodes
 
-**tensor_utils**
+comfystream uses a few custom nodes to support running workflows.
 
-Copy the `tensor_utils` nodes into the `custom_nodes` folder of your ComfyUI workspace:
+Copy the custom nodes into the `custom_nodes` folder of your ComfyUI workspace:
 
 ```
-cp -r nodes/tensor_utils custom_nodes
+cp -r nodes/* custom_nodes/
 ```
 
-For example, if you ComfyUI workspace is under `/home/user/ComfyUI`:
+For example, if your ComfyUI workspace is under `/home/user/ComfyUI`:
 
 ```
-cp -r nodes/tensor_utils /home/user/ComfyUI/custom_nodes
+cp -r nodes/* /home/user/ComfyUI/custom_nodes
 ```
 
 ## Usage
 
 See `example.py`.
 
+# Run tests
+
+Install dev dependencies:
+
+```
+pip install .[dev]
+```
+
+Run tests:
+
+```
+pytest
+```
+
 # Run server
 
 Install dependencies:
@@ -144,9 +159,15 @@ The Stream URL is the URL of the [server](#run-server) which defaults to http://
 
 At the moment, a workflow must fulfill the following requirements:
 
-- Single input using the LoadImage node
+- The workflow must have a single primary input node that will receive individual video frames
+- The primary input node is designated by one of the following:
+  - A single [PrimaryInputLoadImage](./nodes/video_stream_utils/primary_input_load_image.py) node (see [this workflow](./workflows/liveportait.json) for example usage)
+    - This node can be used as a drop-in replacement for a LoadImage node
+    - In this scenario, any number of additional LoadImage nodes can be used
+  - A single LoadImage node
+    - In this scenario, the workflow can only contain the single LoadImage node
   - At runtime, this node is replaced with a LoadTensor node
-- Single output using a PreviewImage or SaveImage node
+- The workflow must have a single output using a PreviewImage or SaveImage node
   - At runtime, this node is replaced with a SaveTensor node
 
 # Troubleshoot
````
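For reference, the simplest prompt satisfying these rules pairs one LoadImage input with one PreviewImage output. A minimal sketch in ComfyUI prompt-dict form; the node ids, link, and image name are illustrative only:

```python
# Minimal workflow meeting the single-input/single-output rules above.
# At runtime convert_prompt swaps LoadImage -> LoadTensor and
# PreviewImage -> SaveTensor (see src/comfystream/utils.py below).
workflow = {
    "1": {
        "inputs": {"image": "example.png"},
        "class_type": "LoadImage",
        "_meta": {"title": "Load Image"},
    },
    "2": {
        "inputs": {"images": ["1", 0]},
        "class_type": "PreviewImage",
        "_meta": {"title": "Preview Image"},
    },
}
```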

audio_example.py (+1-1)

```diff
@@ -13,7 +13,7 @@ async def main():
 
     client.set_prompt(prompt)
 
-    waveform, _ = torchaudio.load("harvard.wav")
+    waveform, _ = torchaudio.load("/home/user/harvard.wav")
     if waveform.ndim > 1:
         audio_tensor = waveform.mean(dim=0)
 
```

nodes/audio_utils/apply_whisper.py (+8-7)

```diff
@@ -11,6 +11,7 @@ def INPUT_TYPES(s):
             }
         }
 
+    CATEGORY = "audio_utils"
     RETURN_TYPES = ("DICT",)
     FUNCTION = "apply_whisper"
 
@@ -33,23 +34,23 @@ def apply_whisper(self, audio, model):
         concatenated_audio = torch.cat(self.audio_buffer, dim=0).cuda()
         self.audio_buffer = []
         result = self.model.transcribe(concatenated_audio.float(), fp16=True, word_timestamps=True)
-        segments = result['segments']
+        segments = result["segments"]
         segments_alignment = []
         words_alignment = []
 
         for segment in segments:
             segment_dict = {
-                'value': segment['text'].strip(),
-                'start': segment['start'],
-                'end': segment['end']
+                "value": segment["text"].strip(),
+                "start": segment["start"],
+                "end": segment["end"]
             }
             segments_alignment.append(segment_dict)
 
             for word in segment["words"]:
                 word_dict = {
-                    'value': word["word"].strip(),
-                    'start': word["start"],
-                    'end': word['end']
+                    "value": word["word"].strip(),
+                    "start": word["start"],
+                    "end": word["end"]
                 }
                 words_alignment.append(word_dict)
 
```
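The loop above flattens Whisper's transcription result into two parallel alignment lists. A sketch of the structures it produces; the text and timestamps are invented for illustration:

```python
# Shape of the alignments built by apply_whisper (timestamps in seconds;
# values invented, not produced by this commit).
segments_alignment = [
    {"value": "The stale smell of old beer lingers.", "start": 0.0, "end": 2.4},
]
words_alignment = [
    {"value": "The", "start": 0.0, "end": 0.18},
    {"value": "stale", "start": 0.18, "end": 0.55},
    # ... one entry per word, with word-level timestamps
]
```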

nodes/audio_utils/load_audio_tensor.py (+1-1)

```diff
@@ -1,7 +1,7 @@
 from comfystream import tensor_cache
 
 class LoadAudioTensor:
-    CATEGORY = "tensor_utils"
+    CATEGORY = "audio_utils"
     RETURN_TYPES = ("AUDIO",)
     FUNCTION = "execute"
 
```

nodes/audio_utils/save_asr_response.py (+1-2)

```diff
@@ -1,10 +1,9 @@
 from comfystream import tensor_cache
 
 class SaveASRResponse:
-    CATEGORY = "tensor_utils"
+    CATEGORY = "audio_utils"
     RETURN_TYPES = ()
     FUNCTION = "execute"
-    OUTPUT_NODE = True
 
     @classmethod
     def INPUT_TYPES(s):
```

nodes/video_stream_utils/__init__.py (+5)

```diff
@@ -0,0 +1,5 @@
+from .primary_input_load_image import PrimaryInputLoadImage
+
+NODE_CLASS_MAPPINGS = {"PrimaryInputLoadImage": PrimaryInputLoadImage}
+
+__all__ = ["NODE_CLASS_MAPPINGS"]
```
nodes/video_stream_utils/primary_input_load_image.py (+5)

```diff
@@ -0,0 +1,5 @@
+import nodes
+
+
+class PrimaryInputLoadImage(nodes.LoadImage):
+    pass
```
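The subclass body is intentionally empty: PrimaryInputLoadImage behaves exactly like ComfyUI's LoadImage and exists only to give the primary video-frame input a distinct `class_type` that `convert_prompt` can recognize. In an exported prompt it would appear roughly as follows (node id and image name are hypothetical):

```python
# Hypothetical prompt fragment using the marker node. It renders and
# behaves like LoadImage in the editor, but convert_prompt swaps it
# for a LoadTensor node at runtime.
node = {
    "inputs": {"image": "placeholder.png"},
    "class_type": "PrimaryInputLoadImage",
    "_meta": {"title": "Primary Input Load Image"},
}
```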

server/app.py (+23-2)

```diff
@@ -16,11 +16,15 @@
     MediaStreamTrack,
 )
 from aiortc.rtcrtpsender import RTCRtpSender
+from aiortc.codecs import h264
 from pipeline import Pipeline
 from utils import patch_loop_datagram
 
 logger = logging.getLogger(__name__)
 
+MAX_BITRATE = 2000000
+MIN_BITRATE = 2000000
+
 
 class VideoStreamTrack(MediaStreamTrack):
     kind = "video"
@@ -165,12 +169,12 @@ def get_ice_servers():
 
 
 async def offer(request):
+    pipeline = request.app["pipeline"]
     pcs = request.app["pcs"]
-    workspace = request.app["workspace"]
 
     params = await request.json()
 
-    pipeline = Pipeline(params["prompt"], cwd=workspace)
+    pipeline.set_prompt(params["prompt"])
     await pipeline.warm()
 
     offer_params = params["offer"]
@@ -194,6 +198,10 @@ async def offer(request):
     prefs = list(filter(lambda x: x.name == "H264", caps.codecs))
     transceiver.setCodecPreferences(prefs)
 
+    # Monkey patch max and min bitrate to ensure constant bitrate
+    h264.MAX_BITRATE = MAX_BITRATE
+    h264.MIN_BITRATE = MIN_BITRATE
+
     @pc.on("track")
     def on_track(track):
         logger.info(f"Track received: {track.kind}")
@@ -236,6 +244,15 @@ async def on_connectionstatechange():
     )
 
 
+async def set_prompt(request):
+    pipeline = request.app["pipeline"]
+
+    prompt = await request.json()
+    pipeline.set_prompt(prompt)
+
+    return web.Response(content_type="application/json", text="OK")
+
+
 def health(_):
     return web.Response(content_type="application/json", text="OK")
 
@@ -244,6 +261,9 @@ async def on_startup(app: web.Application):
     if app["media_ports"]:
         patch_loop_datagram(app["media_ports"])
 
+    app["pipeline"] = Pipeline(
+        cwd=app["workspace"], disable_cuda_malloc=True, gpu_only=True
+    )
    app["pcs"] = set()
 
 
@@ -282,6 +302,7 @@ async def on_shutdown(app: web.Application):
     app.on_shutdown.append(on_shutdown)
 
     app.router.add_post("/offer", offer)
+    app.router.add_post("/prompt", set_prompt)
     app.router.add_get("/", health)
 
     web.run_app(app, host=args.host, port=int(args.port))
```
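Since the pipeline is now created once at startup and shared through `app["pipeline"]`, the new `/prompt` route lets a client swap workflows on a live server without renegotiating WebRTC. A sketch of calling it with the standard library, assuming the server listens on `http://localhost:8888` (host and port are whatever `--host`/`--port` were set to):

```python
import json
import urllib.request

# A placeholder workflow prompt; a real payload is an exported ComfyUI
# workflow dict that satisfies the input/output rules in the README.
prompt = {
    "1": {"inputs": {"image": "example.png"}, "class_type": "LoadImage", "_meta": {"title": "Load Image"}},
    "2": {"inputs": {"images": ["1", 0]}, "class_type": "PreviewImage", "_meta": {"title": "Preview Image"}},
}

# The /prompt route added in this commit reads the prompt from the JSON body.
req = urllib.request.Request(
    "http://localhost:8888/prompt",  # assumed host/port
    data=json.dumps(prompt).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode())  # the route returns "OK"
```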

server/pipeline.py (+9-3)

```diff
@@ -5,15 +5,21 @@
 from typing import Any, Dict
 from comfystream.client import ComfyStreamClient
 
+WARMUP_RUNS = 5
+
 
 class Pipeline:
-    def __init__(self, prompt: Dict[Any, Any], **kwargs):
+    def __init__(self, **kwargs):
         self.client = ComfyStreamClient(**kwargs)
-        self.client.set_prompt(prompt)
 
     async def warm(self):
         frame = torch.randn(1, 512, 512, 3)
-        await self.predict(frame)
+
+        for _ in range(WARMUP_RUNS):
+            await self.predict(frame)
+
+    def set_prompt(self, prompt: Dict[Any, Any]):
+        self.client.set_prompt(prompt)
 
     def preprocess(self, frame: av.VideoFrame) -> torch.Tensor:
         frame_np = frame.to_ndarray(format="rgb24").astype(np.float32) / 255.0
```
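With the prompt removed from the constructor, a Pipeline now outlives any single workflow: it is constructed once (as `on_startup` in server/app.py now does) and re-targeted via `set_prompt`. A sketch of the lifecycle; the workspace path is a placeholder and the prompt is elided:

```python
import asyncio

from pipeline import Pipeline


async def main():
    # Kwargs are forwarded to ComfyStreamClient, mirroring on_startup in server/app.py.
    pipeline = Pipeline(cwd="/home/user/ComfyUI", disable_cuda_malloc=True, gpu_only=True)

    prompt = {}  # an exported ComfyUI workflow dict, elided here
    pipeline.set_prompt(prompt)
    await pipeline.warm()  # pushes WARMUP_RUNS random frames through predict()


asyncio.run(main())
```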

setup.py (+1)

```diff
@@ -19,5 +19,6 @@
         "opentelemetry-semantic-conventions==0.48b0",
         "comfyui @ git+https://github.com/hiddenswitch/ComfyUI.git@89d07f3adf32a6703181343bc732bd85104bb653",
     ],
+    extras_require={"dev": ["pytest"]},
     url="https://github.com/yondonfu/comfystream",
 )
```

src/comfystream/utils.py (+51-19)

```diff
@@ -1,53 +1,85 @@
 import copy
 
+from typing import Dict, Any
 from comfy.api.components.schema.prompt import Prompt, PromptDictInput
 
 
+def create_load_tensor_node():
+    return {
+        "inputs": {},
+        "class_type": "LoadTensor",
+        "_meta": {"title": "LoadTensor"},
+    }
+
+
+def create_save_tensor_node(inputs: Dict[Any, Any]):
+    return {
+        "inputs": inputs,
+        "class_type": "SaveTensor",
+        "_meta": {"title": "SaveTensor"},
+    }
+
+
 def convert_prompt(prompt: PromptDictInput) -> Prompt:
     # Validate the schema
     Prompt.validate(prompt)
 
     prompt = copy.deepcopy(prompt)
 
+    num_primary_inputs = 0
     num_inputs = 0
     num_outputs = 0
 
+    keys = {
+        "PrimaryInputLoadImage": [],
+        "LoadImage": [],
+        "PreviewImage": [],
+        "SaveImage": [],
+    }
     for key, node in prompt.items():
-        if node.get("class_type") == "LoadImage":
-            num_inputs += 1
+        class_type = node.get("class_type")
 
-            prompt[key] = {
-                "inputs": {},
-                "class_type": "LoadTensor",
-                "_meta": {"title": "LoadTensor"},
-            }
-        elif node.get("class_type") in ["PreviewImage", "SaveImage"]:
-            num_outputs += 1
+        # Collect keys for nodes that might need to be replaced
+        if class_type in keys:
+            keys[class_type].append(key)
 
-            prompt[key] = {
-                "inputs": node["inputs"],
-                "class_type": "SaveTensor",
-                "_meta": {"title": "SaveTensor"},
-            }
-        elif node.get("class_type") in ["LoadTensor", "LoadAudioTensor"]:
+        # Count inputs and outputs
+        if class_type == "PrimaryInputLoadImage":
+            num_primary_inputs += 1
+        elif class_type in ["LoadImage", "LoadTensor", "LoadAudioTensor"]:
             num_inputs += 1
-        elif node.get("class_type") in ["SaveTensor", "SaveASRResponse"]:
+        elif class_type in ["PreviewImage", "SaveImage", "SaveTensor", "SaveASRResponse"]:
             num_outputs += 1
 
-    # Only handle single input for now
-    if num_inputs > 1:
+    # Only handle single primary input
+    if num_primary_inputs > 1:
+        raise Exception("too many primary inputs in prompt")
+
+    # If there are no primary inputs, only handle single input
+    if num_primary_inputs == 0 and num_inputs > 1:
         raise Exception("too many inputs in prompt")
 
     # Only handle single output for now
     if num_outputs > 1:
         raise Exception("too many outputs in prompt")
 
-    if num_inputs == 0:
+    if num_primary_inputs + num_inputs == 0:
         raise Exception("missing input")
 
     if num_outputs == 0:
         raise Exception("missing output")
 
+    # Replace nodes
+    for key in keys["PrimaryInputLoadImage"]:
+        prompt[key] = create_load_tensor_node()
+
+    if num_primary_inputs == 0 and len(keys["LoadImage"]) == 1:
+        prompt[keys["LoadImage"][0]] = create_load_tensor_node()
+
+    for key in keys["PreviewImage"] + keys["SaveImage"]:
+        node = prompt[key]
+        prompt[key] = create_save_tensor_node(node["inputs"])
+
     # Validate the processed prompt input
     prompt = Prompt.validate(prompt)
 
```
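The refactor separates collection and counting from replacement, which is what enables the new PrimaryInputLoadImage path: when a primary input is present, additional LoadImage nodes are counted but deliberately left untouched. A sketch of the resulting behavior; node ids, inputs, and titles are invented, and a real prompt must also pass `Prompt.validate`:

```python
from comfystream.utils import convert_prompt

# Hypothetical three-node prompt: a primary input, a static image, and a preview.
prompt = {
    "1": {"inputs": {"image": "frame.png"}, "class_type": "PrimaryInputLoadImage", "_meta": {"title": "Primary"}},
    "2": {"inputs": {"image": "mask.png"}, "class_type": "LoadImage", "_meta": {"title": "Static image"}},
    "3": {"inputs": {"images": ["1", 0]}, "class_type": "PreviewImage", "_meta": {"title": "Preview"}},
}

converted = convert_prompt(prompt)
# converted["1"]["class_type"] == "LoadTensor"    (marker replaced)
# converted["2"]["class_type"] == "LoadImage"     (left in place: a primary input exists)
# converted["3"]["class_type"] == "SaveTensor"    (output replaced, inputs preserved)
```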
