Skip to content

Commit 8aa9baa

Browse files
authored
feat: add resource profiler script (#80)
* feat(dev): add profiler script for resource usage tracking This commit adds a lightweight profiler script for developers working on ComfyStream to monitor resource usage and compare it against previous runs.
1 parent c343599 commit 8aa9baa

File tree

3 files changed

+362
-9
lines changed

3 files changed

+362
-9
lines changed

scripts/README.md

+51-9
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ This directory contains helper scripts to simplify the deployment and management
44

55
- **Ansible Playbook (`ansible/plays/setup_comfystream.yml`)** – Deploys ComfyStream on any cloud provider.
66
- **`spinup_comfystream_tensordock.py`** – Fully automates VM creation and ComfyStream setup on a [TensorDock server](https://tensordock.com/).
7+
- **`monitor_pid_resources.py`** – Monitors and profiles the resource usage of a running ComfyStream server.
78

89
## Usage Instructions
910

@@ -84,29 +85,70 @@ The `spinup_comfystream_tensordock.py` script automates VM provisioning, setup,
8485
3. **View Available Script Options** *(Optional)*:
8586
To see all available options, run:
8687

87-
```bash
88-
python spinup_comfystream_tensordock.py --help
89-
```
88+
```bash
89+
python spinup_comfystream_tensordock.py --help
90+
```
9091

9192
4. **Run the Script**:
9293
Execute the following command to provision a VM and set up ComfyStream automatically:
9394

94-
```bash
95-
python spinup_comfystream_tensordock.py --api-key <API_KEY> --api-token <API_TOKEN>
96-
```
95+
```bash
96+
python spinup_comfystream_tensordock.py --api-key <API_KEY> --api-token <API_TOKEN>
97+
```
9798

9899
5. **Access the Server**:
99100
Once the setup is complete, the script will display the URLs to access ComfyStream.
100101

101102
6. **Stop & Delete the VM** *(When No Longer Needed)*:
102103
To stop and remove the instance, run:
103104

104-
```bash
105-
python spinup_comfystream_tensordock.py --delete <VM_ID>
106-
```
105+
```bash
106+
python spinup_comfystream_tensordock.py --delete <VM_ID>
107+
```
107108

108109
Replace `<VM_ID>` with the VM ID found in the script logs or the [TensorDock dashboard](https://dashboard.tensordock.com/instances).
109110

110111
> [!WARNING]
111112
> If you encounter `max retries exceeded with URL` errors, the VM might have been created but is inaccessible.
112113
> Check the [TensorDock dashboard](https://dashboard.tensordock.com/instances), delete the VM manually, wait **2-3 minutes**, then rerun the script.
114+
115+
### Profiling a Running ComfyStream Server
116+
117+
To monitor the resource consumption of a running ComfyStream server, use the `monitor_pid_resources.py` script:
118+
119+
1. **Start the ComfyStream server** and execute a streaming workflow.
120+
2. **Run the profiling script**:
121+
122+
```bash
123+
python monitor_pid_resources.py --name app.py
124+
```
125+
126+
The script will automatically try to find the process ID (PID) of the server. If you prefer to specify the PID manually, you can retrieve it using:
127+
128+
```bash
129+
pgrep -f app.py | xargs ps -o pid,cmd --pid
130+
```
131+
132+
Then run the profiling script with the retrieved PID:
133+
134+
```bash
135+
python monitor_pid_resources.py --pid <PID>
136+
```
137+
138+
3. **Running Inside a Container**: If you are running the script inside a container, use the `--host-pid` option to provide the host PID for accurate GPU monitoring:
139+
140+
```bash
141+
python monitor_pid_resources.py --name app.py --host-pid <HOST_PID>
142+
```
143+
144+
Find `<HOST_PID>` with `nvidia-smi` on the host.
145+
146+
The script will continuously track **CPU, memory, GPU, and VRAM usage** at the specified interval for the configured duration. If the `--spy` flag is used, it will also generate a **detailed Py-Spy profiler report** for deeper performance insights.
147+
148+
### Additional Options
149+
150+
For a complete list of available options, run:
151+
152+
```bash
153+
python monitor_pid_resources.py --help
154+
```

scripts/monitor_pid_resources.py

+308
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
"""A Python script to monitor system resources for a given PID and optionally create
2+
a py-spy profiler report."""
3+
4+
import psutil
5+
import pynvml
6+
import time
7+
import subprocess
8+
import click
9+
import threading
10+
import csv
11+
from pathlib import Path
12+
from typing import List
13+
14+
15+
def is_running_inside_container():
    """Detect whether the script is running inside a container.

    The ``/.dockerenv`` marker file is checked first. If it is absent, the cgroup
    list of PID 1 (``/proc/1/cgroup``) is scanned for "docker" or "kubepods"
    entries.

    Returns:
        True if one of the container markers was found, False otherwise. False is
        also returned when ``/proc/1/cgroup`` does not exist or cannot be read.
    """
    if Path("/.dockerenv").exists():
        return True
    try:
        with open("/proc/1/cgroup", "rt") as f:
            return any("docker" in line or "kubepods" in line for line in f)
    except OSError:
        # Covers a missing file (e.g. non-Linux hosts) as well as an unreadable one
        # (e.g. PermissionError on a restricted /proc); detection is best-effort
        # and must never abort the monitoring run.
        return False
24+
25+
26+
def get_all_processes(pid: int) -> List[psutil.Process]:
    """Collect a process together with all of its descendants.

    Args:
        pid: Parent process ID.

    Returns:
        A list starting with the parent process followed by every child found
        recursively, or an empty list when the process does not exist.
    """
    try:
        root = psutil.Process(pid)
        descendants = root.children(recursive=True)
    except psutil.NoSuchProcess:
        # The process is gone (or vanished while its children were being listed).
        return []
    return [root, *descendants]
41+
42+
43+
def total_cpu_percent(pids: List[psutil.Process]) -> float:
    """Sum the CPU usage (%) of a group of processes.

    Each process is sampled twice, 0.1 seconds apart: the first call only primes
    psutil's per-process counters, the second one yields the usage measured since
    that priming call.

    Args:
        pids: Process objects to measure.

    Returns:
        Combined CPU usage (%) of all processes that could be read; 0.0 when the
        list is empty.
    """
    if not pids:
        return 0.0

    unreadable = (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess)

    def _sample(process, interval):
        """Return one CPU reading, or 0.0 for a vanished/inaccessible process."""
        try:
            return process.cpu_percent(interval=interval)
        except unreadable:
            return 0.0

    # First pass: prime the counters, the returned values are meaningless.
    for process in pids:
        _sample(process, None)

    time.sleep(0.1)  # Give the counters time to accumulate.

    # Second pass: the readings now cover the short window since priming.
    return sum((_sample(process, 0.0) for process in pids), 0.0)
71+
72+
73+
def total_memory(pids: List[psutil.Process]) -> float:
    """Sum the resident memory (MB) of a group of processes.

    Args:
        pids: Process objects to measure.

    Returns:
        Combined resident set size in MB of all processes that could be read;
        0.0 when the list is empty.
    """
    if not pids:
        return 0.0

    def _rss_bytes(process):
        """Return the RSS of one process, or 0 if it cannot be read."""
        try:
            return process.memory_info().rss  # Physical memory (RAM) in bytes.
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return 0

    bytes_per_mb = 1024 * 1024
    return sum(map(_rss_bytes, pids)) / bytes_per_mb
93+
94+
95+
def total_gpu_usage(pids: List[int]) -> tuple:
    """Return total GPU usage (%) and VRAM usage (MB) for a list of process IDs.

    NVML reports utilization per device, not per process. A device's utilization
    is therefore counted once if at least one of the given PIDs runs a compute
    context on it, regardless of how many of the PIDs share that device. VRAM is
    summed per matching process.

    Args:
        pids: List of process IDs to monitor.

    Returns:
        Tuple containing the total GPU usage (%) and the total VRAM usage (MB) for
        the process IDs. Both values are 0 when none of the PIDs uses a GPU. If an
        NVML call fails (e.g. no GPU available), the totals gathered up to that
        point are returned.
    """
    monitored_pids = set(pids)
    total_usage = 0
    total_vram_usage = 0

    try:
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            device_in_use = False
            for proc_info in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                if proc_info.pid not in monitored_pids:
                    continue
                device_in_use = True
                # NVML yields None when the per-process memory is not available;
                # skip the value instead of failing the whole measurement.
                if proc_info.usedGpuMemory is not None:
                    total_vram_usage += proc_info.usedGpuMemory / (1024 * 1024)  # MB
            if device_in_use:
                # Device-wide figure: add it once per device, not once per process,
                # so several monitored PIDs on one GPU do not inflate the total.
                total_usage += pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
    except pynvml.NVMLError:
        pass  # Best effort: ignore NVML errors (e.g., no GPU available).
    return total_usage, total_vram_usage
120+
121+
122+
def find_pid_by_name(name: str) -> int:
    """Find the PID of the first process whose command line references ``name``.

    A process matches when one of its command-line arguments equals ``name`` or is
    a path whose final component equals ``name`` (so ``python server/app.py`` is
    found when searching for ``app.py``). The monitor's own process is skipped,
    because its command line contains ``name`` as the value of ``--name``.

    Args:
        name: Name of the process (script) to find.

    Returns:
        Process ID of the first matching process, or None if no process matched.
        The outcome is also reported on the console.
    """
    import os  # Local import: only needed to exclude this script's own process.

    own_pid = os.getpid()
    for proc in psutil.process_iter(["pid", "name", "cmdline"]):
        if proc.info["pid"] == own_pid:
            continue
        # psutil sets the attribute to None when the command line cannot be read
        # (e.g. access denied); treat that as "no arguments".
        cmdline = proc.info["cmdline"] or []
        if any(arg == name or Path(arg).name == name for arg in cmdline):
            found_pid = proc.info["pid"]
            click.echo(
                click.style(f"Found process '{name}' with PID {found_pid}.", fg="green")
            )
            return found_pid
    click.echo(click.style(f"Error: Process with name '{name}' not found.", fg="red"))
    return None
140+
141+
142+
@click.command()
@click.option(
    "--pid", type=str, default="auto", help='Process ID or "auto" to find by name'
)
@click.option(
    "--name", type=str, default="app.py", help="Process name (default: app.py)"
)
@click.option("--interval", type=int, default=2, help="Monitoring interval (seconds)")
@click.option(
    "--duration", type=int, default=30, help="Total monitoring duration (seconds)"
)
@click.option("--output", type=str, default=None, help="File to save logs (optional)")
@click.option("--spy", is_flag=True, help="Enable py-spy profiling")
@click.option(
    "--spy-output", type=str, default="pyspy_profile.svg", help="Py-Spy output file"
)
@click.option(
    "--host-pid",
    type=int,
    default=None,
    help="Host PID for GPU monitoring when running inside a container. Use 'pgrep -f app.py' to find the PID.",
)
def monitor_resources(
    pid: str,
    name: str,
    interval: int,
    duration: int,
    output: str,
    spy: bool,
    spy_output: str,
    host_pid: int,
):
    """Monitor system resources for a given PID and optionally create a py-spy profiler
    report.

    Args:
        pid (str): Process ID of the Python script, or "auto" to find it by name.
        name (str): Name of the Python script.
        interval (int): Monitoring interval in seconds.
        duration (int): Total monitoring duration in seconds.
        output (str): File to save logs (optional).
        spy (bool): Enable py-spy profiling.
        spy_output (str): Py-Spy output file.
        host_pid (int): Host PID for GPU monitoring (useful inside containers).
    """
    if pid == "auto":
        pid = find_pid_by_name(name)
        if pid is None:
            return
    else:
        try:
            pid = int(pid)
        except ValueError:
            # Report a usage error instead of an uncaught traceback.
            raise click.BadParameter(
                f'"{pid}" is not a valid process ID; pass an integer or "auto".',
                param_hint="--pid",
            ) from None

    if is_running_inside_container():
        if not host_pid:
            click.echo(
                click.style(
                    "Warning: Running inside a container. GPU monitoring may not work correctly "
                    "since `nvidia-smi` uses the host PID namespace. To fix this, provide the "
                    "host PID using the `--host-pid` flag.",
                    fg="yellow",
                )
            )
        else:
            click.echo(
                click.style(
                    f"Running inside a container. Monitoring GPU using host PID {host_pid}.",
                    fg="cyan",
                )
            )

    if not psutil.pid_exists(pid):
        click.echo(click.style(f"Error: Process with PID {pid} not found.", fg="red"))
        return

    click.echo(
        click.style(f"Monitoring PID {pid} for {duration} seconds...", fg="green")
    )

    def run_py_spy():
        """Run py-spy profiler for deep profiling."""
        click.echo(click.style("Running py-spy for deep profiling...", fg="green"))
        # Argument list without a shell: user-supplied values such as the output
        # path are passed verbatim and cannot be interpreted as shell syntax.
        spy_cmd = [
            "py-spy",
            "record",
            "-o",
            spy_output,
            "--pid",
            str(pid),
            "--duration",
            str(duration),
        ]
        try:
            subprocess.run(spy_cmd, check=True, capture_output=True, text=True)
        except FileNotFoundError:
            click.echo(
                click.style(
                    "Error running py-spy: executable not found. Is py-spy installed?",
                    fg="red",
                )
            )
        except subprocess.CalledProcessError as e:
            click.echo(click.style(f"Error running py-spy: {e.stderr}", fg="red"))
        else:
            click.echo(
                click.style(f"Py-Spy flame graph saved to {spy_output}", fg="green")
            )

    # Start py-spy profiling in a separate thread if enabled.
    if spy:
        spy_thread = threading.Thread(target=run_py_spy)
        spy_thread.start()

    # GPU monitoring is optional: keep tracking CPU/RAM when NVML is unavailable.
    gpu_available = True
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError as e:
        gpu_available = False
        click.echo(
            click.style(
                f"Warning: NVML could not be initialized ({e}). GPU and VRAM usage "
                "will be reported as 0.",
                fg="yellow",
            )
        )

    # Start main resources monitoring loop.
    monitor_start_time = time.time()
    end_time = monitor_start_time + duration
    logs = []
    cpu_usages, ram_usages, gpu_usages, vram_usages = [], [], [], []
    try:
        while time.time() < end_time:
            start_time = time.time()
            progress = ((start_time - monitor_start_time) / duration) * 100

            all_processes = get_all_processes(pid)
            if not all_processes:
                # get_all_processes() returns an empty list once the process is
                # gone; stop instead of logging zeros for the remaining duration.
                click.echo(click.style("Error: Process terminated!", fg="red"))
                break

            cpu_usage = total_cpu_percent(all_processes)
            memory_usage = total_memory(all_processes)
            if gpu_available:
                gpu_usage, vram_usage = total_gpu_usage(
                    [proc.pid for proc in all_processes] if not host_pid else [host_pid]
                )
            else:
                gpu_usage, vram_usage = 0, 0

            log_entry = {
                "CPU (%)": cpu_usage,
                "RAM (MB)": memory_usage,
                "GPU (%)": gpu_usage,
                "VRAM (MB)": vram_usage,
            }
            click.echo(
                f"[{progress:.1f}%] CPU: {cpu_usage:.2f}%, RAM: {memory_usage:.2f}MB, "
                f"GPU: {gpu_usage:.2f}%, VRAM: {vram_usage:.2f}MB"
            )
            logs.append(log_entry)
            cpu_usages.append(cpu_usage)
            ram_usages.append(memory_usage)
            gpu_usages.append(gpu_usage)
            vram_usages.append(vram_usage)

            # Sleep for the rest of the interval, but never past the end of the
            # requested monitoring duration.
            elapsed_time = time.time() - start_time
            remaining_time = end_time - time.time()
            time.sleep(max(0, min(interval - elapsed_time, remaining_time)))
    except psutil.NoSuchProcess:
        click.echo(click.style("Error: Process terminated!", fg="red"))
    finally:
        # Release NVML even if the loop is interrupted by an unexpected error.
        if gpu_available:
            pynvml.nvmlShutdown()

    # Calculate and log averages
    avg_cpu = sum(cpu_usages) / len(cpu_usages) if cpu_usages else 0
    avg_ram = sum(ram_usages) / len(ram_usages) if ram_usages else 0
    avg_gpu = sum(gpu_usages) / len(gpu_usages) if gpu_usages else 0
    avg_vram = sum(vram_usages) / len(vram_usages) if vram_usages else 0

    click.echo(
        f"AVERAGE - CPU: {avg_cpu:.2f}%, RAM: {avg_ram:.2f}MB, GPU: {avg_gpu:.2f}%, VRAM: {avg_vram:.2f}MB"
    )

    # Save logs if output file is provided.
    if output:
        with open(output, "w", newline="") as csvfile:
            fieldnames = ["CPU (%)", "RAM (MB)", "GPU (%)", "VRAM (MB)"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(logs)
        click.echo(click.style(f"Logs saved to {output}", fg="green"))

    # Wait for py-spy thread to finish if it was started.
    if spy:
        spy_thread.join()
305+
306+
307+
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    monitor_resources()

0 commit comments

Comments
 (0)