Skip to content

Commit beedaa0

Browse files
committed
fix(dev): track system resources for parent and child processes
Ensure CPU, memory, and GPU usage are correctly aggregated for the main process and its children, improving accuracy in resource monitoring.
1 parent e2049c7 commit beedaa0

File tree

2 files changed

+160
-58
lines changed

2 files changed

+160
-58
lines changed

scripts/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ To monitor the resource consumption of a running ComfyStream server, use the `mo
116116
2. **Retrieve the process ID (PID) of the server** using:
117117

118118
```bash
119-
ps aux | grep app.py
119+
pgrep -f app.py | xargs ps -o pid,cmd --pid
120120
```
121121

122122
3. **Run the profiling script:**

scripts/monitor_pid_resources.py

+159-57
Original file line numberDiff line numberDiff line change
@@ -10,45 +10,167 @@
1010
import csv
1111

1212

13+
def total_cpu_percent_with_children(pid: int, sample_window: float = 0.1) -> float:
    """Return total CPU usage (%) for process `pid` and its children.

    The non-blocking form of ``Process.cpu_percent`` reports usage since the
    previous call on the same ``Process`` object, so each process is read
    twice: once to prime the counter and once, after ``sample_window``
    seconds, to obtain the real value.

    Args:
        pid: Process ID to monitor.
        sample_window: Seconds to wait between the priming call and the real
            reading. Defaults to 0.1.

    Returns:
        Sum of the CPU percentages of the process and all of its descendants.
        Returns 0.0 when the parent process no longer exists. The value can
        exceed 100 on multi-core machines (see the psutil documentation).
    """
    try:
        parent = psutil.Process(pid)
        # children() can itself raise NoSuchProcess when the parent exits
        # right after the lookup, so it belongs inside the same guard.
        processes = [parent] + parent.children(recursive=True)
    except psutil.NoSuchProcess:
        return 0.0

    # Prime the CPU counters; remember only the processes that could be read,
    # because an unprimed reading carries no information.
    primed = []
    for proc in processes:
        try:
            proc.cpu_percent(interval=None)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue  # Ignore inaccessible processes
        primed.append(proc)

    time.sleep(sample_window)  # Allow measurements to update

    # Collect the real CPU usage accumulated during the sampling window.
    total_cpu = 0.0
    for proc in primed:
        try:
            total_cpu += proc.cpu_percent(interval=None)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue  # Ignore processes that disappeared
    return total_cpu
45+
46+
47+
def total_memory_with_children(pid: int) -> float:
    """Return the combined resident memory (MB) of a process and its children.

    Args:
        pid: Parent process ID.

    Returns:
        Sum of the RSS of the parent and every descendant, in MB. Returns 0.0
        when the parent process cannot be found.
    """
    try:
        root = psutil.Process(pid)
        family = [root, *root.children(recursive=True)]
    except psutil.NoSuchProcess:
        return 0.0  # Process not found

    rss_bytes = 0
    for member in family:
        try:
            rss_bytes += member.memory_info().rss  # Physical memory (RAM)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass  # Skip processes we cannot inspect
    return rss_bytes / (1024 * 1024)  # Bytes -> MB
71+
72+
73+
def total_gpu_usage_with_children(pid: int) -> tuple:
    """Return GPU utilisation (%) and VRAM usage (MB) for `pid` and its children.

    NVML reports utilisation per device, not per process, so the utilisation
    of a device is added once when at least one tracked process has a compute
    context on it. VRAM is reported per process and is summed over every
    tracked process.

    Args:
        pid: Process ID to monitor.

    Returns:
        Tuple ``(gpu_percent, vram_mb)``: the summed utilisation of every GPU
        used by the process tree, and the total VRAM in MB held by the tree.
        Both are 0 when the process is gone or NVML is unavailable; totals
        gathered before an error are kept (best effort).
    """
    total_gpu_usage = 0
    total_vram_usage = 0

    try:
        parent = psutil.Process(pid)
        # Build the PID set once so membership tests are O(1).
        tracked_pids = {parent.pid}
        tracked_pids.update(child.pid for child in parent.children(recursive=True))

        for index in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            device_in_use = False
            for proc_info in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                if proc_info.pid not in tracked_pids:
                    continue
                device_in_use = True
                # pynvml reports None when the driver cannot provide the
                # per-process memory figure.
                if proc_info.usedGpuMemory is not None:
                    total_vram_usage += proc_info.usedGpuMemory / (1024 * 1024)  # MB
            if device_in_use:
                # Device-wide figure: count it once per GPU, not once per process.
                total_gpu_usage += pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
    except (psutil.Error, pynvml.NVMLError):
        pass  # Best effort: process vanished or no usable GPU / NVML
    return total_gpu_usage, total_vram_usage
102+
103+
104+
def find_pid_by_name(name: str) -> "int | None":
    """Find the PID of the first process whose command line contains `name`.

    `name` must equal one complete command-line argument (for example
    ``app.py`` matches ``python app.py``). The calling process is skipped so
    that passing the name as an option to this script does not make the
    script find itself.

    Args:
        name: Command-line argument identifying the process to find.

    Returns:
        PID of the first matching process, or None when no process matches.
        The outcome is also reported through ``click.echo``.
    """
    import os  # Local import: only needed to identify the current process.

    own_pid = os.getpid()
    for proc in psutil.process_iter(["pid", "name", "cmdline"]):
        # psutil fills in None when an attribute cannot be read (e.g. access
        # denied); treat that as an empty command line.
        cmdline = proc.info["cmdline"] or []
        if proc.info["pid"] == own_pid or name not in cmdline:
            continue
        found_pid = proc.info["pid"]
        click.echo(
            click.style(f"Found process '{name}' with PID {found_pid}.", fg="green")
        )
        return found_pid
    click.echo(click.style(f"Error: Process with name '{name}' not found.", fg="red"))
    return None
122+
123+
13124
@click.command()
14-
@click.option("--pid", type=int, required=True, help="Process ID of the Python script")
15-
@click.option("--interval", type=int, default=2, help="Monitoring interval (seconds)")
16125
@click.option(
17-
"--duration", type=int, default=30, help="Total monitoring duration (seconds)"
126+
"--pid", type=str, default="auto", help='Process ID or "auto" to find by name'
127+
)
128+
@click.option(
129+
"--name", type=str, default="app.py", help="Process name (default: app.py)"
18130
)
131+
@click.option("--interval", type=int, default=2, help="Monitoring interval (seconds)")
19132
@click.option(
20-
"--output",
21-
type=str,
22-
default=None,
23-
help="File to save system resource logs (optional)",
133+
"--duration", type=int, default=30, help="Total monitoring duration (seconds)"
24134
)
135+
@click.option("--output", type=str, default=None, help="File to save logs (optional)")
25136
@click.option("--spy", is_flag=True, help="Enable py-spy profiling")
26137
@click.option(
27138
"--spy-output", type=str, default="pyspy_profile.svg", help="Py-Spy output file"
28139
)
29140
def monitor_resources(
30-
pid: int, interval: int, duration: int, output: str, spy: bool, spy_output: str
141+
pid: int,
142+
name: str,
143+
interval: int,
144+
duration: int,
145+
output: str,
146+
spy: bool,
147+
spy_output: str,
31148
):
32149
"""Monitor system resources for a given PID and optionally create a py-spy profiler
33150
report.
34151
35152
Args:
36153
pid (int): Process ID of the Python script.
154+
name (str): Name of the Python script.
37155
interval (int): Monitoring interval in seconds.
38156
duration (int): Total monitoring duration in seconds.
39157
output (str): File to save logs (optional).
40158
spy (bool): Enable py-spy profiling.
41159
spy_output (str): Py-Spy output file.
42160
"""
161+
if pid == "auto":
162+
pid = find_pid_by_name(name)
163+
if pid is None:
164+
return
165+
else:
166+
pid = int(pid)
167+
43168
if not psutil.pid_exists(pid):
44169
click.echo(click.style(f"Error: Process with PID {pid} not found.", fg="red"))
45170
return
46171

47172
click.echo(
48-
click.style(
49-
f"Monitoring system resources for PID {pid} for {duration} seconds...",
50-
fg="green",
51-
)
173+
click.style(f"Monitoring PID {pid} for {duration} seconds...", fg="green")
52174
)
53175

54176
def run_py_spy():
@@ -76,72 +198,52 @@ def run_py_spy():
76198
logs = []
77199
cpu_usages, ram_usages, gpu_usages, vram_usages = [], [], [], []
78200
while time.time() < end_time:
201+
start_time = time.time()
79202
try:
80-
# General system usage.
81-
process = psutil.Process(pid)
82-
cpu_usage = process.cpu_percent(interval=interval)
83-
ram_usage = process.memory_info().rss / (1024 * 1024) # MB
84-
85-
# GPU usage.
86-
process_gpu_usage = 0
87-
process_vram_usage = 0
88-
device_count = pynvml.nvmlDeviceGetCount()
89-
for i in range(device_count):
90-
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
91-
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
92-
for proc_info in processes:
93-
if proc_info.pid == pid:
94-
process_gpu_usage = pynvml.nvmlDeviceGetUtilizationRates(
95-
handle
96-
).gpu
97-
process_vram_usage = proc_info.usedGpuMemory / (
98-
1024 * 1024
99-
) # MB
100-
break
101-
102-
# Collect and log resource usage.
203+
cpu_usage = total_cpu_percent_with_children(pid)
204+
memory_usage = total_memory_with_children(pid)
205+
gpu_usage, vram_usage = total_gpu_usage_with_children(pid)
206+
103207
log_entry = {
104208
"CPU (%)": cpu_usage,
105-
"RAM (MB)": ram_usage,
106-
"GPU (%)": process_gpu_usage,
107-
"VRAM (MB)": process_vram_usage,
209+
"RAM (MB)": memory_usage,
210+
"GPU (%)": gpu_usage,
211+
"VRAM (MB)": vram_usage,
108212
}
109213
click.echo(
110-
f"CPU: {cpu_usage:.2f}%, RAM: {ram_usage:.2f}MB, GPU: {process_gpu_usage:.2f}%, VRAM: {process_vram_usage:.2f}MB"
214+
f"CPU: {cpu_usage:.2f}%, RAM: {memory_usage:.2f}MB, GPU: {gpu_usage:.2f}%, VRAM: {vram_usage:.2f}MB"
111215
)
112216
logs.append(log_entry)
113217
cpu_usages.append(cpu_usage)
114-
ram_usages.append(ram_usage)
115-
gpu_usages.append(process_gpu_usage)
116-
vram_usages.append(process_vram_usage)
218+
ram_usages.append(memory_usage)
219+
gpu_usages.append(gpu_usage)
220+
vram_usages.append(vram_usage)
221+
222+
# Adjust sleep time to maintain exact interval
223+
elapsed_time = time.time() - start_time
224+
sleep_time = max(0, interval - elapsed_time)
225+
time.sleep(sleep_time)
117226
except psutil.NoSuchProcess:
118-
click.echo(click.style("Error: Process terminated!"))
227+
click.echo(click.style("Error: Process terminated!", fg="red"))
119228
break
120229

121230
pynvml.nvmlShutdown()
122231

123-
# Calculate and log average resource usage.
124-
avg_cpu_usage = sum(cpu_usages) / len(cpu_usages) if cpu_usages else 0
125-
avg_ram_usage = sum(ram_usages) / len(ram_usages) if ram_usages else 0
126-
avg_gpu_usage = sum(gpu_usages) / len(gpu_usages) if gpu_usages else 0
127-
avg_vram_usage = sum(vram_usages) / len(vram_usages) if vram_usages else 0
128-
avg_log_entry = {
129-
"CPU (%)": avg_cpu_usage,
130-
"RAM (MB)": avg_ram_usage,
131-
"GPU (%)": avg_gpu_usage,
132-
"VRAM (MB)": avg_vram_usage,
133-
}
232+
# Calculate and log averages
233+
avg_cpu = sum(cpu_usages) / len(cpu_usages) if cpu_usages else 0
234+
avg_ram = sum(ram_usages) / len(ram_usages) if ram_usages else 0
235+
avg_gpu = sum(gpu_usages) / len(gpu_usages) if gpu_usages else 0
236+
avg_vram = sum(vram_usages) / len(vram_usages) if vram_usages else 0
237+
134238
click.echo(
135-
f"AVERAGE - CPU: {avg_cpu_usage:.2f}%, RAM: {avg_ram_usage:.2f}MB, GPU: {avg_gpu_usage:.2f}%, VRAM: {avg_vram_usage:.2f}MB"
239+
f"AVERAGE - CPU: {avg_cpu:.2f}%, RAM: {avg_ram:.2f}MB, GPU: {avg_gpu:.2f}%, VRAM: {avg_vram:.2f}MB"
136240
)
137-
logs.append(avg_log_entry)
138241

139242
# Save logs if output file is provided.
140243
if output:
141244
with open(output, "w", newline="") as csvfile:
142245
fieldnames = ["CPU (%)", "RAM (MB)", "GPU (%)", "VRAM (MB)"]
143246
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
144-
145247
writer.writeheader()
146248
writer.writerows(logs)
147249
click.echo(click.style(f"Logs saved to {output}", fg="green"))

0 commit comments

Comments
 (0)