Skip to content

Commit 76dcbf7

Browse files
committed
refactor(dev): clarify GPU monitoring behavior in containers
This commit improves monitoring logic to clearly indicate that GPU monitoring cannot be done automatically using the script name or container PID inside a container. Users are advised to provide the host PID instead.
1 parent b0f15bf commit 76dcbf7

File tree

2 files changed

+106
-44
lines changed

2 files changed

+106
-44
lines changed

scripts/README.md

+16-2
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,32 @@ The `spinup_comfystream_tensordock.py` script automates VM provisioning, setup,
117117
To monitor the resource consumption of a running ComfyStream server, use the `monitor_pid_resources.py` script:
118118

119119
1. **Start the ComfyStream server** and execute a streaming workflow.
120-
2. **Retrieve the process ID (PID) of the server** using:
120+
2. **Run the profiling script**:
121+
122+
```bash
123+
python monitor_pid_resources.py --name app.py
124+
```
125+
126+
The script will automatically try to find the process ID (PID) of the server. If you prefer to specify the PID manually, you can retrieve it using:
121127

122128
```bash
123129
pgrep -f app.py | xargs ps -o pid,cmd --pid
124130
```
125131

126-
3. **Run the profiling script:**
132+
Then run the profiling script with the retrieved PID:
127133

128134
```bash
129135
python monitor_pid_resources.py --pid <PID>
130136
```
131137

138+
3. **Running Inside a Container**: If you are running the script inside a container, use the `--host-pid` option to provide the host PID for accurate GPU monitoring:
139+
140+
```bash
141+
python monitor_pid_resources.py --name app.py --host-pid <HOST_PID>
142+
```
143+
144+
Find `<HOST_PID>` by running `nvidia-smi` on the host and reading the PID column of its process list.
145+
132146
The script will continuously track **CPU, memory, GPU, and VRAM usage** at specified intervals. If the `--spy` flag is used, it will also generate a **detailed Py-Spy profiler report** for deeper performance insights.
133147

134148
### Additional Options

scripts/monitor_pid_resources.py

+90-42
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""A Python script to monitor system resources for a given PID and optionally create
1+
"""A Python script to monitor system resources for a given PID and optionally create
22
a py-spy profiler report."""
33

44
import psutil
@@ -8,25 +8,51 @@
88
import click
99
import threading
1010
import csv
11+
from pathlib import Path
12+
from typing import List
1113

1214

13-
def total_cpu_percent_with_children(pid: int) -> float:
14-
"""Return total CPU usage (%) for process `pid` and its children.
15+
def is_running_inside_container() -> bool:
    """Detect whether the script is running inside a container.

    Two heuristics are tried in order:

    1. The ``/.dockerenv`` marker file exists.
    2. Any line of ``/proc/1/cgroup`` mentions ``docker`` or ``kubepods``.

    Returns:
        True if either heuristic matches; False otherwise, including when
        ``/proc/1/cgroup`` is missing or cannot be read.
    """
    if Path("/.dockerenv").exists():
        return True
    try:
        with open("/proc/1/cgroup", "rt") as f:
            return any("docker" in line or "kubepods" in line for line in f)
    except OSError:
        # Covers a missing file (FileNotFoundError, e.g. non-Linux hosts) as
        # well as an unreadable /proc (PermissionError): detection is
        # best-effort and must not crash the monitor.
        return False
24+
25+
26+
def get_all_processes(pid: int) -> List[psutil.Process]:
27+
"""Return the parent process and all its children.
1528
1629
Args:
17-
pid: Process ID to monitor.
30+
pid: Parent process ID.
1831
1932
Returns:
20-
Total CPU usage (%) for the process and its children.
33+
List of all processes (parent and children).
2134
"""
2235
try:
2336
parent = psutil.Process(pid)
37+
children = parent.children(recursive=True)
38+
return [parent] + children
2439
except psutil.NoSuchProcess:
40+
return []
41+
42+
43+
def total_cpu_percent(pids: List[psutil.Process]) -> float:
44+
"""Return total CPU usage (%) for a list of process IDs.
45+
Args:
46+
pids: List of `psutil.Process` objects to monitor.
47+
48+
Returns:
49+
Total CPU usage (%) for the process IDs.
50+
"""
51+
if not pids:
2552
return 0.0
2653

2754
# Prime CPU measurement for child processes.
28-
processes = [parent] + parent.children(recursive=True)
29-
for proc in processes:
55+
for proc in pids:
3056
try:
3157
proc.cpu_percent(interval=None) # Prime the reading
3258
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
@@ -36,68 +62,61 @@ def total_cpu_percent_with_children(pid: int) -> float:
3662

3763
# Get the real CPU usage for all processes.
3864
total_cpu = 0.0
39-
for proc in processes:
65+
for proc in pids:
4066
try:
4167
total_cpu += proc.cpu_percent(interval=0.0) # Get real CPU %
4268
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
4369
continue # Ignore processes that disappeared
4470
return total_cpu
4571

4672

47-
def total_memory_with_children(pid: int) -> float:
48-
"""Return total memory usage (MB) for a process and its children.
73+
def total_memory(pids: List[psutil.Process]) -> float:
74+
"""Return total memory usage (MB) for a list of process IDs.
4975
5076
Args:
51-
pid: Parent process ID.
77+
pids: List of `psutil.Process` objects to monitor.
5278
5379
Returns:
54-
Total memory usage in MB.
80+
Total memory usage in MB for the process IDs.
5581
"""
56-
try:
57-
parent = psutil.Process(pid)
58-
children = parent.children(recursive=True)
59-
all_processes = [parent] + children
60-
total_mem = 0
61-
for proc in all_processes:
62-
try:
63-
mem_info = proc.memory_info()
64-
total_mem += mem_info.rss # Count physical memory (RAM)
65-
except (psutil.NoSuchProcess, psutil.AccessDenied):
66-
continue # Ignore processes we can't access
67-
return total_mem / (1024 * 1024) # Convert bytes to MB
68-
except psutil.NoSuchProcess:
69-
return 0.0 # Process not found
82+
if not pids:
83+
return 0.0
84+
85+
total_mem = 0
86+
for proc in pids:
87+
try:
88+
mem_info = proc.memory_info()
89+
total_mem += mem_info.rss # Count physical memory (RAM)
90+
except (psutil.NoSuchProcess, psutil.AccessDenied):
91+
continue # Ignore processes we can't access
92+
return total_mem / (1024 * 1024) # Convert bytes to MB
7093

7194

72-
def total_gpu_usage_with_children(pid: int) -> tuple:
73-
"""Return total GPU and VRAM usage (%) for process `pid` and its children.
95+
def total_gpu_usage(pids: List[int]) -> tuple:
96+
"""Return total GPU and VRAM usage (%) for a list of process IDs.
7497
7598
Args:
76-
pid: Process ID to monitor.
99+
pids: List of process IDs to monitor.
77100
78101
Returns:
79-
Tuple containing total GPU usage (%) and total VRAM usage (MB) for the process
80-
and its children.
102+
Tuple containing total GPU usage (%) and total VRAM usage (MB) for the
103+
process IDs.
81104
"""
82-
total_gpu_usage = 0
105+
total_usage = 0
83106
total_vram_usage = 0
84107

85108
try:
86-
parent = psutil.Process(pid)
87-
children = parent.children(recursive=True)
88-
all_processes = [parent] + children
89-
90109
device_count = pynvml.nvmlDeviceGetCount()
91110
for i in range(device_count):
92111
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
93112
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
94113
for proc_info in processes:
95-
if proc_info.pid in [p.pid for p in all_processes]:
96-
total_gpu_usage += pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
114+
if proc_info.pid in pids:
115+
total_usage += pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
97116
total_vram_usage += proc_info.usedGpuMemory / (1024 * 1024) # MB
98117
except Exception:
99118
pass # Ignore errors (e.g., no GPU available)
100-
return total_gpu_usage, total_vram_usage
119+
return total_usage, total_vram_usage
101120

102121

103122
def find_pid_by_name(name: str) -> int:
@@ -136,6 +155,12 @@ def find_pid_by_name(name: str) -> int:
136155
@click.option(
137156
"--spy-output", type=str, default="pyspy_profile.svg", help="Py-Spy output file"
138157
)
158+
@click.option(
159+
"--host-pid",
160+
type=int,
161+
default=None,
162+
help="Host PID for GPU monitoring when running inside a container. Use 'pgrep -f app.py' to find the PID.",
163+
)
139164
def monitor_resources(
140165
pid: int,
141166
name: str,
@@ -144,6 +169,7 @@ def monitor_resources(
144169
output: str,
145170
spy: bool,
146171
spy_output: str,
172+
host_pid: int,
147173
):
148174
"""Monitor system resources for a given PID and optionally create a py-spy profiler
149175
report.
@@ -156,6 +182,7 @@ def monitor_resources(
156182
output (str): File to save logs (optional).
157183
spy (bool): Enable py-spy profiling.
158184
spy_output (str): Py-Spy output file.
185+
host_pid (int): Host PID for GPU monitoring (useful inside containers).
159186
"""
160187
if pid == "auto":
161188
pid = find_pid_by_name(name)
@@ -164,6 +191,24 @@ def monitor_resources(
164191
else:
165192
pid = int(pid)
166193

194+
if is_running_inside_container():
195+
if not host_pid:
196+
click.echo(
197+
click.style(
198+
"Warning: Running inside a container. GPU monitoring may not work correctly "
199+
"since `nvidia-smi` uses the host PID namespace. To fix this, provide the "
200+
"host PID using the `--host-pid` flag.",
201+
fg="yellow",
202+
)
203+
)
204+
else:
205+
click.echo(
206+
click.style(
207+
f"Running inside a container. Monitoring GPU using host PID {host_pid}.",
208+
fg="cyan",
209+
)
210+
)
211+
167212
if not psutil.pid_exists(pid):
168213
click.echo(click.style(f"Error: Process with PID {pid} not found.", fg="red"))
169214
return
@@ -202,9 +247,12 @@ def run_py_spy():
202247
elapsed_monitor_time = time.time() - monitor_start_time
203248
progress = (elapsed_monitor_time / duration) * 100
204249
try:
205-
cpu_usage = total_cpu_percent_with_children(pid)
206-
memory_usage = total_memory_with_children(pid)
207-
gpu_usage, vram_usage = total_gpu_usage_with_children(pid)
250+
all_processes = get_all_processes(pid)
251+
cpu_usage = total_cpu_percent(all_processes)
252+
memory_usage = total_memory(all_processes)
253+
gpu_usage, vram_usage = total_gpu_usage(
254+
[proc.pid for proc in all_processes] if not host_pid else [host_pid]
255+
)
208256

209257
log_entry = {
210258
"CPU (%)": cpu_usage,

0 commit comments

Comments
 (0)