From 4ae4353cc556ebb59d92d74bf7192de2e09e3527 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 18 Jun 2024 22:22:12 -0700
Subject: [PATCH 1/6] use fork by default for mp

---
 vllm/envs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index f03b69f4b8866..ae2fcd0826fb1 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -29,7 +29,7 @@
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -212,7 +212,7 @@
     # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
 
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds

From 4e5a3f014abf53de1de79028848260f23173d580 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 19 Jun 2024 09:06:16 -0700
Subject: [PATCH 2/6] fix spawn

---
 .../custom_all_reduce_utils.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index e0641a54c4194..440f59ed3f915 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,6 +1,9 @@
 import ctypes
 import json
 import os
+import pickle
+import subprocess
+import sys
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
@@ -198,7 +201,13 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
-        result = can_actually_p2p(batch_src, batch_tgt)
+        input_bytes = pickle.dumps((batch_src, batch_tgt))
+        returned = subprocess.run([sys.executable, __file__],
+                                  input=input_bytes,
+                                  capture_output=True)
+        # check if the subprocess is successful
+        returned.check_returncode()
+        result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r
         with open(path, "w") as f:
@@ -213,3 +222,8 @@
 
 
 __all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    sys.stdout.buffer.write(pickle.dumps(result))

From 2e2731637f1b25d7ca3086b7951a7e671e023f71 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 19 Jun 2024 09:10:02 -0700
Subject: [PATCH 3/6] add comments

---
 .../device_communicators/custom_all_reduce_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 440f59ed3f915..1bb8b8011def1 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -201,6 +201,12 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have an `if __name__ == "__main__":`
+        # guard, in which case the spawn method cannot be used in
+        # multiprocessing. However, `can_actually_p2p` requires the spawn
+        # method. The fix is to call the function via `subprocess`,
+        # since this file does have an `if __name__ == "__main__":` guard.
         input_bytes = pickle.dumps((batch_src, batch_tgt))
         returned = subprocess.run([sys.executable, __file__],
                                   input=input_bytes,

From 288c99a304730e701ac715b1452031d539d000c8 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 19 Jun 2024 09:28:41 -0700
Subject: [PATCH 4/6] wrap error message

---
 .../device_communicators/custom_all_reduce_utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 1bb8b8011def1..ffe357f51a6ea 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -212,7 +212,15 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
                                   input=input_bytes,
                                   capture_output=True)
         # check if the subprocess is successful
-        returned.check_returncode()
+        try:
+            returned.check_returncode()
+        except Exception as e:
+            # wrap raised exception to provide more information
+            logger.error(
+                "Error happened when batch testing "
+                "peer-to-peer access"
+                " from %s to %s", batch_src, batch_tgt)
+            raise e
         result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r

From 191f8ccfcdbe63f06f5beb753df43f5f2f736c66 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 20 Jun 2024 15:35:21 -0700
Subject: [PATCH 5/6] temp fix, use spawn for testing

---
 .buildkite/test-pipeline.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5afe3730210e8..67437ef7ea4b3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -37,6 +37,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -55,6 +58,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -145,6 +151,9 @@
   num_gpus: 4
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
 - label: Tensorizer Test

From 8348ef58bb55c57dafcc1e27c53bf98c44254ddd Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 20 Jun 2024 15:36:50 -0700
Subject: [PATCH 6/6] use raise from

---
 .../device_communicators/custom_all_reduce_utils.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index ffe357f51a6ea..d3e41fa710676 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -216,11 +216,9 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
             returned.check_returncode()
         except Exception as e:
             # wrap raised exception to provide more information
-            logger.error(
-                "Error happened when batch testing "
-                "peer-to-peer access"
-                " from %s to %s", batch_src, batch_tgt)
-            raise e
+            raise RuntimeError(
+                f"Error happened when batch testing "
+                f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
         result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r
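
Reviewer note: for anyone unfamiliar with the stdin/stdout pickle handshake that patches 2 and 6 rely on, the minimal sketch below shows the same pattern in isolation. It is an illustration only, not vLLM code: the file name p2p_probe_sketch.py, the function expensive_check, and the helper run_in_fresh_process are hypothetical stand-ins for custom_all_reduce_utils.py, can_actually_p2p, and the inline logic in gpu_p2p_access_check.

# p2p_probe_sketch.py -- hypothetical file name, illustration only (not part of vLLM).
# Pattern used by patches 2 and 6: pickle the arguments, re-run this same file
# in a fresh interpreter via `subprocess`, and unpickle the result from the
# child's stdout. The child-side code lives under `if __name__ == "__main__":`
# in *this* file, so callers of `run_in_fresh_process` need no such guard.
import pickle
import subprocess
import sys


def expensive_check(values):
    # Hypothetical stand-in for `can_actually_p2p`: work that should run in a
    # clean process (e.g. code that must not initialize CUDA in the caller).
    return [v * v for v in values]


def run_in_fresh_process(values):
    input_bytes = pickle.dumps(values)
    returned = subprocess.run([sys.executable, __file__],
                              input=input_bytes,
                              capture_output=True)
    try:
        returned.check_returncode()
    except subprocess.CalledProcessError as e:
        # Mirrors patch 6: wrap the failure with more context and chain the cause.
        raise RuntimeError(
            f"child process failed for input {values!r}: "
            f"{returned.stderr.decode(errors='replace')}") from e
    return pickle.loads(returned.stdout)


if __name__ == "__main__":
    # Child side: read pickled arguments from stdin and write the pickled
    # result to stdout, exactly like the `__main__` block added in patch 2.
    args = pickle.loads(sys.stdin.buffer.read())
    sys.stdout.buffer.write(pickle.dumps(expensive_check(args)))

Assuming the file is importable from the current directory, running `python -c "from p2p_probe_sketch import run_in_fresh_process; print(run_in_fresh_process([1, 2, 3]))"` should print `[1, 4, 9]`, with the computation happening in the child interpreter. Because the child is a brand-new interpreter started via `sys.executable`, it behaves like the spawn start method (nothing, CUDA context included, is inherited from the parent), while the calling module never needs its own `if __name__ == "__main__":` guard; that is what lets patch 1 switch the default worker multiprocessing method to fork.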