From 4ae4353cc556ebb59d92d74bf7192de2e09e3527 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 18 Jun 2024 22:22:12 -0700
Subject: [PATCH 1/6] use fork by default for mp

---
 vllm/envs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index f03b69f4b8866..ae2fcd0826fb1 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -29,7 +29,7 @@
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -212,7 +212,7 @@
     # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
 
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds

From 4e5a3f014abf53de1de79028848260f23173d580 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 19 Jun 2024 09:06:16 -0700
Subject: [PATCH 2/6] fix spawn

---
 .../custom_all_reduce_utils.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index e0641a54c4194..440f59ed3f915 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,6 +1,9 @@
 import ctypes
 import json
 import os
+import pickle
+import subprocess
+import sys
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
@@ -198,7 +201,13 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
-        result = can_actually_p2p(batch_src, batch_tgt)
+        input_bytes = pickle.dumps((batch_src, batch_tgt))
+        returned = subprocess.run([sys.executable, __file__],
+                                  input=input_bytes,
+                                  capture_output=True)
+        # check if the subprocess is successful
+        returned.check_returncode()
+        result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r
         with open(path, "w") as f:
@@ -213,3 +222,8 @@
 
 
 __all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    sys.stdout.buffer.write(pickle.dumps(result))

From 2e2731637f1b25d7ca3086b7951a7e671e023f71 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 19 Jun 2024 09:10:02 -0700
Subject: [PATCH 3/6] add comments

---
 .../device_communicators/custom_all_reduce_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 440f59ed3f915..1bb8b8011def1 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -201,6 +201,12 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have an `if __name__ == "__main__":`
+        # guard, in which case the spawn method cannot be used in
+        # multiprocessing. However, `can_actually_p2p` requires the spawn
+        # method. The fix is to call the function via `subprocess`,
+        # since this file does have an `if __name__ == "__main__":` guard.
         input_bytes = pickle.dumps((batch_src, batch_tgt))
         returned = subprocess.run([sys.executable, __file__],
                                   input=input_bytes,

From 288c99a304730e701ac715b1452031d539d000c8 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 19 Jun 2024 09:28:41 -0700
Subject: [PATCH 4/6] wrap error message

---
 .../device_communicators/custom_all_reduce_utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 1bb8b8011def1..ffe357f51a6ea 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -212,7 +212,15 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
                                   input=input_bytes,
                                   capture_output=True)
         # check if the subprocess is successful
-        returned.check_returncode()
+        try:
+            returned.check_returncode()
+        except Exception as e:
+            # wrap raised exception to provide more information
+            logger.error(
+                "Error happened when batch testing "
+                "peer-to-peer access"
+                " from %s to %s", batch_src, batch_tgt)
+            raise e
         result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r

From 191f8ccfcdbe63f06f5beb753df43f5f2f736c66 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 20 Jun 2024 15:35:21 -0700
Subject: [PATCH 5/6] temp fix, use spawn for testing

---
 .buildkite/test-pipeline.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5afe3730210e8..67437ef7ea4b3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -37,6 +37,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -55,6 +58,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -145,6 +151,9 @@
   num_gpus: 4
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
 - label: Tensorizer Test

From 8348ef58bb55c57dafcc1e27c53bf98c44254ddd Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 20 Jun 2024 15:36:50 -0700
Subject: [PATCH 6/6] use raise from

---
 .../device_communicators/custom_all_reduce_utils.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index ffe357f51a6ea..d3e41fa710676 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -216,11 +216,9 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
             returned.check_returncode()
         except Exception as e:
             # wrap raised exception to provide more information
-            logger.error(
-                "Error happened when batch testing "
-                "peer-to-peer access"
-                " from %s to %s", batch_src, batch_tgt)
-            raise e
+            raise RuntimeError(
+                f"Error happened when batch testing "
+                f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
         result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r
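
Reviewer note: for anyone unfamiliar with the stdin/stdout pickle handshake that patches 2 and 6 rely on, the minimal sketch below shows the same pattern in isolation. It is an illustration only, not vLLM code: the file name p2p_probe_sketch.py, the function expensive_check, and the helper run_in_fresh_process are hypothetical stand-ins for custom_all_reduce_utils.py, can_actually_p2p, and the inline logic in gpu_p2p_access_check.

# p2p_probe_sketch.py -- hypothetical file name, illustration only (not part of vLLM).
# Pattern used by patches 2 and 6: pickle the arguments, re-run this same file
# in a fresh interpreter via `subprocess`, and unpickle the result from the
# child's stdout. The child-side code lives under `if __name__ == "__main__":`
# in *this* file, so callers of `run_in_fresh_process` need no such guard.
import pickle
import subprocess
import sys


def expensive_check(values):
    # Hypothetical stand-in for `can_actually_p2p`: work that should run in a
    # clean process (e.g. code that must not initialize CUDA in the caller).
    return [v * v for v in values]


def run_in_fresh_process(values):
    input_bytes = pickle.dumps(values)
    returned = subprocess.run([sys.executable, __file__],
                              input=input_bytes,
                              capture_output=True)
    try:
        returned.check_returncode()
    except subprocess.CalledProcessError as e:
        # Mirrors patch 6: wrap the failure with more context and chain the cause.
        raise RuntimeError(
            f"child process failed for input {values!r}: "
            f"{returned.stderr.decode(errors='replace')}") from e
    return pickle.loads(returned.stdout)


if __name__ == "__main__":
    # Child side: read pickled arguments from stdin and write the pickled
    # result to stdout, exactly like the `__main__` block added in patch 2.
    args = pickle.loads(sys.stdin.buffer.read())
    sys.stdout.buffer.write(pickle.dumps(expensive_check(args)))

Assuming the file is importable from the current directory, running `python -c "from p2p_probe_sketch import run_in_fresh_process; print(run_in_fresh_process([1, 2, 3]))"` should print `[1, 4, 9]`, with the computation happening in the child interpreter. Because the child is a brand-new interpreter started via `sys.executable`, it behaves like the spawn start method (nothing, CUDA context included, is inherited from the parent), while the calling module never needs its own `if __name__ == "__main__":` guard; that is what lets patch 1 switch the default worker multiprocessing method to fork.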