[llm.serving] Fix using uni executor when world size == 1 (#50849) (#50863)

GeneDer · web-flow · commit ecd0709d2d66 · 2025-02-24T12:21:05.000-08:00
Cherry-pick: #50849 Signed-off-by: Gene Su <e870252314@gmail.com>
diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
@@ -191,15 +191,18 @@ def __init__(self, ipc_path, engine_args, engine_config):
         # Adapted from vllm.engine.multiprocessing.engine.MQLLMEngine.from_engine_args
         vllm.plugins.load_general_plugins()
 
-        executor_class = vllm.engine.llm_engine.LLMEngine._get_executor_cls(
-            engine_config
-        )
+        # Note (genesu): A bug in vllm 0.7.2 forces the use of the uni-process
+        # executor when world_size is 1. It is fixed by
+        # https://github.com/vllm-project/vllm/pull/12934, which ships with
+        # vllm 0.7.3. However, in Ray's llm package we always use the Ray
+        # distributed executor, so the engine is always compatible with Ray.
+        from vllm.executor.ray_distributed_executor import RayDistributedExecutor
 
         self.engine = MQLLMEngine(
             ipc_path=ipc_path,
             use_async_sockets=engine_config.model_config.use_async_output_proc,
             vllm_config=engine_config,
-            executor_class=executor_class,
+            executor_class=RayDistributedExecutor,
             log_requests=not engine_args.disable_log_requests,
             log_stats=not engine_args.disable_log_stats,
             usage_context=vllm.usage.usage_lib.UsageContext.API_SERVER,