@@ -22,7 +22,14 @@
 from ray.serve.handle import DeploymentHandle
 from starlette.responses import JSONResponse, Response, StreamingResponse
 
-from ray.llm._internal.serve.configs.constants import RAYLLM_ROUTER_HTTP_TIMEOUT
+from ray.llm._internal.serve.configs.constants import (
+    RAYLLM_ROUTER_HTTP_TIMEOUT,
+    ROUTER_TO_MODEL_REPLICA_RATIO,
+    RAYLLM_ROUTER_MIN_REPLICAS,
+    RAYLLM_ROUTER_INITIAL_REPLICAS,
+    RAYLLM_ROUTER_MAX_REPLICAS,
+    RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS,
+)
 from ray.llm._internal.serve.observability.logging import get_logger
 from ray.llm._internal.serve.observability.metrics.fast_api_metrics import (
     add_http_metrics_middleware,
@@ -52,6 +59,7 @@
     LLMConfig,
     ModelData,
     Model,
+    AutoscalingConfig,
 )
 from ray.llm._internal.serve.deployments.routers.middleware import (
     SetRequestIdMiddleware,
@@ -397,30 +405,53 @@ async def chat(self, body: ChatCompletionRequest) -> Response:
         return JSONResponse(content=result.model_dump())
 
     @classmethod
-    def as_deployment(cls) -> serve.Deployment:
+    def as_deployment(
+        cls, llm_configs: Optional[List[LLMConfig]] = None
+    ) -> serve.Deployment:
         """Converts this class to a Ray Serve deployment with ingress.
 
         Returns:
             A Ray Serve deployment.
         """
+        min_replicas = RAYLLM_ROUTER_MIN_REPLICAS
+        initial_replicas = RAYLLM_ROUTER_INITIAL_REPLICAS
+        max_replicas = RAYLLM_ROUTER_MAX_REPLICAS
+
+        # Note (genesu): Based on our internal benchmarks, the router replicas are
+        # currently the bottleneck under high concurrency. We set the router replicas
+        # to ~2x the total model replicas and make them scale faster.
+        if llm_configs:
+            model_min_replicas = 0
+            model_initial_replicas = 0
+            model_max_replicas = 0
+            for llm_config in llm_configs:
+                if "autoscaling_config" in llm_config.deployment_config:
+                    autoscaling_config = llm_config.deployment_config[
+                        "autoscaling_config"
+                    ]
+                    if isinstance(autoscaling_config, dict):
+                        autoscaling_config = AutoscalingConfig(
+                            **llm_config.deployment_config["autoscaling_config"]
+                        )
+                else:
+                    # When autoscaling config is not provided, we use the default.
+                    autoscaling_config = AutoscalingConfig()
+                model_min_replicas += autoscaling_config.min_replicas
+                model_initial_replicas += autoscaling_config.initial_replicas
+                model_max_replicas += autoscaling_config.max_replicas
+            min_replicas = int(model_min_replicas * ROUTER_TO_MODEL_REPLICA_RATIO)
+            initial_replicas = int(
+                model_initial_replicas * ROUTER_TO_MODEL_REPLICA_RATIO
+            )
+            max_replicas = int(model_max_replicas * ROUTER_TO_MODEL_REPLICA_RATIO)
 
         ingress_cls = serve.ingress(fastapi_router_app)(cls)
         deployment_decorator = serve.deployment(
-            # TODO (Kourosh): make this configurable
             autoscaling_config={
-                "min_replicas": int(os.environ.get("RAYLLM_ROUTER_MIN_REPLICAS", 0)),
-                "initial_replicas": int(
-                    os.environ.get("RAYLLM_ROUTER_INITIAL_REPLICAS", 2)
-                ),
-                "max_replicas": int(os.environ.get("RAYLLM_ROUTER_MAX_REPLICAS", 16)),
-                "target_ongoing_requests": int(
-                    os.environ.get(
-                        "RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS",
-                        os.environ.get(
-                            "RAYLLM_ROUTER_TARGET_NUM_ONGOING_REQUESTS_PER_REPLICA", 200
-                        ),
-                    )
-                ),
+                "min_replicas": min_replicas,
+                "initial_replicas": initial_replicas,
+                "max_replicas": max_replicas,
+                "target_ongoing_requests": RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS,
             },
             ray_actor_options=json.loads(
                 os.environ.get("RAYLLM_ROUTER_RAY_ACTOR_OPTIONS", "{}")
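
For reference, below is a minimal standalone sketch of the replica math introduced above: it sums each autoscaling bound across the model deployments and multiplies by the router-to-model ratio. The ratio value of 2 (taken from the "~2x" note in the diff) and the small ModelAutoscaling stand-in are assumptions for illustration only; the real constant and AutoscalingConfig come from ray.llm._internal.serve.configs.constants and ray.serve.config, and this sketch is not the code under review.

# Hypothetical stand-ins for illustration; not the Ray implementation.
from dataclasses import dataclass
from typing import Dict, List

ROUTER_TO_MODEL_REPLICA_RATIO = 2  # assumed value, per the "~2x" note in the diff


@dataclass
class ModelAutoscaling:
    """Simplified stand-in for a model deployment's AutoscalingConfig."""
    min_replicas: int = 1
    initial_replicas: int = 1
    max_replicas: int = 1


def router_autoscaling(models: List[ModelAutoscaling]) -> Dict[str, int]:
    """Sum each bound over all model deployments, then scale by the router ratio."""
    ratio = ROUTER_TO_MODEL_REPLICA_RATIO
    return {
        "min_replicas": int(sum(m.min_replicas for m in models) * ratio),
        "initial_replicas": int(sum(m.initial_replicas for m in models) * ratio),
        "max_replicas": int(sum(m.max_replicas for m in models) * ratio),
    }


# Two models that each scale from 1 to 4 replicas, starting at 2:
print(router_autoscaling([ModelAutoscaling(1, 2, 4), ModelAutoscaling(1, 2, 4)]))
# -> {'min_replicas': 4, 'initial_replicas': 8, 'max_replicas': 16}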