
Commit ecdcdc6

[llm.serving] Reconfigure router to better perform under high concurrency (#50876) (#50884)
Cherry pick: #50876
Signed-off-by: Gene Su <e870252314@gmail.com>
1 parent 84f2764 commit ecdcdc6

File tree: 4 files changed (+119, -17)

python/ray/llm/_internal/serve/builders/application_builders.py (+3, -1)
@@ -64,4 +64,6 @@ def build_openai_app(llm_serving_args: LLMServingArgs) -> Application:
 
     llm_deployments = _get_llm_deployments(llm_configs)
 
-    return LLMRouter.as_deployment().bind(llm_deployments=llm_deployments)
+    return LLMRouter.as_deployment(llm_configs=llm_configs).bind(
+        llm_deployments=llm_deployments
+    )
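With this change, the builder forwards the model configs to the router so the router can size its own autoscaling from them. Below is a minimal usage sketch of the builder path touched here; the model ID, accelerator type, and replica counts are made-up, and the imports use the internal module paths shown in this commit rather than any public entry point.

# Illustrative sketch only: model ID, accelerator type, and replica counts are
# placeholders; imports use the internal module paths shown in this commit.
from ray import serve
from ray.llm._internal.serve.builders.application_builders import build_openai_app
from ray.llm._internal.serve.configs.server_models import (
    AutoscalingConfig,
    LLMConfig,
    LLMServingArgs,
    ModelLoadingConfig,
)

llm_config = LLMConfig(
    model_loading_config=ModelLoadingConfig(model_id="example-model"),
    accelerator_type="L4",
    deployment_config={
        "autoscaling_config": AutoscalingConfig(
            min_replicas=1, initial_replicas=2, max_replicas=8
        )
    },
)

# build_openai_app() now calls LLMRouter.as_deployment(llm_configs=...), so the
# router's replica bounds track the model's autoscaling config (~2x by default).
app = build_openai_app(LLMServingArgs(llm_configs=[llm_config]))
serve.run(app)  # requires a running Ray cluster with the requested accelerators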

python/ray/llm/_internal/serve/configs/constants.py (+18)
@@ -66,3 +66,21 @@
 ENABLE_VERBOSE_TELEMETRY = bool(int(os.getenv("RAYLLM_ENABLE_VERBOSE_TELEMETRY", "0")))
 
 RAYLLM_VLLM_ENGINE_CLS_ENV = "RAYLLM_VLLM_ENGINE_CLS"
+
+# The ratio of number of router replicas to number of model replicas. Default to 2
+# meaning that there are 2 router replicas for every model replica.
+ROUTER_TO_MODEL_REPLICA_RATIO = float(
+    os.getenv("RAYLLM_ROUTER_TO_MODEL_REPLICA_RATIO", "2")
+)
+
+RAYLLM_ROUTER_MIN_REPLICAS = int(os.environ.get("RAYLLM_ROUTER_MIN_REPLICAS", 0))
+RAYLLM_ROUTER_INITIAL_REPLICAS = int(
+    os.environ.get("RAYLLM_ROUTER_INITIAL_REPLICAS", 2)
+)
+RAYLLM_ROUTER_MAX_REPLICAS = int(os.environ.get("RAYLLM_ROUTER_MAX_REPLICAS", 16))
+RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS = int(
+    os.environ.get(
+        "RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS",
+        DEFAULT_TARGET_ONGOING_REQUESTS,  # 16
+    )
+)
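These defaults are read from the environment when the constants module is imported, so overrides have to be in place before any rayllm import. A small sketch of that, with made-up override values; it only takes effect in a fresh process where the module has not been imported yet.

import os

# Hypothetical overrides; the values 3 and 32 are examples only. They must be
# set before the constants module is imported, because the defaults above are
# evaluated at import time via os.getenv / os.environ.get.
os.environ["RAYLLM_ROUTER_TO_MODEL_REPLICA_RATIO"] = "3"
os.environ["RAYLLM_ROUTER_MAX_REPLICAS"] = "32"

from ray.llm._internal.serve.configs.constants import (
    RAYLLM_ROUTER_MAX_REPLICAS,
    ROUTER_TO_MODEL_REPLICA_RATIO,
)

print(ROUTER_TO_MODEL_REPLICA_RATIO)  # 3.0
print(RAYLLM_ROUTER_MAX_REPLICAS)  # 32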

python/ray/llm/_internal/serve/deployments/routers/router.py (+47, -16)
@@ -22,7 +22,14 @@
 from ray.serve.handle import DeploymentHandle
 from starlette.responses import JSONResponse, Response, StreamingResponse
 
-from ray.llm._internal.serve.configs.constants import RAYLLM_ROUTER_HTTP_TIMEOUT
+from ray.llm._internal.serve.configs.constants import (
+    RAYLLM_ROUTER_HTTP_TIMEOUT,
+    ROUTER_TO_MODEL_REPLICA_RATIO,
+    RAYLLM_ROUTER_MIN_REPLICAS,
+    RAYLLM_ROUTER_INITIAL_REPLICAS,
+    RAYLLM_ROUTER_MAX_REPLICAS,
+    RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS,
+)
 from ray.llm._internal.serve.observability.logging import get_logger
 from ray.llm._internal.serve.observability.metrics.fast_api_metrics import (
     add_http_metrics_middleware,
@@ -52,6 +59,7 @@
     LLMConfig,
     ModelData,
     Model,
+    AutoscalingConfig,
 )
 from ray.llm._internal.serve.deployments.routers.middleware import (
     SetRequestIdMiddleware,
@@ -397,30 +405,53 @@ async def chat(self, body: ChatCompletionRequest) -> Response:
         return JSONResponse(content=result.model_dump())
 
     @classmethod
-    def as_deployment(cls) -> serve.Deployment:
+    def as_deployment(
+        cls, llm_configs: Optional[List[LLMConfig]] = None
+    ) -> serve.Deployment:
         """Converts this class to a Ray Serve deployment with ingress.
 
         Returns:
             A Ray Serve deployment.
         """
+        min_replicas = RAYLLM_ROUTER_MIN_REPLICAS
+        initial_replicas = RAYLLM_ROUTER_INITIAL_REPLICAS
+        max_replicas = RAYLLM_ROUTER_MAX_REPLICAS
+
+        # Note (genesu): Based on our internal benchmark, we are currently bottleneck
+        # by the router replicas during high concurrency situation. We are setting the
+        # router replicas to be ~2x the total model replicas and making it scale faster.
+        if llm_configs:
+            model_min_replicas = 0
+            model_initial_replicas = 0
+            model_max_replicas = 0
+            for llm_config in llm_configs:
+                if "autoscaling_config" in llm_config.deployment_config:
+                    autoscaling_config = llm_config.deployment_config[
+                        "autoscaling_config"
+                    ]
+                    if isinstance(autoscaling_config, dict):
+                        autoscaling_config = AutoscalingConfig(
+                            **llm_config.deployment_config["autoscaling_config"]
+                        )
+                else:
+                    # When autoscaling config is not provided, we use the default.
+                    autoscaling_config = AutoscalingConfig()
+                model_min_replicas += autoscaling_config.min_replicas
+                model_initial_replicas += autoscaling_config.initial_replicas
+                model_max_replicas += autoscaling_config.max_replicas
+            min_replicas = int(model_min_replicas * ROUTER_TO_MODEL_REPLICA_RATIO)
+            initial_replicas = int(
+                model_initial_replicas * ROUTER_TO_MODEL_REPLICA_RATIO
+            )
+            max_replicas = int(model_max_replicas * ROUTER_TO_MODEL_REPLICA_RATIO)
 
         ingress_cls = serve.ingress(fastapi_router_app)(cls)
         deployment_decorator = serve.deployment(
-            # TODO (Kourosh): make this configurable
             autoscaling_config={
-                "min_replicas": int(os.environ.get("RAYLLM_ROUTER_MIN_REPLICAS", 0)),
-                "initial_replicas": int(
-                    os.environ.get("RAYLLM_ROUTER_INITIAL_REPLICAS", 2)
-                ),
-                "max_replicas": int(os.environ.get("RAYLLM_ROUTER_MAX_REPLICAS", 16)),
-                "target_ongoing_requests": int(
-                    os.environ.get(
-                        "RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS",
-                        os.environ.get(
-                            "RAYLLM_ROUTER_TARGET_NUM_ONGOING_REQUESTS_PER_REPLICA", 200
-                        ),
-                    )
-                ),
+                "min_replicas": min_replicas,
+                "initial_replicas": initial_replicas,
+                "max_replicas": max_replicas,
+                "target_ongoing_requests": RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS,
             },
             ray_actor_options=json.loads(
                 os.environ.get("RAYLLM_ROUTER_RAY_ACTOR_OPTIONS", "{}")

python/ray/llm/tests/serve/builders/test_application_builders.py (+51)
@@ -3,11 +3,17 @@
 
 from ray.llm._internal.serve.configs.server_models import (
     LLMServingArgs,
+    LLMConfig,
+    AutoscalingConfig,
+    ModelLoadingConfig,
 )
 from ray.llm._internal.serve.builders.application_builders import (
     build_openai_app,
     build_vllm_deployment,
 )
+from ray.llm._internal.serve.configs.constants import (
+    RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS,
+)
 import subprocess
 import yaml
 import os
@@ -94,6 +100,51 @@ def deployments_healthy():
         p.send_signal(signal.SIGINT)  # Equivalent to ctrl-C
         p.wait()
 
+    def test_router_built_with_autoscaling_configs(self):
+        """Test that the router is built with the correct autoscaling configs that
+        will scale.
+        """
+        llm_config_no_autoscaling_configured = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_id_1"),
+            accelerator_type="L4",
+        )
+        llm_config_autoscaling_default = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_id_2"),
+            accelerator_type="L4",
+            deployment_config={"autoscaling_config": AutoscalingConfig()},
+        )
+        llm_config_autoscaling_non_default = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_id_3"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": AutoscalingConfig(
+                    min_replicas=2,
+                    initial_replicas=3,
+                    max_replicas=4,
+                )
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[
+                    llm_config_no_autoscaling_configured,
+                    llm_config_autoscaling_default,
+                    llm_config_autoscaling_non_default,
+                ]
+            )
+        )
+        router_autoscaling_config = (
+            app._bound_deployment._deployment_config.autoscaling_config
+        )
+        assert router_autoscaling_config.min_replicas == 8  # (1 + 1 + 2) * 2
+        assert router_autoscaling_config.initial_replicas == 10  # (1 + 1 + 3) * 2
+        assert router_autoscaling_config.max_replicas == 408  # (100 + 100 + 4) * 2
+        assert (
+            router_autoscaling_config.target_ongoing_requests
+            == RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS
+        )
+
 
 class TestBuildVllmDeployment:
     def test_build_vllm_deployment(
