@@ -22,7 +22,14 @@
 from ray.serve.handle import DeploymentHandle
 from starlette.responses import JSONResponse, Response, StreamingResponse
 
-from ray.llm._internal.serve.configs.constants import RAYLLM_ROUTER_HTTP_TIMEOUT
+from ray.llm._internal.serve.configs.constants import (
+    RAYLLM_ROUTER_HTTP_TIMEOUT,
+    ROUTER_TO_MODEL_REPLICA_RATIO,
+    RAYLLM_ROUTER_MIN_REPLICAS,
+    RAYLLM_ROUTER_INITIAL_REPLICAS,
+    RAYLLM_ROUTER_MAX_REPLICAS,
+    RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS,
+)
 from ray.llm._internal.serve.observability.logging import get_logger
 from ray.llm._internal.serve.observability.metrics.fast_api_metrics import (
     add_http_metrics_middleware,
@@ -52,6 +59,7 @@
     LLMConfig,
     ModelData,
     Model,
+    AutoscalingConfig,
 )
 from ray.llm._internal.serve.deployments.routers.middleware import (
     SetRequestIdMiddleware,
@@ -397,30 +405,53 @@ async def chat(self, body: ChatCompletionRequest) -> Response:
         return JSONResponse(content=result.model_dump())
 
     @classmethod
-    def as_deployment(cls) -> serve.Deployment:
+    def as_deployment(
+        cls, llm_configs: Optional[List[LLMConfig]] = None
+    ) -> serve.Deployment:
         """Converts this class to a Ray Serve deployment with ingress.
 
         Returns:
             A Ray Serve deployment.
         """
+        min_replicas = RAYLLM_ROUTER_MIN_REPLICAS
+        initial_replicas = RAYLLM_ROUTER_INITIAL_REPLICAS
+        max_replicas = RAYLLM_ROUTER_MAX_REPLICAS
+
+        # Note (genesu): Based on our internal benchmarks, the router replicas are
+        # currently the bottleneck under high concurrency. We set the router replicas
+        # to ~2x the total model replicas and make them scale faster.
+        if llm_configs:
+            model_min_replicas = 0
+            model_initial_replicas = 0
+            model_max_replicas = 0
+            for llm_config in llm_configs:
+                if "autoscaling_config" in llm_config.deployment_config:
+                    autoscaling_config = llm_config.deployment_config[
+                        "autoscaling_config"
+                    ]
+                    if isinstance(autoscaling_config, dict):
+                        autoscaling_config = AutoscalingConfig(
+                            **llm_config.deployment_config["autoscaling_config"]
+                        )
+                else:
+                    # When autoscaling config is not provided, we use the default.
+                    autoscaling_config = AutoscalingConfig()
+                model_min_replicas += autoscaling_config.min_replicas
+                model_initial_replicas += autoscaling_config.initial_replicas
+                model_max_replicas += autoscaling_config.max_replicas
+            min_replicas = int(model_min_replicas * ROUTER_TO_MODEL_REPLICA_RATIO)
+            initial_replicas = int(
+                model_initial_replicas * ROUTER_TO_MODEL_REPLICA_RATIO
+            )
+            max_replicas = int(model_max_replicas * ROUTER_TO_MODEL_REPLICA_RATIO)
 
         ingress_cls = serve.ingress(fastapi_router_app)(cls)
         deployment_decorator = serve.deployment(
-            # TODO (Kourosh): make this configurable
             autoscaling_config={
-                "min_replicas": int(os.environ.get("RAYLLM_ROUTER_MIN_REPLICAS", 0)),
-                "initial_replicas": int(
-                    os.environ.get("RAYLLM_ROUTER_INITIAL_REPLICAS", 2)
-                ),
-                "max_replicas": int(os.environ.get("RAYLLM_ROUTER_MAX_REPLICAS", 16)),
-                "target_ongoing_requests": int(
-                    os.environ.get(
-                        "RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS",
-                        os.environ.get(
-                            "RAYLLM_ROUTER_TARGET_NUM_ONGOING_REQUESTS_PER_REPLICA", 200
-                        ),
-                    )
-                ),
+                "min_replicas": min_replicas,
+                "initial_replicas": initial_replicas,
+                "max_replicas": max_replicas,
+                "target_ongoing_requests": RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS,
             },
             ray_actor_options=json.loads(
                 os.environ.get("RAYLLM_ROUTER_RAY_ACTOR_OPTIONS", "{}")
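
For reference, below is a minimal standalone sketch of the replica math introduced above: it sums each autoscaling bound across the model deployments and multiplies by the router-to-model ratio. The ratio value of 2 (taken from the "~2x" note in the diff) and the small ModelAutoscaling stand-in are assumptions for illustration only; the real constant and AutoscalingConfig come from ray.llm._internal.serve.configs.constants and ray.serve.config, and this sketch is not the code under review.

# Hypothetical stand-ins for illustration; not the Ray implementation.
from dataclasses import dataclass
from typing import Dict, List

ROUTER_TO_MODEL_REPLICA_RATIO = 2  # assumed value, per the "~2x" note in the diff


@dataclass
class ModelAutoscaling:
    """Simplified stand-in for a model deployment's AutoscalingConfig."""
    min_replicas: int = 1
    initial_replicas: int = 1
    max_replicas: int = 1


def router_autoscaling(models: List[ModelAutoscaling]) -> Dict[str, int]:
    """Sum each bound over all model deployments, then scale by the router ratio."""
    ratio = ROUTER_TO_MODEL_REPLICA_RATIO
    return {
        "min_replicas": int(sum(m.min_replicas for m in models) * ratio),
        "initial_replicas": int(sum(m.initial_replicas for m in models) * ratio),
        "max_replicas": int(sum(m.max_replicas for m in models) * ratio),
    }


# Two models that each scale from 1 to 4 replicas, starting at 2:
print(router_autoscaling([ModelAutoscaling(1, 2, 4), ModelAutoscaling(1, 2, 4)]))
# -> {'min_replicas': 4, 'initial_replicas': 8, 'max_replicas': 16}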