Commit 744eaa9

[serve.llm] Cherry-pick - Fix quickstart serve LLM docs (#50910) (#50953)
Cherry-pick of #50910. Signed-off-by: akshay-anyscale <122416226+akshay-anyscale@users.noreply.github.com>
1 parent: ecdcdc6 · commit: 744eaa9

File tree

1 file changed: +13 -7 lines changed

doc/source/serve/llm/overview.rst

@@ -87,7 +87,7 @@ Deployment through ``LLMRouter``
 )

 # Deploy the application
-deployment = VLLMService.as_deployment().bind(llm_config)
+deployment = VLLMService.as_deployment(llm_config.get_serve_options(name_prefix="VLLM:")).bind(llm_config)
 llm_app = LLMRouter.as_deployment().bind([deployment])
 serve.run(llm_app)

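For orientation, here is a minimal end-to-end sketch of the single-model deployment after this change. The import paths and the LLMConfig fields are assumptions based on the Ray 2.43-era ray.serve.llm API this doc describes, and the Qwen model source is illustrative; only the as_deployment(...get_serve_options(...)) pattern comes from the diff itself.

    from ray import serve
    from ray.serve.llm.configs import LLMConfig                    # assumed import path
    from ray.serve.llm.deployments import VLLMService, LLMRouter   # assumed import path

    # Assumed config shape: model_id is the name clients request;
    # model_source points at the Hugging Face repo to load.
    llm_config = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-0.5b",
            model_source="Qwen/Qwen2.5-0.5B-Instruct",
        ),
    )

    # The fix: pass per-model Serve options (including a deployment name
    # derived from name_prefix) into as_deployment() instead of calling it bare.
    deployment = VLLMService.as_deployment(
        llm_config.get_serve_options(name_prefix="VLLM:")
    ).bind(llm_config)
    llm_app = LLMRouter.as_deployment().bind([deployment])
    serve.run(llm_app)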
@@ -121,7 +121,7 @@ You can query the deployed models using either cURL or the OpenAI Python client:
 # Basic chat completion with streaming
 response = client.chat.completions.create(
     model="qwen-0.5b",
-    messages=[{"role": "user", "content": "Hello!"}]
+    messages=[{"role": "user", "content": "Hello!"}],
     stream=True
 )

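The added comma is a syntax fix: without it, the snippet fails to parse because stream=True follows on the next line. With stream=True the OpenAI client returns an iterator of chunks rather than a single response. A sketch of the full client side, where the base_url and api_key values are placeholders for a locally running Serve endpoint:

    from openai import OpenAI

    # Placeholder endpoint/key for a local Ray Serve deployment.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

    # Basic chat completion with streaming
    response = client.chat.completions.create(
        model="qwen-0.5b",
        messages=[{"role": "user", "content": "Hello!"}],
        stream=True,
    )

    # Each chunk carries an incremental delta; print tokens as they arrive.
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)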
@@ -165,8 +165,8 @@ For deploying multiple models, you can pass a list of ``LLMConfig`` objects to t
 )

 # Deploy the application
-deployment1 = VLLMService.as_deployment().bind(llm_config1)
-deployment2 = VLLMService.as_deployment().bind(llm_config2)
+deployment1 = VLLMService.as_deployment(llm_config1.get_serve_options(name_prefix="VLLM:")).bind(llm_config1)
+deployment2 = VLLMService.as_deployment(llm_config2.get_serve_options(name_prefix="VLLM:")).bind(llm_config2)
 llm_app = LLMRouter.as_deployment().bind([deployment1, deployment2])
 serve.run(llm_app)

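The per-config get_serve_options call is presumably what lets two VLLMService deployments coexist behind one router: each model gets its own deployment name. A sketch of the two configs this hunk assumes, reusing the assumed LLMConfig shape from the single-model sketch above; both model ids and sources are illustrative:

    # Hypothetical configs; the distinct model_id values are what matter here,
    # since get_serve_options(name_prefix="VLLM:") is expected to derive a
    # unique deployment name per model (assumption based on the diff).
    llm_config1 = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-0.5b",
            model_source="Qwen/Qwen2.5-0.5B-Instruct",
        ),
    )
    llm_config2 = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-1.5b",
            model_source="Qwen/Qwen2.5-1.5B-Instruct",
        ),
    )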
@@ -318,7 +318,8 @@ This allows the weights to be loaded on each replica on-the-fly and be cached vi
 # Make a request to the desired lora checkpoint
 response = client.chat.completions.create(
     model="qwen-0.5b:lora_model_1_ckpt",
-    messages=[{"role": "user", "content": "Hello!"}]
+    messages=[{"role": "user", "content": "Hello!"}],
+    stream=True,
 )

 for chunk in response:
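The model id follows the base:checkpoint convention for dynamically loaded LoRA adapters mentioned in the hunk context. For completeness, the post-fix request as a runnable sketch, with the same streaming loop as the basic example:

    # "<base_model>:<lora_checkpoint>" selects a LoRA adapter that is loaded
    # on the replica on the fly and cached for later requests.
    response = client.chat.completions.create(
        model="qwen-0.5b:lora_model_1_ckpt",
        messages=[{"role": "user", "content": "Hello!"}],
        stream=True,
    )
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)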
@@ -430,7 +431,11 @@ For multimodal models that can process both text and images:
             max_replicas=2,
         )
     ),
-    accelerator_type="A10G",
+    accelerator_type="L40S",
+    engine_kwargs=dict(
+        tensor_parallel_size=1,
+        max_model_len=8192,
+    ),
 )

 # Build and deploy the model
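Both new engine_kwargs are standard vLLM engine arguments: tensor_parallel_size sets how many GPUs each engine replica shards the model across, and max_model_len caps the context window. A sketch of where the new fields sit in the config; the multimodal model id and source are placeholders:

    llm_config = LLMConfig(
        model_loading_config=dict(
            model_id="pixtral-12b",                        # placeholder id
            model_source="mistral-community/pixtral-12b",  # placeholder repo
        ),
        accelerator_type="L40S",     # request L40S GPUs rather than A10G
        engine_kwargs=dict(
            tensor_parallel_size=1,  # vLLM: GPUs per engine replica
            max_model_len=8192,      # vLLM: maximum context length in tokens
        ),
    )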
@@ -466,7 +471,8 @@ For multimodal models that can process both text and images:
                 }
             ]
         }
-    ]
+    ],
+    stream=True,
 )

 for chunk in response:
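The brackets being closed here belong to the OpenAI multimodal message format, in which content is a list of text and image_url parts. A hedged reconstruction of the whole request with the newly added stream=True; the model id and image URL are placeholders:

    response = client.chat.completions.create(
        model="pixtral-12b",  # placeholder multimodal model id
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://example.com/image.jpg"},
                    },
                ],
            }
        ],
        stream=True,
    )
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)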
