@@ -87,7 +87,7 @@ Deployment through ``LLMRouter``
 )

 # Deploy the application
-deployment = VLLMService.as_deployment().bind(llm_config)
+deployment = VLLMService.as_deployment(llm_config.get_serve_options(name_prefix="VLLM:")).bind(llm_config)

 llm_app = LLMRouter.as_deployment().bind([deployment])
 serve.run(llm_app)
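For context, the line this hunk edits sits inside a larger deployment script. A minimal sketch of that surrounding code, assuming the ``ray.serve.llm`` import path and with all ``LLMConfig`` field values illustrative rather than taken from this diff:

    from ray import serve
    from ray.serve.llm import LLMConfig, VLLMService, LLMRouter  # import path assumed

    # Illustrative config; model_id / model_source values are placeholders
    llm_config = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-0.5b",
            model_source="Qwen/Qwen2.5-0.5B-Instruct",
        ),
        accelerator_type="A10G",
    )

    # get_serve_options(name_prefix="VLLM:") builds per-model Serve deployment
    # options, so each deployment gets a distinct name like "VLLM:qwen-0.5b"
    deployment = VLLMService.as_deployment(
        llm_config.get_serve_options(name_prefix="VLLM:")
    ).bind(llm_config)
    llm_app = LLMRouter.as_deployment().bind([deployment])
    serve.run(llm_app)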
@@ -121,7 +121,7 @@ You can query the deployed models using either cURL or the OpenAI Python client:
 # Basic chat completion with streaming
 response = client.chat.completions.create(
     model="qwen-0.5b",
-    messages=[{"role": "user", "content": "Hello!"}]
+    messages=[{"role": "user", "content": "Hello!"}],
     stream=True
 )
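The hunk omits the client setup that precedes it. A minimal sketch, assuming the default local Serve endpoint (the OpenAI client only needs the base URL plus any placeholder key):

    from openai import OpenAI

    # Serve's OpenAI-compatible endpoint; port assumed from a default serve.run()
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

    response = client.chat.completions.create(
        model="qwen-0.5b",
        messages=[{"role": "user", "content": "Hello!"}],
        stream=True,
    )

    # With stream=True, iterate over the chunks and print each text delta
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)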
@@ -165,8 +165,8 @@ For deploying multiple models, you can pass a list of ``LLMConfig`` objects to t
 )

 # Deploy the application
-deployment1 = VLLMService.as_deployment().bind(llm_config1)
-deployment2 = VLLMService.as_deployment().bind(llm_config2)
+deployment1 = VLLMService.as_deployment(llm_config1.get_serve_options(name_prefix="VLLM:")).bind(llm_config1)
+deployment2 = VLLMService.as_deployment(llm_config2.get_serve_options(name_prefix="VLLM:")).bind(llm_config2)

 llm_app = LLMRouter.as_deployment().bind([deployment1, deployment2])
 serve.run(llm_app)
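The two configs bound here are defined earlier in the doc; a sketch of their rough shape, with every field value illustrative:

    # One LLMConfig per model; the router dispatches on each request's "model"
    # field, matching it against the configured model_id values
    llm_config1 = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-0.5b",
            model_source="Qwen/Qwen2.5-0.5B-Instruct",
        ),
        accelerator_type="A10G",
    )
    llm_config2 = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-1.5b",
            model_source="Qwen/Qwen2.5-1.5B-Instruct",
        ),
        accelerator_type="A10G",
    )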
@@ -318,7 +318,8 @@ This allows the weights to be loaded on each replica on-the-fly and be cached vi
 # Make a request to the desired lora checkpoint
 response = client.chat.completions.create(
     model="qwen-0.5b:lora_model_1_ckpt",
-    messages=[{"role": "user", "content": "Hello!"}]
+    messages=[{"role": "user", "content": "Hello!"}],
+    stream=True,
 )

 for chunk in response:
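The ``base_model:adapter`` model id targets a LoRA checkpoint that is loaded on demand. A sketch of the LoRA-enabled config such a request presupposes; the ``lora_config`` field names follow ``LLMConfig`` but the storage path is a placeholder:

    llm_config = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-0.5b",
            model_source="Qwen/Qwen2.5-0.5B-Instruct",
        ),
        lora_config=dict(
            # Adapters are resolved as <dynamic_lora_loading_path>/<adapter_id>,
            # e.g. .../lora_model_1_ckpt for the request above (path assumed)
            dynamic_lora_loading_path="s3://my-bucket/lora-checkpoints",
        ),
    )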
@@ -430,7 +431,11 @@ For multimodal models that can process both text and images:
             max_replicas=2,
         )
     ),
-    accelerator_type="A10G",
+    accelerator_type="L40S",
+    engine_kwargs=dict(
+        tensor_parallel_size=1,
+        max_model_len=8192,
+    ),
 )

 # Build and deploy the model
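Assuming ``engine_kwargs`` is passed through to the underlying vLLM engine, as the name suggests, both keys map onto vLLM engine arguments of the same names: ``tensor_parallel_size=1`` keeps the model on a single GPU, and ``max_model_len=8192`` caps the context window, which also bounds KV-cache memory per replica.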
@@ -466,7 +471,8 @@ For multimodal models that can process both text and images:
             }
         ]
     }
-    ]
+    ],
+    stream=True,
 )

 for chunk in response:
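This hunk shows only the tail of the multimodal request; the full call has roughly the shape below, following the standard OpenAI image-content format, with the model id and image URL as placeholders:

    response = client.chat.completions.create(
        model="qwen-vl",  # hypothetical multimodal model id
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://example.com/image.jpg"},
                    },
                ],
            }
        ],
        stream=True,
    )

    # Consume the stream chunk by chunk, as in the text-only example
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)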