@@ -87,7 +87,7 @@ Deployment through ``LLMRouter``
 )

 # Deploy the application
-deployment = VLLMService.as_deployment().bind(llm_config)
+deployment = VLLMService.as_deployment(llm_config.get_serve_options(name_prefix="VLLM:")).bind(llm_config)

 llm_app = LLMRouter.as_deployment().bind([deployment])
 serve.run(llm_app)
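For context, the line this hunk edits sits inside a larger deployment script. A minimal sketch of that surrounding code, assuming the ``ray.serve.llm`` import path and with all ``LLMConfig`` field values illustrative rather than taken from this diff:

    from ray import serve
    from ray.serve.llm import LLMConfig, VLLMService, LLMRouter  # import path assumed

    # Illustrative config; model_id / model_source values are placeholders
    llm_config = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-0.5b",
            model_source="Qwen/Qwen2.5-0.5B-Instruct",
        ),
        accelerator_type="A10G",
    )

    # get_serve_options(name_prefix="VLLM:") builds per-model Serve deployment
    # options, so each deployment gets a distinct name like "VLLM:qwen-0.5b"
    deployment = VLLMService.as_deployment(
        llm_config.get_serve_options(name_prefix="VLLM:")
    ).bind(llm_config)
    llm_app = LLMRouter.as_deployment().bind([deployment])
    serve.run(llm_app)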
@@ -121,7 +121,7 @@ You can query the deployed models using either cURL or the OpenAI Python client:
 # Basic chat completion with streaming
 response = client.chat.completions.create(
     model="qwen-0.5b",
-    messages=[{"role": "user", "content": "Hello!"}]
+    messages=[{"role": "user", "content": "Hello!"}],
     stream=True
 )
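The hunk omits the client setup that precedes it. A minimal sketch, assuming the default local Serve endpoint (the OpenAI client only needs the base URL plus any placeholder key):

    from openai import OpenAI

    # Serve's OpenAI-compatible endpoint; port assumed from a default serve.run()
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

    response = client.chat.completions.create(
        model="qwen-0.5b",
        messages=[{"role": "user", "content": "Hello!"}],
        stream=True,
    )

    # With stream=True, iterate over the chunks and print each text delta
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)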
@@ -165,8 +165,8 @@ For deploying multiple models, you can pass a list of ``LLMConfig`` objects to t
 )

 # Deploy the application
-deployment1 = VLLMService.as_deployment().bind(llm_config1)
-deployment2 = VLLMService.as_deployment().bind(llm_config2)
+deployment1 = VLLMService.as_deployment(llm_config1.get_serve_options(name_prefix="VLLM:")).bind(llm_config1)
+deployment2 = VLLMService.as_deployment(llm_config2.get_serve_options(name_prefix="VLLM:")).bind(llm_config2)

 llm_app = LLMRouter.as_deployment().bind([deployment1, deployment2])
 serve.run(llm_app)
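The two configs bound here are defined earlier in the doc; a sketch of their rough shape, with every field value illustrative:

    # One LLMConfig per model; the router dispatches on each request's "model"
    # field, matching it against the configured model_id values
    llm_config1 = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-0.5b",
            model_source="Qwen/Qwen2.5-0.5B-Instruct",
        ),
        accelerator_type="A10G",
    )
    llm_config2 = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-1.5b",
            model_source="Qwen/Qwen2.5-1.5B-Instruct",
        ),
        accelerator_type="A10G",
    )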
@@ -318,7 +318,8 @@ This allows the weights to be loaded on each replica on-the-fly and be cached vi
 # Make a request to the desired lora checkpoint
 response = client.chat.completions.create(
     model="qwen-0.5b:lora_model_1_ckpt",
-    messages=[{"role": "user", "content": "Hello!"}]
+    messages=[{"role": "user", "content": "Hello!"}],
+    stream=True,
 )

 for chunk in response:
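The ``base_model:adapter`` model id targets a LoRA checkpoint that is loaded on demand. A sketch of the LoRA-enabled config such a request presupposes; the ``lora_config`` field names follow ``LLMConfig`` but the storage path is a placeholder:

    llm_config = LLMConfig(
        model_loading_config=dict(
            model_id="qwen-0.5b",
            model_source="Qwen/Qwen2.5-0.5B-Instruct",
        ),
        lora_config=dict(
            # Adapters are resolved as <dynamic_lora_loading_path>/<adapter_id>,
            # e.g. .../lora_model_1_ckpt for the request above (path assumed)
            dynamic_lora_loading_path="s3://my-bucket/lora-checkpoints",
        ),
    )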
@@ -430,7 +431,11 @@ For multimodal models that can process both text and images:
             max_replicas=2,
         )
     ),
-    accelerator_type="A10G",
+    accelerator_type="L40S",
+    engine_kwargs=dict(
+        tensor_parallel_size=1,
+        max_model_len=8192,
+    ),
 )

 # Build and deploy the model
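Assuming ``engine_kwargs`` is passed through to the underlying vLLM engine, as the name suggests, both keys map onto vLLM engine arguments of the same names: ``tensor_parallel_size=1`` keeps the model on a single GPU, and ``max_model_len=8192`` caps the context window, which also bounds KV-cache memory per replica.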
@@ -466,7 +471,8 @@ For multimodal models that can process both text and images:
             }
         ]
     }
-    ]
+    ],
+    stream=True,
 )

 for chunk in response:
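This hunk shows only the tail of the multimodal request; the full call has roughly the shape below, following the standard OpenAI image-content format, with the model id and image URL as placeholders:

    response = client.chat.completions.create(
        model="qwen-vl",  # hypothetical multimodal model id
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://example.com/image.jpg"},
                    },
                ],
            }
        ],
        stream=True,
    )

    # Consume the stream chunk by chunk, as in the text-only example
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)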