|
42 | 42 | },
|
43 | 43 | {
|
44 | 44 | "cell_type": "code",
|
45 |
| - "execution_count": 2, |
| 45 | + "execution_count": 1, |
46 | 46 | "metadata": {},
|
47 | 47 | "outputs": [
|
48 | 48 | {
|
|
121 | 121 | },
|
122 | 122 | {
|
123 | 123 | "cell_type": "code",
|
124 |
| - "execution_count": 4, |
| 124 | + "execution_count": 2, |
125 | 125 | "metadata": {},
|
126 | 126 | "outputs": [
|
127 | 127 | {
|
|
138 | 138 | "{\n",
|
139 | 139 | " \"debug\": false,\n",
|
140 | 140 | " \"parallel_workers\": 0,\n",
|
141 |
| - " \"gzip_enabled\": false,\n", |
142 |
| - " \"metrics_endpoint\": null\n", |
| 141 | + " \"gzip_enabled\": false\n", |
143 | 142 | "}\n"
|
144 | 143 | ]
|
145 | 144 | },
|
|
150 | 149 | "Note that currently there are three main limitations of the streaming support in MLServer:\n",
|
151 | 150 | "\n",
|
152 | 151 | "- distributed workers are not supported (i.e., the `parallel_workers` setting should be set to `0`)\n",
|
153 |
| - "- `gzip` middleware is not supported for REST (i.e., `gzip_enabled` setting should be set to `false`)\n", |
154 |
| - "- metrics endpoint is not available (i.e. `metrics_endpoint` is also disabled for streaming for gRPC)" |
| 152 | + "- `gzip` middleware is not supported for REST (i.e., `gzip_enabled` setting should be set to `false`)" |
155 | 153 | ]
|
156 | 154 | },
|
157 | 155 | {
|
|
163 | 161 | },
|
164 | 162 | {
|
165 | 163 | "cell_type": "code",
|
166 |
| - "execution_count": 5, |
| 164 | + "execution_count": 3, |
167 | 165 | "metadata": {},
|
168 | 166 | "outputs": [
|
169 | 167 | {
|
|
227 | 225 | },
|
228 | 226 | {
|
229 | 227 | "cell_type": "code",
|
230 |
| - "execution_count": 6, |
| 228 | + "execution_count": 4, |
231 | 229 | "metadata": {},
|
232 | 230 | "outputs": [
|
233 | 231 | {
|
234 | 232 | "name": "stdout",
|
235 | 233 | "output_type": "stream",
|
236 | 234 | "text": [
|
237 |
| - "Writing generate-request.json\n" |
| 235 | + "Overwriting generate-request.json\n" |
238 | 236 | ]
|
239 | 237 | }
|
240 | 238 | ],
|
|
272 | 270 | },
|
273 | 271 | {
|
274 | 272 | "cell_type": "code",
|
275 |
| - "execution_count": null, |
| 273 | + "execution_count": 5, |
276 | 274 | "metadata": {},
|
277 |
| - "outputs": [], |
| 275 | + "outputs": [ |
| 276 | + { |
| 277 | + "name": "stdout", |
| 278 | + "output_type": "stream", |
| 279 | + "text": [ |
| 280 | + "['What']\n", |
| 281 | + "[' is']\n", |
| 282 | + "[' the']\n", |
| 283 | + "[' capital']\n", |
| 284 | + "[' of']\n", |
| 285 | + "[' France?']\n" |
| 286 | + ] |
| 287 | + } |
| 288 | + ], |
278 | 289 | "source": [
|
279 | 290 | "import httpx\n",
|
280 | 291 | "from httpx_sse import connect_sse\n",
|
|
301 | 312 | },
|
302 | 313 | {
|
303 | 314 | "cell_type": "code",
|
304 |
| - "execution_count": null, |
| 315 | + "execution_count": 6, |
305 | 316 | "metadata": {},
|
306 |
| - "outputs": [], |
| 317 | + "outputs": [ |
| 318 | + { |
| 319 | + "name": "stdout", |
| 320 | + "output_type": "stream", |
| 321 | + "text": [ |
| 322 | + "['What']\n", |
| 323 | + "[' is']\n", |
| 324 | + "[' the']\n", |
| 325 | + "[' capital']\n", |
| 326 | + "[' of']\n", |
| 327 | + "[' France?']\n" |
| 328 | + ] |
| 329 | + } |
| 330 | + ], |
307 | 331 | "source": [
|
308 | 332 | "import grpc\n",
|
309 | 333 | "import mlserver.types as types\n",
|
|
315 | 339 | "inference_request = types.InferenceRequest.parse_file(\"./generate-request.json\")\n",
|
316 | 340 | "\n",
|
317 | 341 | "# need to convert from string to bytes for grpc\n",
|
318 |
| - "inference_request.inputs[0] = StringCodec.encode_input(\"prompt\", inference_request.inputs[0].data.__root__)\n", |
| 342 | + "inference_request.inputs[0] = StringCodec.encode_input(\"prompt\", inference_request.inputs[0].data.root)\n", |
319 | 343 | "inference_request_g = converters.ModelInferRequestConverter.from_types(\n",
|
320 | 344 | " inference_request, model_name=\"text-model\", model_version=None\n",
|
321 | 345 | ")\n",
|
|
338 | 362 | "source": [
|
339 | 363 | "Note that for gRPC, the request is transformed into an async generator which is then passed to the `ModelStreamInfer` method. The response is also an async generator which can be iterated over to get the response."
|
340 | 364 | ]
|
341 |
| - }, |
342 |
| - { |
343 |
| - "cell_type": "markdown", |
344 |
| - "metadata": {}, |
345 |
| - "source": [] |
346 | 365 | }
|
347 | 366 | ],
|
348 | 367 | "metadata": {
|
|
361 | 380 | "name": "python",
|
362 | 381 | "nbconvert_exporter": "python",
|
363 | 382 | "pygments_lexer": "ipython3",
|
364 |
| - "version": "3.10.14" |
| 383 | + "version": "3.10.12" |
365 | 384 | }
|
366 | 385 | },
|
367 | 386 | "nbformat": 4,
|
|
0 commit comments