xorbitsai · qinxuye · Feb 21, 2025 · Feb 21, 2025 · Feb 21, 2025 · Feb 21, 2025
diff --git a/xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py b/xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Dict, Optional, Tuple, Type, Union
 
-from ....types import ChatCompletionChunkDelta, CompletionChoice, CompletionChunk
+from ....types import ChatCompletionChunkDelta, CompletionChoice
 
 
 class ReasoningParser(ABC):
@@ -26,7 +26,7 @@ def extract_reasoning_content_streaming(
         self,
         previous_text: str,
         current_text: str,
-        delta: Union[str, CompletionChunk],
+        delta: ChatCompletionChunkDelta,
     ) -> ChatCompletionChunkDelta:
         """Extract reasoning content from model output in a streaming fashion.
 

diff --git a/xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py b/xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py
@@ -24,7 +24,7 @@ def extract_reasoning_content_streaming(
         previous_text: str,
         current_text: str,
         delta: ChatCompletionChunkDelta,
-    ) -> Optional[ChatCompletionChunkDelta]:
+    ) -> ChatCompletionChunkDelta:
         """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.
 
         Args:
@@ -122,7 +122,7 @@ def extract_reasoning_content(
         # Thus we assume the reasoning content is always at the start.
         # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
         if self.reasoning_end_tag not in model_output:
-            return model_output, None
+            return model_output, ""
         else:
             # Add a start token if it's missing to keep compatibility.
             if self.reasoning_start_tag not in model_output:
@@ -136,5 +136,5 @@ def extract_reasoning_content(
             final_output = model_output[end_index:]
 
             if len(final_output) == 0:
-                return reasoning_content, None
+                return reasoning_content, ""
             return reasoning_content, final_output
diff --git a/xinference/model/llm/tests/test_llm_family.py b/xinference/model/llm/tests/test_llm_family.py
@@ -136,7 +136,7 @@ def test_serialize_llm_family_v1():
         stop=["hello", "world"],
     )
 
-    expected = """{"version": 1, "context_length": 2048, "model_name": "TestModel", "model_lang": ["en"], "model_ability": ["embed", "generate"], "model_description": null, "model_family": null, "model_specs": [{"model_format": "ggufv2", "model_hub": "huggingface", "model_size_in_billions": 2, "quantizations": ["q4_0", "q4_1"], "quantization_parts": {"q4_2": ["a", "b"]}, "model_id": "example/TestModel", "model_revision": "123", "model_file_name_template": "TestModel.{quantization}.bin", "model_file_name_split_template": "TestModel.{quantization}.bin.{part}", "model_uri": null}, {"model_format": "pytorch", "model_hub": "huggingface", "model_size_in_billions": 3, "quantizations": ["int8", "int4", "none"], "model_id": "example/TestModel", "model_revision": "456", "model_uri": null}], "chat_template": "xyz", "stop_token_ids": [1, 2, 3], "stop": ["hello", "world"]}"""
+    expected = """{"version": 1, "context_length": 2048, "model_name": "TestModel", "model_lang": ["en"], "model_ability": ["embed", "generate"], "model_description": null, "model_family": null, "model_specs": [{"model_format": "ggufv2", "model_hub": "huggingface", "model_size_in_billions": 2, "quantizations": ["q4_0", "q4_1"], "quantization_parts": {"q4_2": ["a", "b"]}, "model_id": "example/TestModel", "model_revision": "123", "model_file_name_template": "TestModel.{quantization}.bin", "model_file_name_split_template": "TestModel.{quantization}.bin.{part}", "model_uri": null}, {"model_format": "pytorch", "model_hub": "huggingface", "model_size_in_billions": 3, "quantizations": ["int8", "int4", "none"], "model_id": "example/TestModel", "model_revision": "456", "model_uri": null}], "chat_template": "xyz", "stop_token_ids": [1, 2, 3], "stop": ["hello", "world"], "reasoning_start_tag":null, "reasoning_end_tag":null}"""
     assert json.loads(llm_family.json()) == json.loads(expected)
 
     llm_family_context_length = LLMFamilyV1(

diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py
@@ -374,19 +374,21 @@ async def _async_to_chat_completion_chunks(
         current_text = ""
         async for chunk in chunks:
             if i == 0:
-                chunk = cls._get_first_chat_completion_chunk(chunk)
+                chat_chunk = cls._get_first_chat_completion_chunk(chunk)
             elif not chunk.get("choices"):
                 # usage
-                chunk = cls._get_final_chat_completion_chunk(chunk)
+                chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
-                chunk = cls._to_chat_completion_chunk(chunk)
+                chat_chunk = cls._to_chat_completion_chunk(chunk)
             if reasoning_parser is not None:
-                choices = chunk.get("choices")
+                choices = chat_chunk.get("choices")
+                if choices is None:
+                    continue
                 for choice in choices:
                     delta = choice.get("delta")
                     if not delta:
                         continue
-                    current_text = previous_text + delta.get("content")
+                    current_text = previous_text + delta.get("content", "")
                     choice[
                         "delta"
                     ] = reasoning_parser.extract_reasoning_content_streaming(
@@ -395,7 +397,7 @@ async def _async_to_chat_completion_chunks(
                         delta=delta,
                     )
                     previous_text = current_text
-            yield chunk
+            yield chat_chunk
             i += 1
 
     @staticmethod
@@ -408,7 +410,7 @@ def _to_chat_completion(
             reasoning_content = None
 
             if reasoning_parser is not None:
-                reasoning_content, content = reasoning_parser.extract_reasoning_content(
+                reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
                     choice
                 )
 
@@ -429,7 +431,8 @@ def _to_chat_completion(
             "id": "chat" + completion["id"],
             "object": "chat.completion",
             "created": completion["created"],
-            "model": choices,
+            "model": completion["model"],
+            "choices": choices,  # type: ignore
             "usage": completion["usage"],
         }