# mlx_gguf_server/schemas.py (gitkaz/mlx_gguf_server, main branch)
from pydantic import BaseModel, model_validator, Field, ConfigDict
from typing_extensions import Self
from typing import Optional, Dict, List, Union, Literal

class CompletionParams(BaseModel):
    # Request parameters for a completion call. Every field has a default,
    # so an empty request body validates. Fields are grouped below by the
    # backend that honors them.

    # ---- shared by all backends ----
    model: str = "dummy"
    prompt: str = ""
    messages: List[Dict] = []
    max_tokens: Optional[int] = 4096
    temperature: Optional[float] = 1.0
    seed: Optional[int] = None
    stream: bool = False
    apply_chat_template: bool = False
    complete_text: bool = False
    top_p: Optional[float] = 1.0
    stop: Optional[list] = []

    # ---- mlx only ----
    logit_bias: Optional[Dict[int, float]] = None
    repetition_penalty: Optional[float] = None
    repetition_context_size: Optional[int] = 20
    use_kv_cache: bool = False
    kv_cache_threshold: Optional[int] = 5000
    tools: Optional[list] = None
    enable_thinking: Optional[bool] = None
    experimental_generate: Optional[bool] = False

    # ---- llama-cpp only ----
    top_k: int = 0
    min_p: float = 0.05
    typical_p: float = 1.0
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    repet_penalty: float = 1.1
    mirostat_mode: int = 0
    mirostat_tau: float = 5.0
    mirostat_eta: float = 0.1

    @model_validator(mode='after')
    def validate_prompt_and_messages(self) -> Self:
        """Reject requests that set both ``prompt`` and ``messages``.

        Runs after field validation. A request where both are empty is
        accepted; only the case where both are non-empty is an error.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If ``prompt`` and ``messages`` are both non-empty.
        """
        both_supplied = bool(self.prompt) and bool(self.messages)
        if both_supplied:
            raise ValueError("Only one of 'prompt' or 'messages' should be provided.")
        return self


class ModelLoadParams(BaseModel):
    # Parameters for a model-load request. Only `llm_model_name` is required.

    # ---- model / adapter identity ----
    llm_model_name: str
    # The two *_path fields are marked exclude=True, so they are left out of
    # serialized output (model_dump / JSON).
    llm_model_path: str = Field("", exclude=True)
    adapter_name: Optional[str] = None
    adapter_path: Optional[str] = Field(None, exclude=True)
    chat_format: Optional[str] = None  # only for llama-cpp

    # ---- optional per-model settings ----
    # NOTE(review): None presumably means "not specified, use the server
    # default" -- confirm against the loader that consumes these values.
    temperature: Optional[float] = None
    max_tokens: Optional[int] = None
    logit_bias: Optional[Dict[int, float]] = None
    repetition_penalty: Optional[float] = None
    repetition_context_size: Optional[int] = None
    top_p: Optional[float] = None
    use_kv_cache: Optional[bool] = None
    kv_cache_threshold: Optional[int] = None
    auto_unload: Optional[bool] = None
    priority: Optional[int] = None
    trust_remote_code: Optional[bool] = None

class ProcessCleanParams(BaseModel):
    # Parameters for a process-clean request.
    # `timeout` is required (no default).
    # NOTE(review): unit is presumably seconds -- confirm against the caller.
    timeout: int = Field(...)

class KokoroTtsParams(BaseModel):
    # Request parameters for Kokoro text-to-speech. Only `text` is required.
    text: str
    lang_code: str = "a"
    voice: str = "af_heart"
    # Declared as float so fractional playback rates (e.g. 0.8, 1.25) pass
    # validation; an int annotation rejects any value with a fractional part.
    # Whole-number inputs such as 1 are still accepted.
    speed: float = 1.0
    # Regex used to split the input text into chunks; the default matches
    # one or more consecutive newlines.
    split_pattern: str = r'\n+'


class EmbeddingsParams(BaseModel):
    """
    Parameters of Embedding API. Referred by OpenAI API.
    Refs:
        https://platform.openai.com/docs/api-reference/embeddings/create
    """
    # Required: a single string or a list of strings.
    input: Union[str, List[str]] = Field(
        description=(
            "Input text to embed, encoded as a string or array of strings. "
            "To embed multiple inputs in a single request, pass an array of strings. "
            "The input must not exceed the max input tokens for the model."
        ),
    )

    # Free-form string here; the description names the two expected values,
    # but the annotation itself does not restrict them.
    encoding_format: Optional[str] = Field(
        description="The format to return the embeddings in. Can be either float or base64.",
        default="float",
    )

    # When given, must be one of the listed sizes; None leaves it unspecified.
    dimensions: Optional[Literal[32, 64, 128, 256, 512, 768, 1024]] = Field(
        description="The number of dimensions the resulting output embeddings should have.",
        default=None,
    )

class InputTokenCountParams(BaseModel):
    """
    Parameters for OpenAI-compatible input token counting API.
    Ref: https://developers.openai.com/api/reference/resources/responses/subresources/input_tokens/methods/count
    """
    # Sample request body attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {"model": "gemma-3-270m-it-8bit", "input": "Hello, world!"},
        },
    )

    # Both fields are required; `model` must also be a non-empty string.
    model: str = Field(
        description="Model ID to use for tokenization",
        min_length=1,
    )
    input: str = Field(description="Text input to tokenize")

class InputTokenCountResponse(BaseModel):
    """
    Response for OpenAI-compatible input token counting API.
    """
    # Sample response body attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "object": "response.input_tokens",
                "input_tokens": 4
            }
        }
    )

    # Declared once (the original repeated this field before and after
    # model_config). The name shadows the `object` builtin inside the class
    # body only; it is kept because it is the serialized field name.
    object: str = "response.input_tokens"
    # Required: the token count to report.
    input_tokens: int