Skip to content

Commit cef0553

Browse files
authored
Outlines guided generation (#1539)
This WIP PR starts to add grammar support via Outlines. Currently this PR supports very simple regex grammars and does not optimize for precompiling or caching grammar FSMs. Todo: - [X] add simple outlines guidance to `NextTokenChooser` - [X] update protos for grammar - [X] update generation params API - [X] constrain simple grammar - [ ] support parsing more complex grammars into FSMs - [ ] support all grammar types that Outlines supports - [ ] explore optimizations to avoid recompiling grammars. Guided request ```bash curl -s 'http://localhost:3000/generate' \ --header 'Content-Type: application/json' \ --data-raw '{ "inputs": "make an email for david: \n", "parameters": { "max_new_tokens": 6, "grammar": "[\\w-]+@([\\w-]+\\.)+[\\w-]+" } }' | jq ``` response ```json { "generated_text": "[email protected]" } ``` Unguided request ```bash curl -s 'http://localhost:3000/generate' \ --header 'Content-Type: application/json' \ --data '{ "inputs": "make an email for david: \n", "parameters": { "max_new_tokens": 6 } }' | jq ``` response ```json { "generated_text": " email = 'david" } ```
1 parent 4c2848b commit cef0553

File tree

31 files changed

+1660
-53
lines changed

31 files changed

+1660
-53
lines changed

benchmark/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use crate::app::App;
88
use crate::event::Event;
99
use crossterm::ExecutableCommand;
1010
use std::io;
11-
use text_generation_client::{NextTokenChooserParameters, ShardedClient};
11+
use text_generation_client::{GrammarType, NextTokenChooserParameters, ShardedClient};
1212
use tokenizers::Tokenizer;
1313
use tokio::sync::{broadcast, mpsc};
1414
use tui::backend::CrosstermBackend;
@@ -45,6 +45,8 @@ pub async fn run(
4545
repetition_penalty: repetition_penalty.unwrap_or(1.0),
4646
frequency_penalty: frequency_penalty.unwrap_or(0.0),
4747
watermark,
48+
grammar: String::new(),
49+
grammar_type: GrammarType::None as i32,
4850
};
4951

5052
// Initialize terminal properties

clients/python/text_generation/client.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
Response,
1111
Request,
1212
Parameters,
13+
Grammar,
1314
)
1415
from text_generation.errors import parse_error
1516

@@ -76,6 +77,7 @@ def generate(
7677
watermark: bool = False,
7778
decoder_input_details: bool = False,
7879
top_n_tokens: Optional[int] = None,
80+
grammar: Optional[Grammar] = None,
7981
) -> Response:
8082
"""
8183
Given a prompt, generate the following text
@@ -138,6 +140,7 @@ def generate(
138140
watermark=watermark,
139141
decoder_input_details=decoder_input_details,
140142
top_n_tokens=top_n_tokens,
143+
grammar=grammar,
141144
)
142145
request = Request(inputs=prompt, stream=False, parameters=parameters)
143146

@@ -169,6 +172,7 @@ def generate_stream(
169172
typical_p: Optional[float] = None,
170173
watermark: bool = False,
171174
top_n_tokens: Optional[int] = None,
175+
grammar: Optional[Grammar] = None,
172176
) -> Iterator[StreamResponse]:
173177
"""
174178
Given a prompt, generate the following stream of tokens
@@ -227,6 +231,7 @@ def generate_stream(
227231
typical_p=typical_p,
228232
watermark=watermark,
229233
top_n_tokens=top_n_tokens,
234+
grammar=grammar,
230235
)
231236
request = Request(inputs=prompt, stream=True, parameters=parameters)
232237

@@ -326,6 +331,7 @@ async def generate(
326331
watermark: bool = False,
327332
decoder_input_details: bool = False,
328333
top_n_tokens: Optional[int] = None,
334+
grammar: Optional[Grammar] = None,
329335
) -> Response:
330336
"""
331337
Given a prompt, generate the following text asynchronously
@@ -370,6 +376,7 @@ async def generate(
370376
Returns:
371377
Response: generated response
372378
"""
379+
373380
# Validate parameters
374381
parameters = Parameters(
375382
best_of=best_of,
@@ -388,6 +395,7 @@ async def generate(
388395
typical_p=typical_p,
389396
watermark=watermark,
390397
top_n_tokens=top_n_tokens,
398+
grammar=grammar,
391399
)
392400
request = Request(inputs=prompt, stream=False, parameters=parameters)
393401

@@ -417,6 +425,7 @@ async def generate_stream(
417425
typical_p: Optional[float] = None,
418426
watermark: bool = False,
419427
top_n_tokens: Optional[int] = None,
428+
grammar: Optional[Grammar] = None,
420429
) -> AsyncIterator[StreamResponse]:
421430
"""
422431
Given a prompt, generate the following stream of tokens asynchronously
@@ -475,6 +484,7 @@ async def generate_stream(
475484
typical_p=typical_p,
476485
watermark=watermark,
477486
top_n_tokens=top_n_tokens,
487+
grammar=grammar,
478488
)
479489
request = Request(inputs=prompt, stream=True, parameters=parameters)
480490

clients/python/text_generation/types.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,24 @@
11
from enum import Enum
22
from pydantic import BaseModel, validator
3-
from typing import Optional, List
3+
from typing import Optional, List, Union
44

55
from text_generation.errors import ValidationError
66

77

8+
# enum for grammar type
9+
class GrammarType(str, Enum):
10+
Json = "json"
11+
Regex = "regex"
12+
13+
14+
# Grammar type and value
15+
class Grammar(BaseModel):
16+
# Grammar type
17+
type: GrammarType
18+
# Grammar value
19+
value: Union[str, dict]
20+
21+
822
class Parameters(BaseModel):
923
# Activate logits sampling
1024
do_sample: bool = False
@@ -41,6 +55,8 @@ class Parameters(BaseModel):
4155
decoder_input_details: bool = False
4256
# Return the N most likely tokens at each step
4357
top_n_tokens: Optional[int] = None
58+
# grammar to use for generation
59+
grammar: Optional[Grammar] = None
4460

4561
@validator("best_of")
4662
def valid_best_of(cls, field_value, values):
@@ -109,6 +125,14 @@ def valid_top_n_tokens(cls, v):
109125
raise ValidationError("`top_n_tokens` must be strictly positive")
110126
return v
111127

128+
@validator("grammar")
129+
def valid_grammar(cls, v):
130+
if v is not None:
131+
if v.type == GrammarType.Regex and not v.value:
132+
raise ValidationError("`value` cannot be empty for `regex` grammar")
133+
if v.type == GrammarType.Json and not v.value:
134+
raise ValidationError("`value` cannot be empty for `json` grammar")
135+
return v
112136

113137
class Request(BaseModel):
114138
# Prompt
@@ -157,7 +181,7 @@ class Token(BaseModel):
157181
# Token text
158182
text: str
159183
# Logprob
160-
logprob: float
184+
logprob: Optional[float] = None
161185
# Is the token a special token
162186
# Can be used to ignore tokens when concatenating
163187
special: bool

docs/source/basic_tutorials/launcher.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,14 @@ Options:
378378
379379
[env: TOKENIZER_CONFIG_PATH=]
380380
381+
```
382+
## DISABLE_GRAMMAR_SUPPORT
383+
```shell
384+
--disable-grammar-support
385+
Disable outlines grammar constrained generation. This is a feature that allows you to generate text that follows a specific grammar
386+
387+
[env: DISABLE_GRAMMAR_SUPPORT=]
388+
381389
```
382390
## ENV
383391
```shell

integration-tests/conftest.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,14 @@
1616
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
1717

1818
from text_generation import AsyncClient
19-
from text_generation.types import Response, Details, InputToken, Token, BestOfSequence
19+
from text_generation.types import (
20+
Response,
21+
Details,
22+
InputToken,
23+
Token,
24+
BestOfSequence,
25+
Grammar,
26+
)
2027

2128
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
2229
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
@@ -224,6 +231,7 @@ def local_launcher(
224231
quantize: Optional[str] = None,
225232
trust_remote_code: bool = False,
226233
use_flash_attention: bool = True,
234+
disable_grammar_support: bool = False,
227235
dtype: Optional[str] = None,
228236
):
229237
port = random.randint(8000, 10_000)
@@ -247,6 +255,8 @@ def local_launcher(
247255

248256
env = os.environ
249257

258+
if disable_grammar_support:
259+
args.append("--disable-grammar-support")
250260
if num_shard is not None:
251261
args.extend(["--num-shard", str(num_shard)])
252262
if quantize is not None:
@@ -287,12 +297,15 @@ def docker_launcher(
287297
quantize: Optional[str] = None,
288298
trust_remote_code: bool = False,
289299
use_flash_attention: bool = True,
300+
disable_grammar_support: bool = False,
290301
dtype: Optional[str] = None,
291302
):
292303
port = random.randint(8000, 10_000)
293304

294305
args = ["--model-id", model_id, "--env"]
295306

307+
if disable_grammar_support:
308+
args.append("--disable-grammar-support")
296309
if num_shard is not None:
297310
args.extend(["--num-shard", str(num_shard)])
298311
if quantize is not None:
@@ -370,11 +383,22 @@ def docker_launcher(
370383
@pytest.fixture(scope="module")
371384
def generate_load():
372385
async def generate_load_inner(
373-
client: AsyncClient, prompt: str, max_new_tokens: int, n: int
386+
client: AsyncClient,
387+
prompt: str,
388+
max_new_tokens: int,
389+
n: int,
390+
seed: Optional[int] = None,
391+
grammar: Optional[Grammar] = None,
392+
stop_sequences: Optional[List[str]] = None,
374393
) -> List[Response]:
375394
futures = [
376395
client.generate(
377-
prompt, max_new_tokens=max_new_tokens, decoder_input_details=True
396+
prompt,
397+
max_new_tokens=max_new_tokens,
398+
decoder_input_details=True,
399+
seed=seed,
400+
grammar=grammar,
401+
stop_sequences=stop_sequences,
378402
)
379403
for _ in range(n)
380404
]
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
{
2+
"details": {
3+
"best_of_sequences": null,
4+
"finish_reason": "length",
5+
"generated_tokens": 10,
6+
"prefill": [
7+
{
8+
"id": 1,
9+
"logprob": null,
10+
"text": "<s>"
11+
},
12+
{
13+
"id": 4321,
14+
"logprob": -13.90625,
15+
"text": "Test"
16+
},
17+
{
18+
"id": 2009,
19+
"logprob": -12.328125,
20+
"text": "request"
21+
}
22+
],
23+
"seed": null,
24+
"tokens": [
25+
{
26+
"id": 13,
27+
"logprob": -2.0566406,
28+
"special": false,
29+
"text": "\n"
30+
},
31+
{
32+
"id": 13,
33+
"logprob": -1.5253906,
34+
"special": false,
35+
"text": "\n"
36+
},
37+
{
38+
"id": 29902,
39+
"logprob": -2.7578125,
40+
"special": false,
41+
"text": "I"
42+
},
43+
{
44+
"id": 4966,
45+
"logprob": -1.9033203,
46+
"special": false,
47+
"text": " hope"
48+
},
49+
{
50+
"id": 445,
51+
"logprob": -0.5019531,
52+
"special": false,
53+
"text": " this"
54+
},
55+
{
56+
"id": 6911,
57+
"logprob": -0.21264648,
58+
"special": false,
59+
"text": " helps"
60+
},
61+
{
62+
"id": 29991,
63+
"logprob": -0.5991211,
64+
"special": false,
65+
"text": "!"
66+
},
67+
{
68+
"id": 2803,
69+
"logprob": -0.37475586,
70+
"special": false,
71+
"text": " Let"
72+
},
73+
{
74+
"id": 592,
75+
"logprob": -0.018463135,
76+
"special": false,
77+
"text": " me"
78+
},
79+
{
80+
"id": 1073,
81+
"logprob": -0.0008597374,
82+
"special": false,
83+
"text": " know"
84+
}
85+
],
86+
"top_tokens": null
87+
},
88+
"generated_text": "\n\nI hope this helps! Let me know"
89+
}

0 commit comments

Comments
 (0)