@@ -261,6 +261,38 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):

        return output + bos_token_ids + token_ids_1

+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if not self.add_bos_token:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0))
+        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
+
     def _tokenize(self, text):
         """Tokenize a string."""
         bpe_tokens = []
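For reference, a minimal usage sketch of the new mask (not part of this PR; token ids are made up, and it assumes a GPT-2-style slow tokenizer where enabling `add_bos_token` makes `build_inputs_with_special_tokens` prepend a BOS id before each sequence):

```python
# Hypothetical sketch, not from the PR: assumes a GPT-2-style slow
# tokenizer where add_bos_token=True so a BOS id is prepended per sequence.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_bos_token = True  # assumption: enable the BOS-prepending path

# Single sequence: one leading 1 marks the prepended BOS token.
print(tokenizer.get_special_tokens_mask([10, 11, 12]))  # [1, 0, 0, 0]

# Sequence pair: a 1 before each sequence's tokens.
print(tokenizer.get_special_tokens_mask([10, 11], [20, 21, 22]))
# [1, 0, 0, 1, 0, 0, 0]
```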