
    i                         d dl mZmZmZ d dlmZ ddlmZ ddlm	Z	  e	j                  e      ZddiZ G d d	e      Zd	gZy
)    )	Tokenizerdecodersnormalizers)BPE   )TokenizersBackend)loggingtokenizer_fileztokenizer.jsonc                        e Zd ZdZeZdZddgZeZ		 	 	 	 	 	 	 dde
ee
ef   z  dz  de
ee
   z  dz  de
d	e
d
e
de
de
f fdZdefdZ xZS )GemmaTokenizeru  
    Construct a fast Gemma tokenizer (backed by HuggingFace's tokenizers library).

    This tokenizer uses a BPE model with byte fallback, no prefix space, and a normalizer that replaces
    spaces with "▁".

    Args:
        tokenizer_file (`str`, optional):
            A tokenizers JSON file containing the serialization of a tokenizer.
        unk_token (`str`, optional, defaults to "<unk>"):
            The unknown token.
        bos_token (`str`, optional, defaults to "<bos>"):
            The beginning of sequence token.
        eos_token (`str`, optional, defaults to "<eos>"):
            The end of sequence token.
        pad_token (`str`, optional, defaults to "<pad>"):
            The padding token.
        mask_token (`str`, optional, defaults to "<mask>"):
            The mask token.
        add_bos_token (`bool`, optional, defaults to True):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, optional, defaults to False):
            Whether or not to add an `eos_token` at the end of sequences.
        vocab (`str` or `dict[str, int]`, optional):
            Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
    left	input_idsattention_maskNvocabmerges	unk_token	bos_token	eos_token	pad_token
mask_tokenc                 J   |9t        |      dt        |      dt        |      dt        |      dt        |      di}|| _        |xs g | _        t        t	        | j                  | j                  dt        |      d d            | _        t        j                  t        j                  dd	      t        j                         t        j                         g      | j
                  _        t        j                  d	d      | j
                  _        t        	| <  d|||||d
| y )Nr         r      T)r   r   fuse_unkr   dropoutbyte_fallbacku   ▁ )r   r   r   r   r    )str_vocab_mergesr   r   
_tokenizerr   SequenceReplaceByteFallbackFusedecoderr   
normalizersuper__init__)
selfr   r   r   r   r   r   r   kwargs	__class__s
            v/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/gemma/tokenization_gemma.pyr+   zGemmaTokenizer.__init__;   s	    =IIIIJE |#kk||i."	
 #+"3"3eS)8+@+@+BHMMOT#
 &1%8%8e%D" 	
!	
 	
    returnc                      y)Nr   r   )r,   s    r/   _unk_idzGemmaTokenizer._unk_idi   s    r0   )NNz<unk>z<bos>z<eos>z<pad>z<mask>)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namespadding_sidemodel_input_namesr   modelr    dictintlistr+   r3   __classcell__)r.   s   @r/   r   r      s    6 *L$&67E .2)-    ",
T#s(^#d*,
 d3i$&,
 	,

 ,
 ,
 ,
 ,
\ r0   r   N)
tokenizersr   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr   utilsr	   
get_loggerr4   loggerr8   r   __all__r   r0   r/   <module>rH      sP    8 7 ! >  
		H	%%'78 Q& Qh 
r0   