
    i                         d Z ddlmZmZmZmZ ddlmZ ddlm	Z	 ddl
mZ  ej                  e      Zddd	d
Z G d de	      ZdgZy)z$Tokenization classes for OpenAI GPT.    )	Tokenizerdecodersnormalizerspre_tokenizers)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                        e Zd ZdZeZddgZeZ	 	 	 d
de	e
e	ef   z  dz  de	ee	   z  dz  de	f fdZed	        Z xZS )OpenAIGPTTokenizera  
    Construct a GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a tokenizers JSON file containing the serialization of a tokenizer.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        vocab (`str` or `dict[str, int]`, *optional*):
            Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
        merges (`str` or `list[str]`, *optional*):
            Custom merges list. If not provided, an empty list is used.
    	input_idsattention_maskNvocabmerges	unk_tokenc                 .   ||nt        |      di| _        |xs g | _        t        t	        | j                  | j                  d dddt        |                  | _        t        j                  t        j                         t        j                         t        j                         g      | j
                  _        t        j                         | j
                  _        t        j                   d      | j
                  _        t%        | L  dd|i| y )	Nr    z</w>F)r   r   dropoutcontinuing_subword_prefixend_of_word_suffixfuse_unkr   )suffixr    )str_vocab_mergesr   r   
_tokenizerr   SequenceNFD	LowercaseStripAccents
normalizerr   BertPreTokenizerpre_tokenizerr   
BPEDecoderdecodersuper__init__)selfr   r   r   kwargs	__class__s        x/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/openai/tokenization_openai.pyr+   zOpenAIGPTTokenizer.__init__;   s      %0es9~q6I|#kk||*,#)i.

 &1%9%9!%%'((*&
" )7(G(G(I%"*"5"5V"D 	
	
	
    c                      y)NTr   )r,   s    r/   do_lower_casez OpenAIGPTTokenizer.do_lower_casec   s    r0   )NNz<unk>)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr   dictintlistr+   propertyr2   __classcell__)r.   s   @r/   r   r      s    4 *$&67E .2)- 	&
T#s(^#d*&
 d3i$&&
 	&
P  r0   r   N)r6   
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr	   utilsr
   
get_loggerr3   loggerr7   r   __all__r   r0   r/   <module>rG      sU    + G G ! >  
		H	%#/`pq I* IX  
 r0   