
    ic7                         d dl mZmZmZmZmZmZ d dlmZ ddl	m
Z
mZ ddlmZ ddlmZ  ej                   e      Zddd	Zg d
Z G d de      ZdgZy)    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )
AddedTokenBatchEncoding)TokenizersBackend)loggingzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)ace_Arabace_Latnacm_Arabacq_Arabaeb_Arabafr_Latnajp_Arabaka_Latnamh_Ethiapc_Arabarb_Arabars_Arabary_Arabarz_Arabasm_Bengast_Latnawa_Devaayr_Latnazb_Arabazj_Latnbak_Cyrlbam_Latnban_Latnbel_Cyrlbem_Latnben_Bengbho_Devabjn_Arabbjn_Latnbod_Tibtbos_Latnbug_Latnbul_Cyrlcat_Latnceb_Latnces_Latncjk_Latnckb_Arabcrh_Latncym_Latndan_Latndeu_Latndik_Latndyu_Latndzo_Tibtell_Grekeng_Latnepo_Latnest_Latneus_Latnewe_Latnfao_Latnpes_Arabfij_Latnfin_Latnfon_Latnfra_Latnfur_Latnfuv_Latngla_Latngle_Latnglg_Latngrn_Latnguj_Gujrhat_Latnhau_Latnheb_Hebrhin_Devahne_Devahrv_Latnhun_Latnhye_Armnibo_Latnilo_Latnind_Latnisl_Latnita_Latnjav_Latnjpn_Jpankab_Latnkac_Latnkam_Latnkan_Kndakas_Arabkas_Devakat_Georknc_Arabknc_Latnkaz_Cyrlkbp_Latnkea_Latnkhm_Khmrkik_Latnkin_Latnkir_Cyrlkmb_Latnkon_Latnkor_Hangkmr_Latnlao_Laoolvs_Latnlij_Latnlim_Latnlin_Latnlit_Latnlmo_Latnltg_Latnltz_Latnlua_Latnlug_Latnluo_Latnlus_Latnmag_Devamai_Devamal_Mlymmar_Devamin_Latnmkd_Cyrlplt_Latnmlt_Latnmni_Bengkhk_Cyrlmos_Latnmri_Latnzsm_Latnmya_Mymrnld_Latnnno_Latnnob_Latnnpi_Devanso_Latnnus_Latnnya_Latnoci_Latngaz_Latnory_Oryapag_Latnpan_Gurupap_Latnpol_Latnpor_Latnprs_Arabpbt_Arabquy_Latnron_Latnrun_Latnrus_Cyrlsag_Latnsan_Devasat_Bengscn_Latnshn_Mymrsin_Sinhslk_Latnslv_Latnsmo_Latnsna_Latnsnd_Arabsom_Latnsot_Latnspa_Latnals_Latnsrd_Latnsrp_Cyrlssw_Latnsun_Latnswe_Latnswh_Latnszl_Latntam_Tamltat_Cyrltel_Telutgk_Cyrltgl_Latntha_Thaitir_Ethitaq_Latntaq_Tfngtpi_Latntsn_Latntso_Latntuk_Latntum_Latntur_Latntwi_Latntzm_Tfnguig_Arabukr_Cyrlumb_Latnurd_Arabuzn_Latnvec_Latnvie_Latnwar_Latnwol_Latnxho_Latnydd_Hebryor_Latnyue_Hantzho_Hanszho_Hantzul_Latnc                       e Zd ZU dZeZddgZeZg Z	e
e   ed<   g Ze
e   ed<   	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeeeef   z  dz  dee
e   z  dz  f fd	Zed
efd       Zej&                  ded
dfd       Zdededz  dedz  fdZ	 	 	 	 	 	 	 	 dde
e   dede
e   dz  dededz  dedz  dededz  ded
efdZd Zd Zd dZded
dfdZ xZS )!NllbTokenizera	  
    Construct an NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import NllbTokenizer

    >>> tokenizer = NllbTokenizer.from_pretrained(
    ...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
    ```

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values.
        src_lang (`str`, *optional*):
            The language to use as source language for translation.
        tgt_lang (`str`, *optional*):
            The language to use as target language for translation.
        legacy_behaviour (`bool`, *optional*, defaults to `False`):
            Whether to use legacy behaviour (suffix pattern) or new behaviour (prefix pattern).
    	input_idsattention_maskprefix_tokenssuffix_tokensNvocabmergesc                 V   ||}n|t         }t        |	t              rt        |	ddd      n|	}	|| _        |.t        |      dt        |      dt        |      dt        |      di}|| _        |xs g | _        t        t        | j
                  | j                  d t        |      dd            | _	        t        j                  t        j                  t        d	      d
      t        j                         t        j                  t        d      d
      g      | j                  _        t!        j"                  ddd      | j                  _        t'        j"                  ddd      | j                  _        t+        | X  d|||||||
||	||d| d| _        ddddd| _        | j0                  j3                         D ci c]  \  }}||
 c}}| _        |
|
nd| _        | j9                  | j6                        | _        || _        | j?                  | j6                         y c c}}w )NT)
normalizedlstripspecialr         r
   F)r   r   dropout	unk_tokenfuse_unkbyte_fallbackz[\n\r\t] z {2,}u   ▁always)replacementprepend_schemesplit)	bos_token	eos_token	sep_token	cls_tokenr   	pad_tokensrc_langtgt_lang
mask_tokenextra_special_tokenslegacy_behaviour)<s><pad></s><unk>r?    ) FAIRSEQ_LANGUAGE_CODES
isinstancestrr   r   _vocab_mergesr   r	   
_tokenizerr   SequenceReplacer   NFKC
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__fairseq_offsetfairseq_tokens_to_idsitemsfairseq_ids_to_tokens	_src_langconvert_tokens_to_idscur_lang_coder   set_src_lang_special_tokens)selfr   r   r   r   r   r   r   r   r   r   r   additional_special_tokensr   r   kwargskv	__class__s                     t/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/nllb/tokenization_nllb.pyr  zNllbTokenizer.__init__Y   s   (  +(<%&.(>% *c* zd4N 	
 !1=IIII	E |#kk||i.#	
 &1%9%9##E+$6<  "##E(OS9&
" )7(@(@Ucksw(x%"*"4"4W_gk"l 	
!!:-	
 	
   	&
" 8<7Q7Q7W7W7Y%Ztq!ad%Z"%-%9z!77G ((8 &[s   H%returnc                     | j                   S N)r  r  s    r  r   zNllbTokenizer.src_lang   s    ~~    new_src_langc                 H    || _         | j                  | j                          y r!  )r  r  )r  r$  s     r  r   zNllbTokenizer.src_lang   s    %((8r#  return_tensorsr   r   c                 v    ||t        d      || _         | |fd|d|}| j                  |      }||d<   |S )zIUsed by translation pipeline, to prepare inputs for the generate functionzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensr&  forced_bos_token_id)
ValueErrorr   r  )r  
raw_inputsr&  r   r   extra_kwargsinputstgt_lang_ids           r  _build_translation_inputsz'NllbTokenizer._build_translation_inputs   sY     x/`aa jiT.i\hi00:(3$%r#  	src_texts	tgt_texts
max_lengthmax_target_lengthpadding
truncationc
           	          || _         || _        || j                  } | |fd||||	d|
}||S ||}| j                           | |fd||||	d|
}|d   |d<   | j	                          |S )NT)r(  r&  r2  r4  r5  )r(  r&  r4  r2  r5  r   labels)r   r   model_max_length_switch_to_target_mode_switch_to_input_mode)r  r0  r   r1  r   r2  r3  r4  r&  r5  r  model_inputsr7  s                r  prepare_seq2seq_batchz#NllbTokenizer.prepare_seq2seq_batch   s     ! ..J
#)!!
 
  $ * 	##%
#)(!
 
 "(!4X 	""$r#  c                 8    | j                  | j                        S r!  )r  r   r"  s    r  r:  z#NllbTokenizer._switch_to_input_mode  s    //>>r#  c                 r    | j                   | j                  | _         | j                  | j                         S r!  )r   r  set_tgt_lang_special_tokensr"  s    r  r9  z$NllbTokenizer._switch_to_target_mode  s,    ==  NNDM//>>r#  c                    | j                  |      | _        | j                  r%g | _        | j                  | j                  g| _        n$| j                  g| _        | j                  g| _        | j                  | j                        }| j                  | j
                        }t        j                  |dgz   |z   |ddgz   |z   t        t        ||z   | j                  | j
                  z                     | j                  _        y)zReset the special tokens to the source lang setting.
        - In legacy mode: No prefix and suffix=[eos, src_lang_code].
        - In default mode: Prefix=[src_lang_code], suffix = [eos]
        $A$Bsinglepairspecial_tokensNr  r  r   r   eos_token_idr   convert_ids_to_tokensr   TemplateProcessinglistzipr  post_processor)r  r   prefix_tokens_strsuffix_tokens_strs       r  r  z)NllbTokenizer.set_src_lang_special_tokens
  s    
 "77A  !#D"&"3"3T5G5G!HD"&"4"4!5D"&"3"3!4D 66t7I7IJ 66t7I7IJ)3)F)F$v-0AA"dD\14EE$58I$I4K]K]`d`r`rKr st*
&r#  langc                    | j                  |      | _        | j                  r%g | _        | j                  | j                  g| _        n$| j                  g| _        | j                  g| _        | j                  | j                        }| j                  | j
                        }t        j                  |dgz   |z   |ddgz   |z   t        t        ||z   | j                  | j
                  z                     | j                  _        y)zReset the special tokens to the target lang setting.
        - In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
        - In default mode: Prefix=[tgt_lang_code], suffix = [eos]
        rA  rB  rC  NrG  )r  rP  rN  rO  s       r  r?  z)NllbTokenizer.set_tgt_lang_special_tokens!  s    
 "77=  !#D"&"3"3T5G5G!HD"&"4"4!5D"&"3"3!4D 66t7I7IJ 66t7I7IJ)3)F)F$v-0AA"dD\14EE$58I$I4K]K]`d`r`rKr st*
&r#  )NNr   r   r   r   r   r   z<mask>NNNNF)r?   NrI   NNlongestNT)r  N)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   modelr   rK  int__annotations__r   r  dictr  propertyr   setterr/  boolr   r<  r:  r9  r  r?  __classcell__)r  s   @r  r   r   !   s   .` *$&67E!M49!!M49! .2)-"&!\9T#s(^#d*\9 d3i$&\9| #   __9S 9T 9 9
*-
9<t
ORUYz
 #&*"!%(, %)494 4 9t#	4
 4 $J4 :4 4 d
4 4 
4l??

.
 
 
r#  r   N)
tokenizersr   r   r   r   r   r   tokenizers.modelsr	   tokenization_pythonr   r   tokenization_utils_tokenizersr   utilsr   
get_loggerrS  loggerrW  r  r   __all__r   r#  r  <module>rj     s_     [ Z ! < >  
		H	% $=P`a  R& T
% T
n 
r#  