
    ii                     B   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z  ddl!m"Z"  ejF                  e$      Z%e ed       G d de                    Z&e ed       G d de                    Z' G d dejP                        Z)dejT                  dz  dejT                  dz  dedz  fd Z+	 	 	 	 d3d!e
d"ejT                  d#ejT                  dz  d$ejT                  d%edz  d&ejT                  dz  dejT                  dz  d'ejX                  dz  d(e-dz  d)e-dz  de.fd*Z/e G d+ d,e             Z0 ed-       G d. d/e0             Z1 ed-       G d0 d1e0e             Z2g d2Z3y)4zPyTorch PaliGemmamodel.    )Callable)	dataclassN)nn   )Cache)PreTrainedConfig)GenerationMixin)create_masks_for_generate)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)PaligemmaModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__     z/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   r   -   s     59**T18r'   r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	PaliGemmaCausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r    r!   r"   r+   r#   r$   r%   r,   r-   r   r.   tupler/   r   r&   r'   r(   r*   r*   =   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r'   r*   c                   *     e Zd Zdef fdZd Z xZS )PaliGemmaMultiModalProjectorconfigc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr3   	__class__s     r(   r8   z%PaliGemmaMultiModalProjector.__init__\   s;    ii 4 4 @ @&BVBVBeBelpqr'   c                 (    | j                  |      }|S N)r=   )r?   image_featuresr.   s      r(   forwardz$PaliGemmaMultiModalProjector.forward`   s    N3r'   )r   r    r!   r   r8   rD   __classcell__r@   s   @r(   r2   r2   [   s    r rr'   r2   token_type_idsimage_group_idsreturnc           
      Z      ydt         dt         dt         dt         dt        f
 fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N	batch_idxhead_idxq_idxkv_idxrI   c                 :   t        j                  |j                  d   k  |d      }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }	t        j                  |j                  d   k  |	d      }	|dk(  |dk(  z  }
||	k(  }|
|z  S )Nr   r   )r#   whereshape)rK   rL   rM   rN   
safe_q_idxsafe_kv_idxtoken_type_ids_at_q_idxtoken_type_ids_at_kv_idximage_group_ids_at_q_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockrH   rG   s               r(   
inner_maskz0token_type_ids_mask_function.<locals>.inner_maskr   sM    [[)=)=a)@!@%K
kk&>+?+?+B"BFAN"0J1F"G"'++en6J6J16M.MOfhi"j#1)[2H#I #(;;v8L8LQ8O/OQikl#m #29j3H#I #(;;u7L7LQ7O/OQikm#n $3I{4J$K!$)KK9N9Nq9Q0QSlnp$q!1Q6;SWX;XY37PP  000r'   )intbool)rG   rH   r[   s   `` r(   token_type_ids_mask_functionr^   f   s>     1c 1S 1 1c 1d 12 r'   r3   input_embedsattention_maskcache_positionr-   position_idspixel_valuesis_trainingis_first_iterationc
                    |r|t        d      | j                         |||||d}|	r|	n|du xs |j                   xs |du}	|	s|
j                  dd      s<|d|z
  }n4t        j                  d       t        j                  |      dddddf   }||	r|dk(  j                  |j                        }t        j                  j                  |d	d
      ddddf   }|| z  }t        j                  |j                         d      dz
  }t        j                  ||t        j                   |d            }t#        |j                  |j                        |      |d<   t%        di |S )a"  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Paligemma uses a bidirectional mask on the prompt tokens.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when training)r3   r_   r`   ra   r-   rb   	use_cacheTr   zIt is a prefill stage but The `token_type_ids` is not provided. We recommend passing `token_type_ids` to the model to prevent bad attention masking.r   )r   r   )valuerP   )dimor_mask_functionr&   )
ValueErrorget_text_configis_initializedgetloggerwarning_oncer#   	ones_liketodevicer   
functionalpadcumsumr\   rQ   	full_liker^   r
   )r3   r_   r`   ra   r-   rb   rG   rc   rd   re   kwargsmask_kwargsis_imageis_previous_imagenew_image_startrH   s                   r(   create_causal_mask_mappingr}      s   & ~-VWW ((*$((*$K  	%g_-K-K)Kg|cgOg  K!>% /NZ
 #__\:1a7CN
 !&8 #a'++N,A,ABMM--ha-HCRCP"&7%77,,':':'<!DqH++hQ_acAde*Fn334o+
&' %3{33r'   c                   B    e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdZdZy)	PaliGemmaPreTrainedModelr3   model)imagetextTr2   r-   FN)r   r    r!   r   r%   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr&   r'   r(   r   r      sF    (&*#78"3"N"&r'   r   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c            "       P    e Zd ZddiZdZdef fdZd Zd Ze	 e
d	      d
ej                  dee   deez  fd              Zdej$                  dej                  dej                  fdZe	e
	 	 	 	 	 	 	 	 	 	 	 	 	 ddej$                  dz  d
ej                  dz  dej(                  dz  dej$                  dz  dedz  dej$                  dz  dej$                  dz  dej                  dz  dej$                  dz  dedz  dedz  dedz  dedz  dee   deez  fd              Z xZS )PaliGemmaModelzlanguage_model.modellanguage_modelFr3   c                    t         |   |       t        j                  |j                        | _        t        |      | _        |j                  j                  | _	        t        j                  |j                        }|| _
        | j                  j                         j                  xs | j                  | _        | j                          y )N)r3   )r7   r8   r   from_configr:   vision_towerr2   multi_modal_projectortext_config
vocab_sizer   r3   rl   dtypetext_config_dtype	post_init)r?   r3   r   r@   s      r(   r8   zPaliGemmaModel.__init__   s     %119M9MN%A&%I" ,,77"..f6H6HI,!%!<!<!>!D!D!R

r'   c                 6    | j                   j                         S rB   )r   get_input_embeddingsr?   s    r(   r   z#PaliGemmaModel.get_input_embeddings   s    ""7799r'   c                 :    | j                   j                  |       y rB   )r   set_input_embeddingsr?   rh   s     r(   r   z#PaliGemmaModel.set_input_embeddings  s    007r'   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   rc   rx   rI   c                      | j                   |fddi|}|j                  }| j                  |      }|| j                  j                  j
                  dz  z  }||_        |S )Nreturn_dictTg      ?)r   last_hidden_stater   r3   r   r;   pooler_output)r?   rc   rx   image_outputsselected_image_featurerC   s         r(   get_image_featuresz!PaliGemmaModel.get_image_features  sk     *)),SDSFS!.!@!@334JK'4;;+B+B+N+NPS+ST&4#r'   	input_idsinputs_embedsrC   c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r   rs   rP   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r#   tensorr3   image_token_idlongrs   allsumrR   	unsqueeze	expand_asrr   r   numel)r?   r   r   rC   special_image_maskn_image_tokensn_image_featuress          r(   get_placeholder_maskz#PaliGemmaModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r'   Nr`   rb   r-   rG   ra   labelsrg   output_attentionsoutput_hidden_statesr   c                 b   |du |duz  rt        d      ||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|R| j                  j
                  | j                  k\  r/|| j                  j
                  k(  }|j                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      dz   }|i| j                  |d      j                   }|j#                  |j                  |j$                        }| j'                  |||      }|j)                  ||      }t+        |x}t,              s(t/        | j                  |||||||| j0                  		      } | j2                  d|||||
||d|d
	|}t5        |j6                  |j8                  |j:                  |j<                  |      S d      S )  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   )rs   T)r   )r   rC   )rd   )	r`   rb   r-   r   rg   r   r   r   ra   )r   r-   r.   r/   r   r&   )rk   r3   r   r   use_return_dictr   r   cloner   get_seq_lengthr#   arangerR   rs   r   r   r   rr   r   r   masked_scatter
isinstancedictr}   trainingr   r   r   r-   r.   r/   )r?   r   rc   r`   rb   r-   rG   ra   r   r   rg   r   r   r   rx   r   llm_input_idspast_seen_tokensrC   causal_mask_mappingoutputss                        r(   rD   zPaliGemmaModel.forward,  s}   b -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6:L #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F"< MM
# &$%% 
.%+'/!5)
 
 ,%77#33!//))2>2J
 	

 QU
 	
r'   )NNNNNNNNNNNNN)r   r    r!   _checkpoint_conversion_mappingaccepts_loss_kwargsr   r8   r   r   r   r   r#   r$   r   r   r0   r   r   
LongTensorr   Tensorr   r]   r   r   rD   rE   rF   s   @r(   r   r      s
    '=>N%O"
 
:8 n	!--	9?@R9S		+	+	 	"))":?:K:K"]b]n]n"0  .215.204(,262626*.!%)-,0#'v
##d*v
 ''$.v
 t+	v

 &&-v
 v
 ((4/v
 ((4/v
 ((4/v
   4'v
 $;v
  $;v
 #Tkv
 D[v
 -.v
  
-	-!v
  v
r'   r   c            $           e Zd ZdddddZddiZdef fd	Zd
 Zd Ze	de
j                  dee   fd       Zee		 	 	 	 	 	 	 	 	 	 	 	 	 	 d#de
j                   dz  de
j                  dz  de
j"                  dz  de
j                   dz  dedz  de
j                   dz  de
j                   dz  de
j                  dz  de
j                   dz  dedz  dedz  dedz  dedz  dee
j"                  z  dee   deez  f d              Z	 	 	 	 	 	 	 	 	 	 	 d$ fd	Ze	 	 d%ded e
j"                  de
j"                  dz  de
j"                  dedz  de
j"                  dz  de
j"                  dz  d!edz  defd"       Z xZS )&!PaliGemmaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr3   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFr5   )r7   r8   r   r   r   r9   r   r;   r   r   r   r>   s     r(   r8   z*PaliGemmaForConditionalGeneration.__init__  sS     #F+
yy!3!3!?!?ASASA^A^ejkr'   c                 6    | j                   j                         S rB   )r   r   r   s    r(   r   z6PaliGemmaForConditionalGeneration.get_input_embeddings  s    zz..00r'   c                 :    | j                   j                  |       y rB   )r   r   r   s     r(   r   z6PaliGemmaForConditionalGeneration.set_input_embeddings  s    

''.r'   rc   rx   c                 <     | j                   j                  |fi |S rB   )r   r   )r?   rc   rx   s      r(   r   z4PaliGemmaForConditionalGeneration.get_image_features  s    ,tzz,,\DVDDr'   Nr   r`   rb   r-   rG   ra   r   r   rg   r   r   r   logits_to_keeprI   c                 >   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j                  d||||||||
|	||d|d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	4 | j                  d||	| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                         S )r   NT)r   rc   rG   r`   rb   r-   r   rg   r   r   r   r   ra   r   )r,   r   r   )r+   r,   r-   r.   r/   r   r&   )r3   r   r   r   r   r   r\   slicer   loss_functionr   r   r*   r-   r.   r/   r   )r?   r   rc   r`   rb   r-   rG   ra   r   r   rg   r   r   r   r   rx   r   r.   slice_indicesr,   r+   s                        r(   rD   z)PaliGemmaForConditionalGeneration.forward  sS   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%))%+'/!5)
 
"  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD /#33!//)) ' ; ;
 	
r'   c                     t        |   |f||||||	|
||d	|}|j                  d      |dxx   dz  cc<   |s|	s||d<   |S )N)	r-   r   r`   rb   ra   rg   r   rG   re   rb   r   rc   )r7   prepare_inputs_for_generationrn   )r?   r   r-   r   ra   rb   rc   r`   rG   rg   r   r   re   rx   model_inputsr@   s                  r(   r   z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generation!  s{    " w<
+')%)))1
 
 N+7(A-( Y+7L(r'   r_   re   c           
          t        | ||||||fd|i|j                         D 	
ci c]  \  }	}
|	dk7  s|	|
 c}
}	S c c}
}	w )Nre   rc   )r}   items)r3   r_   r`   ra   r-   rb   rG   re   rx   kvs              r(   r
   z;PaliGemmaForConditionalGeneration.create_masks_for_generateM  s`     *

  2

 !'F1!~2Eq!tF

 
	
 Gs   ==)NNNNNNNNNNNNNr   )NNNNNNNTNNF)NF)r   r    r!   r   _tied_weights_keysr   r8   r   r   r   r#   r$   r   r   r   r   r   r   r   r]   r\   r0   r*   rD   r   staticmethodr   r   r
   rE   rF   s   @r(   r   r     s    "8-"?#,	&" +,VW 1/ Eu/@/@ EFSeLf E E  .215.204(,262626*.!%)-,0#'-.X
##d*X
 ''$.X
 t+	X

 &&-X
 X
 ((4/X
 ((4/X
 ((4/X
   4'X
 $;X
  $;X
 #TkX
 D[X
 ell*X
  +,!X
" 
0	0#X
  X
z  *X  /3*/
 
ll
 t+
 	

 
 llT)
 t+
 !4K
 

 
r'   r   )r   r   r   )NNFN)4r"   collections.abcr   dataclassesr   r#   r   cache_utilsr   configuration_utilsr   
generationr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   autor   configuration_paligemmar   
get_loggerr   ro   r   r*   Moduler2   r   r^   r$   r]   r   r}   r   r   r   __all__r&   r'   r(   <module>r      s2    $ !     3 ) 6 B S - &   4 
		H	% 
9#: 9 9 
9k 9 90299 %LL4'%\\D(% _%^ +/-1$&*G4G4,,G4 LL4'G4 LL	G4
 T\G4 ,,%G4 LL4'G4 ##d*G4 G4 tG4 
G4T ' ' ' 
x
- x

x
v 
x
(@/ x

x
v ^r'   