
    iR                        d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZ e ed       G d de                    Ze ed       G d de                    Z G d dej>                        Z e G d de             Z! ed       G d de!             Z" ed       G d  d!e!e	             Z#g d"Z$y)#    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)check_model_inputs   )	AutoModel   )VipLlavaConfigzM
    Base class for VipLlava outputs, with hidden states and attentions.
    custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)VipLlavaModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__     x/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   r   &   s    	 59**T18r$   r   zT
    Base class for VipLlava causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	VipLlavaCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r(   r    r!   r"   r)   r*   r   r+   tupler,   r   r#   r$   r%   r'   r'   ;   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r$   r'   c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                 H   t         |           t        |j                  t              rdnt        |j                        }t        j                  ||j                  j                  z  |j                        | _        t        j                  ||j                  j                  z  |j                  j                  d      | _        t        |j                      | _        t        j                  |j                  j                  |j                  j                  d      | _        y )Nr   )epsTbias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr0   num_feature_layers	__class__s      r%   r6   z$VipLlavaMultiModalProjector.__init__Z   s    ",V-I-I3"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr$   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)r?   rB   rD   rE   )rF   r+   s     r%   forwardz#VipLlavaMultiModalProjector.forwardi   sB    00?m4/m4r$   )r   r   r   r   r6   rK   __classcell__rH   s   @r%   r/   r/   Y   s    m~ mr$   r/   c                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdZy)VipLlavaPreTrainedModelr0   model)imagetextTr*   N)r   r   r   r   r"   base_model_prefixinput_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr#   r$   r%   rO   rO   q   s=    (&*#"3N!"&r$   rO   zx
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    c                   6    e Zd ZddiZdef fdZd Zd Ze e	d      	 	 dd
e
j                  deee   z  d	z  ded	z  dee   deez  f
d              Zde
j(                  de
j                  de
j                  fdZe		 	 	 	 	 	 	 	 	 	 	 	 dde
j(                  d	z  d
e
j                  d	z  de
j,                  d	z  de
j(                  d	z  ded	z  de
j                  d	z  deee   z  d	z  ded	z  ded	z  ded	z  ded	z  de
j(                  d	z  deez  fd       Z xZS )VipLlavaModel^language_model.modellanguage_modelr0   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y rJ   )r5   r6   r   from_configr<   vision_towerr/   multi_modal_projectorrA   r_   	post_initrF   r0   rH   s     r%   r6   zVipLlavaModel.__init__   sY     %11&2F2FG%@%H"'33F4F4FGr$   c                 6    | j                   j                         S rJ   )r_   get_input_embeddingsrF   s    r%   rg   z"VipLlavaModel.get_input_embeddings   s    ""7799r$   c                 :    | j                   j                  |       y rJ   )r_   set_input_embeddingsrF   values     r%   rj   z"VipLlavaModel.set_input_embeddings   s    007r$   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   Npixel_valuesr8   output_hidden_stateskwargsreturnc                 j   ||n| j                   j                  } | j                  |fddd|}t        |t              r|j
                  |   ddddf   }n<|D cg c]  }|j
                  |   ddddf    }}t        j                  |d      }| j                  |      }||_	        |S c c}w )\  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
            The tensors corresponding to the input images.
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        NT)rn   return_dictr   )dim)
r0   r8   rb   r7   r9   r+   r    catrc   pooler_output)rF   rm   r8   rn   ro   image_outputsimage_featuresindexs           r%   get_image_featuresz VipLlavaModel.get_image_features   s    & &;%F!DKKLmLm 	 *))
!%
 	
 +S1*889NOPQSTSUPUVN VkkEm99%@ABGkNk"YY~2>N33NC&4# ls   B0	input_idsinputs_embedsry   c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )dtypedevicert   r   r   z6Image features and image tokens do not match, tokens: z, features: )rg   r    tensorr0   image_token_idlongr   allsumshape	unsqueeze	expand_astor   numel)rF   r|   r}   ry   special_image_maskn_image_tokensn_image_featuress          r%   get_placeholder_maskz"VipLlavaModel.get_placeholder_mask   s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r$   attention_maskposition_idsr*   	use_cacheoutput_attentionsrs   cache_positionc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|du |duz  rt        d      | | j                         |      }|j| j                  ||d      j                  }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d||||||	|
d|d	|}t        |j                   |j"                  |j$                  |j&                  |nd      }|r|S |j)                         S )	z
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedsT)rm   r8   rs   )r}   ry   )	r   r   r*   r}   r   r   rn   rs   r   )last_hidden_stater*   r+   r,   r   r#   )r0   r   rn   use_return_dictr8   
ValueErrorrg   r{   rw   r   r   r   r   masked_scatterr_   r   r   r*   r+   r,   to_tuple)rF   r|   rm   r   r   r*   r}   r8   r   r   rn   rs   r   	lm_kwargsry   r   outputsoutputs                     r%   rK   zVipLlavaModel.forward   s   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%:%F!DKKLmLm 	 -t";<YZZ 7D557	BM#!44)AVdh 5 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M%$%% 
)%+'/!5)
 
 -%77#33!//))2>2JPT
 %v;&//*;;r$   )NN)NNNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingr   r6   rg   rj   r   r   r    r!   r9   listboolr   r   r-   r
   r{   
LongTensorr   Tensorr   r   rK   rL   rM   s   @r%   r]   r]      s    	!"2&"~ :8 n 9=,0	#''#  #T#Y5# #Tk	#
 +,# 
+	+# #J"))":?:K:K"]b]n]n"0  .215.204(,268<!%)-,0#'26B<##d*B< ''$.B< t+	B<
 &&-B< B< ((4/B<  #T#Y5B< $;B<  $;B< #TkB< D[B< ((4/B< 
,	,B< B<r$   r]   zV
    The VIPLLAVA model which consists of a vision backbone and a language model.
    c            "       b    e Zd ZdddddZddiZdef fd	Zd
 Zd Zde	j                  fdZe	 d#dej                  deee   z  dz  dee   deez  fd       Z ed      e	 	 	 	 	 	 	 	 	 	 	 	 	 	 d$dej.                  dz  dej                  dz  dej0                  dz  dej.                  dz  dedz  dej                  dz  deee   z  dz  dej.                  dz  dedz  dedz  dedz  dedz  dej.                  dz  d eej0                  z  deez  fd!              Z	 	 	 	 	 	 	 d% fd"	Z xZS )& VipLlavaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)r^   z^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr0   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFr3   )r5   r6   r]   rP   r   r@   rA   r=   
vocab_sizer   rd   re   s     r%   r6   z)VipLlavaForConditionalGeneration.__init__.  sS     "6*
yy!3!3!?!?ASASA^A^ejkr$   c                 6    | j                   j                         S rJ   )rP   rg   rh   s    r%   rg   z5VipLlavaForConditionalGeneration.get_input_embeddings4  s    zz..00r$   c                 :    | j                   j                  |       y rJ   )rP   rj   rk   s     r%   rj   z5VipLlavaForConditionalGeneration.set_input_embeddings7  s    

''.r$   rp   c                     | j                   S rJ   )r   rh   s    r%   get_output_embeddingsz6VipLlavaForConditionalGeneration.get_output_embeddings:  s    ||r$   Nrm   r8   ro   c                 @     | j                   j                  d||d|S )rr   )rm   r8   r#   )rP   r{   )rF   rm   r8   ro   s       r%   r{   z3VipLlavaForConditionalGeneration.get_image_features=  s0     -tzz,, 
%=R
V\
 	
r$   F)tie_last_hidden_statesr|   r   r   r*   r}   labelsr   r   rn   rs   r   logits_to_keepc                 l   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j
                  d|||||||	||
|d|d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|2| j                  ||| j                   j                  j                        }t        |||j                  |j                  |j                   |j"                        S )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)r|   rm   r   r   r*   r}   r   r8   r   rn   rs   r   r   )r)   r   r   )r(   r)   r*   r+   r,   r   r#   )r0   r   rn   r   r8   rP   r7   r9   slicer   loss_functionrA   r   r'   r*   r+   r,   r   )rF   r|   rm   r   r   r*   r}   r8   r   r   r   rn   rs   r   r   r   r   r+   slice_indicesr)   r(   s                        r%   rK   z(VipLlavaForConditionalGeneration.forwardO  s[   p 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%:%F!DKKLmLm 	 $** 
%)%+'"7/!5)
 
   
8B>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD-#33!//)) ' ; ;
 	
r$   c	           
      h    t        |   |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r*   r}   r   r   r   is_first_iterationr   Trm   )r5   prepare_inputs_for_generationget)rF   r|   r*   r}   rm   r   r   r   r   ro   model_inputsrH   s              r%   r   z>VipLlavaForConditionalGeneration.prepare_inputs_for_generation  s\     w<	
+')))1	
 	
 VZZT%B
 ,8L(r$   rJ   )NNNNNNNNNNNNNr   )NNNNNNF)r   r   r   r   _tied_weights_keysr   r6   rg   rj   r   Moduler   r   r    r!   r9   r   r   r   r-   r
   r{   r   r   r   r   r   r'   rK   r   rL   rM   s   @r%   r   r      s:    #9.#@$-	&" +,VW~ 1/ryy   9=
''
  #T#Y5
 +,	

 
+	+
 
" u5 .215.204(,268<*.!%)-,0#'26-._
##d*_
 ''$._
 t+	_

 &&-_
 _
 ((4/_
  #T#Y5_
   4'_
 $;_
  $;_
 #Tk_
 D[_
 ((4/_
 ell*_
" 
/	/#_
  6_
H     r$   r   )r]   r   rO   )%dataclassesr   r    r   activationsr   cache_utilsr   
generationr   modeling_outputsr	   r
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   autor   configuration_vipllavar   r   r'   r   r/   rO   r]   r   __all__r#   r$   r%   <module>r      s  * "   !   ) ` ` - & a a /  2 
9"9 9 9 
9[ 9 90")) 0 'o ' ' 
W<+ W<
W<t 
m'> m
m` [r$   