
    iI                        d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZ  ej<                  e      Z e ed       G d de                    Z!e ed       G d de                    Z" G d dejF                        Z$e G d de             Z% ed       G d de%             Z& ed        G d! d"e%e
             Z'g d#Z(y)$zPyTorch Llava model.    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringloggingtorch_compilable_check)check_model_inputs   )	AutoModel   )LlavaConfigzJ
    Base class for Llava outputs, with hidden states and attentions.
    custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)LlavaModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__     r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/llava/modeling_llava.pyr   r   $   s    	 59**T18r$   r   zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	LlavaCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r(   r    r!   r"   r)   r*   r   r+   tupler,   r   r#   r$   r%   r'   r'   9   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r$   r'   c                   *     e Zd Zdef fdZd Z xZS )LlavaMultiModalProjectorconfigc                    t         |           t        |j                  t              rdnt        |j                        }t        j                  |j                  j                  |z  |j                  j                  |j                        | _        t        |j                     | _        t        j                  |j                  j                  |j                  j                  |j                        | _        y )Nr   bias)super__init__
isinstancevision_feature_layerintlenr   Linearvision_confighidden_sizetext_configmultimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)selfr0   num_feature_layers	__class__s      r%   r5   z!LlavaMultiModalProjector.__init__X   s    ",V-H-H#"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r$   c                 l    | j                  |      }| j                  |      }| j                  |      }|S N)r?   rA   rB   )rC   image_featuresr+   s      r%   forwardz LlavaMultiModalProjector.forwardf   s2    n5/m4r$   )r   r   r   r   r5   rI   __classcell__rE   s   @r%   r/   r/   W   s    
{ 
r$   r/   c                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdZy)LlavaPreTrainedModelr0   model)imagetextTr*   N)r   r   r   r   r"   base_model_prefixinput_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr#   r$   r%   rM   rM   m   s=    (&*#"3N!"&r$   rM   zu
    The Llava model which consists of a vision backbone and a language model, without a language modeling head.
    c                   n    e Zd ZddiZdef fdZd Zd Z ed       e	d	
      	 	 	 dde
j                  deee   z  dz  dedz  dedz  dee   deez  fd              Zde
j*                  de
j                  de
j                  fdZ ed      e		 	 	 	 	 	 	 	 	 	 dde
j*                  dz  de
j                  dz  de
j.                  dz  de
j*                  dz  dedz  de
j                  dz  deee   z  dz  dedz  de
j*                  dz  de
j.                  dz  dee   deez  fd              Z xZS )
LlavaModel^language_model.modellanguage_modelr0   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y rG   )r4   r5   r   from_configr;   vision_towerr/   multi_modal_projectorr=   r]   	post_initrC   r0   rE   s     r%   r5   zLlavaModel.__init__   sY     %11&2F2FG%=f%E"'33F4F4FGr$   c                 6    | j                   j                         S rG   )r]   get_input_embeddingsrC   s    r%   re   zLlavaModel.get_input_embeddings   s    ""7799r$   c                 :    | j                   j                  |       y rG   )r]   set_input_embeddingsrC   values     r%   rh   zLlavaModel.set_input_embeddings   s    007r$   Ftie_last_hidden_stateszWObtains image last hidden states from the vision tower and apply multimodal projection.r   Npixel_valuesr7   vision_feature_select_strategyoutput_hidden_stateskwargsreturnc                    |j                         D ci c]  \  }}|	|| }}} | j                  |fddd|}t        |t              r |j                  |   }	|dk(  r\|	d d dd f   }	nP|D 
cg c]  }
|j                  |
    }}
|dk(  r|D cg c]  }|d d dd f    }}t        j                  |d      }	| j                  |	      }|j                  d      t        j                  |d   |j                        | j                  j                  z  j                  d      j                         }t        j                  |j                  d	      |      }nt!        |      }||_        |S c c}}w c c}
w c c}w )
NT)ro   return_dictdefaultr   dimimage_sizes)devicer   )itemsr`   r6   r8   r+   r    catra   get	as_tensorry   
patch_sizeprodtolistsplitsqueezelistpooler_output)rC   rm   r7   rn   ro   rp   kvimage_outputsselected_image_feature	layer_idxhs_poolhsrH   split_sizess                  r%   get_image_featureszLlavaModel.get_image_features   s    $*<<>C41aQ]!Q$CC)))
!%
 	
 *C0%2%@%@AU%V"-:)?12)F&Ocd)}229=dGd-:/672ae977%*YYwB%?"334JK ::m$0!6~?T?TUY]YjYjYuYuu" 
 #[[)?)?)BKPN!.1N&4#K D  e 8s   
E2E21E8E=	input_idsinputs_embedsrH   c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )dtypery   ru   r   r   z6Image features and image tokens do not match, tokens: z, features: )re   r    tensorr0   image_token_idlongry   allsumshape	unsqueeze	expand_astor   numel)rC   r   r   rH   special_image_maskn_image_tokensn_image_featuress          r%   get_placeholder_maskzLlavaModel.get_placeholder_mask   s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r$   attention_maskposition_idsr*   cache_positionrx   c           	         |d u |d uz  rt        d      | | j                         |      }|| j                  ||||
d      j                  }t	        j
                  |d      j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d	|||||	d|}t        |j                  |j                  |j                  |j                   |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embedsT)rm   r7   rn   rx   rs   r   rv   )r   rH   )r   r   r*   r   r   )last_hidden_stater*   r+   r,   r   r#   )
ValueErrorre   r   r   r    r{   r   ry   r   r   masked_scatterr]   r   r   r*   r+   r,   )rC   r   rm   r   r   r*   r   r7   rn   r   rx   rp   rH   r   outputss                  r%   rI   zLlavaModel.forward   sK     -t";<YZZ 7D557	BM#!44)%9/M'  5  m  #YY~1=@@AUAUWdWjWjkN!%!:!:~ "; " *889K^\M%$%% 
)%+')
 
 (%77#33!//))2>2J
 	

 QU
 	
r$   )NNN)
NNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingr   r5   re   rh   r   r   r    r!   r8   r   strboolr   r   r-   r
   r   
LongTensorr   Tensorr   r   rI   rJ   rK   s   @r%   r[   r[   }   s    	!"2&"{ :8 u5n 8<59,0-''- "DIo4- ),d
	-
 #Tk- +,- 
+	+- 6-^"))":?:K:K"]b]n]n"0 u5 .215.204(,267;5926+/1
##d*1
 ''$.1
 t+	1

 &&-1
 1
 ((4/1
 "DIo41
 ),d
1
 ((4/1
 \\D(1
 +,1
 
)	)1
  61
r$   r[   zS
    The LLAVA model which consists of a vision backbone and a language model.
    c                    t    e Zd ZdddddZddiZdef fd	Zd
 Zd Zde	j                  fdZe	 	 d!dej                  deee   z  dz  dedz  dee   deez  f
d       Z ed      e	 	 	 	 	 	 	 	 	 	 	 	 d"dej0                  dz  dej                  dz  dej2                  dz  dej0                  dz  dedz  dej                  dz  deee   z  dz  dedz  dej0                  dz  dej0                  dz  deej2                  z  dej2                  dz  dee   deez  fd              Z	 	 	 	 	 	 	 d# fd 	Z xZS )$LlavaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)r\   z^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr0   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFr2   )r4   r5   r[   rN   r   r:   r=   r<   
vocab_sizer   rb   rc   s     r%   r5   z&LlavaForConditionalGeneration.__init__$  sS     '
yy!3!3!?!?ASASA^A^ejkr$   c                 6    | j                   j                         S rG   )rN   re   rf   s    r%   re   z2LlavaForConditionalGeneration.get_input_embeddings*  s    zz..00r$   c                 :    | j                   j                  |       y rG   )rN   rh   ri   s     r%   rh   z2LlavaForConditionalGeneration.set_input_embeddings-  s    

''.r$   rq   c                     | j                   S rG   )r   rf   s    r%   get_output_embeddingsz3LlavaForConditionalGeneration.get_output_embeddings0  s    ||r$   Nrm   r7   rn   rp   c                 B     | j                   j                  d|||d|S )N)rm   r7   rn   r#   )rN   r   )rC   rm   r7   rn   rp   s        r%   r   z0LlavaForConditionalGeneration.get_image_features3  s5     -tzz,, 
%!5+I
 	
 	
r$   Frk   r   r   r   r*   r   labelsr   logits_to_keeprx   c                     | j                   d|||||||||
|d
|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|	4 | j
                  d||	| j                  j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```)
r   rm   r   r   r*   r   r7   rn   r   rx   r   N)r)   r   r   )r(   r)   r*   r+   r,   r   r#   )rN   r6   r8   slicer   loss_functionr0   r=   r   r'   r*   r+   r,   r   )rC   r   rm   r   r   r*   r   r7   rn   r   r   r   rx   rp   r   r+   slice_indicesr)   r(   s                      r%   rI   z%LlavaForConditionalGeneration.forwardB  s    ^ $** 
%)%+'!5+I)#
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r$   c	           
      h    t        |   |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r*   r   r   r   r   is_first_iteration	use_cacheTrm   )r4   prepare_inputs_for_generationr|   )rC   r   r*   r   rm   r   r   r   r   rp   model_inputsrE   s              r%   r   z;LlavaForConditionalGeneration.prepare_inputs_for_generation  s\     w<	
+')))1	
 	
 VZZT%B
 ,8L(r$   )NN)NNNNNNNNNNr   N)NNNNNNF)r   r   r   r   _tied_weights_keysr   r5   re   rh   r   Moduler   r   r    r!   r8   r   r   r   r   r-   r
   r   r   r   r   r   r'   rI   r   rJ   rK   s   @r%   r   r     s=    #9.#@$-	&" +,VW{ 1/ryy   8<59	
''
 "DIo4
 ),d
	

 +,
 
+	+
 
 u5 .215.204(,267;59*.26-.+/M
##d*M
 ''$.M
 t+	M

 &&-M
 M
 ((4/M
 "DIo4M
 ),d
M
   4'M
 ((4/M
 ell*M
 \\D(M
 +,M
 
,	,M
  6M
d     r$   r   )r   rM   r[   ))r   dataclassesr   r    r   activationsr   cache_utilsr   
generationr   modeling_outputsr	   r
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   autor   configuration_llavar   
get_loggerr   loggerr   r'   r   r/   rM   r[   r   __all__r#   r$   r%   <module>r      s/    !   !   ) ` ` - & X X /  , 
		H	% 
96 9 9 
9+ 9 90ryy , '? ' ' 
Q
% Q

Q
h 
X$8/ X
Xv Rr$   