
    iAJ                        d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z, dejZ                  de.fdZ/ee G d de                    Z0 G d de&      Z1 G d de%      Z2 G d d e       Z3 G d! d"e      Z4 G d# d$e)      Z5 G d% d&e      Z6 G d' d(e      Z7 G d) d*e(      Z8 G d+ d,ejr                        Z: G d- d.ejv                        Z< G d/ d0e      Z= G d1 d2e=      Z> G d3 d4e#      Z?e G d5 d6e"e             Z@g d7ZAy)8    N)	dataclass)nn   )initialization)Cache)GenerationMixin)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )Aimv2AttentionAimv2EncoderLayer)	AutoModel)LlamaMLPLlamaRMSNorm)LlavaForConditionalGeneration
LlavaModel)LlavaNextCausalLMOutputWithPastLlavaNextModelOutputWithPast)SiglipEncoderSiglipVisionEmbeddings   )Ovis2ConfigOvis2VisionConfiglogitsdimc                     | j                  |      }|j                  |d      d   }t        j                  | t        j                        j                  ||d      }||j                         z
  |z   }|S )NT)keepdimr   )memory_formatg      ?)softmaxmaxtorch
zeros_likelegacy_contiguous_formatscatter_detach)r    r!   y_softindexy_hardrets         q/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/ovis2/modular_ovis2.pyhard_softmaxr1   &   sk    ^^C FJJsDJ)!,EfE4R4RS\\]`bgilmF
6==?
"V
+CJ    c                   :    e Zd ZU dZdZej                  dz  ed<   y)*BaseModelOutputWithVisualIndicatorFeaturesz
    visual_indicator_features (`torch.FloatTensor` of shape `(batch_size, visual_indicator_size)`):
        Visual indicator features extracted from the model, which can be used for auxiliary tasks or further processing.
    Nvisual_indicator_features)__name__
__module____qualname____doc__r5   r'   FloatTensor__annotations__ r2   r0   r4   r4   0   s    
 ;?u0047>r2   r4   c                       e Zd Zy)Ovis2ModelOutputWithPastNr6   r7   r8   r<   r2   r0   r>   r>   ;       r2   r>   c                       e Zd Zy)Ovis2CausalLMOutputWithPastNr?   r<   r2   r0   rB   rB   ?   r@   r2   rB   c                       e Zd Zy)Ovis2RMSNormNr?   r<   r2   r0   rD   rD   C   r@   r2   rD   c                       e Zd Zy)Ovis2VisionMLPNr?   r<   r2   r0   rF   rF   G   r@   r2   rF   c                   b     e Zd Zdef fdZd Zdej                  dej                  fdZ	 xZ
S )Ovis2VisionEmbeddingsconfigc                 n    t         |   |       t        |j                  |j                        | _        y N)super__init__rD   hidden_sizerms_norm_epsrms_normselfrI   	__class__s     r0   rM   zOvis2VisionEmbeddings.__init__L   s*     $V%7%79L9LMr2   c                     t        d      )NzNot needed for Ovis2)NotImplementedError)rR   s    r0   interpolate_pos_encodingz.Ovis2VisionEmbeddings.interpolate_pos_encodingP   s    !"899r2   pixel_valuesreturnc                 (   | j                   j                  j                  }| j                  |j                  |            }|j	                  d      j                  dd      }| j                  |      }|| j                  | j                        z   }|S )Ndtyper   r   )	patch_embeddingweightr[   toflatten	transposerP   position_embeddingposition_ids)rR   rW   target_dtypepatch_embeds
embeddingss        r0   forwardzOvis2VisionEmbeddings.forwardS   s    ++2288++LOO,O,OP!))!,66q!<
]]:.
$"9"9$:K:K"LL
r2   )r6   r7   r8   r   rM   rV   r'   r:   Tensorrf   __classcell__rS   s   @r0   rH   rH   K   s4    N0 N:E$5$5 %,, r2   rH   c                       e Zd Zy)Ovis2VisionAttentionNr?   r<   r2   r0   rk   rk   ^   r@   r2   rk   c                   $     e Zd Zdef fdZ xZS )Ovis2VisionEncoderLayerrI   c                 B    t         |           t        |      | _        y rK   )rL   rM   rk   	attentionrQ   s     r0   rM   z Ovis2VisionEncoderLayer.__init__c   s    -f5r2   )r6   r7   r8   r   rM   rh   ri   s   @r0   rm   rm   b   s    60 6 6r2   rm   c            	       p     e Zd Zdef fdZee	 ddej                  dz  de	e
   defd              Z xZS )	Ovis2VisionEncoderrI   c                     t         |   |       t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        y c c}w rK   )rL   rM   r   
ModuleListrangenum_hidden_layersrm   layers)rR   rI   _rS   s      r0   rM   zOvis2VisionEncoder.__init__i   s@     mmeTZTlTlNm$n%<V%D$no$ns   ANattention_maskkwargsrX   c                 T    |}| j                   D ]  } |||fi |} t        |      S )Nlast_hidden_state)rv   r	   )rR   inputs_embedsrx   ry   hidden_statesencoder_layers         r0   rf   zOvis2VisionEncoder.forwardm   s<     &![[ 	SM)-R6RM	S ??r2   rK   )r6   r7   r8   r   rM   r   r   r'   rg   r   r   r	   rf   rh   ri   s   @r0   rq   rq   h   se    p0 p  /3
@ t+
@ +,	
@
 

@  
@r2   rq   c                   X     e Zd Zdef fdZe	 ddej                  dz  fd       Z xZ	S )Ovis2VisionTransformerrI   c                     t         |           || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        d| _        y )NF)rL   rM   rI   rH   re   rq   encoderrD   rN   rO   rP   gradient_checkpointingrQ   s     r0   rM   zOvis2VisionTransformer.__init__}   sO    /7)&1$V%7%79L9LM&+#r2   Nrx   c                     | j                  |      } | j                  d||d|}|j                  }| j                  |      }t	        |      S )N)r}   rx   r{   r<   )re   r   r|   rP   r	   )rR   rW   rx   ry   r~   encoder_outputsr|   s          r0   rf   zOvis2VisionTransformer.forward   sa     5+74<< ,
'),
 ,
 ,== MM*;<1BCCr2   rK   )
r6   r7   r8   r   rM   r   r'   rg   rf   rh   ri   s   @r0   r   r   |   s?    ,0 ,  /3D t+D Dr2   r   c                   P     e Zd Zdej                  dej                  f fdZ xZS )Ovis2VisualEmbeddingTablevisual_tokensrX   c                    |j                   t        j                  t        j                  t        j                  t        j
                  t        j                  fv rt        | !  |      S t        j                  || j                        S rK   )r[   r'   int8int16int32int64longrL   rf   matmulr]   )rR   r   rS   s     r0   rf   z!Ovis2VisualEmbeddingTable.forward   sW    5::u{{EKKV[V`V`"aa7?=11||M4;;77r2   )r6   r7   r8   r'   rg   rf   rh   ri   s   @r0   r   r      s#    8U\\ 8ell 8 8r2   r   c                   X     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdZdZdZ fdZ xZS )Ovis2PreTrainedModelrI   model)imagetextTrk   past_key_valuesc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )N)r   r   )rL   _init_weights
isinstancerH   initcopy_rb   r'   arangeshapeexpand)rR   modulerS   s     r0   r   z"Ovis2PreTrainedModel._init_weights   s[    f%f34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r2   )r6   r7   r8   r   r;   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr   rh   ri   s   @r0   r   r      sY    (&*#/0"3 N!"&i ir2   r   c                   x     e Zd ZU eed<   eedZdef fdZe	de
j                  dee   deez  fd       Z xZS )Ovis2VisionModelrI   )r~   
attentionsc                    t         |   |       || _        t        |      | _        |j
                  | _        |j                  | _        t        j                  |j                  |j                  z  |j                  z  | j                  | j
                  z
  d      | _        t        j                  | j                  | j
                  z
        | _        | j                          y NF)bias)rL   rM   rI   r   transformernum_visual_indicator_tokens
vocab_sizer   LinearrN   hidden_stridehead_linear	LayerNorm	head_norm	post_initrQ   s     r0   rM   zOvis2VisionModel.__init__   s     1&9+1+M+M( ++99!5!558L8LLOOd>>>

 doo8X8X&XYr2   rW   ry   rX   c           	          | j                   |fi |}|d   }| j                  j                  dkD  r|j                  \  }}}| j                  j                  }t	        t        j                  |            }	|	|	z  |k7  rt        d      ||	|z  z
  |z  }
t        j                  j                  |ddd|
d|
fdd      }|	|
z  }	|j                  ||	|z  ||	|z  ||      }|j                  dddddd      }|j                  |d	||z  |z        }| j                  |      }| j                  |      }| j                  j                  d
k(  r#t        j                  j!                  |d	d      }na| j                  j                  dk(  rt#        |d	      }n:| j                  j                  dk(  r!t        j                  j%                  |d	      }t'        |      S )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         r   gumbel_argmaxT)r!   hard	st_argmaxr!   r%   )r|   pooler_output)r   rI   r   r   intmathsqrt
ValueErrorr   
functionalpadreshapepermuter   r   tokenize_functiongumbel_softmaxr1   r%   r4   )rR   rW   ry   outputsr|   
num_imagesseq_len
hidden_dimr   sqrt_lpad_sizer    
prob_tokens                r0   rf   zOvis2VisionModel.forward   s    #$""<:6:#AJ;;$$q(.?.E.E+J KK55M7+,F') !QRR%-)?@MQH " 1 12CaAxYZ\dEegqst uhF 1 9 9Fm3]FmD[]jlv! !2 9 9!Q1a K 1 9 9B =
 J! !!"34';;((O;55f"45PJ[[**k9%f"5J[[**i7..v2.>J9/$
 	
r2   )r6   r7   r8   r   r;   rm   rk   _can_record_outputsrM   r   r'   r:   r   r   tupler4   rf   rh   ri   s   @r0   r   r      sb    0*
0  &
!--&
9?@R9S&
	;	;&
 &
r2   r   c                        e Zd Zi Zdef fdZe ed      dej                  de
e   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                   d	z  dej                  d	z  ded	z  dej                  d	z  dej                  d	z  ded	z  ded	z  ded	z  ded	z  dej                  d	z  deej                   z  deez  fd              Z xZS )
Ovis2ModelrI   c                 |   t         |   |       t        |j                        | _        t        |j                  j                  |j                        | _        |j                  j                  | _	        |j                  | _        |j                  | _
        t        j                  |j                        | _        | `y rK   )rL   rM   r   vision_configvision_towerr   r   rN   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr   from_configtext_configlanguage_modelmulti_modal_projectorrQ   s     r0   rM   zOvis2Model.__init__   s     ,V-A-AB'@AUAUA`A`bhbtbt'u$!'!5!5!@!@ ++*0*K*K''33F4F4FG&r2   zWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introrW   ry   rX   c                 h    | j                   |fddi|}|j                  }|j                  \  }}}t        j                  ||| j                   j
                  f|j                  |j                  d|j                        }t        j                  ||gd      }| j                  |      }t        j                  | j                  | j                   j
                  z
  | j                  t        j                        j                  |j                        }	||_        | j                  |	      |_        |S )Nreturn_dictTF)r[   devicerequires_gradlayoutr   r   rZ   )r   r   r   r'   zerosr   r[   r   r   catr   r   r   r   r^   r5   )
rR   rW   ry   image_outputsimage_features
batch_sizeimg_seq_lenrw   padding_tensorvisual_indicators
             r0   get_image_featureszOvis2Model.get_image_features  s    *)),SDSFS&44%3%9%9"
Kd&7&7&S&ST &&!((!((
 NN#CK55nE <<""T%6%6%R%RR""**
 "^""
#	 	
 '5#262N2NO_2`/r2   N	input_idsrx   rb   r   r}   labels	use_cacheoutput_attentionsoutput_hidden_statesr   cache_positionlogits_to_keepc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
|d u |d uz  rt        d      | | j	                         |      }|6| j                  |d      }|j                  }|j                  }| j                  |||      }|j                  ||      }t        | j                        D ]  \  }}|Y| | j	                         t        j                  |t        j                  |j                              k(  }|j!                  d      }n||k(  j#                  |j                        }|j%                         s||   j'                  ||         j#                  |j                  |j(                        ||<     | j*                  d	||||||	|
d||d
|}t-        |j.                  |j0                  |j2                  |j4                  |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embedsT)rW   r   )r}   r   )r[   r   r   )
rx   rb   r   r}   r   r   r   r   r   r   )r|   r   r~   r   image_hidden_statesr<   )rI   r   r   r   get_input_embeddingsr   r   r5   get_placeholder_maskmasked_scatter	enumerater   r'   tensorr   r   allr^   any	expand_asr[   r   r>   r|   r   r~   r   )rR   r   rW   rx   rb   r   r}   r   r   r   r   r   r   r   ry   r   r   r5   special_image_maskivisual_indicator_idmaskr   s                          r0   rf   zOvis2Model.forward$  s5   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<YZZ 7D557	BM# 33[_3`M*88N(5(O(O%!%!:!:+- "; "
 *889K^\M*3D4S4S*T &&$(,GD,E,E,G%8

S`SgSgh- D  88B<D%)<<@@AUAUVD88:1!4"=#67M00-2E2EF "$'  &$%% 
)%+'/!5))
 
 (%77#33!//))2>2J
 	

 QU
 	
r2   NNNNNNNNNNNNr   )r6   r7   r8   _checkpoint_conversion_mappingr   rM   r   r   r'   r:   r   r   r   r4   r   
LongTensorrg   r   boolr   r>   rf   rh   ri   s   @r0   r   r      s   %'"	'{ 	' n'' +, 
;	;	 8  .215.204(,26*.!%)-,0#'26-.L
##d*L
 ''$.L
 t+	L

 &&-L
 L
 ((4/L
   4'L
 $;L
  $;L
 #TkL
 D[L
 ((4/L
 ell*L
  
)	)!L
  L
r2   r   c                        e Zd Zi Zdef fdZedej                  de	e
   deez  fd       Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                   dz  d
ej                  dz  dedz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                   z  deez  fd              Z xZS )Ovis2ForConditionalGenerationrI   c                     t         |   |       t        j                  |j                  |j
                  d      | _        y r   )rL   rM   r   r   rN   r   lm_headrQ   s     r0   rM   z&Ovis2ForConditionalGeneration.__init__y  s0     yy!3!3V5F5FUSr2   rW   ry   rX   c                 >     | j                   j                  dd|i|S )NrW   r<   )r   r   )rR   rW   ry   s      r0   r   z0Ovis2ForConditionalGeneration.get_image_features}  s#     -tzz,,Q,Q&QQr2   Nr   rx   rb   r   r}   r   r   r   r   r   r   r   c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
 | j                  d||||||||	|
d|d|}|d   }t	        |t
              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

        >>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
        >>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

        >>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
        >>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
        "user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
        ```NT)r   rW   rx   rb   r   r}   r   r   r   r   r   r   )r    r   r   )lossr    r   r~   r   r   r<   )rI   r   r   r   r   r   slicer  loss_functionr   r   rB   r   r~   r   r   )rR   r   rW   rx   rb   r   r}   r   r   r   r   r   r   r   ry   r   r~   slice_indicesr    r  s                       r0   rf   z%Ovis2ForConditionalGeneration.forward  s7   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 $** 
%)%+'/!5)
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r2   r  )r6   r7   r8   r  r   rM   r   r'   r:   r   r   r   r4   r   r   r	  rg   r   r
  r   rB   rf   rh   ri   s   @r0   r  r  u  s   %'"T{ T R!--R9?@R9SR	;	;R R
  .215.204(,26*.!%)-,0#'26-.T
##d*T
 ''$.T
 t+	T

 &&-T
 T
 ((4/T
   4'T
 $;T
  $;T
 #TkT
 D[T
 ((4/T
 ell*T
  
,	,!T
  T
r2   r  )r   r   r  )Br   dataclassesr   r'   r    r   r   cache_utilsr   
generationr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   aimv2.modeling_aimv2r   r   autor   llama.modeling_llamar   r   llava.modeling_llavar   r   llava_next.modeling_llava_nextr   r   siglip.modeling_siglipr   r   configuration_ovis2r   r   rg   r   r1   r4   r>   rB   rD   rF   rH   rk   rm   rq   Moduler   	Embeddingr   r   r   r   r  __all__r<   r2   r0   <module>r(     s[    !   &   ) K - & I I / D  9 L j J ? C  ?1K ?  ?	; 		"A 		< 		X 	2 &	> 	6/ 6@ @(DRYY D<8 8i? i*=
+ =
@|
 |
~ c
$A? c
 c
L Rr2   