
    i2                       d dl Z d dlmZ d dlmZ d dlmZmZ d dlZd dl	m
Z
 d dlm
c mZ d dl	mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:  ed       G d de
jv                               Z< G d de
jv                        Z= G d de
jv                        Z> G d de
jv                        Z? G d  d!e
jv                        Z@ G d" d#e
jv                        ZAd$ ZBd%ej                  d&ej                  d'ej                  d(ej                  d)eDej                  ej                  f   f
d*ZEd+ej                  d,eFd)ej                  fd-ZG	 dWd.e
jv                  d/ej                  d0ej                  d1ej                  d2ej                  dz  d3eHd4eHd5e,e.   fd6ZI G d7 d8e
jv                        ZJ G d9 d:e       ZK G d; d<e
jv                        ZLd= ZMdXd>ZN G d? d@e
jv                        ZO G dA dBe
jv                        ZP G dC dDe       ZQe e/dEF       G dG dHe$                    ZRe/ G dI dJe*             ZS G dK dLeS      ZTe/ G dM dNeS             ZUe/ G dO dPeS             ZVe e/dQF       G dR dSe$                    ZW G dT dUeSe      ZXg dVZYy)Y    N)Callable)	dataclass)AnyOptional)	LayerNorm   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilingtorch_compilable_check)check_model_inputsis_flash_attention_requestedmaybe_autocast   )Glm4vConfigGlm4vTextConfigGlm4vVisionConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Glm4vRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        Glm4vRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/glm4v/modeling_glm4v.pyr+   zGlm4vRMSNorm.__init__7   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   Tkeepdim)	dtypetor.   float32powmeanrsqrtr1   r0   )r2   hidden_statesinput_dtypevariances       r6   forwardzGlm4vRMSNorm.forward?   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r7   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler0   shaper1   r2   s    r6   
extra_reprzGlm4vRMSNorm.extra_reprF   s*    ))*+6$2G2G1HIIr7   )gư>)__name__
__module____qualname__r+   rF   rK   __classcell__r5   s   @r6   r(   r(   5   s    $;Jr7   r(   c                   ,     e Zd Zddef fdZd Z xZS )Glm4VisionMlpbiasc                    t         |           |j                  | _        |j                  | _        t        j                  | j                  | j                  |      | _        t        j                  | j                  | j                  |      | _        t        j                  | j                  | j                  |      | _	        t        |j                     | _        y NrS   )r*   r+   r3   out_hidden_sizeintermediate_sizer,   Linear	gate_projup_proj	down_projr
   
hidden_actact_fn)r2   configrS   r5   s      r6   r+   zGlm4VisionMlp.__init__K   s    !--!'!7!74#3#3T5K5KRVWyy!1!143I3IPTU4#9#94;K;KRVWV../r7   c                     | j                  | j                  | j                  |            | j                  |      z        S N)r\   r^   rZ   r[   r2   hidden_states     r6   rF   zGlm4VisionMlp.forwardT   s2    ~~dkk$..*FG$,,WcJddeer7   F)rL   rM   rN   boolr+   rF   rO   rP   s   @r6   rR   rR   J   s    0T 0fr7   rR   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )Glm4vVisionPatchEmbedr_   returnNc                 T   t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  | j                  g}t        j                  | j                  | j                  ||      | _	        y )N)kernel_sizestride)
r*   r+   
patch_sizetemporal_patch_sizein_channelsr3   	embed_dimr,   Conv3dproj)r2   r_   rj   r5   s      r6   r+   zGlm4vVisionPatchEmbed.__init__Y   s     ++#)#=#= !--++//$//RIId..K`kl	r7   rC   c                 6   | j                   j                  j                  }|j                  d| j                  | j
                  | j                  | j                        }| j                  |j                  |            j                  d| j                        }|S )Nr:   r=   )	rq   r0   r=   viewrn   rm   rl   r>   ro   )r2   rC   target_dtypes      r6   rF   zGlm4vVisionPatchEmbed.forwardc   s~    yy''--%**  $":":DOOT__
 		-"2"2"2"FGLLRQUQ_Q_`r7   	rL   rM   rN   r%   r+   r.   TensorrF   rO   rP   s   @r6   rg   rg   X   s5    m0 mT mU\\ ell r7   rg   c                   r     e Zd ZU ej                  ed<   d	dededdf fdZdedej                  fdZ	 xZ
S )
Glm4vVisionRotaryEmbeddinginv_freqdimthetarh   Nc                     t         |           || _        || _        d|t	        j
                  d|dt        j                        |z  z  z  }| j                  d|d       y )N      ?r   r9   rs   rz   F
persistent)r*   r+   r{   r|   r.   arangefloatregister_buffer)r2   r{   r|   rz   r5   s       r6   r+   z#Glm4vVisionRotaryEmbedding.__init__o   sY    
%ELLC%++$NQT$TUVZeDr7   seqlenc                     t        j                  || j                  j                  | j                  j                        }t        j
                  || j                        }|S )Ndevicer=   )r.   r   rz   r   r=   outer)r2   r   seqfreqss       r6   rF   z"Glm4vVisionRotaryEmbedding.forwardv   sA    ll6$--*>*>dmmFYFYZC/r7   )g     @)rL   rM   rN   r.   rw   __annotations__intr   r+   rF   rO   rP   s   @r6   ry   ry   l   sB    llEC E ED Ec ell r7   ry   c                   n     e Zd Zd
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )Glm4vVisionPatchMergerr{   context_dimr]   rS   rh   Nc                 x   t         |           t        j                  |||      | _        t        |      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _	        t        j                         | _        t        |   | _        y rU   )r*   r+   r,   rY   rq   r   post_projection_normrZ   r[   r\   GELUact1r
   r^   )r2   r{   r   r]   rS   r5   s        r6   r+   zGlm4vVisionPatchMerger.__init__}   s    IIc3T2	$-cN!3$?yyk=;$?GGI	Z(r7   rc   c                     | j                  |      }| j                  | j                  |            }| j                  | j	                  | j                  |            | j                  |      z        S ra   )rq   r   r   r\   r^   rZ   r[   rb   s     r6   rF   zGlm4vVisionPatchMerger.forward   sY    yy.yy!:!:<!HI~~dkk$..*FG$,,WcJddeer7   rd   )rL   rM   rN   r   strre   r+   r.   rw   rF   rO   rP   s   @r6   r   r   |   sJ    )C )c )s )$ )[_ )fELL fU\\ fr7   r   c                   D     e Zd Zdef fdZdej                  fdZ xZS )Glm4vVisionEmbeddingsr_   c                 f   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        d| _        y )Nr9   bicubic)r*   r+   r_   r3   ro   
image_sizerl   num_patchesnum_positionsr,   	Embeddingposition_embeddinginterpolated_methodr2   r_   r5   s     r6   r+   zGlm4vVisionEmbeddings.__init__   s    ++ ++ ++ OOt>1D!--"$,,t/A/A4>>"R#, r7   rh   c           	      2   | j                   j                  }|j                  d   }|j                  }t	        |t
              r&t        j                  ||t        j                        }|j                  d   }	t        |	dz        }
|j                  |
|
|      j                  ddd      j                  d      j                  |t        j                        }t        j                  t!        t#        |            D cg c]  }||df   j%                  ||          c}      j                  |t        j                        }t        j                  t!        t#        |            D cg c]  }||df   j%                  ||          c}      j                  |t        j                        }|dz   |z  dz  dz
  }|dz   |z  dz  dz
  }t        j&                  ||fd      j                  d      j                  d      }t)        j*                  ||| j,                  dd	
      }|j/                  d      j/                  d      j                  dd      }|j                  |j0                        j                  |j                        }||z   }|S c c}w c c}w )a  
        Forward pass with integrated position encoding adaptation using 2D interpolation.

        Args:
            embeddings: Input embeddings tensor
            lengths (torch.Tensor): Sequence lengths for each image in the batch.
            image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
            h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
            w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.

        Returns:
            torch.Tensor: Embeddings with adapted position encoding added.
        r"   r   r   g      ?r9   r:   r{   Fborder)modealign_cornerspadding_mode)r   r0   rI   r   
isinstancelistr.   tensorlongr   rt   permute	unsqueezer>   r?   catrangelenrepeatstackFgrid_sampler   squeezer=   )r2   
embeddingslengthsimage_shapesh_coordsw_coordspos_embed_weightr3   r   orig_size_sq	orig_sizepos_embed_2ditarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32adapted_pos_embeds                        r6   rF   zGlm4vVisionEmbeddings.forward   s^     2299&,,Q/!(( gt$ll76LG (--a0c)*	!!)YDWQ1Yq\RvU]]R3	 	 99USVW^S_M`al1a4077
Cabee f 
 99USVW^S_M`al1a4077
Cabee f 

 c>X-2Q6c>X-2Q6 {{FF+4>>qAKKAN #$--$T%=%=Uai#

 "9!@!@!C!K!KB!O!W!WXY[\!]2556F6L6LMPPQ[QbQbc  "33
3 b bs   < J' Jrv   rP   s   @r6   r   r      s#    
-0 
-;PUP\P\ ;r7   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )*Rotates half the hidden dims of the input..Nr:   r9   r   )rI   r.   r   xx1x2s      r6   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   qkcossinrh   c                    | j                   }|j                   }| j                         |j                         }} |j                  d      j                         |j                  d      j                         }}| |z  t        |       |z  z   }||z  t        |      |z  z   }|j	                  |      }|j	                  |      }||fS )N)r=   r   r   r   r>   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embeds           r6   apply_rotary_pos_emb_visionr      s     77L77L779aggiqA}}R &&(#--*;*A*A*CC3w;q>C/0G3w;q>C/0Gjj&Gjj&GGr7   rC   n_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)rI   expandreshape)rC   r   batchnum_key_value_headsslenhead_dims         r6   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr7   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr9   r   r   r:   r{   r=   )ptrainingr"   )r   num_key_value_groupsr.   matmul	transposerI   r,   
functionalsoftmaxr?   r>   r=   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r6   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r7   c                        e Zd Zdeddf fdZ	 	 d
dej                  dej                  dej                  dz  deej                  ej                  f   dz  dej                  f
d	Z xZ	S )Glm4vVisionAttentionr_   rh   Nc                    t         |           |j                  | _        |j                  | _        | j                  | j                  z  | _        d| _        t        j                  |j                  |j                  dz  |j                        | _
        t        j                  |j                  |j                  d      | _        | j
                  dz  | _        || _        |j                  | _        d| _        y )Nr"   r   rV   F      )r*   r+   r3   r{   	num_headsr   r   r,   rY   attention_biasqkvrq   r   r_   attention_dropout	is_causalr   s     r6   r+   zGlm4vVisionAttention.__init__  s    %%))DNN2$%!99V//1C1Ca1GfNcNcdIIf00&2D2D5Q	}}d*!'!9!9r7   rC   
cu_seqlensrotary_pos_embposition_embeddingsc                    |j                   d   }| j                  |      j                  |d| j                  d      j	                  dddd      j                  d      \  }}}	|\  }
}t        |||
|      \  }}|j                  dd      j                  d      }|j                  dd      j                  d      }|	j                  dd      j                  d      }	t        j                  | j                  j                  t              }t        | j                        rT|dd  |d d z
  j                         } || |||	fd | j                   | j"                  sdn| j$                  ||||dd|\  }}n|dd  |d d z
  }|||	fD cg c](  }t'        j(                  ||j+                         d	      * }}t-        | D cg c]<  \  }}} || |||fd | j                   | j"                  sdn| j$                  dd
|d   > }}}}t'        j.                  |d	      }|j                  |d      j1                         }| j3                  |      }|S c c}w c c}}}w )Nr   r   r:   r"   r9           F)r   r   r   cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kr  r   )r   r   r   r  )rI   r   r   r   r   unbindr   r   r   r   get_interfacer_   _attn_implementationr   r    maxr   r   r  r.   splittolistzipr   r   rq   )r2   rC   r  r  r  r   
seq_lengthquery_statesr   r   r   r   attention_interface
max_seqlenr   _r   r   splitsr   r   vattn_outputss                          r6   rF   zGlm4vVisionAttention.forward!  s    #((+
HH]#++J4>>2NVVWXZ[]^`abiijkl 	/j, 'S#>|ZY\^a#b j#--a3==a@))!Q/99!<
#--a3==a@(?(M(MKK,,.E)
 (4$QR.:cr?:??AJ0	
  $#'==d6L6L(('' NK" !nz#26GLXZdfrKsAGFGNN$4!<F    #F|  Aq! $	

 $( LL'+}}C$:P:P#
 
 
L   ))La8K!))*b9DDFii,-s   -I?AINN)
rL   rM   rN   r%   r+   r.   rw   rH   rF   rO   rP   s   @r6   r   r     s    0 T " /3HLB||B LLB t+	B
 #5<<#=>EB 
Br7   r   c                        e Zd Zd	 fdZ	 	 d
dej
                  dej
                  dej
                  dz  deej
                  ej
                  f   dz  dej
                  f
dZ xZS )Glm4vVisionBlockrh   Nc                     t         |           t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |d      | _
        y )Nr4   FrV   )r*   r+   r(   r3   rms_norm_epsnorm1norm2r   attnrR   mlpr   s     r6   r+   zGlm4vVisionBlock.__init__g  s\    !&"4"4&:M:MN
!&"4"4&:M:MN
(0	 e4r7   rC   r  r  r  c                     | | j                   | j                  |      f|||d|z   }|| j                  | j                  |            z   }|S )N)r  r  r  )r#  r!  r$  r"  )r2   rC   r  r  r  r   s         r6   rF   zGlm4vVisionBlock.forwardn  sc     &			JJ}%)
!) 3	)

 )
 
 &M1J(KKr7   rh   Nr  )	rL   rM   rN   r+   r.   rw   rH   rF   rO   rP   s   @r6   r  r  f  sq    5 /3HL|| LL t+	
 #5<<#=>E 
r7   r  c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Zd Z xZS )Glm4vTextRotaryEmbeddingrz   Nr_   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       |j                  j                  dg d      | _        y )	N	rope_typedefaultrz   Fr   original_inv_freqmrope_section)      r/  )r*   r+   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr_   rope_parametersr*  compute_default_rope_parametersr   attention_scalingr   clonegetr-  )r2   r_   r   rope_init_fnrz   r5   s        r6   r+   z!Glm4vTextRotaryEmbedding.__init__  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuU#3377Ur7   r   ztorch.deviceseq_lenrh   ztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorr~   r   Nr   r9   rs   r   )r3  r7  getattrr3   num_attention_headsr   r.   r   int64r>   r   )	r_   r   r9  baser<  r   r{   attention_factorrz   s	            r6   r4  z8Glm4vTextRotaryEmbedding.compute_default_rope_parameters  s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r7   c                 ^   | j                   d d d d d f   j                         j                  d|j                  d   dd      }|d d d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }| j                  || j                        }t        j                  ||fd	      }|j                         | j                  z  }|j!                         | j                  z  }	d d d        j#                  |j$                  
      	j#                  |j$                  
      fS # 1 sw Y   AxY w)Nr   r"   r:   mpscpuF)device_typeenabledr9   r   rs   )rz   r   r   rI   r   r   typer   r!   r   apply_mroper-  r.   r   r   r5  r   r>   r=   )
r2   r   position_idsinv_freq_expandedposition_ids_expandedrE  r   embr   r   s
             r6   rF   z Glm4vTextRotaryEmbedding.forward  s`   
 !MM$a*=>DDFMMaQ]QcQcdeQfhjlmn ,Q4] ; A A C'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E$$UD,>,>?E))UEN3C'')d444C'')d444C	5 vvAGGv$cff177f&;;;	5 	5s   B!F##F,c           	          |}|j                  |d      }t        j                  t        |      D cg c]  \  }}||dz      c}}d      }|S c c}}w )Nr:   r   r   )r  r.   r   	enumerate)r2   r   r-  sectionchunksr   chunkresults           r6   rH  z$Glm4vTextRotaryEmbedding.apply_mrope  sQ    W"-69JKXQE!a%LKQST Ls   A
ra   )NNN)rL   rM   rN   r.   rw   r   r$   r+   staticmethodr   r   rH   r   r4  no_gradr   rF   rH  rO   rP   s   @r6   r(  r(    s    llV V" )-+/"*$&*(* t* 
~u$	%	* *> U]]_<  < r7   r(  c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	r   .r   Nr9   r"   r:   r   r   )r.   r   flattenr   s      r6   rotate_half_llmrW    sJ    	
319B	
319B;;Ryb)11"55r7   c                    |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }	}||z  t        |      |z  z   }
||z  t        |      |z  z   }t	        j
                  |
|gd      }
t	        j
                  ||	gd      }|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr:   r9   r   )r   rI   repeat_interleaverW  r.   r   )r   r   r   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passr   r   s               r6   apply_rotary_pos_embr`    sD   $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{u5;<Gs{u5;<G ii&)r2Gii&)r2GGr7   c                   H    e Zd ZdZddededz  f fdZ	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )Glm4vTextAttentionz
    Multi-headed attention from 'Attention Is All You Need' paper.
    and "Generating Long Sequences with Sparse Transformers".
    Nr_   	layer_idxc                    t         |           || _        || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        |j                  | _        | j                  | j                  z  | _	        d| _
        |j                  | _        |j                  | _        | j                  dz  | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )NTr   rV   F)r*   r+   r_   rc  r3   r>  r   r   r   r   r  r  r3  r   r,   rY   q_projk_projv_projo_projr2   r_   rc  r5   s      r6   r+   zGlm4vTextAttention.__init__  sI   "!--33((DNN:#)#=#= $(NNd6N6N$N!!'!9!9%55}}d*ii 0 0$..4==2PW[\ii 0 0$2J2JT]]2Zaefii 0 0$2J2JT]]2Zaefii >@P@PW\]r7   rC   r  r   past_key_valuescache_positionr   rh   c                 T   |j                         \  }}}	| j                  |      }
| j                  |      }| j                  |      }|
j	                  ||d| j
                        j                  dd      }
|j	                  ||d| j
                        j                  dd      }|j	                  ||d| j
                        j                  dd      }|\  }}t        |
|||      \  }
}|'|||d}|j                  ||| j                  |      \  }}t        j                  | j                  j                  t              } || |
|||f| j                  sdn| j                   | j"                  d|\  }}|j%                  ||d      j'                         }| j)                  |      }||fS )Nr:   r"   r9   )r   r   rk  r  )r   r   )sizere  rf  rg  rt   r   r   r`  updaterc  r   r  r_   r  r   r   r  r   r   r   rh  )r2   rC   r  r   rj  rk  r   bszq_lenr  r  r   r   r   r   cache_kwargsr  r   r   s                      r6   rF   zGlm4vTextAttention.forward  s    &**,UA{{=1[[/
{{=1#((eRGQQRSUVW__S%T]]CMMaQRS
#((eRGQQRSUVW&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "))#ub9DDFkk+.L((r7   ra   NNNN)rL   rM   rN   __doc__r$   r   r+   r.   rw   rH   r   
LongTensorr   r   rF   rO   rP   s   @r6   rb  rb    s    
^ ^3: ^. IM.2(,26+)||+) #5<<#=>E+) t+	+)
 +) ((4/+) -.+) 
u||U\\D0%2E2LL	M+)r7   rb  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Glm4vTextMLPc                 *   t         |           || _        t        j                  |j
                  d|j                  z  d      | _        t        j                  |j                  |j
                  d      | _        t        |j                     | _        y )Nr9   FrV   )r*   r+   r_   r,   rY   r3   rX   gate_up_projr\   r
   r]   activation_fnr   s     r6   r+   zGlm4vTextMLP.__init__F  sp    IIf&8&8!f>V>V:V]bc6#;#;V=O=OV[\#F$5$56r7   rC   rh   c                     | j                  |      }|j                  dd      \  }}|| j                  |      z  }| j                  |      S )Nr9   r:   r   )rx  rQ  ry  r\   )r2   rC   	up_statesgates       r6   rF   zGlm4vTextMLP.forwardN  sL    %%m4	#//!/4i 2 24 88	~~i((r7   )rL   rM   rN   r+   r.   FloatTensorrF   rO   rP   s   @r6   rv  rv  E  s'    7)U%6%6 )5;L;L )r7   rv  c                   d    e Zd Zdedef fdZe	 	 	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  dej                  dz  d	edz  d
edz  dej                  dz  de	ej                  e	ej                  ej                  f   dz  f   fd       Z xZS )Glm4vTextDecoderLayerr_   rc  c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr  )r*   r+   r3   rb  	self_attnrv  r$  r(   r   input_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernormri  s      r6   r+   zGlm4vTextDecoderLayer.__init__X  s    !--+FI>'+F,>,>FDWDWX(4V5G5GVM`M`(a%(4V5G5GVM`M`(a%".v/A/AvGZGZ"[r7   NrC   r  r   rI  rj  	use_cacherk  rh   c                    |}	| j                  |      } | j                  d|||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j	                  |      }| j                  |      }|	|z   }|S )N)rC   r  r   rI  rj  r  rk   )r  r  r  r  r$  r  )r2   rC   r  r   rI  rj  r  rk  r   residualr  s              r6   rF   zGlm4vTextDecoderLayer.forwardb  s     !,,]; *4>> 	
' 3)%+)	
 	
q 55mD =0 !55mD///> =0r7   )NNNNFN)rL   rM   rN   r$   r   r+   r   r.   rw   rH   rt  r   re   r}  rF   rO   rP   s   @r6   r  r  W  s    \ \3 \  IM.204(,!&26%||% #5<<#=>E% t+	%
 &&-% % $;% ((4/% 
u  %(9(95;L;L(L"MPT"TT	U% %r7   r  zJ
    Base class for Llava outputs, with hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)Glm4vModelOutputWithPasta[  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    Nlast_hidden_staterj  rC   
attentionsrope_deltas)rL   rM   rN   rs  r  r.   r}  r   rj  r   rC   rH   r  r  rt  r  r7   r6   r  r    sv     37u((4/6$(OUT\(59M5**+d2926Je''(4/6+/K!!D(/r7   r  c                   \     e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdZdZeedZ fd	Z xZS )
Glm4vPreTrainedModelr_   model)imagevideotextTr  r  rj  rC   r  c                 "   t         |   |       t        |t              rod|j                  t        j                  d|j                  dt
        j                        |j                  z  z  z  }t        j                  |j                  |       y y )Nr~   r   r9   rs   )r*   _init_weightsr   ry   r|   r.   r   r{   r   initcopy_rz   )r2   r   rz   r5   s      r6   r  z"Glm4vPreTrainedModel._init_weights  sk    f%f89fllu||Avzz1TYT_T_/`cicmcm/mnoHJJv1 :r7   )rL   rM   rN   r#   r   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr  rb  _can_record_outputsr  rO   rP   s   @r6   r  r    s\    1&*#02DE"3N!"&.(
2 2r7   r  c                        e Zd ZU eed<   dZdgZeedZ	d fdZ
d Zeedej                  d	ej                  d
ee   deez  fd              Z xZS )Glm4vVisionModelr_   )r  r  r  r  rh   c                 F   t         |   |       |j                  | _        |j                  | _        t	        |      | _        t        |      | _        |j                  |j                  z  }t        |dz        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        t%        |j&                  |j(                  |j*                        | _        t/        |j                  |j0                        | _        t        j4                  |j                  |j&                  |j                  |j                        | _        t/        |j                  |j0                        | _        d| _        | j=                          y c c}w )Nr9   )r{   r   r]   r  )rn   out_channelsrj   rk   F)r*   r+   spatial_merge_sizerl   r   r   rg   patch_embedr3   r   ry   r  r,   
ModuleListr   depthr  blocksr   rW   rX   r]   mergerr(   r   post_conv_layernormConv2d
downsamplepost_layernormgradient_checkpointing	post_init)r2   r_   r   r  r5   s       r6   r+   zGlm4vVisionModel.__init__  sA    "(";"; ++/708%%)9)998QGmmuV\\GZ$[!%5f%=$[\,&&F4L4LY_YjYj
 $00B0BH[H[#\ ))**//11,,	
 +6+=+=6CVCVW&+# %\s   %Fc                    g }|D ]s  \  }}}t        j                  |      j                  d      j                  d|      }|j	                  || j
                  z  | j
                  || j
                  z  | j
                        }|j                  dddd      }|j                         }t        j                  |      j                  d      j                  |d      }|j	                  || j
                  z  | j
                  || j
                  z  | j
                        }|j                  dddd      }|j                         }|j                  t        j                  ||gd      j                  |d             v t        j                  |d      }|d d dd f   j                         }| j                  |      }	|	|   j                  d      }
|
|fS )Nr"   r:   r   r9   r   r   )r.   r   r   r   r   r  r   rV  appendr   r   r   r  r  )r2   grid_thwpos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr  s              r6   rot_pos_embzGlm4vVisionModel.rot_pos_emb  s    	SGAq!||A003::2qAH''T,,,''T,,,''	H  ''1a3H'')H||A003::1bAH''T,,,''T,,,''	H  ''1a3H'')HNN5;;(';DKKAqQR)	S* ))G+ AB++-"11-@,W5==a@w&&r7   rC   r  r   c           	      n   | j                  |      }| j                  |      }| j                  |      \  }}t        j                  ||fd      }|j                         |j                         f}t        j                  |dddf   |dddf   z  |dddf         j                  dt        j                  j                         r|j                  nt        j                        }t        j                  |dd	      }|dd |dd z
  j                         }	| j!                  ||	||dddf   j#                  |j$                        |dddf   j#                  |j$                              }| j&                  D ]  }
 |
|f||d
|} | j)                  |      }|j+                  d| j,                  | j,                  |j.                  d         }|j1                  dddd      }| j3                  |      j+                  d| j4                  j6                        }| j9                  |      }t;        ||      S )a\  
        hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
            The final hidden states of the model.
        grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
            The temporal, height and width of feature shape of each image in LLM.

        Returns:
            `torch.Tensor`: hidden_states.
        r:   r   Nr"   r9   r   r   )r"   r   )r   )r  r  r   )r  pooler_output)r  r  r  r.   r   r   r   rY  cumsumjit
is_tracingr=   int32r   padr  r   r>   r   r  r  rt   r  rI   r   r  r_   rW   r  r   )r2   rC   r  r   r  image_type_idsrL  r  r  seqlensblkmerged_hidden_statess               r6   rF   zGlm4vVisionModel.forward  s&    ((700?)-)9)9()C&ii8bA"wwy#'')4,,Xad^hq!tn-LhWXZ[W[n]dd
 %*II$8$8$:(.. e 

 UU:vQ7
ab>JsO3;;=1a4 ##M$8$891a4 ##M$8$89
 ;; 	C%$7 	M	 ++M:%**'')@)@-BUBUVXBY
 &--aAq96;;B@[@[\#{{=9)+.
 	
r7   r&  )rL   rM   rN   r%   r   r  r  r  r   r  r+   r  r   r   r.   rw   r   r   rH   r   rF   rO   rP   s   @r6   r  r    s    )+,)*
8': 9
"\\9
5:\\9
MSTfMg9
	+	+9
  9
r7   r  c                       e Zd ZU eed<   dZdef fdZee	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  dedz  d	e	j                  dz  d
edz  de	j                  dz  dee   deez  fd              Z xZS )Glm4vTextModelr_   )r  c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr  r_   F)r*   r+   pad_token_idpadding_idx
vocab_sizer,   r   r3   embed_tokensr  r   num_hidden_layersr  layersr(   r   normr(  
rotary_embr  r  ri  s      r6   r+   zGlm4vTextModel.__init__A  s     !.. ++LL):):F<N<NPTP`P`ammGLVMeMeGfg)"695g
 !!3!39L9LM	2&A&+# hs   DN	input_idsr   rI  rj  inputs_embedsr  rk  r   rh   c           
      ^   |d u |d uz  rt        d      |r6|4t        j                  j                         st	        | j
                        }|| j                  |      }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }|2|j                  ddd      j                  d|j                  d   d      }n2|j                  dk(  r#|d	   j                  d|j                  d   d      }|j                  dk(  r|j                  d   d
k(  r|d   }
|dd  }nd }
| j
                  |||||
d}t        di |}|}| j                  ||      }| j                   D ]  } ||f||
|||d|}|} | j#                  |      }t%        ||      S )N:You must specify exactly one of input_ids or inputs_embedsr  r   r"   r   r:   r   r9   )N.   )r_   input_embedsr   rk  rj  rI  )rI  )r   rI  rj  rk  r  )r  rj  r  )
ValueErrorr.   r  r  r   r_   r  get_seq_lengthr   rI   r   rt   r   ndimr   r  r  r  r   )r2   r  r   rI  rj  r  r  rk  r   past_seen_tokenstext_position_idsmask_kwargsr   rC   r  decoder_layerlayer_outputss                    r6   rF   zGlm4vTextModel.forwardQ  s    -t";<YZZ 09M9M9O*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 )..q!R8??=CVCVWXCY[]^L!#'	299!\=O=OPQ=RTVWL !l&8&8&;q&@ ,Q'+L !% kk),,.-
 )7;7%"oom,oW![[ 
	*M)*. /-$7 M *M
	* 		-0&++
 	
r7   )NNNNNNN)rL   rM   rN   r$   r   r  r+   r   r   r.   rt  rw   r   r}  re   r   r   rH   r   rF   rO   rP   s   @r6   r  r  <  s         .2.204(,26!%26Q
##d*Q
 t+Q
 &&-	Q

 Q
 ((4/Q
 $;Q
 ((4/Q
 -.Q
 
(	(Q
  Q
r7   r  c                       e Zd ZU dZi ZdZeed<   ddgZ fdZ	d Z
d Z	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  deej                  ej                  f   f
dZee	 ddej&                  dej                  d	z  dee   deez  fd              Zee	 ddej&                  dej                  d	z  dee   deez  fd              Z	 	 d d
ej                  dej&                  dej&                  d	z  dej&                  d	z  fdZee	 	 	 	 	 	 	 	 	 	 	 d!d
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  dej&                  d	z  dej                  d	z  dej&                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  dee   deez  fd              Z xZS )"
Glm4vModelr  Fr_   r  r  c                     t         |   |       t        j                  |j                        | _        t        j                  |j                        | _        d | _	        | j                          y ra   )r*   r+   r  _from_configvision_configvisualr  text_configlanguage_modelr  r  r   s     r6   r+   zGlm4vModel.__init__  sU     &33F4H4HI,99&:L:LM 	r7   c                 6    | j                   j                         S ra   )r  get_input_embeddingsrJ   s    r6   r  zGlm4vModel.get_input_embeddings  s    ""7799r7   c                 :    | j                   j                  |       y ra   )r  set_input_embeddingsr2   r   s     r6   r  zGlm4vModel.set_input_embeddings  s    007r7   Nr  image_grid_thwvideo_grid_thwr   rh   c           
      F   | j                   j                  j                  }| j                   j                  }| j                   j                  }| j                   j
                  }g }	|h||c|}
|t        j                  |
      }t        j                  d|j                  d   |j                  d   |j                  |j                        }d\  }}d}|j                  |
j                        }t        |
      D ]  \  }}|||   dk(     }|j                         }g }d}|D ]T  }||k(  rd}n||k(  rd}||k(  r|s|j                  d       +||k(  r|r|j                  d	       D|j                  d
       V g }t!        j"                  t        |      d       D ]7  \  }}t%        |      }|d   d   }|d   d   dz   }|j                  |||f       9 g }d}|D ]:  \  }}}t'        |      dkD  r|d   j)                         dz   nd}|dk(  rQ||   d   ||   d   ||   d   }!} }|j+                         | j+                         |z  |!j+                         |z  }$}#}"t        j,                  |"      j/                  dd      j1                  d|#|$z        j3                         }%t        j,                  |#      j/                  ddd      j1                  |"d|$      j3                         }&t        j,                  |$      j/                  ddd      j1                  |"|#d      j3                         }'|j                  t        j4                  |%|&|'g      |z          |dz  }d}|d	k(  rb|||   d   ||   d   }!} }|| j+                         |z  |!j+                         |z  }$}#}"t7        |"      D ]  }(t        j8                  |(      j/                  dd      j1                  d|#|$z        j3                         }%t        j,                  |#      j/                  ddd      j1                  dd|$      j3                         }&t        j,                  |$      j/                  ddd      j1                  d|#d      j3                         }'|j                  t        j4                  |%|&|'g      |z           |dz  }|||   d   k\  r|dz  }d}|dz  }||z
  })|j                  t        j,                  |)      j/                  dd      j1                  dd      |z          d}= t        j:                  |d      j=                  dd      }*|*j                  |j                        |d|||   dk(  f<   |	j                  |*j)                         dz   t'        |
|         z
          t        j8                  |	|j                        j?                  d      }	||	fS ||jA                         jC                  d      dz
  }|jE                  |dk(  d       |j?                  d      j1                  ddd      j                  |j                        }|j)                  dd      d   j)                  dd      d   }+|+dz   |j                  d   z
  }	||	fS t        j,                  |j                  d   |j                        j/                  ddd      j1                  d|j                  d   d      }t        jF                  |j                  d   dg|j                  |j                        }	||	fS )aU  
        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

        Explanation:
            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
            Examples:
                input_ids: [T T T T T], here T is for text.
                temporal position_ids: [0, 1, 2, 3, 4]
                height position_ids: [0, 1, 2, 3, 4]
                width position_ids: [0, 1, 2, 3, 4]

            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
            and 1D rotary position embedding for text part.
            Examples:
                Temporal (Time): 3 patches, representing different segments of the video in time.
                Height: 2 patches, dividing each frame vertically.
                Width: 2 patches, dividing each frame horizontally.
                We also have some important parameters:
                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will be have a difference of 50 in the temporal position IDs.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [101, 102, 103, 104, 105]
                text height position_ids: [101, 102, 103, 104, 105]
                text width position_ids: [101, 102, 103, 104, 105]
                Here we calculate the text start position_ids as the max vision position_ids plus 1.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
        r   r   r"   r=   r   )r   r   FTr  r  r  c                     | d   S )Nr"   r  )r   s    r6   <lambda>z+Glm4vModel.get_rope_index.<locals>.<lambda>"  s    [\]^[_ r7   r:   r9   r   .r  r;   r   )$r_   r  r  image_token_idvideo_start_token_idvideo_end_token_idr.   	ones_liker/   rI   r=   r   r>   rN  r  r  	itertoolsgroupbyr   r   r  itemr   rt   r   rV  r   r   r   r   r   r   r   r  masked_fill_zeros),r2   r  r  r  r   r  r  r  r  mrope_position_deltastotal_input_idsrI  image_indexvideo_indexvideo_group_indexr   input_tokensinput_token_typevideo_check_flgtokeninput_type_groupr   groupstart_index	end_indexllm_pos_ids_listvideo_frame_nummodality_type	start_idxend_idxst_idxr  r  r  
llm_grid_t
llm_grid_h
llm_grid_wt_indexh_indexw_indext_idxtext_lenllm_positionsmax_position_idss,                                               r6   get_rope_indexzGlm4vModel.get_rope_index  sL   v "[[66II33#{{??![[;; " n&@ND^'O%!&!A ::""oo ''L (,$K !+../E/EFN )/ : W`9%nQ&71&<=	(//1#% "') 8E 44*."44*/.(//8.0_(//8(//78 $& "+"3"3I>N4OQ_"` KJC KE"'(1+K %b	!q 0I$++S+y,IJ	K $& "#9I 7,5M9g?BCS?TWX?X-b1557!;^_F$/*;7:*;7:*;7:  1 FFHFFH(::FFH(:: 1;J
 #(,,z":"?"?A"F"M"MbR\_iRi"j"r"r"t"',,z":"?"?2q"I"P"PQ[]_ak"l"t"t"v"',,z":"?"?1b"I"P"PQ[]gik"l"t"t"v(//Wgw<W0X[a0ab#q(*+&'1+*;7:*;7:  1 FFH(::FFH(:: 1;J
 &+:%6 gE&+ll5&9&>&>r1&E&L&LRQ[^hQh&i&q&q&sG&+ll:&>&C&CAr1&M&T&TUVXZ\f&g&o&o&qG&+ll:&>&C&CAq"&M&T&TUVXbdf&g&o&o&qG,33EKK'SZ@[4\_e4efg *Q.),{0KA0NN'1,K01-'1, $+Y#6(//X0F0K0KAr0R0Y0YZ[]_0`ci0ij*+o7,r !&		*: B J J1b Q?L?O?OP\PcPc?dS!^A%6!%;;<%,,]->->-@1-Ds?[\K]G^-^_oW`p %*LL1FyO_O_$`$j$jkl$m!!666)-224;;B?!C)).A*=qA+55a8??2rJMMnNcNcd#/#3#3Au#3#Ea#H#L#LRY]#L#^_`#a (81(<~?S?STV?W(W%  !666 LL!3I<L<LMT!Q^VAyq126 
 ).__Q'+$++#//)%  !666r7   pixel_values_videosr   c                 L   |j                  | j                  j                        }g }|D ]j  \  }}}t        j                  d|j                         |j                         g      j                  d      j                  |d      }|j                  |       l t        j                  |d      }	 | j                  |f|	dd|}
|j                  d      | j                  j                  dz  z  j                         }t        j                  |
j                  |      }||
_        |
S )[  
        pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input videos.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        r"   r   r   Tr  return_dictr:   r9   )rG  r  r=   r.   r   r   r   r   r  r   prodr  r  r  r  )r2   r!  r  r   temp_frames_hwr  r  r  repeated_rowflattened_video_grid_thwvision_outputssplit_sizesvideo_embedss                r6   get_video_featureszGlm4vModel.get_video_features}  s    266t{{7H7HI% 	0GAq! <<AFFHaffh(?@JJ1MTTUVXYZL!!,/	0 $)99^#C $
*BPT
X^
 &**2.$++2P2PRS2SS[[]{{>#?#?M'3$r7   pixel_valuesc                 <   |j                  | j                  j                        } | j                  |f|dd|}|j                  d      | j                  j                  dz  z  j                         }t        j                  |j                  |      }||_        |S )T  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        Tr$  r:   r9   )	rG  r  r=   r&  r  r  r.   r  r  )r2   r.  r  r   r*  r+  image_embedss          r6   get_image_featureszGlm4vModel.get_image_features  s     $(():):;$\gNX\g`fg%**2.$++2P2PRS2SS[[]{{>#?#?M'3$r7   r  image_featuresvideo_featuresc                 T   || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n2|| j                  j                  k(  }|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|At        ||   j                         |j                         k(  d| d|j                  d           |j                         }|j                  d      j                  |      j                  |j                        }|At        ||   j                         |j                         k(  d| d|j                  d           ||fS )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r  r:   z6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: )r  r.   r   r_   r  r   r   allvideo_token_idsumr   	expand_asr>   r   numelrI   )	r2   r  r  r3  r4  special_image_maskspecial_video_maskn_image_tokensn_video_tokenss	            r6   get_placeholder_maskzGlm4vModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!; "+dkk.H.H!H!*dkk.H.H!H+//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|d}~
 ,//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|d}~ "#555r7   rI  rj  r  rk  c           
      Z   |du |duz  rt        d      | | j                         |      }|| j                  ||d      j                  }t	        j
                  |d      j                  |j                  |j                        }| j                  |||      \  }}|j                  ||      }|| j                  ||	d      j                  }t	        j
                  |d      j                  |j                  |j                        }| j                  |||      \  }}|j                  ||      }|t        |t              s|n|d	   }||j                  d
k(  rtt	        j                  |dddf   dd      }|j                  j                   r?|t	        j"                  |j                        j$                  z  }d|z
  j'                         }t)               xr2 |duxr |j*                  d   dk7  xs |duxr |j*                  d   dk7  }t)                xr) |duxr |d   dk(  xs |du xs |j-                         dk(  }|s|s| j.                   | j1                  |||	|      \  }}
|
| _        n|j*                  \  }}}|+|d   | j.                  z   j                  |j                        nd}t	        j2                  ||j                        }|j5                  dd      j7                  |d      }|#|j9                  ||j*                  d   z  d      }|j;                  |      }|j=                  d      j7                  ddd      } | j>                  dd|||||d|}tA        |jB                  |jD                  |jF                  |jH                  | j.                        S )a  
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
        Nr  T)r%  r   r   )r3  )r4  full_attentionr  r"   r9   )dim1dim2r~   )r   r  r:   r   )r  rI  r   rj  r  rk  )r  rj  rC   r  r  r  )%r  r  r2  r  r.   r   r>   r   r=   r?  masked_scatterr-  r   dictr  diagonalis_floating_pointfinfominr   r   rI   r  r  r   r   rt   r   rY  addr   r  r  r  rj  rC   r  )r2   r  r   rI  rj  r  r.  r!  r  r  r  rk  r   r1  
image_maskr  r,  
video_maskattention_mask_tensorprefill_compiled_stageprefill_noncompiled_stage
batch_sizer  deltaoutputss                            r6   rF   zGlm4vModel.forward  s   2 -t";<YZZ 7D557	BM#22<]a2bppL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMJ)88\RM*223Fdh2iwwL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMAz)88\RM&0&FN[kLl " %05J5O5OST5T(-7LQPQT7RYZab(c%(..@@,AEKKPePkPkDlDpDp,p)-03H-H,M,M,O) &>%? &$&B9??1+=+B O!-M-2E2Ea2HA2M # -E,F(F )t+Fq0AQ0F V#t+T/M/M/OST/T & '*CHXHXH`,0,?,?""#8	 -@ -)k $/  -:,?,?)
J &1 $A&)9)99==m>R>RS 
  %||J}?S?ST+00B7>>z2N!-!33J%++a.4PVW3XE+//6+55a8??2rJ%$%% 
%)+')
 
 (%77#33!//))((
 	
r7   rr  ra   r  )NNNNNNNNNNN)rL   rM   rN   r  _checkpoint_conversion_mappingaccepts_loss_kwargsr#   r   r  r+   r  r  r.   rt  rw   rH   r   r   r   r}  r   r   r   r-  r2  r?  r   r  rF   rO   rP   s   @r6   r  r    s"   %'"02DE:8
 .22626.2|7##d*|7 ((4/|7 ((4/	|7
 t+|7 
u||U\\)	*|7|  37".. ((4/ +,	
 
+	+  8  37'' ((4/ +,	
 
+	+  0 4837(6##(6 (((6 ))D0	(6
 ))D0(6T  .2.204(,26,08<2626/326g
##d*g
 t+g
 &&-	g

 g
 ((4/g
 llT)g
 #..5g
 ((4/g
 ((4/g
 %%,g
 ((4/g
 +,g
 
)	)g
  g
r7   r  zQ
    Base class for Glm4v causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	Glm4vCausalLMOutputWithPasta  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    Nlosslogitsrj  rC   r  r  )rL   rM   rN   rs  rW  r.   r}  r   rX  rj  r   rC   rH   r  r  rt  r  r7   r6   rV  rV  G  s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/6+/K!!D(/r7   rV  c                        e Zd Zi ZddiZdZ fdZd Zd Ze		 dde
j                  d	e
j                  dz  d
ee   deez  fd       Ze		 dde
j                  de
j                  dz  d
ee   deez  fd       Zee		 	 	 	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j(                  dz  de
j                  dz  dedz  de
j                  dz  de
j                  dz  de
j(                  dz  de
j                  dz  de
j                  dz  d	e
j                  dz  de
j                  dz  dee
j(                  z  d
ee   deez  fd              Z	 	 	 	 	 	 	 	 	 	 	 d  fd	Z	 dde
j                  dz  de
j(                  dz  dee
j(                  e
j(                  f   fdZ	 	 	 d!dedede
j                  dz  dee
j                  eeef   f   fdZ xZ S )"Glm4vForConditionalGenerationzlm_head.weightz(model.language_model.embed_tokens.weightFc                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFrV   )r*   r+   r  r  r,   rY   r  r3   r  lm_headr  r   s     r6   r+   z&Glm4vForConditionalGeneration.__init__j  sS     '
yy!3!3!?!?ASASA^A^ejkr7   c                 6    | j                   j                         S ra   )r  r  rJ   s    r6   r  z2Glm4vForConditionalGeneration.get_input_embeddingsq  s    zz..00r7   c                 :    | j                   j                  |       y ra   )r  r  r  s     r6   r  z2Glm4vForConditionalGeneration.set_input_embeddingst  s    

''.r7   Nr!  r  r   rh   c                 @     | j                   j                  d||d|S )r#  )r!  r  r  )r  r-  )r2   r!  r  r   s       r6   r-  z0Glm4vForConditionalGeneration.get_video_featuresw  s/     -tzz,, 
 3N
V\
 	
r7   r.  r  c                 @     | j                   j                  d||d|S )r0  )r.  r  r  )r  r2  )r2   r.  r  r   s       r6   r2  z0Glm4vForConditionalGeneration.get_image_features  s'     -tzz,,p,Wepioppr7   r  r   rI  rj  r  labelsrk  logits_to_keepc                     | j                   d||||	|
|||||d
|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|2| j                  ||| j                  j                  j                        }t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration

        >>> model = Glm4vForConditionalGeneration.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")
        >>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

        >>> messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
        ```)
r  r.  r!  r  r  rI  r   rj  r  rk  r   N)rX  ra  r  )rW  rX  rj  rC   r  r  r  )r  r   r   slicer\  loss_functionr_   r  r  rV  rj  rC   r  r  )r2   r  r   rI  rj  r  ra  r.  r!  r  r  rk  rb  r   rR  rC   slice_indicesrX  rW  s                      r6   rF   z%Glm4vForConditionalGeneration.forward  s    x $** 
% 3))%)+')
 
  
 9C>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD*#33!//))++
 	
r7   c                 f    t        |   |f|||||||	|
|||d|}d |d<   |s|r
d |d<   d |d<   |S )N)rj  r   r  rk  rI  r.  r!  r  r  r  is_first_iterationrI  r.  r!  )r*   prepare_inputs_for_generation)r2   r  rj  r   r  rk  rI  r  r.  r!  r  r  rh  r   model_inputsr5   s                  r6   ri  z;Glm4vForConditionalGeneration.prepare_inputs_for_generation  ss    $ w<
+)')%% 3))1
 
" (,^$!i+/L(26L./r7   c                    || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }nK|| j                  j                  k(  }|| j                  j                  k(  }|| j                  j                  k(  }t        j                  |j                         |j                         z
  d      }|dkD  }|| z  }|j                  d      }	|j                  d      }
|	|
fS )aa  
        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

        Returns:
            image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
            video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
        r  ).r   r"   r   r   )r  r.   r   r_   image_start_token_idr   r   r  r  r  r   r8  )r2   r  r  is_imageis_video_startis_video_endvideo_levelinside_videostandalone_imagesimage_countsvideo_countss              r6   _get_image_nums_and_video_numsz<Glm4vForConditionalGeneration._get_image_nums_and_video_nums  s   $ $.4,,.LL!A!A\i\p\pq H .4,,.LL!A!A\i\p\pq N .4,,.LL!?!?uzzZgZnZno L !DKK$D$DDH&$++*J*JJN$(F(FFL ll>#5#5#7,:J:J:L#LRST"Q %6 ),,,3%))a)0\))r7   expand_sizeis_encoder_decoderc                      dk(  rfS g d fd}fd} |      j                  d       |      |r*j                  d      t        d       |d         d<   fS )	Nr"   )r.  r  r!  r  second_per_grid_tsc                 4   j                  dd       }j                  dd       }j                  j                  dd             \  }}d }| D ]9  }|dk(  rct        j                  |t	        |            }|D cg c]'  }t        j
                  |d      j                         ) }	} || |   |	
	      | |<   l|dk(  rt	        |      }	 || |   |	
	      | |<   |d
k(  rct        j                  |t	        |            }|D cg c]'  }t        j
                  |d      j                         ) }	} || |   |	
	      | |<   |dk(  rt	        |      }	 || |   |	
	      | |<   |dk(  s  || |   t	        |      
	      | |<   < | S c c}w c c}w )Nr  r  r  )r  c                     t        j                  | |      }|gdg| j                         dz
  z  z   }t        j                  |D cg c]  } |j                  |  c}d      }|S c c}w )Nr"   r   r   )r.   r  r{   r   r   )r   r   repeat_timessamplesrepeat_argssamplerR  s          r6   _repeat_interleave_sampleszGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual.<locals>._repeat_interleave_sampleso  sa    ++a1+nsaeegk/BBg#VFMFMM;$?#V\]^ $Ws   A&r.  r"   r   )r   r|  r!  ry  )r7  ru  r.   r  r   r&  r8  )dict_to_expandr  r  
image_nums
video_numsr  r   r}  r  r   rv  r  model_kwargsr2   s             r6   "_expand_dict_for_generation_visualzgGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visualh  s   )--.>EN)--.>EN%)%H%H)9)9/4)P &I &"J
 & .(#kk.$z:JKGMTU6uzz&a8<<>UGU*D&s+W;+N3' ,,":.G*D&s+W;+N3' 11#kk.$z:JKGMTU6uzz&a8<<>UGU*D&s+W;+N3' ,,":.G*D&s+W;+N3' 00*D&s+T*5ET_+N3'7< "!3 V Vs   =,F,Fc                     | D ]J  }|dk7  s	| |   t        | |   t        j                        s-|vs2| |   j                  d      | |<   L | S )Nrk  r   r   )r   r.   rw   rY  )r  r   rv  visual_keyss     r6   _expand_dict_for_generationz`Glm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation  sl    % d++&s+7">##6E;.*8*=*O*OP[ab*O*cN3'd "!r7   r   r   encoder_outputszMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)rY  r7  r  )r2   rv  rw  r  r  r  r  r  s   `` ``  @r6   _expand_inputs_for_generationz;Glm4vForConditionalGeneration._expand_inputs_for_generationW  s     !l**w+	"Z		" :,G !33KQ3GI2<@ 12: !pqq.I,WhJi.jL*+,&&r7   ra   )NNNNNNNNNNNr   )NNNNNTNNNNF)r"   FN)!rL   rM   rN   rS  _tied_weights_keysrT  r+   r  r  r   r.   r}  rt  r   r   rH   r   r-  r2  r   rw   r   r   rV  rF   ri  ru  re   rE  r   r   r  rO   rP   s   @r6   rZ  rZ  d  s   %'"*,VW1/  37
"..
 ((4/
 +,	

 
+	+
 
   37q''q ((4/q +,	q
 
+	+q q  .2.204(,26*.,08<262626-.Y
##d*Y
 t+Y
 &&-	Y

 Y
 ((4/Y
   4'Y
 llT)Y
 #..5Y
 ((4/Y
 ((4/Y
 ((4/Y
 ell*Y
 +,Y
 
,	,Y
  Y
|   )\ .26*##d*6* ||d*6* 
u||U\\)	*	6*t #(-1	U'U' !U' ##d*	U' 
uc3h/	0U'r7   rZ  )rZ  r  r  r  r  )r  )r"   )Zr  collections.abcr   dataclassesr   typingr   r   r.   torch.nnr,   torch.nn.functionalr   r   r    r	   r  activationsr
   cache_utilsr   r   
generationr   integrationsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r    r!   configuration_glm4vr#   r$   r%   Moduler(   rR   rg   ry   r   r   r   rw   rH   r   r   r   r   r   r   r  r(  rW  r`  rb  rv  r  r  r  r  r  r  rV  rZ  __all__r  r7   r6   <module>r     s  (  $ !        & ! . ) 7 / B 9 ` ` K F &  ^ ] P P Y'J299 J (J(fBII fBII (  fRYY f"HBII HV(||+0<<>Cll
5<<%&	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%4P299 Pf1 6Jryy JZ6%PE) E)P)299 )$16 1h 
0{ 0 0$ 2? 2 20}
+ }
@ g
) g
 g
T \
% \
 \
~ 
0+ 0 0.H'$8/ H'V
 xr7   