
    i                        d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
m	c mZ d dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;  e4jx                  e=      Z> G d de	j~                        Z@ G d de	j~                        ZA G d dej                  j~                        ZB G d de	j~                        ZC G d d e	j                        ZE ed!       G d" d!e	j~                               ZF G d# d$e	j~                        ZGd%ej                  d&ej                  d'ej                  d(eIej                  ej                  f   fd)ZJd*ej                  d+eKd(ej                  fd,ZL	 d_d-e	j~                  d.ej                  d/ej                  d0ej                  d1ej                  dz  d2eMd3eMfd4ZN	 d_d-e	j~                  d.ej                  d/ej                  d0ej                  d1ej                  dz  d2eMd3eMfd5ZO G d6 d7e	j~                        ZP G d8 d9e!      ZQe2 G d: d;e-             ZRe2 G d< d=eR             ZS G d> d?eRe      ZTe e2d@A       G dB dCe'                    ZU G dD dEej                  j~                        ZV G dF dGe	j~                        ZWdH ZX G dI dJe	j~                        ZYdKej                  d.ej                  fdLZZd.ej                  d/ej                  dKej                  d(eIej                  ej                  f   fdMZ[ G dN dOe	j~                        Z\ G dP dQe	j~                        Z] G dR dSe!      Z^ G dT dUe	j~                        Z_ G dV dWe	j~                        Z` G dX dYe	j~                        Za G dZ d[eR      Zb G d\ d]eRe      Zcg d^Zdy)`    N)Callable)	dataclass)Optional)Llama4VisionConfig   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)check_model_inputsmaybe_autocast   )Llama4ConfigLlama4TextConfigc                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Llama4TextExpertsconfigc                    t         |           |j                  | _        |j                  | _        |j
                  | _        | j                  | _        t        j                  t        j                  | j                  | j
                  d| j                  z              | _        t        j                  t        j                  | j                  | j                  | j
                  f            | _        t        |j                     | _        y N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchzerosgate_up_projempty	down_projr	   
hidden_actact_fnselfr(   	__class__s     t/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/llama4/modeling_llama4.pyr-   zLlama4TextExperts.__init__8   s    !33!'!9!9!--00LLT5E5EtGWGWYZ]a]l]lYl)mnekk43C3CT__VZVfVf2g&hiV../    hidden_statesreturnc                 v   |j                  | j                  j                  d   d| j                        }t	        j
                  || j                        }|j                  dd      \  }}t	        j
                  || j                  |      z  | j                        }|j                  d| j                        }|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r   r+   dim)	viewr7   shaper1   r5   bmmchunkr;   r9   )r=   rA   gate_upgateupnext_statess         r?   forwardzLlama4TextExperts.forwardB   s     &**4+<+<+B+B1+Er4K[K[\))M4+<+<====+biidkk$&7!7$..I!&&r4+;+;<r@   )	__name__
__module____qualname__r%   r-   r5   TensorrO   __classcell__r>   s   @r?   r'   r'   7   s+    0/ 0U\\ ell r@   r'   c                   &     e Zd Zd fd	Zd Z xZS )Llama4TextMLPc                 f   t         |           ||j                  }|| _        t	        j
                  |j                  |d      | _        t	        j
                  |j                  |d      | _        t	        j
                  ||j                  d      | _	        t        |j                     | _        y NFbias)r,   r-   r0   r(   r3   Linearr1   	gate_projup_projr9   r	   r:   activation_fn)r=   r(   r0   r>   s      r?   r-   zLlama4TextMLP.__init__Y   s    $ & 8 86#5#57HuUyy!3!35FUS#4f6H6HuU#F$5$56r@   c                     | j                  | j                  |            | j                  |      z  }| j                  |      S N)r_   r]   r^   r9   )r=   xr9   s      r?   rO   zLlama4TextMLP.forwarde   s7    &&t~~a'89DLLOK	~~i((r@   ra   rP   rQ   rR   r-   rO   rT   rU   s   @r?   rW   rW   X   s    
7)r@   rW   c                   8     e Zd Zddef fdZd Zd Zd Z xZS )Llama4TextL2Normepsc                 0    t         |           || _        y ra   )r,   r-   rf   )r=   rf   r>   s     r?   r-   zLlama4TextL2Norm.__init__k   s    r@   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S Nr+   rD   T)keepdimr5   rsqrtpowmeanrf   r=   rb   s     r?   _normzLlama4TextL2Norm._normo   4    5;;quuQx}}R}>IJJJr@   c                 ^    | j                  |j                               j                  |      S ra   )rp   floattype_asro   s     r?   rO   zLlama4TextL2Norm.forwardr   s"    zz!'')$,,Q//r@   c                      d| j                    S )Nzeps=rf   r=   s    r?   
extra_reprzLlama4TextL2Norm.extra_repru   s    dhhZ  r@   )gư>)	rP   rQ   rR   rs   r-   rp   rO   rx   rT   rU   s   @r?   re   re   j   s    E K0!r@   re   c                   2     e Zd Zd fd	Zd Zd Zd Z xZS )Llama4TextRMSNormc                     t         |           || _        t        j                  t        j                  |            | _        y)z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r,   r-   rf   r3   r4   r5   onesweight)r=   r1   rf   r>   s      r?   r-   zLlama4TextRMSNorm.__init__z   s0     	ll5::k#:;r@   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S ri   rk   ro   s     r?   rp   zLlama4TextRMSNorm._norm   rq   r@   c                 |    | j                  |j                               j                  |      }|| j                  z  S ra   )rp   rs   rt   r}   )r=   rb   outputs      r?   rO   zLlama4TextRMSNorm.forward   s0    AGGI&..q1##r@   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler}   rH   rf   rw   s    r?   rx   zLlama4TextRMSNorm.extra_repr   s'    ))*+6$((<<r@   )gh㈵>)rP   rQ   rR   r-   rp   rO   rx   rT   rU   s   @r?   rz   rz   y   s    <K$=r@   rz   c                   (     e Zd Z fdZ fdZ xZS )Llama4Routerc                     t         |   |j                  |j                  d       |j                  | _        |j
                  | _        y rY   )r,   r-   r1   r.   r/   num_experts_per_toktop_kr<   s     r?   r-   zLlama4Router.__init__   s>    ++V-E-EER!33//
r@   c                 t   t         |   |      }t        j                  || j                  d      \  }}t        j
                  |t        d            j                  d||      }t        j                  j                  j                  |j                               j                  |j                        }||fS )Nr#   rE   z-inf)r,   rO   r5   topkr   	full_likers   scatter_r3   
functionalsigmoidtodtype)r=   rA   router_logitsrouter_top_valuerouter_indicesrouter_scoresr>   s         r?   rO   zLlama4Router.forward   s    6+0::mTZZUV+W(.uV}ENNqR`brs++33M4G4G4IJMMmNaNabm++r@   rc   rU   s   @r?   r   r      s    0
, ,r@   r   Llama4TextMoec                   $     e Zd Z fdZd Z xZS )r   c                     t         |           |j                  | _        |j                  | _        |j                  | _        t        |      | _	        t        |      | _        t        |      | _        y ra   )r,   r-   r   r   r1   
hidden_dimr.   r/   r'   expertsr   routerrW   shared_expertr<   s     r?   r-   zLlama4TextMoe.__init__   s[    //
 ,,!33(0"6**62r@   c                    |j                  d| j                        }| j                  |      \  }}|j                  |j                  d   d      }||j                  dd      j                  dd      z  }| j                  |      }| j                  |      }|j                  |j                  |j                  d   d|j                  d         j                  d             ||fS )NrD   r#   r   rE   )
reshaper   r   repeatrH   	transposer   r   add_sum)r=   rA   r   r   	routed_in
routed_outouts          r?   rO   zLlama4TextMoe.forward   s    %--b$//B'+{{='A$}!(()<)<Q)?C	 7 71 = E Eb! LL	\\),
  /##M$7$7$:B
@P@PQS@TUYY^_Y`aM!!r@   rc   rU   s   @r?   r   r      s    3"r@   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )Llama4TextRotaryEmbeddinginv_freqNr(   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)r,   r-   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr(   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r=   r(   devicerope_init_fnr   r>   s        r?   r-   z"Llama4TextRotaryEmbedding.__init__   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr@   r   ztorch.deviceseq_lenrB   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimN      ?r   r+   r   )r   r   )	r   getattrr1   num_attention_headsr5   arangeint64r   rs   )r(   r   r   baserF   attention_factorr   s          r?   r   z9Llama4TextRotaryEmbedding.compute_default_rope_parameters   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r@   c                 v   | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                  |j
                        |z  j                  dd      }t        j                  t        j                  |      |      }|| j                  z  }d d d        |S # 1 sw Y   S xY w)	Nr   rD   r#   mpscpuF)device_typeenabledr+   )r   rs   expandrH   
isinstancer   typestrr"   r   r   r5   polar	ones_liker   )r=   rb   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           r?   rO   z!Llama4TextRotaryEmbedding.forward   s    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	;&))!((36KKVVWXZ[\EEOOE$:EBI!D$:$::I	;
 	;
 s   =A'D..D8ra   )NNN)rP   rQ   rR   r5   rS   __annotations__r%   r-   staticmethodr   intr   rs   r   no_gradr   rO   rT   rU   s   @r?   r   r      s    llV/ V  *.+/"* 4'*(* t* 
~u$	%	* *< U]]_
  
r@   r   xqxkr   rB   c           	      &   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        j
                  ||d d d d d d d f   z        j                  d      }t        j
                  ||d d d d d d d f   z        j                  d      }|j                  |       |j                  |      fS )NrD   r+   r   )r5   view_as_complexrs   r   rH   view_as_realflattenrt   )r   r   r   xq_xk_xq_outxk_outs          r?   apply_rotary_embr      s    
 

 2
 2 2 IBHHSbM I2 Iq I
JC


 2
 2 2 IBHHSbM I2 Iq I
JCi1dA&> >?GGJFi1dA&> >?GGJF>>"v~~b111r@   rA   n_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r#   N)rH   r   r   )rA   r   batchnum_key_value_headsslenr   s         r?   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr@   modulequerykeyvalueattention_maskscalingdropoutc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr+   r   rD   rE   ptrainingr#   )r   num_key_value_groupsr5   matmulr   rH   r3   r   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r?   eager_attention_forwardr     s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2(>L==((6??([L,,|\:K''1-88:K$$r@   c                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            | j
                  dz  z  }
|#|d d d d d d d |j                  d   f   }|
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )	Nr+   r         r   rD   rE   r   r#   )r   r   r5   r   r   r   rH   r3   r   r   r   r   r   r   s                r?   vision_eager_attention_forwardr  (  s     3 ; ;<JUF$?$?@L<<z';';Aq'ABV__VZEZZL!$Q1.D
0@0@0D.D%DE#k1==((2(>L==((6??([L,,|\:K''1-88:K$$r@   c                   2    e Zd ZdZdef fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr(   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  | _        |j                  |j                  z  | _	        |j                  | _        | j                  dz  | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        d| _        |j                   |   | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  | j                  z  |j
                  |j(                        | _        | j                  j2                  r(| j"                  rt5        |j6                        | _        y y y )Nr   r   TrZ   )r,   r-   r(   	layer_idxr   r1   r   r   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper3   r\   attention_biasq_projk_projv_projo_projuse_qk_normre   rms_norm_epsqk_normr=   r(   r  r>   s      r?   r-   zLlama4TextAttention.__init__E  s   "
F4F4F&JdJd4de#)#=#= $*$>$>&B\B\$\!#)#=#= }}d* ++!--'-'E'E$!'!9!9--i8ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 ;;""t}}+F,?,?@DL (5"r@   NrA   position_embeddingsr   past_key_valuescache_positionr   rB   c                 H   |j                   d d }g |d| j                  }| j                  |      j                  |      }	 | j	                  |      j                  g |d| j                   }
| j                  |      j                  |      j                  dd      }| j                  r)t        |	|
|j                  |	j                              \  }	}
t        | d      r"| j                  |	      }	| j                  |
      }
| j                  r| j                  st        j                  t        j                   |j#                         dz   | j$                  z              | j&                  z  dz   }|j                  d|d   ddf      j)                  g |dd      }|	|z  j                  |	j*                        }	|	j                  dd      }	|
j                  dd      }
|%d|i}|j-                  |
|| j.                  |      \  }
}t1        j2                  | j4                  j6                  t8              } || |	|
||f| j:                  sdn| j<                  | j>                  d|\  }} |j@                  g |d jC                         }| jE                  |      }||fS )	NrD   r#   r+   r  r   r          )r   r   )#rH   r   r  rG   r  r  r   r  r   r   r   hasattrr  r  r5   log1pfloorrs   r  r  r   r   updater  r   get_interfacer(   _attn_implementationr   r   r	  r   r   r   r  )r=   rA   r  r   r  r  r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   s                    r?   rO   zLlama4TextAttention.forwardc  s    $))#2.88b8$--8{{=166|D4T[[/44UkU2Ut}}U
{{=166|DNNqRST=='7j*=*@*@ATAT*U($L* 4#<<5Lj1J ''EKK)=)=)?#)EIYIY(YZ[^b^m^mmpss  &**A{21+EFMMNbP[Nb]^Nb`aNbcK(;6::<;M;MNL#--a3))!Q/
&,n=L'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r@   NN)rP   rQ   rR   __doc__r%   r-   r5   rS   r   r
   
LongTensorr   r   rO   rT   rU   s   @r?   r  r  B  s    GA/ AF )-269)||9) #5<<#=>9) t+	9)
 9) ((4/9) -.9) 
u||U\\D0%2E2LL	M9)r@   r  c                   Z    e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej
                  dz  dej                  dz  dedz  dedz  dej                  dz  d	e	ej
                  ej
                  f   dz  d
e
e   de	ej                  e	ej                  ej                  f   dz  f   fdZ xZS )Llama4TextDecoderLayerc                    t         |           |j                  | _        || _        |j                  |   | _        t        ||      | _        ||j                  v | _	        | j                  rt        |      | _        nt        ||j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )N)r0   rv   )r,   r-   r1   r  layer_typesattention_typer  	self_attn
moe_layersis_moe_layerr   feed_forwardrW   intermediate_size_mlprz   r  input_layernormpost_attention_layernormr  s      r?   r-   zLlama4TextDecoderLayer.__init__  s    !--"$00;,VY?%):):: -f 5D -fHdHd eD01C1CI\I\](9&:L:LRXReRe(f%r@   NrA   r   r   r  	use_cacher  r  r   rB   c           
         |}	| j                  |      } | j                  d||||||d|\  }
}|	|
z   }|}	| j                  |      }| j                  |      }| j                  r|\  }}|	|j                  |	j                        z   }|S )N)rA   r  r   r  r6  r   )r4  r/  r5  r2  r1  rG   rH   )r=   rA   r   r   r  r6  r  r  r   residualattention_states_s               r?   rO   zLlama4TextDecoderLayer.forward  s     !,,]; -dnn 
' 3)+)
 
! !#33 !55mD))-8,M1 =#5#5hnn#EEr@   )NNNFNN)rP   rQ   rR   r-   r5   rS   r)  r
   boolr   r   r   FloatTensorrO   rT   rU   s   @r?   r+  r+    s    g$ /304(,!&26HL"||" t+" &&-	"
 " $;" ((4/" #5<<#=>E" -." 
u  %(9(95;L;L(L"MPT"TT	U"r@   r+  c                   t     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZ ej                          fd       Z xZS )Llama4PreTrainedModelr(   )imagetextTr  Fc                 4   t         |   |       t        | j                  d      r| j                  j                  n| j                  j
                  j                  }t        |t              rEt        j                  |j                  d|       t        j                  |j                  d|       y t        |t              rWt        j                  |j                  |j                         t        j                  |j                  |j                         y y )Ninitializer_ranger  )rn   std)rD  )r,   _init_weightsr  r(   rC  text_configr   r'   initnormal_r7   r9   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r=   r   rD  r>   s      r?   rE  z#Llama4PreTrainedModel._init_weights  s    f% t{{$78 KK))((:: 	
 f/0LL,,3C@LL))= 12LL//V\\BLL88fllK 3r@   )rP   rQ   rR   r$   r   input_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr5   r   rE  rT   rU   s   @r?   r?  r?    sW    (&*##4"5 N!"&U]]_L Lr@   r?  c                   >    e Zd ZU dgZdZdZeed<   ee	e
dZdef fdZeee	 	 	 	 	 	 	 ddej"                  dz  d	ej$                  dz  d
ej"                  dz  dedz  dej(                  dz  dedz  dej"                  dz  dee   deez  fd                     Z xZS )Llama4TextModelr+  model)rA  r(   )
attentionsrA   r   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nrv   r(   F)r,   r-   pad_token_idpadding_idx
vocab_sizer3   	Embeddingr1   embed_tokens
ModuleListrangenum_hidden_layersr+  layersrz   r  normr   
rotary_embgradient_checkpointing	post_initr  s      r?   r-   zLlama4TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammHMfNfNfHgh9#FI6h
 &f&8&8f>Q>QR	36B&+# 	 is   DN	input_idsr   r   r  inputs_embedsr6  r  r   rB   c                    |d u |d uz  rt        d      |>| j                  |j                  | j                  j                  j                              }|r|t        | j                        }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              s*| j                  |||||d}t        d
i |t        d
i |d}
|}| j!                  ||      }| j"                  d | j                  j$                   D ]  } ||f|
|j&                     |||||d|}! | j)                  |      }t+        ||r|	      S d 	      S )N:You must specify exactly one of input_ids or inputs_embedsrZ  r   r#   )r   )r(   input_embedsr   r  r  r   )full_attentionchunked_attention)r   r   r  r6  r  r  )last_hidden_stater  r8  )
ValueErrorr_  r   r}   r   r   r(   get_seq_lengthr5   r   rH   	unsqueezer   dictr   r   re  rc  rb  r.  rd  r   )r=   rh  r   r   r  ri  r6  r  r   past_seen_tokenscausal_mask_mappingmask_kwargsrA   freq_cisdecoder_layers                  r?   rO   zLlama4TextModel.forward  s    -t";<YZZ  --ill4;L;L;S;S;Z;Z.[\M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L ?-F ++ -"0"0#2 ,K #5"C{"C%?%N+%N#
 & ??=,?![[)H4;;+H+HI 
	M)	2=3O3OP) /#-$,	 	M
	 		-0&+/8O
 	
>B
 	
r@   )NNNNNNN)rP   rQ   rR   _no_split_modulesbase_model_prefixrM  r%   r   r  r+  r   _can_record_outputsr-   r   r!   r   r5   r)  rS   r
   r=  r<  r   r   r   r   rO   rT   rU   s   @r?   rV  rV    s#   12 )/&/    .2.204(,26!%26C
##d*C
 t+C
 &&-	C

 C
 ((4/C
 $;C
 ((4/C
 +,C
 
(	(C
   C
r@   rV  c                   t    e Zd ZU dgZdZddiZddiZeed<   def fdZ	e
e	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  dej                   d	z  dej                  d	z  ded	z  dej                  d	z  deej                  z  dee   deez  fd              Z xZS )Llama4ForCausalLMr+  language_modelzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr(   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y rY   )
r,   r-   rV  rW  r]  r3   r\   r1   r  rg  r<   s     r?   r-   zLlama4ForCausalLM.__init__]  sU     $V,
 ++yy!3!3V5F5FUS 	r@   Nrh  r   r   r  ri  labelsr6  r  logits_to_keepr   rB   c
                 l    | j                   d|||||||d|
}|d   }t        |	t              rt        |	 d      n|	}| j	                  |dd|ddf         }d}|* | j
                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rh  r   r   r  ri  r6  r  r   N)logitsr  r]  )lossr  r  rA   rX  r8  )rW  r   r   slicer  loss_functionr(   r]  r   r  rA   rX  )r=   rh  r   r   r  ri  r  r6  r  r  r   outputsrA   slice_indicesr  r  s                   r?   rO   zLlama4ForCausalLM.forwardf  s    J $** 	
)%+')	
 	
  
8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r@   )	NNNNNNNNr   )rP   rQ   rR   ry  rz  _tied_weights_keys_tp_planr%   r   r-   r   r   r5   r)  rS   r
   r=  r<  r   r   r   r   r   rO   rT   rU   s   @r?   r}  r}  V  s>   12(*,GH23H/   .2.204(,26*.!%26-.<
##d*<
 t+<
 &&-	<

 <
 ((4/<
   4'<
 $;<
 ((4/<
 ell*<
 +,<
 
'	'<
  <
r@   r}  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	Llama4CausalLMOutputWithPasta3  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  r  r  rA   rX  image_hidden_states)rP   rQ   rR   r(  r  r5   r=  r   r  r  r
   rA   r   rX  r  r8  r@   r?   r  r    s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r@   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionMLP2c                 ~   t         |           |j                  | _        |j                  | _        t	        j
                  | j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _	        t	        j                         | _        |j                  | _        y rY   )r,   r-   r1   r0   r3   r\   projector_input_dimfc1projector_output_dimfc2GELUr_   projector_dropoutr   r<   s     r?   r-   zLlama4VisionMLP2.__init__  s    !--!'!9!999T33V5O5OV[\99V88&:U:U\abWWY//r@   c                     | j                  |      }| j                  |      }t        j                  || j                  | j                        }| j                  | j                  |            S )Nr   )r  r_   Fr   r   r  r=   rA   s     r?   rO   zLlama4VisionMLP2.forward  sT    /**=9		-4<<$--X!!$((="9::r@   rc   rU   s   @r?   r  r    s    0;r@   r  c                   $     e Zd Z fdZd Z xZS )Llama4MultiModalProjectorc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y rY   )	r,   r-   r3   r\   vision_configvision_output_dimrF  r1   linear_1r<   s     r?   r-   z"Llama4MultiModalProjector.__init__  s?    		  22**
r@   c                 (    | j                  |      }|S ra   )r  )r=   image_featuresrA   s      r?   rO   z!Llama4MultiModalProjector.forward  s    n5r@   rc   rU   s   @r?   r  r    s    
r@   r  c           
      J   | j                   \  }}}t        t        j                  |            }| j	                  |||d      } | j                         \  }}}}| j	                  ||t        ||z        t        ||z              }|j                  dddd      j                         }|j	                  |t        ||z        t        ||z        t        ||dz  z              }|j                  dddd      j                         }|j	                  |d|j                   d         }	|	S )NrD   r   r+   r#   r   )rH   r   mathsqrtrG   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             r?   pixel_shuffler    s%   (4(:(:%JXTYY{+,J$$ZZLL*6*;*;*='Jx"''
FC@U<VX[\dgt\tXuvO%--aAq9DDFO%**C./U]5J1KSQY]jlm]mQnMoO &--aAq9DDFO#((R9N9Nr9RSMr@   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionPixelShuffleMLPc                     t         |           |j                  | _        t        |j                  | j                  dz  z        | _        |j                  | _        t        |      | _	        y r*   )
r,   r-   pixel_shuffle_ratior   r  	inner_dimr  
output_dimr  mlpr<   s     r?   r-   z$Llama4VisionPixelShuffleMLP.__init__  sX    #)#=#= V77D<T<TVW<WXY 55#F+r@   encoded_patchesrB   c                 P    t        || j                        }| j                  |      S ra   )r  r  r  )r=   r  s     r?   rO   z#Llama4VisionPixelShuffleMLP.forward   s#    '9Q9QRxx((r@   rP   rQ   rR   r-   r5   rS   rO   rT   rU   s   @r?   r  r    s#    ,)u|| ) )r@   r  freqs_cic                     |j                   }t        |j                        D cg c]  \  }}|dk(  s||dz
  k(  r|nd }}} | j                  | S c c}}w )Nr#   )ndim	enumeraterH   rG   )r  r   r  idrH   s         r?   reshape_for_broadcastr    sW    ::D=Fu{{=STTQ!q&AMQq0TET8==%   Us   Ac                 B   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        ||      }|j                  |j                        }t        j                  ||z        j                  d      }t        j                  ||z        j                  d      }|j                  |       |j                  |      fS )NrD   r+   )r  r   r   )r5   r   rs   r   rH   r  r   r   r   r   rt   )r   r   r  query_key_	query_outkey_outs          r?   vision_apply_rotary_embr    s    
 ""#85;;=#8#8#R%++cr:J#RB#RPQ#RSF  !4!4!4!Lciin!Lb!L!!LMD$hfEH{{6==)H""6H#45==a@I  199!<GU#W__S%999r@   c                        e Zd Zdef fdZ	 	 ddej                  dej                  dej                  dz  dedz  dee	   d	e
ej                  ej                  dz  e
ej                     dz  f   fd
Z xZS )Llama4VisionAttentionr(   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  |j
                  z  | _        d| _        |j                  | _	        | j                  dz  | _
        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )Nr#   r   TrZ   )r,   r-   r(   r1   	embed_dimr   	num_headsr   r   r	  r   r3   r\   r  r  r  r  r<   s     r?   r-   zLlama4VisionAttention.__init__  s   ++33**f.H.HH$%!!'!9!9}}d*ii0NUYZii0NUYZii0NUYZii >UYZr@   NrA   r  r   r  r   rB   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      }| j	                  |      j                  |      }	| j                  |      j                  |      }
t        ||	|      \  }}	|j                  dd      }|	j                  dd      }	|
j                  dd      }
t        j                  | j                  j                  t              } || ||	|
d f| j                  sdn| j                  d dd|\  }} |j                  g |d j!                         }| j#                  |      }||fS )NrD   )r  r#   r+   r  F)r   r   r
  )rH   r   r  rG   r  r  r  r   r   r  r(   r   r  r   r	  r   r   r  )r=   rA   r  r   r  r   r!  r"  r#  r   r   r&  r   r   s                 r?   rO   zLlama4VisionAttention.forward*  sj    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D#:<^f#g j#--a3))!Q/
#--a3(?(M(MKK,,.L)
 %8
%
  $}}C$2H2H
%
 
%
!\ *k));;;;FFHkk+.L((r@   r'  )rP   rQ   rR   r   r-   r5   rS   r
   r   r   r   rO   rT   rU   s   @r?   r  r    s    [1 [& /3(,')||') ,,') t+	')
 ') -.') 
u||U\\D0%2E2LL	M')r@   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionMLPc                 &   t         |           || _        t        j                         | _        t        j                  |j                  |j                  d      | _	        t        j                  |j                  |j                  d      | _
        y )NTrZ   )r,   r-   r(   r3   r  r_   r\   r1   r0   r  r  r<   s     r?   r-   zLlama4VisionMLP.__init__U  se    WWY99V//1I1IPTU99V55v7I7IPTUr@   rA   rB   c                 l    | j                  |      }| j                  |      }| j                  |      }|S ra   )r  r_   r  r  s     r?   rO   zLlama4VisionMLP.forward\  s4    /**=9/r@   r  rU   s   @r?   r  r  T  s$    VU\\ ell r@   r  c            
            e Zd Zdef fdZ	 	 d	dej                  dej                  dej                  dz  dedz  fdZ xZ	S )
Llama4VisionEncoderLayerr(   c                    t         |           |j                  | _        t        |      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y ra   )r,   r-   r1   r  r/  r  r  r3   	LayerNormr4  r5  r<   s     r?   r-   z!Llama4VisionEncoderLayer.__init__d  sb    !--.v6"6*!||F,>,>?(*V5G5G(H%r@   Nhidden_stater  r   output_attentionsc                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )N)r  r   )r4  r/  r5  r  )r=   r  r  r   r  r9  r   r  s           r?   rO   z Llama4VisionEncoderLayer.forwardn  s      ++L9%)^^) &4 &
"l
  ,.  44\Bxx-,./&Gr@   r'  )
rP   rQ   rR   r   r-   r5   rS   r<  rO   rT   rU   s   @r?   r  r  c  sZ    I1 I /3)-ll ,, t+	
  $;r@   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  dej                  dz  dedz  d	edz  d
edz  de	e
z  fdZ xZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r(   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        || _        y c c}w )NF)
r,   r-   r(   r3   r`  ra  rb  r  rc  rf  )r=   r(   r;  r>   s      r?   r-   zLlama4VisionEncoder.__init__  sW    mmuU[UmUmOn$o!%=f%E$op&+# %ps   A*NrA   r  r   r  output_hidden_statesreturn_dictrB   c                 z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}| j                  D ]&  }	|r||fz   } |	||||      }
|r	||
d   fz   }|
d   }( |r||fz   }|st        d |||fD              S t        |||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr8  )r  r   r  r  r#   r   c              3   &   K   | ]	  }||  y wra   r8  .0vs     r?   	<genexpr>z.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     eqWXWde   ro  rA   rX  )r(   r  r  use_return_dictrc  r   r   )r=   rA   r  r   r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputss              r?   rO   zLlama4VisionEncoder.forward  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d![[ 	-M#!/=2B!B)*-"3!	M !!/=3C2E!E)!,M	-   +}.>>Ne]NN$Seee+>Vd
 	
r@   NNNN)rP   rQ   rR   r(  r   r-   r5   rS   r<  r   r   rO   rT   rU   s   @r?   r  r    s    1  /3)-,0#'?
||?
 ,,?
 t+	?

  $;?
 #Tk?
 D[?
 
	 ?
r@   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4UnfoldConvolutionc                 <   t         |           |j                  }t        |t              r||f}t
        j                  j                  ||j                        | _        t        j                  |j                  |d   z  |d   z  |j                  d      | _        y )N)kernel_sizestrider   r#   FrZ   )r,   r-   r  r   r   r5   r3   Unfoldunfoldr\   num_channelsr1   linear)r=   r(   r  r>   s      r?   r-   z Llama4UnfoldConvolution.__init__  s    ''k3'&4Khhoo+fFWFWoXii+a.0;q>A
r@   rA   rB   c                 p    | j                  |      }|j                  ddd      }| j                  |      }|S )Nr   r+   r#   )r  r  r  r  s     r?   rO   zLlama4UnfoldConvolution.forward  s8    M2%--aA6M2r@   r  rU   s   @r?   r  r    s#    

U\\ ell r@   r  c                   *     e Zd Zdef fdZd Z xZS )Llama4VisionRotaryEmbeddingr(   c                    t         |           |j                  |j                  z  }t	        j
                  |dz  t        j                        j                  |dz  d      }t	        j                  ||d d gd      }d|d<   ||z  }||z  }|j                  |j                  z  dz  }d|j                  d	   t	        j
                  d|d      d |dz   j                         |z  z  z  }|dz   d
   |d d d d f   z  j                  dd      }|dz   d
   |d d d d f   z  j                  dd      }	t	        j                  ||	gd      j                         j                         dd d df   }
|
j                  |j                  ddd      dk  d      }
t	        j                   t	        j"                  t	        j$                  |
      t	        j&                  |
      gd            }|| _        y )Nr+   r   r#   r   rE   r   )rD   rD   r   r   ).NrD   .)r,   r-   
image_sizer  r5   r   int32r   catr1   r   r   rs   repeat_interleaver   masked_fillr   stackcossinr  )r=   r(   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   rw  r>   s               r?   r-   z$Llama4VisionRotaryEmbedding.__init__  s   6#4#44,,sAvU[[9AA#q&!L))Wgbqk2:#3%%)C)CCqH""<0Q!,->A?EEG(RT
	 "A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7G,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bc r@   c                 L    | j                   j                  |j                        S ra   )r  r   r   r  s     r?   rO   z#Llama4VisionRotaryEmbedding.forward
  s    }} 4 455r@   )rP   rQ   rR   r   r-   rO   rT   rU   s   @r?   r  r    s    !1 !(6r@   r  c                        e Zd ZU dZdZdgZeed<   def fdZd Z		 	 	 	 dde
j                  d	e
j                  dz  d
edz  dedz  dedz  deee
j                  df   z  fdZ xZS )rI  vision_model)r@  r  r(   c                 r   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  z  dz  dz   | _        |j                  dz  | _        t        |      | _	        t        j                  | j                  t        j                  | j                        z        | _        t        j                  | j                  t        j                  | j                  | j                        z        | _        t!        |      | _        t        j$                  | j                        | _        t        j$                  | j                        | _        t+        |      | _        t/        |      | _        | j3                          y )Nr+   r#   r   )r,   r-   r  r  r1   r  r  rK  r  patch_embeddingr3   r4   r5   randnrJ  rL  r  rotary_embeddingr  layernorm_prelayernorm_postr  rW  r  vision_adapterrg  r<   s     r?   r-   zLlama4VisionModel.__init__  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&Ar@   c                     | j                   S )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )r  rw   s    r?   get_input_embeddingsz&Llama4VisionModel.get_input_embeddings-  s     ###r@   Npixel_valuesr   r  r  r  rB   .c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j                  \  }}}	}
d}d}| j                  |      }|j                  \  }}}|j                  ||z  |z  ||      }| j                  j                  |j                  d   d|j                  d         }t        j                  ||gd      }|dz  }|j                  ||z  |||      }| j                  j                  |j                  |j                        }||z   }| j                  |      }|j!                  |d|      }| j#                  |      }| j%                  |d|||      }|j&                  }| j)                  |      }|ddddddf   }| j+                  |      }|r|j,                  nd}|r|d   }nd}|st/        d	 |||fD              S t1        |||
      S )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr#   r   rD   rE   r   r   )r   r  r  r  r+   c              3   &   K   | ]	  }||  y wra   r8  r  s     r?   r  z,Llama4VisionModel.forward.<locals>.<genexpr>  s     _qQRQ^_r  r  )r(   r  r  r  rH   r  r   rJ  r   r5   r   rL  r   r   r   r  rG   r  rW  ro  r  r  rA   r   r   )r=   r  r   r  r  r  r   batch_size_times_num_tilesr  r  r  num_concurrent_media
num_chunksr  r;  r  r   rJ  positional_embeddingr  r   rA   rX  s                          r?   rO   zLlama4VisionModel.forward3  sT   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] COBTBT?"L&% 
++L9%1%7%7";
 $++&)==
JKYc
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&)==z;Xb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___*'!
 	
r@   r  )rP   rQ   rR   rz  rM  ry  r   r   r-   r  r5   rS   r<  r   r   rO   rT   rU   s   @r?   rI  rI    s    &!341 2$ /3)-,0#'b
llb
 t+b
  $;	b

 #Tkb
 D[b
 
5s!23	3b
r@   rI  c            $           e Zd ZU ddgZi ZdZeed<   def fdZd Z	d Z
d Zd	 Zd
 Zd Z ed       ed      dej$                  dedee   deez  fd              Zdej2                  dej$                  dej$                  fdZ ed      e	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&dej2                  dz  dej$                  dz  dej6                  dz  dej2                  dz  dedz  dej$                  dz  dedz  dej2                  dz  dedz  dedz  d edz  d!edz  d"ej2                  dz  d#eej6                  z  dee   deez  f d$              Z 	 	 	 	 	 	 	 d'd%Z! xZ"S )(Llama4ForConditionalGenerationr+  r  rW  r(   c                 h   t         |   |       t        |j                        | _        t        |      | _        t        |j                        | _	        |j                  j                  | _
        | j                  j                  | j                  j                  nd| _        | j                          y )NrD   )r,   r-   rI  r  r  r  multi_modal_projectorr}  rF  r~  r]  r(   r[  rg  r<   s     r?   r-   z'Llama4ForConditionalGeneration.__init__  s     -f.B.BC%>v%F"/0B0BC ,,778<8P8P8\DKK44bdr@   c                 6    | j                   j                         S ra   )r~  r  rw   s    r?   r  z3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799r@   c                 :    | j                   j                  |       y ra   )r~  set_input_embeddings)r=   r   s     r?   r(  z3Llama4ForConditionalGeneration.set_input_embeddings  s    007r@   c                 6    | j                   j                         S ra   )r~  get_output_embeddingsrw   s    r?   r*  z4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::r@   c                 :    | j                   j                  |       y ra   )r~  set_output_embeddings)r=   new_embeddingss     r?   r,  z4Llama4ForConditionalGeneration.set_output_embeddings  s    11.Ar@   c                 :    | j                   j                  |       y ra   )r~  set_decoder)r=   decoders     r?   r/  z*Llama4ForConditionalGeneration.set_decoder  s    ''0r@   c                 6    | j                   j                         S ra   )r~  get_decoderrw   s    r?   r2  z*Llama4ForConditionalGeneration.get_decoder  s    ""..00r@   F)tie_last_hidden_stateszOObtains image last hidden states from the vision tower and apply al projection.r  r  vision_feature_select_strategyr   rB   c                     |j                         D ci c]  \  }}|	|| }}} | j                  |fi |S c c}}w )aj  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
            The tensors corresponding to the input images.
        vision_feature_select_strategy (`str`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`
        )itemsr  )r=   r  r4  r   kr  s         r?   get_image_featuresz1Llama4ForConditionalGeneration.get_image_features  sG     $*<<>C41aQ]!Q$CC t  888 Ds   
::rh  ri  r  c                 *   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|j                  d           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r  rD   z6Image features and image tokens do not match, tokens: z, features: r   )r  r5   tensorr(   image_token_idlongr   allr   rr  	expand_asr   r    numelrH   )r=   rh  ri  r  special_image_maskn_image_tokenss         r?   get_placeholder_maskz3Llama4ForConditionalGeneration.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aoauauvwax`yz	
 "!r@   Nr   r   r  r  r6  r  r  r  r  r  c                    |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|du |duz  rt	        d      ||t	        d      | | j                         |      }|| j                  ||d      j                  }|j                  d|j                  d            }| j                  |      j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                   d|||||	|
||||d
|}|d	   }d}|<||dd|j"                  d
   d
z
   df   j                  |j                        }|dddddf   |j                  |j                        d	k7     j%                         }|dd
df   |j                  |j                        d	k7     j%                         }n1|dddddf   j%                         }|dd
df   j%                         }t'        j(                         } ||j                  d|j                  d            |j                  d      j                  |j                              }|s|f|d
d z   }||f|z   S |S t+        |||j,                  |j.                  |j0                  |      S d      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```Nrk  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oneT)r  r4  r  rD   )ri  r  )
r   r   r  ri  r6  r  r  r  r  r  r   r#   .)r  r  r  rA   rX  r  r8  )r(   r  r  r  rp  r  r8  pooler_outputrG   r  r%  r   r   r   rB  masked_scatterr~  rH   r   r3   CrossEntropyLossr  r  rA   rX  )r=   rh  r  r   r   r  ri  r4  r  r6  r  r  r  r  r  r   r  vision_flatprojected_vision_flatr@  r  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr   s                               r?   rO   z&Llama4ForConditionalGeneration.forward  sS   d 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<YZZ#(Av   7D557	BM#!44)/M  5  m	  )--b.2E2Eb2IJK$($>$>{$K$N$N$$m&9&9%! "&!:!:G\ "; " *889KMbcM%$%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
r@   c	           
           | j                   j                  |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r  ri  r   r  r  is_first_iterationr6  Tr  )r~  prepare_inputs_for_generationget)r=   rh  r  ri  r  r   r  r  rN  r   model_inputss              r?   rO  z<Llama4ForConditionalGeneration.prepare_inputs_for_generationf  se     It**HH	
+')))1	
 	
 VZZT%B
 ,8L(r@   )NNNNNNNNNNNNNr   )NNNNNNF)#rP   rQ   rR   ry  r  rz  r$   r   r-   r  r(  r*  r,  r/  r2  r!   r   r5   r=  r   r   r   r   r   r8  r)  rB  rS   r
   r<  r   r  rO   rO  rT   rU   s   @r?   r#  r#    sl   13MNH	| 	:8;B11 u5!rs9''9 ),9 +,	9
 
+	+9 t 69 "))":?:K:K"]b]n]n". u5 .215.204(,2659*.!%)-,0#'26-.~
##d*~
 ''$.~
 t+	~

 &&-~
 ~
 ((4/~
 ),d
~
   4'~
 $;~
  $;~
 #Tk~
 D[~
 ((4/~
 ell*~
  +,!~
" 
-	-#~
  6~
F   r@   r#  )r?  rV  rI  r}  r#  )r  )er  collections.abcr   dataclassesr   typingr   r5   torch.nnr3   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r    r   rG  activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    utils.genericr!   r"   configuration_llama4r$   r%   
get_loggerrP   loggerModuler'   rW   re   rz   r\   r   r   r   rS   r   r   r   r   rs   r   r  r  r+  r?  rV  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  rI  r#  __all__r8  r@   r?   <module>rk     s    $ !      N & ! . ) 7 K B 9  G & j j ? @ 
		H	%		 B)BII )$!uxx !=		 =(,299 , _-"BII " .",?		 ?D	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 LL4'% % %D %II%<<% 
% <<	%
 LL4'% % %4Z)")) Z)z27 2j LO L L8 a
+ a
 a
HN
- N
b 
9; 9 90;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:7)BII 7)tbii )9 )XO
")) O
dbii (6")) 62G
- G
Tn%:O nbr@   