
    i\k                        d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z   e       rddl!m"Z" ddl#m$Z$  ejJ                  e&      Z'de(de(dejR                  fdZ*dejR                  dejR                  fdZ+dejR                  dejR                  dejR                  dejR                  fdZ, G d dejZ                        Z. G d  d!ejZ                        Z/ G d" d#e      Z0e G d$ d%e             Z1e G d& d'e1             Z2 ed()       G d* d+e1e             Z3g d,Z4y)-zPyTorch CodeGen model.    N)Union)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringis_torch_flex_attn_availablelogging)is_flash_attention_requested   )CodeGenConfig)	BlockMask)make_flex_block_causal_masknum_posdimreturnc                    ddt        j                  d|dt         j                        |z  z  z  }t        j                  dt        j                  | t         j                        j	                         |      j	                         }t        j
                  t        j                  |      t        j                  |      fd      S )	Ng      ?i'  r      )dtypezi , j -> i jr   r   )torcharangeint64einsumfloatcatsincos)r   r   inv_freqsinusoid_inps       v/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/codegen/modeling_codegen.pycreate_sinusoidal_positionsr*   1   s    eQQekk JS PQRH<<WEKK0X0^0^0`bjkqqsL99eii-uyy/FGQOO    xc                     | d d d d d d d d df   }| d d d d d d dd df   }t        j                  | |fd      } | j                  d      S )Nr   r   r   )r   stackflatten)r,   x1x2s      r)   rotate_every_twor4   8   sS    	
1aCaC<B	
1aADqD=	BbS"I2&A99R=r+   tensorr%   r&   c                     t        j                  |d d d d d d d f   dd      }t        j                  |d d d d d d d f   dd      }| |z  t        |       |z  z   S )Nr   r   )r   repeat_interleaver4   )r5   r%   r&   s      r)   apply_rotary_pos_embr8   @   s^    

!
!#aD!m"4a
;C

!
!#aD!m"4a
;CSL-f5;<<r+   c                       e Zd Zd fd	Zd Zd Z	 ddZ	 	 	 	 	 	 ddej                  dz  de	dz  dej                  dz  d	ej                  dz  d
edz  dedz  dej                  dz  deej                  eej                     f   eej                  eej                     eej                  df   f   z  dz  fdZ xZS )CodeGenAttentionNc                 .   t         |           |j                  | _        t	        j
                  |j                        | _        t	        j
                  |j                        | _	        || _
        |-t        j                  d| j                  j                   d       |j                  | _        |j"                  | _        | j                   | j"                  z  | _        | j$                  | j"                  z  | j                   k7  r&t'        d| j                    d| j"                   d      t)        j*                  | j$                        | _        t	        j.                  | j                   | j                   dz  d      | _        t	        j.                  | j                   | j                   d      | _        |j4                  | _        | j4                  xs | j                   | _        | j9                  d	t;        | j                  | j6                        d
       y )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).r   F)biasembed_positions)
persistent)super__init__max_position_embeddingsmax_positionsr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout	layer_idxloggerwarning_once	__class____name__hidden_size	embed_dimnum_attention_headshead_dim
ValueErrormathsqrt
scale_attnLinearqkv_projout_proj
rotary_dimpos_embd_dimregister_bufferr*   )selfconfigrH   rK   s      r)   r@   zCodeGenAttention.__init__G   s   #;;JJv'8'89ZZ(:(:;" !8!8 9 :, ,  ++#)#=#= $*B*BB==4333t~~EWX\XfXfWg h++/+C+C*DBH  ))DMM2		$..$..12D5Q		$..$..uM ++ OO=t~~:4;M;MtO`O`ans 	 	
r+   c                     |j                  |j                  d d ||z  |fz         }|j                  |j                  d d dz   |j                  dd  z         }|S )Nr.   r/   )r.   )reshapeshape)r[   r,   n_headdim_headmp_numreshapeds         r)   _split_headszCodeGenAttention._split_headsg   s]    99QWWSb\Vv-=x,HHI##AGGCRL5$88>>"#;N$NOr+   c                    t        |j                        dk(  r$|j                  ddddd      j                         }n\t        |j                        dk(  r#|j                  dddd      j                         }n!t	        dt        |j                               |j                         dd	 ||z  fz   }|j                  |      S )
zM
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
           r   r   r   r      z3Input tensor rank should be one of [4, 5], but is: Nr/   )lenr_   permute
contiguousrQ   sizeview)r[   r5   rO   attn_head_size	new_shapes        r)   _merge_headszCodeGenAttention._merge_headsl   s     v||!^^Aq!Q2==?F!#^^Aq!Q/::<FRSVW]WcWcSdRefggKKM#2&*=*N)PP	{{9%%r+   c                    |j                  t        j                        }|j                  t        j                        }t        j                  ||j	                  dd            }|#|d d d d d d d |j
                  d   f   }||z  }|| j                  z  } t        j                  d      |      }|j                  |j                        }| j                  |      }t        j                  ||      }||fS )Nr.   r/   r   )tor   float32matmul	transposer_   rT   r   Softmaxr   rE   )r[   querykeyvalueattention_maskattn_weightscausal_maskattn_outputs           r)   _attnzCodeGenAttention._attny   s     'ffU]]#||E3==R+@A%(Aq/CIIbM/)ABKK'L#doo5)rzzb),7#u{{3((6ll<7L((r+   hidden_states
layer_pastry   position_ids	use_cacheoutput_attentionscache_positionr   .c                 Z   | j                  |      }d}	|j                  |j                  d d |	dfz         }
| j                  | j                  z  |	z  }t        j                  |
|d      \  }}}| j                  || j                  | j                  |	      }| j                  || j                  | j                  |	      }| j                  || j                  | j                  |	      }|j                  dddd      }| j                  }|j                  |j                  k7  r"|j                  |j                        }|| _	        ||   }t        j                  ||j                  d   dz  d      \  }}| j                  |d d d d d d d | j                  f   }|d d d d d d | j                  d f   }|d d d d d d d | j                  f   }|d d d d d d | j                  d f   }t        |||      }t        |||      }t        j                  ||gd      }t        j                  ||gd      }nt        |||      }t        |||      }|j                  dddd      }|j                  dddd      }|K||| j                  |d	}|j                  |j                  |j                         || j"                  |      \  }}| j%                  ||||      \  }}| j'                  || j                  | j                        }| j)                  |      }| j+                  |      }||fS )
Nrg   r.   r   )rb   r   r   r   r   )r%   r&   partial_rotation_sizer   )rV   r^   r_   rP   rO   r   splitrd   ri   r=   devicerq   rX   r8   r$   updater   rH   r}   ro   rW   rG   )r[   r~   r   ry   r   r   r   r   qkvrb   	qkv_split	local_dimrv   rx   rw   r=   sincosr%   r&   k_rotk_passq_rotq_passcache_kwargsr|   rz   s                             r)   forwardzCodeGenAttention.forward   s    mmM*KK		#2&" =>	MMD$<$<<F	!KK	9"Euc!!%)A)A4==Y_!`T%=%=t}}U[\!!%)A)A4==Y_!`aAq)..!!\%8%88-001D1DEO#2D  .;;vv||B'71'<"ES??&1a!24??!223EAq$//"334F!Q#4T__#445E1aDOO$556F(S9E(S9E))UFO4CIIufo26E&sC5C(S9Ekk!Q1%aAq) !)-"0	L $**366-2E2E+Ft~~_klJC %)JJuc5.$Q!\''T5M5Mt}}]mmK0((5L((r+   NNNNFFN)rL   
__module____qualname__r@   rd   ro   r}   r   FloatTensorr   
LongTensorbooltupleTensorr   __classcell__rK   s   @r)   r:   r:   F   s   
@
&$ ): $(3704!&).26G)((4/G) DLG) ))D0	G)
 &&-G) $;G)  $;G) ((4/G) 	ellE%,,//0
eELL15s9J3KK
L	M
	G)r+   r:   c                   \     e Zd Z fdZdej
                  dz  dej
                  fdZ xZS )
CodeGenMLPc                    t         |           |j                  }t        j                  ||      | _        t        j                  ||      | _        t        |j                     | _	        t        j                  |j                        | _        y r   )r?   r@   n_embdr   rU   fc_infc_outr   activation_functionactrC   rF   dropout)r[   intermediate_sizer\   rN   rK   s       r)   r@   zCodeGenMLP.__init__   se    MM	YYy*;<
ii 19=&445zz&"4"45r+   r~   Nr   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   )r[   r~   s     r)   r   zCodeGenMLP.forward   s@    

=1/M2]3r+   )rL   r   r   r@   r   r   r   r   r   s   @r)   r   r      s,    6U%6%6%= %BSBS r+   r   c                   2    e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  dz  dedz  dej
                  dz  dej                  dz  dedz  dedz  d	ej                  dz  d
e	ej                     e	ej                  e	ej
                  df   f   z  dz  fdZ xZS )CodeGenBlockNc                    t         |           |j                  |j                  nd|j                  z  }t	        j
                  |j                  |j                        | _        t        ||      | _	        t        ||      | _        y )Nrg   eps)r?   r@   n_innerr   r   	LayerNormlayer_norm_epsilonln_1r:   attnr   mlp)r[   r\   rH   	inner_dimrK   s       r)   r@   zCodeGenBlock.__init__   sc    &,nn&@FNNa&--FW	LLF4M4MN	$VY7	i0r+   r~   r   ry   r   r   r   r   r   .c           	          |}| j                  |      }| j                  |||||||      \  }	}
| j                  |      }|	|z   |z   }||
fS )N)r~   r   ry   r   r   r   r   )r   r   r   )r[   r~   r   ry   r   r   r   r   residualattn_outputsrz   feed_forward_hidden_statess               r)   r   zCodeGenBlock.forward   sp     !		-0%)YY'!)%/) &/ &
"l &*XXm%<"$'AAHLl**r+   r   r   )rL   r   r   r@   r   r   r   r   r   r   r   r   r   r   s   @r)   r   r      s    1 $(3704!&).26+((4/+ DL+ ))D0	+
 &&-+ $;+  $;+ ((4/+ 
u||	uU\\59J9JC9O3P%PQ	QTX	X+r+   r   c                   @     e Zd ZU eed<   dZdZdgZdZdZ	 fdZ
 xZS )CodeGenPreTrainedModelr\   transformerTr   past_key_valuesc                     t         |   |       t        |t              r?t	        j
                  |j                  t        |j                  |j                               y y r   )
r?   _init_weights
isinstancer:   initcopy_r=   r*   rB   rY   )r[   modulerK   s     r)   r   z$CodeGenPreTrainedModel._init_weights  sI    f%f./JJv--/J6K_K_agatat/uv 0r+   )rL   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraphr   r   r   s   @r)   r   r     s6    %&*#'("3!w wr+   r   c                       e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  de	dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fd       Z	 ddeej                   df   dej                   dej                   de	def
dZedej                   dededej(                  dej                   defd       Z xZS )CodeGenModelc           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _
        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                   | j                  |j"                        | _        t'        |j(                  |j*                  |j,                  z        | _        d| _        | j1                          y c c}w )N)rH   r   F)r?   r@   r   rN   
vocab_sizer   	EmbeddingwterC   
embd_pdropdrop
ModuleListrangen_layerr   hr   r   ln_fminrX   n_ctxrO   gradient_checkpointing	post_init)r[   r\   irK   s      r)   r@   zCodeGenModel.__init__'  s      ++<< 1 14>>BJJv001	5QWQ_Q_K`aaVq AabLLV5N5NO	f//A[A[1[\&+# 	  bs   ,Ec                     | j                   S r   r   )r[   s    r)   get_input_embeddingsz!CodeGenModel.get_input_embeddings7  s    xxr+   c                     || _         y r   r   )r[   new_embeddingss     r)   set_input_embeddingsz!CodeGenModel.set_input_embeddings:  s	    !r+   N	input_idsr   ry   token_type_idsr   inputs_embedsr   r   output_hidden_statesreturn_dictr   r   c           
         ||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }|
|
n| j                   j                  }
|du |duz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|t        | j                         }|j                  d   }|9||j                         nd}t        j                  |||z   |j                         }||j#                  d      }| j%                  |||||      }|}|(|j'                  d	|      }| j                  |      }||z   }| j)                  |      }d	||j+                  d	      f}|rd
nd}|	rd
nd}t-        | j.                        D ]-  \  }}|	r||fz   } ||||||||      }|d   }|s%||d   fz   }/ | j1                  |      }|j'                  |      }|	r||fz   }|
st3        d ||||fD              S t5        ||||      S )a  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r\   r   r   r   r.    )r   ry   r   r   r   r   c              3   &   K   | ]	  }||  y wr   r   ).0vs     r)   	<genexpr>z'CodeGenModel.forward.<locals>.<genexpr>  s      ghgts   )last_hidden_stater   r~   
attentions)r\   r   r   r   use_return_dictrQ   r   trainingrI   rJ   r   r	   r_   get_seq_lengthr   r    r   	unsqueeze_update_causal_maskrl   r   rk   	enumerater   r   r   r   )r[   r   r   ry   r   r   r   r   r   r   r   r   kwargs
seq_lengthpast_seen_tokensr{   r~   token_type_embedsoutput_shapeall_self_attentionsall_hidden_statesr   blockoutputss                           r)   r   zCodeGenModel.forward=  s   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  HHY/M0*$++>O"((+
!CRC^==?de"\\*:<Lz<YbobvbvwN)33A6L..M>?L]
 &%+00Z@N $ 8),==M		-0J(:(:2(>?$5b4"6BD!$&&) 	JHAu#$58H$H!**)#"3-G $AJM &9WQZM&I#!	J$ 		-0%**<8 1]4D D )?<MObc   '+++*	
 	
r+   r   input_tensorc           	         t        | j                        r||dk(  j                         r|S y | j                  j                  dk(  r't	        |t
        j                        rt        |      }|S ||j                         nd}||j                  nd}| j                  j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t	        |t
        j                        r|j                  d   n||	z   dz   }
| j!                  ||	|
|||j                  d   	      }| j                  j                  dk(  rQ|O|j"                  j$                  d
v r7|s5t        j&                  |      j(                  }t        j*                  ||      }|S )Ng        flex_attentionr   Fsdpa)r   past_key_values_lengthis_trainingr   r.   )sequence_lengthtarget_lengthr   r   
batch_size)cudaxpunpu)r   r\   any_attn_implementationr   r   r   r   r   is_compileabler   _ignore_causal_mask_sdpar   r   r_   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typefinfor   _unmask_unattended)r[   ry   r   r   r   r   r   using_compilable_cacher   r  r  r{   	min_dtypes                r)   r   z CodeGenModel._update_causal_mask  s    (4)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr+   r  r  r   r  c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nrg   )
fill_valuer   r   r   )diagonalr   r.   r   )r   r   r  r   fullr   triur    r^   expandcloner_   rq   masked_fill)ry   r  r  r   r   r  r   r{   r  mask_lengthpadding_masks              r)   r  zBCodeGenModel._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r+   )NNNNNNNNNNN)F)rL   r   r   r@   r   r   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodintr   r  r   r   s   @r)   r   r   %  s    "  .2(,37260426!%)-,0#'26g
##d*g
 g
 ))D0	g

 ((4/g
 &&-g
 ((4/g
 $;g
  $;g
 #Tkg
 D[g
 ((4/g
 
(	(g
 g
` #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r+   r   zM
    The CodeGen Model transformer with a language modeling head on top.
    )custom_introc                       e Zd ZddiZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de
dz  de
dz  de
dz  de
dz  dej                  dz  deej                  z  deez  fd       Z xZS )CodeGenForCausalLMzlm_head.weightztransformer.wte.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y r   )
r?   r@   r   r   r   rU   r   r   lm_headr   )r[   r\   rK   s     r)   r@   zCodeGenForCausalLM.__init__-  sE     '/yy0A0AB 	r+   Nr   r   ry   r   r   r   labelsr   r   r   r   r   logits_to_keepr   c                    ||n| j                   j                  }| j                  ||||||||	|
||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                   j                  d|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )aG  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r   ry   r   r   r   r   r   r   r   r   r   )logitsr&  r   r   )lossr)  r   r~   r   r   )r\   r   r   r   r   slicer%  loss_functionr   r   r   r~   r   )r[   r   r   ry   r   r   r   r&  r   r   r   r   r   r'  r   transformer_outputsr~   slice_indicesr)  r*  outputs                        r)   r   zCodeGenForCausalLM.forward5  s"   8 &1%<k$++B]B]"..+))%'/!5#) / 
 ,A.8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopDY!4QR!88F)-)9TGf$EvE%/??-;;*55
 	
r+   )NNNNNNNNNNNNr   )rL   r   r   _tied_weights_keysr@   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @r)   r#  r#  %  sM    +,DE  .2(,37260426*.!%)-,0#'26-.>
##d*>
 >
 ))D0	>

 ((4/>
 &&->
 ((4/>
   4'>
 $;>
  $;>
 #Tk>
 D[>
 ((4/>
 ell*>
  
'	'!>
 >
r+   r#  )r#  r   r   )5__doc__rR   typingr   r   r    r   r   activationsr   cache_utilsr   r	   
generationr
   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   utils.genericr   configuration_codegenr   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerrL   rI   r   r   r*   r4   r8   Moduler:   r   r   r   r   r#  __all__r   r+   r)   <module>rC     sv        & ! . ) > 9 O - 
 : 0  !;J 
		H	%P P3 P5<< P  = =ELL =u|| =X]XdXd =T)ryy T)p (!+- !+H w_ w w |) | |~ 
J
/ J

J
Z Kr+   