
"""PyTorch BLOOM model."""

import math
from typing import Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
from ...utils.generic import is_flash_attention_requested
from .configuration_bloom import BloomConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
    """
    Link to paper: https://huggingface.co/papers/2108.12409 Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
    `softmax(l+a) = softmax(l)`. Based on
    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
    TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

    Args:
        attention_mask (`torch.Tensor`):
            Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
        num_heads (`int`):
            number of heads
        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
            dtype of the output tensor

    Returns:
        Tensor shaped (batch_size * num_heads, 1, max_seq_len).
    """
    batch_size, seq_length = attention_mask.shape
    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
    base = torch.tensor(
        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
    )
    powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32)
    slopes = torch.pow(base, powers)

    if closest_power_of_2 != num_heads:
        extra_base = torch.tensor(
            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
        )
        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
        extra_powers = torch.arange(
            1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32
        )
        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)

    # The alibi bias is added to the query-key product, so it is built as
    # (batch_size=1, num_heads, query_length=1, key_length=max_length) and broadcast over the query dimension.
    arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
    alibi = slopes[..., None] * arange_tensor
    return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype)
  


def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
    """
    Dropout add function

    Args:
        x (`torch.tensor`):
            input tensor
        residual (`torch.tensor`):
            residual tensor
        prob (`float`):
            dropout probability
        training (`bool`):
            training mode
    """
    out = F.dropout(x, p=prob, training=training)
    out = residual + out
    return out


def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
    """
    Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
    make the model jitable.

    Args:
        x (`torch.tensor`):
            input hidden states
    """
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))


def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """
    gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) +
    0.3989423 * x * torch.exp(-0.5 * x * x)

    Args:
        g (`torch.tensor`):
            gradient output tensor
        x (`torch.tensor`):
            input tensor
    """
    x = x[0]  # x is a tuple of 1 element, needs to unpack it first
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.3989423 * 0.134145
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
    return ff * g


class GeLUFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input: torch.Tensor) -> torch.Tensor:
        ctx.save_for_backward(input)
        return bloom_gelu_forward(input)

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
        input = ctx.saved_tensors
        tmp = bloom_gelu_back(grad_output, input)
        return tmp


class BloomGelu(nn.Module):
    """
    Partly copied from Megatron-DeepSpeed code and adapted for our needs
    """

    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return GeLUFunction.apply(x)


class BloomAttention(nn.Module):
    def __init__(self, config: BloomConfig, layer_idx: int | None = None):
        super().__init__()

        self.pretraining_tp = config.pretraining_tp
        self.slow_but_exact = config.slow_but_exact

        self.hidden_size = config.hidden_size
        self.num_heads = config.n_head
        self.head_dim = self.hidden_size // self.num_heads
        self.split_size = self.hidden_size
        self.hidden_dropout = config.hidden_dropout

        if self.head_dim * self.num_heads != self.hidden_size:
            raise ValueError(
                f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and"
                f" `num_heads`: {self.num_heads})."
            )

        # Layer-wise attention scaling
        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
        self.beta = 1.0

        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True)
        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.attention_dropout = nn.Dropout(config.attention_dropout)

    def _reshape(self, fused_qkv: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Split the last dimension into (num_heads, head_dim) and reshapes to (bs, heads, len, dim) shape
        without making any copies, results share same memory storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, num_heads, seq_length, head_dim]
            key: [batch_size, num_heads, seq_length, head_dim]
            value: [batch_size, num_heads, seq_length, head_dim]
        """
        batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
        fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
        query_layer = fused_qkv[..., 0, :].transpose(1, 2)
        key_layer = fused_qkv[..., 1, :].transpose(1, 2)
        value_layer = fused_qkv[..., 2, :].transpose(1, 2)
        return query_layer, key_layer, value_layer

    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
        """
        Merge heads together over the last dimension

        Args:
            x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

        Returns:
            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
        """
        batch_size_and_num_heads, seq_length, _ = x.shape
        batch_size = batch_size_and_num_heads // self.num_heads

        # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim
        x = x.view(batch_size, self.num_heads, seq_length, self.head_dim)

        # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim
        x = x.permute(0, 2, 1, 3)

        # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim
        return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Cache | None = None,
        use_cache: bool = False,
        output_attentions: bool = False,
        cache_position: torch.LongTensor | None = None,
    ):
        batch_size, q_length, _ = hidden_states.shape
        fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
        # 3 x [batch_size, num_heads, seq_length, head_dim]
        query_layer, key_layer, value_layer = self._reshape(fused_qkv)

        if layer_past is not None:
            cache_kwargs = {"cache_position": cache_position}
            key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs)

        # reshape qkv for further computations
        query_layer = query_layer.reshape(batch_size * self.num_heads, -1, self.head_dim)
        key_layer = key_layer.reshape(batch_size * self.num_heads, -1, self.head_dim).transpose(-1, -2)
        value_layer = value_layer.reshape(batch_size * self.num_heads, -1, self.head_dim)

        # [batch_size * num_heads, q_length, kv_length]
        attention_scores = alibi.baddbmm(
            batch1=query_layer,
            batch2=key_layer,
            beta=self.beta,
            alpha=self.inv_norm_factor,
        )

        attn_weights = attention_scores.view(batch_size, self.num_heads, q_length, -1)
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_layer.shape[-1]]
            attn_weights = attn_weights + causal_mask

        # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype
        attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_layer.dtype)

        # [batch_size, num_heads, q_length, kv_length]
        attention_probs = self.attention_dropout(attention_probs)

        # change view [batch_size x num_heads, q_length, kv_length]
        attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, -1)

        # matmul: [batch_size * num_heads, q_length, head_dim]
        context_layer = torch.bmm(attention_probs_reshaped, value_layer)

        # change view [batch_size, q_length, num_heads * head_dim]
        context_layer = self._merge_heads(context_layer)

        # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232
        if self.pretraining_tp > 1 and self.slow_but_exact:
            slices = self.hidden_size / self.pretraining_tp
            output_tensor = torch.zeros_like(context_layer)
            for i in range(self.pretraining_tp):
                output_tensor = output_tensor + F.linear(
                    context_layer[:, :, int(i * slices) : int((i + 1) * slices)],
                    self.dense.weight[:, int(i * slices) : int((i + 1) * slices)],
                )
        else:
            output_tensor = self.dense(context_layer)

        output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training)

        return output_tensor, attention_probs


class BloomMLP(nn.Module):
    def __init__(self, config: BloomConfig):
        super().__init__()
        hidden_size = config.hidden_size

        self.pretraining_tp = config.pretraining_tp
        self.slow_but_exact = config.slow_but_exact
        self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size)
        self.gelu_impl = BloomGelu()
        self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size)
        self.hidden_dropout = config.hidden_dropout

    def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states))

        if self.pretraining_tp > 1 and self.slow_but_exact:
            intermediate_output = torch.zeros_like(residual)
            slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp
            for i in range(self.pretraining_tp):
                intermediate_output = intermediate_output + F.linear(
                    hidden_states[:, :, int(i * slices) : int((i + 1) * slices)],
                    self.dense_4h_to_h.weight[:, int(i * slices) : int((i + 1) * slices)],
                )
        else:
            intermediate_output = self.dense_4h_to_h(hidden_states)

        output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training)

        return output


class BloomBlock(GradientCheckpointingLayer):
    def __init__(self, config: BloomConfig, layer_idx: int | None = None):
        super().__init__()
        hidden_size = config.hidden_size

        self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.num_heads = config.n_head
        self.self_attention = BloomAttention(config, layer_idx)
        self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        self.mlp = BloomMLP(config)

        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
        self.hidden_dropout = config.hidden_dropout

    def forward(
        self,
        hidden_states: torch.Tensor,
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Cache | None = None,
        use_cache: bool = False,
        output_attentions: bool = False,
        cache_position: torch.LongTensor | None = None,
    ):
        # hidden_states: [batch_size, seq_length, hidden_size]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)

        # Layer norm post the self attention.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        # Self attention.
        attention_output, attn_weights = self.self_attention(
            layernorm_output,
            residual,
            layer_past=layer_past,
            attention_mask=attention_mask,
            alibi=alibi,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )

        layernorm_output = self.post_attention_layernorm(attention_output)

        # Get residual
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = attention_output

        # MLP.
        output = self.mlp(layernorm_output, residual)

        return output, attn_weights


@auto_docstring
class BloomPreTrainedModel(PreTrainedModel):
    config: BloomConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["BloomBlock"]
    _skip_keys_device_placement = "past_key_values"
    _can_compile_fullgraph = True


@auto_docstring
class BloomModel(BloomPreTrainedModel):
    def __init__(self, config: BloomConfig):
        super().__init__(config)

        self.embed_dim = config.hidden_size
        self.num_heads = config.n_head

        # Embedding + LN Embedding
        self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim)
        self.word_embeddings_layernorm = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        # Transformer blocks
        self.h = nn.ModuleList([BloomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])

        # Final Layer Norm
        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def build_alibi_tensor(self, attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
        return build_alibi_tensor(attention_mask, num_heads, dtype)

    def get_input_embeddings(self):
        return self.word_embeddings

    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.word_embeddings = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.LongTensor | None = None,
    ) -> Union[tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        batch_size, seq_length, _ = inputs_embeds.shape
        past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        seq_length_with_past = seq_length + past_length
        if cache_position is None:
            cache_position = torch.arange(past_length, past_length + seq_length, device=inputs_embeds.device)

        hidden_states = self.word_embeddings_layernorm(inputs_embeds)

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        # Compute alibi tensor: check build_alibi_tensor documentation
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        alibi = self.build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=past_key_values,
                attention_mask=causal_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
                alibi=alibi,
                cache_position=cache_position,
            )

            hidden_states = outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        # Add last hidden state
        hidden_states = self.ln_f(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if is_flash_attention_requested(self.config):
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # For SDPA we rely on its `is_causal` argument when possible, instead of an explicit `attn_mask`.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal 4D mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, as required by the memory-efficient SDPA path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring(
    custom_intro="""
    The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "transformer.word_embeddings.weight"}

    def __init__(self, config: BloomConfig):
        super().__init__(config)
        self.transformer = BloomModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_output_embeddings(self, new_embeddings: torch.Tensor):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        use_cache=True,
        is_first_iteration=False,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            use_cache=use_cache,
            is_first_iteration=is_first_iteration,
            **kwargs,
        )

        # With a static cache the 2D attention mask has to be padded up to the full cache length so that the
        # alibi tensor and the mask stay aligned.
        if isinstance(past_key_values, StaticCache) and attention_mask is not None:
            target_length = past_key_values.get_max_cache_shape()
            batch_size, seq_length = attention_mask.shape
            diff = target_length - seq_length

            new_attn_mask = torch.zeros(batch_size, diff, device=attention_mask.device, dtype=attention_mask.dtype)
            attention_mask = torch.cat([attention_mask, new_attn_mask], dim=-1)
            model_inputs["attention_mask"] = attention_mask

        return model_inputs

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        lm_logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(lm_logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The Bloom Model transformer with a sequence classification head on top (linear layer).

    [`BloomForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class BloomForSequenceClassification(BloomPreTrainedModel):
    def __init__(self, config: BloomConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = BloomModel(config)
        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class BloomForTokenClassification(BloomPreTrainedModel):
    def __init__(self, config: BloomConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = BloomModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            batch_size, seq_length = labels.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
            )

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class BloomForQuestionAnswering(BloomPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = BloomModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        start_positions: torch.LongTensor | None = None,
        end_positions: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> Union[tuple, QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "BloomForCausalLM",
    "BloomModel",
    "BloomPreTrainedModel",
    "BloomForSequenceClassification",
    "BloomForTokenClassification",
    "BloomForQuestionAnswering",
]