
    i                         d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlmZ  ej@                  e!      Z" G d dejF                        Z$	 d;dejF                  dejJ                  dejJ                  dejJ                  dejJ                  dz  de&de&fdZ' G d dejF                        Z( G d dejF                        Z) G d d ejF                        Z* G d! d"ejF                        Z+ G d# d$ejF                        Z, G d% d&e      Z- G d' d(ejF                        Z.e G d) d*e             Z/e G d+ d,e/             Z0 G d- d.ejF                        Z1 G d/ d0ejF                        Z2e G d1 d2e/             Z3e ed34       G d5 d6e                    Z4 ed74       G d8 d9e/             Z5g d:Z6y)<zPyTorch Splinter model.    )Callable)	dataclassN)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputModelOutputQuestionAnsweringModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward)auto_docstringcan_return_tupleloggingtorch_compilable_check   )SplinterConfigc                        e Zd ZdZ fdZ	 	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dej                  dz  def
d	Z	 xZ
S )SplinterEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 |   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       y )N)padding_idxepsposition_idsr   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__s     x/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/splinter/modeling_splinter.pyr"   zSplinterEmbeddings.__init__,   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
    N	input_idstoken_type_idsr   inputs_embedsreturnc                    ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                  |      }||z   }| j                  |      }	||	z  }| j                  |      }| j                  |      }|S )Nr   r   dtypedevice)sizer   r2   zeroslongrB   r'   r+   r)   r,   r0   )
r6   r;   r<   r   r=   input_shape
seq_lengthr+   
embeddingsr)   s
             r9   forwardzSplinterEmbeddings.forward:   s      #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M $ : :> J"%::
"66|D))
^^J/
\\*-
r:   )NNNN)__name__
__module____qualname____doc__r"   r2   
LongTensorFloatTensortuplerI   __classcell__r8   s   @r9   r   r   )   sz    Q
  .2260426##d* ((4/ &&-	
 ((4/ 
r:   r   modulequerykeyvalueattention_maskscalingr0   c                    t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }	||	z   }t        j
                  j                  |dt         j                        j                  |j                        }t        j
                  j                  ||| j                        }t        j                  ||      }
|
j                  dd      j                         }
|
|fS )N   r   r   )dimrA   )ptrainingr   )r2   matmul	transposeshaper   
functionalsoftmaxfloat32torA   r0   r^   
contiguous)rS   rT   rU   rV   rW   rX   r0   kwargsattn_weightscausal_maskattn_outputs              r9   eager_attention_forwardrk   \   s     <<s}}Q':;gEL!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r:   c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	SplinterSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )r!   r"   r%   num_attention_headshasattr
ValueErrorr7   intattention_head_sizeall_head_sizer   LinearrT   rU   rV   r.   attention_probs_dropout_probr0   attention_dropoutrX   r5   s     r9   r"   zSplinterSelfAttention.__init__u   sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r:   Nhidden_statesrW   output_attentionsr>   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                  sdn| j                  | j                  d|\  }} |j                  g |d j!                         }|r||f}|S |f}|S )Nr   r   rZ           )r0   rX   )ra   ru   rT   viewr`   rU   rV   r   get_interfacer7   _attn_implementationrk   r^   ry   rX   reshaperf   )r6   rz   rW   r{   rg   rF   hidden_shapequery_states
key_statesvalue_statesattention_interfacerj   rh   outputss                 r9   rI   zSplinterSelfAttention.forward   sT    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFH1B;- JUr:   NFrJ   rK   rL   r"   r2   TensorrO   boolrP   rI   rQ   rR   s   @r9   rm   rm   t   sW    60 48).	|| ))D0  $;	 
u||	r:   rm   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )SplinterSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r!   r"   r   rw   r%   denser,   r-   r.   r/   r0   r5   s     r9   r"   zSplinterSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r:   rz   input_tensorr>   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   r0   r,   r6   rz   r   s      r9   rI   zSplinterSelfOutput.forward   7    

=1]3}|'CDr:   rJ   rK   rL   r"   r2   r   rI   rQ   rR   s   @r9   r   r      1    >U\\  RWR^R^ r:   r   c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	SplinterAttentionc                 b    t         |           t        |      | _        t	        |      | _        y r   )r!   r"   rm   r6   r   outputr5   s     r9   r"   zSplinterAttention.__init__   s&    )&1	(0r:   Nrz   rW   r{   r>   c                 n     | j                   |f||d|}| j                  |d   |      }|f|dd  z   }|S N)rW   r{   r   r   )r6   r   )r6   rz   rW   r{   rg   self_outputsattention_outputr   s           r9   rI   zSplinterAttention.forward   s\     !tyy
)/
 	
  ;;|AF#%QR(88r:   r   r   rR   s   @r9   r   r      sW    1 48).	|| ))D0  $;	 
u||	r:   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )SplinterIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r!   r"   r   rw   r%   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnr5   s     r9   r"   zSplinterIntermediate.__init__   s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r:   rz   r>   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )r6   rz   s     r9   rI   zSplinterIntermediate.forward   s&    

=100?r:   r   rR   s   @r9   r   r      s#    9U\\ ell r:   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )SplinterOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r!   r"   r   rw   r   r%   r   r,   r-   r.   r/   r0   r5   s     r9   r"   zSplinterOutput.__init__   s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r:   rz   r   r>   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r9   rI   zSplinterOutput.forward   r   r:   r   rR   s   @r9   r   r      r   r:   r   c            
            e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  deej
                     fdZ	d Z
 xZS )
SplinterLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
r!   r"   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r5   s     r9   r"   zSplinterLayer.__init__   sI    '-'E'E$*6208$V,r:   Nrz   rW   r{   r>   c                      | j                   |f||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S r   )r   r   feed_forward_chunkr   r   )	r6   rz   rW   r{   rg   self_attention_outputsr   r   layer_outputs	            r9   rI   zSplinterLayer.forward   s     "0"
)/"
 	"
 2!4(,0##T%A%A4CSCSUe
  /G+r:   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )r6   r   intermediate_outputr   s       r9   r   z SplinterLayer.feed_forward_chunk  s,    "//0@A{{#68HIr:   r   )rJ   rK   rL   r"   r2   r   rO   r   rP   rI   r   rQ   rR   s   @r9   r   r      s\    - 48).	|| ))D0  $;	 
u||	.r:   r   c                        e Zd Z fdZe	 	 	 	 d
dej                  dej                  dz  dedz  dedz  dedz  de	ej                     e
z  fd	       Z xZS )SplinterEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r!   r"   r7   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r6   r7   ir8   s      r9   r"   zSplinterEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#Nrz   rW   r{   output_hidden_statesreturn_dictr>   c                     |rdnd }|rdnd }t        | j                        D ])  \  }	}
|r||fz   } |
|||fi |}|d   }|s!||d   fz   }+ |r||fz   }t        |||      S )N r   r   last_hidden_staterz   
attentions)	enumerater   r   )r6   rz   rW   r{   r   r   rg   all_hidden_statesall_self_attentionsr   layer_modulelayer_outputss               r9   rI   zSplinterEncoder.forward"  s     #7BD$5b4(4 	POA|#$58H$H!(! 	M *!,M &9]1=M<O&O#	P   1]4D D++*
 	
r:   )NFFT)rJ   rK   rL   r"   r   r2   r   rO   r   rP   r   rI   rQ   rR   s   @r9   r   r     s    ,  48).,1#'"
||"
 ))D0"
  $;	"

 #Tk"
 D["
 
u||		."
 "
r:   r   c                   2     e Zd ZU eed<   dZdZ fdZ xZS )SplinterPreTrainedModelr7   splinterTc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )Nr   r   )r!   _init_weightsr   r   initcopy_r   r2   r3   ra   r4   )r6   rS   r8   s     r9   r   z%SplinterPreTrainedModel._init_weightsN  s[    f%f01JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 2r:   )	rJ   rK   rL   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   rQ   rR   s   @r9   r   r   H  s!    "&*#i ir:   r   c                       e Zd ZdZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dedz  dedz  dedz  deez  fd              Z xZS )SplinterModela2  
    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
    need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r   )r!   r"   r7   r   rH   r   encoder	post_initr5   s     r9   r"   zSplinterModel.__init__\  s;     ,V4&v. 	r:   c                 .    | j                   j                  S r   rH   r'   )r6   s    r9   get_input_embeddingsz"SplinterModel.get_input_embeddingsf  s    ...r:   c                 &    || j                   _        y r   r   )r6   rV   s     r9   set_input_embeddingsz"SplinterModel.set_input_embeddingsi  s    */'r:   Nr;   rW   r<   r   r=   r{   r   r   r>   c	                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         dd }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|&t        j                  |
t        j                  |      }| j                  ||
      }| j                  ||||      }| j                  ||||d	      }|d
   }t        ||j                   |j"                        S )a  
        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)rB   r@   )r;   r   r<   r=   T)rW   r{   r   r   r   r   )r7   r{   r   use_return_dictrs   %warn_if_padding_and_no_attention_maskrC   rB   r2   onesrD   rE   get_extended_attention_maskrH   r   r   rz   r   )r6   r;   rW   r<   r   r=   r{   r   r   rg   rF   
batch_sizerG   rB   extended_attention_maskembedding_outputencoder_outputssequence_outputs                     r9   rI   zSplinterModel.forwardl  s   : 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!"[[EJJvVN 150P0PQ_al0m??%)'	 + 
 ,,2/!5 ' 
 *!,-)77&11
 	
r:   )NNNNNNNN)rJ   rK   rL   rM   r"   r   r   r   r   r2   r   r   rP   r   rI   rQ   rR   s   @r9   r   r   T  s    /0  *..2.2,0-1)-,0#'J
<<$&J
 t+J
 t+	J

 llT)J
 ||d*J
  $;J
 #TkJ
 D[J
 
	 J
  J
r:   r   c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )SplinterFullyConnectedLayerc                     t         |           || _        || _        t	        j
                  | j                  | j                        | _        t        |   | _        t	        j                  | j                        | _	        y r   )
r!   r"   	input_dim
output_dimr   rw   r   r	   act_fnr,   )r6   r   r   r   r8   s       r9   r"   z$SplinterFullyConnectedLayer.__init__  sV    "$YYt~~t?
Z(doo6r:   inputsr>   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r,   )r6   r   rz   s      r9   rI   z#SplinterFullyConnectedLayer.forward  s2    

6*M2}5r:   )gelur   rR   s   @r9   r   r     s#    7ell u|| r:   r   c                   (     e Zd ZdZ fdZd Z xZS )QuestionAwareSpanSelectionHeadzf
    Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

    c                    t         |           t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        j                  |j                  |j                  d      | _
        t        j                  |j                  |j                  d      | _        y )NF)bias)r!   r"   r   r%   query_start_transformquery_end_transformstart_transformend_transformr   rw   start_classifierend_classifierr5   s     r9   r"   z'QuestionAwareSpanSelectionHead.__init__  s    %@ASASU[UgUg%h"#>v?Q?QSYSeSe#f :6;M;MvOaOab89K9KVM_M_` "		&*<*<f>P>PW\ ] ii(:(:F<N<NUZ[r:   c                    |j                         \  }}}|j                  d      j                  dd|      }t        j                  |d|      }| j                  |      }| j                  |      }| j                  |      }	| j                  |      }
| j                  |      }|	j                  ddd      }	t        j                  ||	      }| j                  |      }|
j                  ddd      }
t        j                  ||
      }||fS )Nr   r   )r\   indexr   rZ   )rC   	unsqueezerepeatr2   gatherr   r   r   r   r   permuter_   r   )r6   r   	positions_r\   r  gathered_repsquery_start_repsquery_end_reps
start_repsend_repsrz   start_logits
end_logitss                 r9   rI   z&QuestionAwareSpanSelectionHead.forward  s    KKM	1c##B'..q!S9V%@55mD11-@))&1
%%f---.>?''1a0
||M:>++N;##Aq!,\\-:
Z''r:   )rJ   rK   rL   rM   r"   rI   rQ   rR   s   @r9   r   r     s    
	\(r:   r   c                   `    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  dej                  dz  de	e
z  fd       Z xZS )SplinterForQuestionAnsweringc                     t         |   |       t        |      | _        t	        |      | _        |j                  | _        | j                          y r   r!   r"   r   r   r   splinter_qassquestion_token_idr   r5   s     r9   r"   z%SplinterForQuestionAnswering.__init__  C     %f-;FC!'!9!9 	r:   Nr;   rW   r<   r   r=   start_positionsend_positionsr{   r   r   question_positionsr>   c           
         |
|
n| j                   j                  }
d}||Dt        j                  t        j                  || j
                        j                         d      }nJt        j                  |j                  d      t        j                  |j                  |j                        }|j                  d      }d}| j                  |||||||	|
      }|d   }| j                  ||      \  }}|r"|j                  d	      |j                  d	      }}|d|d	|z
  t        j                   |j"                        j$                  z  z   }|d	|z
  t        j                   |j"                        j$                  z  z   }d}||t'        |j                               d	kD  r|j                  d      }t'        |j                               d	kD  r|j                  d      }|j                  d	      }|j)                  d|       |j)                  d|       t+        |
      } |||      } |||      }||z   dz  }|
s||f|d	d z   }||f|z   S |S t-        ||||j.                  |j0                        S )a  
        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NFr   )r\   r   )rA   layoutrB   TrW   r<   r   r=   r{   r   r   r   ignore_indexrZ   lossr  r  rz   r   )r7   r   r2   argmaxeqr  rt   rD   rC   rE   r  rB   r  r   r  squeezefinforA   minlenclamp_r   r   rz   r   )r6   r;   rW   r<   r   r=   r  r  r{   r   r   r  rg   question_positions_were_none"question_position_for_each_exampler   r   r  r  
total_lossignored_indexloss_fct
start_lossend_lossr   s                            r9   rI   z$SplinterForQuestionAnswering.forward  s   H &1%<k$++B]B]',$%$5:\\XXi)?)?@EEGR62 6;[[!&&q)MDXDXanauau62 "D!M!Mb!Q+/(--))%'/!5#   	
 "!*#'#5#5oGY#Z j''3';';A'>
@R@RST@U*L%'1~+=\M_M_A`AdAd*ddL#q>'9U[[IYIY=Z=^=^&^^J
&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r:   NNNNNNNNNNN)rJ   rK   rL   r"   r   r2   r   rN   r   rP   r   rI   rQ   rR   s   @r9   r  r    s#     *..2.2,0-13715)-,0#'6:b
<<$&b
 t+b
 t+	b

 llT)b
 ||d*b
 ))D0b
 ''$.b
  $;b
 #Tkb
 D[b
 ",,t3b
 
-	-b
 b
r:   r  zB
    Class for outputs of Splinter as a span selection model.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   y)SplinterForPreTrainingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-start scores (before SoftMax).
    end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-end scores (before SoftMax).
    Nr   r  r  rz   r   )rJ   rK   rL   rM   r   r2   rO   r   r  r  rz   rP   r   r   r:   r9   r2  r2  e  s|     &*D%

d
")-1L%##d*1+/J!!D(/59M5**+d2926Je''(4/6r:   r2  z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  dej                  dz  de	e
z  fd       Zdej                  dej                  fdZ xZS )SplinterForPreTrainingc                     t         |   |       t        |      | _        t	        |      | _        |j                  | _        | j                          y r   r  r5   s     r9   r"   zSplinterForPreTraining.__init__  r  r:   Nr;   rW   r<   r   r=   r  r  r{   r   r   r  r>   c           
      `   |
|
n| j                   j                  }
|||t        d      ||t        d      || j                  |      }| j	                  |||||||	|
      }|d   }|j                         \  }}}| j                  ||      \  }}|j                  d      }||j                  d      j                  |||      }|d|z
  t        j                  |j                        j                  z  z   }|d|z
  t        j                  |j                        j                  z  z   }d}|||j                  dt        d|dz
               |j                  dt        d|dz
               t        | j                   j                         } ||j#                  ||z  |      |j#                  ||z              } ||j#                  ||z  |      |j#                  ||z              }||z   dz  }|
s||f|dd z   }||f|z   S |S t%        ||||j&                  |j(                  	      S )
a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NzCquestion_positions must be specified in order to calculate the lossz>question_positions must be specified when input_embeds is usedr  r   r   r  rZ   r  )r7   r   	TypeError_prepare_question_positionsr   rC   r  r  r4   r2   r$  rA   r%  r'  maxr   r&   r~   r2  rz   r   )r6   r;   rW   r<   r   r=   r  r  r{   r   r   r  rg   r   r   r   sequence_lengthr\   r  r  num_questions attention_mask_for_each_questionr*  r,  r-  r.  r   s                              r9   rI   zSplinterForPreTraining.forward  s   n &1%<k$++B]B]%/*E-Jcabb'I,=\]]'!%!A!A)!L--))%'/!5#   	
 "!*+:+?+?+A(
OS#'#5#5oGY#Z j*//2%/=/G/G/J/Q/QM?0, (1/O+OSXS^S^_k_q_qSrSvSv*vvL#q+K'Ku{{[e[k[kOlOpOp&ppJ
&=+D""1c!_q-@&AB  C?Q+>$?@ (T[[5M5MNH!!!*}"<oN$$Z-%?@J  
] :OL"":#=>H %x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r:   c                 4   t        j                  || j                  j                  k(        \  }}t        j                  |      }t        j
                  |j                  d      |j                         f| j                  j                  t         j                  |j                        }t        |j                  d      |j                  d      k(  d       t        j                  |D cg c]  }t        j                  |       c}      }||||f<   |S c c}w )Nr   r@   z?All samples in the batch must have at least one question token.)r2   wherer7   r  bincountfullrC   r9  r&   rE   rB   r   catr3   )r6   r;   rowsflat_positionsr;  r  ncolss           r9   r8  z2SplinterForPreTraining._prepare_question_positions
  s    ${{98U8U+UVnt,JJ^^A 1 1 34KK$$**##	
	 	q!Y^^A%66M	
 yy=Aa%,,q/AB .	$* Bs   )Dr/  )rJ   rK   rL   r"   r   r2   r   rN   r   rP   r2  rI   r8  rQ   rR   s   @r9   r4  r4  |  s?     *..2.2,0-13715)-,0#'6:y
<<$&y
 t+y
 t+	y

 llT)y
 ||d*y
 ))D0y
 ''$.y
  $;y
 #Tky
 D[y
 ",,t3y
 
-	-y
 y
vU\\ ell r:   r4  )r  r4  r   r   r   )r}   )7rM   collections.abcr   dataclassesr   r2   r   torch.nnr    r   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   utilsr   r   r   r   configuration_splinterr   
get_loggerrJ   loggerModuler   r   floatrk   rm   r   r   r   r   r   r   r   r   r   r   r  r2  r4  __all__r   r:   r9   <module>rV     s    $ !   % & ! 9 Z Z F 6  3 
		H	%/ /t %II%<<% 
% <<	%
 LL4'% % %05BII 5r 		 2299  RYY #. #N*
bii *
Z io i i c
+ c
 c
L")) $#(RYY #(L n
#: n
 n
b 
7; 7 7" V4 VVrr:   