
    iz                        d Z ddlZddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&  e!jN                  e(      Z)d Z*d Z+d Z,d Z-d Z. G d dej
                  j^                        Z0 G d dej^                        Z1 G d dej^                        Z2	 	 dGdej^                  dejf                  dejf                  dejf                  d ejf                  dz  d!e4dz  d"e4d#ee   fd$Z5 G d% d&ej^                        Z6 G d' d(ej^                        Z7 G d) d*ej^                        Z8 G d+ d,ej^                        Z9 G d- d.ej^                        Z: G d/ d0e      Z; G d1 d2ej^                        Z< G d3 d4ej^                        Z=e G d5 d6e             Z>e G d7 d8e>             Z?e G d9 d:e>             Z@ G d; d<ej^                        ZA ed=>       G d? d@e>             ZBe G dA dBe>             ZC G dC dDej^                        ZDdE ZEg dFZFy)HzPyTorch ESM model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)OutputRecordercheck_model_inputs   )	EsmConfigc                 b    | j                  dd      \  }}t        j                  | |fd      S )N   dim)chunktorchcat)xx1x2s      n/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/esm/modeling_esm.pyrotate_halfr)   ,   s/    WWQBWFB99rc2YB''    c                     |d d d d d | j                   d   d d f   }|d d d d d | j                   d   d d f   }| |z  t        |       |z  z   S )N)shaper)   )r%   cossins      r(   apply_rotary_pos_embr0   1   sX    
aMaggbkM1$
%C
aMaggbkM1$
%CGA,--r*   c                 j    | dz  dt        j                  | t        j                  d      z        z   z  S )zo
    This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
    g      ?      ?g       @)r#   erfmathsqrtr%   s    r(   gelur7   8   s.     s7cEIIa$))C.&899::r*   c                 ,    | | j                  dd      z   S )zJMake layer symmetric in final two dimensions, used for contact prediction.r   r,   )	transposer6   s    r(   
symmetrizer:   ?   s    q{{2r"""r*   c                     | j                  dd      }| j                  dd      }| j                  dd      }||z  }|j                  |       | |z
  }|S )z=Perform average product correct, used for contact prediction.r   T)keepdimsr,   )r   r,   )sumdiv_)r%   a1a2a12avg
normalizeds         r(   average_product_correctrD   D   s[    	
rD	!B	
rD	!B
%%4%
(C
r'CHHSMSJr*   c                        e Zd ZU dZej
                  ed<   def fdZd
dZ	dej
                  dej
                  de
ej
                  ej
                  f   fd	Z xZS )RotaryEmbeddingz
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    inv_freqr!   c                     t         |           || _        ddt        j                  d|dt        j
                        j                         |z  z  z  }| j                  d|       d | _        d | _	        d | _
        y )Nr2   '  r   r   dtyperG   )super__init__r!   r#   arangeint64floatregister_buffer_seq_len_cached_cos_cached_sin_cached)selfr!   rG   	__class__s      r(   rM   zRotaryEmbedding.__init__Y   sn    %ELLC%++$N$T$T$VY\$\]^Z2#r*   c                 t   |j                   |   }|| j                  k7  s#| j                  j                  |j                  k7  r|| _        t	        j
                  |j                   |   |j                        j                  | j                        }t	        j                  || j                        }t	        j                  ||fd      j                  |j                        }|j                         d d d d d d f   | _        |j                         d d d d d d f   | _        | j                  | j                  fS )Ndevicer   r    )r-   rR   rS   rY   r#   rN   type_asrG   outerr$   tor.   r/   rT   )rU   r%   seq_dimensionseq_lentfreqsembs          r(   _update_cos_sin_tablesz&RotaryEmbedding._update_cos_sin_tablesd   s    ''-( d***d.>.>.E.E.Q#*D QWW]3AHHEMMdmm\AKK4==1E))UEN366qxx@C"wwytQ)9:D"wwytQ)9:D!1!111r*   qkreturnc                 .   | j                  |d      \  | _        | _        t        || j                  | j                        j	                  |j
                        t        || j                  | j                        j	                  |j
                        fS )Nr,   )r]   rJ   )rb   rS   rT   r0   r\   rK   )rU   rc   rd   s      r(   forwardzRotaryEmbedding.forwardt   s    -1-H-HZ\-H-]*$* !D$4$4d6F6FGJJQRQXQXJY D$4$4d6F6FGJJQRQXQXJY
 	
r*   )r   )__name__
__module____qualname____doc__r#   Tensor__annotations__intrM   rb   tuplerg   __classcell__rV   s   @r(   rF   rF   P   sY     ll	 C 	 2 
 
%,, 
5u||A[;\ 
r*   rF   c                   8     e Zd ZdZ	 	 ddedef fdZd Z xZS )EsmContactPredictionHeadzWPerforms symmetrization, apc, and computes a logistic regression on the output featuresin_featureseos_idxc                     t         |           || _        || _        t	        j
                  |d|      | _        t	        j                         | _        y )Nr   )	rL   rM   rt   ru   r   Linear
regressionSigmoid
activation)rU   rt   biasru   rV   s       r(   rM   z!EsmContactPredictionHead.__init__   s@     	&))KD9**,r*   c                 X   |j                  | j                        j                  |      }|j                  d      |j                  d      z  }||d d d d d d d d f   z  }|dd dd df   }|ddd dd f   }|j	                         \  }}}}}|j                  |||z  ||      }|j                  | j                  j                  j                        }t        t        |            }|j                  dddd      }| j                  | j                  |      j                  d            S )Nr   r   .r   r   r   )neru   r\   	unsqueezesizeviewrx   weightrY   rD   r:   permuterz   squeeze)	rU   tokens
attentionseos_mask
batch_sizelayersheadsseqlen_s	            r(   rg   z EsmContactPredictionHead.forward   s!   99T\\*--j9%%a(8+=+=a+@@(1dD!Q+>"??
SbS#2#.
QR,
/9/@,
FE61__Z%P
  ]]OO""))

 -Z
-CD
''1a3
tz:BB1EFFr*   )Tr   )rh   ri   rj   rk   rn   rM   rg   rp   rq   s   @r(   rs   rs   }   s+    a
 	
'
' 	
'Gr*   rs   c                   8     e Zd ZdZ fdZ	 	 	 	 ddZd Z xZS )EsmEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        |j                  r1t        j                  |j
                  |j                        | _        nd | _        t        j                  |j                        | _        t        |dd      | _        | j#                  dt%        j&                  |j(                        j+                  d      d       |j                  | _        | j                   dk(  r;t        j                  |j(                  |j
                  | j,                        | _        |j0                  | _        |j2                  | _        y )	N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   r   F)
persistent)rL   rM   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr   rQ   r#   rN   max_position_embeddingsexpandr   position_embeddingstoken_dropoutmask_token_idrU   configrV   s     r(   rM   zEsmEmbeddings.__init__   s*   !||F,=,=v?Q?Q_e_r_rs'' ll6+=+=6CXCXYDO"DOzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 "..'':5')||..0B0BPTP`P`(D$ $11#11r*   c                    |*|t        || j                        }n| j                  |      }|| j                  |      }|}| j                  r||j                  || j                  k(  j                  d      d      }d}||j                  d      n|j                  d   }|| j                  k(  j                  d      j                         |z  }|d|z
  z  d|z
  d d d d f   z  j                  |j                        }| j                  dk(  r| j                  |      }	||	z   }| j                  | j                  |      }|-||j                  d      z  j                  |j                        }|S )Nr           gQ?r   r   )"create_position_ids_from_input_idsr   &create_position_ids_from_inputs_embedsr   r   masked_fillr   r~   r=   r-   rP   r\   rK   r   r   r   )
rU   	input_idsattention_maskr   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedr   s
             r(   rg   zEsmEmbeddings.forward   s    $A)TM]M]^#JJ=Y  00;M #
 )"7#//d>P>P1P0[0[\^0_adeJ)4B4N.,,R0T]TcTcdeTfK#,0B0B#B"G"G"K"Q"Q"SVa"a$,<(<=EXAXZ[]acgZg@hhll  J '':5"&":":<"H#&99J??&4J%$~'?'?'CCGG
HXHXYJ r*   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr   r   )rK   rY   r   )r   r#   rN   r   longrY   r~   r   )rU   r   input_shapesequence_lengthr   s        r(   r   z4EsmEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r*   )NNNN)rh   ri   rj   rk   rM   rg   r   rp   rq   s   @r(   r   r      s&    22 /b=r*   r   modulequerykeyvaluer   scalingr   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }||z   }t
        j                  j                  |d      }t
        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )	Nr         r   r   r,   r    )ptrainingr   )r   r#   matmulr9   r-   r   
functionalsoftmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r(   eager_attention_forwardr      s     **R.D( <<s}}Q':;gEL!'1a399R=(@A#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r*   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dee   de	ej
                     fd	Z
 xZS )EsmSelfAttentionNc                    t         |           || _        |j                  |j                  z  dk7  r2t        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        |j                  | _        d | _        |xs t%        |dd      | _        | j&                  dk(  rt)        | j                  	      | _        d
| _        |j,                  | _        || _        | j,                  xr | | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   rotaryr    r2   )rL   rM   r   r   num_attention_headshasattr
ValueErrorrn   attention_head_sizeall_head_sizer   rw   r   r   r   attention_probs_dropout_probr   rotary_embeddingsr   r   rF   r   
is_decoder	layer_idx	is_causal)rU   r   r   r   is_cross_attentionrV   s        r(   rM   zEsmSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
::!%'> (
'-zC
$ ''83%49Q9Q%RD" ++"C1C-Cr*   hidden_statesr   encoder_hidden_statesencoder_attention_maskr   re   c                    |j                   d d \  }}||d| j                  f}| j                  |      j                  |      j	                  dd      }	|d u}
|
r|n|}|
r|n|}| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|	| j                  dz  z  }	| j                  dk(  r| j                  |	|      \  }	}t        j                  | j                  j                  t              } || |	|||f| j                  sdn| j                  | j                   d|\  }}|j#                  ||d      j%                         }||fS )Nr   r   r   r   r   r   )r   r   )r-   r   r   r   r9   r   r   r   r   r   get_interfacer   _attn_implementationr   r   r   r   reshaper   )rU   r   r   r   r   r   r   
seq_lengthhidden_shapequery_layerr   current_states	key_layervalue_layerattention_interfacer   r   s                    r(   rg   zEsmSelfAttention.forward>  s    "/!4!4Sb!9
J"JD4L4LMjj/44\BLLQPQR2$>2D.-3E/>HH^,11,?II!QO	jj055lCMMaQRS "D$<$<d$BB''83%)%;%;K%S"K(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "))*j"EPPRL((r*   )NNFNNN)rh   ri   rj   rM   r#   rl   FloatTensorr   r   ro   rg   rp   rq   s   @r(   r   r     s    DF 48:>;?,)||,) ))D0,)  %0047	,)
 !& 1 1D 8,) +,,) 
u||	,)r*   r   c                   $     e Zd Z fdZd Z xZS )EsmSelfOutputc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	rL   rM   r   rw   r   denser   r   r   r   s     r(   rM   zEsmSelfOutput.__init__n  sB    YYv1163E3EF
zz&"<"<=r*   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   rU   r   input_tensors      r(   rg   zEsmSelfOutput.forwards  .    

=1]3%4r*   rh   ri   rj   rM   rg   rp   rq   s   @r(   r   r   m      >
r*   r   c                   :     e Zd Zd fd	Z	 	 	 ddee   fdZ xZS )EsmAttentionc                     t         |           t        |||      | _        t	        |      | _        t        j                  |j                  |j                        | _        y )N)r   r   r   )
rL   rM   r   rU   r   outputr   r   r   r   )rU   r   r   r   rV   s       r(   rM   zEsmAttention.__init__{  sI    $VyUgh	#F+f&8&8f>S>STr*   r   c                     | j                  |      } | j                  |f|||d|\  }}| j                  ||      }|S )Nr   r   r   )r   rU   r   )	rU   r   r   r   r   r   hidden_states_lnr   r   s	            r(   rg   zEsmAttention.forward  sZ      >>-8"
)"7#9	

 
Q kk+}=r*   )NFr   )rh   ri   rj   rM   r   r   rg   rp   rq   s   @r(   r   r   z  s)    U "# +,r*   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )EsmIntermediatec                     t         |           t        j                  |j                  |j
                        | _        y r   )rL   rM   r   rw   r   intermediate_sizer   r   s     r(   rM   zEsmIntermediate.__init__  s,    YYv1163K3KL
r*   r   re   c                 >    | j                  |      }t        |      }|S r   )r   r7   )rU   r   s     r(   rg   zEsmIntermediate.forward  s     

=1]+r*   rh   ri   rj   rM   r#   rl   rg   rp   rq   s   @r(   r  r    s$    MU\\ ell r*   r  c                   $     e Zd Z fdZd Z xZS )	EsmOutputc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
rL   rM   r   rw   r  r   r   r   r   r   r   s     r(   rM   zEsmOutput.__init__  sB    YYv779K9KL
zz&"<"<=r*   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r(   rg   zEsmOutput.forward  r   r*   r   rq   s   @r(   r  r    r   r*   r  c                   >     e Zd Z fdZ	 	 	 ddee   fdZd Z xZS )EsmLayerc                    t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        t        j                  |j                   |j"                        | _        y )Nr   z> should be used as a decoder model if cross attention is addedT)r   r   )rL   rM   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionRuntimeErrorcrossattentionr  intermediater  r   r   r   r   r   r   s     r(   rM   zEsmLayer.__init__  s    '-'E'E$%f- ++#)#=#= ##??"dV+i#jkk".v$"OD+F3'f&8&8f>S>STr*   r   c                      | j                   |fd|i|}| j                  r4|2t        | d      st        d|  d       | j                  |f|||d|}| j                  |      }|S )Nr   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )r  r   r   AttributeErrorr  feed_forward_chunk)rU   r   r   r   r   r   attention_outputlayer_outputs           r(   rg   zEsmLayer.forward  s     *4>>
)
 
 ??4@4!12$=dV D` ` 
  3t22  -&;'=	 
   ../?@r*   c                 n    | j                  |      }| j                  |      }| j                  ||      }|S r   )r   r  r   )rU   r  attention_output_lnintermediate_outputr  s        r(   r  zEsmLayer.feed_forward_chunk  s<    "nn-=>"//0CD{{#68HIr*   r   )	rh   ri   rj   rM   r   r   rg   r  rp   rq   s   @r(   r  r    s/    U$ "# +,@r*   r  c                   B     e Zd Z fdZe	 	 	 ddee   fd       Z xZS )
EsmEncoderc                 0   t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t        j                  |j                  |j                        | _        d| _        y c c}w )Nr   F)rL   rM   r   r   
ModuleListrangenum_hidden_layersr  layerr   r   r   emb_layer_norm_aftergradient_checkpointing)rU   r   r   rV   s      r(   rM   zEsmEncoder.__init__  sm    ]]eFD\D\>]#^HV$4#^_
$&LL1C1CI^I^$_!&+# $_s   Br   c                     t        | j                        D ]  \  }} ||f|||d|} | j                  r| j                  |      }t        |      S )Nr   )last_hidden_state)	enumerater"  r#  r   )rU   r   r   r   r   r   ilayer_modules           r(   rg   zEsmEncoder.forward  sk      )4 	OA|(-&;'=	
 M	 $$ 55mDM1MRRr*   r   )	rh   ri   rj   rM   r   r   r   rg   rp   rq   s   @r(   r  r    s:    ,  "#S +,S Sr*   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	EsmPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rL   rM   r   rw   r   r   Tanhrz   r   s     r(   rM   zEsmPooler.__init__  s9    YYv1163E3EF
'')r*   r   re   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   rz   )rU   r   first_token_tensorpooled_outputs       r(   rg   zEsmPooler.forward  s6     +1a40

#566r*   r  rq   s   @r(   r+  r+    s#    $
U\\ ell r*   r+  c                        e Zd ZU eed<   dZdZdZg dZdgZ	dZ
dZdZdZe eedd	      g eedd
	      gdZ ej&                          fd       Zd Z xZS )EsmPreTrainedModelr   esmTF)r  #EsmFoldTriangularSelfAttentionBlockr   zposition_embeddings.weightr   r  )index
layer_namer  )r   r   cross_attentionsc                 ^   t         |   |       t        |t              r t	        j
                  |j                         yt        |t              rZt	        j                  |j                  t        j                  |j                  j                  d         j                  d             yt        |t              rsddt        j                  d|j                  dt        j                         j#                         |j                  z  z  z  }t	        j                  |j$                  |       yy)	zInitialize the weightsr   r   r2   rI   r   r   rJ   N)rL   _init_weights
isinstance	EsmLMHeadinitzeros_r{   r   copy_r   r#   rN   r-   r   rF   r!   rO   rP   rG   )rU   r   rG   rV   s      r(   r:  z EsmPreTrainedModel._init_weights*  s     	f%fi(KK$.JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh0eQ

AU[[(Y(_(_(adjdndn(nopHJJv1 1r*   c                      y r    rU   s    r(   get_output_embeddingsz(EsmPreTrainedModel.get_output_embeddings6  s     r*   )rh   ri   rj   r   rm   base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr  r   r   _can_record_outputsr#   no_gradr:  rC  rp   rq   s   @r(   r3  r3    s    &*#\*F)G&N"& "%&6aKXY+1AQR
 U]]_	2 	2r*   r3  c                   H    e Zd ZdZd fd	Zd Zd Zee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Zd Zd Z xZS )EsmModela  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    c                    t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        t        |j                  |j                  z  d      | _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        NT)rt   r{   )rL   rM   r   r   r   r  encoderr+  poolerrs   r!  r   contact_head	post_init)rU   r   add_pooling_layerrV   s      r(   rM   zEsmModel.__init__J  sq    
 	 '/!&)+<i'$40063M3MMTX

 	r*   c                 .    | j                   j                  S r   r   r   rB  s    r(   get_input_embeddingszEsmModel.get_input_embeddings^  s    ...r*   c                 &    || j                   _        y r   rX  )rU   r   s     r(   set_input_embeddingszEsmModel.set_input_embeddingsa  s    */'r*   Nr   r   r   r   r   r   r   re   c           
      p   |du |duz  rt        d      || j                  ||      }| j                  ||||t        j                  |j
                  d   |j                        d      \  }} | j                  |f|||d|}|d   }	| j                  | j                  |	      nd}
t        |	|
	      S )
aV  
        input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length), hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r   rX   )r   r   embedding_outputr   cache_positionpast_key_valuesr   r   )r&  pooler_output)
r   r   _create_attention_masksr#   rN   r-   rY   rR  rS  r   )rU   r   r   r   r   r   r   r   encoder_outputssequence_outputr1  s              r(   rg   zEsmModel.forwardd  s    < -t";<YZZ  OO#) , M
 261M1M)#9*"7 <<(;(;A(>}G[G[\  2N 2
.. '$,,
)"7#9	

 
 *!,8<8OO4UY;-'
 	
r*   c                     | j                   j                  rt        | j                   ||||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)r   input_embedsr   r^  r_  )r   re  r   )r   re  r   r   )r   r   r   r
   )rU   r   r   r]  r   r^  r_  s          r(   ra  z EsmModel._create_attention_masks  sx     ;;!!/{{--- /N 7{{--N "-%>{{-5&;	&" 555r*   c                 H    | ||dd      j                   }t        j                  |d      }||j                  d      j                  d      j                  d      z  }||j                  d      j                  d      j                  d      z  }| j	                  ||      S )NT)r   return_dictoutput_attentionsr   r    r   r      )r   r#   stackr~   rT  )rU   r   r   attnss       r(   predict_contactszEsmModel.predict_contacts  s    VN`deppEq)
 	))!,66q9CCAFF))!,66q9CCAFF  //r*   )T)NNNNNN)rh   ri   rj   rk   rM   rY  r[  r   r   r#   rl   r   r   ro   r   rg   ra  rl  rp   rq   s   @r(   rP  rP  <  s    
(/0  *..2,0-1596:<
<<$&<
 t+<
 llT)	<

 ||d*<
  %||d2<
 !&t 3<
 +,<
 
u||	K	K<
  <
~ 6D	0r*   rP  c                   J    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   deez  fd              Zd Z xZS )EsmForMaskedLMzlm_head.decoder.weightz%esm.embeddings.word_embeddings.weightc                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzjIf you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.FrV  )
rL   rM   r   loggerwarningrP  r4  r<  lm_headrU  r   s     r(   rM   zEsmForMaskedLM.__init__  sP     NN1
 Fe< (r*   c                 .    | j                   j                  S r   rs  decoderrB  s    r(   rC  z$EsmForMaskedLM.get_output_embeddings  s    ||###r*   c                 &    || j                   _        y r   ru  )rU   new_embeddingss     r(   set_output_embeddingsz$EsmForMaskedLM.set_output_embeddings  s    -r*   Nr   r   r   r   r   r   labelsr   re   c           	      p    | j                   |f|||||d|}	|	d   }
| j                  |
      }d}|at               }|j                  |j                        } ||j                  d| j                  j                        |j                  d            }t        |||	j                  |	j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        )r   r   r   r   r   r   Nr   losslogitsr   r   )r4  rs  r   r\   rY   r   r   r   r   r   r   )rU   r   r   r   r   r   r   rz  r   outputsrc  prediction_scoresmasked_lm_lossloss_fcts                 r(   rg   zEsmForMaskedLM.forward  s    ( $((
)%'"7#9
 
 "!* LL9')HYY0778F%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
r*   c                 <    | j                   j                  ||      S )N)r   )r4  rl  )rU   r   r   s      r(   rl  zEsmForMaskedLM.predict_contacts  s    xx(((OOr*   )NNNNNNN)rh   ri   rj   _tied_weights_keysrM   rC  ry  r   r   r#   
LongTensorrl   r   r   r   ro   r   rg   rl  rp   rq   s   @r(   rn  rn    s   24[\$.  .2.20426:>6:*.*
##d**
 t+*
 &&-	*

 ((4/*
  %0047*
 !&t 3*
   4'*
 +,*
 
	*
  *
XPr*   rn  c                   (     e Zd ZdZ fdZd Z xZS )r<  z&ESM Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                  d      | _
        t        j                  t        j                  |j                              | _        y )Nr   F)r{   )rL   rM   r   rw   r   r   r   r   r   r   rv  	Parameterr#   zerosr{   r   s     r(   rM   zEsmLMHead.__init__   s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FUSLLV->->!?@	r*   c                     | j                  |      }t        |      }| j                  |      }| j                  |      | j                  z   }|S r   )r   r7   r   rv  r{   rU   featuresr   r%   s       r(   rg   zEsmLMHead.forward(  sD    JJx GOOA LLOdii'r*   rh   ri   rj   rk   rM   rg   rp   rq   s   @r(   r<  r<    s    0Ar*   r<  z
    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                        e Zd Z fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  de
e   d	eez  fd
              Z xZS )EsmForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFrp  )	rL   rM   
num_labelsr   rP  r4  EsmClassificationHead
classifierrU  r   s     r(   rM   z%EsmForSequenceClassification.__init__9  sH      ++Fe</7r*   Nr   r   r   r   rz  r   re   c                     | j                   |f|||d|}|d   }| j                  |      }	d}
||j                  |	j                        }| j                  j
                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j
                  dk(  rIt               }| j                  dk(  r& ||	j                         |j                               }
n ||	|      }
n| j                  j
                  dk(  r=t               } ||	j                  d| j                        |j                  d            }
n,| j                  j
                  dk(  rt               } ||	|      }
t!        |
|	|j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r   r   r   r   Nr   rx   single_label_classificationmulti_label_classificationr   r|  )r4  r  r\   rY   r   problem_typer  rK   r#   r   rn   r   r   r   r   r   r   r   r   rU   r   r   r   r   rz  r   r  rc  r~  r}  r  s               r(   rg   z$EsmForSequenceClassification.forwardC  s   $ $((
)%'	

 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r*   NNNNN)rh   ri   rj   rM   r   r   r#   r  rl   r   r   r   ro   r   rg   rp   rq   s   @r(   r  r  2  s      .2.20426*.8
##d*8
 t+8
 &&-	8

 ((4/8
   4'8
 +,8
 
)	)8
  8
r*   r  c                        e Zd Z fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  de
e   d	eez  fd
              Z xZS )EsmForTokenClassificationc                 0   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r  )rL   rM   r  rP  r4  r   r   r   r   rw   r   r  rU  r   s     r(   rM   z"EsmForTokenClassification.__init__  si      ++Fe<zz&"<"<=))F$6$68I8IJr*   Nr   r   r   r   rz  r   re   c                 z    | j                   |f|||d|}|d   }| j                  |      }| j                  |      }	d}
|Wt               }|j	                  |	j
                        } ||	j                  d| j                        |j                  d            }
t        |
|	|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r  r   Nr   r|  )r4  r   r  r   r\   rY   r   r  r   r   r   r  s               r(   rg   z!EsmForTokenClassification.forward  s      $((
)%'	

 
 "!*,,71')HYYv}}-FFKKDOO<fkk"oND$!//))	
 	
r*   r  )rh   ri   rj   rM   r   r   r#   r  rl   r   r   r   ro   r   rg   rp   rq   s   @r(   r  r    s      .2.20426*.'
##d*'
 t+'
 &&-	'

 ((4/'
   4''
 +,'
 
&	&'
  '
r*   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z-Head for sentence-level classification tasks.c                 &   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        y r   )rL   rM   r   rw   r   r   r   r   r   r  out_projr   s     r(   rM   zEsmClassificationHead.__init__  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr*   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r/  )r   r   r#   tanhr  r  s       r(   rg   zEsmClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!r*   r  rq   s   @r(   r  r    s    7Ir*   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r    )r}   rn   r#   cumsumrZ   r   )r   r   maskincremental_indicess       r(   r   r     sP     <<$((*D,,t3;;DADH##%33r*   )rn  r  r  rP  r3  )Nr   )Grk   r4   collections.abcr   r#   r   torch.nnr   r   r    r	   r=  masking_utilsr
   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   configuration_esmr   
get_loggerrh   rq  r)   r0   r7   r:   rD   ModulerF   rs   r   rl   rP   r   r   r   r   r  r  r  r  r+  r3  rP  rn  r<  r  r  r  r   __all__rA  r*   r(   <module>r     s~     $   A A & J 9  G & R R ? ( 
		H	%(
.;#
	*
ehhoo *
Z Gryy  GF\=BII \=L !%II%<<% 
% <<	%
 LL4'% T\% % '(%:M)ryy M)`
BII 
299 8bii 
		 
4) 4nS SB		  # # #L S0! S0 S0l FP' FP FPR		 * E
#5 E
E
P 4
 2 4
 4
nBII &4 r*   