
    i              	          d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlZddlmZ d	d
lmZmZmZmZ d	dlmZ d	dlmZmZ d	dlmZm Z m!Z! d	dl"m#Z#  ee$      Z%d Z&d Z'd0dZ(d0dZ)d Z*d1dZ+d1dZ,d2dZ-d1dZ.d3de/de/de0fdZ1	 d4de/de/de0de0fdZ2dejf                  jh                  d ejj                  fd!Z6dejf                  jh                  d"e7fd#Z8d$ejr                  jt                  d%e7fd&Z;dejf                  jh                  fd'Z<dejf                  jh                  d(ejf                  jh                  fd)Z=dejf                  jh                  d(eejf                  jh                  ge0f   fd*Z>d+ Z?d,e7d(e7fd-Z@d.ee	ejf                  jh                     e/f   d(eAejf                  j                     fd/ZCy)5    N)defaultdict)Iterable)nullcontext)Path)CallableUnion   )
get_logger   )FSDP_MODEL_NAMEOPTIMIZER_NAMESAFE_WEIGHTS_NAMEWEIGHTS_NAME)get_module_class_from_name)get_non_persistent_buffersis_peft_model)get_module_children_bottom_upis_compiled_modulesave)is_torch_versionc                  t    dt         j                  vrdt         j                  d<   dt         j                  d<   y)z[
    Enables RAM efficient loading of Hugging Face models for FSDP in the environment.
    ACCELERATE_USE_FSDPTrueFSDP_CPU_RAM_EFFICIENT_LOADINGNosenviron     e/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/accelerate/utils/fsdp_utils.py!enable_fsdp_ram_efficient_loadingr!   '   s.    
 BJJ.,2

()39BJJ/0r   c                  *    dt         j                  d<   y)z\
    Disables RAM efficient loading of Hugging Face models for FSDP in the environment.
    Falser   Nr   r   r   r    "disable_fsdp_ram_efficient_loadingr$   1   s     4;BJJ/0r   c                     |r%t        |       rddlm}  || | j                        S |ddlm}  || |      S | j                         S )Nr   )get_peft_model_state_dictadapter_name)get_model_state_dictoptions)r   peftr&   active_adapter'torch.distributed.checkpoint.state_dictr)   
state_dict)modeladapter_only
sd_optionsr&   r)   s        r    _get_model_state_dictr3   8   sI    e,2(U=Q=QRR P#E:>>!!r   c                     |r&t        |       rddlm}  || || j                        S |ddlm}  || ||      S | j                  |      S )Nr   )set_peft_model_state_dictr'   )set_model_state_dictr*   )r   r,   r5   r-   r.   r6   load_state_dict)r0   r/   r1   r2   r5   r6   s         r    _set_model_state_dictr8   G   sO    e,2(
I]I]^^ P#E:zJJ$$Z00r   c           	          d }| j                   dk(  rXddlm} ddlm}  || j
                  |j                  k(  t        | j                  dd      t        | j                  dd            }|S )	Nr	   r   )StateDictOptionsStateDictTypeoffload_to_cpuF
rank0_only)full_state_dictcpu_offloadbroadcast_from_rank0)	fsdp_versionr.   r:   2torch.distributed.fsdp.fully_sharded_data_parallelr<   state_dict_typeFULL_STATE_DICTgetattrstate_dict_config)fsdp_pluginr2   r:   r<   s       r    _prepare_sd_optionsrI   V   sj    J 1$LT%'77=;X;XX = =?OQVW!()F)FV[!\

 r   c                    dd l mc m} ddlm} ddlm} ddlm}	 t        j                  |d       | j                  |	j                  k(  r1|j                  dkD  }
|
| j                  _        |
| j                  _        | j                   dk(  r2|j                  || j                  | j                  | j"                        n	t%               }t'        |       }|5  t)        |||      }| j                  |	j                  k(  r|dk(  r	t*         d	nt*         d
| d	}t        j,                  j/                  ||      }|j0                  dk(  rt2        j5                  d|        t7        j8                  ||       t2        j5                  d|        ni| j                  |	j:                  k(  r|dk(  rt*         d|j0                   d	nt*         d
| d|j0                   d	}t        j,                  j/                  ||      }t2        j5                  d|        t7        j8                  ||       t2        j5                  d|        n| j                  |	j<                  k(  rt        j,                  j/                  |t*         d
|       }t        j                  |d       t2        j5                  d|        d|i}|j9                  ||j?                  |       |              t2        j5                  d|        d d d        y # 1 sw Y   y xY w)Nr   DefaultSavePlannerFullyShardedDataParallelr;   Texist_okr   r1   r2   .bin_zSaving model to zModel saved to _rankr0   r/   storage_writerplanner) torch.distributed.checkpointdistributed
checkpoint,torch.distributed.checkpoint.default_plannerrL   rC   rN   r<   r   makedirsrD   rE   num_processesrG   r=   r>   rB   optim_state_dict_configr   rI   r3   r   pathjoinprocess_indexloggerinfotorchr   LOCAL_STATE_DICTSHARDED_STATE_DICTFileSystemWriter)rH   acceleratorr0   
output_dirmodel_indexr1   dist_cprL   FSDPr<   is_multi_processctxr2   r/   weights_nameoutput_model_fileckpt_dirs                    r    save_fsdp_modelrr   g   s   22OcPKK
T*""m&C&CC '44q87G%%43C%%0 ##q( 	;..0M0M{OrOr	
 ]  %[1J	 6*5|Xbc
&&-*G*GG7Ba7Go.d3P_O``abmannrMsL "Z F((A-./@.ABC

:'89o.?-@AB((M,J,JJ !# ##5)B)B(C4H'(+eK<U<U;VVZ[ 
 !#Z FKK*+<*=>?JJz#45KK/*;)<=>((M,L,LLww||J?2C1[M0RSHKK40KK*8*56!:.JLL%&77A*,  
 KK/(45?6 6 6s   H#LL
c                    dd l mc m} ddlm} ddlm} ddlm}	 |j                          | j                  |	j                  k(  r1|j                  dkD  }
|
| j                  _        |
| j                  _        | j                  dk(  r2|j                  || j                  | j                  | j                         n	t#               }t%        |       }|5  | j                  |	j                  k(  rt'        |      |urK|j(                  dk7  r<|j*                  s0| j,                  s| j                  dk(  rt/        d      	 d d d        y |dk(  r	t0         dnt0         d| d}t2        j4                  j7                  ||      }t8        j;                  d	|        |j*                   xs |j<                  }|rt?        j@                  |d
      }ni }t8        j;                  d|        no| j                  |	jB                  k(  r|dk(  rt0         d|j(                   dnt0         d| d|j(                   d}t2        j4                  j7                  ||      }t8        j;                  d	|        t?        j@                  |d
      }t8        j;                  d|        n| j                  |	jD                  k(  rt0         |vr)t2        j4                  j7                  |t0         d|       n|}t8        j;                  d	|        dtG        |||      i}|jA                  ||jI                  |       |              |d   }t8        j;                  d|        tK        |||      }d d d        |S # 1 sw Y   S xY w)Nr   )DefaultLoadPlannerrM   r;   r   zzSet the `sync_module_states` flag to `True` so that model states are synced across processes when initializing FSDP objectrR   rS   zLoading model from Tweights_onlyzModel loaded from rT   r0   rQ   )r/   storage_readerrW   )&rX   rY   rZ   r[   rt   rC   rN   r<   wait_for_everyonerD   rE   r]   rG   r=   r>   rB   r^   r   rI   typera   is_fsdp2sync_module_states
ValueErrorr   r   r_   r`   rb   rc   is_main_processrd   loadre   rf   r3   FileSystemReaderr8   )rH   rh   r0   	input_dirrj   r1   rk   rt   rl   r<   rm   rn   r2   ro   input_model_file
load_modelr/   rq   load_results                      r    load_fsdp_modelr      si   22OcP!!#""m&C&CC '44q87G%%43C%%0 ##q( 	;..0M0M{OrOr	
 ]  %[1J	 -q&&-*G*GGE{$&;+D+D+IR]RfRf"55+:R:RVW:W$3  -q -q 8Ca7Go.d3P_O``abmannrMsL!ww||I|DKK-.>-?@A(111P[5P5PJ"ZZ(8tL

KK,-=,>?@((M,J,JJ !# ##5)B)B(C4H'(+eK<U<U;VVZ[ 
  "ww||I|DKK-.>-?@A$44HJKK,-=,>?@((M,L,LL &&y8 Y?*;1[M(JK 
 KK-hZ89!#8\fp#qrJLL%&77A*,  
 $G,JKK,XJ78+E:Leop[-q\ ]-q\ s   A)M+ H!M++M5c                 0   dd l mc m} ddlm} ddlm} ddlm}	 t        j                  |d       | j                  dk(  r2|j                  || j                  | j                  | j                        n	t               }
t        |       }|
5  | j                  dk(  rdd	lm}  ||||
      }n|j%                  ||      }| j                  |	j&                  k(  r|j(                  dk(  r|dk(  r	t*         dnt*         d| d}t        j,                  j/                  ||      }t0        j3                  d|        t5        j6                  ||       t0        j3                  d|        nt        j,                  j/                  |t*         d|       }t        j                  |d       t0        j3                  d|        |j7                  d|i|j9                  |       |              t0        j3                  d|        d d d        y # 1 sw Y   y xY w)Nr   rK   rM   r;   TrO   r   r	   )get_optimizer_state_dictr*   rR   rS   zSaving Optimizer state to zOptimizer state saved in 	optimizerrU   )rX   rY   rZ   r[   rL   rC   rN   r<   r   r\   rB   rD   rG   r^   r   rI   r.   r   optim_state_dictrE   ra   r   r_   r`   rb   rc   rd   r   rg   )rH   rh   r   r0   ri   optimizer_indexrk   rL   rl   r<   rn   r2   r   optim_stateoptim_state_nameoutput_optimizer_filerq   s                    r    save_fsdp_optimizerr      s   22OcPKK
T* ##q( 	;..0M0M{OrOr	
 ]  %[1J	 @##q(X25)ZXK//yAK&&-*G*GG((A-/>!/C~&d+NK[[\]l\mmqIr ! )+ZAQ(R%89N8OPQ

;(=>78M7NOPww||J>2B!OCT0UVHKK40KK4XJ?@LL'5&77A*,  
 KK3H:>?5@ @ @s   E8HHc                 x   dd l mc m} ddlm} ddlm}	 |j                          | j                  dk(  r2|j                  || j                  | j                  | j                        n	t               }
t        |       }|
5  | j                  |	j                  k(  rd }|j                  dk(  s| j                  j                  s |dk(  r	t          dnt          d| d}t"        j$                  j'                  ||      }t(        j+                  d|        t-        j.                  |d	      }t(        j+                  d
|        nt          |vr)t"        j$                  j'                  |t          d|       n|}t(        j+                  d|        d|j1                         i}|j/                  |||j3                  |             |d   }t(        j+                  d|        | j                  dk(  r&|j5                  |||      }|j7                  |       nddlm}  |||||       d d d        y # 1 sw Y   y xY w)Nr   rM   r;   r   rR   rS   zLoading Optimizer state from Tru   zOptimizer state loaded from zLoading Optimizer from r   )checkpoint_idrw   zOptimizer loaded from )r0   optimr   )set_optimizer_state_dictr*   )rX   rY   rZ   rC   rN   r<   rx   rB   rD   rG   r^   r   rI   rE   ra   r>   r   r   r_   r`   rb   rc   rd   r~   r/   r   optim_state_dict_to_loadr7   r.   r   )rH   rh   r   r0   r   r   r1   rk   rl   r<   rn   r2   r   optimizer_nameinput_optimizer_filerq   flattened_osdr   s                     r    load_fsdp_optimizerr     s'   22cP!!#
 ##q( 	;..0M0M{OrOr	
 ]  %[1J	 !X&&-*G*GGK((A-[5X5X5c5c/>!/C~&d+NK[[\]l\mmqIr  (*ww||I~'N$;<P;QRS#jj)=DQ:;O:PQR %%i7 Y>*:!O;L(MN 
 KK1(<=&	(<(<(>?KLL&&77A  
 &k2KKK0
;<##q( 99Yit9uM%%m4X$UI{JWC!X !X !Xs   >F)H00H9checkpoint_dir	save_pathsafe_serializationc                 p   ddl mc m} ddlmc mc m} i }t        |      }|j                  d       |j                  ||j                  |       |j                         d       |r	|t        z  n|t        z  }t        |j                               dk(  r|t        |      d      }t        |||       |S )z
    Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`

    Will save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
    r   NTrO   )rw   rW   no_distr   )r   )rX   rY   rZ   )torch.distributed.checkpoint.format_utilsformat_utilsr   mkdir_load_state_dictr   _EmptyStateDictLoadPlannerr   r   lenkeyslistr   )r   r   r   rk   dist_cp_format_utilsr/   s         r    )_distributed_checkpoint_to_merged_weightsr   L  s     32LLJYIOOTO"))//?$??A	 *  2D	--UaIaI :??"Z 0 34
Y3EFr   output_pathremove_checkpoint_dirc                    t        |       } ddlm} t        dd      st	        d      | j                         s| dz  j                         }| dz  j                         }d|  d	}|r#|r!|d
z  }|d|  d|  dz  }|dz  }t	        |      |r|dz  }|d|  dz  }t	        |      |r|dz  }|d|  dz  }t	        |       |       }|j                  rlt        j                  d|         t        | ||      }	t        j                  d|	        |r-t        j                  d|         t        j                  |        |j                          y)a?  
    Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
    `safe_serialization` else `pytorch_model.bin`.

    Note: this is a CPU-bound process.

    Args:
        checkpoint_dir (`str`):
            The directory containing the FSDP checkpoints (can be either the model or optimizer).
        output_path (`str`):
            The path to save the merged checkpoint.
        safe_serialization (`bool`, *optional*, defaults to `True`):
            Whether to save the merged weights with safetensors (recommended).
        remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
            Whether to remove the checkpoint directory after merging.
    r   )PartialStatez>=z2.3.0z/`merge_fsdp_weights` requires PyTorch >= 2.3.0`pytorch_model_fsdp_0optimizer_0zTried to load from z) but couldn't find a valid metadata file.zE However, potential model and optimizer checkpoint directories exist.zPlease pass in either z/pytorch_model_fsdp_0 or z/optimizer_0zinstead.z8 However, a potential model checkpoint directory exists.zPlease try passing in z/pytorch_model_fsdp_0 instead.z< However, a potential optimizer checkpoint directory exists.z/optimizer_0 instead.zMerging FSDP weights from z.Successfully merged FSDP weights and saved to z"Removing old checkpoint directory N)r   accelerate.stater   r   r|   existsr}   rb   rc   r   shutilrmtreerx   )
r   r   r   r   r   model_path_existsoptimizer_path_existserrstater   s
             r    merge_fsdp_weightsr   h  s   ( .)N-D'*JKK   "+.DDLLN!/-!? G G I#N#33\]!6ZZC+N+;;TUcTddpqqC:C o MMC+N+;;YZZC o #QQC+N+;;PQQCo NE00@AB=nk[mn	DYKPQ KK<^<LMNMM.)	r   r0   devicec                 *   	 t        |dd       }|s S i 	|D ]W  }|j                  d      }dj                  |d d       |d   }}|j                  |      }t        ||      }d 	t	        |      <   Y dt
        j                  j                  f	 fd}|S )N_tied_weights_keys.modulec                 F   t        t              }| j                  d      D ]0  \  }}t        |      v s|t        |         j	                  |       2  |       } |j                         D ]0  \  }}|D ]&  }|   }|t        | |      |<   t        | ||       ( 2 | S )NF)recurse)r   r   named_parametersidappenditemsrF   setattr)	r   params_to_tienparamid_key_param_names
param_name_tied_paramsparam_init_fns	          r    param_init_fn_tied_paramz7ensure_weights_retied.<locals>.param_init_fn_tied_param  s     $D)///> 	3HAu%yL(bi(//2	3 v& %2$7$7$9 	7 FL* 7
$V,= ,36:+FL(FJ67	7 r   )rF   splitr`   get_submoduler   rd   nnModule)
r   r0   r   _tied_namesnamer   modr   r   r   s
   `        @r    ensure_weights_retiedr     s    %!5t<K L 'zz#88D"I.Rj!!$'Z("&RY' 2 $#r   full_sdc                    ddl m} ddlm}m} |j                         }i }d }d }	| j                  rt        |j                         |j                               D ]  \  \  }
}}|j                  }|j                         j                  |j                        }t        ||      r|j                         }|j!                  |d|j"                  j$                          ||||j&                        } |||
|      \  }} |	|||      }|||
<    n|j                         D ]  \  }
}|j                  }t)        j*                  |j-                         |j                  |j.                        }|j!                  |d|j"                  j$                          ||||j&                        } |||
|      \  }} |	|||      }|||
<    |j1                  |d	       |S )
a  
    Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the
    parameters from rank 0 to all other ranks. This function modifies the model in-place.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`):
            The model to load the state dict into, expected to be on meta device or a VRAM spike can occur
        full_sd (`dict`): The full state dict to load, can only be on rank 0
    r   N)DTensordistribute_tensorc                    	 | j                  |      }t        t        d      }d }|xr |j                  t        j                  k(  }	|j                  j                  r|	s|j                  }|d uxr |j                         |fS # t        $ r5 |j                  dd      \  }}| j                  |      }t	        ||      }Y w xY w)Nr   r   float8_e4m3fn)get_parameter_or_bufferAttributeErrorrsplitr   rF   hasattrrd   dtyper   is_floating_pointis_contiguous)
r0   r   empty_param	old_parambase_param_namelocal_param_name	submoduleis_torch_e4m3fn_availablecasting_dtypeis_param_float8_e4m3fns
             r    _infer_parameter_dtypez:fsdp2_load_full_state_dict.<locals>._infer_parameter_dtype  s    	=55jAI %,E?$C!!:!g{?P?PTYTgTg?g..7M%OOM$B)@)@)BMQQ  	=0:0A0A#q0I-O-++O<I	+;<I		=s   B ;C ?C c                 R    || j                  |      } |r| j                         } | S )N)r   )to
contiguous)tensorto_contiguousr   s      r    _cast_and_contiguousz8fsdp2_load_full_state_dict.<locals>._cast_and_contiguous  s.    YYUY+F&&(Fr   )srcgroup)r   r   T)assign)torch.distributedrY   torch.distributed.tensorr   r   r/   r}   zipr   valuesdevice_meshdetachr   device_type
isinstanceto_local	broadcastr   WORLD
placementsrd   emptysizer   r7   )rh   r0   r   distr   r   meta_sharded_sd
sharded_sdr   r   r   
full_paramsharded_paramr   sharded_tensorr   r   full_tensors                     r    fsdp2_load_full_state_dictr    s    %C &&(OJR$ ""7:7==?OLbLbLd7e 	43$Zm'33K#**,//0G0GHJ*g. (002
NN:1DJJ4D4DNE.z;H`H`aN+A,(M=
 2.-Q^_N%3Jz"!	4& *9)>)>)@ 	4%J'33K++m&8&8&:;CZCZbobubuvKNN;ATZZ5E5ENF.{KIaIabN+A,(M=
 2.-Q^_N%3Jz"	4 
*T2Lr   r   mappingc                     ddl m} i }d||<   	 | j                  D ]%  }|d   D cg c]  }||j                      c}|d<   ' yc c}w # t        $ r t	        d      w xY w)a  
    Switches the parameters of the optimizer to new ones (sharded parameters in usual case). This function modifies the
    optimizer in-place.

    Args:
        optimizer (`torch.optim.Optimizer`): Optimizer instance which contains the original model parameters
        mapping (`dict`): Mapping from the original parameter (specified by `data_ptr`) to the sharded parameter

    Raises:
        KeyError:
            If a parameter in the optimizer couldn't be switched to its sharded version. This should never happen and
            indicates a bug. If we kept the original params instead of raising, the training wouldn't be numerically
            correct and weights wouldn't get updated.
    r   )r   _local_tensorparamszA parameter in the optimizer couldn't be switched to its sharded version. This breaks the training. Please raise an issue on GitHub.N)r   r   param_groupsdata_ptrKeyError)r   r  r   accessor_mappingparam_groupps         r    !fsdp2_switch_optimizer_parametersr    s~     1 /W
$11 	YKBMhBW$XQWQZZ%8$XK!	Y$X 
  S
 	

s   A	 AA	 A	 	Ac                 d   ddl m} t        | j                  j                  |      }t        |d      dd D ]v  \  }}t        |j                  d            dkD  r|j                  dd      \  }}nd}|}|r|j                  |      n|} ||      s[ ||d	
      }|j                  ||       x |S )a8  
    Applies the activation checkpointing to the model.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`): The model to apply the activation checkpointing to

    Returns:
        `torch.nn.Module`: The model with the activation checkpointing applied
    r   )checkpoint_wrapperT)return_fqnsNr   r   r   F)preserve_rng_state);torch.distributed.algorithms._checkpoint.checkpoint_wrapperr  fsdp2_prepare_auto_wrap_policyr   rH   r   r   r   r   r   register_module)	rh   r0   r  auto_wrap_policy_func
layer_namelayerparent_name
child_nameparent_modules	            r    fsdp2_apply_acr   >  s     ;;;L;L;X;XZ_`:5dSTWUWX 
=
Ez$%)&0&7&7Q&?#KK#J<G++K8U /&uGE))*e<
= Lr   returnc           	         ddl m}m}m} t	        ||      xs# t        |      xr t	        |j                  |      }|r|S | j                  j                  }|j                  |       |j                         }t        | dd      }|j                  |j                  |j                  xs  |       |"|t        | j                   j"                           ndt%        |j&                  || j(                        d}	d}
|j+                         D ]"  \  }}|j,                  j.                  dk(  s d}
 n |j0                  r|
st3        |dd	      }t5        j6                  |j9                         D ci c]  \  }}||v s|| c}}      }|j;                  t=        j(                  d
            }t?        |d      r|jA                          tC        ||      }|2tE        |      dd D ]!  } ||      st	        ||      r ||fi |	 # t	        ||      s	 ||fi |	 |j0                  rtG        | ||       |j0                  r|
sjI                         D ]c  \  }}|j;                  | j(                        }d|v r'|jK                  dd      \  }}|jM                  |      }n|}|}|jO                  ||d       e t?        |d      r|jA                          t        |dd      }| jP                  dk7  rU||t<        jR                  k7  r@|j;                  t<        jR                        }| jT                  rtW        jX                  d       |S c c}}w )a"  Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`): The model to prepare

    Returns:
        `torch.nn.Module`: Prepared model
    r   )
FSDPModuleMixedPrecisionPolicyfully_shardtorch_device_meshN)reshard_after_forwardoffload_policy	mp_policymeshignored_paramsF
Params4bitT)r   fqnsmetatie_weightsr   r   r   )
persistentr   noz~FSDP upcast of low precision parameters to fp32 (since mixed_precision != 'no') may affect the precision of model checkpoints.)-torch.distributed.fsdpr#  r$  r%  r   r   	_orig_modr   rH   set_auto_wrap_policyr/   rF   r'  r@   mixed_precision_policytupleparallelism_configfsdp_dim_namesget_parameters_from_modulesignored_modulesr   r   	__class____name__cpu_ram_efficient_loadingr   copydeepcopynamed_buffersr   rd   r   r/  r  r   r  r   r   r   register_buffermixed_precisionfloat32r}   warningswarn)rh   r0   r#  r$  r%  is_type_fsdpfsdp2_pluginoriginal_sdr*  fsdp2_kwargsmodel_has_params4bitr   r   non_persistent_buffer_fqnskvoriginal_non_persistent_buffersr  r   fqnbuffer_tensor
parent_fqnlocal_buffer_namer  model_dtypes                            r    fsdp2_prepare_modelrT  _  s@    UTeZ0 5!Mj*&M  $$00L%%e,""$K; 3T:D ".!C!C&22!88R<P<RNRN^U;99HHIJdh5l6R6RTY[f[m[mnL !--/ e ??##|3#'  --6J &@tZ^%_"*.--#113Wdaq<V7VQTW+
' f-. 5-(:<O(3E:3B? 	4F$V,Z
5SF3l3	4 eZ(E*\*-- 	#;{C--6J"A"G"G"I 
	^C),,[-?-?@Mcz03

30B-
- % 3 3J ?$'! %))*;]W\)]
	^  5-( %$/K""d*0C{V[VcVcGc '&&MM Q Ls Xs   +M
8M
c                    
 ddl m}m}  j                  }t	        |t
        j                        r|j                  }||u rt        |dd      }|g }t        |      } j                   j                  }t               
|D ]0  }t        ||      }|t        d| d      
j                  |       2 dt        j                   j"                  dt$        f 
fd	}	|	S ||u r(dt        j                   j"                  dt$        f fd
}	|	S y)a!  Prepares the auto wrap policy based on its type, done to mimic the behaviour of FSDP1 auto wrap policy.

    Args:
        fsdp2_plugin (`FullyShardedDataParallelPlugin`):
            Instance of `FullyShardedDataParallelPlugin` containing the configuration options
        auto_wrap_policy_type (`str`):
            Either `transformer` or `size`
        model (`torch.nn.Module`):
            The model to wrap

    Returns:
        `Callable[[torch.nn.Module], bool]`:
            The auto wrap policy function to be applied to the model
    r   )size_based_auto_wrap_policytransformer_auto_wrap_policy_no_split_modulesNz+Could not find the transformer layer class z in the model.r   r!  c                 H    j                   yt        | t                    S )NF)transformer_cls_names_to_wrapr   r6  )r   rG  transformer_cls_to_wraps    r    policyz.fsdp2_prepare_auto_wrap_policy.<locals>.policy  s%    99Afe,C&DEEr   c                 b    t        d | j                         D              }|j                  kD  S )Nc              3   <   K   | ]  }|j                           y w)N)numel).0r  s     r    	<genexpr>zAfsdp2_prepare_auto_wrap_policy.<locals>.policy.<locals>.<genexpr>  s     #K!AGGI#Ks   )sum
parametersmin_num_params)r   module_num_paramsrG  s     r    r\  z.fsdp2_prepare_auto_wrap_policy.<locals>.policy  s.     ##Kv7H7H7J#K K$|'B'BBBr   )torch.distributed.fsdp.wraprV  rW  auto_wrap_policyr   	functoolspartialfuncrF   r   rZ  setr   r|   addrd   r   r   bool)rG  r0   rV  rW  fnno_split_modulesrZ  layer_classtransformer_clsr\  r[  s   `         @r    r  r    s    f		&	&B"i''(WW	))"5*=tD#!(,-=(>%55A,8,V,V)"%%8 	9K8LO& #N{m[i!jkk#''8		9	F588?? 	Ft 	F M 
*	*	C588?? 	Ct 	C M r   c                      ddl m}  |di | S )a  
    Returns a `GradScaler` for FSDP2, as the current implementation of `get_grad_scaler` doesn't accept other args. We
    need this as current `get_grad_scaler` accepts only `distributed_type` as arg, which doesn't differentiate between
    FSDP1 and FSDP2
    r   )
GradScalerr   )torch.amp.grad_scalerrs  )kwargsrs  s     r    get_fsdp2_grad_scalerrv    s     1r   named_paramsc                    | j                         D ci c]  \  }}|j                  dd      | } }}| j                         D ci c]+  \  }}|j                  d      r|j                  dd      n||- } }}| j                         D ci c]  \  }}|j                  dd      | } }}| S c c}}w c c}}w c c}}w )a6  Removes parameter name modifiers in order to map them back to their original names.

    See huggingface/accelerate#3554 for more context.

    Args:
        named_params (`dict`): The named parameters dictionary to canonicalize.

    Returns:
        `dict`: The canonicalized named parameters dictionary
    z._checkpoint_wrapped_module z
_orig_mod.z
._orig_mod)r   replace
startswith)rw  rL  rM  s      r    fsdp2_canonicalize_namesr|    s     Q]PbPbPde1AII;R@!CeLeXdXjXjXlPTPQSTq||L'A		,#q!KL  @L?Q?Q?STtq!AIIlB/2TLT f Us   B/0B5B;modulesc                 |   | 
t               S g }t        | t              ret        j                  |       }g }|j                         D ]9  \  }}|j                  |      s|j                  |       |j                  |       ; |} | D ]*  }|j                  t        |j                                      , t        |      S )zConverts modules to parameters where modules can be a string or list of torch.nn.Module

    Args:
        modules (`Union[Iterable[torch.nn.Module], str]`): List of modules

    Returns:
        `set[torch.nn.Parameter]`: List of parameters
    )rk  r   strrecompilenamed_modules	fullmatchr   r   extendr   rc  )r}  r0   r   rc  regmapped_modulesr   r   s           r    r9  r9  $  s     uJ'3jj!!//1 	.LD&}}T"		&!%%f-	. ! 5$v002345z?r   )FN)r   F)r   )T)TF)Dr>  rh  r   r  r   rD  collectionsr   collections.abcr   
contextlibr   pathlibr   typingr   r   rd   loggingr
   	constantsr   r   r   r   dataclassesr   modelingr   r   otherr   r   r   versionsr   r<  rb   r!   r$   r3   r8   rI   rr   r   r   r   r  rm  r   r   r   r   r   r   dictr  r   	Optimizerr  r   rT  r  rv  r|  rk  	Parameterr9  r   r   r    <module>r     s     	 	   # $ "  "    W W 3 ? J J & 
H	:;"1"76tEP-@`0Xfc c gk : kp44&)4?C4cg4n+$ +$ +$\O588?? OT Od
1F1F 
QU 
>uxx BoEHHOO o od2 2HV[V^V^VeVeUfhlUlLm 2j 4 D &8EHHOO,c12		r   