
    ei`                       d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
Zd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ erd dlmZ  ej4                  e      Ze	d   ZddddddddddddddZ G d de      Zy)    )annotationsN)Callable)TYPE_CHECKINGAnyLiteral)Tensor)tqdm)SentenceTransformer)InformationRetrievalEvaluator)SentenceEvaluator)SimilarityFunction)is_datasets_available)climatefeverdbpediafeverfiqa2018hotpotqamsmarconfcorpusnqquoraretrievalscidocsarguanascifact
touche2020ClimateFEVERDBPediaFEVERFiQA2018HotpotQAMSMARCONFCorpusNQQuoraRetrievalSCIDOCSArguAnaSciFact
Touche2020c                      e Zd ZdZeZdddgdgg dg ddgddd	dddej                  d
dddf	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z		 	 	 d	 	 	 	 	 	 	 	 	 ddZ
ddZ	 	 	 	 ddZddZd Zd Z fdZddZ xZS )NanoBEIREvaluatora   
    This class evaluates the performance of a SentenceTransformer Model on the NanoBEIR collection of Information Retrieval datasets.

    The NanoBEIR collection consists of downsized versions of several BEIR information-retrieval datasets, making it
    suitable for quickly benchmarking a model's retrieval performance before running a full-scale BEIR evaluation.
    The datasets are available on Hugging Face in the Sentence Transformers `NanoBEIR collection <https://huggingface.co/collections/sentence-transformers/nanobeir-datasets>`_,
    which reformats the `original collection <https://huggingface.co/collections/zeta-alpha-ai/nanobeir>`_ from Zeta Alpha
    into the default `NanoBEIR-en <https://huggingface.co/datasets/sentence-transformers/NanoBEIR-en>`_ dataset,
    alongside many translated versions.
    This evaluator reports the same metrics as the :class:`~sentence_transformers.evaluation.InformationRetrievalEvaluator`
    (e.g., MRR, nDCG, Recall@k) for each dataset individually, as well as aggregated across all datasets.

    Args:
        dataset_names (List[str]): The short names of the datasets to evaluate on (e.g., "climatefever", "msmarco").
            If not specified, all predefined NanoBEIR datasets are used. The full list of available datasets is:
            "climatefever", "dbpedia", "fever", "fiqa2018", "hotpotqa", "msmarco", "nfcorpus", "nq", "quoraretrieval",
            "scidocs", "arguana", "scifact", and "touche2020".
        dataset_id (str): The HuggingFace dataset ID to load the datasets from. Defaults to
            "sentence-transformers/NanoBEIR-en". The dataset must contain "corpus", "queries", and "qrels"
            subsets for each NanoBEIR dataset, stored under splits named ``Nano{DatasetName}`` (for example,
            ``NanoMSMARCO`` or ``NanoNFCorpus``).
        mrr_at_k (List[int]): A list of integers representing the values of k for MRR calculation. Defaults to [10].
        ndcg_at_k (List[int]): A list of integers representing the values of k for NDCG calculation. Defaults to [10].
        accuracy_at_k (List[int]): A list of integers representing the values of k for accuracy calculation. Defaults to [1, 3, 5, 10].
        precision_recall_at_k (List[int]): A list of integers representing the values of k for precision and recall calculation. Defaults to [1, 3, 5, 10].
        map_at_k (List[int]): A list of integers representing the values of k for MAP calculation. Defaults to [100].
        show_progress_bar (bool): Whether to show a progress bar during evaluation. Defaults to False.
        batch_size (int): The batch size for evaluation. Defaults to 32.
        write_csv (bool): Whether to write the evaluation results to a CSV file. Defaults to True.
        truncate_dim (int, optional): The dimension to truncate the embeddings to. Defaults to None.
        score_functions (Dict[str, Callable[[Tensor, Tensor], Tensor]]): A dictionary mapping score function names to score functions. Defaults to {SimilarityFunction.COSINE.value: cos_sim, SimilarityFunction.DOT_PRODUCT.value: dot_score}.
        main_score_function (Union[str, SimilarityFunction], optional): The main score function to use for evaluation. Defaults to None.
        aggregate_fn (Callable[[list[float]], float]): The function to aggregate the scores. Defaults to np.mean.
        aggregate_key (str): The key to use for the aggregated score. Defaults to "mean".
        query_prompts (str | dict[str, str], optional): The prompts to add to the queries. If a string, will add the same prompt to all queries. If a dict, expects that all datasets in dataset_names are keys.
        corpus_prompts (str | dict[str, str], optional): The prompts to add to the corpus. If a string, will add the same prompt to all corpus. If a dict, expects that all datasets in dataset_names are keys.
        write_predictions (bool): Whether to write the predictions to a JSONL file. Defaults to False.
            This can be useful for downstream evaluation as it can be used as input to the :class:`~sentence_transformers.sparse_encoder.evaluation.ReciprocalRankFusionEvaluator` that accept precomputed predictions.

    .. tip::

        See this `NanoBEIR datasets collection on Hugging Face <https://huggingface.co/collections/sentence-transformers/nanobeir-datasets>`_
        with valid NanoBEIR ``dataset_id`` options for different languages.

    Example:
        ::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.evaluation import NanoBEIREvaluator

            model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

            datasets = ["QuoraRetrieval", "MSMARCO"]
            query_prompts = {
                "QuoraRetrieval": "Instruct: Given a question, retrieve questions that are semantically equivalent to the given question\nQuery: ",
                "MSMARCO": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
            }

            evaluator = NanoBEIREvaluator(
                dataset_names=datasets,
                query_prompts=query_prompts,
            )

            results = evaluator(model)
            '''
            NanoBEIR Evaluation of the model on ['QuoraRetrieval', 'MSMARCO'] dataset:
            Evaluating NanoQuoraRetrieval
            Information Retrieval Evaluation of the model on the NanoQuoraRetrieval dataset:
            Queries: 50
            Corpus: 5046

            Score-Function: cosine
            Accuracy@1: 92.00%
            Accuracy@3: 98.00%
            Accuracy@5: 100.00%
            Accuracy@10: 100.00%
            Precision@1: 92.00%
            Precision@3: 40.67%
            Precision@5: 26.00%
            Precision@10: 14.00%
            Recall@1: 81.73%
            Recall@3: 94.20%
            Recall@5: 97.93%
            Recall@10: 100.00%
            MRR@10: 0.9540
            NDCG@10: 0.9597
            MAP@100: 0.9395

            Evaluating NanoMSMARCO
            Information Retrieval Evaluation of the model on the NanoMSMARCO dataset:
            Queries: 50
            Corpus: 5043

            Score-Function: cosine
            Accuracy@1: 40.00%
            Accuracy@3: 74.00%
            Accuracy@5: 78.00%
            Accuracy@10: 88.00%
            Precision@1: 40.00%
            Precision@3: 24.67%
            Precision@5: 15.60%
            Precision@10: 8.80%
            Recall@1: 40.00%
            Recall@3: 74.00%
            Recall@5: 78.00%
            Recall@10: 88.00%
            MRR@10: 0.5849
            NDCG@10: 0.6572
            MAP@100: 0.5892
            Average Queries: 50.0
            Average Corpus: 5044.5

            Aggregated for Score Function: cosine
            Accuracy@1: 66.00%
            Accuracy@3: 86.00%
            Accuracy@5: 89.00%
            Accuracy@10: 94.00%
            Precision@1: 66.00%
            Recall@1: 60.87%
            Precision@3: 32.67%
            Recall@3: 84.10%
            Precision@5: 20.80%
            Recall@5: 87.97%
            Precision@10: 11.40%
            Recall@10: 94.00%
            MRR@10: 0.7694
            NDCG@10: 0.8085
            '''
            print(evaluator.primary_metric)
            # => "NanoBEIR_mean_cosine_ndcg@10"
            print(results[evaluator.primary_metric])
            # => 0.8084508771660436

        Evaluating on custom/translated datasets::

            import logging
            from pprint import pprint

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.evaluation import NanoBEIREvaluator

            logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

            model = SentenceTransformer("google/embeddinggemma-300m")
            evaluator = NanoBEIREvaluator(
                ["msmarco", "nq"],
                dataset_id="lightonai/NanoBEIR-de",
                batch_size=32,
            )
            results = evaluator(model)
            print(results[evaluator.primary_metric])
            pprint({key: value for key, value in results.items() if "ndcg@10" in key})
    Nz!sentence-transformers/NanoBEIR-en
   )         r+   d   F    Tmeanc                f   t         |           |t        t        j	                               }|| _        || _        || _        || _        |
| _	        || _
        || _        || _        |
| _	        || _        |r,t        t        | j                  j	                                     ng | _        || _        || _        d| | _        | j"                  r"| xj$                  d| j"                   z  c_        || _        || _        || _        || _        || _        | j1                          | j3                          |||||||	|
||||d}t5        | j
                  dd      D cg c]  } | j6                  |fi | c}| _        d| d| _        d	d
g| _        | j?                  | j                         y c c}w )N	NanoBEIR__)mrr_at_k	ndcg_at_kaccuracy_at_kprecision_recall_at_kmap_at_kshow_progress_bar
batch_size	write_csvtruncate_dimscore_functionsmain_score_functionwrite_predictionszLoading NanoBEIR datasetsF)descleaveNanoBEIR_evaluation_z_results.csvepochsteps) super__init__listDATASET_NAME_TO_HUMAN_READABLEkeysdataset_names
dataset_idaggregate_fnaggregate_keyr<   query_promptscorpus_promptsr:   r>   sortedscore_function_namesr?   r=   namer5   r6   r7   r8   r9   _validate_dataset_names_validate_promptsr	   _load_dataset
evaluatorscsv_filecsv_headers_append_csv_headers)selfrK   rL   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   rM   rN   rO   rP   r@   ir_evaluator_kwargsrS   	__class__s                        |/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/sentence_transformers/evaluation/NanoBEIREvaluator.pyrG   zNanoBEIREvaluator.__init__   s   * 	  !?!D!D!FGM*$(*"*,!2".Q`F40D0D0I0I0K+L$Mfh!#6 (/	II1T../00I "*%:" $$&  !"*%: !2$"(.#6!2
  T//6QY^_
 Dt;':;

  4M?,O#W-  !:!:;
s   F.c                @   |D ]  }| j                   D ]"  }| j                  j                  | d|        $ | j                  D ]B  }| j                  j                  | d|        | j                  j                  | d|        D | j                  D ]"  }| j                  j                  | d|        $ | j
                  D ]"  }| j                  j                  | d|        $ | j                  D ]"  }| j                  j                  | d|        $  y )Nz
-Accuracy@z-Precision@z-Recall@z-MRR@z-NDCG@z-MAP@)r7   rY   appendr8   r5   r6   r9   )r[   rR   
score_nameks       r^   rZ   z%NanoBEIREvaluator._append_csv_headers  sB   . 	AJ'' F  '':,j(DEF // D  '':,k!(EF  '':,hqc(BCD ]] A  '':,eA3(?@A ^^ B  '':,fQC(@AB ]] A  '':,eA3(?@A	A    c                    i }i }|dk7  r|dk(  rd| }	nd| d| d}	nd}	| j                   |	d| j                    dz  }	t        j                  d	| j                   d
|	 d       | j                  J|j
                  |j                  i| _        |j
                  g| _        | j                  | j                         | j                  j                  d      }
t        | j                  d| j                         D ]  }t        j                  d|j                           |||||      }|j                         D ]?  \  }}|j                  d|
      }|d   }||vrg ||<   |||<   ||   j!                  |       A  i }|D ]  }| j#                  ||         ||<    || j$                  rt'        j(                  |d       t&        j*                  j-                  || j.                        }t&        j*                  j1                  |      sJt3        |dd      }|j5                  dj-                  | j6                               |j5                  d       nt3        |dd      }||g}| j                  D ]  }| j8                  D ]  }|j!                  || d|            | j:                  D ]4  }|j!                  || d|           |j!                  || d|           6 | j<                  D ]  }|j!                  || d|            | j>                  D ]  }|j!                  || d|            | j@                  D ]  }|j!                  || d|             |j5                  dj-                  tC        tD        |                   |j5                  d       |jG                          | jH                  s| jJ                  ftM        | j                  D cg c]!  }||| dtM        | j>                            f# c}d        d!   }| dtM        | j>                         | _$        n3| jJ                  jN                   dtM        | j>                         | _$        tQ        jR                  | j                  D cg c]  }tU        |jV                         c}      }tQ        jR                  | j                  D cg c]  }tU        |jX                         c}      }t        j                  d"|        t        j                  d#| d       | j                  D ]  }t        j                  d$|        | j8                  D ]2  }t        j                  d%j[                  ||| d|    d&z               4 | j:                  D ]b  }t        j                  d'j[                  ||| d|    d&z               t        j                  d(j[                  ||| d|    d&z               d | j<                  D ]/  }t        j                  d)j[                  ||| d|                 1 | j>                  D ]/  }t        j                  d*j[                  ||| d|                 1 | j@                  D ]/  }t        j                  d+j[                  ||| d|                 1  | j]                  || j                        }| j_                  ||||       |ja                  |       |S c c}w c c}w c c}w ),Nz after epoch z
 in epoch z after z steps z (truncated to )z$NanoBEIR Evaluation of the model on z dataset:r4   zEvaluating datasets)rA   disablezEvaluating )maxsplitT)exist_okwzutf-8)modeencoding,
az
_accuracy@z_precision@z_recall@z_mrr@z_ndcg@z_map@c                    | d   S Nr,    )xs    r^   <lambda>z,NanoBEIREvaluator.__call__.<locals>.<lambda>  s
    !A$ rc   )keyr   zAverage Queries: zAverage Corpus: zAggregated for Score Function: zAccuracy@{}: {:.2f}%r/   zPrecision@{}: {:.2f}%zRecall@{}: {:.2f}%zMRR@{}: {:.4f}zNDCG@{}: {:.4f}zMAP@{}: {:.4f})1r=   loggerinforK   r>   similarity_fn_name
similarityrR   rZ   rS   countr	   rW   r:   itemssplitr`   rM   r<   osmakedirspathjoinrX   isfileopenwriterY   r7   r8   r5   r6   r9   mapstrcloseprimary_metricr?   maxvaluenpr1   lenqueriescorpusformatprefix_name_to_metrics store_metrics_in_model_card_dataupdate)r[   modeloutput_pathrD   rE   argskwargsper_metric_resultsper_dataset_resultsout_txtnum_underscores_in_name	evaluator
evaluationfull_keymetric_valuesplitsmetricagg_resultscsv_pathfOutoutput_datarS   rb   score_functionavg_queries
avg_corpuss                             r^   __call__zNanoBEIREvaluator.__call__1  s      B;{)%1&ugWUG6BG():):(;1==G:4;M;M:NhW^V__`ab'$)$<$<e>N>N#OD ).)A)A(BD%$$T%>%>?"&))//#"6doo4IW[WmWmSmn 		@IKK+inn%567"5+ueDJ*4*:*:*< @&,!6MN!3313&v.0<#H-"6*11,?@		@ ( 	PF"&"3"34Fv4N"OK	P "t~~KKd3ww||K?H77>>(+H3A

388D$4$456

4  H3A %.K11 G++ LA&&{dV:aS3I'JKL 33 JA&&{dV;qc3J'KL&&{dV8A33G'HIJ  GA&&{dV53D'EFG  HA&&{dV6!3E'FGH  GA&&{dV53D'EFGG" JJsxxC 567JJtJJL""''/!$[_[t[tuSWdK4&s4>>7J6K(LMNu&" " *8(8s4>>?R>S&T#)-)A)A)G)G(HsSWSaSaObNc&d#ggtW)s9#4#45WXWWT__U	c)"2"23UV
'}56&zl"56-- 	XDKK9$@A'' i299![D6Q[\][^I_=`cf=fghi // e3::1kTFR]^_]`Ja>beh>hij077;$xXYWZG[;\_b;bcde ]] X,33A{dV5QRPSCT7UVWX ^^ Z-44QtfFSTRUDV8WXYZ ]] X,33A{dV5QRPSCT7UVWX	X$ 11+tyyI--e[%O"";/""G v XUs    &[1[6[;c                v    dt         |j                             }| j                  |d| j                   z  }|S )NNanor4   )rI   lowerr=   )r[   dataset_namehuman_readable_names      r^   _get_human_readable_namez*NanoBEIREvaluator._get_human_readable_name  sJ     $%CLDVDVDX%Y$Z[(Qt'8'8&9#::""rc   c                   |j                         t        vrt        d| d      t        |j                            }d| }| j                  d|ddg      }| j                  d|ddg      }| j                  d	|d
dg      }|D ci c]  }t	        |d         dkD  s|d   |d    }	}|D ci c]  }t	        |d         dkD  s|d   |d    }
}i }|D ]i  }|j                  d      }|d
   |vrt               ||d
   <   t        |t              r||d
      j                  |       S||d
      j                  |       k | j                  | j                  j                  |d       |d<   | j                  | j                  j                  |d       |d<   | j                  |      } | j                  d|
|	||d|S c c}w c c}w )Nz	Dataset 'z"' is not a valid NanoBEIR dataset.r   r   _idtext)r~   required_columnsr   qrelszquery-idz	corpus-idr   query_promptcorpus_prompt)r   r   relevant_docsrS   rt   )r   rI   
ValueError_load_dataset_subset_splitr   getset
isinstancerH   r   addrO   rP   r   information_retrieval_class)r[   r   r\   human_readable
split_namer   r   r   samplecorpus_dictqueries_dict
qrels_dict
corpus_idsr   s                 r^   rV   zNanoBEIREvaluator._load_dataset  s)    'EEy6XYZZ78J8J8LMN+,
00_dfl^m0n11):afhn`o1p//z]git\u/vCIeSQWX^Q_M`cdMdve}fVn4eeDKg&sSYZ`SaObefOfuvf~5gg
 	?FK0Jj!314
6*-.*d+6*-.55jA6*-.22:>	? )262D2D2H2HW[2\/*373F3F3J3J<Y]3^0";;LI/t// 
 $$	

 "
 	
' fgs   GG0GGc                f   t               st        d      ddlm} 	  || j                  ||      }t        |      t        |j                        z
  x}r+t        d	| d| d| j                   d
t        |       d	      |S # t
        $ r%}t        d| d| d| j                   d      |d }~ww xY w)Nzedatasets is not available. Please install it to use the NanoBEIREvaluator via `pip install datasets`.r   )load_dataset)r~   zCould not load subset 'z	' split 'z' from dataset 'z'.zSubset 'z' is missing required columns: .)	r   r   datasetsr   rL   	Exceptionr   column_namesrH   )r[   subsetr~   r   r   datasetemissing_columnss           r^   r   z,NanoBEIREvaluator._load_dataset_subset_split  s    $&w  	*	"4??F%HG ""23c':N:N6OOO?O6()E72B4??BSSrsw  yH  tI  sJ  JK  L    	)&5'AQRVRaRaQbbde	s   B 	B0 B++B0c           	        t        | j                        dk(  rt        d      | j                  D cg c]  }|j                         t        vr| }}|r,t        d| dt        t        j                                      y c c}w )Nr   zDdataset_names cannot be empty. Use None to evaluate on all datasets.zDataset(s) z; are not valid NanoBEIR datasets. Valid dataset names are: )r   rK   r   r   rI   rH   rJ   )r[   r   missing_datasetss      r^   rT   z)NanoBEIREvaluator._validate_dataset_names  s    t!!"a'cdd !% 2 2
!!#+II 
 

 ./ 0,,01O1T1T1V,W+XZ  
s   Bc                z   d}| j                   yt        | j                   t              r+| j                  D ci c]  }|| j                    c}| _         n4| j                  D cg c]  }|| j                   vs| c}x}r	|d| dz  }| j                  yt        | j                  t              r+| j                  D ci c]  }|| j                   c}| _        n4| j                  D cg c]  }|| j                  vs| c}x}r	|d| dz  }|rt        |j                               y c c}w c c}w c c}w c c}w )Nrf   z2The following datasets are missing query prompts: rp   z3The following datasets are missing corpus prompts: )rO   r   r   rK   rP   r   strip)r[   	error_msgr   missing_query_promptsmissing_corpus_promptss        r^   rU   z#NanoBEIREvaluator._validate_prompts  sQ   	)$,,c2[_[m[m%n<lD4F4F&F%n"151C1C+!-|[_[m[mGm+ &  QRgQhhjkk	*$--s3]a]o]o&p\|T5H5H'H&p#151C1C,!-|[_[n[nGn, '  RSiRjjlmm	Y__.//  &o+ 'q,s#   D)"D.6D.<D3'D8;D8c                V    t        | j                        dkD  rt        |   |i | y y rs   )r   rK   rF   r   )r[   r   r   r]   s      r^   r   z2NanoBEIREvaluator.store_metrics_in_model_card_data
  s/     t!!"Q&G4dEfE 'rc   c                    | j                   | j                  d}g d}|D ]  }t        | |      t        | |      ||<     |S )N)rK   rL   )r=   rO   rP   )rK   rL   getattr)r[   config_dictconfig_dict_candidate_keysrw   s       r^   get_config_dictz!NanoBEIREvaluator.get_config_dict  sQ    (,(:(:$//Z%X"- 	6CtS!-#*4#5C 	6 rc   )$rK   z"list[DatasetNameType | str] | NonerL   r   r5   	list[int]r6   r   r7   r   r8   r   r9   r   r:   boolr;   intr<   r   r=   z
int | Noner>   z4dict[str, Callable[[Tensor, Tensor], Tensor]] | Noner?   zstr | SimilarityFunction | NonerM   zCallable[[list[float]], float]rN   r   rO   str | dict[str, str] | NonerP   r   r@   r   )Nre   re   )
r   r
   r   z
str | NonerD   r   rE   r   returnzdict[str, float])r   DatasetNameType | strr   r   )r   r   r   r   )r   r   r~   r   r   z	list[str])r   zdict[str, Any])__name__
__module____qualname____doc__r   r   r   r1   rG   rZ   r   r   rV   r   rT   rU   r   r   __classcell__)r]   s   @r^   r*   r*   8   s   Xt #@ =A=!d "t#0+8"e"'#'PT?C79ww#596:"''H<9H< H< 	H<
 H< !H<  )H< H<  H< H< H< !H< NH< =H< 5H<  !H<" 3#H<$ 4%H<&  'H<TA* #'r#"r#  r# 	r#
 r# 
r#h#%
1%
	&%
N(0*Frc   r*   ) 
__future__r   loggingr   collections.abcr   typingr   r   r   numpyr   torchr   r	   sentence_transformersr
   >sentence_transformers.evaluation.InformationRetrievalEvaluatorr   2sentence_transformers.evaluation.SentenceEvaluatorr   *sentence_transformers.similarity_functionsr   sentence_transformers.utilr   )sentence_transformers.SentenceTransformer	getLoggerr   rx   DatasetNameTyperI   r*   rt   rc   r^   <module>r      s    "  	 $ . .    5 h P I <M			8	$" #
&" "_) _rc   