From 48b14218104d4ef53ebcdca1ceb311d0f6f7e3be Mon Sep 17 00:00:00 2001
From: dafnapension
Date: Tue, 21 Oct 2025 22:27:23 +0300
Subject: [PATCH 1/2] modified and new

Signed-off-by: dafnapension
---
 prepare/metrics/llm_as_judge/rag_judge.py | 2 +-
 ..._instruct_wml_answer_correctness_q_a_gt_loose.json | 11 +++++++++++
 ..._wml_answer_correctness_q_a_gt_loose_logprobs.json | 11 +++++++++++
 ...ama_3_3_70b_instruct_wml_answer_relevance_q_a.json | 11 +++++++++++
 ...0b_instruct_wml_answer_relevance_q_a_logprobs.json | 11 +++++++++++
 ...3_70b_instruct_wml_context_relevance_q_c_ares.json | 11 +++++++++++
 ...truct_wml_context_relevance_q_c_ares_logprobs.json | 11 +++++++++++
 ...3_70b_instruct_wml_correctness_holistic_q_c_a.json | 11 +++++++++++
 ...truct_wml_correctness_holistic_q_c_a_logprobs.json | 11 +++++++++++
 .../llama_3_3_70b_instruct_wml_faithfulness_c_a.json | 11 +++++++++++
 ..._3_70b_instruct_wml_faithfulness_c_a_logprobs.json | 11 +++++++++++
 ...llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json | 11 +++++++++++
 ..._70b_instruct_wml_faithfulness_q_c_a_logprobs.json | 11 +++++++++++
 .../llama_3_3_70b_instruct_wml_q_a_gt_loose.json | 10 ++++++++++
 ...ma_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json | 10 ++++++++++
 ...ama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_a.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_a_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_a_numeric.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_ares.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_ares_numeric.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a_numeric.json | 10 ++++++++++
 .../faithfulness/llama_3_3_70b_instruct_wml_c_a.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_c_a_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_c_a_verbal.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a_verbal.json | 10 ++++++++++
 31 files changed, 313 insertions(+), 1 deletion(-)
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json
 create mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json
 create mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json
 create mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json

diff --git a/prepare/metrics/llm_as_judge/rag_judge.py b/prepare/metrics/llm_as_judge/rag_judge.py
index 65ce814ba3..b46c138db2 100644
--- a/prepare/metrics/llm_as_judge/rag_judge.py
+++ b/prepare/metrics/llm_as_judge/rag_judge.py
@@ -24,7 +24,7 @@
 generic_engine_label = "generic_inference_engine"
 
 inference_models = {
-    "llama_3_1_70b_instruct_wml": "engines.classification.llama_3_1_70b_instruct_wml",
+    "llama_3_3_70b_instruct_wml": "engines.classification.llama_3_3_70b_instruct_wml",
     generic_engine_label: GenericInferenceEngine(),
 }
 
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
new file mode 100644
index 0000000000..1846c671dd
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
new file mode 100644
index 0000000000..777340e9b4
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json
new file mode 100644
index 0000000000..f9a12a1c33
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_3_70b_instruct_wml_q_a"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json
new file mode 100644
index 0000000000..aba534abaf
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_3_70b_instruct_wml_q_a_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json
new file mode 100644
index 0000000000..56c157451b
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares",
+    "prediction_field": "contexts",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_3_70b_instruct_wml_q_c_ares"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
new file mode 100644
index 0000000000..e8c60354c1
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares_logprobs",
+    "prediction_field": "contexts",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_3_70b_instruct_wml_q_c_ares_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json
new file mode 100644
index 0000000000..e88220d187
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_3_70b_instruct_wml_q_c_a"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
new file mode 100644
index 0000000000..1888d459eb
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_3_70b_instruct_wml_q_c_a_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json
new file mode 100644
index 0000000000..63c9e0f60b
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_c_a"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json
new file mode 100644
index 0000000000..99ab0ab399
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_c_a_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json
new file mode 100644
index 0000000000..001e1ec9b1
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_q_c_a"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
new file mode 100644
index 0000000000..bdfd221bb4
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_q_c_a_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json
new file mode 100644
index 0000000000..73278e7010
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json
new file mode 100644
index 0000000000..ba92f6b276
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json
new file mode 100644
index 0000000000..d533c86e09
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose_numeric",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json
new file mode 100644
index 0000000000..e8203dc080
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json
new file mode 100644
index 0000000000..2228e40304
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json
new file mode 100644
index 0000000000..59e975d688
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a_numeric",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json
new file mode 100644
index 0000000000..fe1799c6bb
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares",
+    "prediction_field": "contexts",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json
new file mode 100644
index 0000000000..e4a488ee6f
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares_logprobs",
+    "prediction_field": "contexts",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json
new file mode 100644
index 0000000000..4428792605
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares_numeric",
+    "prediction_field": "contexts",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json
new file mode 100644
index 0000000000..ea389fdacc
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
new file mode 100644
index 0000000000..76063236bd
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json
new file mode 100644
index 0000000000..aa7d570ac8
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_numeric",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a_numeric",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json
new file mode 100644
index 0000000000..a01b13516e
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json
new file mode 100644
index 0000000000..df338d5fcb
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json
new file mode 100644
index 0000000000..814e32afb2
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_verbal",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a_verbal",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json
new file mode 100644
index 0000000000..05d5041a34
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
new file mode 100644
index 0000000000..54b538a4fb
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json
new file mode 100644
index 0000000000..b4e6dd159f
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a_verbal",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}

From 2a509950e5683fa49d39ca9c2ac841276fae6dba Mon Sep 17 00:00:00 2001
From: dafnapension
Date: Tue, 21 Oct 2025 22:29:17 +0300
Subject: [PATCH 2/2] delete old

Signed-off-by: dafnapension
---
 ..._instruct_wml_answer_correctness_q_a_gt_loose.json | 11 -----------
 ..._wml_answer_correctness_q_a_gt_loose_logprobs.json | 11 -----------
 ...ama_3_1_70b_instruct_wml_answer_relevance_q_a.json | 11 -----------
 ...0b_instruct_wml_answer_relevance_q_a_logprobs.json | 11 -----------
 ...1_70b_instruct_wml_context_relevance_q_c_ares.json | 11 -----------
 ...truct_wml_context_relevance_q_c_ares_logprobs.json | 11 -----------
 ...1_70b_instruct_wml_correctness_holistic_q_c_a.json | 11 -----------
 ...truct_wml_correctness_holistic_q_c_a_logprobs.json | 11 -----------
 .../llama_3_1_70b_instruct_wml_faithfulness_c_a.json | 11 -----------
 ..._1_70b_instruct_wml_faithfulness_c_a_logprobs.json | 11 -----------
 ...llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json | 11 -----------
 ..._70b_instruct_wml_faithfulness_q_c_a_logprobs.json | 11 -----------
 .../llama_3_1_70b_instruct_wml_q_a_gt_loose.json | 10 ----------
 ...ma_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json | 10 ----------
 ...ama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_a.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_a_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_a_numeric.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_ares.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_ares_numeric.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a_numeric.json | 10 ----------
 .../faithfulness/llama_3_1_70b_instruct_wml_c_a.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_c_a_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_c_a_verbal.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a_verbal.json | 10 ----------
 30 files changed, 312 deletions(-)
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_numeric.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_numeric.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_verbal.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_verbal.json

diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
deleted file mode 100644
index 4608df17fe..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
deleted file mode 100644
index df11925768..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json
deleted file mode 100644
index cb52d639ae..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_1_70b_instruct_wml_q_a"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json
deleted file mode 100644
index 7dbc464162..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_1_70b_instruct_wml_q_a_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json
deleted file mode 100644
index dd7cfc3e41..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares",
-    "prediction_field": "contexts",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
deleted file mode 100644
index 590b902a34..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares_logprobs",
-    "prediction_field": "contexts",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json
deleted file mode 100644
index 1d2f7a144c..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_1_70b_instruct_wml_q_c_a"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
deleted file mode 100644
index a819a468fb..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_1_70b_instruct_wml_q_c_a_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json
deleted file mode 100644
index 247710c3e4..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_c_a"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json
deleted file mode 100644
index c64ee52d9b..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_c_a_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json
deleted file mode 100644
index 8e6b53e089..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
deleted file mode 100644
index a15607ebec..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json
deleted file mode 100644
index b5fe039f72..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json
deleted file mode 100644
index 821144df5e..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json
deleted file mode 100644
index c89c843aba..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose_numeric",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json
deleted file mode 100644
index f6af64f895..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json
deleted file mode 100644
index 49b65f021f..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_numeric.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_numeric.json
deleted file mode 100644
index 90b1e4f36e..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_numeric.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a_numeric",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json
deleted file mode 100644
index a25ab27136..0000000000
--- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares",
-    "prediction_field": "contexts",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json
deleted file mode 100644
index 8bad207bba..0000000000
--- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares_logprobs",
-    "prediction_field": "contexts",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json
deleted file mode 100644
index 65e1b9e8ad..0000000000
--- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares_numeric",
-    "prediction_field": "contexts",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a.json
deleted file mode 100644
index e1ca021268..0000000000
--- a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
deleted file mode 100644
index 559ce625dd..0000000000
--- a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_numeric.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_numeric.json
deleted file mode 100644
index eb60fe03df..0000000000
--- a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_numeric.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_numeric",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a_numeric",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a.json
deleted file mode 100644
index 666269ad0a..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_logprobs.json
deleted file mode 100644
index 6ac0bbfdb1..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_verbal.json
deleted file mode 100644
index 9965060d3e..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_verbal.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_verbal",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a_verbal",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a.json
deleted file mode 100644
index c7af8670fd..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
deleted file mode 100644
index 7ae8386d15..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_verbal.json
deleted file mode 100644
index 1291a2e446..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_verbal.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a_verbal",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}