From 48b14218104d4ef53ebcdca1ceb311d0f6f7e3be Mon Sep 17 00:00:00 2001
From: dafnapension
Date: Tue, 21 Oct 2025 22:27:23 +0300
Subject: [PATCH 1/2] modified and new

Signed-off-by: dafnapension
---
 prepare/metrics/llm_as_judge/rag_judge.py | 2 +-
 ..._instruct_wml_answer_correctness_q_a_gt_loose.json | 11 +++++++++++
 ..._wml_answer_correctness_q_a_gt_loose_logprobs.json | 11 +++++++++++
 ...ama_3_3_70b_instruct_wml_answer_relevance_q_a.json | 11 +++++++++++
 ...0b_instruct_wml_answer_relevance_q_a_logprobs.json | 11 +++++++++++
 ...3_70b_instruct_wml_context_relevance_q_c_ares.json | 11 +++++++++++
 ...truct_wml_context_relevance_q_c_ares_logprobs.json | 11 +++++++++++
 ...3_70b_instruct_wml_correctness_holistic_q_c_a.json | 11 +++++++++++
 ...truct_wml_correctness_holistic_q_c_a_logprobs.json | 11 +++++++++++
 .../llama_3_3_70b_instruct_wml_faithfulness_c_a.json | 11 +++++++++++
 ..._3_70b_instruct_wml_faithfulness_c_a_logprobs.json | 11 +++++++++++
 ...llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json | 11 +++++++++++
 ..._70b_instruct_wml_faithfulness_q_c_a_logprobs.json | 11 +++++++++++
 .../llama_3_3_70b_instruct_wml_q_a_gt_loose.json | 10 ++++++++++
 ...ma_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json | 10 ++++++++++
 ...ama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_a.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_a_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_a_numeric.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_ares.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_ares_numeric.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a_numeric.json | 10 ++++++++++
 .../faithfulness/llama_3_3_70b_instruct_wml_c_a.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_c_a_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_c_a_verbal.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a_logprobs.json | 10 ++++++++++
 .../llama_3_3_70b_instruct_wml_q_c_a_verbal.json | 10 ++++++++++
 31 files changed, 313 insertions(+), 1 deletion(-)
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json
 create mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json
 create mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json
 create mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
 create mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json

diff --git a/prepare/metrics/llm_as_judge/rag_judge.py b/prepare/metrics/llm_as_judge/rag_judge.py
index 65ce814ba3..b46c138db2 100644
--- a/prepare/metrics/llm_as_judge/rag_judge.py
+++ b/prepare/metrics/llm_as_judge/rag_judge.py
@@ -24,7 +24,7 @@
 generic_engine_label = "generic_inference_engine"
 
 inference_models = {
-    "llama_3_1_70b_instruct_wml": "engines.classification.llama_3_1_70b_instruct_wml",
+    "llama_3_3_70b_instruct_wml": "engines.classification.llama_3_3_70b_instruct_wml",
     generic_engine_label: GenericInferenceEngine(),
 }
 
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
new file mode 100644
index 0000000000..1846c671dd
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
new file mode 100644
index 0000000000..777340e9b4
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json
new file mode 100644
index 0000000000..f9a12a1c33
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_3_70b_instruct_wml_q_a"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json
new file mode 100644
index 0000000000..aba534abaf
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_answer_relevance_q_a_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_3_70b_instruct_wml_q_a_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json
new file mode 100644
index 0000000000..56c157451b
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares",
+    "prediction_field": "contexts",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_3_70b_instruct_wml_q_c_ares"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
new file mode 100644
index 0000000000..e8c60354c1
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares_logprobs",
+    "prediction_field": "contexts",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_3_70b_instruct_wml_q_c_ares_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json
new file mode 100644
index 0000000000..e88220d187
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_3_70b_instruct_wml_q_c_a"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
new file mode 100644
index 0000000000..1888d459eb
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_3_70b_instruct_wml_q_c_a_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json
new file mode 100644
index 0000000000..63c9e0f60b
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_c_a"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json
new file mode 100644
index 0000000000..99ab0ab399
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_c_a_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_c_a_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json
new file mode 100644
index 0000000000..001e1ec9b1
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_q_c_a"
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
new file mode 100644
index 0000000000..bdfd221bb4
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_3_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
@@ -0,0 +1,11 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true,
+    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_3_70b_instruct_wml_q_c_a_logprobs"
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json
new file mode 100644
index 0000000000..73278e7010
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json
new file mode 100644
index 0000000000..ba92f6b276
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json
new file mode 100644
index 0000000000..d533c86e09
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_3_70b_instruct_wml_q_a_gt_loose_numeric.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": null,
+    "main_score": "answer_correctness_q_a_gt_loose_numeric",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json
new file mode 100644
index 0000000000..e8203dc080
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json
new file mode 100644
index 0000000000..2228e40304
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json
new file mode 100644
index 0000000000..59e975d688
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_3_70b_instruct_wml_q_a_numeric.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": null,
+    "main_score": "answer_relevance_q_a_numeric",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json
new file mode 100644
index 0000000000..fe1799c6bb
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares",
+    "prediction_field": "contexts",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json
new file mode 100644
index 0000000000..e4a488ee6f
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares_logprobs",
+    "prediction_field": "contexts",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json
new file mode 100644
index 0000000000..4428792605
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_3_70b_instruct_wml_q_c_ares_numeric.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": null,
+    "main_score": "context_relevance_q_c_ares_numeric",
+    "prediction_field": "contexts",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json
new file mode 100644
index 0000000000..ea389fdacc
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
new file mode 100644
index 0000000000..76063236bd
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json
new file mode 100644
index 0000000000..aa7d570ac8
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_3_70b_instruct_wml_q_c_a_numeric.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_numeric",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": null,
+    "main_score": "correctness_holistic_q_c_a_numeric",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json
new file mode 100644
index 0000000000..a01b13516e
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json
new file mode 100644
index 0000000000..df338d5fcb
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json
new file mode 100644
index 0000000000..814e32afb2
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_c_a_verbal.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_verbal",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_c_a_verbal",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json
new file mode 100644
index 0000000000..05d5041a34
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
new file mode 100644
index 0000000000..54b538a4fb
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_logprobs.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a_logprobs",
+    "prediction_field": "answer",
+    "infer_log_probs": true
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json
new file mode 100644
index 0000000000..b4e6dd159f
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_3_70b_instruct_wml_q_c_a_verbal.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": "engines.classification.llama_3_3_70b_instruct_wml",
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": null,
+    "main_score": "faithfulness_q_c_a_verbal",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}

From 2a509950e5683fa49d39ca9c2ac841276fae6dba Mon Sep 17 00:00:00 2001
From: dafnapension
Date: Tue, 21 Oct 2025 22:29:17 +0300
Subject: [PATCH 2/2] delete old

Signed-off-by: dafnapension
---
 ..._instruct_wml_answer_correctness_q_a_gt_loose.json | 11 -----------
 ..._wml_answer_correctness_q_a_gt_loose_logprobs.json | 11 -----------
 ...ama_3_1_70b_instruct_wml_answer_relevance_q_a.json | 11 -----------
 ...0b_instruct_wml_answer_relevance_q_a_logprobs.json | 11 -----------
 ...1_70b_instruct_wml_context_relevance_q_c_ares.json | 11 -----------
 ...truct_wml_context_relevance_q_c_ares_logprobs.json | 11 -----------
 ...1_70b_instruct_wml_correctness_holistic_q_c_a.json | 11 -----------
 ...truct_wml_correctness_holistic_q_c_a_logprobs.json | 11 -----------
 .../llama_3_1_70b_instruct_wml_faithfulness_c_a.json | 11 -----------
 ..._1_70b_instruct_wml_faithfulness_c_a_logprobs.json | 11 -----------
 ...llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json | 11 -----------
 ..._70b_instruct_wml_faithfulness_q_c_a_logprobs.json | 11 -----------
 .../llama_3_1_70b_instruct_wml_q_a_gt_loose.json | 10 ----------
 ...ma_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json | 10 ----------
 ...ama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_a.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_a_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_a_numeric.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_ares.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_ares_numeric.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a_numeric.json | 10 ----------
 .../faithfulness/llama_3_1_70b_instruct_wml_c_a.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_c_a_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_c_a_verbal.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a_logprobs.json | 10 ----------
 .../llama_3_1_70b_instruct_wml_q_c_a_verbal.json | 10 ----------
 30 files changed, 312 deletions(-)
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_numeric.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_numeric.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_verbal.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
 delete mode 100644 src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_verbal.json

diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
deleted file mode 100644
index 4608df17fe..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
deleted file mode 100644
index df11925768..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_correctness_q_a_gt_loose_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json
deleted file mode 100644
index cb52d639ae..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_1_70b_instruct_wml_q_a"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json
deleted file mode 100644
index 7dbc464162..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_answer_relevance_q_a_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.answer_relevance.llama_3_1_70b_instruct_wml_q_a_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json
deleted file mode 100644
index dd7cfc3e41..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares",
-    "prediction_field": "contexts",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
deleted file mode 100644
index 590b902a34..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares_logprobs",
-    "prediction_field": "contexts",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json
deleted file mode 100644
index 1d2f7a144c..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_1_70b_instruct_wml_q_c_a"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
deleted file mode 100644
index a819a468fb..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_correctness_holistic_q_c_a_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.correctness_holistic.llama_3_1_70b_instruct_wml_q_c_a_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json
deleted file mode 100644
index 247710c3e4..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_c_a"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json
deleted file mode 100644
index c64ee52d9b..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_c_a_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_c_a_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json
deleted file mode 100644
index 8e6b53e089..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a"
-}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
deleted file mode 100644
index a15607ebec..0000000000
--- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_faithfulness_q_c_a_logprobs.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true,
-    "__deprecated_msg__": "This metric should be replaced with metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a_logprobs"
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json
deleted file mode 100644
index b5fe039f72..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json
deleted file mode 100644
index 821144df5e..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_logprobs",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json b/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json
deleted file mode 100644
index c89c843aba..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_correctness/llama_3_1_70b_instruct_wml_q_a_gt_loose_numeric.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric",
-    "task": "tasks.rag_eval.answer_correctness.binary",
-    "format": null,
-    "main_score": "answer_correctness_q_a_gt_loose_numeric",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json
deleted file mode 100644
index f6af64f895..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json
deleted file mode 100644
index 49b65f021f..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_logprobs",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_numeric.json b/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_numeric.json
deleted file mode 100644
index 90b1e4f36e..0000000000
--- a/src/unitxt/catalog/metrics/rag/answer_relevance/llama_3_1_70b_instruct_wml_q_a_numeric.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric",
-    "task": "tasks.rag_eval.answer_relevance.binary",
-    "format": null,
-    "main_score": "answer_relevance_q_a_numeric",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json
deleted file mode 100644
index a25ab27136..0000000000
--- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares",
-    "prediction_field": "contexts",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json
deleted file mode 100644
index 8bad207bba..0000000000
--- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_logprobs",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares_logprobs",
-    "prediction_field": "contexts",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json
deleted file mode 100644
index 65e1b9e8ad..0000000000
--- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric",
-    "task": "tasks.rag_eval.context_relevance.binary",
-    "format": null,
-    "main_score": "context_relevance_q_c_ares_numeric",
-    "prediction_field": "contexts",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a.json
deleted file mode 100644
index e1ca021268..0000000000
--- a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
deleted file mode 100644
index 559ce625dd..0000000000
--- a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_logprobs",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_numeric.json b/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_numeric.json
deleted file mode 100644
index eb60fe03df..0000000000
--- a/src/unitxt/catalog/metrics/rag/correctness_holistic/llama_3_1_70b_instruct_wml_q_c_a_numeric.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple_numeric",
-    "task": "tasks.rag_eval.correctness_holistic.binary",
-    "format": null,
-    "main_score": "correctness_holistic_q_c_a_numeric",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a.json
deleted file mode 100644
index 666269ad0a..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_logprobs.json
deleted file mode 100644
index 6ac0bbfdb1..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_logprobs",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_verbal.json
deleted file mode 100644
index 9965060d3e..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_c_a_verbal.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified_verbal",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_c_a_verbal",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a.json
deleted file mode 100644
index c7af8670fd..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
deleted file mode 100644
index 7ae8386d15..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_logprobs.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_logprobs",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a_logprobs",
-    "prediction_field": "answer",
-    "infer_log_probs": true
-}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_verbal.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_verbal.json
deleted file mode 100644
index 1291a2e446..0000000000
--- a/src/unitxt/catalog/metrics/rag/faithfulness/llama_3_1_70b_instruct_wml_q_c_a_verbal.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "__type__": "task_based_ll_mas_judge",
-    "inference_model": "engines.classification.llama_3_1_70b_instruct_wml",
-    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal",
-    "task": "tasks.rag_eval.faithfulness.binary",
-    "format": null,
-    "main_score": "faithfulness_q_c_a_verbal",
-    "prediction_field": "answer",
-    "infer_log_probs": false
-}