6 changes: 2 additions & 4 deletions run.py
@@ -271,7 +271,8 @@ def main():
dist.barrier()

try:
result_file_base = f'{model_name}_{dataset_name}.xlsx'
pred_format = get_pred_file_format()
result_file_base = f'{model_name}_{dataset_name}.{pred_format}'

if use_config:
if WORLD_SIZE > 1:
@@ -299,9 +300,6 @@ def main():
continue

# Handling Multi-Turn Dataset
if dataset.TYPE == 'MT':
result_file_base = result_file_base.replace('.xlsx', '.tsv')

result_file = osp.join(pred_root, result_file_base)
# Reuse the previous prediction file if exists
if RANK == 0 and len(prev_pred_roots):
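For context: the get_pred_file_format() helper that replaces the hard-coded .xlsx suffix is not part of this diff. A minimal sketch of what such a helper might look like, assuming the format is chosen through an environment variable and defaults to the historical xlsx behaviour (the variable name, default, and allowed set are assumptions, not taken from this PR):

import os

SUPPORTED_PRED_FORMATS = ('xlsx', 'json', 'tsv')  # assumed set, mirroring the asserts added elsewhere in this PR


def get_pred_file_format() -> str:
    # Hypothetical implementation; the real helper lives in vlmeval.smp and may differ.
    fmt = os.environ.get('PRED_FILE_FORMAT', 'xlsx').lower()  # env var name is an assumption
    if fmt not in SUPPORTED_PRED_FORMATS:
        raise ValueError(f'Unsupported prediction file format: {fmt}')
    return fmt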
4 changes: 3 additions & 1 deletion scripts/apires_scan.py
@@ -10,7 +10,9 @@
model_name = root.split('/')[-1]

for d in SUPPORTED_DATASETS:
fname = f'{model_name}_{d}.xlsx'
from vlmeval.smp import get_pred_file_format
pred_format = get_pred_file_format()
fname = f'{model_name}_{d}.{pred_format}'
pth = osp.join(root, fname)
if osp.exists(pth):
data = load(pth)
4 changes: 3 additions & 1 deletion scripts/auto_run.py
@@ -26,7 +26,9 @@ def is_large(x):
models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)]

for m in models:
unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')]
from vlmeval.smp import get_pred_file_format
pred_format = get_pred_file_format()
unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.{pred_format}')]
if len(unknown_datasets) == 0:
continue
dataset_str = ' '.join(unknown_datasets)
7 changes: 4 additions & 3 deletions vlmeval/dataset/CGAVCounting/cg_av_counting.py
@@ -359,10 +359,11 @@ def save_video_frames(self, video, uid, num_frames=8, fps=-1):

def evaluate(self, eval_file, **judge_kwargs):

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
'data file should be a supported format (xlsx/json/tsv) file'

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score', 'csv')

data = load(eval_file)

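Neither get_file_extension() nor get_intermediate_file_path() is defined in this diff; the call sites only show how they are used. A sketch of the behaviour they appear to rely on, assuming the original extension is kept when no new one is passed (both implementations below are assumptions, not the actual vlmeval.smp.file code):

import os.path as osp


def get_file_extension(path: str) -> str:
    # Assumed behaviour: return the extension without the dot, e.g. 'xlsx' for 'pred.xlsx'.
    return osp.splitext(path)[1].lstrip('.').lower()


def get_intermediate_file_path(eval_file: str, suffix: str, ext: str = None) -> str:
    # Assumed behaviour: insert `suffix` before the extension and optionally swap the
    # extension, replacing the old eval_file.replace('.xlsx', f'{suffix}.{ext}') pattern.
    base, old_ext = osp.splitext(eval_file)
    new_ext = ext if ext is not None else old_ext.lstrip('.')
    return f'{base}{suffix}.{new_ext}'

Under these assumptions, get_intermediate_file_path('out/model_DS.json', '_rating', 'json') gives 'out/model_DS_rating.json', and get_intermediate_file_path('out/model_DS.json', '_score') keeps the original extension and gives 'out/model_DS_score.json'.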
9 changes: 5 additions & 4 deletions vlmeval/dataset/EgoExoBench/egoexobench.py
@@ -244,11 +244,12 @@ def build_prompt(self, line, video_llm):
def evaluate(self, eval_file, **judge_kwargs):
from .utils import get_dimension_rating, extract_characters_regex, extract_option

assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
'data file should be a supported format (xlsx/json/tsv) file'

tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score', 'csv')

if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
4 changes: 2 additions & 2 deletions vlmeval/dataset/GUI/screenspot.py
@@ -324,7 +324,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs):
results_dict[key] = str(0)
else:
results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key]))
score_pth = eval_file.replace(".xlsx", "_score.json")
score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(results_dict, score_pth)

failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
@@ -437,7 +437,7 @@ def make_safe(value):
sub_stats = itertools.chain(*sub_stats)
final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100

score_pth = eval_file.replace(".xlsx", "_score.json")
score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)

failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
4 changes: 2 additions & 2 deletions vlmeval/dataset/GUI/screenspot_pro.py
@@ -312,7 +312,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs):
results_dict[key] = str(0)
else:
results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key]))
score_pth = eval_file.replace(".xlsx", "_score.json")
score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(results_dict, score_pth)

failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
@@ -422,7 +422,7 @@ def make_safe(value):
sub_stats = itertools.chain(*sub_stats)
final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100

score_pth = eval_file.replace(".xlsx", "_score.json")
score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)

failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
53 changes: 25 additions & 28 deletions vlmeval/dataset/OmniDocBench/omnidocbench.py
@@ -4,10 +4,12 @@
import pandas as pd
import tempfile
import base64
import numpy as np
from tqdm import tqdm
import torch.distributed as dist
from ..image_base import ImageBaseDataset
from ...smp import *
from .utils import get_intermediate_file_path, load, dump


class OmniDocBench(ImageBaseDataset):
@@ -75,9 +77,6 @@ def __init__(self,
tsv_path,
match_method:str='quick_match',
filter_types:dict=None):
self.result_foler='../../../outputs/OmniDocBench'
if not os.path.exists(self.result_foler):
os.makedirs(self.result_foler)
self.eval_file=eval_file
self.match_method=match_method
self.references=[]
@@ -374,17 +373,18 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m
'group':group_result,
'page':page_result
}
if not os.path.exists('./output/OmniDocBench'):
os.makedirs('./output/OmniDocBench')
if isinstance(cur_samples,list):
saved_samples=cur_samples
else:
saved_samples=cur_samples.samples
with open(os.path.join(self.result_foler,f'{save_name}_result.josn'),'w',encoding='utf-8') as f:
json.dump(saved_samples,f,indent=4,ensure_ascii=False)
# NOTE: The original code had a bug here: it overwrote the result file on every iteration.
# Fix: include the element in the filename so each element gets its own result file.
# NOTE: Also fixed the '.josn' -> '.json' typo.
result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_{element}_result', 'json')
dump(saved_samples, result_file)

with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f:
json.dump(result_all,f,indent=4,ensure_ascii=False)
metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json')
dump(result_all, metric_result_file)

dict_list = []
save_dict={}
@@ -409,20 +409,20 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m
dict_list.append(save_dict)
df = pd.DataFrame(dict_list,index=['end2end',]).round(3)

with open(os.path.join(self.result_foler,'End2End_Evaluation.json'),'w',encoding='utf-8') as f:
json.dump(result_all,f,indent=4,ensure_ascii=False)
df.to_csv(os.path.join(self.result_foler,'overall.csv'))
over_all_path=os.path.join(self.result_foler,'End2End_Evaluation.json')
print(f"The save path of overall.csv is :{over_all_path}")
e2e_eval_file = get_intermediate_file_path(self.eval_file, '_End2End_Evaluation', 'json')
dump(result_all, e2e_eval_file)

overall_file = get_intermediate_file_path(self.eval_file, '_overall')
dump(df, overall_file)

print(f"The save path of End2End_Evaluation is: {e2e_eval_file}")
print(f"The save path of overall metrics is: {overall_file}")
return df


class table_evalutor():
def __init__(self,eval_file,tsv_path):

self.result_foler='../../../outputs/OmniDocBench'
if not os.path.exists(self.result_foler):
os.makedirs(self.result_foler)
self.eval_file = eval_file
gt_key='html'
pred_key='pred'
self.category_filter='table'
@@ -434,8 +434,8 @@ def load_data(self,eval_file,gt_file,pred_key,gt_key):
from .data_preprocess import clean_string, normalized_formula, textblock2unicode, normalized_table
samples=[]
preds=[]
predictions=pd.read_excel(eval_file)['prediction'].tolist()
gt_samples=pd.read_csv(gt_file,sep='\t')['answer'].tolist()
predictions=load(eval_file)['prediction'].tolist()
gt_samples=load(gt_file)['answer'].tolist()
load_success,load_fail=0,0
for i,gt_sample in tqdm(enumerate(gt_samples),desc='Loading data'):
try:
@@ -533,8 +533,8 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'):
'page':page_result
}

with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f:
json.dump(result_all,f,indent=4,ensure_ascii=False)
metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json')
dump(result_all, metric_result_file)

dict_list=[]
dict_list.append(result_all["group"]["TEDS"])
@@ -545,10 +545,7 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'):
selected_columns = df4[["language: table_en", "language: table_simplified_chinese", "language: table_en_ch_mixed", "line: full_line", "line: less_line", "line: fewer_line", "line: wireless_line",
"with_span: True", "with_span: False", "include_equation: True", "include_equation: False", "include_background: True", "include_background: False", "table_layout: vertical", "table_layout: horizontal"]]

selected_columns.to_csv(os.path.join(self.result_foler,'table_attribute.csv'))
table_attribute_path=os.path.join(self.result_foler,'table_attribute.csv')
print(f'The save path of table_attribute.csv is :{table_attribute_path}')
selected_columns


table_attr_file = get_intermediate_file_path(self.eval_file, '_table_attribute')
dump(selected_columns, table_attr_file)
print(f'The save path of table_attribute is: {table_attr_file}')
return selected_columns
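The NOTE comments in the omnidocbench.py hunk above fix an overwrite bug: the old code wrote a single f'{save_name}_result.josn' file and clobbered it on every element iteration, while the new call folds the element into the suffix so each element gets its own output. A small illustration under the assumptions sketched earlier (all concrete names below are hypothetical; only the import path appears elsewhere in this PR):

from vlmeval.smp.file import get_intermediate_file_path

eval_file = 'outputs/model_OmniDocBench.xlsx'        # hypothetical prediction file
save_name = 'demo_match'                             # hypothetical save_name value
for element in ('text_block', 'table', 'formula'):   # hypothetical element keys
    result_file = get_intermediate_file_path(eval_file, f'_{save_name}_{element}_result', 'json')
    print(result_file)  # e.g. outputs/model_OmniDocBench_demo_match_text_block_result.json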
5 changes: 2 additions & 3 deletions vlmeval/dataset/__init__.py
@@ -151,7 +151,6 @@ def supported_datasets(cls):
return list(cls.DATASET_SETS)

def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
@@ -179,11 +178,11 @@ def evaluate(self, eval_file, **judge_kwargs):

if len(df_all):
result = pd.concat(df_all)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(result, score_file)
return result
else:
score_file = eval_file.replace(f'.{suffix}', '_score.json')
score_file = get_intermediate_file_path(eval_file, '_score', 'json')
dump(dict_all, score_file)
return dict_all

39 changes: 20 additions & 19 deletions vlmeval/dataset/cgbench.py
@@ -1,5 +1,6 @@
from huggingface_hub import snapshot_download
from ..smp import *
from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from .utils.cgbench import *
@@ -432,10 +433,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-

def evaluate(self, eval_file, **judge_kwargs):

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score')

data = load(eval_file)

@@ -760,12 +761,12 @@ def evaluate(self, eval_file, **judge_kwargs):

from .utils.cgbench import get_dimention_rating_open_ended, post_process_open

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score')
step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl')
step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl')

data = load(eval_file)

@@ -784,13 +785,13 @@ def evaluate(self, eval_file, **judge_kwargs):
axis=1,
)

data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]

if judge_kwargs.get("model", None) != "gpt-4o-0806":
judge_kwargs["model"] = "gpt-4o-0806"
print("The judge model in cg-bench is gpt-4o-0806!")

data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]

model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
nproc = judge_kwargs.pop("nproc", 32)

@@ -1314,10 +1315,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-

def evaluate(self, eval_file, **judge_kwargs):

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score')

data = load(eval_file)

@@ -1641,12 +1642,12 @@ def evaluate(self, eval_file, **judge_kwargs):

from .utils.cgbench import get_dimention_rating_open_ended, post_process_open

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score')
step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl')
step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl')

data = load(eval_file)

13 changes: 3 additions & 10 deletions vlmeval/dataset/chartmimic.py
@@ -570,19 +570,12 @@ def judge_one_item_success(item):

infer_data_all = load(eval_file).to_dict(orient="records")

suffix = eval_file.split(".")[-1]
print(f"judge_kwargs: {judge_kwargs}")
infer_model = judge_kwargs["model"]
storage = os.path.abspath(
eval_file.replace(f".{suffix}", f"_{infer_model}.jsonl")
)
score_file = os.path.abspath(
eval_file.replace(f".{suffix}", f"_{infer_model}_score.csv")
)
storage = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}', 'jsonl'))
score_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_score', 'csv'))
# use abs path because of using os.chdir()
tmp_file = os.path.abspath(
eval_file.replace(f".{suffix}", f"_{infer_model}_tmp.pkl")
)
tmp_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_tmp', 'pkl'))
# actually the --api-nproc
nproc = judge_kwargs.pop("nproc", 8)
logger.info(f"nproc: {nproc}")
8 changes: 4 additions & 4 deletions vlmeval/dataset/charxiv.py
@@ -6,6 +6,7 @@

from vlmeval.dataset.image_base import ImageBaseDataset
from vlmeval.smp import misc, file
from vlmeval.smp.file import get_intermediate_file_path
from vlmeval import utils
from vlmeval.dataset.utils import build_judge

@@ -203,10 +204,9 @@ def evaluate(self, eval_file: str, **judge_kwargs: Any) -> pd.DataFrame:
judge_model_name = judge_model.model

# Define file paths
suffix = eval_file.split(".")[-1]
result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.xlsx")
temp_result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.pkl")
score_file = result_file.replace(".xlsx", "_acc.csv")
result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}")
temp_result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}", "pkl")
score_file = get_intermediate_file_path(result_file, "_acc", "csv")

# Return existing results if available
if os.path.exists(result_file):
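In charxiv.py above, score_file is derived from the already renamed result_file, so suffixes accumulate. Assuming the extension-preserving behaviour sketched earlier, the chain resolves as follows (the paths and judge model name are illustrative only; the import matches the one this PR adds to charxiv.py):

from vlmeval.smp.file import get_intermediate_file_path

eval_file = 'outputs/model_CharXiv.xlsx'                                     # hypothetical
result_file = get_intermediate_file_path(eval_file, '_gpt-4o')               # .../model_CharXiv_gpt-4o.xlsx
temp_result_file = get_intermediate_file_path(eval_file, '_gpt-4o', 'pkl')   # .../model_CharXiv_gpt-4o.pkl
score_file = get_intermediate_file_path(result_file, '_acc', 'csv')          # .../model_CharXiv_gpt-4o_acc.csv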
4 changes: 2 additions & 2 deletions vlmeval/dataset/cmmmu.py
@@ -5,6 +5,7 @@
import re
import tempfile
from ..smp import *
from ..smp.file import get_intermediate_file_path


def get_multi_choice_prediction(response, all_choices, index2ans):
@@ -223,8 +224,7 @@ def dump_image(self, line):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):

suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
result_file = get_intermediate_file_path(eval_file, '_acc', 'csv')

if not osp.exists(result_file):
data = load(eval_file)