6 changes: 2 additions & 4 deletions run.py
@@ -271,7 +271,8 @@ def main():
dist.barrier()

try:
result_file_base = f'{model_name}_{dataset_name}.xlsx'
pred_format = get_pred_file_format()
result_file_base = f'{model_name}_{dataset_name}.{pred_format}'

if use_config:
if WORLD_SIZE > 1:
@@ -299,9 +300,6 @@ def main():
continue

# Handling Multi-Turn Dataset
if dataset.TYPE == 'MT':
result_file_base = result_file_base.replace('.xlsx', '.tsv')

result_file = osp.join(pred_root, result_file_base)
# Reuse the previous prediction file if exists
if RANK == 0 and len(prev_pred_roots):
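For context: the get_pred_file_format() helper that replaces the hard-coded .xlsx suffix is not part of this diff. A minimal sketch of what such a helper might look like, assuming the format is chosen through an environment variable and defaults to the historical xlsx behaviour (the variable name, default, and allowed set are assumptions, not taken from this PR):

import os

SUPPORTED_PRED_FORMATS = ('xlsx', 'json', 'tsv')  # assumed set, mirroring the asserts added elsewhere in this PR


def get_pred_file_format() -> str:
    # Hypothetical implementation; the real helper lives in vlmeval.smp and may differ.
    fmt = os.environ.get('PRED_FILE_FORMAT', 'xlsx').lower()  # env var name is an assumption
    if fmt not in SUPPORTED_PRED_FORMATS:
        raise ValueError(f'Unsupported prediction file format: {fmt}')
    return fmt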
4 changes: 3 additions & 1 deletion scripts/apires_scan.py
@@ -10,7 +10,9 @@
model_name = root.split('/')[-1]

for d in SUPPORTED_DATASETS:
fname = f'{model_name}_{d}.xlsx'
from vlmeval.smp import get_pred_file_format
pred_format = get_pred_file_format()
fname = f'{model_name}_{d}.{pred_format}'
pth = osp.join(root, fname)
if osp.exists(pth):
data = load(pth)
4 changes: 3 additions & 1 deletion scripts/auto_run.py
@@ -26,7 +26,9 @@ def is_large(x):
models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)]

for m in models:
unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')]
from vlmeval.smp import get_pred_file_format
pred_format = get_pred_file_format()
unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.{pred_format}')]
if len(unknown_datasets) == 0:
continue
dataset_str = ' '.join(unknown_datasets)
7 changes: 4 additions & 3 deletions vlmeval/dataset/CGAVCounting/cg_av_counting.py
@@ -359,10 +359,11 @@ def save_video_frames(self, video, uid, num_frames=8, fps=-1):

def evaluate(self, eval_file, **judge_kwargs):

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
'data file should be a supported format (xlsx/json/tsv) file'

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score', 'csv')

data = load(eval_file)

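Neither get_file_extension() nor get_intermediate_file_path() is defined in this diff; the call sites only show how they are used. A sketch of the behaviour they appear to rely on, assuming the original extension is kept when no new one is passed (both implementations below are assumptions, not the actual vlmeval.smp.file code):

import os.path as osp


def get_file_extension(path: str) -> str:
    # Assumed behaviour: return the extension without the dot, e.g. 'xlsx' for 'pred.xlsx'.
    return osp.splitext(path)[1].lstrip('.').lower()


def get_intermediate_file_path(eval_file: str, suffix: str, ext: str = None) -> str:
    # Assumed behaviour: insert `suffix` before the extension and optionally swap the
    # extension, replacing the old eval_file.replace('.xlsx', f'{suffix}.{ext}') pattern.
    base, old_ext = osp.splitext(eval_file)
    new_ext = ext if ext is not None else old_ext.lstrip('.')
    return f'{base}{suffix}.{new_ext}'

Under these assumptions, get_intermediate_file_path('out/model_DS.json', '_rating', 'json') gives 'out/model_DS_rating.json', and get_intermediate_file_path('out/model_DS.json', '_score') keeps the original extension and gives 'out/model_DS_score.json'.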
9 changes: 5 additions & 4 deletions vlmeval/dataset/EgoExoBench/egoexobench.py
@@ -244,11 +244,12 @@ def build_prompt(self, line, video_llm):
def evaluate(self, eval_file, **judge_kwargs):
from .utils import get_dimension_rating, extract_characters_regex, extract_option

assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
'data file should be a supported format (xlsx/json/tsv) file'

tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score', 'csv')

if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
4 changes: 2 additions & 2 deletions vlmeval/dataset/GUI/screenspot.py
@@ -324,7 +324,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs):
results_dict[key] = str(0)
else:
results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key]))
score_pth = eval_file.replace(".xlsx", "_score.json")
score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(results_dict, score_pth)

failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
@@ -437,7 +437,7 @@ def make_safe(value):
sub_stats = itertools.chain(*sub_stats)
final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100

score_pth = eval_file.replace(".xlsx", "_score.json")
score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)

failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
4 changes: 2 additions & 2 deletions vlmeval/dataset/GUI/screenspot_pro.py
@@ -312,7 +312,7 @@ def evaluate_rectangle(self, eval_file, **judge_kwargs):
results_dict[key] = str(0)
else:
results_dict[key] = str(sum(results_dict[key]) / len(results_dict[key]))
score_pth = eval_file.replace(".xlsx", "_score.json")
score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(results_dict, score_pth)

failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
@@ -422,7 +422,7 @@ def make_safe(value):
sub_stats = itertools.chain(*sub_stats)
final_score_dict[c + '_Accuracy'] = np.mean([x > 0 for x in sub_stats]) * 100

score_pth = eval_file.replace(".xlsx", "_score.json")
score_pth = get_intermediate_file_path(eval_file, '_score', 'json')
dump(final_score_dict, score_pth)

failure_cases_path = os.environ.get("FAILURE_CASES_PATH", None)
53 changes: 25 additions & 28 deletions vlmeval/dataset/OmniDocBench/omnidocbench.py
@@ -4,10 +4,12 @@
import pandas as pd
import tempfile
import base64
import numpy as np
from tqdm import tqdm
import torch.distributed as dist
from ..image_base import ImageBaseDataset
from ...smp import *
from .utils import get_intermediate_file_path, load, dump


class OmniDocBench(ImageBaseDataset):
@@ -75,9 +77,6 @@ def __init__(self,
tsv_path,
match_method:str='quick_match',
filter_types:dict=None):
self.result_foler='../../../outputs/OmniDocBench'
if not os.path.exists(self.result_foler):
os.makedirs(self.result_foler)
self.eval_file=eval_file
self.match_method=match_method
self.references=[]
@@ -374,17 +373,18 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m
'group':group_result,
'page':page_result
}
if not os.path.exists('./output/OmniDocBench'):
os.makedirs('./output/OmniDocBench')
if isinstance(cur_samples,list):
saved_samples=cur_samples
else:
saved_samples=cur_samples.samples
with open(os.path.join(self.result_foler,f'{save_name}_result.josn'),'w',encoding='utf-8') as f:
json.dump(saved_samples,f,indent=4,ensure_ascii=False)
# NOTE: The original code had a bug here: it overwrote the result file on every iteration.
# Fix: include the element in the filename so each element gets its own result file.
# NOTE: Also fixed the '.josn' -> '.json' typo.
result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_{element}_result', 'json')
dump(saved_samples, result_file)

with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f:
json.dump(result_all,f,indent=4,ensure_ascii=False)
metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json')
dump(result_all, metric_result_file)

dict_list = []
save_dict={}
@@ -409,20 +409,20 @@ def process_generated_metric_results(self,samples,save_name:str='end2end_quick_m
dict_list.append(save_dict)
df = pd.DataFrame(dict_list,index=['end2end',]).round(3)

with open(os.path.join(self.result_foler,'End2End_Evaluation.json'),'w',encoding='utf-8') as f:
json.dump(result_all,f,indent=4,ensure_ascii=False)
df.to_csv(os.path.join(self.result_foler,'overall.csv'))
over_all_path=os.path.join(self.result_foler,'End2End_Evaluation.json')
print(f"The save path of overall.csv is :{over_all_path}")
e2e_eval_file = get_intermediate_file_path(self.eval_file, '_End2End_Evaluation', 'json')
dump(result_all, e2e_eval_file)

overall_file = get_intermediate_file_path(self.eval_file, '_overall')
dump(df, overall_file)

print(f"The save path of End2End_Evaluation is: {e2e_eval_file}")
print(f"The save path of overall metrics is: {overall_file}")
return df


class table_evalutor():
def __init__(self,eval_file,tsv_path):

self.result_foler='../../../outputs/OmniDocBench'
if not os.path.exists(self.result_foler):
os.makedirs(self.result_foler)
self.eval_file = eval_file
gt_key='html'
pred_key='pred'
self.category_filter='table'
@@ -434,8 +434,8 @@ def load_data(self,eval_file,gt_file,pred_key,gt_key):
from .data_preprocess import clean_string, normalized_formula, textblock2unicode, normalized_table
samples=[]
preds=[]
predictions=pd.read_excel(eval_file)['prediction'].tolist()
gt_samples=pd.read_csv(gt_file,sep='\t')['answer'].tolist()
predictions=load(eval_file)['prediction'].tolist()
gt_samples=load(gt_file)['answer'].tolist()
load_success,load_fail=0,0
for i,gt_sample in tqdm(enumerate(gt_samples),desc='Loading data'):
try:
@@ -533,8 +533,8 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'):
'page':page_result
}

with open(os.path.join(self.result_foler,f'{save_name}_metric_result.json'),'w',encoding='utf-8') as f:
json.dump(result_all,f,indent=4,ensure_ascii=False)
metric_result_file = get_intermediate_file_path(self.eval_file, f'_{save_name}_metric_result', 'json')
dump(result_all, metric_result_file)

dict_list=[]
dict_list.append(result_all["group"]["TEDS"])
@@ -545,10 +545,7 @@ def process_generated_metric_results(self,save_name:str='OmniDocBench_table'):
selected_columns = df4[["language: table_en", "language: table_simplified_chinese", "language: table_en_ch_mixed", "line: full_line", "line: less_line", "line: fewer_line", "line: wireless_line",
"with_span: True", "with_span: False", "include_equation: True", "include_equation: False", "include_background: True", "include_background: False", "table_layout: vertical", "table_layout: horizontal"]]

selected_columns.to_csv(os.path.join(self.result_foler,'table_attribute.csv'))
table_attribute_path=os.path.join(self.result_foler,'table_attribute.csv')
print(f'The save path of table_attribute.csv is :{table_attribute_path}')
selected_columns


table_attr_file = get_intermediate_file_path(self.eval_file, '_table_attribute')
dump(selected_columns, table_attr_file)
print(f'The save path of table_attribute is: {table_attr_file}')
return selected_columns
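The NOTE comments in the omnidocbench.py hunk above fix an overwrite bug: the old code wrote a single f'{save_name}_result.josn' file and clobbered it on every element iteration, while the new call folds the element into the suffix so each element gets its own output. A small illustration under the assumptions sketched earlier (all concrete names below are hypothetical; only the import path appears elsewhere in this PR):

from vlmeval.smp.file import get_intermediate_file_path

eval_file = 'outputs/model_OmniDocBench.xlsx'        # hypothetical prediction file
save_name = 'demo_match'                             # hypothetical save_name value
for element in ('text_block', 'table', 'formula'):   # hypothetical element keys
    result_file = get_intermediate_file_path(eval_file, f'_{save_name}_{element}_result', 'json')
    print(result_file)  # e.g. outputs/model_OmniDocBench_demo_match_text_block_result.json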
5 changes: 2 additions & 3 deletions vlmeval/dataset/__init__.py
@@ -151,7 +151,6 @@ def supported_datasets(cls):
return list(cls.DATASET_SETS)

def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
@@ -179,11 +178,11 @@ def evaluate(self, eval_file, **judge_kwargs):

if len(df_all):
result = pd.concat(df_all)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
score_file = get_intermediate_file_path(eval_file, '_acc', 'csv')
dump(result, score_file)
return result
else:
score_file = eval_file.replace(f'.{suffix}', '_score.json')
score_file = get_intermediate_file_path(eval_file, '_score', 'json')
dump(dict_all, score_file)
return dict_all

39 changes: 20 additions & 19 deletions vlmeval/dataset/cgbench.py
@@ -1,5 +1,6 @@
from huggingface_hub import snapshot_download
from ..smp import *
from ..smp.file import get_intermediate_file_path, get_file_extension
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from .utils.cgbench import *
@@ -432,10 +433,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-

def evaluate(self, eval_file, **judge_kwargs):

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score')

data = load(eval_file)

@@ -760,12 +761,12 @@ def evaluate(self, eval_file, **judge_kwargs):

from .utils.cgbench import get_dimention_rating_open_ended, post_process_open

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score')
step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl')
step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl')

data = load(eval_file)

@@ -784,13 +785,13 @@ def evaluate(self, eval_file, **judge_kwargs):
axis=1,
)

data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]

if judge_kwargs.get("model", None) != "gpt-4o-0806":
judge_kwargs["model"] = "gpt-4o-0806"
print("The judge model in cg-bench is gpt-4o-0806!")

data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]

model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
nproc = judge_kwargs.pop("nproc", 32)

@@ -1314,10 +1315,10 @@ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-

def evaluate(self, eval_file, **judge_kwargs):

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score')

data = load(eval_file)

@@ -1641,12 +1642,12 @@ def evaluate(self, eval_file, **judge_kwargs):

from .utils.cgbench import get_dimention_rating_open_ended, post_process_open

assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], "data file should be a supported format"

tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score')
step_1_tmp_file = get_intermediate_file_path(eval_file, '_step_1', 'pkl')
step_2_tmp_file = get_intermediate_file_path(eval_file, '_step_2', 'pkl')

data = load(eval_file)

13 changes: 3 additions & 10 deletions vlmeval/dataset/chartmimic.py
@@ -570,19 +570,12 @@ def judge_one_item_success(item):

infer_data_all = load(eval_file).to_dict(orient="records")

suffix = eval_file.split(".")[-1]
print(f"judge_kwargs: {judge_kwargs}")
infer_model = judge_kwargs["model"]
storage = os.path.abspath(
eval_file.replace(f".{suffix}", f"_{infer_model}.jsonl")
)
score_file = os.path.abspath(
eval_file.replace(f".{suffix}", f"_{infer_model}_score.csv")
)
storage = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}', 'jsonl'))
score_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_score', 'csv'))
# use abs path because of using os.chdir()
tmp_file = os.path.abspath(
eval_file.replace(f".{suffix}", f"_{infer_model}_tmp.pkl")
)
tmp_file = os.path.abspath(get_intermediate_file_path(eval_file, f'_{infer_model}_tmp', 'pkl'))
# actually the --api-nproc
nproc = judge_kwargs.pop("nproc", 8)
logger.info(f"nproc: {nproc}")
8 changes: 4 additions & 4 deletions vlmeval/dataset/charxiv.py
@@ -6,6 +6,7 @@

from vlmeval.dataset.image_base import ImageBaseDataset
from vlmeval.smp import misc, file
from vlmeval.smp.file import get_intermediate_file_path
from vlmeval import utils
from vlmeval.dataset.utils import build_judge

@@ -203,10 +204,9 @@ def evaluate(self, eval_file: str, **judge_kwargs: Any) -> pd.DataFrame:
judge_model_name = judge_model.model

# Define file paths
suffix = eval_file.split(".")[-1]
result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.xlsx")
temp_result_file = eval_file.replace(f".{suffix}", f"_{judge_model_name}.pkl")
score_file = result_file.replace(".xlsx", "_acc.csv")
result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}")
temp_result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}", "pkl")
score_file = get_intermediate_file_path(result_file, "_acc", "csv")

# Return existing results if available
if os.path.exists(result_file):
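In charxiv.py above, score_file is derived from the already renamed result_file, so suffixes accumulate. Assuming the extension-preserving behaviour sketched earlier, the chain resolves as follows (the paths and judge model name are illustrative only; the import matches the one this PR adds to charxiv.py):

from vlmeval.smp.file import get_intermediate_file_path

eval_file = 'outputs/model_CharXiv.xlsx'                                     # hypothetical
result_file = get_intermediate_file_path(eval_file, '_gpt-4o')               # .../model_CharXiv_gpt-4o.xlsx
temp_result_file = get_intermediate_file_path(eval_file, '_gpt-4o', 'pkl')   # .../model_CharXiv_gpt-4o.pkl
score_file = get_intermediate_file_path(result_file, '_acc', 'csv')          # .../model_CharXiv_gpt-4o_acc.csv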
4 changes: 2 additions & 2 deletions vlmeval/dataset/cmmmu.py
@@ -5,6 +5,7 @@
import re
import tempfile
from ..smp import *
from ..smp.file import get_intermediate_file_path


def get_multi_choice_prediction(response, all_choices, index2ans):
@@ -223,8 +224,7 @@ def dump_image(self, line):
@classmethod
def evaluate(self, eval_file, **judge_kwargs):

suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
result_file = get_intermediate_file_path(eval_file, '_acc', 'csv')

if not osp.exists(result_file):
data = load(eval_file)