""" Collect and save in data folder datasets with group labels of human annotations """
import os
import argparse
import pandas as pd
from typing import Dict, Optional

# Global vars
DNAMES = ['jigsaw', 'xtremespeech', 'gabhatecorpus', 'hatexplain']
# Names to match output in Multi-output TargetIdentityLayer.
OUTPUT_GROUP_NAMES = {'age': 'target_age', 'disability': 'target_disability', 'gender': 'target_gender',
                      'origin': 'target_origin', 'race': 'target_race', 'religion': 'target_religion',
                      'sexual_orientation': 'target_sexuality'}


def import_dataset(dname: str, d_path: Optional[str] = None, o_path: Optional[str] = None):
    """ Import processed data from the 'data' folder, or collect it from d_path and save it """
    if not o_path:
        o_path = f'./data/{dname}.csv'
    if dname == "jigsaw":
        from functions.dataCollect import jigsaw as data_processor
    elif dname == 'xtremespeech':
        from functions.dataCollect import xtremespeech as data_processor
    elif dname == 'gabhatecorpus':
        from functions.dataCollect import gabhatecorpus as data_processor
    elif dname == 'hatexplain':
        from functions.dataCollect import hatexplain as data_processor
    else:
        raise ValueError(f"Need to include data collection functions and path to original data file: {dname}. "
                         "List of available datasets: " + ", ".join(DNAMES))
    if os.path.exists(o_path):
        # Import the already-processed file from the data folder
        df = pd.read_csv(o_path)
        text_col, id_col = data_processor.TEXT_COL, data_processor.ID_COL
        identities_dict = data_processor.IDENTITIES_DICT
        print(f'{dname} imported successfully from data folder: {df.shape[0]} annotation samples.')
    else:
        # Import the file from the source folder and process it
        try:
            df, text_col, id_col, identities_dict = data_processor.process_data(d_path, o_path=o_path)
            print(f'{dname} fetched, prepared, and saved to export path')
        except FileNotFoundError:
            raise FileNotFoundError(f"{dname} dataset not found in data folder. "
                                    f"Provide a valid path to the original data file: {d_path}")
    return df, text_col, id_col, identities_dict
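
# Usage sketch (the path below is hypothetical): load the processed HateXplain
# annotations, falling back to the raw source file if ./data/hatexplain.csv
# does not exist yet:
#   df, text_col, id_col, identities_dict = import_dataset(
#       'hatexplain', d_path='./raw/hatexplain.json')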


# Evaluating identity group identification models
def prepare_for_model_evaluation(df: pd.DataFrame, text_col0: str, id_col0: str, identities_dict: Dict,
                                 thr: float = 0.5, output_names=None):
    """ Prepare a dataset for evaluating an identity group identification model as an external dataset """
    # 1. Rename columns to match the model output convention
    if output_names is None:
        output_names = OUTPUT_GROUP_NAMES
    text_col, id_col = 'predict_text', 'comment_id'
    to_rename = {id_col0: id_col, text_col0: text_col}
    for group in identities_dict.keys():
        if group in output_names.keys():
            to_rename[group] = output_names[group]
    df = df.rename(to_rename, axis=1)
    # 2. Apply hard thresholding (default thr=0.5) to the target annotations (y_true)
    target_cols = sorted([col for col in df.columns if col in output_names.values()])
    for target_col in target_cols:
        df[target_col] = df[target_col].apply(lambda v: int(v >= thr))
    # 3. Add the OR(Gender, Sexuality) column
    df['target_gso'] = ((df['target_gender'] == 1) | (df['target_sexuality'] == 1)).astype('int')
    target_cols.append('target_gso')
    return df, sorted(target_cols), text_col, id_col
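
# Usage sketch (assuming df comes from import_dataset above): binarize the
# identity annotations at 0.5 and get the target columns the model expects:
#   df, text_col, id_col, identities_dict = import_dataset('jigsaw')
#   eval_df, target_cols, text_col, id_col = prepare_for_model_evaluation(
#       df, text_col, id_col, identities_dict, thr=0.5)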


def main():
    desc = "Collect and save dataset with identity labels in data folder"
    parser = argparse.ArgumentParser(description=desc)
    # Required parameters
    parser.add_argument("--d_name",
                        default=None,
                        type=str,
                        required=True,
                        help="Dataset selected in the list: " + ", ".join(DNAMES),
                        )
    # Other parameters
    parser.add_argument("--d_path",
                        default=None,
                        type=str,
                        required=False,
                        help="Path to original data file.",
                        )
    parser.add_argument("--o_path",
                        default=None,
                        type=str,
                        required=False,
                        help="Path to collected data file.",
                        )
    args = parser.parse_args()
    _ = import_dataset(args.d_name, args.d_path, args.o_path)
    return
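
# Example CLI invocation (the --d_path value is a hypothetical path to the raw file):
#   python hate_datasets.py --d_name jigsaw --d_path ./raw/jigsaw.csv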

if __name__ == "__main__":
    main()