-
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpreprocess.py
More file actions
49 lines (38 loc) · 1.3 KB
/
preprocess.py
File metadata and controls
49 lines (38 loc) · 1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import json
import re
input_src = "story_data.json"
output_src = "story_data_proc.json"

# JSON text is UTF-8 by specification, so decode explicitly instead of relying
# on the platform default encoding. The `with` block closes the file itself.
with open(input_src, encoding="utf-8") as file:
    input_raw = file.read()

# Remove hanging commas (they break the json lib): drop any comma whose next
# non-whitespace character is a closing `}` or `]`. The first alternative
# matches complete string literals and puts them back untouched, so commas
# inside string values are never removed. The lookahead keeps the whitespace
# and the bracket in place.
input_raw = re.sub(
    r'("(?:[^"\\]|\\.)*")|,(?=\s*[}\]])',
    lambda m: m.group(1) or "",
    input_raw,
)
data = json.loads(input_raw)
def process(pages):
    """Normalise page references, merge continuing pages and renumber them.

    For every page the lowercase ASCII letters are stripped from its
    ``ref_start`` and ``ref_end`` strings (presumably sub-reference suffixes
    such as the ``a`` in ``5a`` -- confirm against the data). A page whose
    ``ref_start`` equals the ``ref_end`` of the previously kept page is folded
    into that page by extending its ``ref_end``; otherwise the page is kept.
    The kept pages then get a 1-based ``page`` number.

    The page dicts are modified in place and the kept ones are returned as
    the same objects.

    Args:
        pages: sequence of dicts, each with string ``ref_start`` and
            ``ref_end`` entries.

    Returns:
        A new list holding the kept (merged) page dicts in their original
        order.
    """
    lowercase = re.compile(r"[a-z]")
    merged = []
    for page in pages:
        page["ref_start"] = lowercase.sub("", page["ref_start"])
        page["ref_end"] = lowercase.sub("", page["ref_end"])
        if merged and merged[-1]["ref_end"] == page["ref_start"]:
            # This page continues the previous one: extend it, drop this one.
            merged[-1]["ref_end"] = page["ref_end"]
        else:
            merged.append(page)
    for number, page in enumerate(merged, start=1):
        page["page"] = number
    return merged
# Replace each story's page list with its merged, renumbered version.
for entry in data["storyCollection"]:
    story = entry["story"]
    story["pages"] = process(story["pages"])
# just some formatting for the output JSON
def _squeeze_structure(text):
    """Collapse whitespace after ``,`` and ``{`` and before ``}`` to one space."""
    text = re.sub(r",\s+", ", ", text)
    text = re.sub(r"\s+\}", " }", text)
    text = re.sub(r"\{\s+", "{ ", text)
    return text


def flatten(match):
    """Return the matched JSON object text rewritten onto a single line.

    Meant to be used as the replacement callable of ``re.sub``: it receives a
    match whose full text is an indented JSON object and returns that text
    with the whitespace after commas, after ``{`` and before ``}`` reduced to
    a single space.

    Only the text between string literals is rewritten. String literals are
    copied through unchanged, so a value such as ``"x,  y"`` keeps its
    internal spacing.

    Args:
        match: an ``re.Match`` whose group 0 is the object text.

    Returns:
        The single-line form of the matched text.
    """
    # re.split with a capturing group alternates structural text (even
    # indices) with the captured string literals (odd indices).
    pieces = re.split(r'("(?:[^"\\]|\\.)*")', match.group(0))
    pieces[::2] = [_squeeze_structure(piece) for piece in pieces[::2]]
    return "".join(pieces)
# Serialise with 4-space indentation, then pass every innermost object (a
# `{...}` span containing no nested braces or brackets) through `flatten`.
output = re.sub(
    r"\{[^\[\]\{\}]+\}",
    flatten,
    json.dumps(data, indent=4, sort_keys=False),
)

# Write the formatted document to the output path.
with open(output_src, "w+") as out_file:
    out_file.write(output)