-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_splitter.py
More file actions
26 lines (24 loc) · 853 Bytes
/
data_splitter.py
File metadata and controls
26 lines (24 loc) · 853 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from nltk.tokenize import sent_tokenize, word_tokenize
# Split ./brown.txt into sentence-per-line train/valid/test files.
#
# The corpus is read whole, a few markup characters are blanked out, the text
# is sentence-tokenized with NLTK, and sentences are routed by position:
#   index <  35000          -> ./train.txt
#   35000 <= index < 45000  -> ./valid.txt
#   index >= 45000          -> ./test.txt
TRAIN_END = 35000  # first sentence index that no longer goes to train.txt
VALID_END = 45000  # first sentence index that no longer goes to valid.txt

with open("./brown.txt", "r", encoding="utf8") as f:
    data = f.read()

# str objects are immutable, so a bare `data.replace(...)` statement throws its
# result away and leaves `data` untouched. Rebind the name, and do all the
# single-character substitutions in one pass with translate().
data = data.translate(str.maketrans({ch: " " for ch in "<>@#"}))

lines = sent_tokenize(data)
count = len(lines)
print(count)

# Open every output once rather than once per sentence. Append mode is kept
# from the original script, so re-running adds to existing files instead of
# replacing them. Outputs are written as UTF-8 to match the input encoding
# instead of relying on the platform default.
with open("./train.txt", "a", encoding="utf8") as train_out, \
        open("./valid.txt", "a", encoding="utf8") as valid_out, \
        open("./test.txt", "a", encoding="utf8") as test_out:
    for index, line in enumerate(lines):
        if index < TRAIN_END:
            out = train_out
        elif index < VALID_END:
            out = valid_out
        else:
            out = test_out
        # A tokenized sentence may span several source lines; flatten it so
        # each output line holds exactly one sentence.
        out.write(line.strip().replace("\n", " ") + "\n")