-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifiers.py
More file actions
70 lines (60 loc) · 2.43 KB
/
classifiers.py
File metadata and controls
70 lines (60 loc) · 2.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 04 20:15:48 2016
@author: Fianna
"""
import os
import random
import collections
import nltk
from itertools import combinations
class Language:
    """Tagged sentences for one language, split into training and testing sets.

    The corpus is read from ``ICNALE_editted/<tag>.txt``.  Each line of the
    file is one sentence, tokens are separated by single spaces, and every
    token is split on ``'|'`` into a tuple (consumers in this file unpack
    those tuples as ``(word, pos_tag)``).
    """

    # Slice boundaries applied to the shuffled sentence list: sentences
    # [0, TRAIN_END) are used for training and [TRAIN_END, TEST_END) for
    # testing.  Anything past TEST_END is left unused.
    TRAIN_END = 2181
    TEST_END = 2727

    def __init__(self, tag):
        """Load the corpus for *tag* and store the train/test split.

        Args:
            tag: Corpus file name without the ``.txt`` extension; also used
                as the class label for this language's sentences.
        """
        self.tag = tag
        self.training_sents, self.testing_sents = self.get_texts()

    def get_texts(self):
        """Read, shuffle and split this language's sentences.

        Returns:
            A ``(training, testing)`` pair of lists of sentences, where each
            sentence is a list of tuples obtained by splitting a token on
            ``'|'``.

        Raises:
            IOError/OSError: if the corpus file cannot be opened.
        """
        path = os.path.join("ICNALE_editted", self.tag + ".txt")
        # Use a context manager so the file handle is closed immediately
        # rather than whenever the garbage collector gets to it.
        with open(path) as corpus:
            lines = corpus.read().split('\n')

        # Blank lines (typically the one after a trailing newline) and empty
        # tokens (from repeated spaces) would otherwise become 1-tuples like
        # ('',), which cannot be unpacked as (word, tag) downstream.
        tagged_sents = [
            [tuple(token.split('|')) for token in line.split(' ') if token]
            for line in lines
            if line.strip()
        ]

        # The split is random: every Language instance gets its own shuffle.
        random.shuffle(tagged_sents)
        training = tagged_sents[:self.TRAIN_END]
        testing = tagged_sents[self.TRAIN_END:self.TEST_END]
        return training, testing
class LangClassifier(Language):
    """Naive Bayes classifier that tells the sentences of two languages apart.

    Features are the presence of POS-tag n-grams and/or word n-grams
    (n = 1..4) in a sentence.  The classifier is trained in the constructor
    on the training sentences of both languages.

    NOTE: the base-class initializer is not invoked; the two corpora are
    held as ``self.lang1`` and ``self.lang2`` instead.
    """

    def __init__(self, lang1, lang2, pos = True, words = True):
        """Load both corpora and train the classifier.

        Args:
            lang1: Tag of the first language (passed to ``Language``).
            lang2: Tag of the second language (passed to ``Language``).
            pos: Whether POS-tag n-grams are used as features.
            words: Whether word n-grams are used as features.
        """
        self.lang1 = Language(lang1)
        self.lang2 = Language(lang2)
        self.feat_pos = pos
        self.feat_words = words
        self.classifier = self.get_classifier()

    def features(self, sent):
        """Build the feature dictionary for one sentence.

        Args:
            sent: A sequence of ``(word, pos_tag)`` pairs.

        Returns:
            A ``collections.defaultdict`` mapping feature names of the form
            ``'n=<n>_pos=<gram>'`` / ``'n=<n>_words=<gram>'`` to ``True`` for
            every n-gram (n = 1..4) found in the sentence.
        """
        feature_dict = collections.defaultdict(float)

        # The tag and word sequences do not depend on n, so build each one
        # once (and only if that feature family is enabled).
        pos_tags = [tag for _word, tag in sent] if self.feat_pos else None
        word_forms = [word for word, _tag in sent] if self.feat_words else None

        for n in range(1, 5):
            if pos_tags is not None:
                for gram in nltk.ngrams(pos_tags, n):
                    feature_dict['n=%d_pos=%s' % (n, gram)] = True
            if word_forms is not None:
                for gram in nltk.ngrams(word_forms, n):
                    feature_dict['n=%d_words=%s' % (n, gram)] = True
        return feature_dict

    def _labelled(self, lang, sents):
        """Pair the features of each sentence in *sents* with ``lang.tag``."""
        return [(self.features(sent), lang.tag) for sent in sents]

    def get_classifier(self):
        """Train and return an NLTK Naive Bayes classifier.

        The training set is the labelled training sentences of the first
        language followed by those of the second.
        """
        training_set = (
            self._labelled(self.lang1, self.lang1.training_sents)
            + self._labelled(self.lang2, self.lang2.training_sents)
        )
        return nltk.classify.NaiveBayesClassifier.train(training_set)

    def test(self):
        """Return the classifier's accuracy on both languages' testing sentences.

        The value is whatever ``nltk.classify.accuracy`` reports for the
        labelled testing sentences of the first language followed by those
        of the second.
        """
        testing_set = (
            self._labelled(self.lang1, self.lang1.testing_sents)
            + self._labelled(self.lang2, self.lang2.testing_sents)
        )
        return nltk.classify.accuracy(self.classifier, testing_set)
# Language tags are the corpus file names without their '.txt' extension.
# Only '.txt' entries are considered so stray directory entries do not become
# tags, and the list is sorted because os.listdir returns names in arbitrary
# order -- sorting keeps the pair order and the result keys stable.
tag_set = sorted(
    os.path.splitext(fname)[0]
    for fname in os.listdir('ICNALE_editted')
    if fname.endswith('.txt')
)

# Every unordered pair of languages is evaluated once.
tag_pairs = list(combinations(tag_set, 2))

# results maps '<l1>_<l2>' to the accuracy obtained with each feature set.
# Each LangClassifier call reloads and reshuffles both corpora, so the three
# accuracies for a pair come from different random train/test splits.
results = {}
for (l1, l2) in tag_pairs:
    results['%s_%s' % (l1, l2)] = {
        'words_and_POS': LangClassifier(l1, l2).test(),
        'words': LangClassifier(l1, l2, pos=False).test(),
        'POS': LangClassifier(l1, l2, words=False).test(),
    }