File renamed without changes.
Empty file removed: nonce2vec/logging/__init__.py
nonce2vec/main.py: 39 changes (20 additions, 19 deletions)
@@ -20,15 +20,14 @@
 import nonce2vec.utils.config as cutils
 import nonce2vec.utils.files as futils

-from nonce2vec.models.nonce2vec import Nonce2Vec, Nonce2VecVocab, \
-    Nonce2VecTrainables
+from nonce2vec.models.nonce2vec import Nonce2Vec
 from nonce2vec.utils.files import Samples
 from nonce2vec.models.informativeness import Informativeness


 logging.config.dictConfig(
     cutils.load(
-        os.path.join(os.path.dirname(__file__), 'logging', 'logging.yml')))
+        os.path.join(os.path.dirname(__file__), 'logging.yml')))

 logger = logging.getLogger(__name__)

@@ -58,8 +57,6 @@ def _update_rr_and_count(relative_ranks, count, rank):
 def _load_nonce2vec_model(args, info, nonce):
     logger.info('Loading Nonce2Vec model...')
     model = Nonce2Vec.load(args.background)
-    model.vocabulary = Nonce2VecVocab.load(model.vocabulary)
-    model.trainables = Nonce2VecTrainables.load(model.trainables)
     model.sg = 1
     model.replication = args.replication
     model.sum_over_set = args.sum_over_set
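
Note: gensim 4.x folded the separate Vocabulary and Trainables helper objects into the model itself, which is why the two extra load() wrappers can simply be dropped here. A minimal sketch of the simplified load path, assuming the project's Nonce2Vec.load delegates to gensim's Word2Vec.load and 'model.bin' is a placeholder path:

    from nonce2vec.models.nonce2vec import Nonce2Vec

    model = Nonce2Vec.load('model.bin')  # vocab/trainables state now lives on the model
    model.sg = 1  # force skip-gram, as in the diff above
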
@@ -88,7 +85,7 @@ def _load_nonce2vec_model(args, info, nonce):
     if not args.sum_only:
         model.train_with = args.train_with
         model.alpha = args.alpha
-        model.iter = args.epochs
+        model.epochs = args.epochs
         model.negative = args.neg
         model.lambda_den = args.lambda_den
         model.kappa = args.kappa
@@ -97,9 +94,9 @@ def _load_nonce2vec_model(args, info, nonce):
         # precompute negative labels optimization for pure-python training
         model.neg_labels = np.zeros(model.negative + 1)
         model.neg_labels[0] = 1.
-        model.trainables.info = info
+        model.trainables_info = info
     model.workers = args.num_threads
-    model.vocabulary.nonce = nonce
+    model.current_nonce = nonce
     logger.info('Model loaded')
     return model
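
Note: in gensim 4.x the training-epochs attribute was renamed from model.iter to model.epochs; trainables_info and current_nonce are this project's own flat replacements for the removed model.trainables / model.vocabulary containers. A self-contained sketch of the rename on a plain Word2Vec model (toy corpus, illustrative values):

    import gensim

    sentences = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
    model = gensim.models.Word2Vec(sentences, min_count=1, vector_size=10)
    model.epochs = 5  # gensim 4.x name; was model.iter before 4.0
    model.train(sentences, total_examples=model.corpus_count,
                epochs=model.epochs)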

@@ -118,8 +115,8 @@ def _test_on_chimeras(args):  # pylint:disable=R0914
     for sentences, nonce, probes, responses in samples:
         if num_batch == 1 or args.reload:
             model = _load_nonce2vec_model(args, info, nonce)
-        model.vocabulary.nonce = nonce
-        vocab_size = len(model.wv.vocab)
+        model.current_nonce = nonce
+        vocab_size = len(model.wv)
         logger.info('-' * 30)
         logger.info('Processing batch {}/{}'.format(num_batch,
                                                     total_num_batches))
@@ -135,14 +132,14 @@ def _test_on_chimeras(args):  # pylint:disable=R0914
         model.build_vocab(sentences, update=True)
         if not args.sum_only:
             model.train(sentences, total_examples=model.corpus_count,
-                        epochs=model.iter)
+                        epochs=model.epochs)
         num_batch += 1
         system_responses = []
         human_responses = []
         probe_count = 0
         for probe in probes:
             try:
-                cos = model.similarity(nonce, probe)
+                cos = model.wv.similarity(nonce, probe)
                 system_responses.append(cos)
                 human_responses.append(responses[probe_count])
             except:  # pylint:disable=W0702
@@ -152,7 +149,7 @@ def _test_on_chimeras(args):  # pylint:disable=R0914
         logger.info('system_responses = {}'.format(system_responses))
         logger.info('human_responses = {}'.format(human_responses))
         logger.info('10 most similar words = {}'.format(
-            model.most_similar(nonce, topn=10)))
+            model.wv.similar_by_word(nonce, topn=10)))
         rho = _spearman(human_responses, system_responses)
         logger.info('RHO = {}'.format(rho))
         if not math.isnan(rho):
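
Note: gensim 4.x moved all lookup and similarity queries onto the KeyedVectors object at model.wv, so model.similarity and model.most_similar no longer exist on the model itself. A small sketch of the calls used above (toy corpus, illustrative values):

    import gensim

    sents = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]
    model = gensim.models.Word2Vec(sents, min_count=1, vector_size=10)
    print(len(model.wv))                            # was len(model.wv.vocab)
    print(model.wv.similarity('cat', 'dog'))        # was model.similarity(...)
    print(model.wv.similar_by_word('cat', topn=2))  # was model.most_similar(...)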
@@ -231,18 +228,18 @@ def _test_on_definitions(args):  # pylint:disable=R0914
         if num_sent == 1 or args.reload:
             model = _load_nonce2vec_model(args, info, nonce)
         model.vocabulary.nonce = nonce
-        vocab_size = len(model.wv.vocab)
+        vocab_size = len(model.wv)
         logger.info('vocab size = {}'.format(vocab_size))
         logger.info('nonce: {}'.format(nonce))
         logger.info('sentence: {}'.format(sentences))
-        if nonce not in model.wv.vocab:
+        if nonce not in model.wv:
             logger.error('Nonce \'{}\' not in gensim.word2vec.model '
                          'vocabulary'.format(nonce))
             continue
         model.build_vocab(sentences, update=True)
         if not args.sum_only:
             model.train(sentences, total_examples=model.corpus_count,
-                        epochs=model.iter)
+                        epochs=model.epochs)
         nns = model.most_similar(nonce, topn=vocab_size)
         logger.info('10 most similar words: {}'.format(nns[:10]))
         rank = _get_rank(probe, nns)
@@ -299,7 +296,7 @@ def _check_men(args):
     human_actual = []
     count = 0
     for (first, second), human in Samples(source='men', shuffle=False):
-        if first not in model.wv.vocab or second not in model.wv.vocab:
+        if first not in model.wv or second not in model.wv:
             logger.error('Could not find one of more pair item in model '
                          'vocabulary: {}, {}'.format(first, second))
             continue
@@ -325,8 +322,8 @@ def _train(args):
     logger.info('Saving output w2v model to {}'.format(output_model_filepath))
     model = gensim.models.Word2Vec(
         min_count=args.min_count, alpha=args.alpha, negative=args.neg,
-        window=args.window, sample=args.sample, iter=args.epochs,
-        size=args.size, workers=args.num_threads)
+        window=args.window, sample=args.sample, epochs=args.epochs,
+        vector_size=args.size, workers=args.num_threads)
     if args.train_mode == 'cbow':
         model.sg = 0
     if args.train_mode == 'skipgram':
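
Note: the constructor keywords were renamed in gensim 4.x as well: iter became epochs and size became vector_size. A minimal sketch with illustrative values:

    import gensim

    sents = [['the', 'quick', 'brown', 'fox']]
    model = gensim.models.Word2Vec(
        sents,
        vector_size=50,  # was `size` before gensim 4.0
        epochs=5,        # was `iter` before gensim 4.0
        min_count=1, window=5, sample=1e-3, negative=5, workers=1)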
@@ -463,3 +460,7 @@ def main():
         help='shuffle the test set')
     args = parser.parse_args()
     args.func(args)
+
+
+if __name__ == '__main__':
+    main()
nonce2vec/models/informativeness.py: 26 changes (13 additions, 13 deletions)
@@ -63,7 +63,7 @@ def sum_filter(self, sum_filter):
     @lru_cache(maxsize=10)
     def _get_prob_distribution(self, context):
         words_and_probs = self._model.predict_output_word(
-            context, topn=len(self._model.wv.vocab))
+            context, topn=len(self._model.wv))
         return [item[1] for item in words_and_probs]

     @lru_cache(maxsize=10)
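
Note: topn=len(self._model.wv) replaces topn=len(self._model.wv.vocab) because KeyedVectors implements __len__ directly in gensim 4.x. A self-contained sketch of predict_output_word over the whole vocabulary (toy corpus; assumes a model trained with negative sampling, which is the default):

    import gensim

    sents = [['the', 'cat', 'sat', 'on', 'the', 'mat']] * 50
    model = gensim.models.Word2Vec(sents, min_count=1, vector_size=10,
                                   negative=5)
    words_and_probs = model.predict_output_word(['the', 'cat'],
                                                topn=len(model.wv))
    probs = [p for _, p in words_and_probs]  # probability distribution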
@@ -91,10 +91,10 @@ def _keep_item(self, idx, context, filter_type, threshold):
         if not filter_type:
             return True
         if filter_type == 'random':
-            return self._model.wv.vocab[context[idx]].sample_int \
+            return self._model.wv.get_vecattr(context[idx], "sample_int") \
                 > self._model.random.rand() * 2 ** 32
         if filter_type == 'self':
-            return np.log(self._model.wv.vocab[context[idx]].sample_int) \
+            return np.log(self._model.wv.get_vecattr(context[idx], "sample_int")) \
                 > threshold
         if filter_type == 'cwi':
             return self._get_context_word_entropy(context, idx) > threshold
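
Note: per-word bookkeeping such as sample_int used to live on Vocab objects (model.wv.vocab[word].sample_int); gensim 4.x exposes the same data through KeyedVectors.get_vecattr. A sketch of the downsampling test above (toy corpus; np.random.rand stands in for the model's internal RNG):

    import numpy as np
    import gensim

    sents = [['the', 'cat', 'sat', 'on', 'the', 'mat']] * 50
    model = gensim.models.Word2Vec(sents, min_count=1, vector_size=10)
    sample_int = model.wv.get_vecattr('cat', 'sample_int')
    keep = sample_int > np.random.rand() * 2 ** 32  # 'random' filter
    keep_self = np.log(sample_int) > 0.5            # 'self' filter, toy threshold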
@@ -112,13 +112,13 @@ def _filter_context(self, context, filter_type, threshold):
                      self._keep_item(idx, context, filter_type, threshold))

     @classmethod
-    def _get_in_vocab_context(cls, sentence, vocab, nonce):
-        return tuple([w for w in sentence if w in vocab and w != nonce])
+    def _get_in_vocab_context(cls, sentence, keyed_vectors, nonce):
+        return tuple([w for w in sentence if w in keyed_vectors and w != nonce])

-    def get_ctx_ent_for_weighted_sum(self, sentences, vocab, nonce):
+    def get_ctx_ent_for_weighted_sum(self, sentences, keyed_vectors, nonce):
         """Return context entropy."""
         ctx_ent_map = {}
-        ctx_ent = self._get_filtered_train_ctx_ent(sentences, vocab, nonce)
+        ctx_ent = self._get_filtered_train_ctx_ent(sentences, keyed_vectors, nonce)
         for ctx, cwi in ctx_ent:
             if ctx not in ctx_ent_map:
                 ctx_ent_map[ctx] = cwi
@@ -127,10 +127,10 @@ def get_ctx_ent_for_weighted_sum(self, sentences, vocab, nonce):
                     ctx_ent_map[ctx] = cwi
         return ctx_ent_map

-    def _get_filtered_train_ctx_ent(self, sentences, vocab, nonce):
+    def _get_filtered_train_ctx_ent(self, sentences, keyed_vectors, nonce):
         ctx_ent = []
         for sentence in sentences:
-            context = self._get_in_vocab_context(sentence, vocab, nonce)
+            context = self._get_in_vocab_context(sentence, keyed_vectors, nonce)
             for idx, ctx in enumerate(context):
                 if self._keep_item(idx, context, self._train_filter,
                                    self._train_thresh):
@@ -140,10 +140,10 @@ def _get_filtered_train_ctx_ent(self, sentences, vocab, nonce):
                     ctx_ent.append((ctx, cwi))
         return ctx_ent

-    def filter_and_sort_train_ctx_ent(self, sentences, vocab, nonce):
+    def filter_and_sort_train_ctx_ent(self, sentences, keyed_vectors, nonce):
         """Sort context and return a list of (ctx_word, ctx_word_entropy)."""
         logger.debug('Filtering and sorting train context...')
-        ctx_ent = self._get_filtered_train_ctx_ent(sentences, vocab, nonce)
+        ctx_ent = self._get_filtered_train_ctx_ent(sentences, keyed_vectors, nonce)
         if not self._sort_by:
             return ctx_ent
         if self._sort_by == 'desc':
@@ -152,13 +152,13 @@ def filter_and_sort_train_ctx_ent(self, sentences, vocab, nonce):
             return sorted(ctx_ent, key=lambda x: x[1])
         raise Exception('Invalid sort_by value: {}'.format(self._sort_by))

-    def filter_sum_context(self, sentences, vocab, nonce):
+    def filter_sum_context(self, sentences, keyed_vectors, nonce):
         """Filter the context to be summed over."""
         logger.debug('Filtering sum context...')
         filtered_ctx = []
         raw_ctx = []
         for sentence in sentences:
-            _ctx = self._get_in_vocab_context(sentence, vocab, nonce)
+            _ctx = self._get_in_vocab_context(sentence, keyed_vectors, nonce)
             _filtered_ctx = self._filter_context(_ctx, self._sum_filter,
                                                  self._sum_thresh)
             raw_ctx.extend(list(_ctx))
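
Note: the vocab to keyed_vectors rename throughout this file works because KeyedVectors supports membership tests directly in gensim 4.x (w in kv), so callers can pass model.wv instead of the removed dict-like .vocab attribute. A short sketch of the in-vocabulary filtering idiom used by _get_in_vocab_context (toy data; 'chimera_' is an illustrative nonce token):

    import gensim

    sents = [['red', 'green', 'blue']] * 10
    kv = gensim.models.Word2Vec(sents, min_count=1, vector_size=10).wv
    nonce = 'chimera_'
    context = tuple(w for w in ['red', 'chimera_', 'blue']
                    if w in kv and w != nonce)
    print(context)  # ('red', 'blue')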