File renamed without changes.
Empty file removed: nonce2vec/logging/__init__.py
nonce2vec/main.py: 39 changes (20 additions, 19 deletions)
@@ -20,15 +20,14 @@
 import nonce2vec.utils.config as cutils
 import nonce2vec.utils.files as futils

-from nonce2vec.models.nonce2vec import Nonce2Vec, Nonce2VecVocab, \
-    Nonce2VecTrainables
+from nonce2vec.models.nonce2vec import Nonce2Vec
 from nonce2vec.utils.files import Samples
 from nonce2vec.models.informativeness import Informativeness


 logging.config.dictConfig(
     cutils.load(
-        os.path.join(os.path.dirname(__file__), 'logging', 'logging.yml')))
+        os.path.join(os.path.dirname(__file__), 'logging.yml')))

 logger = logging.getLogger(__name__)

@@ -58,8 +57,6 @@ def _update_rr_and_count(relative_ranks, count, rank):
 def _load_nonce2vec_model(args, info, nonce):
     logger.info('Loading Nonce2Vec model...')
     model = Nonce2Vec.load(args.background)
-    model.vocabulary = Nonce2VecVocab.load(model.vocabulary)
-    model.trainables = Nonce2VecTrainables.load(model.trainables)
     model.sg = 1
     model.replication = args.replication
     model.sum_over_set = args.sum_over_set
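
Note: gensim 4.x folded the separate Vocabulary and Trainables helper objects into the model itself, which is why the two extra load() wrappers can simply be dropped here. A minimal sketch of the simplified load path, assuming the project's Nonce2Vec.load delegates to gensim's Word2Vec.load and 'model.bin' is a placeholder path:

    from nonce2vec.models.nonce2vec import Nonce2Vec

    model = Nonce2Vec.load('model.bin')  # vocab/trainables state now lives on the model
    model.sg = 1  # force skip-gram, as in the diff above
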
@@ -88,7 +85,7 @@ def _load_nonce2vec_model(args, info, nonce):
     if not args.sum_only:
         model.train_with = args.train_with
         model.alpha = args.alpha
-        model.iter = args.epochs
+        model.epochs = args.epochs
         model.negative = args.neg
         model.lambda_den = args.lambda_den
         model.kappa = args.kappa
@@ -97,9 +94,9 @@ def _load_nonce2vec_model(args, info, nonce):
         # precompute negative labels optimization for pure-python training
         model.neg_labels = np.zeros(model.negative + 1)
         model.neg_labels[0] = 1.
-        model.trainables.info = info
+        model.trainables_info = info
     model.workers = args.num_threads
-    model.vocabulary.nonce = nonce
+    model.current_nonce = nonce
     logger.info('Model loaded')
     return model
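
Note: in gensim 4.x the training-epochs attribute was renamed from model.iter to model.epochs; trainables_info and current_nonce are this project's own flat replacements for the removed model.trainables / model.vocabulary containers. A self-contained sketch of the rename on a plain Word2Vec model (toy corpus, illustrative values):

    import gensim

    sentences = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
    model = gensim.models.Word2Vec(sentences, min_count=1, vector_size=10)
    model.epochs = 5  # gensim 4.x name; was model.iter before 4.0
    model.train(sentences, total_examples=model.corpus_count,
                epochs=model.epochs)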

@@ -118,8 +115,8 @@ def _test_on_chimeras(args):  # pylint:disable=R0914
     for sentences, nonce, probes, responses in samples:
         if num_batch == 1 or args.reload:
             model = _load_nonce2vec_model(args, info, nonce)
-        model.vocabulary.nonce = nonce
-        vocab_size = len(model.wv.vocab)
+        model.current_nonce = nonce
+        vocab_size = len(model.wv)
         logger.info('-' * 30)
         logger.info('Processing batch {}/{}'.format(num_batch,
                                                     total_num_batches))
@@ -135,14 +132,14 @@ def _test_on_chimeras(args):  # pylint:disable=R0914
         model.build_vocab(sentences, update=True)
         if not args.sum_only:
             model.train(sentences, total_examples=model.corpus_count,
-                        epochs=model.iter)
+                        epochs=model.epochs)
         num_batch += 1
         system_responses = []
         human_responses = []
         probe_count = 0
         for probe in probes:
             try:
-                cos = model.similarity(nonce, probe)
+                cos = model.wv.similarity(nonce, probe)
                 system_responses.append(cos)
                 human_responses.append(responses[probe_count])
             except:  # pylint:disable=W0702
@@ -152,7 +149,7 @@ def _test_on_chimeras(args):  # pylint:disable=R0914
         logger.info('system_responses = {}'.format(system_responses))
         logger.info('human_responses = {}'.format(human_responses))
         logger.info('10 most similar words = {}'.format(
-            model.most_similar(nonce, topn=10)))
+            model.wv.similar_by_word(nonce, topn=10)))
         rho = _spearman(human_responses, system_responses)
         logger.info('RHO = {}'.format(rho))
         if not math.isnan(rho):
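
Note: gensim 4.x moved all lookup and similarity queries onto the KeyedVectors object at model.wv, so model.similarity and model.most_similar no longer exist on the model itself. A small sketch of the calls used above (toy corpus, illustrative values):

    import gensim

    sents = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]
    model = gensim.models.Word2Vec(sents, min_count=1, vector_size=10)
    print(len(model.wv))                            # was len(model.wv.vocab)
    print(model.wv.similarity('cat', 'dog'))        # was model.similarity(...)
    print(model.wv.similar_by_word('cat', topn=2))  # was model.most_similar(...)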
@@ -231,18 +228,18 @@ def _test_on_definitions(args):  # pylint:disable=R0914
         if num_sent == 1 or args.reload:
             model = _load_nonce2vec_model(args, info, nonce)
         model.vocabulary.nonce = nonce
-        vocab_size = len(model.wv.vocab)
+        vocab_size = len(model.wv)
         logger.info('vocab size = {}'.format(vocab_size))
         logger.info('nonce: {}'.format(nonce))
         logger.info('sentence: {}'.format(sentences))
-        if nonce not in model.wv.vocab:
+        if nonce not in model.wv:
             logger.error('Nonce \'{}\' not in gensim.word2vec.model '
                          'vocabulary'.format(nonce))
             continue
         model.build_vocab(sentences, update=True)
         if not args.sum_only:
             model.train(sentences, total_examples=model.corpus_count,
-                        epochs=model.iter)
+                        epochs=model.epochs)
         nns = model.most_similar(nonce, topn=vocab_size)
         logger.info('10 most similar words: {}'.format(nns[:10]))
         rank = _get_rank(probe, nns)
@@ -299,7 +296,7 @@ def _check_men(args):
     human_actual = []
     count = 0
     for (first, second), human in Samples(source='men', shuffle=False):
-        if first not in model.wv.vocab or second not in model.wv.vocab:
+        if first not in model.wv or second not in model.wv:
             logger.error('Could not find one of more pair item in model '
                          'vocabulary: {}, {}'.format(first, second))
             continue
@@ -325,8 +322,8 @@ def _train(args):
     logger.info('Saving output w2v model to {}'.format(output_model_filepath))
     model = gensim.models.Word2Vec(
         min_count=args.min_count, alpha=args.alpha, negative=args.neg,
-        window=args.window, sample=args.sample, iter=args.epochs,
-        size=args.size, workers=args.num_threads)
+        window=args.window, sample=args.sample, epochs=args.epochs,
+        vector_size=args.size, workers=args.num_threads)
     if args.train_mode == 'cbow':
         model.sg = 0
     if args.train_mode == 'skipgram':
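
Note: the constructor keywords were renamed in gensim 4.x as well: iter became epochs and size became vector_size. A minimal sketch with illustrative values:

    import gensim

    sents = [['the', 'quick', 'brown', 'fox']]
    model = gensim.models.Word2Vec(
        sents,
        vector_size=50,  # was `size` before gensim 4.0
        epochs=5,        # was `iter` before gensim 4.0
        min_count=1, window=5, sample=1e-3, negative=5, workers=1)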
@@ -463,3 +460,7 @@ def main():
         help='shuffle the test set')
     args = parser.parse_args()
     args.func(args)
+
+
+if __name__ == '__main__':
+    main()
nonce2vec/models/informativeness.py: 26 changes (13 additions, 13 deletions)
@@ -63,7 +63,7 @@ def sum_filter(self, sum_filter):
     @lru_cache(maxsize=10)
     def _get_prob_distribution(self, context):
         words_and_probs = self._model.predict_output_word(
-            context, topn=len(self._model.wv.vocab))
+            context, topn=len(self._model.wv))
         return [item[1] for item in words_and_probs]

     @lru_cache(maxsize=10)
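
Note: topn=len(self._model.wv) replaces topn=len(self._model.wv.vocab) because KeyedVectors implements __len__ directly in gensim 4.x. A self-contained sketch of predict_output_word over the whole vocabulary (toy corpus; assumes a model trained with negative sampling, which is the default):

    import gensim

    sents = [['the', 'cat', 'sat', 'on', 'the', 'mat']] * 50
    model = gensim.models.Word2Vec(sents, min_count=1, vector_size=10,
                                   negative=5)
    words_and_probs = model.predict_output_word(['the', 'cat'],
                                                topn=len(model.wv))
    probs = [p for _, p in words_and_probs]  # probability distribution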
@@ -91,10 +91,10 @@ def _keep_item(self, idx, context, filter_type, threshold):
         if not filter_type:
             return True
         if filter_type == 'random':
-            return self._model.wv.vocab[context[idx]].sample_int \
+            return self._model.wv.get_vecattr(context[idx], "sample_int") \
                 > self._model.random.rand() * 2 ** 32
         if filter_type == 'self':
-            return np.log(self._model.wv.vocab[context[idx]].sample_int) \
+            return np.log(self._model.wv.get_vecattr(context[idx], "sample_int")) \
                 > threshold
         if filter_type == 'cwi':
             return self._get_context_word_entropy(context, idx) > threshold
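
Note: per-word bookkeeping such as sample_int used to live on Vocab objects (model.wv.vocab[word].sample_int); gensim 4.x exposes the same data through KeyedVectors.get_vecattr. A sketch of the downsampling test above (toy corpus; np.random.rand stands in for the model's internal RNG):

    import numpy as np
    import gensim

    sents = [['the', 'cat', 'sat', 'on', 'the', 'mat']] * 50
    model = gensim.models.Word2Vec(sents, min_count=1, vector_size=10)
    sample_int = model.wv.get_vecattr('cat', 'sample_int')
    keep = sample_int > np.random.rand() * 2 ** 32  # 'random' filter
    keep_self = np.log(sample_int) > 0.5            # 'self' filter, toy threshold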
@@ -112,13 +112,13 @@ def _filter_context(self, context, filter_type, threshold):
                      self._keep_item(idx, context, filter_type, threshold))

     @classmethod
-    def _get_in_vocab_context(cls, sentence, vocab, nonce):
-        return tuple([w for w in sentence if w in vocab and w != nonce])
+    def _get_in_vocab_context(cls, sentence, keyed_vectors, nonce):
+        return tuple([w for w in sentence if w in keyed_vectors and w != nonce])

-    def get_ctx_ent_for_weighted_sum(self, sentences, vocab, nonce):
+    def get_ctx_ent_for_weighted_sum(self, sentences, keyed_vectors, nonce):
         """Return context entropy."""
         ctx_ent_map = {}
-        ctx_ent = self._get_filtered_train_ctx_ent(sentences, vocab, nonce)
+        ctx_ent = self._get_filtered_train_ctx_ent(sentences, keyed_vectors, nonce)
         for ctx, cwi in ctx_ent:
             if ctx not in ctx_ent_map:
                 ctx_ent_map[ctx] = cwi
@@ -127,10 +127,10 @@ def get_ctx_ent_for_weighted_sum(self, sentences, vocab, nonce):
                     ctx_ent_map[ctx] = cwi
         return ctx_ent_map

-    def _get_filtered_train_ctx_ent(self, sentences, vocab, nonce):
+    def _get_filtered_train_ctx_ent(self, sentences, keyed_vectors, nonce):
         ctx_ent = []
         for sentence in sentences:
-            context = self._get_in_vocab_context(sentence, vocab, nonce)
+            context = self._get_in_vocab_context(sentence, keyed_vectors, nonce)
             for idx, ctx in enumerate(context):
                 if self._keep_item(idx, context, self._train_filter,
                                    self._train_thresh):
@@ -140,10 +140,10 @@ def _get_filtered_train_ctx_ent(self, sentences, vocab, nonce):
                     ctx_ent.append((ctx, cwi))
         return ctx_ent

-    def filter_and_sort_train_ctx_ent(self, sentences, vocab, nonce):
+    def filter_and_sort_train_ctx_ent(self, sentences, keyed_vectors, nonce):
         """Sort context and return a list of (ctx_word, ctx_word_entropy)."""
         logger.debug('Filtering and sorting train context...')
-        ctx_ent = self._get_filtered_train_ctx_ent(sentences, vocab, nonce)
+        ctx_ent = self._get_filtered_train_ctx_ent(sentences, keyed_vectors, nonce)
         if not self._sort_by:
             return ctx_ent
         if self._sort_by == 'desc':
@@ -152,13 +152,13 @@ def filter_and_sort_train_ctx_ent(self, sentences, vocab, nonce):
             return sorted(ctx_ent, key=lambda x: x[1])
         raise Exception('Invalid sort_by value: {}'.format(self._sort_by))

-    def filter_sum_context(self, sentences, vocab, nonce):
+    def filter_sum_context(self, sentences, keyed_vectors, nonce):
         """Filter the context to be summed over."""
         logger.debug('Filtering sum context...')
         filtered_ctx = []
         raw_ctx = []
         for sentence in sentences:
-            _ctx = self._get_in_vocab_context(sentence, vocab, nonce)
+            _ctx = self._get_in_vocab_context(sentence, keyed_vectors, nonce)
             _filtered_ctx = self._filter_context(_ctx, self._sum_filter,
                                                  self._sum_thresh)
             raw_ctx.extend(list(_ctx))
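
Note: the vocab to keyed_vectors rename throughout this file works because KeyedVectors supports membership tests directly in gensim 4.x (w in kv), so callers can pass model.wv instead of the removed dict-like .vocab attribute. A short sketch of the in-vocabulary filtering idiom used by _get_in_vocab_context (toy data; 'chimera_' is an illustrative nonce token):

    import gensim

    sents = [['red', 'green', 'blue']] * 10
    kv = gensim.models.Word2Vec(sents, min_count=1, vector_size=10).wv
    nonce = 'chimera_'
    context = tuple(w for w in ['red', 'chimera_', 'blue']
                    if w in kv and w != nonce)
    print(context)  # ('red', 'blue')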