From 0ccbe464470496d92859c454ff966195ee4eb003 Mon Sep 17 00:00:00 2001 From: Boris Shminke Date: Sun, 29 Oct 2017 14:40:46 +0300 Subject: [PATCH 01/17] change izip to zip --- mrec/__init__.py | 5 ++--- mrec/item_similarity/recommender.py | 5 ++--- mrec/mf/model/warp.py | 3 +-- mrec/mf/recommender.py | 1 - 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/mrec/__init__.py b/mrec/__init__.py index dece6e3..5e92662 100644 --- a/mrec/__init__.py +++ b/mrec/__init__.py @@ -1,4 +1,3 @@ -from itertools import izip import numpy as np from scipy.sparse import coo_matrix, csr_matrix from scipy.io import mmread, mmwrite @@ -89,12 +88,12 @@ def save_sparse_matrix(data,fmt,filepath): if fmt == 'tsv': m = data.tocoo() with open(filepath,'w') as out: - for u,i,v in izip(m.row,m.col,m.data): + for u,i,v in zip(m.row,m.col,m.data): print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,v) elif fmt == 'csv': m = data.tocoo() with open(filepath,'w') as out: - for u,i,v in izip(m.row,m.col,m.data): + for u,i,v in zip(m.row,m.col,m.data): print >>out,'{0},{1},{2}'.format(u+1,i+1,v) elif fmt == 'mm': mmwrite(filepath,data) diff --git a/mrec/item_similarity/recommender.py b/mrec/item_similarity/recommender.py index 4199b5d..1328118 100644 --- a/mrec/item_similarity/recommender.py +++ b/mrec/item_similarity/recommender.py @@ -7,7 +7,6 @@ except ImportError: import pickle import numpy as np -from itertools import izip from operator import itemgetter from scipy.sparse import csr_matrix, coo_matrix @@ -307,9 +306,9 @@ def _get_recommendations_from_predictions(self,r,dataset,user_start,user_end,max print ux,'..', ru = r[ux,:] if return_scores: - recs[ux] = [(i,v) for v,i in sorted(izip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] + recs[ux] = [(i,v) for v,i in sorted(zip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] else: - recs[ux] = [i for v,i in sorted(izip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] + recs[ux] = [i for v,i in sorted(zip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] if show_progress: print return recs diff --git a/mrec/mf/model/warp.py b/mrec/mf/model/warp.py index 0465343..2189bb4 100644 --- a/mrec/mf/model/warp.py +++ b/mrec/mf/model/warp.py @@ -1,6 +1,5 @@ import numpy as np import random -from itertools import izip from mrec.evaluation import metrics @@ -280,7 +279,7 @@ def estimate_precision(self,decomposition,train,validation,k=30): r = decomposition.reconstruct(rows) prec = 0 - for u,ru in izip(rows,r): + for u,ru in zip(rows,r): predicted = ru.argsort()[::-1][:k] if have_validation_set: actual = validation[u] diff --git a/mrec/mf/recommender.py b/mrec/mf/recommender.py index f7e422c..bf40ac2 100644 --- a/mrec/mf/recommender.py +++ b/mrec/mf/recommender.py @@ -8,7 +8,6 @@ except ImportError: import pickle import numpy as np -from itertools import izip from scipy.sparse import csr_matrix from mrec.base_recommender import BaseRecommender From 1958a3894daa071667e8f61758261ba82dc6521d Mon Sep 17 00:00:00 2001 From: Boris Shminke Date: Sun, 29 Oct 2017 14:47:03 +0300 Subject: [PATCH 02/17] add parentheses to print --- doc/conf.py | 8 ++++---- mrec/__init__.py | 4 ++-- mrec/base_recommender.py | 4 ++-- mrec/evaluation/metrics.py | 8 ++++---- mrec/examples/convert.py | 4 ++-- mrec/examples/prepare.py | 2 +- mrec/examples/tune_slim.py | 10 +++++----- mrec/item_similarity/knn.py | 20 ++++++++++---------- mrec/item_similarity/recommender.py | 4 ++-- mrec/item_similarity/slim.py | 20 ++++++++++---------- mrec/mf/climf.py | 8 ++++---- mrec/mf/evaluate.py | 2 +- mrec/mf/model/warp.py | 6 +++--- mrec/mf/recommender.py | 4 ++-- mrec/mf/warp.py | 6 +++--- mrec/mf/wrmf.py | 2 +- mrec/parallel/item_similarity.py | 2 +- mrec/parallel/predict.py | 2 +- mrec/tests/test_sparse.py | 2 +- 19 files changed, 59 insertions(+), 59 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index e2b3948..9c2fe47 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -51,10 +51,10 @@ try: release = pkg_resources.get_distribution('mrec').version except pkg_resources.DistributionNotFound: - print 'To build the documentation, The distribution information of mrec' - print 'has to be available. Either install the package into your' - print 'development environment or run "python setup.py develop" to setup' - print 'the metadata.' + print('To build the documentation, The distribution information of mrec') + print('has to be available. Either install the package into your') + print('development environment or run "python setup.py develop" to setup') + print('the metadata.') sys.exit(1) del pkg_resources version = '.'.join(release.split('.')[:2]) diff --git a/mrec/__init__.py b/mrec/__init__.py index 5e92662..ec961ee 100644 --- a/mrec/__init__.py +++ b/mrec/__init__.py @@ -89,12 +89,12 @@ def save_sparse_matrix(data,fmt,filepath): m = data.tocoo() with open(filepath,'w') as out: for u,i,v in zip(m.row,m.col,m.data): - print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,v) + print('{0}\t{1}\t{2}'.format(u+1,i+1,v), file=out) elif fmt == 'csv': m = data.tocoo() with open(filepath,'w') as out: for u,i,v in zip(m.row,m.col,m.data): - print >>out,'{0},{1},{2}'.format(u+1,i+1,v) + print('{0},{1},{2}'.format(u+1,i+1,v), file=out) elif fmt == 'mm': mmwrite(filepath,data) elif fmt == 'npz': diff --git a/mrec/base_recommender.py b/mrec/base_recommender.py index ef5333f..34c92de 100644 --- a/mrec/base_recommender.py +++ b/mrec/base_recommender.py @@ -192,10 +192,10 @@ def batch_recommend_items(self, recs = [] for u in xrange(self.num_users): if show_progress and u%1000 == 0: - print u,'..', + print(u,'..',) recs.append(self.recommend_items(dataset,u,max_items,return_scores)) if show_progress: - print + print() return recs def range_recommend_items(self, diff --git a/mrec/evaluation/metrics.py b/mrec/evaluation/metrics.py index ec5a787..ad2b928 100644 --- a/mrec/evaluation/metrics.py +++ b/mrec/evaluation/metrics.py @@ -63,7 +63,7 @@ def run_evaluation(models,retrain,get_split,num_runs,evaluation_func): retrain(model,train) run_metrics = evaluation_func(model,train,users,test) for m,val in run_metrics.iteritems(): - print m,val + print(m,val) metrics[i][m].append(val) return metrics @@ -99,15 +99,15 @@ def print_report(models,metrics): Call this to print out the metrics returned by run_evaluation(). """ for model,results in zip(models,metrics): - print model + print(model) if hasattr(model,'similarity_matrix'): nnz = model.similarity_matrix.nnz num_items = model.similarity_matrix.shape[0] density = float(model.similarity_matrix.nnz)/num_items**2 - print 'similarity matrix nnz = {0} (density {1:.3f})'.format(nnz,density) + print('similarity matrix nnz = {0} (density {1:.3f})'.format(nnz,density)) for m in sort_metrics_by_name(results.keys()): vals = results[m] - print '{0}{1:.4f} +/- {2:.4f}'.format(m.ljust(15),np.mean(vals),stats.sem(vals,ddof=0)) + print('{0}{1:.4f} +/- {2:.4f}'.format(m.ljust(15),np.mean(vals),stats.sem(vals,ddof=0))) def evaluate(model,train,users,get_known_items,compute_metrics): avg_metrics = defaultdict(float) diff --git a/mrec/examples/convert.py b/mrec/examples/convert.py index 53442f5..b1ba950 100644 --- a/mrec/examples/convert.py +++ b/mrec/examples/convert.py @@ -18,8 +18,8 @@ def tsv2mtx(infile,outfile): nnz += 1 headerfile = outfile+'.header' with open(headerfile,'w') as header: - print >>header,'%%MatrixMarket matrix coordinate real general' - print >>header,'{0} {1} {2}'.format(num_users,num_items,nnz) + print('%%MatrixMarket matrix coordinate real general', file=header) + print('{0} {1} {2}'.format(num_users,num_items,nnz), file=header) subprocess.check_call(['cat',headerfile,infile],stdout=open(outfile,'w')) subprocess.check_call(['rm',headerfile]) diff --git a/mrec/examples/prepare.py b/mrec/examples/prepare.py index a7fe6e0..752b42a 100644 --- a/mrec/examples/prepare.py +++ b/mrec/examples/prepare.py @@ -8,7 +8,7 @@ def __init__(self,splitter,parser,min_items_per_user,preprocess=None): def output(self,user,vals,outfile): for v,c in vals: - print >>outfile,'{0}\t{1}\t{2}'.format(user,v,c) + print('{0}\t{1}\t{2}'.format(user,v,c), file=outfile) def handle(self,user,vals): if len(vals) >= self.min_items_per_user: diff --git a/mrec/examples/tune_slim.py b/mrec/examples/tune_slim.py index 45a9762..98cbdad 100644 --- a/mrec/examples/tune_slim.py +++ b/mrec/examples/tune_slim.py @@ -91,12 +91,12 @@ def main(): if candidates: best = min(candidates,key=itemgetter(1)) - print 'best parameter setting: {0}'.format(best[0]) - print 'mean # positive similarity weights per item = {0:.3}'.format(best[1]) - print 'proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims,best[2]) - print 'mean # negative similarity weights per item = {0:.3}'.format(best[3]) + print('best parameter setting: {0}'.format(best[0])) + print('mean # positive similarity weights per item = {0:.3}'.format(best[1])) + print('proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims,best[2])) + print('mean # negative similarity weights per item = {0:.3}'.format(best[3])) else: - print 'no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse' + print('no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse') if __name__ == '__main__': main() diff --git a/mrec/item_similarity/knn.py b/mrec/item_similarity/knn.py index 542dbda..8a97ad4 100644 --- a/mrec/item_similarity/knn.py +++ b/mrec/item_similarity/knn.py @@ -84,7 +84,7 @@ def __str__(self): random.seed(0) - print 'loading test data...' + print('loading test data...') data = """\ %%MatrixMarket matrix coordinate real general 3 5 9 @@ -98,7 +98,7 @@ def __str__(self): 3 3 1 3 4 1 """ - print data + print(data) dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data)) num_users,num_items = dataset.shape @@ -108,32 +108,32 @@ def __str__(self): def output(i,j,val): # convert back to 1-indexed - print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val) + print('{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val)) - print 'computing some item similarities...' - print 'item\tsim\tweight' + print('computing some item similarities...') + print('item\tsim\tweight') # if we want we can compute these individually without calling fit() for i in random.sample(xrange(num_items),num_samples): for j,weight in model.get_similar_items(i,max_similar_items=2,dataset=dataset): output(i,j,weight) - print 'learning entire similarity matrix...' + print('learning entire similarity matrix...') # more usually we just call train() on the entire dataset model = CosineKNNRecommender(k=2) model.fit(dataset) - print 'making some recommendations...' - print 'user\trec\tscore' + print('making some recommendations...') + print('user\trec\tscore') for u in random.sample(xrange(num_users),num_samples): for i,score in model.recommend_items(dataset.X,u,max_items=10): output(u,i,score) - print 'making batch recommendations...' + print('making batch recommendations...') recs = model.batch_recommend_items(dataset.X) for u in xrange(num_users): for i,score in recs[u]: output(u,i,score) - print 'making range recommendations...' + print('making range recommendations...') for start,end in [(0,2),(2,3)]: recs = model.range_recommend_items(dataset.X,start,end) for u in xrange(start,end): diff --git a/mrec/item_similarity/recommender.py b/mrec/item_similarity/recommender.py index 1328118..94b2c34 100644 --- a/mrec/item_similarity/recommender.py +++ b/mrec/item_similarity/recommender.py @@ -303,12 +303,12 @@ def _get_recommendations_from_predictions(self,r,dataset,user_start,user_end,max for u in xrange(user_start,user_end): ux = u - user_start if show_progress and ux%1000 == 0: - print ux,'..', + print(ux,'..',) ru = r[ux,:] if return_scores: recs[ux] = [(i,v) for v,i in sorted(zip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] else: recs[ux] = [i for v,i in sorted(zip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] if show_progress: - print + print() return recs diff --git a/mrec/item_similarity/slim.py b/mrec/item_similarity/slim.py index 2cf698a..c6781bd 100644 --- a/mrec/item_similarity/slim.py +++ b/mrec/item_similarity/slim.py @@ -126,7 +126,7 @@ def __str__(self): random.seed(0) - print 'loading test data...' + print('loading test data...') data = """\ %%MatrixMarket matrix coordinate real general 3 5 9 @@ -140,7 +140,7 @@ def __str__(self): 3 3 1 3 4 1 """ - print data + print(data) dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data)) num_users,num_items = dataset.shape @@ -150,32 +150,32 @@ def __str__(self): def output(i,j,val): # convert back to 1-indexed - print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val) + print('{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val)) - print 'computing some item similarities...' - print 'item\tsim\tweight' + print('computing some item similarities...') + print('item\tsim\tweight') # if we want we can compute these individually without calling fit() for i in random.sample(xrange(num_items),num_samples): for j,weight in model.get_similar_items(i,max_similar_items=10,dataset=dataset): output(i,j,weight) - print 'learning entire similarity matrix...' + print('learning entire similarity matrix...') # usually we'll call train() on the entire dataset model = SLIM() model.fit(dataset) - print 'making some recommendations...' - print 'user\trec\tscore' + print('making some recommendations...') + print('user\trec\tscore') for u in random.sample(xrange(num_users),num_samples): for i,score in model.recommend_items(dataset.X,u,max_items=10): output(u,i,score) - print 'making batch recommendations...' + print('making batch recommendations...') recs = model.batch_recommend_items(dataset.X) for u in xrange(num_users): for i,score in recs[u]: output(u,i,score) - print 'making range recommendations...' + print('making range recommendations...') for start,end in [(0,2),(2,3)]: recs = model.range_recommend_items(dataset.X,start,end) for u in xrange(start,end): diff --git a/mrec/mf/climf.py b/mrec/mf/climf.py index 61ba395..c5935d3 100644 --- a/mrec/mf/climf.py +++ b/mrec/mf/climf.py @@ -42,8 +42,8 @@ def fit(self,data): # TODO: create a validation set for iter in xrange(self.max_iters): - print 'iteration {0}:'.format(iter+1) - print 'objective = {0:.4f}'.format(self.objective(data)) + print('iteration {0}:'.format(iter+1)) + print('objective = {0:.4f}'.format(self.objective(data))) self.update(data) # TODO: compute MRR on validation set, terminate if appropriate @@ -137,8 +137,8 @@ def compute_mrr(self,data,test_users=None): found = True break if not found: - print 'fail, no relevant items predicted for test user {0}'.format(i+1) - print 'known items: {0}'.format(items) + print('fail, no relevant items predicted for test user {0}'.format(i+1)) + print('known items: {0}'.format(items)) assert(len(mrr) == len(test_users)) return np.mean(mrr) diff --git a/mrec/mf/evaluate.py b/mrec/mf/evaluate.py index 02c0794..d1fe163 100644 --- a/mrec/mf/evaluate.py +++ b/mrec/mf/evaluate.py @@ -22,7 +22,7 @@ def retrain_recommender(model,dataset): parser.print_help() raise SystemExit - print 'doing a grid search for regularization parameters...' + print('doing a grid search for regularization parameters...') params = {'d':[100],'gamma':[0.01],'C':[100],'max_iter':[100000],'validation_iters':[500]} models = [WARPMFRecommender(**a) for a in ParameterGrid(params)] diff --git a/mrec/mf/model/warp.py b/mrec/mf/model/warp.py index 2189bb4..e5ebb44 100644 --- a/mrec/mf/model/warp.py +++ b/mrec/mf/model/warp.py @@ -191,13 +191,13 @@ def _fit(self,decomposition,updates,train,validation): tot_trials = 0 for it in xrange(self.max_iters): if it % self.validation_iters == 0: - print 'tot_trials',tot_trials + print('tot_trials',tot_trials) tot_trials = 0 prec = self.estimate_precision(decomposition,train,validation) precs.append(prec) - print '{0}: validation precision = {1:.3f}'.format(it,precs[-1]) + print('{0}: validation precision = {1:.3f}'.format(it,precs[-1])) if len(precs) > 3 and precs[-1] < precs[-2] and precs[-2] < precs[-3]: - print 'validation precision got worse twice, terminating' + print('validation precision got worse twice, terminating') break tot_trials += self.compute_updates(train,decomposition,updates) decomposition.apply_updates(updates,self.gamma,self.C) diff --git a/mrec/mf/recommender.py b/mrec/mf/recommender.py index bf40ac2..06947be 100644 --- a/mrec/mf/recommender.py +++ b/mrec/mf/recommender.py @@ -251,12 +251,12 @@ def _get_recommendations_from_predictions(self, for u in xrange(user_start,user_end): ux = u - user_start if show_progress and ux%1000 == 0: - print ux,'..', + print(ux,'..',) ru = r[ux] if return_scores: recs[ux] = [(i,ru[i]) for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] else: recs[ux] = [i for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] if show_progress: - print + print() return recs diff --git a/mrec/mf/warp.py b/mrec/mf/warp.py index 94b0346..5340f47 100644 --- a/mrec/mf/warp.py +++ b/mrec/mf/warp.py @@ -81,9 +81,9 @@ def create_validation_set(self,train): # and reasonable number of validation cycles max_iters = 30*validation_iters - print num_validation_users,'validation users' - print validation_iters,'validation iters' - print max_iters,'max_iters' + print(num_validation_users,'validation users') + print(validation_iters,'validation iters') + print(max_iters,'max_iters') validation = dict() for u in xrange(num_validation_users): diff --git a/mrec/mf/wrmf.py b/mrec/mf/wrmf.py index 725b05c..5ae54bb 100644 --- a/mrec/mf/wrmf.py +++ b/mrec/mf/wrmf.py @@ -62,7 +62,7 @@ def fit(self,train,item_features=None): self.U = self.init_factors(num_users,False) # don't need values, will compute them self.V = self.init_factors(num_items) for it in xrange(self.num_iters): - print 'iteration',it + print('iteration',it) # fit user factors VV = self.V.T.dot(self.V) for u in xrange(num_users): diff --git a/mrec/parallel/item_similarity.py b/mrec/parallel/item_similarity.py index 239912e..da1eed5 100644 --- a/mrec/parallel/item_similarity.py +++ b/mrec/parallel/item_similarity.py @@ -110,7 +110,7 @@ def process(task): for j in xrange(start,end): w = model.get_similar_items(j,max_similar_items=max_similar_items,dataset=dataset) for k,v in w: - print >>out,'{0}\t{1}\t{2}'.format(j+1,k+1,v) # write as 1-indexed + print('{0}\t{1}\t{2}'.format(j+1,k+1,v), file=out) # write as 1-indexed out.close() # record success diff --git a/mrec/parallel/predict.py b/mrec/parallel/predict.py index e9d5b40..4aa6569 100644 --- a/mrec/parallel/predict.py +++ b/mrec/parallel/predict.py @@ -35,7 +35,7 @@ def run(task): recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True) for u,items in zip(xrange(start,end),recs): for i,w in items: - print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w) # write as 1-indexed + print('{0}\t{1}\t{2}'.format(u+1,i+1,w), file=out) # write as 1-indexed out.close() # record success diff --git a/mrec/tests/test_sparse.py b/mrec/tests/test_sparse.py index 1b5f931..f1179da 100644 --- a/mrec/tests/test_sparse.py +++ b/mrec/tests/test_sparse.py @@ -16,7 +16,7 @@ def test_loadtxt(): f,path = tempfile.mkstemp(suffix='.npz') with open(path,'w') as f: for i,j,v in zip(X.row,X.col,X.data): - print >>f,'{0}\t{1}\t{2}'.format(i+1,j+1,v) + print('{0}\t{1}\t{2}'.format(i+1,j+1,v), file=f) Y = loadtxt(path) os.remove(path) assert_sparse_matrix_equal(X,Y) From de7bb1d45d405bb62a903f40759647a52fd279a4 Mon Sep 17 00:00:00 2001 From: Boris Shminke Date: Sun, 29 Oct 2017 14:54:25 +0300 Subject: [PATCH 03/17] change xrange to range --- mrec/base_recommender.py | 6 +++--- mrec/evaluation/metrics.py | 2 +- mrec/evaluation/tests/test_metrics.py | 8 ++++---- mrec/examples/predict.py | 2 +- mrec/examples/prepare.py | 2 +- mrec/examples/tune_slim.py | 4 ++-- mrec/item_similarity/knn.py | 8 ++++---- mrec/item_similarity/recommender.py | 6 +++--- mrec/item_similarity/slim.py | 8 ++++---- mrec/mf/climf.py | 6 +++--- mrec/mf/model/warp.py | 6 +++--- mrec/mf/model/warp_fast.pyx | 12 ++++++------ mrec/mf/recommender.py | 6 +++--- mrec/mf/warp.py | 2 +- mrec/mf/wrmf.py | 6 +++--- mrec/parallel/item_similarity.py | 4 ++-- mrec/parallel/predict.py | 2 +- mrec/parallel/warp.py | 2 +- mrec/parallel/wrmf.py | 6 +++--- mrec/popularity.py | 8 ++++---- mrec/sparse.py | 4 ++-- mrec/testing.py | 2 +- mrec/tests/test_base_recommender.py | 4 ++-- mrec/tests/test_sparse.py | 6 +++--- 24 files changed, 61 insertions(+), 61 deletions(-) diff --git a/mrec/base_recommender.py b/mrec/base_recommender.py index 34c92de..b6ff0b7 100644 --- a/mrec/base_recommender.py +++ b/mrec/base_recommender.py @@ -190,7 +190,7 @@ def batch_recommend_items(self, this for most recommenders. """ recs = [] - for u in xrange(self.num_users): + for u in range(self.num_users): if show_progress and u%1000 == 0: print(u,'..',) recs.append(self.recommend_items(dataset,u,max_items,return_scores)) @@ -234,7 +234,7 @@ def range_recommend_items(self, This provides a default implementation, you will be able to optimize this for most recommenders. """ - return [self.recommend_items(dataset,u,max_items,return_scores) for u in xrange(user_start,user_end)] + return [self.recommend_items(dataset,u,max_items,return_scores) for u in range(user_start,user_end)] def _zero_known_item_scores(self,r,train): """ @@ -264,7 +264,7 @@ def _zero_known_item_scores(self,r,train): # - we can't just use row,col = train.nonzero() as this eliminates # u,i for which train[u,i] has been explicitly set to zero row = np.zeros(col.shape) - for u in xrange(train.shape[0]): + for u in range(train.shape[0]): start,end = train.indptr[u],train.indptr[u+1] if end > start: row[start:end] = u diff --git a/mrec/evaluation/metrics.py b/mrec/evaluation/metrics.py index ad2b928..80f951b 100644 --- a/mrec/evaluation/metrics.py +++ b/mrec/evaluation/metrics.py @@ -57,7 +57,7 @@ def run_evaluation(models,retrain,get_split,num_runs,evaluation_func): A number of suitable functions are already available in the module. """ metrics = [defaultdict(list) for m in models] - for _ in xrange(num_runs): + for _ in range(num_runs): train,users,test = get_split() for i,model in enumerate(models): retrain(model,train) diff --git a/mrec/evaluation/tests/test_metrics.py b/mrec/evaluation/tests/test_metrics.py index d0b9bab..05e21ba 100644 --- a/mrec/evaluation/tests/test_metrics.py +++ b/mrec/evaluation/tests/test_metrics.py @@ -12,7 +12,7 @@ def test_prec(): true = [2,8,6,4] predicted = [6,5,8,7] expected = [1,0.5,2./3.,0.5] - for k in xrange(1,5): + for k in range(1,5): assert_equal(metrics.prec([],true,k),0) assert_equal(metrics.prec(true,true,k),1) assert_equal(metrics.prec(predicted,true,k),expected[k-1]) @@ -24,17 +24,17 @@ def test_prec(): def test_hit_rate(): predicted = [6,5,8,7] for true in [[],[2,8]]: - for k in xrange(1,5): + for k in range(1,5): with assert_raises(ValueError): metrics.hit_rate(predicted,true,k) true = [5] expected = [0,1,1,1] - for k in xrange(1,5): + for k in range(1,5): assert_equal(metrics.hit_rate(predicted,true,k),expected[k-1]) def test_rr(): true = [2,8,6,4] predicted = [5,7,6,8] expected = [0,0,1./3.,1./3.] - for k in xrange(1,5): + for k in range(1,5): assert_equal(metrics.rr(predicted[:k],true),expected[k-1]) diff --git a/mrec/examples/predict.py b/mrec/examples/predict.py index 6b8ab1d..ad6c760 100644 --- a/mrec/examples/predict.py +++ b/mrec/examples/predict.py @@ -107,7 +107,7 @@ def create_tasks(modelfile, evaluator): users_per_task,num_users = estimate_users_per_task(mb_per_task,input_format,trainfile,modelfile) tasks = [] - for start in xrange(0,num_users,users_per_task): + for start in range(0,num_users,users_per_task): end = min(num_users,start+users_per_task) generate = (start,end) not in done tasks.append((modelfile,input_format,trainfile,test_input_format,testfile,item_feature_format,featurefile,outdir,start,end,evaluator,generate)) diff --git a/mrec/examples/prepare.py b/mrec/examples/prepare.py index 752b42a..3c94ab2 100644 --- a/mrec/examples/prepare.py +++ b/mrec/examples/prepare.py @@ -81,7 +81,7 @@ def main(): sample_before_thresholding=opts.sample_before_thresholding) processor = Processor(splitter,parser,opts.min_items_per_user) - for i in xrange(opts.num_splits): + for i in range(opts.num_splits): trainfile = get_splitfile(opts.dataset,opts.outdir,'train',i) testfile = get_splitfile(opts.dataset,opts.outdir,'test',i) diff --git a/mrec/examples/tune_slim.py b/mrec/examples/tune_slim.py index 98cbdad..4327a1a 100644 --- a/mrec/examples/tune_slim.py +++ b/mrec/examples/tune_slim.py @@ -39,7 +39,7 @@ def estimate_sparsity(task): return args,avg_nnz,too_few_sims,avg_neg def pow_range(small,big): - return [10**v for v in xrange(int(log10(small)),int(log10(big))+1)] + return [10**v for v in range(int(log10(small)),int(log10(big))+1)] def main(): parser = OptionParser() @@ -68,7 +68,7 @@ def main(): params = {'l1_reg':pow_range(opts.l1_min,opts.l1_max), 'l2_reg':pow_range(opts.l2_min,opts.l2_max)} num_items = dataset.shape[1] - sample_items = random.sample(xrange(num_items),opts.num_samples) + sample_items = random.sample(range(num_items),opts.num_samples) logging.info('preparing tasks for a grid search of these values:') logging.info(params) diff --git a/mrec/item_similarity/knn.py b/mrec/item_similarity/knn.py index 8a97ad4..cb60f3d 100644 --- a/mrec/item_similarity/knn.py +++ b/mrec/item_similarity/knn.py @@ -113,7 +113,7 @@ def output(i,j,val): print('computing some item similarities...') print('item\tsim\tweight') # if we want we can compute these individually without calling fit() - for i in random.sample(xrange(num_items),num_samples): + for i in random.sample(range(num_items),num_samples): for j,weight in model.get_similar_items(i,max_similar_items=2,dataset=dataset): output(i,j,weight) @@ -123,19 +123,19 @@ def output(i,j,val): model.fit(dataset) print('making some recommendations...') print('user\trec\tscore') - for u in random.sample(xrange(num_users),num_samples): + for u in random.sample(range(num_users),num_samples): for i,score in model.recommend_items(dataset.X,u,max_items=10): output(u,i,score) print('making batch recommendations...') recs = model.batch_recommend_items(dataset.X) - for u in xrange(num_users): + for u in range(num_users): for i,score in recs[u]: output(u,i,score) print('making range recommendations...') for start,end in [(0,2),(2,3)]: recs = model.range_recommend_items(dataset.X,start,end) - for u in xrange(start,end): + for u in range(start,end): for i,score in recs[u-start]: output(u,i,score) diff --git a/mrec/item_similarity/recommender.py b/mrec/item_similarity/recommender.py index 94b2c34..a8eb1ef 100644 --- a/mrec/item_similarity/recommender.py +++ b/mrec/item_similarity/recommender.py @@ -39,7 +39,7 @@ def fit(self,dataset,item_features=None): data = [] row = [] col = [] - for j in xrange(num_items): + for j in range(num_items): w = self.compute_similarities(dataset,j) for k,v in enumerate(w): if v != 0: @@ -299,8 +299,8 @@ def _get_recommendations_from_predictions(self,r,dataset,user_start,user_end,max else just a list of idxs. """ r = self._zero_known_item_scores(r,dataset[user_start:user_end,:]) - recs = [[] for u in xrange(user_start,user_end)] - for u in xrange(user_start,user_end): + recs = [[] for u in range(user_start,user_end)] + for u in range(user_start,user_end): ux = u - user_start if show_progress and ux%1000 == 0: print(ux,'..',) diff --git a/mrec/item_similarity/slim.py b/mrec/item_similarity/slim.py index c6781bd..b800428 100644 --- a/mrec/item_similarity/slim.py +++ b/mrec/item_similarity/slim.py @@ -155,7 +155,7 @@ def output(i,j,val): print('computing some item similarities...') print('item\tsim\tweight') # if we want we can compute these individually without calling fit() - for i in random.sample(xrange(num_items),num_samples): + for i in random.sample(range(num_items),num_samples): for j,weight in model.get_similar_items(i,max_similar_items=10,dataset=dataset): output(i,j,weight) @@ -165,19 +165,19 @@ def output(i,j,val): model.fit(dataset) print('making some recommendations...') print('user\trec\tscore') - for u in random.sample(xrange(num_users),num_samples): + for u in random.sample(range(num_users),num_samples): for i,score in model.recommend_items(dataset.X,u,max_items=10): output(u,i,score) print('making batch recommendations...') recs = model.batch_recommend_items(dataset.X) - for u in xrange(num_users): + for u in range(num_users): for i,score in recs[u]: output(u,i,score) print('making range recommendations...') for start,end in [(0,2),(2,3)]: recs = model.range_recommend_items(dataset.X,start,end) - for u in xrange(start,end): + for u in range(start,end): for i,score in recs[u-start]: output(u,i,score) diff --git a/mrec/mf/climf.py b/mrec/mf/climf.py index c5935d3..e144610 100644 --- a/mrec/mf/climf.py +++ b/mrec/mf/climf.py @@ -41,7 +41,7 @@ def fit(self,data): self.V = 0.01*np.random.random_sample((data.shape[1],self.d)) # TODO: create a validation set - for iter in xrange(self.max_iters): + for iter in range(self.max_iters): print('iteration {0}:'.format(iter+1)) print('objective = {0:.4f}'.format(self.objective(data))) self.update(data) @@ -77,7 +77,7 @@ def objective(self,data): current value of F(U,V) """ F = -0.5*self.lbda*(np.sum(self.U*self.U)+np.sum(self.V*self.V)) - for i in xrange(len(self.U)): + for i in range(len(self.U)): f = self.precompute_f(data,i) for j in f: F += log(g(f[j])) @@ -96,7 +96,7 @@ def update(self,data): lbda : regularization constant lambda gamma: learning rate """ - for i in xrange(len(self.U)): + for i in range(len(self.U)): dU = -self.lbda*self.U[i] f = self.precompute_f(data,i) for j in f: diff --git a/mrec/mf/model/warp.py b/mrec/mf/model/warp.py index e5ebb44..41dbd4e 100644 --- a/mrec/mf/model/warp.py +++ b/mrec/mf/model/warp.py @@ -189,7 +189,7 @@ def fit(self,train,validation=None): def _fit(self,decomposition,updates,train,validation): precs = [] tot_trials = 0 - for it in xrange(self.max_iters): + for it in range(self.max_iters): if it % self.validation_iters == 0: print('tot_trials',tot_trials) tot_trials = 0 @@ -210,13 +210,13 @@ def precompute_warp_loss(self,num_cols): """ assert(num_cols>1) self.warp_loss = np.ones(num_cols) - for i in xrange(1,num_cols): + for i in range(1,num_cols): self.warp_loss[i] = self.warp_loss[i-1]+1.0/(i+1) def compute_updates(self,train,decomposition,updates): updates.clear() tot_trials = 0 - for ix in xrange(self.batch_size): + for ix in range(self.batch_size): u,i,j,N,trials = self.sample(train,decomposition) tot_trials += trials L = self.estimate_warp_loss(train,u,N) diff --git a/mrec/mf/model/warp_fast.pyx b/mrec/mf/model/warp_fast.pyx index e4b8417..c65569d 100644 --- a/mrec/mf/model/warp_fast.pyx +++ b/mrec/mf/model/warp_fast.pyx @@ -121,7 +121,7 @@ cdef sample_violating_negative_example(np.ndarray[np.float_t,ndim=2] U, num_items = V.shape[0] r = U[u].dot(V[i]) - for N in xrange(1,max_trials): + for N in range(1,max_trials): # find j!=i s.t. data[u,j] < data[u,i] j = sample_negative_example(num_items,vals,indices,begin,end,ix) if r - U[u].dot(V[j]) < 1: @@ -166,7 +166,7 @@ cdef sample_negative_example(num_items, # sample item uniformly with replacement j = rand() % num_items found = 0 - for jx in xrange(begin,end): + for jx in range(begin,end): if indices[jx] == j: found = 1 break @@ -242,7 +242,7 @@ def apply_updates(np.ndarray[np.float_t,ndim=2] F, assert(rows.shape[0] == deltas.shape[0]) num = rows.shape[0] - for i in xrange(num): + for i in range(num): row = rows[i] delta = deltas[i] F[row] += gamma*delta @@ -379,7 +379,7 @@ cdef sample_violating_negative_example2(np.ndarray[np.float_t,ndim=2] U, XW = sparse_sdot(xbuf,W,X,i,is_sparse) r = U[u].dot(V[i] + XW) - for N in xrange(1,max_trials): + for N in range(1,max_trials): # find j!=i s.t. data[u,j] < data[u,i] j = sample_negative_example(num_items,vals,indices,begin,end,ix) XW = sparse_sdot(xbuf,W,X,j,is_sparse) @@ -399,10 +399,10 @@ cdef sparse_sdot(np.ndarray[np.float_t,ndim=1] xbuf, if is_sparse: # TODO: surely there's something built in to do this... - for ix in xrange(X.indptr[i],X.indptr[i+1]): + for ix in range(X.indptr[i],X.indptr[i+1]): xbuf[X.indices[ix]] = X.data[ix] XW = xbuf.dot(W) - for ix in xrange(X.indptr[i],X.indptr[i+1]): + for ix in range(X.indptr[i],X.indptr[i+1]): xbuf[X.indices[ix]] = 0 else: XW = X[i].dot(W) diff --git a/mrec/mf/recommender.py b/mrec/mf/recommender.py index 06947be..eaefe1c 100644 --- a/mrec/mf/recommender.py +++ b/mrec/mf/recommender.py @@ -209,7 +209,7 @@ def range_recommend_items(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - r = self.predict_ratings(xrange(user_start,user_end),item_features=item_features) + r = self.predict_ratings(range(user_start,user_end),item_features=item_features) return self._get_recommendations_from_predictions(r,dataset,user_start,user_end,max_items,return_scores) def _get_recommendations_from_predictions(self, @@ -247,8 +247,8 @@ def _get_recommendations_from_predictions(self, else just a list of idxs. """ r = np.array(self._zero_known_item_scores(r,dataset[user_start:user_end,:])) - recs = [[] for u in xrange(user_start,user_end)] - for u in xrange(user_start,user_end): + recs = [[] for u in range(user_start,user_end)] + for u in range(user_start,user_end): ux = u - user_start if show_progress and ux%1000 == 0: print(ux,'..',) diff --git a/mrec/mf/warp.py b/mrec/mf/warp.py index 5340f47..7bbdf68 100644 --- a/mrec/mf/warp.py +++ b/mrec/mf/warp.py @@ -86,7 +86,7 @@ def create_validation_set(self,train): print(max_iters,'max_iters') validation = dict() - for u in xrange(num_validation_users): + for u in range(num_validation_users): positive = np.where(train[u].data > 0)[0] hidden = random.sample(positive,positive.shape[0]/2) if hidden: diff --git a/mrec/mf/wrmf.py b/mrec/mf/wrmf.py index 5ae54bb..96691ab 100644 --- a/mrec/mf/wrmf.py +++ b/mrec/mf/wrmf.py @@ -61,11 +61,11 @@ def fit(self,train,item_features=None): self.U = self.init_factors(num_users,False) # don't need values, will compute them self.V = self.init_factors(num_items) - for it in xrange(self.num_iters): + for it in range(self.num_iters): print('iteration',it) # fit user factors VV = self.V.T.dot(self.V) - for u in xrange(num_users): + for u in range(num_users): # get (positive i.e. non-zero scored) items for user indices = train.X[u].nonzero()[1] if indices.size: @@ -74,7 +74,7 @@ def fit(self,train,item_features=None): self.U[u,:] = np.zeros(self.d) # fit item factors UU = self.U.T.dot(self.U) - for i in xrange(num_items): + for i in range(num_items): indices = train.fast_get_col(i).nonzero()[0] if indices.size: self.V[i,:] = self.update(indices,self.U,UU) diff --git a/mrec/parallel/item_similarity.py b/mrec/parallel/item_similarity.py index da1eed5..371109c 100644 --- a/mrec/parallel/item_similarity.py +++ b/mrec/parallel/item_similarity.py @@ -79,7 +79,7 @@ def create_tasks(self,model,input_format,trainfile,outdir,num_items,num_engines, num_engines = 1 items_per_engine = int(math.ceil(float(num_items)/num_engines)) tasks = [] - for start in xrange(0,num_items,items_per_engine): + for start in range(0,num_items,items_per_engine): end = min(num_items,start+items_per_engine) if (start,end) not in done: tasks.append((model,input_format,trainfile,outdir,start,end,max_similar_items)) @@ -107,7 +107,7 @@ def process(task): # write sims directly to file as we compute them outfile = os.path.join(outdir,'sims.{0}-{1}.tsv'.format(start,end)) out = open(outfile,'w') - for j in xrange(start,end): + for j in range(start,end): w = model.get_similar_items(j,max_similar_items=max_similar_items,dataset=dataset) for k,v in w: print('{0}\t{1}\t{2}'.format(j+1,k+1,v), file=out) # write as 1-indexed diff --git a/mrec/parallel/predict.py b/mrec/parallel/predict.py index 4aa6569..ae92e22 100644 --- a/mrec/parallel/predict.py +++ b/mrec/parallel/predict.py @@ -33,7 +33,7 @@ def run(task): recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True,item_features=item_features) else: recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True) - for u,items in zip(xrange(start,end),recs): + for u,items in zip(range(start,end),recs): for i,w in items: print('{0}\t{1}\t{2}'.format(u+1,i+1,w), file=out) # write as 1-indexed out.close() diff --git a/mrec/parallel/warp.py b/mrec/parallel/warp.py index 840ff56..99d3241 100644 --- a/mrec/parallel/warp.py +++ b/mrec/parallel/warp.py @@ -87,7 +87,7 @@ def create_tasks(self, num_engines, done): tasks = [] - for ix in xrange(num_engines): + for ix in range(num_engines): if ix not in done: outfile = self.get_modelfile(ix,outdir) tasks.append((model,input_format,trainfile,feature_format,featurefile,outfile,ix,num_engines)) diff --git a/mrec/parallel/wrmf.py b/mrec/parallel/wrmf.py index e2d0fc5..95dc8e8 100644 --- a/mrec/parallel/wrmf.py +++ b/mrec/parallel/wrmf.py @@ -42,7 +42,7 @@ def run(self,view,model,input_format,trainfile,num_engines,workdir,modelfile): num_users,num_items = data.shape del data - for it in xrange(model.num_iters): + for it in range(model.num_iters): logging.info('iteration {0}'.format(it)) tasks = self.create_tasks(num_users,num_engines,model,input_format,trainfile,workdir,'U',get_user_indices,get_item_factor_files,init_item_factors) self.run_tasks(view,tasks) @@ -66,7 +66,7 @@ def run_tasks(self,view,tasks): def create_tasks(self,num_factors,num_engines,model,input_format,trainfile,workdir,factor_type,get_indices,get_fixed_factor_files,init_fixed_factors): factors_per_engine = int(math.ceil(float(num_factors)/num_engines)) tasks = [] - for start in xrange(0,num_factors,factors_per_engine): + for start in range(0,num_factors,factors_per_engine): end = min(num_factors,start+factors_per_engine) fixed_factor_files = get_fixed_factor_files(workdir) tasks.append((model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir)) @@ -95,7 +95,7 @@ def compute_factors(task): HH = H.T.dot(H) W = np.zeros(((end-start),model.d)) - for j in xrange(start,end): + for j in range(start,end): indices = get_indices(data,j) if indices.size: W[j-start,:] = model.update(indices,H,HH) diff --git a/mrec/popularity.py b/mrec/popularity.py index 9c04ee2..96eac95 100644 --- a/mrec/popularity.py +++ b/mrec/popularity.py @@ -53,16 +53,16 @@ def fit(self,dataset,item_features=None): d = dataset.tocsc() if self.method == 'count': # count the total number of ratings for each item - popularity = [(d[:,i].nnz,i) for i in xrange(d.shape[1])] + popularity = [(d[:,i].nnz,i) for i in range(d.shape[1])] elif self.method == 'sum': # find the sum of the ratings for each item - popularity = [(d[:,i].sum(),i) for i in xrange(d.shape[1])] + popularity = [(d[:,i].sum(),i) for i in range(d.shape[1])] elif self.method == 'avg': # find the mean rating for each item - popularity = [(d[:,i].mean(),i) for i in xrange(d.shape[1])] + popularity = [(d[:,i].mean(),i) for i in range(d.shape[1])] elif self.method == 'thresh': # count the number of ratings above thresh for each item - popularity = [(sum(d[:,i].data>self.thresh),i) for i in xrange(d.shape[1])] + popularity = [(sum(d[:,i].data>self.thresh),i) for i in range(d.shape[1])] popularity.sort(reverse=True) self.pop_items = [(i,c) for (c,i) in popularity] diff --git a/mrec/sparse.py b/mrec/sparse.py index b08541e..4fccffc 100644 --- a/mrec/sparse.py +++ b/mrec/sparse.py @@ -192,14 +192,14 @@ def ensure_sparse_cols(self,max_density,remove_lowest=True): max_nnz = int(max_density) else: max_nnz = int(max_density*self.shape[0]) - for j in xrange(self.shape[1]): + for j in range(self.shape[1]): col = self.fast_get_col(j) excess = col.nnz - max_nnz if excess > 0: if remove_lowest: zero_entries = np.argsort(col.data)[:excess] else: - zero_entries = random.sample(xrange(col.nnz),excess) + zero_entries = random.sample(range(col.nnz),excess) col.data[zero_entries] = 0 self.fast_update_col(j,col.data) diff --git a/mrec/testing.py b/mrec/testing.py index 75c5945..22d3f59 100644 --- a/mrec/testing.py +++ b/mrec/testing.py @@ -4,7 +4,7 @@ from sklearn.utils.testing import assert_array_equal def get_random_coo_matrix(rows=3,cols=10,nnz=20): - row_col = random.sample(xrange(rows*cols),nnz) # ensure are unique + row_col = random.sample(range(rows*cols),nnz) # ensure are unique row = [i // cols for i in row_col] col = [i % cols for i in row_col] data = np.random.randint(0,nnz*5,nnz) diff --git a/mrec/tests/test_base_recommender.py b/mrec/tests/test_base_recommender.py index a75dea9..37b37a6 100644 --- a/mrec/tests/test_base_recommender.py +++ b/mrec/tests/test_base_recommender.py @@ -62,8 +62,8 @@ def test_zero_known_item_scores(): r = BaseRecommender() safe = r._zero_known_item_scores(predictions,train) num_users,num_items = predictions.shape - for u in xrange(num_users): - for i in xrange(num_items): + for u in range(num_users): + for i in range(num_items): if i in train[u].indices: assert_less_equal(safe[u,i],0) else: diff --git a/mrec/tests/test_sparse.py b/mrec/tests/test_sparse.py index f1179da..6081913 100644 --- a/mrec/tests/test_sparse.py +++ b/mrec/tests/test_sparse.py @@ -42,21 +42,21 @@ def test_fast_get_col(): X = get_random_coo_matrix().tocsc() m = fast_sparse_matrix(X) rows,cols = X.shape - for j in xrange(cols): + for j in range(cols): assert_array_equal(m.fast_get_col(j).toarray(),X[:,j].toarray()) def test_fast_update_col(): X = get_random_coo_matrix().tocsc() m = fast_sparse_matrix(X) cols = X.shape[1] - for j in xrange(cols): + for j in range(cols): vals = m.fast_get_col(j).data if (vals==0).all(): continue vals[vals!=0] += 1 m.fast_update_col(j,vals) expected = X[:,j].toarray() - for i in xrange(expected.shape[0]): + for i in range(expected.shape[0]): if expected[i] != 0: expected[i] += 1 assert_array_equal(m.fast_get_col(j).toarray(),expected) From 5e25af10bb3ed5237c0071525d15071913b82b5d Mon Sep 17 00:00:00 2001 From: Boris Shminke Date: Sun, 29 Oct 2017 14:56:31 +0300 Subject: [PATCH 04/17] pickle is a binary format --- mrec/base_recommender.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mrec/base_recommender.py b/mrec/base_recommender.py index b6ff0b7..83ca7bf 100644 --- a/mrec/base_recommender.py +++ b/mrec/base_recommender.py @@ -86,7 +86,7 @@ def save(self,filepath): if archive: np.savez(filepath,**archive) else: - pickle.dump(self,open(filepath,'w')) + pickle.dump(self,open(filepath,'wb')) def _create_archive(self): """ From bbefdd5bfd67fd5084c61a5ecaee9d9c66b2a1b7 Mon Sep 17 00:00:00 2001 From: Boris Shminke Date: Sun, 29 Oct 2017 16:22:10 +0300 Subject: [PATCH 05/17] iteritems changed to items --- mrec/evaluation/__init__.py | 2 +- mrec/evaluation/metrics.py | 6 +++--- mrec/examples/predict.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mrec/evaluation/__init__.py b/mrec/evaluation/__init__.py index 6b6455a..bb309f5 100644 --- a/mrec/evaluation/__init__.py +++ b/mrec/evaluation/__init__.py @@ -18,7 +18,7 @@ def __init__(self,compute_metrics,max_items): def _add_metrics(self,predicted,actual): metrics = self.compute_metrics(predicted,actual) if metrics: - for m,val in metrics.iteritems(): + for m,val in metrics.items(): self.cum_metrics[m] += val self.count += 1 diff --git a/mrec/evaluation/metrics.py b/mrec/evaluation/metrics.py index 80f951b..cc93630 100644 --- a/mrec/evaluation/metrics.py +++ b/mrec/evaluation/metrics.py @@ -62,7 +62,7 @@ def run_evaluation(models,retrain,get_split,num_runs,evaluation_func): for i,model in enumerate(models): retrain(model,train) run_metrics = evaluation_func(model,train,users,test) - for m,val in run_metrics.iteritems(): + for m,val in run_metrics.items(): print(m,val) metrics[i][m].append(val) return metrics @@ -83,10 +83,10 @@ def sort_metrics_by_name(names): prefix2val[name].append(val) else: prefix2val[name] = [] - for name,vals in prefix2val.iteritems(): + for name,vals in prefix2val.items(): prefix2val[name] = sorted(vals) ret = [] - for name,vals in sorted(prefix2val.iteritems()): + for name,vals in sorted(prefix2val.items()): if vals: for val in vals: ret.append('{0}@{1}'.format(name,val)) diff --git a/mrec/examples/predict.py b/mrec/examples/predict.py index ad6c760..61d60e3 100644 --- a/mrec/examples/predict.py +++ b/mrec/examples/predict.py @@ -82,7 +82,7 @@ def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator) tot_count = 0 for results in processed: for cum_metrics,count in results: - for m,val in cum_metrics.iteritems(): + for m,val in cum_metrics.items(): avg_metrics[m] += val tot_count += count for m in avg_metrics: @@ -142,7 +142,7 @@ def estimate_users_per_task(mb_per_task,input_format,trainfile,modelfile): if mb_per_task <= required_mb_per_task: raise RuntimeError('requires at least {0}MB per task, increase --mb_per_task if you can'.format(required_mb_per_task)) - return users_per_task,num_users + return int(users_per_task), int(num_users) def get_dataset_size(input_format,datafile): logging.info('loading dataset to get size...') From a64bb54fdc81dfb56b4b42dd7d6b1544173d9fe5 Mon Sep 17 00:00:00 2001 From: Boris Shminke Date: Sun, 29 Oct 2017 15:04:20 +0300 Subject: [PATCH 06/17] np.loads requires bytes input now --- mrec/base_recommender.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mrec/base_recommender.py b/mrec/base_recommender.py index 83ca7bf..76d127d 100644 --- a/mrec/base_recommender.py +++ b/mrec/base_recommender.py @@ -117,7 +117,7 @@ def load(filepath): if isinstance(r,BaseRecommender): model = r else: - model = np.loads(str(r['model'])) + model = np.loads(r['model']) model._load_archive(r) # restore any fields serialized separately return model @@ -148,7 +148,7 @@ def read_recommender_description(filepath): if isinstance(r,BaseRecommender): model = r else: - model = np.loads(str(r['model'])) + model = np.loads(r['model']) return str(model) def __str__(self): From 65c262000816295241f905422b06f5ef67956db0 Mon Sep 17 00:00:00 2001 From: Boris Shminke Date: Sun, 29 Oct 2017 15:28:44 +0300 Subject: [PATCH 07/17] IPython.parallel became ipyparallel --- mrec/examples/predict.py | 2 +- mrec/examples/train.py | 2 +- mrec/examples/tune_slim.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mrec/examples/predict.py b/mrec/examples/predict.py index 61d60e3..22f0edf 100644 --- a/mrec/examples/predict.py +++ b/mrec/examples/predict.py @@ -164,7 +164,7 @@ def main(): import os from optparse import OptionParser - from IPython.parallel import Client + from ipyparallel import Client from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate from mrec.evaluation import Evaluator diff --git a/mrec/examples/train.py b/mrec/examples/train.py index 7fe3c92..0bccb8f 100644 --- a/mrec/examples/train.py +++ b/mrec/examples/train.py @@ -19,7 +19,7 @@ def main(): import glob import subprocess from optparse import OptionParser - from IPython.parallel import Client + from ipyparallel import Client from mrec import load_fast_sparse_matrix, save_recommender from mrec.item_similarity.slim import SLIM diff --git a/mrec/examples/tune_slim.py b/mrec/examples/tune_slim.py index 4327a1a..a1ea762 100644 --- a/mrec/examples/tune_slim.py +++ b/mrec/examples/tune_slim.py @@ -12,7 +12,7 @@ from sklearn.grid_search import ParameterGrid except ImportError: from sklearn.grid_search import IterGrid as ParameterGrid -from IPython.parallel import Client +from ipyparallel import Client from mrec import load_fast_sparse_matrix From a2d67f278ab8078dbb39cfbc2162c4969fcac7aa Mon Sep 17 00:00:00 2001 From: Boris Shminke Date: Sun, 29 Oct 2017 15:50:21 +0300 Subject: [PATCH 08/17] move to more explicit package naming --- mrec/__init__.py | 4 ++-- mrec/examples/evaluate.py | 2 +- mrec/examples/factors.py | 2 +- mrec/examples/predict.py | 2 +- mrec/examples/prepare.py | 2 +- mrec/examples/train.py | 2 +- mrec/item_similarity/knn.py | 2 +- mrec/item_similarity/precomputed.py | 2 +- mrec/item_similarity/recommender.py | 4 ++-- mrec/item_similarity/slim.py | 2 +- mrec/mf/evaluate.py | 2 +- mrec/mf/model/warp2.py | 2 +- mrec/mf/warp.py | 4 ++-- mrec/mf/warp2.py | 4 ++-- mrec/popularity.py | 4 ++-- mrec/reranking_recommender.py | 2 +- 16 files changed, 21 insertions(+), 21 deletions(-) diff --git a/mrec/__init__.py b/mrec/__init__.py index ec961ee..329b194 100644 --- a/mrec/__init__.py +++ b/mrec/__init__.py @@ -6,8 +6,8 @@ except ImportError: import pickle -from sparse import fast_sparse_matrix, loadtxt, loadz, savez -from base_recommender import BaseRecommender +from mrec.sparse import fast_sparse_matrix, loadtxt, loadz, savez +from mrec.base_recommender import BaseRecommender __version__ = '0.3.1' diff --git a/mrec/examples/evaluate.py b/mrec/examples/evaluate.py index 24d6633..76d5452 100644 --- a/mrec/examples/evaluate.py +++ b/mrec/examples/evaluate.py @@ -16,7 +16,7 @@ def main(): from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate from mrec.evaluation import Evaluator from mrec.evaluation.metrics import print_report - from filename_conventions import get_testfile, get_recsfile + from mrec.examples.filename_conventions import get_testfile, get_recsfile logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') diff --git a/mrec/examples/factors.py b/mrec/examples/factors.py index b2d54d6..87053c1 100644 --- a/mrec/examples/factors.py +++ b/mrec/examples/factors.py @@ -14,7 +14,7 @@ def main(): from mrec import save_recommender from mrec.mf.recommender import MatrixFactorizationRecommender - from filename_conventions import get_modelfile + from mrec.examples.filename_conventions import get_modelfile logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') diff --git a/mrec/examples/predict.py b/mrec/examples/predict.py index 22f0edf..d699751 100644 --- a/mrec/examples/predict.py +++ b/mrec/examples/predict.py @@ -25,7 +25,7 @@ from mrec.mf.recommender import MatrixFactorizationRecommender from mrec.item_similarity.recommender import ItemSimilarityRecommender -from filename_conventions import * +from mrec.examples.filename_conventions import * ONE_MB = 2**20 diff --git a/mrec/examples/prepare.py b/mrec/examples/prepare.py index 3c94ab2..9405ce1 100644 --- a/mrec/examples/prepare.py +++ b/mrec/examples/prepare.py @@ -46,7 +46,7 @@ def main(): from optparse import OptionParser from mrec.evaluation.preprocessing import TSVParser, SplitCreator - from filename_conventions import get_sortedfile, get_splitfile + from mrec.examples.filename_conventions import get_sortedfile, get_splitfile logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') diff --git a/mrec/examples/train.py b/mrec/examples/train.py index 0bccb8f..510eb12 100644 --- a/mrec/examples/train.py +++ b/mrec/examples/train.py @@ -10,7 +10,7 @@ easy to generate data for cross-validated evaluation. """ -from filename_conventions import * +from mrec.examples.filename_conventions import * def main(): diff --git a/mrec/item_similarity/knn.py b/mrec/item_similarity/knn.py index cb60f3d..be57901 100644 --- a/mrec/item_similarity/knn.py +++ b/mrec/item_similarity/knn.py @@ -5,7 +5,7 @@ import numpy as np from sklearn.metrics.pairwise import cosine_similarity -from recommender import ItemSimilarityRecommender +from mrec.item_similarity.recommender import ItemSimilarityRecommender class KNNRecommender(ItemSimilarityRecommender): """ diff --git a/mrec/item_similarity/precomputed.py b/mrec/item_similarity/precomputed.py index f083434..139bed8 100644 --- a/mrec/item_similarity/precomputed.py +++ b/mrec/item_similarity/precomputed.py @@ -2,7 +2,7 @@ Make recommendations from a precomputed item similarity matrix. """ -from recommender import ItemSimilarityRecommender +from mrec.item_similarity.recommender import ItemSimilarityRecommender class PrecomputedItemSimilarityRecommender(ItemSimilarityRecommender): """ diff --git a/mrec/item_similarity/recommender.py b/mrec/item_similarity/recommender.py index a8eb1ef..d263413 100644 --- a/mrec/item_similarity/recommender.py +++ b/mrec/item_similarity/recommender.py @@ -10,8 +10,8 @@ from operator import itemgetter from scipy.sparse import csr_matrix, coo_matrix -from ..sparse import fast_sparse_matrix -from ..base_recommender import BaseRecommender +from mrec.sparse import fast_sparse_matrix +from mrec.base_recommender import BaseRecommender class ItemSimilarityRecommender(BaseRecommender): """ diff --git a/mrec/item_similarity/slim.py b/mrec/item_similarity/slim.py index b800428..0b02c40 100644 --- a/mrec/item_similarity/slim.py +++ b/mrec/item_similarity/slim.py @@ -16,7 +16,7 @@ import sklearn import numpy as np -from recommender import ItemSimilarityRecommender +from mrec.item_similarity.recommender import ItemSimilarityRecommender def parse_version(version_string): diff --git a/mrec/mf/evaluate.py b/mrec/mf/evaluate.py index d1fe163..393bc6f 100644 --- a/mrec/mf/evaluate.py +++ b/mrec/mf/evaluate.py @@ -8,7 +8,7 @@ def retrain_recommender(model,dataset): except ImportError: from sklearn.grid_search import IterGrid as ParameterGrid from optparse import OptionParser - from warp import WARPMFRecommender + from mrec.mf.warp import WARPMFRecommender from mrec.evaluation.metrics import * diff --git a/mrec/mf/model/warp2.py b/mrec/mf/model/warp2.py index 66a5925..cbbeced 100644 --- a/mrec/mf/model/warp2.py +++ b/mrec/mf/model/warp2.py @@ -2,7 +2,7 @@ import scipy import random -from warp import WARPBatchUpdate, WARPDecomposition, WARP +from mrec.mf.model.warp import WARPBatchUpdate, WARPDecomposition, WARP from warp_fast import warp2_sample class WARP2BatchUpdate(WARPBatchUpdate): diff --git a/mrec/mf/warp.py b/mrec/mf/warp.py index 7bbdf68..1d2f7d3 100644 --- a/mrec/mf/warp.py +++ b/mrec/mf/warp.py @@ -3,8 +3,8 @@ from mrec.evaluation import metrics -from recommender import MatrixFactorizationRecommender -from model.warp import WARP +from mrec.mf.recommender import MatrixFactorizationRecommender +from mrec.mf.model.warp import WARP class WARPMFRecommender(MatrixFactorizationRecommender): """ diff --git a/mrec/mf/warp2.py b/mrec/mf/warp2.py index 3e4be69..6dcd6ec 100644 --- a/mrec/mf/warp2.py +++ b/mrec/mf/warp2.py @@ -1,7 +1,7 @@ import numpy as np -from warp import WARPMFRecommender -from model.warp2 import WARP2 +from mrec.mf.warp import WARPMFRecommender +from mrec.mf.model.warp2 import WARP2 class WARP2MFRecommender(WARPMFRecommender): """ diff --git a/mrec/popularity.py b/mrec/popularity.py index 96eac95..a286d87 100644 --- a/mrec/popularity.py +++ b/mrec/popularity.py @@ -5,8 +5,8 @@ import numpy as np -from base_recommender import BaseRecommender -from sparse import fast_sparse_matrix +from mrec.base_recommender import BaseRecommender +from mrec.sparse import fast_sparse_matrix class ItemPopularityRecommender(BaseRecommender): """ diff --git a/mrec/reranking_recommender.py b/mrec/reranking_recommender.py index ade5912..60f6b44 100644 --- a/mrec/reranking_recommender.py +++ b/mrec/reranking_recommender.py @@ -9,7 +9,7 @@ import pickle import numpy as np -from base_recommender import BaseRecommender +from mrec.base_recommender import BaseRecommender class RerankingRecommender(BaseRecommender): """ From 153b4854a36cf0560e26583b7357b049d66337e2 Mon Sep 17 00:00:00 2001 From: inpefess Date: Sun, 29 Oct 2017 20:13:44 +0300 Subject: [PATCH 09/17] fix a bug of running 0 parallel tasks --- mrec/parallel/item_similarity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mrec/parallel/item_similarity.py b/mrec/parallel/item_similarity.py index 371109c..54e398c 100644 --- a/mrec/parallel/item_similarity.py +++ b/mrec/parallel/item_similarity.py @@ -31,7 +31,7 @@ def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max logging.info('creating tasks...') tasks = self.create_tasks(model,input_format,trainfile,simsdir,num_items,num_engines,max_sims,done) - if num_engines > 0: + if num_engines > 0 and len(tasks) > 0: logging.info('running %d tasks in parallel across ipython' ' engines...', len(tasks)) async_job = view.map_async(process,tasks,retries=2) From 2e84d1cb92955f0816dda65a3580cb82cee23846 Mon Sep 17 00:00:00 2001 From: inpefess Date: Sun, 29 Oct 2017 20:59:53 +0300 Subject: [PATCH 10/17] python code style --- mrec/__init__.py | 37 ++++--- mrec/base_recommender.py | 36 +++---- mrec/evaluation/__init__.py | 22 ++-- mrec/evaluation/metrics.py | 111 ++++++++++--------- mrec/evaluation/preprocessing.py | 52 ++++----- mrec/evaluation/tests/test_metrics.py | 56 +++++----- mrec/examples/convert.py | 41 +++---- mrec/examples/evaluate.py | 42 ++++---- mrec/examples/factors.py | 31 +++--- mrec/examples/filename_conventions.py | 48 +++++---- mrec/examples/predict.py | 147 ++++++++++++++------------ mrec/examples/prepare.py | 86 ++++++++------- mrec/examples/train.py | 116 ++++++++++++-------- mrec/examples/tune_slim.py | 87 +++++++++------ mrec/item_similarity/knn.py | 55 +++++----- mrec/item_similarity/precomputed.py | 12 +-- mrec/item_similarity/recommender.py | 90 ++++++++-------- mrec/item_similarity/slim.py | 77 +++++++------- mrec/mf/climf.py | 70 ++++++------ mrec/mf/evaluate.py | 30 +++--- mrec/mf/model/warp.py | 114 ++++++++++---------- mrec/mf/model/warp2.py | 73 ++++++------- mrec/mf/recommender.py | 47 ++++---- mrec/mf/warp.py | 44 ++++---- mrec/mf/warp2.py | 40 +++---- mrec/mf/wrmf.py | 45 ++++---- mrec/parallel/evaluate.py | 11 +- mrec/parallel/item_similarity.py | 73 ++++++------- mrec/parallel/predict.py | 34 +++--- mrec/parallel/warp.py | 53 +++++----- mrec/parallel/wrmf.py | 85 ++++++++------- mrec/popularity.py | 27 +++-- mrec/reranking_recommender.py | 57 +++++----- mrec/sparse.py | 65 ++++++------ mrec/testing.py | 18 ++-- mrec/tests/test_base_recommender.py | 44 ++++---- mrec/tests/test_mrec.py | 18 ++-- mrec/tests/test_sparse.py | 65 ++++++------ 38 files changed, 1166 insertions(+), 993 deletions(-) diff --git a/mrec/__init__.py b/mrec/__init__.py index 329b194..bcc1e4d 100644 --- a/mrec/__init__.py +++ b/mrec/__init__.py @@ -1,6 +1,5 @@ -import numpy as np -from scipy.sparse import coo_matrix, csr_matrix from scipy.io import mmread, mmwrite + try: import cPickle as pickle except ImportError: @@ -11,7 +10,8 @@ __version__ = '0.3.1' -def load_fast_sparse_matrix(input_format,filepath): + +def load_fast_sparse_matrix(input_format, filepath): """ Load a fast_sparse_matrix from an input file of the specified format, by delegating to the appropriate static method. @@ -30,14 +30,15 @@ def load_fast_sparse_matrix(input_format,filepath): if input_format == 'tsv': return fast_sparse_matrix.loadtxt(filepath) elif input_format == 'csv': - return fast_sparse_matrix.loadtxt(filepath,delimiter=',') + return fast_sparse_matrix.loadtxt(filepath, delimiter=',') elif input_format == 'mm': return fast_sparse_matrix.loadmm(filepath) elif input_format == 'fsm': return fast_sparse_matrix.load(filepath) raise ValueError('unknown input format: {0}'.format(input_format)) -def load_sparse_matrix(input_format,filepath): + +def load_sparse_matrix(input_format, filepath): """ Load a scipy.sparse.csr_matrix from an input file of the specified format. @@ -56,7 +57,7 @@ def load_sparse_matrix(input_format,filepath): if input_format == 'tsv': return loadtxt(filepath) elif input_format == 'csv': - return loadtxt(filepath,delimiter=',') + return loadtxt(filepath, delimiter=',') elif input_format == 'mm': return mmread(filepath).tocsr() elif input_format == 'npz': @@ -65,7 +66,8 @@ def load_sparse_matrix(input_format,filepath): return fast_sparse_matrix.load(filepath).X raise ValueError('unknown input format: {0}'.format(input_format)) -def save_sparse_matrix(data,fmt,filepath): + +def save_sparse_matrix(data, fmt, filepath): """ Save a scipy sparse matrix in the specified format. Row and column indices will be converted to 1-indexed if you specify a plain text @@ -87,24 +89,25 @@ def save_sparse_matrix(data,fmt,filepath): """ if fmt == 'tsv': m = data.tocoo() - with open(filepath,'w') as out: - for u,i,v in zip(m.row,m.col,m.data): - print('{0}\t{1}\t{2}'.format(u+1,i+1,v), file=out) + with open(filepath, 'w') as out: + for u, i, v in zip(m.row, m.col, m.data): + print('{0}\t{1}\t{2}'.format(u + 1, i + 1, v), file=out) elif fmt == 'csv': m = data.tocoo() - with open(filepath,'w') as out: - for u,i,v in zip(m.row,m.col,m.data): - print('{0},{1},{2}'.format(u+1,i+1,v), file=out) + with open(filepath, 'w') as out: + for u, i, v in zip(m.row, m.col, m.data): + print('{0},{1},{2}'.format(u + 1, i + 1, v), file=out) elif fmt == 'mm': - mmwrite(filepath,data) + mmwrite(filepath, data) elif fmt == 'npz': - savez(data.tocoo(),filepath) + savez(data.tocoo(), filepath) elif fmt == 'fsm': fast_sparse_matrix(data).save(filepath) else: raise ValueError('unknown output format: {0}'.format(fmt)) -def save_recommender(model,filepath): + +def save_recommender(model, filepath): """ Save a recommender model to file. @@ -117,6 +120,7 @@ def save_recommender(model,filepath): """ model.save(filepath) + def load_recommender(filepath): """ Load a recommender model from file after it has been saved by @@ -129,6 +133,7 @@ def load_recommender(filepath): """ return BaseRecommender.load(filepath) + def read_recommender_description(filepath): """ Read a recommender model description from file after it has diff --git a/mrec/base_recommender.py b/mrec/base_recommender.py index 76d127d..1321257 100644 --- a/mrec/base_recommender.py +++ b/mrec/base_recommender.py @@ -5,6 +5,7 @@ import numpy as np from scipy.sparse import csr_matrix + class BaseRecommender(object): """ Minimal interface to be implemented by recommenders, along with @@ -23,7 +24,7 @@ class BaseRecommender(object): and the batch methods to recommend items. """ - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend new items for a user. @@ -48,7 +49,7 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features """ raise NotImplementedError('you must implement recommend_items()') - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Train on supplied data. In general you will want to implement this rather than computing recommendations on @@ -63,7 +64,7 @@ def fit(self,train,item_features=None): """ raise NotImplementedError('you should implement fit()') - def save(self,filepath): + def save(self, filepath): """ Serialize model to file. @@ -84,9 +85,9 @@ def save(self,filepath): archive = self._create_archive() if archive: - np.savez(filepath,**archive) + np.savez(filepath, **archive) else: - pickle.dump(self,open(filepath,'wb')) + pickle.dump(self, open(filepath, 'wb')) def _create_archive(self): """ @@ -114,7 +115,7 @@ def load(filepath): The filepath to read from. """ r = np.load(filepath) - if isinstance(r,BaseRecommender): + if isinstance(r, BaseRecommender): model = r else: model = np.loads(r['model']) @@ -144,15 +145,15 @@ def read_recommender_description(filepath): filepath : str The filepath to read from. """ - r = np.load(filepath,mmap_mode='r') - if isinstance(r,BaseRecommender): + r = np.load(filepath, mmap_mode='r') + if isinstance(r, BaseRecommender): model = r else: model = np.loads(r['model']) return str(model) def __str__(self): - if hasattr(self,'description'): + if hasattr(self, 'description'): return self.description return 'unspecified recommender: you should set self.description or implement __str__()' @@ -191,9 +192,9 @@ def batch_recommend_items(self, """ recs = [] for u in range(self.num_users): - if show_progress and u%1000 == 0: - print(u,'..',) - recs.append(self.recommend_items(dataset,u,max_items,return_scores)) + if show_progress and u % 1000 == 0: + print(u, '..', ) + recs.append(self.recommend_items(dataset, u, max_items, return_scores)) if show_progress: print() return recs @@ -234,9 +235,9 @@ def range_recommend_items(self, This provides a default implementation, you will be able to optimize this for most recommenders. """ - return [self.recommend_items(dataset,u,max_items,return_scores) for u in range(user_start,user_end)] + return [self.recommend_items(dataset, u, max_items, return_scores) for u in range(user_start, user_end)] - def _zero_known_item_scores(self,r,train): + def _zero_known_item_scores(self, r, train): """ Helper function to set predicted scores/ratings for training items to zero or less, to avoid recommending already known items. @@ -255,7 +256,7 @@ def _zero_known_item_scores(self,r,train): in train. """ col = train.indices - if isinstance(r,csr_matrix): + if isinstance(r, csr_matrix): max_score = r.data.max() else: max_score = r.max() @@ -265,8 +266,7 @@ def _zero_known_item_scores(self,r,train): # u,i for which train[u,i] has been explicitly set to zero row = np.zeros(col.shape) for u in range(train.shape[0]): - start,end = train.indptr[u],train.indptr[u+1] + start, end = train.indptr[u], train.indptr[u + 1] if end > start: row[start:end] = u - return r - csr_matrix((data,(row,col)),shape=r.shape) - + return r - csr_matrix((data, (row, col)), shape=r.shape) diff --git a/mrec/evaluation/__init__.py b/mrec/evaluation/__init__.py index bb309f5..8550749 100644 --- a/mrec/evaluation/__init__.py +++ b/mrec/evaluation/__init__.py @@ -11,18 +11,18 @@ class Evaluator(object): The number of recommendations needed to compute the evaluation function. """ - def __init__(self,compute_metrics,max_items): + def __init__(self, compute_metrics, max_items): self.compute_metrics = compute_metrics self.max_items = max_items - def _add_metrics(self,predicted,actual): - metrics = self.compute_metrics(predicted,actual) + def _add_metrics(self, predicted, actual): + metrics = self.compute_metrics(predicted, actual) if metrics: - for m,val in metrics.items(): + for m, val in metrics.items(): self.cum_metrics[m] += val self.count += 1 - def process(self,testdata,recsfile,start,end,offset=1): + def process(self, testdata, recsfile, start, end, offset=1): """ Parameters ---------- @@ -54,19 +54,19 @@ def process(self,testdata,recsfile,start,end,offset=1): last_user = start recs = [] for line in open(recsfile): - user,item,score = line.strip().split('\t') - user = int(user)-1 # convert to 0-indxed - item = int(item)-1 + user, item, score = line.strip().split('\t') + user = int(user) - 1 # convert to 0-indxed + item = int(item) - 1 if user >= end: break if user < start: continue if user != last_user: - self._add_metrics(recs,testdata[last_user,:].indices.tolist()) + self._add_metrics(recs, testdata[last_user, :].indices.tolist()) last_user = user recs = [] if len(recs) < self.max_items: recs.append(item) - self._add_metrics(recs,testdata[last_user,:].indices.tolist()) + self._add_metrics(recs, testdata[last_user, :].indices.tolist()) - return self.cum_metrics,self.count + return self.cum_metrics, self.count diff --git a/mrec/evaluation/metrics.py b/mrec/evaluation/metrics.py index cc93630..21f8988 100644 --- a/mrec/evaluation/metrics.py +++ b/mrec/evaluation/metrics.py @@ -4,47 +4,50 @@ * with prec@k and MRR """ +from collections import defaultdict + import numpy as np from scipy import stats -from collections import defaultdict + # classes to access known items for each test user class get_known_items_from_dict(object): - - def __init__(self,data): + def __init__(self, data): self.data = data - def __call__(self,u): + def __call__(self, u): return self.data[u] -class get_known_items_from_csr_matrix(object): - def __init__(self,data): +class get_known_items_from_csr_matrix(object): + def __init__(self, data): self.data = data - def __call__(self,u): + def __call__(self, u): return self.data[u].indices -class get_known_items_from_thresholded_csr_matrix(object): - def __init__(self,data,min_value): +class get_known_items_from_thresholded_csr_matrix(object): + def __init__(self, data, min_value): self.data = data self.min_value = min_value - def __call__(self,u): + def __call__(self, u): items = self.data[u].toarray().flatten() - items[items= self.thresh: if self.binarize: val = 1 else: val = 0 - return int(user),(int(item),val) + return int(user), (int(item), val) + class SplitCreator(object): """ @@ -59,38 +61,38 @@ class SplitCreator(object): enough items. """ - def __init__(self,test_size,normalize=False,discard_zeros=False,sample_before_thresholding=False): + def __init__(self, test_size, normalize=False, discard_zeros=False, sample_before_thresholding=False): self.test_size = test_size self.normalize = normalize self.discard_zeros = discard_zeros self.sample_before_thresholding = sample_before_thresholding - def handle(self,vals): + def handle(self, vals): if self.sample_before_thresholding: - train,test = self.split(vals) + train, test = self.split(vals) else: - train,test = self.stratified_split(vals) - train = [(v,c) for v,c in train if not self.discard_zeros or c > 0] - test = [(v,c) for v,c in test if c > 0] + train, test = self.stratified_split(vals) + train = [(v, c) for v, c in train if not self.discard_zeros or c > 0] + test = [(v, c) for v, c in test if c > 0] if self.normalize: - norm = sum(c*c for v,c in train)**0.5 + norm = sum(c * c for v, c in train) ** 0.5 if norm > 0: - train = [(v,c/norm) for v,c in train] - return train,test + train = [(v, c / norm) for v, c in train] + return train, test - def pos_neg_vals(self,vals): + def pos_neg_vals(self, vals): vals = list(vals) - pos = [(v,c) for v,c in vals if c > 0] - neg = [(v,0) for v,c in vals if c == 0] - return pos,neg + pos = [(v, c) for v, c in vals if c > 0] + neg = [(v, 0) for v, c in vals if c == 0] + return pos, neg - def split(self,vals): + def split(self, vals): random.shuffle(vals) num_train = self.num_train(vals) - return vals[:num_train],vals[num_train:] + return vals[:num_train], vals[num_train:] - def stratified_split(self,vals): - pos,neg = self.pos_neg_vals(vals) + def stratified_split(self, vals): + pos, neg = self.pos_neg_vals(vals) random.shuffle(pos) train = pos[:self.num_train(pos)] if not self.discard_zeros: @@ -98,9 +100,9 @@ def stratified_split(self,vals): train.extend(neg[:self.num_train(neg)]) random.shuffle(train) test = pos[self.num_train(pos):] - return train,test + return train, test - def num_train(self,vals): + def num_train(self, vals): if self.test_size >= 1: - return len(vals)-self.test_size - return int(len(vals)*(1.0-self.test_size)) + return len(vals) - self.test_size + return int(len(vals) * (1.0 - self.test_size)) diff --git a/mrec/evaluation/tests/test_metrics.py b/mrec/evaluation/tests/test_metrics.py index 05e21ba..da84721 100644 --- a/mrec/evaluation/tests/test_metrics.py +++ b/mrec/evaluation/tests/test_metrics.py @@ -3,38 +3,42 @@ from mrec.evaluation import metrics + def test_sort_metrics_by_name(): - names = ['recall@10','z-score','auc','recall@5'] - expected = ['auc','recall@5','recall@10','z-score'] - assert_equal(expected,metrics.sort_metrics_by_name(names)) + names = ['recall@10', 'z-score', 'auc', 'recall@5'] + expected = ['auc', 'recall@5', 'recall@10', 'z-score'] + assert_equal(expected, metrics.sort_metrics_by_name(names)) + def test_prec(): - true = [2,8,6,4] - predicted = [6,5,8,7] - expected = [1,0.5,2./3.,0.5] - for k in range(1,5): - assert_equal(metrics.prec([],true,k),0) - assert_equal(metrics.prec(true,true,k),1) - assert_equal(metrics.prec(predicted,true,k),expected[k-1]) - assert_equal(metrics.prec(true,true,5),0.8) - assert_equal(metrics.prec(true,true,5,ignore_missing=True),1) - assert_equal(metrics.prec(predicted,true,5),0.4) - assert_equal(metrics.prec(predicted,true,5,ignore_missing=True),expected[3]) + true = [2, 8, 6, 4] + predicted = [6, 5, 8, 7] + expected = [1, 0.5, 2. / 3., 0.5] + for k in range(1, 5): + assert_equal(metrics.prec([], true, k), 0) + assert_equal(metrics.prec(true, true, k), 1) + assert_equal(metrics.prec(predicted, true, k), expected[k - 1]) + assert_equal(metrics.prec(true, true, 5), 0.8) + assert_equal(metrics.prec(true, true, 5, ignore_missing=True), 1) + assert_equal(metrics.prec(predicted, true, 5), 0.4) + assert_equal(metrics.prec(predicted, true, 5, ignore_missing=True), expected[3]) + def test_hit_rate(): - predicted = [6,5,8,7] - for true in [[],[2,8]]: - for k in range(1,5): + predicted = [6, 5, 8, 7] + for true in [[], [2, 8]]: + for k in range(1, 5): with assert_raises(ValueError): - metrics.hit_rate(predicted,true,k) + metrics.hit_rate(predicted, true, k) true = [5] - expected = [0,1,1,1] - for k in range(1,5): - assert_equal(metrics.hit_rate(predicted,true,k),expected[k-1]) + expected = [0, 1, 1, 1] + for k in range(1, 5): + assert_equal(metrics.hit_rate(predicted, true, k), expected[k - 1]) + def test_rr(): - true = [2,8,6,4] - predicted = [5,7,6,8] - expected = [0,0,1./3.,1./3.] - for k in range(1,5): - assert_equal(metrics.rr(predicted[:k],true),expected[k-1]) + true = [2, 8, 6, 4] + predicted = [5, 7, 6, 8] + expected = [0, 0, 1. / 3., 1. / 3.] + for k in range(1, 5): + assert_equal(metrics.rr(predicted[:k], true), expected[k - 1]) diff --git a/mrec/examples/convert.py b/mrec/examples/convert.py index b1ba950..b0775fe 100644 --- a/mrec/examples/convert.py +++ b/mrec/examples/convert.py @@ -2,13 +2,13 @@ Convert sparse matrix from one file format to another. """ -import os import subprocess -def tsv2mtx(infile,outfile): - num_users,num_items,nnz = 0,0,0 + +def tsv2mtx(infile, outfile): + num_users, num_items, nnz = 0, 0, 0 for line in open(infile): - u,i,v = line.strip().split() + u, i, v = line.strip().split() u = int(u) i = int(i) if u > num_users: @@ -16,12 +16,13 @@ def tsv2mtx(infile,outfile): if i > num_items: num_items = i nnz += 1 - headerfile = outfile+'.header' - with open(headerfile,'w') as header: + headerfile = outfile + '.header' + with open(headerfile, 'w') as header: print('%%MatrixMarket matrix coordinate real general', file=header) - print('{0} {1} {2}'.format(num_users,num_items,nnz), file=header) - subprocess.check_call(['cat',headerfile,infile],stdout=open(outfile,'w')) - subprocess.check_call(['rm',headerfile]) + print('{0} {1} {2}'.format(num_users, num_items, nnz), file=header) + subprocess.check_call(['cat', headerfile, infile], stdout=open(outfile, 'w')) + subprocess.check_call(['rm', headerfile]) + def main(): from optparse import OptionParser @@ -29,12 +30,14 @@ def main(): from mrec import load_sparse_matrix, save_sparse_matrix parser = OptionParser() - parser.add_option('--input_format',dest='input_format',help='format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') - parser.add_option('--input',dest='input',help='filepath to input') - parser.add_option('--output_format',dest='output_format',help='format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') - parser.add_option('--output',dest='output',help='filepath for output') - - (opts,args) = parser.parse_args() + parser.add_option('--input_format', dest='input_format', + help='format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') + parser.add_option('--input', dest='input', help='filepath to input') + parser.add_option('--output_format', dest='output_format', + help='format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') + parser.add_option('--output', dest='output', help='filepath for output') + + (opts, args) = parser.parse_args() if not opts.input or not opts.output or not opts.input_format or not opts.output_format: parser.print_help() raise SystemExit @@ -44,11 +47,11 @@ def main(): if opts.input_format == 'tsv' and opts.output_format == 'mm': # we can do this without loading the data - tsv2mtx(opts.input,opts.output) + tsv2mtx(opts.input, opts.output) else: - data = load_sparse_matrix(opts.input_format,opts.input) - save_sparse_matrix(data,opts.output_format,opts.output) + data = load_sparse_matrix(opts.input_format, opts.input) + save_sparse_matrix(data, opts.output_format, opts.output) + if __name__ == '__main__': main() - diff --git a/mrec/examples/evaluate.py b/mrec/examples/evaluate.py index 76d5452..0bd17d0 100644 --- a/mrec/examples/evaluate.py +++ b/mrec/examples/evaluate.py @@ -4,8 +4,8 @@ to the training filepaths. """ -def main(): +def main(): import os import logging import glob @@ -18,19 +18,24 @@ def main(): from mrec.evaluation.metrics import print_report from mrec.examples.filename_conventions import get_testfile, get_recsfile - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') - parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') - parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') - parser.add_option('--recsdir',dest='recsdir',help='directory containing tsv files of precomputed recommendations') - parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)') - parser.add_option('--description',dest='description',help='description of model which generated the recommendations') - metrics_funcs = {'main':compute_main_metrics, - 'hitrate':compute_hit_rate} - - (opts,args) = parser.parse_args() + parser.add_option('--input_format', dest='input_format', + help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') + parser.add_option('--test_input_format', dest='test_input_format', default='npz', + help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') + parser.add_option('--train', dest='train', + help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') + parser.add_option('--recsdir', dest='recsdir', help='directory containing tsv files of precomputed recommendations') + parser.add_option('--metrics', dest='metrics', default='main', + help='which set of metrics to compute, main|hitrate (default: %default)') + parser.add_option('--description', dest='description', + help='description of model which generated the recommendations') + metrics_funcs = {'main': compute_main_metrics, + 'hitrate': compute_hit_rate} + + (opts, args) = parser.parse_args() if not opts.input_format or not opts.train or not opts.recsdir \ or opts.metrics not in metrics_funcs: parser.print_help() @@ -39,7 +44,7 @@ def main(): opts.train = os.path.abspath(os.path.expanduser(opts.train)) opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir)) - evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20) + evaluator = Evaluator(metrics_funcs[opts.metrics], max_items=20) trainfiles = glob.glob(opts.train) @@ -47,14 +52,15 @@ def main(): for trainfile in trainfiles: logging.info('processing {0}...'.format(trainfile)) testfile = get_testfile(trainfile) - recsfile = get_recsfile(trainfile,opts.recsdir) - testdata = load_sparse_matrix(opts.test_input_format,testfile).tocsr() - cum_metrics,count = evaluator.process(testdata,recsfile,0,testdata.shape[0]) + recsfile = get_recsfile(trainfile, opts.recsdir) + testdata = load_sparse_matrix(opts.test_input_format, testfile).tocsr() + cum_metrics, count = evaluator.process(testdata, recsfile, 0, testdata.shape[0]) if cum_metrics is not None: for m in cum_metrics: - all_metrics[m].append(float(cum_metrics[m])/count) + all_metrics[m].append(float(cum_metrics[m]) / count) + + print_report([opts.description], [all_metrics]) - print_report([opts.description],[all_metrics]) if __name__ == '__main__': main() diff --git a/mrec/examples/factors.py b/mrec/examples/factors.py index 87053c1..2bd3366 100644 --- a/mrec/examples/factors.py +++ b/mrec/examples/factors.py @@ -3,9 +3,8 @@ and evaluation recommendations with mrec scripts. """ -def main(): - import os +def main(): import logging import subprocess from optparse import OptionParser @@ -16,17 +15,20 @@ def main(): from mrec.mf.recommender import MatrixFactorizationRecommender from mrec.examples.filename_conventions import get_modelfile - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('--factor_format',dest='factor_format',help='format of factor files tsv | mm (matrixmarket) | npy (numpy array)') - parser.add_option('--user_factors',dest='user_factors',help='user factors filepath') - parser.add_option('--item_factors',dest='item_factors',help='item factors filepath') - parser.add_option('--train',dest='train',help='filepath to training data, just used to apply naming convention to output model saved here') - parser.add_option('--outdir',dest='outdir',help='directory for output') - parser.add_option('--description',dest='description',help='optional description of how factors were computed, will be saved with model so it can be output with evaluation results') - - (opts,args) = parser.parse_args() + parser.add_option('--factor_format', dest='factor_format', + help='format of factor files tsv | mm (matrixmarket) | npy (numpy array)') + parser.add_option('--user_factors', dest='user_factors', help='user factors filepath') + parser.add_option('--item_factors', dest='item_factors', help='item factors filepath') + parser.add_option('--train', dest='train', + help='filepath to training data, just used to apply naming convention to output model saved here') + parser.add_option('--outdir', dest='outdir', help='directory for output') + parser.add_option('--description', dest='description', + help='optional description of how factors were computed, will be saved with model so it can be output with evaluation results') + + (opts, args) = parser.parse_args() if not opts.factor_format or not opts.user_factors or not opts.item_factors \ or not opts.outdir: parser.print_help() @@ -54,12 +56,13 @@ def main(): logging.info('saving model...') logging.info('creating output directory {0}...'.format(opts.outdir)) - subprocess.check_call(['mkdir','-p',opts.outdir]) + subprocess.check_call(['mkdir', '-p', opts.outdir]) - modelfile = get_modelfile(opts.train,opts.outdir) - save_recommender(model,modelfile) + modelfile = get_modelfile(opts.train, opts.outdir) + save_recommender(model, modelfile) logging.info('done') + if __name__ == '__main__': main() diff --git a/mrec/examples/filename_conventions.py b/mrec/examples/filename_conventions.py index 0906827..bfbf778 100644 --- a/mrec/examples/filename_conventions.py +++ b/mrec/examples/filename_conventions.py @@ -10,42 +10,52 @@ import os + def get_testfile(trainfile): filename = os.path.basename(trainfile) - return os.path.join(os.path.dirname(trainfile),filename.replace('train','test')) + return os.path.join(os.path.dirname(trainfile), filename.replace('train', 'test')) + -def get_simsdir(trainfile,outdir): +def get_simsdir(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}-sims'.format(filename)) + return os.path.join(outdir, '{0}-sims'.format(filename)) + -def get_recsdir(trainfile,outdir): +def get_recsdir(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}-recs'.format(filename)) + return os.path.join(outdir, '{0}-recs'.format(filename)) -def get_modelsdir(trainfile,outdir): + +def get_modelsdir(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}-models'.format(filename)) + return os.path.join(outdir, '{0}-models'.format(filename)) + -def get_factorsdir(trainfile,outdir): +def get_factorsdir(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}-factors'.format(filename)) + return os.path.join(outdir, '{0}-factors'.format(filename)) -def get_simsfile(trainfile,outdir): + +def get_simsfile(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}.sims.tsv'.format(filename)) + return os.path.join(outdir, '{0}.sims.tsv'.format(filename)) + -def get_recsfile(trainfile,outdir): +def get_recsfile(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}.recs.tsv'.format(filename)) + return os.path.join(outdir, '{0}.recs.tsv'.format(filename)) + -def get_modelfile(trainfile,outdir): +def get_modelfile(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}.model.npz'.format(filename)) + return os.path.join(outdir, '{0}.model.npz'.format(filename)) -def get_sortedfile(infile,outdir): + +def get_sortedfile(infile, outdir): filename = os.path.basename(infile) - return os.path.join(outdir,'{0}.sorted'.format(filename)) + return os.path.join(outdir, '{0}.sorted'.format(filename)) + -def get_splitfile(infile,outdir,split_type,i): +def get_splitfile(infile, outdir, split_type, i): filename = os.path.basename(infile) - return os.path.join(outdir,'{0}.{1}.{2}'.format(filename,split_type,i)) + return os.path.join(outdir, '{0}.{1}.{2}'.format(filename, split_type, i)) diff --git a/mrec/examples/predict.py b/mrec/examples/predict.py index d699751..07440fd 100644 --- a/mrec/examples/predict.py +++ b/mrec/examples/predict.py @@ -11,29 +11,26 @@ makes it easy to run a cross-validated evaluation. """ -import math import glob +import logging import re -import os import subprocess -from shutil import rmtree -import logging from collections import defaultdict +from shutil import rmtree from mrec import load_sparse_matrix, read_recommender_description, load_recommender -from mrec.parallel import predict -from mrec.mf.recommender import MatrixFactorizationRecommender -from mrec.item_similarity.recommender import ItemSimilarityRecommender - from mrec.examples.filename_conventions import * +from mrec.item_similarity.recommender import ItemSimilarityRecommender +from mrec.mf.recommender import MatrixFactorizationRecommender +from mrec.parallel import predict -ONE_MB = 2**20 +ONE_MB = 2 ** 20 -def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator): - recsdir = get_recsdir(trainfile,opts.outdir) +def process(view, opts, modelfile, trainfile, testfile, featurefile, outdir, evaluator): + recsdir = get_recsdir(trainfile, opts.outdir) logging.info('creating recs directory {0}...'.format(recsdir)) - subprocess.check_call(['mkdir','-p',recsdir]) + subprocess.check_call(['mkdir', '-p', recsdir]) done = [] if not opts.overwrite: @@ -57,7 +54,7 @@ def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator) logging.info('running in parallel across ipython engines...') results = [] - results.append(view.map_async(predict.run,tasks,retries=2)) + results.append(view.map_async(predict.run, tasks, retries=2)) # wait for tasks to complete processed = [r.get() for r in results] @@ -69,10 +66,10 @@ def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator) if remaining == 0: logging.info('SUCCESS: all tasks completed') logging.info('concatenating {0} partial output files...'.format(len(done))) - paths = [os.path.join(recsdir,'recs.{0}-{1}.tsv'.format(start,end)) for start,end in done] - cmd = ['cat']+paths - recsfile = get_recsfile(trainfile,outdir) - subprocess.check_call(cmd,stdout=open(recsfile,'w')) + paths = [os.path.join(recsdir, 'recs.{0}-{1}.tsv'.format(start, end)) for start, end in done] + cmd = ['cat'] + paths + recsfile = get_recsfile(trainfile, outdir) + subprocess.check_call(cmd, stdout=open(recsfile, 'w')) logging.info('removing partial output files...') rmtree(recsdir) logging.info('done') @@ -81,18 +78,19 @@ def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator) avg_metrics = defaultdict(float) tot_count = 0 for results in processed: - for cum_metrics,count in results: - for m,val in cum_metrics.items(): + for cum_metrics, count in results: + for m, val in cum_metrics.items(): avg_metrics[m] += val tot_count += count for m in avg_metrics: avg_metrics[m] /= float(tot_count) else: - logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks))) + logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks))) logging.error('try rerunning the command to retry the remaining tasks') avg_metrics = None - return read_recommender_description(modelfile),avg_metrics + return read_recommender_description(modelfile), avg_metrics + def create_tasks(modelfile, input_format, @@ -105,92 +103,105 @@ def create_tasks(modelfile, mb_per_task, done, evaluator): - users_per_task,num_users = estimate_users_per_task(mb_per_task,input_format,trainfile,modelfile) + users_per_task, num_users = estimate_users_per_task(mb_per_task, input_format, trainfile, modelfile) tasks = [] - for start in range(0,num_users,users_per_task): - end = min(num_users,start+users_per_task) - generate = (start,end) not in done - tasks.append((modelfile,input_format,trainfile,test_input_format,testfile,item_feature_format,featurefile,outdir,start,end,evaluator,generate)) - logging.info('created {0} tasks, {1} users per task'.format(len(tasks),users_per_task)) + for start in range(0, num_users, users_per_task): + end = min(num_users, start + users_per_task) + generate = (start, end) not in done + tasks.append((modelfile, input_format, trainfile, test_input_format, testfile, item_feature_format, featurefile, + outdir, start, end, evaluator, generate)) + logging.info('created {0} tasks, {1} users per task'.format(len(tasks), users_per_task)) return tasks -def estimate_users_per_task(mb_per_task,input_format,trainfile,modelfile): - num_users,num_items,nnz = get_dataset_size(input_format,trainfile) + +def estimate_users_per_task(mb_per_task, input_format, trainfile, modelfile): + num_users, num_items, nnz = get_dataset_size(input_format, trainfile) logging.info('loading model to get size...') model = load_recommender(modelfile) # we load the training and test data on every task # - let's guess that worst case the test data will be the same size - required_mb_per_task = 2*(nnz*16)/ONE_MB - if isinstance(model,MatrixFactorizationRecommender): + required_mb_per_task = 2 * (nnz * 16) / ONE_MB + if isinstance(model, MatrixFactorizationRecommender): # we have to load the factors on every task - required_mb_per_task += ((model.U.size+model.V.size)*16)/ONE_MB + required_mb_per_task += ((model.U.size + model.V.size) * 16) / ONE_MB if mb_per_task > required_mb_per_task: # remaining mem usage is dominated by computed scores: - users_per_task = ((mb_per_task-required_mb_per_task)*ONE_MB) / (num_items*16) - elif isinstance(model,ItemSimilarityRecommender): + users_per_task = ((mb_per_task - required_mb_per_task) * ONE_MB) / (num_items * 16) + elif isinstance(model, ItemSimilarityRecommender): # we have to load the similarity matrix on every task - required_mb_per_task += (model.similarity_matrix.nnz*16)/ONE_MB + required_mb_per_task += (model.similarity_matrix.nnz * 16) / ONE_MB if mb_per_task > required_mb_per_task: # estimate additional usage from avg items per user and sims per item items_per_user = nnz / num_users sims_per_item = model.similarity_matrix.nnz / num_items - users_per_task = ((mb_per_task-required_mb_per_task)*ONE_MB) / (items_per_user*sims_per_item*16) + users_per_task = ((mb_per_task - required_mb_per_task) * ONE_MB) / (items_per_user * sims_per_item * 16) else: # assume nothing else to load users_per_task = num_users if mb_per_task <= required_mb_per_task: - raise RuntimeError('requires at least {0}MB per task, increase --mb_per_task if you can'.format(required_mb_per_task)) + raise RuntimeError( + 'requires at least {0}MB per task, increase --mb_per_task if you can'.format(required_mb_per_task)) return int(users_per_task), int(num_users) -def get_dataset_size(input_format,datafile): + +def get_dataset_size(input_format, datafile): logging.info('loading dataset to get size...') - dataset = load_sparse_matrix(input_format,datafile) - return dataset.shape[0],dataset.shape[1],dataset.nnz + dataset = load_sparse_matrix(input_format, datafile) + return dataset.shape[0], dataset.shape[1], dataset.nnz + def find_done(outdir): - success_files = glob.glob(os.path.join(outdir,'*.SUCCESS')) + success_files = glob.glob(os.path.join(outdir, '*.SUCCESS')) r = re.compile('.*?([0-9]+)-([0-9]+)\.SUCCESS$') done = [] for path in success_files: m = r.match(path) start = int(m.group(1)) end = int(m.group(2)) - done.append((start,end)) + done.append((start, end)) return done -def main(): +def main(): import os from optparse import OptionParser from ipyparallel import Client from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate from mrec.evaluation import Evaluator - from mrec import load_recommender from mrec.evaluation.metrics import print_report - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('--mb_per_task',dest='mb_per_task',type='int',default=None,help='approximate memory limit per task in MB, so total memory usage is num_engines * mb_per_task (default: share all available RAM across engines)') - parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') - parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') - parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') - parser.add_option('--modeldir',dest='modeldir',help='directory containing trained models') - parser.add_option('--outdir',dest='outdir',help='directory for output files') - parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)') - parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') - parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)') - parser.add_option('--overwrite',dest='overwrite',action='store_true',default=False,help='overwrite existing files in outdir (default: %default)') - parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)') - parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') - - metrics_funcs = {'main':compute_main_metrics, - 'hitrate':compute_hit_rate} - - (opts,args) = parser.parse_args() + parser.add_option('--mb_per_task', dest='mb_per_task', type='int', default=None, + help='approximate memory limit per task in MB, so total memory usage is num_engines * mb_per_task (default: share all available RAM across engines)') + parser.add_option('--input_format', dest='input_format', + help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') + parser.add_option('--test_input_format', dest='test_input_format', default='npz', + help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') + parser.add_option('--train', dest='train', + help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') + parser.add_option('--modeldir', dest='modeldir', help='directory containing trained models') + parser.add_option('--outdir', dest='outdir', help='directory for output files') + parser.add_option('--metrics', dest='metrics', default='main', + help='which set of metrics to compute, main|hitrate (default: %default)') + parser.add_option('--item_feature_format', dest='item_feature_format', + help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') + parser.add_option('--item_features', dest='item_features', + help='path to sparse item features in tsv format (item_id,feature_id,val)') + parser.add_option('--overwrite', dest='overwrite', action='store_true', default=False, + help='overwrite existing files in outdir (default: %default)') + parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)') + parser.add_option('--add_module_paths', dest='add_module_paths', + help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') + + metrics_funcs = {'main': compute_main_metrics, + 'hitrate': compute_hit_rate} + + (opts, args) = parser.parse_args() if not opts.input_format or not opts.train or not opts.outdir \ or not opts.modeldir or opts.metrics not in metrics_funcs: parser.print_help() @@ -206,7 +217,7 @@ def main(): if opts.mb_per_task is None: import psutil num_engines = len(view) - opts.mb_per_task = psutil.virtual_memory().available/ONE_MB/(num_engines+1) # don't take *all* the memory + opts.mb_per_task = psutil.virtual_memory().available / ONE_MB / (num_engines + 1) # don't take *all* the memory if opts.add_module_paths: c[:].execute('import sys') @@ -214,7 +225,7 @@ def main(): logging.info('adding {0} to pythonpath on all engines'.format(path)) c[:].execute("sys.path.append('{0}')".format(path)) - evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20) + evaluator = Evaluator(metrics_funcs[opts.metrics], max_items=20) trainfiles = glob.glob(opts.train) @@ -222,9 +233,10 @@ def main(): all_metrics = defaultdict(list) for trainfile in trainfiles: logging.info('processing {0}...'.format(trainfile)) - modelfile = get_modelfile(trainfile,opts.modeldir) + modelfile = get_modelfile(trainfile, opts.modeldir) testfile = get_testfile(trainfile) - description,metrics = process(view,opts,modelfile,trainfile,testfile,opts.item_features,opts.outdir,evaluator) + description, metrics = process(view, opts, modelfile, trainfile, testfile, opts.item_features, opts.outdir, + evaluator) descriptions.add(description) if metrics is not None: for m in metrics: @@ -234,7 +246,8 @@ def main(): if len(descriptions) > 1: logging.warn('You are aggregating metrics from different models! {}'.format(description)) - print_report([description],[all_metrics]) + print_report([description], [all_metrics]) + if __name__ == '__main__': main() diff --git a/mrec/examples/prepare.py b/mrec/examples/prepare.py index 9405ce1..a7bc8ee 100644 --- a/mrec/examples/prepare.py +++ b/mrec/examples/prepare.py @@ -1,44 +1,44 @@ class Processor(object): - - def __init__(self,splitter,parser,min_items_per_user,preprocess=None): + def __init__(self, splitter, parser, min_items_per_user, preprocess=None): self.splitter = splitter self.parser = parser self.min_items_per_user = min_items_per_user self.preprocess = preprocess - def output(self,user,vals,outfile): - for v,c in vals: - print('{0}\t{1}\t{2}'.format(user,v,c), file=outfile) + def output(self, user, vals, outfile): + for v, c in vals: + print('{0}\t{1}\t{2}'.format(user, v, c), file=outfile) - def handle(self,user,vals): + def handle(self, user, vals): if len(vals) >= self.min_items_per_user: if self.preprocess is not None: vals = self.preprocess(vals) - train,test = self.splitter.handle(vals) - self.output(user,train,self.train_out) - self.output(user,test,self.test_out) + train, test = self.splitter.handle(vals) + self.output(user, train, self.train_out) + self.output(user, test, self.test_out) else: self.too_few_items += 1 - def create_split(self,infile,train_out,test_out): + def create_split(self, infile, train_out, test_out): self.train_out = train_out self.test_out = test_out self.too_few_items = 0 last_user = None vals = [] for line in infile: - user,val = self.parser.parse(line) + user, val = self.parser.parse(line) if user != last_user: if last_user is not None: - self.handle(last_user,vals) + self.handle(last_user, vals) last_user = user vals = [] vals.append(val) - self.handle(last_user,vals) + self.handle(last_user, vals) def get_too_few_items(self): return self.too_few_items + def main(): import os import logging @@ -48,22 +48,28 @@ def main(): from mrec.evaluation.preprocessing import TSVParser, SplitCreator from mrec.examples.filename_conventions import get_sortedfile, get_splitfile - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('--dataset',dest='dataset',help='path to input dataset in tsv format') - parser.add_option('--delimiter',dest='delimiter',default='\t',help='input delimiter (default: tab)') - parser.add_option('--outdir',dest='outdir',help='directory for output files') - parser.add_option('--num_splits',dest='num_splits',type='int',default=5,help='number of train/test splits to create (default: %default)') - parser.add_option('--min_items_per_user',dest='min_items_per_user',type='int',default=10,help='skip users with less than this number of ratings (default: %default)') - parser.add_option('--binarize',dest='binarize',action='store_true',default=False,help='binarize ratings') - parser.add_option('--normalize',dest='normalize',action='store_true',help='scale training ratings to unit norm') - parser.add_option('--rating_thresh',dest='rating_thresh',type='float',default=0,help='treat ratings below this as zero (default: %default)') - parser.add_option('--test_size',dest='test_size',type='float',default=0.5,help='target number of test items for each user, if test_size >= 1 treat as an absolute number, otherwise treat as a fraction of the total items (default: %default)') - parser.add_option('--discard_zeros',dest='discard_zeros',action='store_true',help='discard zero training ratings after thresholding (not recommended, incompatible with using training items to guarantee that recommendations are novel)') - parser.add_option('--sample_before_thresholding',dest='sample_before_thresholding',action='store_true',help='choose test items before thresholding ratings (not recommended, test items below threshold will then be discarded)') - - (opts,args) = parser.parse_args() + parser.add_option('--dataset', dest='dataset', help='path to input dataset in tsv format') + parser.add_option('--delimiter', dest='delimiter', default='\t', help='input delimiter (default: tab)') + parser.add_option('--outdir', dest='outdir', help='directory for output files') + parser.add_option('--num_splits', dest='num_splits', type='int', default=5, + help='number of train/test splits to create (default: %default)') + parser.add_option('--min_items_per_user', dest='min_items_per_user', type='int', default=10, + help='skip users with less than this number of ratings (default: %default)') + parser.add_option('--binarize', dest='binarize', action='store_true', default=False, help='binarize ratings') + parser.add_option('--normalize', dest='normalize', action='store_true', help='scale training ratings to unit norm') + parser.add_option('--rating_thresh', dest='rating_thresh', type='float', default=0, + help='treat ratings below this as zero (default: %default)') + parser.add_option('--test_size', dest='test_size', type='float', default=0.5, + help='target number of test items for each user, if test_size >= 1 treat as an absolute number, otherwise treat as a fraction of the total items (default: %default)') + parser.add_option('--discard_zeros', dest='discard_zeros', action='store_true', + help='discard zero training ratings after thresholding (not recommended, incompatible with using training items to guarantee that recommendations are novel)') + parser.add_option('--sample_before_thresholding', dest='sample_before_thresholding', action='store_true', + help='choose test items before thresholding ratings (not recommended, test items below threshold will then be discarded)') + + (opts, args) = parser.parse_args() if not opts.dataset or not opts.outdir: parser.print_help() raise SystemExit @@ -72,30 +78,30 @@ def main(): opts.outdir = os.path.abspath(opts.outdir) logging.info('sorting input data...') - infile = get_sortedfile(opts.dataset,opts.outdir) - subprocess.check_call(['mkdir','-p',opts.outdir]) - subprocess.check_call(['sort','-k1','-n',opts.dataset],stdout=open(infile,'w')) + infile = get_sortedfile(opts.dataset, opts.outdir) + subprocess.check_call(['mkdir', '-p', opts.outdir]) + subprocess.check_call(['sort', '-k1', '-n', opts.dataset], stdout=open(infile, 'w')) - parser = TSVParser(thresh=opts.rating_thresh,binarize=opts.binarize,delimiter=opts.delimiter) - splitter = SplitCreator(test_size=opts.test_size,normalize=opts.normalize,discard_zeros=opts.discard_zeros, + parser = TSVParser(thresh=opts.rating_thresh, binarize=opts.binarize, delimiter=opts.delimiter) + splitter = SplitCreator(test_size=opts.test_size, normalize=opts.normalize, discard_zeros=opts.discard_zeros, sample_before_thresholding=opts.sample_before_thresholding) - processor = Processor(splitter,parser,opts.min_items_per_user) + processor = Processor(splitter, parser, opts.min_items_per_user) for i in range(opts.num_splits): - trainfile = get_splitfile(opts.dataset,opts.outdir,'train',i) - testfile = get_splitfile(opts.dataset,opts.outdir,'test',i) + trainfile = get_splitfile(opts.dataset, opts.outdir, 'train', i) + testfile = get_splitfile(opts.dataset, opts.outdir, 'test', i) - logging.info('creating split {0}: {1} {2}'.format(i,trainfile,testfile)) - processor.create_split(open(infile),open(trainfile,'w'),open(testfile,'w')) + logging.info('creating split {0}: {1} {2}'.format(i, trainfile, testfile)) + processor.create_split(open(infile), open(trainfile, 'w'), open(testfile, 'w')) too_few_items = processor.get_too_few_items() if (too_few_items): - logging.info('skipped {0} users with less than {1} ratings'.format(too_few_items,opts.min_items_per_user)) + logging.info('skipped {0} users with less than {1} ratings'.format(too_few_items, opts.min_items_per_user)) logging.info('cleaning up...') - subprocess.check_call(['rm',infile]) + subprocess.check_call(['rm', infile]) logging.info('done') + if __name__ == '__main__': main() - diff --git a/mrec/examples/train.py b/mrec/examples/train.py index 510eb12..2588be9 100644 --- a/mrec/examples/train.py +++ b/mrec/examples/train.py @@ -12,8 +12,8 @@ from mrec.examples.filename_conventions import * -def main(): +def main(): import os import logging import glob @@ -32,34 +32,54 @@ def main(): from mrec.parallel.wrmf import WRMFRunner from mrec.parallel.warp import WARPMFRunner - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use') - parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') - parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') - parser.add_option('--outdir',dest='outdir',help='directory for output files') - parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir') - parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)') - parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to output for each training item (default: %default)') - parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)') - parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)') - parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)') - parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot (default: %default)') - parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)') - parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)') - parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)') - parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)') - parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)') - parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)') - parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') - parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)') - parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)') - parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)') - parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)') - parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') - - (opts,args) = parser.parse_args() + parser.add_option('-n', '--num_engines', dest='num_engines', type='int', default=0, + help='number of IPython engines to use') + parser.add_option('--input_format', dest='input_format', + help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') + parser.add_option('--train', dest='train', + help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') + parser.add_option('--outdir', dest='outdir', help='directory for output files') + parser.add_option('--overwrite', dest='overwrite', action='store_true', help='overwrite existing files in outdir') + parser.add_option('--model', dest='model', default='slim', + help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)') + parser.add_option('--max_sims', dest='max_sims', type='int', default=100, + help='max similar items to output for each training item (default: %default)') + parser.add_option('--learner', dest='learner', default='sgd', + help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)') + parser.add_option('--l1_reg', dest='l1_reg', type='float', default=0.001, + help='l1 regularization constant (default: %default)') + parser.add_option('--l2_reg', dest='l2_reg', type='float', default=0.0001, + help='l2 regularization constant (default: %default)') + parser.add_option('--metric', dest='metric', default='cosine', + help='metric for knn recommender: cosine | dot (default: %default)') + parser.add_option('--num_factors', dest='num_factors', type='int', default=80, + help='number of latent factors (default: %default)') + parser.add_option('--alpha', dest='alpha', type='float', default=1.0, + help='wrmf confidence constant (default: %default)') + parser.add_option('--lbda', dest='lbda', type='float', default=0.015, + help='wrmf regularization constant (default: %default)') + parser.add_option('--als_iters', dest='als_iters', type='int', default=15, + help='number of als iterations (default: %default)') + parser.add_option('--gamma', dest='gamma', type='float', default=0.01, + help='warp learning rate (default: %default)') + parser.add_option('--C', dest='C', type='float', default=100.0, + help='warp regularization constant (default: %default)') + parser.add_option('--item_feature_format', dest='item_feature_format', + help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') + parser.add_option('--item_features', dest='item_features', + help='path to sparse item features in tsv format (item_id,feature_id,val)') + parser.add_option('--popularity_method', dest='popularity_method', default='count', + help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)') + parser.add_option('--popularity_thresh', dest='popularity_thresh', type='float', default=0, + help='ignore scores below this when computing popularity for baseline recommender (default: %default)') + parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)') + parser.add_option('--add_module_paths', dest='add_module_paths', + help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') + + (opts, args) = parser.parse_args() if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines: parser.print_help() raise SystemExit @@ -71,14 +91,14 @@ def main(): if opts.model == 'popularity': # special case, don't need to run in parallel - subprocess.check_call(['mkdir','-p',opts.outdir]) + subprocess.check_call(['mkdir', '-p', opts.outdir]) for trainfile in trainfiles: logging.info('processing {0}...'.format(trainfile)) - model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh) - dataset = load_fast_sparse_matrix(opts.input_format,trainfile) + model = ItemPopularityRecommender(method=opts.popularity_method, thresh=opts.popularity_thresh) + dataset = load_fast_sparse_matrix(opts.input_format, trainfile) model.fit(dataset) - modelfile = get_modelfile(trainfile,opts.outdir) - save_recommender(model,modelfile) + modelfile = get_modelfile(trainfile, opts.outdir) + save_recommender(model, modelfile) logging.info('done') return @@ -95,9 +115,10 @@ def main(): if opts.model == 'slim': if opts.learner == 'fs_sgd': num_selected_features = 2 * opts.max_sims # preselect this many candidate similar items - model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features) + model = SLIM(l1_reg=opts.l1_reg, l2_reg=opts.l2_reg, model=opts.learner, + num_selected_features=num_selected_features) else: - model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner) + model = SLIM(l1_reg=opts.l1_reg, l2_reg=opts.l2_reg, model=opts.learner) elif opts.model == 'knn': if opts.metric == 'cosine': model = CosineKNNRecommender(k=opts.max_sims) @@ -107,33 +128,36 @@ def main(): parser.print_help() raise SystemExit('unknown metric: {0}'.format(opts.metric)) elif opts.model == 'wrmf': - model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters) + model = WRMFRecommender(d=opts.num_factors, alpha=opts.alpha, lbda=opts.lbda, num_iters=opts.als_iters) elif opts.model == 'warp': - num_factors_per_engine = max(opts.num_factors/opts.num_engines,1) + num_factors_per_engine = max(opts.num_factors / opts.num_engines, 1) if opts.item_features: - model = WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C) + model = WARP2MFRecommender(d=num_factors_per_engine, gamma=opts.gamma, C=opts.C) else: - model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C) + model = WARPMFRecommender(d=num_factors_per_engine, gamma=opts.gamma, C=opts.C) else: parser.print_help() raise SystemExit('unknown model type: {0}'.format(opts.model)) for trainfile in trainfiles: logging.info('processing {0}...'.format(trainfile)) - modelfile = get_modelfile(trainfile,opts.outdir) + modelfile = get_modelfile(trainfile, opts.outdir) if opts.model == 'wrmf': runner = WRMFRunner() - factorsdir = get_factorsdir(trainfile,opts.outdir) - runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile) + factorsdir = get_factorsdir(trainfile, opts.outdir) + runner.run(view, model, opts.input_format, trainfile, opts.num_engines, factorsdir, modelfile) elif opts.model == 'warp': runner = WARPMFRunner() - modelsdir = get_modelsdir(trainfile,opts.outdir) - runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile) + modelsdir = get_modelsdir(trainfile, opts.outdir) + runner.run(view, model, opts.input_format, trainfile, opts.item_feature_format, opts.item_features, + opts.num_engines, modelsdir, opts.overwrite, modelfile) else: runner = ItemSimilarityRunner() - simsdir = get_simsdir(trainfile,opts.outdir) - simsfile = get_simsfile(trainfile,opts.outdir) - runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile) + simsdir = get_simsdir(trainfile, opts.outdir) + simsfile = get_simsfile(trainfile, opts.outdir) + runner.run(view, model, opts.input_format, trainfile, opts.num_engines, simsdir, opts.overwrite, + opts.max_sims, simsfile, modelfile) + if __name__ == '__main__': main() diff --git a/mrec/examples/tune_slim.py b/mrec/examples/tune_slim.py index a1ea762..1f82ce2 100644 --- a/mrec/examples/tune_slim.py +++ b/mrec/examples/tune_slim.py @@ -3,11 +3,12 @@ constants for SLIM by looking at model sparsity. """ +import logging import random from math import log10 -import logging from operator import itemgetter from optparse import OptionParser + try: from sklearn.grid_search import ParameterGrid except ImportError: @@ -16,63 +17,76 @@ from mrec import load_fast_sparse_matrix + def estimate_sparsity(task): from mrec.item_similarity.slim import SLIM - args,dataset,min_nnz,sample_items = task + args, dataset, min_nnz, sample_items = task model = SLIM(**args) tot_nnz = 0 tot_neg = 0 below_min_nnz = 0 for i in sample_items: - w = model.compute_similarities(dataset,i) - nnz = sum(w>0) + w = model.compute_similarities(dataset, i) + nnz = sum(w > 0) tot_nnz += nnz if nnz < min_nnz: below_min_nnz += 1 - tot_neg += sum(w<0) + tot_neg += sum(w < 0) num_samples = len(sample_items) - avg_nnz = float(tot_nnz)/num_samples - too_few_sims = float(below_min_nnz)/num_samples - avg_neg = float(tot_neg)/num_samples - return args,avg_nnz,too_few_sims,avg_neg + avg_nnz = float(tot_nnz) / num_samples + too_few_sims = float(below_min_nnz) / num_samples + avg_neg = float(tot_neg) / num_samples + return args, avg_nnz, too_few_sims, avg_neg + + +def pow_range(small, big): + return [10 ** v for v in range(int(log10(small)), int(log10(big)) + 1)] -def pow_range(small,big): - return [10**v for v in range(int(log10(small)),int(log10(big))+1)] def main(): parser = OptionParser() - parser.add_option('-d','--dataset',dest='dataset',help='path to dataset') - parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') - parser.add_option('--l1_min',dest='l1_min',type='float',help='min l1 constant to try (expected to be a power of 10)') - parser.add_option('--l1_max',dest='l1_max',type='float',help='max l1 constant to try (expected to be a power of 10)') - parser.add_option('--l2_min',dest='l2_min',type='float',help='min l2 constant to try (expected to be a power of 10)') - parser.add_option('--l2_max',dest='l2_max',type='float',help='max l2 constant to try (expected to be a power of 10)') - parser.add_option('--max_sims',dest='max_sims',type='int',default=2000,help='max desired number of positive item similarity weights (default: %default)') - parser.add_option('--min_sims',dest='min_sims',type='int',default=15,help='min desired number of positive item similarity weights (default: %default)') - parser.add_option('--max_sparse',dest='max_sparse',type='float',default=0.01,help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)') - parser.add_option('--num_samples',dest='num_samples',type='int',default=100,help='number of sample items to evaluate for each regularization setting') - parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)') - parser.add_option('--add_module_paths',dest='add_module_paths',help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules') - - (opts,args) = parser.parse_args() + parser.add_option('-d', '--dataset', dest='dataset', help='path to dataset') + parser.add_option('--input_format', dest='input_format', + help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') + parser.add_option('--l1_min', dest='l1_min', type='float', + help='min l1 constant to try (expected to be a power of 10)') + parser.add_option('--l1_max', dest='l1_max', type='float', + help='max l1 constant to try (expected to be a power of 10)') + parser.add_option('--l2_min', dest='l2_min', type='float', + help='min l2 constant to try (expected to be a power of 10)') + parser.add_option('--l2_max', dest='l2_max', type='float', + help='max l2 constant to try (expected to be a power of 10)') + parser.add_option('--max_sims', dest='max_sims', type='int', default=2000, + help='max desired number of positive item similarity weights (default: %default)') + parser.add_option('--min_sims', dest='min_sims', type='int', default=15, + help='min desired number of positive item similarity weights (default: %default)') + parser.add_option('--max_sparse', dest='max_sparse', type='float', default=0.01, + help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)') + parser.add_option('--num_samples', dest='num_samples', type='int', default=100, + help='number of sample items to evaluate for each regularization setting') + parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)') + parser.add_option('--add_module_paths', dest='add_module_paths', + help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules') + + (opts, args) = parser.parse_args() if not opts.dataset or not opts.input_format or not opts.l1_min or not opts.l1_max or not opts.l2_min or not opts.l2_max: parser.print_help() raise SystemExit - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') - dataset = load_fast_sparse_matrix(opts.input_format,opts.dataset) + dataset = load_fast_sparse_matrix(opts.input_format, opts.dataset) - params = {'l1_reg':pow_range(opts.l1_min,opts.l1_max), - 'l2_reg':pow_range(opts.l2_min,opts.l2_max)} + params = {'l1_reg': pow_range(opts.l1_min, opts.l1_max), + 'l2_reg': pow_range(opts.l2_min, opts.l2_max)} num_items = dataset.shape[1] - sample_items = random.sample(range(num_items),opts.num_samples) + sample_items = random.sample(range(num_items), opts.num_samples) logging.info('preparing tasks for a grid search of these values:') logging.info(params) - tasks = [(args,dataset,opts.min_sims,sample_items) for args in ParameterGrid(params)] + tasks = [(args, dataset, opts.min_sims, sample_items) for args in ParameterGrid(params)] c = Client(packer=opts.packer) view = c.load_balanced_view() @@ -84,19 +98,22 @@ def main(): c[:].execute("sys.path.append('{0}')".format(path)) logging.info('running {0} tasks in parallel...'.format(len(tasks))) - results = view.map(estimate_sparsity,tasks,ordered=False) + results = view.map(estimate_sparsity, tasks, ordered=False) - candidates = [(args,nsims,nsparse,nneg) for args,nsims,nsparse,nneg in results if nsims <= opts.max_sims and nsparse <= opts.max_sparse] + candidates = [(args, nsims, nsparse, nneg) for args, nsims, nsparse, nneg in results if + nsims <= opts.max_sims and nsparse <= opts.max_sparse] if candidates: - best = min(candidates,key=itemgetter(1)) + best = min(candidates, key=itemgetter(1)) print('best parameter setting: {0}'.format(best[0])) print('mean # positive similarity weights per item = {0:.3}'.format(best[1])) - print('proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims,best[2])) + print('proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims, + best[2])) print('mean # negative similarity weights per item = {0:.3}'.format(best[3])) else: print('no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse') + if __name__ == '__main__': main() diff --git a/mrec/item_similarity/knn.py b/mrec/item_similarity/knn.py index be57901..d620078 100644 --- a/mrec/item_similarity/knn.py +++ b/mrec/item_similarity/knn.py @@ -5,8 +5,10 @@ import numpy as np from sklearn.metrics.pairwise import cosine_similarity + from mrec.item_similarity.recommender import ItemSimilarityRecommender + class KNNRecommender(ItemSimilarityRecommender): """ Abstract base class for k-nn recommenders. You must supply an @@ -18,21 +20,21 @@ class KNNRecommender(ItemSimilarityRecommender): The number of nearest neighbouring items to retain """ - def __init__(self,k): + def __init__(self, k): self.k = k - def compute_similarities(self,dataset,j): + def compute_similarities(self, dataset, j): A = dataset.X a = dataset.fast_get_col(j) - d = self.compute_all_similarities(A,a) + d = self.compute_all_similarities(A, a) d[j] = 0 # zero out self-similarity # now zero out similarities for all but top-k items - nn = d.argsort()[-1:-1-self.k:-1] + nn = d.argsort()[-1:-1 - self.k:-1] w = np.zeros(A.shape[1]) w[nn] = d[nn] return w - def compute_all_similarities(self,A,a): + def compute_all_similarities(self, A, a): """ Compute similarity scores between item vector a and all the rows of A. @@ -51,29 +53,32 @@ def compute_all_similarities(self,A,a): """ pass + class DotProductKNNRecommender(KNNRecommender): """ Similarity between two items is their dot product (i.e. cooccurrence count if input data is binary). """ - def compute_all_similarities(self,A,a): + def compute_all_similarities(self, A, a): return A.T.dot(a).toarray().flatten() def __str__(self): return 'DotProductKNNRecommender(k={0})'.format(self.k) + class CosineKNNRecommender(KNNRecommender): """ Similarity between two items is their cosine distance. """ - def compute_all_similarities(self,A,a): - return cosine_similarity(A.T,a.T).flatten() + def compute_all_similarities(self, A, a): + return cosine_similarity(A.T, a.T).flatten() def __str__(self): return 'CosineKNNRecommender(k={0})'.format(self.k) + if __name__ == '__main__': # use knn models like this: @@ -99,23 +104,23 @@ def __str__(self): 3 4 1 """ print(data) - dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data)) - num_users,num_items = dataset.shape + dataset = load_fast_sparse_matrix('mm', StringIO.StringIO(data)) + num_users, num_items = dataset.shape model = CosineKNNRecommender(k=2) num_samples = 2 - def output(i,j,val): + def output(i, j, val): # convert back to 1-indexed - print('{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val)) + print('{0}\t{1}\t{2:.3f}'.format(i + 1, j + 1, val)) print('computing some item similarities...') print('item\tsim\tweight') # if we want we can compute these individually without calling fit() - for i in random.sample(range(num_items),num_samples): - for j,weight in model.get_similar_items(i,max_similar_items=2,dataset=dataset): - output(i,j,weight) + for i in random.sample(range(num_items), num_samples): + for j, weight in model.get_similar_items(i, max_similar_items=2, dataset=dataset): + output(i, j, weight) print('learning entire similarity matrix...') # more usually we just call train() on the entire dataset @@ -123,19 +128,19 @@ def output(i,j,val): model.fit(dataset) print('making some recommendations...') print('user\trec\tscore') - for u in random.sample(range(num_users),num_samples): - for i,score in model.recommend_items(dataset.X,u,max_items=10): - output(u,i,score) + for u in random.sample(range(num_users), num_samples): + for i, score in model.recommend_items(dataset.X, u, max_items=10): + output(u, i, score) print('making batch recommendations...') recs = model.batch_recommend_items(dataset.X) for u in range(num_users): - for i,score in recs[u]: - output(u,i,score) + for i, score in recs[u]: + output(u, i, score) print('making range recommendations...') - for start,end in [(0,2),(2,3)]: - recs = model.range_recommend_items(dataset.X,start,end) - for u in range(start,end): - for i,score in recs[u-start]: - output(u,i,score) + for start, end in [(0, 2), (2, 3)]: + recs = model.range_recommend_items(dataset.X, start, end) + for u in range(start, end): + for i, score in recs[u - start]: + output(u, i, score) diff --git a/mrec/item_similarity/precomputed.py b/mrec/item_similarity/precomputed.py index 139bed8..ea2bcb1 100644 --- a/mrec/item_similarity/precomputed.py +++ b/mrec/item_similarity/precomputed.py @@ -4,6 +4,7 @@ from mrec.item_similarity.recommender import ItemSimilarityRecommender + class PrecomputedItemSimilarityRecommender(ItemSimilarityRecommender): """ Wrapper class to make recommendations using a precomputed item similarity matrix. @@ -16,18 +17,17 @@ class PrecomputedItemSimilarityRecommender(ItemSimilarityRecommender): The precomputed item similarity matrix. """ - - def __init__(self,description,similarity_matrix): + def __init__(self, description, similarity_matrix): self.description = description self.set_similarity_matrix(similarity_matrix) - def set_similarity_matrix(self,similarity_matrix): + def set_similarity_matrix(self, similarity_matrix): self.similarity_matrix = similarity_matrix - def compute_similarities(self,j): - return self.similarity_matrix[j,:] + def compute_similarities(self, j): + return self.similarity_matrix[j, :] - def fit(self,dataset,item_features=None): + def fit(self, dataset, item_features=None): pass def __str__(self): diff --git a/mrec/item_similarity/recommender.py b/mrec/item_similarity/recommender.py index d263413..0e35ce3 100644 --- a/mrec/item_similarity/recommender.py +++ b/mrec/item_similarity/recommender.py @@ -6,12 +6,14 @@ import cPickle as pickle except ImportError: import pickle -import numpy as np from operator import itemgetter + +import numpy as np from scipy.sparse import csr_matrix, coo_matrix -from mrec.sparse import fast_sparse_matrix from mrec.base_recommender import BaseRecommender +from mrec.sparse import fast_sparse_matrix + class ItemSimilarityRecommender(BaseRecommender): """ @@ -20,7 +22,7 @@ class ItemSimilarityRecommender(BaseRecommender): need to supply the compute_similarities() method. """ - def fit(self,dataset,item_features=None): + def fit(self, dataset, item_features=None): """ Learn the complete similarity matrix from a user-item matrix. @@ -32,22 +34,22 @@ def fit(self,dataset,item_features=None): item_features : array_like, shape = [num_items, num_features] Features for items in training set, ignored here. """ - if not isinstance(dataset,fast_sparse_matrix): + if not isinstance(dataset, fast_sparse_matrix): dataset = fast_sparse_matrix(dataset) - num_users,num_items = dataset.shape + num_users, num_items = dataset.shape # build up a sparse similarity matrix data = [] row = [] col = [] for j in range(num_items): - w = self.compute_similarities(dataset,j) - for k,v in enumerate(w): + w = self.compute_similarities(dataset, j) + for k, v in enumerate(w): if v != 0: data.append(v) row.append(j) col.append(k) - idx = np.array([row,col],dtype='int32') - self.similarity_matrix = csr_matrix((data,idx),(num_items,num_items)) + idx = np.array([row, col], dtype='int32') + self.similarity_matrix = csr_matrix((data, idx), (num_items, num_items)) def _create_archive(self): """ @@ -65,17 +67,17 @@ def _create_archive(self): self.similarity_matrix = None m = pickle.dumps(self) self.similarity_matrix = tmp - if isinstance(self.similarity_matrix,np.ndarray): - archive = {'mat':self.similarity_matrix,'model':m} - elif isinstance(self.similarity_matrix,csr_matrix): + if isinstance(self.similarity_matrix, np.ndarray): + archive = {'mat': self.similarity_matrix, 'model': m} + elif isinstance(self.similarity_matrix, csr_matrix): d = self.similarity_matrix.tocoo(copy=False) - archive = {'row':d.row,'col':d.col,'data':d.data,'shape':d.shape,'model':m} + archive = {'row': d.row, 'col': d.col, 'data': d.data, 'shape': d.shape, 'model': m} else: # similarity matrix has unexpected type archive = None return archive - def _load_archive(self,archive): + def _load_archive(self, archive): """ Load fields from a numpy archive. """ @@ -86,11 +88,11 @@ def _load_archive(self,archive): row = archive['row'] col = archive['col'] shape = archive['shape'] - self.similarity_matrix = coo_matrix((data,(row,col)),shape=shape).tocsr() + self.similarity_matrix = coo_matrix((data, (row, col)), shape=shape).tocsr() else: raise IOError('unexpected serialization format, cannot find similarity matrix') - def load_similarity_matrix(self,filepath,num_items,offset=1): + def load_similarity_matrix(self, filepath, num_items, offset=1): """ Load a precomputed similarity matrix from tsv. @@ -104,13 +106,13 @@ def load_similarity_matrix(self,filepath,num_items,offset=1): Item index offset i.e. 1 if indices in file are 1-indexed. """ y = np.loadtxt(filepath) - row = y[:,0] - col = y[:,1] - data = y[:,2] - idx = np.array([row,col],dtype='int32')-offset - self.similarity_matrix = csr_matrix((data,idx),(num_items,num_items)) + row = y[:, 0] + col = y[:, 1] + data = y[:, 2] + idx = np.array([row, col], dtype='int32') - offset + self.similarity_matrix = csr_matrix((data, idx), (num_items, num_items)) - def compute_similarities(self,dataset,j): + def compute_similarities(self, dataset, j): """ Compute pairwise similarity scores between the j-th item and every item in the dataset. @@ -129,7 +131,7 @@ def compute_similarities(self,dataset,j): """ pass - def get_similar_items(self,j,max_similar_items=30,dataset=None): + def get_similar_items(self, j, max_similar_items=30, dataset=None): """ Get the most similar items to a supplied item. @@ -149,16 +151,16 @@ def get_similar_items(self,j,max_similar_items=30,dataset=None): Sorted list of similar items, best first. Each entry is a tuple of the form (i,score). """ - if hasattr(self,'similarity_matrix') and self.similarity_matrix is not None: - w = zip(self.similarity_matrix[j].indices,self.similarity_matrix[j].data) - sims = sorted(w,key=itemgetter(1),reverse=True)[:max_similar_items] - sims = [(i,f) for i,f in sims if f > 0] + if hasattr(self, 'similarity_matrix') and self.similarity_matrix is not None: + w = zip(self.similarity_matrix[j].indices, self.similarity_matrix[j].data) + sims = sorted(w, key=itemgetter(1), reverse=True)[:max_similar_items] + sims = [(i, f) for i, f in sims if f > 0] else: - w = self.compute_similarities(dataset,j) - sims = [(i,w[i]) for i in w.argsort()[-1:-max_similar_items-1:-1] if w[i] > 0] + w = self.compute_similarities(dataset, j) + sims = [(i, w[i]) for i in w.argsort()[-1:-max_similar_items - 1:-1] if w[i] > 0] return sims - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend new items for a user. Assumes you've already called fit() to learn the similarity matrix. @@ -191,7 +193,7 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features for i in r.argsort()[::-1]: if i not in known_items: if return_scores: - recs.append((i,r[i])) + recs.append((i, r[i])) else: recs.append(i) if len(recs) >= max_items: @@ -231,7 +233,8 @@ def batch_recommend_items(self, r = dataset * self.similarity_matrix.T except AttributeError: raise AttributeError('you must call fit() before trying to recommend items') - return self._get_recommendations_from_predictions(r,dataset,0,r.shape[0],max_items,return_scores,show_progress) + return self._get_recommendations_from_predictions(r, dataset, 0, r.shape[0], max_items, return_scores, + show_progress) def range_recommend_items(self, dataset, @@ -266,12 +269,13 @@ def range_recommend_items(self, else just a list of idxs. """ try: - r = dataset[user_start:user_end,:] * self.similarity_matrix.T + r = dataset[user_start:user_end, :] * self.similarity_matrix.T except AttributeError: raise AttributeError('you must call fit() before trying to recommend items') - return self._get_recommendations_from_predictions(r,dataset,user_start,user_end,max_items,return_scores) + return self._get_recommendations_from_predictions(r, dataset, user_start, user_end, max_items, return_scores) - def _get_recommendations_from_predictions(self,r,dataset,user_start,user_end,max_items,return_scores=True,show_progress=False): + def _get_recommendations_from_predictions(self, r, dataset, user_start, user_end, max_items, return_scores=True, + show_progress=False): """ Select recommendations given predicted scores/ratings. @@ -298,17 +302,17 @@ def _get_recommendations_from_predictions(self,r,dataset,user_start,user_end,max Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - r = self._zero_known_item_scores(r,dataset[user_start:user_end,:]) - recs = [[] for u in range(user_start,user_end)] - for u in range(user_start,user_end): + r = self._zero_known_item_scores(r, dataset[user_start:user_end, :]) + recs = [[] for u in range(user_start, user_end)] + for u in range(user_start, user_end): ux = u - user_start - if show_progress and ux%1000 == 0: - print(ux,'..',) - ru = r[ux,:] + if show_progress and ux % 1000 == 0: + print(ux, '..', ) + ru = r[ux, :] if return_scores: - recs[ux] = [(i,v) for v,i in sorted(zip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] + recs[ux] = [(i, v) for v, i in sorted(zip(ru.data, ru.indices), reverse=True) if v > 0][:max_items] else: - recs[ux] = [i for v,i in sorted(zip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] + recs[ux] = [i for v, i in sorted(zip(ru.data, ru.indices), reverse=True) if v > 0][:max_items] if show_progress: print() return recs diff --git a/mrec/item_similarity/slim.py b/mrec/item_similarity/slim.py index 0b02c40..ccd426e 100644 --- a/mrec/item_similarity/slim.py +++ b/mrec/item_similarity/slim.py @@ -11,10 +11,9 @@ http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf """ -from sklearn.linear_model import SGDRegressor, ElasticNet -from sklearn.preprocessing import binarize -import sklearn import numpy as np +import sklearn +from sklearn.linear_model import SGDRegressor, ElasticNet from mrec.item_similarity.recommender import ItemSimilarityRecommender @@ -30,16 +29,16 @@ class NNFeatureSelectingSGDRegressor(object): Wraps nearest-neighbour feature selection and regression in a single model. """ - def __init__(self,model,k): + def __init__(self, model, k): self.model = model self.k = k - def fit(self,A,a): + def fit(self, A, a): # find k-NN by brute force d = A.T.dot(a).flatten() # distance = dot product - nn = d.argsort()[-1:-1-self.k:-1] + nn = d.argsort()[-1:-1 - self.k:-1] # fit the model to selected features only - self.model.fit(A[:,nn],a) + self.model.fit(A[:, nn], a) # set our weights for the selected "features" i.e. items self.coef_ = np.zeros(A.shape[1]) self.coef_[nn] = self.model.coef_ @@ -47,6 +46,7 @@ def fit(self,A,a): def __str__(self): return 'NN-feature selecting {0}'.format(self.model) + class SLIM(ItemSimilarityRecommender): """ Parameters @@ -68,6 +68,7 @@ class SLIM(ItemSimilarityRecommender): :elasticnet: ElasticNet :fs_sgd: NNFeatureSelectingSGDRegressor """ + def __init__(self, l1_reg=0.001, l2_reg=0.0001, @@ -75,39 +76,40 @@ def __init__(self, ignore_negative_weights=False, num_selected_features=200, model='sgd'): - alpha = l1_reg+l2_reg - l1_ratio = l1_reg/alpha + alpha = l1_reg + l2_reg + l1_ratio = l1_reg / alpha if parse_version(sklearn.__version__) <= (0, 14, 1): # Backward compat: in old versions of scikit-learn l1_ratio had # the opposite sign... l1_ratio = (1 - l1_ratio) if model == 'sgd': - self.model = SGDRegressor(penalty='elasticnet',fit_intercept=fit_intercept,alpha=alpha,l1_ratio=l1_ratio) + self.model = SGDRegressor(penalty='elasticnet', fit_intercept=fit_intercept, alpha=alpha, l1_ratio=l1_ratio) elif model == 'elasticnet': - self.model = ElasticNet(alpha=alpha,l1_ratio=l1_ratio,positive=True,fit_intercept=fit_intercept,copy_X=False) + self.model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, positive=True, fit_intercept=fit_intercept, + copy_X=False) elif model == 'fs_sgd': - m = SGDRegressor(penalty='elasticnet',fit_intercept=fit_intercept,alpha=alpha,l1_ratio=l1_ratio) - self.model = NNFeatureSelectingSGDRegressor(m,num_selected_features) + m = SGDRegressor(penalty='elasticnet', fit_intercept=fit_intercept, alpha=alpha, l1_ratio=l1_ratio) + self.model = NNFeatureSelectingSGDRegressor(m, num_selected_features) else: raise SystemExit('unknown model type: {0}'.format(model)) self.ignore_negative_weights = ignore_negative_weights - def compute_similarities(self,dataset,j): + def compute_similarities(self, dataset, j): """Compute item similarity weights for item j.""" # zero out the j-th column of the input so we get w[j] = 0 a = dataset.fast_get_col(j) - dataset.fast_update_col(j,np.zeros(a.nnz)) - self.model.fit(dataset.X,a.toarray().ravel()) + dataset.fast_update_col(j, np.zeros(a.nnz)) + self.model.fit(dataset.X, a.toarray().ravel()) # reinstate the j-th column - dataset.fast_update_col(j,a.data) + dataset.fast_update_col(j, a.data) w = self.model.coef_ if self.ignore_negative_weights: - w[w<0] = 0 + w[w < 0] = 0 return w - def compute_similarities_from_vec(self,dataset,a): + def compute_similarities_from_vec(self, dataset, a): """Compute item similarity weights for out-of-dataset item vector.""" - self.model.fit(dataset.X,a) + self.model.fit(dataset.X, a) return self.model.coef_ def __str__(self): @@ -116,6 +118,7 @@ def __str__(self): else: return 'SLIM({0})'.format(self.model) + if __name__ == '__main__': # use SLIM like this: @@ -141,23 +144,23 @@ def __str__(self): 3 4 1 """ print(data) - dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data)) - num_users,num_items = dataset.shape + dataset = load_fast_sparse_matrix('mm', StringIO.StringIO(data)) + num_users, num_items = dataset.shape model = SLIM() num_samples = 2 - def output(i,j,val): + def output(i, j, val): # convert back to 1-indexed - print('{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val)) + print('{0}\t{1}\t{2:.3f}'.format(i + 1, j + 1, val)) print('computing some item similarities...') print('item\tsim\tweight') # if we want we can compute these individually without calling fit() - for i in random.sample(range(num_items),num_samples): - for j,weight in model.get_similar_items(i,max_similar_items=10,dataset=dataset): - output(i,j,weight) + for i in random.sample(range(num_items), num_samples): + for j, weight in model.get_similar_items(i, max_similar_items=10, dataset=dataset): + output(i, j, weight) print('learning entire similarity matrix...') # usually we'll call train() on the entire dataset @@ -165,19 +168,19 @@ def output(i,j,val): model.fit(dataset) print('making some recommendations...') print('user\trec\tscore') - for u in random.sample(range(num_users),num_samples): - for i,score in model.recommend_items(dataset.X,u,max_items=10): - output(u,i,score) + for u in random.sample(range(num_users), num_samples): + for i, score in model.recommend_items(dataset.X, u, max_items=10): + output(u, i, score) print('making batch recommendations...') recs = model.batch_recommend_items(dataset.X) for u in range(num_users): - for i,score in recs[u]: - output(u,i,score) + for i, score in recs[u]: + output(u, i, score) print('making range recommendations...') - for start,end in [(0,2),(2,3)]: - recs = model.range_recommend_items(dataset.X,start,end) - for u in range(start,end): - for i,score in recs[u-start]: - output(u,i,score) + for start, end in [(0, 2), (2, 3)]: + recs = model.range_recommend_items(dataset.X, start, end) + for u in range(start, end): + for i, score in recs[u - start]: + output(u, i, score) diff --git a/mrec/mf/climf.py b/mrec/mf/climf.py index e144610..a9cb1f2 100644 --- a/mrec/mf/climf.py +++ b/mrec/mf/climf.py @@ -11,7 +11,7 @@ """ from math import exp, log -import random + import numpy as np from mrec.mf.recommender import MatrixFactorizationRecommender @@ -22,32 +22,33 @@ def g(x): """sigmoid function""" - return 1/(1+exp(-x)) + return 1 / (1 + exp(-x)) + def dg(x): """derivative of sigmoid function""" - return exp(x)/(1+exp(x))**2 + return exp(x) / (1 + exp(x)) ** 2 -class CLiMFRecommender(MatrixFactorizationRecommender): - def __init__(self,d,lbda=0.01,gamma=0.01,max_iters=25): +class CLiMFRecommender(MatrixFactorizationRecommender): + def __init__(self, d, lbda=0.01, gamma=0.01, max_iters=25): self.d = d self.lbda = lbda self.gamma = gamma self.max_iters = max_iters - def fit(self,data): - self.U = 0.01*np.random.random_sample((data.shape[0],self.d)) - self.V = 0.01*np.random.random_sample((data.shape[1],self.d)) + def fit(self, data): + self.U = 0.01 * np.random.random_sample((data.shape[0], self.d)) + self.V = 0.01 * np.random.random_sample((data.shape[1], self.d)) # TODO: create a validation set for iter in range(self.max_iters): - print('iteration {0}:'.format(iter+1)) + print('iteration {0}:'.format(iter + 1)) print('objective = {0:.4f}'.format(self.objective(data))) self.update(data) # TODO: compute MRR on validation set, terminate if appropriate - def precompute_f(self,data,i): + def precompute_f(self, data, i): """ precompute f[j] = @@ -61,10 +62,10 @@ def precompute_f(self,data,i): dot products for all j in data[i] """ items = data[i].indices - f = dict((j,np.dot(self.U[i],self.V[j])) for j in items) + f = dict((j, np.dot(self.U[i], self.V[j])) for j in items) return f - def objective(self,data): + def objective(self, data): """ compute objective function F(U,V) @@ -76,16 +77,16 @@ def objective(self,data): returns: current value of F(U,V) """ - F = -0.5*self.lbda*(np.sum(self.U*self.U)+np.sum(self.V*self.V)) + F = -0.5 * self.lbda * (np.sum(self.U * self.U) + np.sum(self.V * self.V)) for i in range(len(self.U)): - f = self.precompute_f(data,i) + f = self.precompute_f(data, i) for j in f: F += log(g(f[j])) for k in f: - F += log(1-g(f[k]-f[j])) + F += log(1 - g(f[k] - f[j])) return F - def update(self,data): + def update(self, data): """ update user/item factors using stochastic gradient ascent @@ -97,19 +98,19 @@ def update(self,data): gamma: learning rate """ for i in range(len(self.U)): - dU = -self.lbda*self.U[i] - f = self.precompute_f(data,i) + dU = -self.lbda * self.U[i] + f = self.precompute_f(data, i) for j in f: - dV = g(-f[j])-self.lbda*self.V[j] + dV = g(-f[j]) - self.lbda * self.V[j] for k in f: - dV += dg(f[j]-f[k])*(1/(1-g(f[k]-f[j]))-1/(1-g(f[j]-f[k])))*self.U[i] - self.V[j] += self.gamma*dV - dU += g(-f[j])*self.V[j] + dV += dg(f[j] - f[k]) * (1 / (1 - g(f[k] - f[j])) - 1 / (1 - g(f[j] - f[k]))) * self.U[i] + self.V[j] += self.gamma * dV + dU += g(-f[j]) * self.V[j] for k in f: - dU += (self.V[j]-self.V[k])*dg(f[k]-f[j])/(1-g(f[k]-f[j])) - self.U[i] += self.gamma*dU + dU += (self.V[j] - self.V[k]) * dg(f[k] - f[j]) / (1 - g(f[k] - f[j])) + self.U[i] += self.gamma * dU - def compute_mrr(self,data,test_users=None): + def compute_mrr(self, data, test_users=None): """ compute average Mean Reciprocal Rank of data according to factors @@ -125,23 +126,24 @@ def compute_mrr(self,data,test_users=None): mrr = [] if test_users is None: test_users = range(len(self.U)) - for ix,i in enumerate(test_users): + for ix, i in enumerate(test_users): items = set(data[i].indices) if not items: continue - predictions = np.sum(np.tile(self.U[i],(len(self.V),1))*self.V,axis=1) + predictions = np.sum(np.tile(self.U[i], (len(self.V), 1)) * self.V, axis=1) found = False - for rank,item in enumerate(np.argsort(predictions)[::-1]): + for rank, item in enumerate(np.argsort(predictions)[::-1]): if item in items: - mrr.append(1.0/(rank+1)) + mrr.append(1.0 / (rank + 1)) found = True break if not found: - print('fail, no relevant items predicted for test user {0}'.format(i+1)) + print('fail, no relevant items predicted for test user {0}'.format(i + 1)) print('known items: {0}'.format(items)) - assert(len(mrr) == len(test_users)) + assert (len(mrr) == len(test_users)) return np.mean(mrr) + def main(): import sys from mrec import load_sparse_matrix, save_recommender @@ -152,13 +154,15 @@ def main(): outfile = sys.argv[3] # load training set as scipy sparse matrix - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) model = CLiMFRecommender(d=5) model.fit(train) - save_recommender(model,outfile) + save_recommender(model, outfile) + if __name__ == '__main__': import cProfile + cProfile.run('main()') diff --git a/mrec/mf/evaluate.py b/mrec/mf/evaluate.py index 393bc6f..eec3067 100644 --- a/mrec/mf/evaluate.py +++ b/mrec/mf/evaluate.py @@ -1,6 +1,7 @@ -def retrain_recommender(model,dataset): +def retrain_recommender(model, dataset): model.fit(dataset.X) + if __name__ == '__main__': try: @@ -13,17 +14,20 @@ def retrain_recommender(model,dataset): from mrec.evaluation.metrics import * parser = OptionParser() - parser.add_option('-m','--main_split_dir',dest='main_split_dir',help='directory containing 50/50 splits for main evaluation') - parser.add_option('-l','--loo_split_dir',dest='loo_split_dir',help='directory containing LOO splits for hit rate evaluation') - parser.add_option('-n','--num_splits',dest='num_splits',type='int',default=5,help='number of splits in each directory (default: %default)') - - (opts,args) = parser.parse_args() + parser.add_option('-m', '--main_split_dir', dest='main_split_dir', + help='directory containing 50/50 splits for main evaluation') + parser.add_option('-l', '--loo_split_dir', dest='loo_split_dir', + help='directory containing LOO splits for hit rate evaluation') + parser.add_option('-n', '--num_splits', dest='num_splits', type='int', default=5, + help='number of splits in each directory (default: %default)') + + (opts, args) = parser.parse_args() if not (opts.main_split_dir or opts.loo_split_dir) or not opts.num_splits: parser.print_help() raise SystemExit print('doing a grid search for regularization parameters...') - params = {'d':[100],'gamma':[0.01],'C':[100],'max_iter':[100000],'validation_iters':[500]} + params = {'d': [100], 'gamma': [0.01], 'C': [100], 'max_iter': [100000], 'validation_iters': [500]} models = [WARPMFRecommender(**a) for a in ParameterGrid(params)] for train in glob: @@ -33,19 +37,19 @@ def retrain_recommender(model,dataset): # test is a dict id->[id,id,...] if opts.main_split_dir: - generate_main_metrics = generate_metrics(get_known_items_from_dict,compute_main_metrics) + generate_main_metrics = generate_metrics(get_known_items_from_dict, compute_main_metrics) main_metrics = run_evaluation(models, retrain_recommender, - load_splits(opts.main_split_dir,opts.num_splits), + load_splits(opts.main_split_dir, opts.num_splits), opts.num_splits, generate_main_metrics) - print_report(models,main_metrics) + print_report(models, main_metrics) if opts.loo_split_dir: - generate_hit_rate = generate_metrics(get_known_items_from_dict,compute_hit_rate) + generate_hit_rate = generate_metrics(get_known_items_from_dict, compute_hit_rate) hit_rate_metrics = run_evaluation(models, retrain_recommender, - load_splits(opts.loo_split_dir,opts.num_splits), + load_splits(opts.loo_split_dir, opts.num_splits), opts.num_splits, generate_hit_rate) - print_report(models,hit_rate_metrics) + print_report(models, hit_rate_metrics) diff --git a/mrec/mf/model/warp.py b/mrec/mf/model/warp.py index 41dbd4e..bf2d99f 100644 --- a/mrec/mf/model/warp.py +++ b/mrec/mf/model/warp.py @@ -1,26 +1,25 @@ import numpy as np -import random +from warp_fast import warp_sample, apply_updates from mrec.evaluation import metrics -from warp_fast import warp_sample, apply_updates class WARPBatchUpdate(object): """Collection of arrays to hold a batch of WARP sgd updates.""" - def __init__(self,batch_size,d): - self.u = np.zeros(batch_size,dtype='int32') - self.dU = np.zeros((batch_size,d),order='F') - self.v_pos = np.zeros(batch_size,dtype='int32') - self.dV_pos = np.zeros((batch_size,d)) - self.v_neg = np.zeros(batch_size,dtype='int32') - self.dV_neg = np.zeros((batch_size,d)) + def __init__(self, batch_size, d): + self.u = np.zeros(batch_size, dtype='int32') + self.dU = np.zeros((batch_size, d), order='F') + self.v_pos = np.zeros(batch_size, dtype='int32') + self.dV_pos = np.zeros((batch_size, d)) + self.v_neg = np.zeros(batch_size, dtype='int32') + self.dV_neg = np.zeros((batch_size, d)) def clear(self): pass - def set_update(self,ix,update): - u,v_pos,v_neg,dU,dV_pos,dV_neg = update + def set_update(self, ix, update): + u, v_pos, v_neg, dU, dV_pos, dV_neg = update self.u[ix] = u self.dU[ix] = dU self.v_pos[ix] = v_pos @@ -28,6 +27,7 @@ def set_update(self,ix,update): self.v_neg[ix] = v_neg self.dV_neg[ix] = dV_neg + class WARPDecomposition(object): """ Matrix embedding optimizing the WARP loss. @@ -42,14 +42,14 @@ class WARPDecomposition(object): The embedding dimension for the decomposition. """ - def __init__(self,num_rows,num_cols,d): + def __init__(self, num_rows, num_cols, d): # initialize factors to small random values - self.U = d**-0.5*np.random.random_sample((num_rows,d)) - self.V = d**-0.5*np.random.random_sample((num_cols,d)) + self.U = d ** -0.5 * np.random.random_sample((num_rows, d)) + self.V = d ** -0.5 * np.random.random_sample((num_cols, d)) # ensure memory layout avoids extra allocation in dot product self.U = np.asfortranarray(self.U) - def compute_gradient_step(self,u,i,j,L): + def compute_gradient_step(self, u, i, j, L): """ Compute a gradient step from results of sampling. @@ -80,24 +80,25 @@ def compute_gradient_step(self,u,i,j,L): dV_neg : numpy.ndarray Gradient step for V[j]. """ - dU = L*(self.V[i]-self.V[j]) - dV_pos = L*self.U[u] - dV_neg = -L*self.U[u] - return u,i,j,dU,dV_pos,dV_neg + dU = L * (self.V[i] - self.V[j]) + dV_pos = L * self.U[u] + dV_neg = -L * self.U[u] + return u, i, j, dU, dV_pos, dV_neg - def apply_updates(self,updates,gamma,C): + def apply_updates(self, updates, gamma, C): # delegate to cython implementation - apply_updates(self.U,updates.u,updates.dU,gamma,C) - apply_updates(self.V,updates.v_pos,updates.dV_pos,gamma,C) - apply_updates(self.V,updates.v_neg,updates.dV_neg,gamma,C) + apply_updates(self.U, updates.u, updates.dU, gamma, C) + apply_updates(self.V, updates.v_pos, updates.dV_pos, gamma, C) + apply_updates(self.V, updates.v_neg, updates.dV_neg, gamma, C) - def reconstruct(self,rows): + def reconstruct(self, rows): if rows is None: U = self.U else: - U = np.asfortranarray(self.U[rows,:]) + U = np.asfortranarray(self.U[rows, :]) return U.dot(self.V.T) + class WARP(object): """ Learn low-dimensional embedding optimizing the WARP loss. @@ -150,9 +151,11 @@ def __init__(self, self.max_trials = max_trials def __str__(self): - return 'WARP(d={0},gamma={1},C={2},max_iters={3},validation_iters={4},batch_size={5},positive_thresh={6},max_trials={7})'.format(self.d,self.gamma,self.C,self.max_iters,self.validation_iters,self.batch_size,self.positive_thresh,self.max_trials) + return 'WARP(d={0},gamma={1},C={2},max_iters={3},validation_iters={4},batch_size={5},positive_thresh={6},max_trials={7})'.format( + self.d, self.gamma, self.C, self.max_iters, self.validation_iters, self.batch_size, self.positive_thresh, + self.max_trials) - def fit(self,train,validation=None): + def fit(self, train, validation=None): """ Learn factors from training set. The dot product of the factors reconstructs the training matrix approximately, minimizing the @@ -174,56 +177,56 @@ def fit(self,train,validation=None): self : object This model itself. """ - num_rows,num_cols = train.shape - decomposition = WARPDecomposition(num_rows,num_cols,self.d) - updates = WARPBatchUpdate(self.batch_size,self.d) + num_rows, num_cols = train.shape + decomposition = WARPDecomposition(num_rows, num_cols, self.d) + updates = WARPBatchUpdate(self.batch_size, self.d) self.precompute_warp_loss(num_cols) - self._fit(decomposition,updates,train,validation) + self._fit(decomposition, updates, train, validation) self.U_ = decomposition.U self.V_ = decomposition.V return self - def _fit(self,decomposition,updates,train,validation): + def _fit(self, decomposition, updates, train, validation): precs = [] tot_trials = 0 for it in range(self.max_iters): if it % self.validation_iters == 0: - print('tot_trials',tot_trials) + print('tot_trials', tot_trials) tot_trials = 0 - prec = self.estimate_precision(decomposition,train,validation) + prec = self.estimate_precision(decomposition, train, validation) precs.append(prec) - print('{0}: validation precision = {1:.3f}'.format(it,precs[-1])) + print('{0}: validation precision = {1:.3f}'.format(it, precs[-1])) if len(precs) > 3 and precs[-1] < precs[-2] and precs[-2] < precs[-3]: print('validation precision got worse twice, terminating') break - tot_trials += self.compute_updates(train,decomposition,updates) - decomposition.apply_updates(updates,self.gamma,self.C) + tot_trials += self.compute_updates(train, decomposition, updates) + decomposition.apply_updates(updates, self.gamma, self.C) - def precompute_warp_loss(self,num_cols): + def precompute_warp_loss(self, num_cols): """ Precompute WARP loss for each possible rank: L(i) = \sum_{0,i}{1/(i+1)} """ - assert(num_cols>1) + assert (num_cols > 1) self.warp_loss = np.ones(num_cols) - for i in range(1,num_cols): - self.warp_loss[i] = self.warp_loss[i-1]+1.0/(i+1) + for i in range(1, num_cols): + self.warp_loss[i] = self.warp_loss[i - 1] + 1.0 / (i + 1) - def compute_updates(self,train,decomposition,updates): + def compute_updates(self, train, decomposition, updates): updates.clear() tot_trials = 0 for ix in range(self.batch_size): - u,i,j,N,trials = self.sample(train,decomposition) + u, i, j, N, trials = self.sample(train, decomposition) tot_trials += trials - L = self.estimate_warp_loss(train,u,N) - updates.set_update(ix,decomposition.compute_gradient_step(u,i,j,L)) + L = self.estimate_warp_loss(train, u, N) + updates.set_update(ix, decomposition.compute_gradient_step(u, i, j, L)) return tot_trials - def sample(self,train,decomposition): + def sample(self, train, decomposition): # delegate to cython implementation return warp_sample(decomposition.U, decomposition.V, @@ -233,13 +236,13 @@ def sample(self,train,decomposition): self.positive_thresh, self.max_trials) - def estimate_warp_loss(self,train,u,N): + def estimate_warp_loss(self, train, u, N): num_cols = train.shape[1] - nnz = train.indptr[u+1]-train.indptr[u] - estimated_rank = (num_cols-nnz-1)/N + nnz = train.indptr[u + 1] - train.indptr[u] + estimated_rank = (num_cols - nnz - 1) / N return self.warp_loss[estimated_rank] - def estimate_precision(self,decomposition,train,validation,k=30): + def estimate_precision(self, decomposition, train, validation, k=30): """ Compute prec@k for a sample of training rows. @@ -268,10 +271,10 @@ def estimate_precision(self,decomposition,train,validation,k=30): recommendations because we do not exclude training cols with zero ratings from the top-k predictions evaluated. """ - if isinstance(validation,dict): + if isinstance(validation, dict): have_validation_set = True rows = validation.keys() - elif isinstance(validation,(int,long)): + elif isinstance(validation, (int, long)): have_validation_set = False rows = range(validation) else: @@ -279,12 +282,11 @@ def estimate_precision(self,decomposition,train,validation,k=30): r = decomposition.reconstruct(rows) prec = 0 - for u,ru in zip(rows,r): + for u, ru in zip(rows, r): predicted = ru.argsort()[::-1][:k] if have_validation_set: actual = validation[u] else: actual = train[u].indices[train[u].data > 0] - prec += metrics.prec(predicted,actual,k) - return float(prec)/len(rows) - + prec += metrics.prec(predicted, actual, k) + return float(prec) / len(rows) diff --git a/mrec/mf/model/warp2.py b/mrec/mf/model/warp2.py index cbbeced..18b7417 100644 --- a/mrec/mf/model/warp2.py +++ b/mrec/mf/model/warp2.py @@ -1,25 +1,26 @@ import numpy as np import scipy -import random +from warp_fast import warp2_sample from mrec.mf.model.warp import WARPBatchUpdate, WARPDecomposition, WARP -from warp_fast import warp2_sample + class WARP2BatchUpdate(WARPBatchUpdate): """Collection of arrays to hold a batch of sgd updates.""" - def __init__(self,batch_size,num_features,d): - WARPBatchUpdate.__init__(self,batch_size,d) - self.dW = np.zeros((num_features,d)) + def __init__(self, batch_size, num_features, d): + WARPBatchUpdate.__init__(self, batch_size, d) + self.dW = np.zeros((num_features, d)) def clear(self): self.dW[:] = 0 - def set_update(self,ix,update): - u,v_pos,v_neg,dU,dV_pos,dV_neg,dW = update - WARPBatchUpdate.set_update(self,ix,(u,v_pos,v_neg,dU,dV_pos,dV_neg)) + def set_update(self, ix, update): + u, v_pos, v_neg, dU, dV_pos, dV_neg, dW = update + WARPBatchUpdate.set_update(self, ix, (u, v_pos, v_neg, dU, dV_pos, dV_neg)) self.dW += dW + class WARP2Decomposition(WARPDecomposition): """ Joint matrix and feature embedding optimizing the WARP loss. @@ -36,14 +37,14 @@ class WARP2Decomposition(WARPDecomposition): The embedding dimension. """ - def __init__(self,num_rows,num_cols,X,d): - WARPDecomposition.__init__(self,num_rows,num_cols,d) + def __init__(self, num_rows, num_cols, X, d): + WARPDecomposition.__init__(self, num_rows, num_cols, d) # W holds latent factors for each item feature - self.W = d**-0.5*np.random.random_sample((X.shape[1],d)) + self.W = d ** -0.5 * np.random.random_sample((X.shape[1], d)) self.X = X - self.is_sparse = isinstance(X,scipy.sparse.csr_matrix) + self.is_sparse = isinstance(X, scipy.sparse.csr_matrix) - def compute_gradient_step(self,u,i,j,L): + def compute_gradient_step(self, u, i, j, L): """ Compute a gradient step from results of sampling. @@ -76,33 +77,34 @@ def compute_gradient_step(self,u,i,j,L): dW : numpy.ndarray Gradient step for W. """ - dU = L*(self.V[i]-self.V[j]) - dV_pos = L*self.U[u] - dV_neg = -L*self.U[u] - dx = self.X[i]-self.X[j] + dU = L * (self.V[i] - self.V[j]) + dV_pos = L * self.U[u] + dV_neg = -L * self.U[u] + dx = self.X[i] - self.X[j] if not self.is_sparse: dx = np.atleast_2d(dx) - dW = L*dx.T.dot(np.atleast_2d(self.U[u])) - return u,i,j,dU,dV_pos,dV_neg,dW + dW = L * dx.T.dot(np.atleast_2d(self.U[u])) + return u, i, j, dU, dV_pos, dV_neg, dW - def apply_updates(self,updates,gamma,C): - WARPDecomposition.apply_updates(self,updates,gamma,C) - self.apply_matrix_update(self.W,updates.dW,gamma,C) + def apply_updates(self, updates, gamma, C): + WARPDecomposition.apply_updates(self, updates, gamma, C) + self.apply_matrix_update(self.W, updates.dW, gamma, C) - def apply_matrix_update(self,W,dW,gamma,C): - W += gamma*dW + def apply_matrix_update(self, W, dW, gamma, C): + W += gamma * dW # ensure that ||W_k|| < C for all k - p = np.sum(np.abs(W)**2,axis=-1)**0.5/C - p[p<1] = 1 - W /= p[:,np.newaxis] + p = np.sum(np.abs(W) ** 2, axis=-1) ** 0.5 / C + p[p < 1] = 1 + W /= p[:, np.newaxis] - def reconstruct(self,rows): + def reconstruct(self, rows): if rows is None: U = self.U else: - U = np.asfortranarray(self.U[rows,:]) + U = np.asfortranarray(self.U[rows, :]) return U.dot(self.V.T + self.X.dot(self.W).T) + class WARP2(WARP): """ Learn low-dimensional embedding optimizing the WARP loss. @@ -138,7 +140,7 @@ class WARP2(WARP): Item feature factors. """ - def fit(self,train,X,validation=None): + def fit(self, train, X, validation=None): """ Learn embedding from training set. A suitable dot product of the factors reconstructs the training matrix approximately, minimizing @@ -162,12 +164,12 @@ def fit(self,train,X,validation=None): self : object This model itself. """ - num_rows,num_cols = train.shape - decomposition = WARP2Decomposition(num_rows,num_cols,X,self.d) - updates = WARP2BatchUpdate(self.batch_size,X.shape[1],self.d) + num_rows, num_cols = train.shape + decomposition = WARP2Decomposition(num_rows, num_cols, X, self.d) + updates = WARP2BatchUpdate(self.batch_size, X.shape[1], self.d) self.precompute_warp_loss(num_cols) - self._fit(decomposition,updates,train,validation) + self._fit(decomposition, updates, train, validation) self.U_ = decomposition.U self.V_ = decomposition.V @@ -175,7 +177,7 @@ def fit(self,train,X,validation=None): return self - def sample(self,train,decomposition): + def sample(self, train, decomposition): # delegate to cython implementation return warp2_sample(decomposition.U, decomposition.V, @@ -186,4 +188,3 @@ def sample(self,train,decomposition): train.indptr, self.positive_thresh, self.max_trials) - diff --git a/mrec/mf/recommender.py b/mrec/mf/recommender.py index eaefe1c..f5303b4 100644 --- a/mrec/mf/recommender.py +++ b/mrec/mf/recommender.py @@ -8,10 +8,10 @@ except ImportError: import pickle import numpy as np -from scipy.sparse import csr_matrix from mrec.base_recommender import BaseRecommender + class MatrixFactorizationRecommender(BaseRecommender): """ Base class for matrix factorization recommenders. @@ -29,13 +29,13 @@ def _create_archive(self): """ # pickle the model without its factors # then use numpy to save the factors efficiently - tmp = (self.U,self.V) + tmp = (self.U, self.V) self.U = self.V = None m = pickle.dumps(self) - self.U,self.V = tmp - return {'model':m,'U':self.U,'V':self.V} + self.U, self.V = tmp + return {'model': m, 'U': self.U, 'V': self.V} - def _load_archive(self,archive): + def _load_archive(self, archive): """ Load fields from a numpy archive. """ @@ -43,11 +43,11 @@ def _load_archive(self,archive): self.V = archive['V'] def __str__(self): - if hasattr(self,'description'): + if hasattr(self, 'description'): return self.description return 'MatrixFactorizationRecommender' - def fit(self,train): + def fit(self, train): """ Learn user and item factors from training dataset. @@ -58,7 +58,7 @@ def fit(self,train): """ pass - def load_factors(self,user_factor_filepath,item_factor_filepath,fmt): + def load_factors(self, user_factor_filepath, item_factor_filepath, fmt): """ Load precomputed user and item factors from file. @@ -88,7 +88,7 @@ def load_factors(self,user_factor_filepath,item_factor_filepath,fmt): # ensure that memory layout avoids extra allocation in dot product self.U = np.asfortranarray(self.U) - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend up to max_items most highly recommended items for user u. Assumes you've already called fit() to learn the factors. @@ -112,10 +112,10 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features List of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - r = self.predict_ratings(u,item_features=item_features) - return self._get_recommendations_from_predictions(r,dataset,u,u+1,max_items,return_scores)[0] + r = self.predict_ratings(u, item_features=item_features) + return self._get_recommendations_from_predictions(r, dataset, u, u + 1, max_items, return_scores)[0] - def predict_ratings(self,users=None,item_features=None): + def predict_ratings(self, users=None, item_features=None): """ Predict ratings/scores for all items for supplied users. Assumes you've already called fit() to learn the factors. @@ -136,13 +136,13 @@ def predict_ratings(self,users=None,item_features=None): predictions : numpy.ndarray, shape = [len(users), num_items] Predicted ratings for all items for each supplied user. """ - if isinstance(users,int): + if isinstance(users, int): users = [users] if users is None: U = self.U else: - U = np.asfortranarray(self.U[users,:]) + U = np.asfortranarray(self.U[users, :]) return U.dot(self.V.T) def batch_recommend_items(self, @@ -175,7 +175,8 @@ def batch_recommend_items(self, else just a list of idxs. """ r = self.predict_ratings(item_features=item_features) - return self._get_recommendations_from_predictions(r,dataset,0,r.shape[0],max_items,return_scores,show_progress) + return self._get_recommendations_from_predictions(r, dataset, 0, r.shape[0], max_items, return_scores, + show_progress) def range_recommend_items(self, dataset, @@ -209,8 +210,8 @@ def range_recommend_items(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - r = self.predict_ratings(range(user_start,user_end),item_features=item_features) - return self._get_recommendations_from_predictions(r,dataset,user_start,user_end,max_items,return_scores) + r = self.predict_ratings(range(user_start, user_end), item_features=item_features) + return self._get_recommendations_from_predictions(r, dataset, user_start, user_end, max_items, return_scores) def _get_recommendations_from_predictions(self, r, @@ -246,15 +247,15 @@ def _get_recommendations_from_predictions(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - r = np.array(self._zero_known_item_scores(r,dataset[user_start:user_end,:])) - recs = [[] for u in range(user_start,user_end)] - for u in range(user_start,user_end): + r = np.array(self._zero_known_item_scores(r, dataset[user_start:user_end, :])) + recs = [[] for u in range(user_start, user_end)] + for u in range(user_start, user_end): ux = u - user_start - if show_progress and ux%1000 == 0: - print(ux,'..',) + if show_progress and ux % 1000 == 0: + print(ux, '..', ) ru = r[ux] if return_scores: - recs[ux] = [(i,ru[i]) for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] + recs[ux] = [(i, ru[i]) for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] else: recs[ux] = [i for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] if show_progress: diff --git a/mrec/mf/warp.py b/mrec/mf/warp.py index 1d2f7d3..dfbbfe4 100644 --- a/mrec/mf/warp.py +++ b/mrec/mf/warp.py @@ -1,10 +1,10 @@ -import numpy as np import random -from mrec.evaluation import metrics +import numpy as np -from mrec.mf.recommender import MatrixFactorizationRecommender from mrec.mf.model.warp import WARP +from mrec.mf.recommender import MatrixFactorizationRecommender + class WARPMFRecommender(MatrixFactorizationRecommender): """ @@ -27,7 +27,7 @@ class WARPMFRecommender(MatrixFactorizationRecommender): In practice it means that we optimize for ranks 1 to max_trials-1. """ - def __init__(self,d,gamma,C,batch_size=10,positive_thresh=0.00001,max_trials=50): + def __init__(self, d, gamma, C, batch_size=10, positive_thresh=0.00001, max_trials=50): self.d = d self.gamma = gamma self.C = C @@ -35,7 +35,7 @@ def __init__(self,d,gamma,C,batch_size=10,positive_thresh=0.00001,max_trials=50) self.positive_thresh = positive_thresh self.max_trials = max_trials - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Learn factors from training set. @@ -46,15 +46,16 @@ def fit(self,train,item_features=None): item_features : array_like, shape = [num_items, num_features] Features for each item in the dataset, ignored here. """ - max_iters,validation_iters,validation = self.create_validation_set(train) - model = WARP(self.d,self.gamma,self.C,max_iters,validation_iters,self.batch_size,self.positive_thresh,self.max_trials) + max_iters, validation_iters, validation = self.create_validation_set(train) + model = WARP(self.d, self.gamma, self.C, max_iters, validation_iters, self.batch_size, self.positive_thresh, + self.max_trials) self.description = 'WARPMF({0})'.format(model) - model.fit(train,validation) + model.fit(train, validation) self.U = model.U_ self.V = model.V_ - def create_validation_set(self,train): + def create_validation_set(self, train): """ Hide and return half of the known items for a sample of users, and estimate the number of sgd iterations to run. @@ -75,42 +76,43 @@ def create_validation_set(self,train): """ # use 1% of users for validation, with a floor num_users = train.shape[0] - num_validation_users = max(num_users/100,100) + num_validation_users = max(num_users / 100, 100) # ensure reasonable expected number of updates per validation user - validation_iters = 100*num_users/num_validation_users + validation_iters = 100 * num_users / num_validation_users # and reasonable number of validation cycles - max_iters = 30*validation_iters + max_iters = 30 * validation_iters - print(num_validation_users,'validation users') - print(validation_iters,'validation iters') - print(max_iters,'max_iters') + print(num_validation_users, 'validation users') + print(validation_iters, 'validation iters') + print(max_iters, 'max_iters') validation = dict() for u in range(num_validation_users): positive = np.where(train[u].data > 0)[0] - hidden = random.sample(positive,positive.shape[0]/2) + hidden = random.sample(positive, positive.shape[0] / 2) if hidden: train[u].data[hidden] = 0 validation[u] = train[u].indices[hidden] - return max_iters,validation_iters,validation + return max_iters, validation_iters, validation + def main(): import sys from mrec import load_sparse_matrix, save_recommender - from mrec.sparse import fast_sparse_matrix file_format = sys.argv[1] filepath = sys.argv[2] outfile = sys.argv[3] # load training set as scipy sparse matrix - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) - model = WARPMFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10) + model = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10) model.fit(train) - save_recommender(model,outfile) + save_recommender(model, outfile) + if __name__ == '__main__': main() diff --git a/mrec/mf/warp2.py b/mrec/mf/warp2.py index 6dcd6ec..0754eab 100644 --- a/mrec/mf/warp2.py +++ b/mrec/mf/warp2.py @@ -1,7 +1,8 @@ import numpy as np -from mrec.mf.warp import WARPMFRecommender from mrec.mf.model.warp2 import WARP2 +from mrec.mf.warp import WARPMFRecommender + class WARP2MFRecommender(WARPMFRecommender): """ @@ -26,9 +27,9 @@ class WARP2MFRecommender(WARPMFRecommender): """ def __str__(self): - return 'WARP2MF(d={0},gamma={1},C={2})'.format(self.d,self.gamma,self.C) + return 'WARP2MF(d={0},gamma={1},C={2})'.format(self.d, self.gamma, self.C) - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Learn factors from training set and item features. @@ -39,16 +40,17 @@ def fit(self,train,item_features=None): item_features : array_like, shape = [num_items, num_features] Features for each item in the dataset. """ - max_iters,validation_iters,validation = self.create_validation_set(train) - model = WARP2(self.d,self.gamma,self.C,max_iters,validation_iters,self.batch_size,self.positive_thresh,self.max_trials) + max_iters, validation_iters, validation = self.create_validation_set(train) + model = WARP2(self.d, self.gamma, self.C, max_iters, validation_iters, self.batch_size, self.positive_thresh, + self.max_trials) self.description = 'WARP2MF({0})'.format(model) - model.fit(train,item_features,validation) + model.fit(train, item_features, validation) self.U = model.U_ self.V = model.V_ self.W = model.W_ - def predict_ratings(self,users=None,item_features=None): + def predict_ratings(self, users=None, item_features=None): """ Predict ratings/scores for all items for supplied users. Assumes you've already called fit() to learn the factors. @@ -69,38 +71,40 @@ def predict_ratings(self,users=None,item_features=None): predictions : numpy.ndarray, shape = [len(users), num_items] Predicted ratings for all items for each supplied user. """ - if isinstance(users,int): + if isinstance(users, int): users = [users] if users is None: U = self.U else: - U = np.asfortranarray(self.U[users,:]) + U = np.asfortranarray(self.U[users, :]) return U.dot(self.V.T + item_features.dot(self.W).T) -def main(file_format,filepath,feature_format,feature_file,outfile): + +def main(file_format, filepath, feature_format, feature_file, outfile): from mrec import load_sparse_matrix, save_recommender - from mrec.sparse import fast_sparse_matrix # load training set - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) # load item features, assume they are tsv: item_id,feature_id,val - X = load_sparse_matrix(feature_format,feature_file).toarray() + X = load_sparse_matrix(feature_format, feature_file).toarray() # strip features for any trailing items that don't appear in training set num_items = train.shape[1] - X = X[:num_items,:] + X = X[:num_items, :] + + model = WARP2MFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10) + model.fit(train, X) - model = WARP2MFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10) - model.fit(train,X) + save_recommender(model, outfile) - save_recommender(model,outfile) if __name__ == '__main__': import sys + file_format = sys.argv[1] filepath = sys.argv[2] feature_format = sys.argv[3] feature_file = sys.argv[4] outfile = sys.argv[5] - main(file_format,filepath,feature_format,feature_file,outfile) + main(file_format, filepath, feature_format, feature_file, outfile) diff --git a/mrec/mf/wrmf.py b/mrec/mf/wrmf.py index 96691ab..5f657d9 100644 --- a/mrec/mf/wrmf.py +++ b/mrec/mf/wrmf.py @@ -11,8 +11,9 @@ import numpy as np from scipy.sparse import csr_matrix -from mrec.sparse import fast_sparse_matrix from mrec.mf.recommender import MatrixFactorizationRecommender +from mrec.sparse import fast_sparse_matrix + class WRMFRecommender(MatrixFactorizationRecommender): """ @@ -28,21 +29,22 @@ class WRMFRecommender(MatrixFactorizationRecommender): Number of iterations of alternating least squares. """ - def __init__(self,d,alpha=1,lbda=0.015,num_iters=15): + def __init__(self, d, alpha=1, lbda=0.015, num_iters=15): self.d = d self.alpha = alpha self.lbda = lbda self.num_iters = num_iters def __str__(self): - return 'WRMFRecommender (d={0},alpha={1},lambda={2},num_iters={3})'.format(self.d,self.alpha,self.lbda,self.num_iters) + return 'WRMFRecommender (d={0},alpha={1},lambda={2},num_iters={3})'.format(self.d, self.alpha, self.lbda, + self.num_iters) - def init_factors(self,num_factors,assign_values=True): + def init_factors(self, num_factors, assign_values=True): if assign_values: - return self.d**-0.5*np.random.random_sample((num_factors,self.d)) - return np.empty((num_factors,self.d)) + return self.d ** -0.5 * np.random.random_sample((num_factors, self.d)) + return np.empty((num_factors, self.d)) - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Learn factors from training set. User and item factors are fitted alternately. @@ -57,42 +59,42 @@ def fit(self,train,item_features=None): if type(train) == csr_matrix: train = fast_sparse_matrix(train) - num_users,num_items = train.shape + num_users, num_items = train.shape - self.U = self.init_factors(num_users,False) # don't need values, will compute them + self.U = self.init_factors(num_users, False) # don't need values, will compute them self.V = self.init_factors(num_items) for it in range(self.num_iters): - print('iteration',it) + print('iteration', it) # fit user factors VV = self.V.T.dot(self.V) for u in range(num_users): # get (positive i.e. non-zero scored) items for user indices = train.X[u].nonzero()[1] if indices.size: - self.U[u,:] = self.update(indices,self.V,VV) + self.U[u, :] = self.update(indices, self.V, VV) else: - self.U[u,:] = np.zeros(self.d) + self.U[u, :] = np.zeros(self.d) # fit item factors UU = self.U.T.dot(self.U) for i in range(num_items): indices = train.fast_get_col(i).nonzero()[0] if indices.size: - self.V[i,:] = self.update(indices,self.U,UU) + self.V[i, :] = self.update(indices, self.U, UU) else: - self.V[i,:] = np.zeros(self.d) + self.V[i, :] = np.zeros(self.d) - def update(self,indices,H,HH): + def update(self, indices, H, HH): """ Update latent factors for a single user or item. """ - Hix = H[indices,:] - M = HH + self.alpha*Hix.T.dot(Hix) + np.diag(self.lbda*np.ones(self.d)) - return np.dot(np.linalg.inv(M),(1+self.alpha)*Hix.sum(axis=0)) + Hix = H[indices, :] + M = HH + self.alpha * Hix.T.dot(Hix) + np.diag(self.lbda * np.ones(self.d)) + return np.dot(np.linalg.inv(M), (1 + self.alpha) * Hix.sum(axis=0)) + def main(): import sys from mrec import load_sparse_matrix, save_recommender - from mrec.sparse import fast_sparse_matrix from mrec.mf.wrmf import WRMFRecommender file_format = sys.argv[1] @@ -100,12 +102,13 @@ def main(): outfile = sys.argv[3] # load training set as scipy sparse matrix - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) model = WRMFRecommender(d=5) model.fit(train) - save_recommender(model,outfile) + save_recommender(model, outfile) + if __name__ == '__main__': main() diff --git a/mrec/parallel/evaluate.py b/mrec/parallel/evaluate.py index 7e0461d..8094fcd 100644 --- a/mrec/parallel/evaluate.py +++ b/mrec/parallel/evaluate.py @@ -2,18 +2,15 @@ Evaluation task to run on an ipython engine. """ -def run(task): +def run(task): # import modules required by engine - import numpy as np - from scipy.sparse import coo_matrix - from collections import defaultdict from mrec import load_sparse_matrix - input_format,testfile,recsfile,start,end,evaluator = task + input_format, testfile, recsfile, start, end, evaluator = task # load the test data - testdata = load_sparse_matrix(input_format,testfile) + testdata = load_sparse_matrix(input_format, testfile) - return evaluator.process(testdata,recsfile,start,end) + return evaluator.process(testdata, recsfile, start, end) diff --git a/mrec/parallel/item_similarity.py b/mrec/parallel/item_similarity.py index 54e398c..4c1f452 100644 --- a/mrec/parallel/item_similarity.py +++ b/mrec/parallel/item_similarity.py @@ -1,25 +1,25 @@ -import math import glob -import re +import logging +import math import os +import re import subprocess from shutil import rmtree -import logging from mrec import load_sparse_matrix, save_recommender -class ItemSimilarityRunner(object): - def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max_sims,simsfile,modelfile): +class ItemSimilarityRunner(object): + def run(self, view, model, input_format, trainfile, num_engines, simsdir, overwrite, max_sims, simsfile, modelfile): logging.info('finding number of items...') - dataset = load_sparse_matrix(input_format,trainfile) - num_users,num_items = dataset.shape + dataset = load_sparse_matrix(input_format, trainfile) + num_users, num_items = dataset.shape del dataset logging.info('%d users and %d items', num_users, num_items) logging.info('creating sims directory {0}...'.format(simsdir)) - subprocess.check_call(['mkdir','-p',simsdir]) + subprocess.check_call(['mkdir', '-p', simsdir]) done = [] if not overwrite: @@ -29,12 +29,12 @@ def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max logging.info('found {0} output files'.format(len(done))) logging.info('creating tasks...') - tasks = self.create_tasks(model,input_format,trainfile,simsdir,num_items,num_engines,max_sims,done) + tasks = self.create_tasks(model, input_format, trainfile, simsdir, num_items, num_engines, max_sims, done) if num_engines > 0 and len(tasks) > 0: logging.info('running %d tasks in parallel across ipython' ' engines...', len(tasks)) - async_job = view.map_async(process,tasks,retries=2) + async_job = view.map_async(process, tasks, retries=2) # wait for tasks to complete results = async_job.get() else: @@ -48,43 +48,44 @@ def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max if remaining == 0: logging.info('SUCCESS: all tasks completed') logging.info('concatenating {0} partial output files...'.format(len(done))) - paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done] - cmd = ['cat']+paths - subprocess.check_call(cmd,stdout=open(simsfile,'w')) + paths = [os.path.join(simsdir, 'sims.{0}-{1}.tsv'.format(start, end)) for start, end in done] + cmd = ['cat'] + paths + subprocess.check_call(cmd, stdout=open(simsfile, 'w')) logging.info('removing partial output files...') rmtree(simsdir) logging.info('loading %d items in %s model from %s', num_items, type(model).__name__, simsfile) - model.load_similarity_matrix(simsfile,num_items) - save_recommender(model,modelfile) + model.load_similarity_matrix(simsfile, num_items) + save_recommender(model, modelfile) logging.info('done') else: - logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks))) + logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks))) logging.error('try rerunning the command to retry the remaining tasks') - def find_done(self,outdir): - success_files = glob.glob(os.path.join(outdir,'*.SUCCESS')) + def find_done(self, outdir): + success_files = glob.glob(os.path.join(outdir, '*.SUCCESS')) r = re.compile('.*?([0-9]+)-([0-9]+)\.SUCCESS$') done = [] for path in success_files: m = r.match(path) start = int(m.group(1)) end = int(m.group(2)) - done.append((start,end)) + done.append((start, end)) return done - def create_tasks(self,model,input_format,trainfile,outdir,num_items,num_engines,max_similar_items,done): + def create_tasks(self, model, input_format, trainfile, outdir, num_items, num_engines, max_similar_items, done): if num_engines == 0: # special marker for sequential run num_engines = 1 - items_per_engine = int(math.ceil(float(num_items)/num_engines)) + items_per_engine = int(math.ceil(float(num_items) / num_engines)) tasks = [] - for start in range(0,num_items,items_per_engine): - end = min(num_items,start+items_per_engine) - if (start,end) not in done: - tasks.append((model,input_format,trainfile,outdir,start,end,max_similar_items)) + for start in range(0, num_items, items_per_engine): + end = min(num_items, start + items_per_engine) + if (start, end) not in done: + tasks.append((model, input_format, trainfile, outdir, start, end, max_similar_items)) return tasks + def process(task): """ Training task to run on an ipython engine. @@ -95,27 +96,27 @@ def process(task): import subprocess from mrec import load_fast_sparse_matrix - model,input_format,trainfile,outdir,start,end,max_similar_items = task + model, input_format, trainfile, outdir, start, end, max_similar_items = task # initialise the model - dataset = load_fast_sparse_matrix(input_format,trainfile) - if hasattr(model,'similarity_matrix'): + dataset = load_fast_sparse_matrix(input_format, trainfile) + if hasattr(model, 'similarity_matrix'): # clear out any existing similarity matrix to trigger recomputation of # the item-item similarities from the users' ratings. model.similarity_matrix = None # write sims directly to file as we compute them - outfile = os.path.join(outdir,'sims.{0}-{1}.tsv'.format(start,end)) - out = open(outfile,'w') - for j in range(start,end): - w = model.get_similar_items(j,max_similar_items=max_similar_items,dataset=dataset) - for k,v in w: - print('{0}\t{1}\t{2}'.format(j+1,k+1,v), file=out) # write as 1-indexed + outfile = os.path.join(outdir, 'sims.{0}-{1}.tsv'.format(start, end)) + out = open(outfile, 'w') + for j in range(start, end): + w = model.get_similar_items(j, max_similar_items=max_similar_items, dataset=dataset) + for k, v in w: + print('{0}\t{1}\t{2}'.format(j + 1, k + 1, v), file=out) # write as 1-indexed out.close() # record success - cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))] + cmd = ['touch', os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))] subprocess.check_call(cmd) # return the range that we've processed - return start,end + return start, end diff --git a/mrec/parallel/predict.py b/mrec/parallel/predict.py index ae92e22..3d16a5c 100644 --- a/mrec/parallel/predict.py +++ b/mrec/parallel/predict.py @@ -2,48 +2,46 @@ Prediction task to run on an ipython engine. """ -def run(task): +def run(task): # import modules required by engine import os import subprocess - import numpy as np - from scipy.sparse import coo_matrix from mrec import load_sparse_matrix, load_recommender - from mrec.evaluation import Evaluator - modelfile,input_format,trainfile,test_input_format,testfile,feature_format,featurefile,outdir,start,end,evaluator,generate = task + modelfile, input_format, trainfile, test_input_format, testfile, feature_format, featurefile, outdir, start, end, evaluator, generate = task # initialise the model model = load_recommender(modelfile) - outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end)) + outfile = os.path.join(outdir, 'recs.{0}-{1}.tsv'.format(start, end)) if generate: # generate recommendations for our batch of users - dataset = load_sparse_matrix(input_format,trainfile) - out = open(outfile,'w') + dataset = load_sparse_matrix(input_format, trainfile) + out = open(outfile, 'w') if featurefile is not None: # currently runs much faster if features are loaded as a dense matrix - item_features = load_sparse_matrix(feature_format,featurefile).toarray() + item_features = load_sparse_matrix(feature_format, featurefile).toarray() # strip features for any trailing items that don't appear in training set num_items = dataset.shape[1] - item_features = item_features[:num_items,:] - recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True,item_features=item_features) + item_features = item_features[:num_items, :] + recs = model.range_recommend_items(dataset, start, end, max_items=20, return_scores=True, + item_features=item_features) else: - recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True) - for u,items in zip(range(start,end),recs): - for i,w in items: - print('{0}\t{1}\t{2}'.format(u+1,i+1,w), file=out) # write as 1-indexed + recs = model.range_recommend_items(dataset, start, end, max_items=20, return_scores=True) + for u, items in zip(range(start, end), recs): + for i, w in items: + print('{0}\t{1}\t{2}'.format(u + 1, i + 1, w), file=out) # write as 1-indexed out.close() # record success - cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))] + cmd = ['touch', os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))] subprocess.check_call(cmd) # load the test data - testdata = load_sparse_matrix(test_input_format,testfile).tocsr() + testdata = load_sparse_matrix(test_input_format, testfile).tocsr() # return evaluation metrics - return evaluator.process(testdata,outfile,start,end) + return evaluator.process(testdata, outfile, start, end) diff --git a/mrec/parallel/warp.py b/mrec/parallel/warp.py index 99d3241..fba5b2e 100644 --- a/mrec/parallel/warp.py +++ b/mrec/parallel/warp.py @@ -1,15 +1,16 @@ import glob -import re +import logging import os +import re import subprocess from shutil import rmtree -import logging + import numpy as np from mrec import save_recommender, load_recommender -class WARPMFRunner(object): +class WARPMFRunner(object): def run(self, view, model, @@ -23,7 +24,7 @@ def run(self, modelfile): logging.info('creating models directory {0}...'.format(workdir)) - subprocess.check_call(['mkdir','-p',workdir]) + subprocess.check_call(['mkdir', '-p', workdir]) done = [] if not overwrite: @@ -44,7 +45,7 @@ def run(self, if tasks: logging.info('running in parallel across ipython engines...') - async_job = view.map_async(process,tasks,retries=2) + async_job = view.map_async(process, tasks, retries=2) # wait for tasks to complete results = async_job.get() @@ -59,22 +60,22 @@ def run(self, logging.info('SUCCESS: all tasks completed') logging.info('concatenating {0} models...'.format(len(done))) for ix in sorted(done): - partial_model = load_recommender(self.get_modelfile(ix,workdir)) + partial_model = load_recommender(self.get_modelfile(ix, workdir)) if ix == 0: model = partial_model else: # concatenate factors model.d += partial_model.d - model.U = np.hstack((model.U,partial_model.U)) - model.V = np.hstack((model.V,partial_model.V)) - if hasattr(model,'W'): - model.W = np.hstack((model.W,partial_model.W)) - save_recommender(model,modelfile) + model.U = np.hstack((model.U, partial_model.U)) + model.V = np.hstack((model.V, partial_model.V)) + if hasattr(model, 'W'): + model.W = np.hstack((model.W, partial_model.W)) + save_recommender(model, modelfile) logging.info('removing partial output files...') rmtree(workdir) logging.info('done') else: - logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks))) + logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks))) logging.error('try rerunning the command to retry the remaining tasks') def create_tasks(self, @@ -89,12 +90,12 @@ def create_tasks(self, tasks = [] for ix in range(num_engines): if ix not in done: - outfile = self.get_modelfile(ix,outdir) - tasks.append((model,input_format,trainfile,feature_format,featurefile,outfile,ix,num_engines)) + outfile = self.get_modelfile(ix, outdir) + tasks.append((model, input_format, trainfile, feature_format, featurefile, outfile, ix, num_engines)) return tasks - def find_done(self,outdir): - success_files = glob.glob(os.path.join(outdir,'*.SUCCESS')) + def find_done(self, outdir): + success_files = glob.glob(os.path.join(outdir, '*.SUCCESS')) r = re.compile('.*?([0-9]+)\.model\.npz\.SUCCESS$') done = [] for path in success_files: @@ -103,8 +104,9 @@ def find_done(self,outdir): done.append(ix) return done - def get_modelfile(self,ix,workdir): - return os.path.join(workdir,'{0}.model.npz'.format(ix)) + def get_modelfile(self, ix, workdir): + return os.path.join(workdir, '{0}.model.npz'.format(ix)) + def process(task): """ @@ -112,26 +114,25 @@ def process(task): """ # import modules required by engine - import os import subprocess from mrec import load_sparse_matrix, save_recommender - model,input_format,trainfile,feature_format,featurefile,outfile,offset,step = task + model, input_format, trainfile, feature_format, featurefile, outfile, offset, step = task - dataset = load_sparse_matrix(input_format,trainfile) + dataset = load_sparse_matrix(input_format, trainfile) if featurefile is not None: # currently runs much faster if features are loaded as a dense matrix - item_features = load_sparse_matrix(feature_format,featurefile).toarray() + item_features = load_sparse_matrix(feature_format, featurefile).toarray() # strip features for any trailing items that don't appear in training set num_items = dataset.shape[1] - item_features = item_features[:num_items,:] - model.fit(dataset,item_features=item_features) + item_features = item_features[:num_items, :] + model.fit(dataset, item_features=item_features) else: model.fit(dataset) - save_recommender(model,outfile) + save_recommender(model, outfile) # record success - cmd = ['touch','{0}.SUCCESS'.format(outfile)] + cmd = ['touch', '{0}.SUCCESS'.format(outfile)] subprocess.check_call(cmd) # return the offset for the samples that we've learned from diff --git a/mrec/parallel/wrmf.py b/mrec/parallel/wrmf.py index 95dc8e8..dd70118 100644 --- a/mrec/parallel/wrmf.py +++ b/mrec/parallel/wrmf.py @@ -1,77 +1,90 @@ import glob import logging +import math import os import subprocess from shutil import rmtree -import math + import numpy as np from mrec import load_sparse_matrix, save_recommender -def get_user_indices(data,u): + +def get_user_indices(data, u): # get (positive i.e. non-zero scored) items for user return data.X[u].nonzero()[1] -def get_item_indices(data,i): + +def get_item_indices(data, i): # get users for item return data.fast_get_col(i).nonzero()[0] -def get_factor_files(workdir,factor_type): + +def get_factor_files(workdir, factor_type): # return partial factor files in sorted order so they can simply be stacked - factor_files = glob.glob(os.path.join(workdir,'{0}.*.npy'.format(factor_type))) - return sorted(factor_files,key=lambda x: int(x[:-4][x[:-4].rfind('.')+1:])) + factor_files = glob.glob(os.path.join(workdir, '{0}.*.npy'.format(factor_type))) + return sorted(factor_files, key=lambda x: int(x[:-4][x[:-4].rfind('.') + 1:])) + def get_user_factor_files(workdir): - return get_factor_files(workdir,'U') + return get_factor_files(workdir, 'U') + def get_item_factor_files(workdir): - return get_factor_files(workdir,'V') + return get_factor_files(workdir, 'V') + -def init_item_factors(model,data): - num_users,num_items = data.shape +def init_item_factors(model, data): + num_users, num_items = data.shape return model.init_factors(num_items) -class WRMFRunner(object): - def run(self,view,model,input_format,trainfile,num_engines,workdir,modelfile): +class WRMFRunner(object): + def run(self, view, model, input_format, trainfile, num_engines, workdir, modelfile): logging.info('creating factors directory {0}'.format(workdir)) - subprocess.check_call(['mkdir','-p',workdir]) + subprocess.check_call(['mkdir', '-p', workdir]) logging.info('getting data size') - data = load_sparse_matrix(input_format,trainfile) - num_users,num_items = data.shape + data = load_sparse_matrix(input_format, trainfile) + num_users, num_items = data.shape del data for it in range(model.num_iters): logging.info('iteration {0}'.format(it)) - tasks = self.create_tasks(num_users,num_engines,model,input_format,trainfile,workdir,'U',get_user_indices,get_item_factor_files,init_item_factors) - self.run_tasks(view,tasks) - tasks = self.create_tasks(num_items,num_engines,model,input_format,trainfile,workdir,'V',get_item_indices,get_user_factor_files,None) # won't need to initialize user factors - self.run_tasks(view,tasks) + tasks = self.create_tasks(num_users, num_engines, model, input_format, trainfile, workdir, 'U', + get_user_indices, get_item_factor_files, init_item_factors) + self.run_tasks(view, tasks) + tasks = self.create_tasks(num_items, num_engines, model, input_format, trainfile, workdir, 'V', + get_item_indices, get_user_factor_files, + None) # won't need to initialize user factors + self.run_tasks(view, tasks) model.U = np.vstack([np.load(f) for f in get_user_factor_files(workdir)]) model.V = np.vstack([np.load(f) for f in get_item_factor_files(workdir)]) - save_recommender(model,modelfile) + save_recommender(model, modelfile) logging.info('removing partial output files') rmtree(workdir) logging.info('done') - def run_tasks(self,view,tasks): - async_job = view.map_async(compute_factors,tasks,retries=2) + def run_tasks(self, view, tasks): + async_job = view.map_async(compute_factors, tasks, retries=2) # wait for tasks to complete result = async_job.get() - def create_tasks(self,num_factors,num_engines,model,input_format,trainfile,workdir,factor_type,get_indices,get_fixed_factor_files,init_fixed_factors): - factors_per_engine = int(math.ceil(float(num_factors)/num_engines)) + def create_tasks(self, num_factors, num_engines, model, input_format, trainfile, workdir, factor_type, get_indices, + get_fixed_factor_files, init_fixed_factors): + factors_per_engine = int(math.ceil(float(num_factors) / num_engines)) tasks = [] - for start in range(0,num_factors,factors_per_engine): - end = min(num_factors,start+factors_per_engine) + for start in range(0, num_factors, factors_per_engine): + end = min(num_factors, start + factors_per_engine) fixed_factor_files = get_fixed_factor_files(workdir) - tasks.append((model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir)) + tasks.append((model, input_format, trainfile, factor_type, get_indices, init_fixed_factors, + fixed_factor_files, start, end, workdir)) return tasks + def compute_factors(task): """ WRMF update method to run on an IPython engine. @@ -84,22 +97,22 @@ def compute_factors(task): import numpy as np from mrec import load_fast_sparse_matrix - model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir = task + model, input_format, trainfile, factor_type, get_indices, init_fixed_factors, fixed_factor_files, start, end, workdir = task - data = load_fast_sparse_matrix(input_format,trainfile) + data = load_fast_sparse_matrix(input_format, trainfile) if fixed_factor_files: H = np.vstack([np.load(f) for f in fixed_factor_files]) else: - H = init_fixed_factors(model,data) + H = init_fixed_factors(model, data) HH = H.T.dot(H) - W = np.zeros(((end-start),model.d)) - for j in range(start,end): - indices = get_indices(data,j) + W = np.zeros(((end - start), model.d)) + for j in range(start, end): + indices = get_indices(data, j) if indices.size: - W[j-start,:] = model.update(indices,H,HH) + W[j - start, :] = model.update(indices, H, HH) - np.save(os.path.join(workdir,'{0}.{1}.npy'.format(factor_type,start)),W) + np.save(os.path.join(workdir, '{0}.{1}.npy'.format(factor_type, start)), W) - return start,end + return start, end diff --git a/mrec/popularity.py b/mrec/popularity.py index a286d87..fb0a5fc 100644 --- a/mrec/popularity.py +++ b/mrec/popularity.py @@ -3,11 +3,10 @@ intended to provide a baseline for evaluations. """ -import numpy as np - from mrec.base_recommender import BaseRecommender from mrec.sparse import fast_sparse_matrix + class ItemPopularityRecommender(BaseRecommender): """ Create an unpersonalized item popularity recommender, useful @@ -28,14 +27,14 @@ class ItemPopularityRecommender(BaseRecommender): popularity. """ - def __init__(self,method='count',thresh=0): + def __init__(self, method='count', thresh=0): self.description = 'ItemPop' - if method not in ['count','sum','avg','thresh']: + if method not in ['count', 'sum', 'avg', 'thresh']: raise ValueError('invalid value for method parameter') self.method = method self.thresh = thresh - def fit(self,dataset,item_features=None): + def fit(self, dataset, item_features=None): """ Compute the most popular items using the method specified in the constructor. @@ -47,26 +46,26 @@ def fit(self,dataset,item_features=None): item_features : array_like, shape = [num_items, num_features] Features for items in training set, ignored here. """ - if isinstance(dataset,fast_sparse_matrix): + if isinstance(dataset, fast_sparse_matrix): d = dataset.X.tocsc() else: d = dataset.tocsc() if self.method == 'count': # count the total number of ratings for each item - popularity = [(d[:,i].nnz,i) for i in range(d.shape[1])] + popularity = [(d[:, i].nnz, i) for i in range(d.shape[1])] elif self.method == 'sum': # find the sum of the ratings for each item - popularity = [(d[:,i].sum(),i) for i in range(d.shape[1])] + popularity = [(d[:, i].sum(), i) for i in range(d.shape[1])] elif self.method == 'avg': # find the mean rating for each item - popularity = [(d[:,i].mean(),i) for i in range(d.shape[1])] + popularity = [(d[:, i].mean(), i) for i in range(d.shape[1])] elif self.method == 'thresh': # count the number of ratings above thresh for each item - popularity = [(sum(d[:,i].data>self.thresh),i) for i in range(d.shape[1])] + popularity = [(sum(d[:, i].data > self.thresh), i) for i in range(d.shape[1])] popularity.sort(reverse=True) - self.pop_items = [(i,c) for (c,i) in popularity] + self.pop_items = [(i, c) for (c, i) in popularity] - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend new items for a user. Assumes you've already called fit(). @@ -93,10 +92,10 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features """ known_items = set(dataset[u].indices) recs = [] - for i,c in self.pop_items: + for i, c in self.pop_items: if i not in known_items: if return_scores: - recs.append((i,c)) + recs.append((i, c)) else: recs.append(i) if len(recs) >= max_items: diff --git a/mrec/reranking_recommender.py b/mrec/reranking_recommender.py index 60f6b44..50be902 100644 --- a/mrec/reranking_recommender.py +++ b/mrec/reranking_recommender.py @@ -11,6 +11,7 @@ from mrec.base_recommender import BaseRecommender + class RerankingRecommender(BaseRecommender): """ A secondary recommender that combines an item similarity @@ -28,31 +29,31 @@ class RerankingRecommender(BaseRecommender): The number of candidate items drawn from the first model for each user. """ - def __init__(self,item_similarity_recommender,mf_recommender,num_candidates=100): + def __init__(self, item_similarity_recommender, mf_recommender, num_candidates=100): self.item_similarity_recommender = item_similarity_recommender self.mf_recommender = mf_recommender self.num_candidates = num_candidates - self.description = 'RerankingRecommender({0},{1})'.format(self.item_similarity_recommender,self.mf_recommender) + self.description = 'RerankingRecommender({0},{1})'.format(self.item_similarity_recommender, self.mf_recommender) def _create_archive(self): archive = self.item_similarity_recommender._create_archive() archive['item_similarity_model'] = archive['model'] archive.update(self.mf_recommender._create_archive()) archive['mf_model'] = archive['model'] - tmp = self.item_similarity_recommender,self.mf_recommender + tmp = self.item_similarity_recommender, self.mf_recommender self.item_similarity_model = self.mf_recommender = None m = pickle.dumps(self) - self.item_similarity_model,self.mf_recommender = tmp + self.item_similarity_model, self.mf_recommender = tmp archive['model'] = m return archive - def _load_archive(self,archive): + def _load_archive(self, archive): self.item_similarity_recommender = np.loads(str(archive['item_similarity_model'])) self.item_similarity_recommender._load_archive(archive) self.mf_recommender = np.loads(str(archive['mf_model'])) self.mf_recommender._load_archive(archive) - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Fit both models to the training data. @@ -68,10 +69,10 @@ def fit(self,train,item_features=None): You are not obliged to call this, alternatively you can pass ready trained models to the RerankingRecommender constructor. """ - self.item_similarity_recommender.fit(train,item_features) - self.mf_recommender.fit(train,item_features) + self.item_similarity_recommender.fit(train, item_features) + self.mf_recommender.fit(train, item_features) - def rerank(self,u,candidates,max_items,return_scores): + def rerank(self, u, candidates, max_items, return_scores): """ Use latent factors to rerank candidate recommended items for a user and return the highest scoring. @@ -94,14 +95,14 @@ def rerank(self,u,candidates,max_items,return_scores): just a list of idxs. """ r = self.mf_recommender.U[u].dot(self.mf_recommender.V[candidates].T) - reranked = r.argsort()[:-1-max_items:-1] + reranked = r.argsort()[:-1 - max_items:-1] if return_scores: - recs = [(candidates[i],r[i]) for i in reranked] + recs = [(candidates[i], r[i]) for i in reranked] else: recs = [candidates[i] for i in reranked] return recs - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend new items for a user. @@ -124,8 +125,9 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features List of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - candidates = self.item_similarity_recommender.recommend_items(dataset,u,self.num_candidates,return_scores=False) - return self.rerank(u,candidates,max_items,return_scores=return_scores) + candidates = self.item_similarity_recommender.recommend_items(dataset, u, self.num_candidates, + return_scores=False) + return self.rerank(u, candidates, max_items, return_scores=return_scores) def batch_recommend_items(self, dataset, @@ -155,9 +157,10 @@ def batch_recommend_items(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - recs = self.item_similarity_recommender.batch_recommend_items(dataset,self.num_candidates,return_scores=False,item_features=item_features) - for u,candidates in enumerate(recs): - recs[u] = self.rerank(u,candidates,max_items,return_scores=return_scores) + recs = self.item_similarity_recommender.batch_recommend_items(dataset, self.num_candidates, return_scores=False, + item_features=item_features) + for u, candidates in enumerate(recs): + recs[u] = self.rerank(u, candidates, max_items, return_scores=return_scores) return recs def range_recommend_items(self, @@ -192,15 +195,17 @@ def range_recommend_items(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - recs = self.item_similarity_recommender.range_recommend_items(dataset,user_start,user_end,self.num_candidates,return_scores=False,item_features=item_features) - for u,candidates in enumerate(recs): - recs[u] = self.rerank(user_start+u,candidates,max_items,return_scores=return_scores) + recs = self.item_similarity_recommender.range_recommend_items(dataset, user_start, user_end, + self.num_candidates, return_scores=False, + item_features=item_features) + for u, candidates in enumerate(recs): + recs[u] = self.rerank(user_start + u, candidates, max_items, return_scores=return_scores) return recs + def main(): import sys from mrec import load_sparse_matrix, save_recommender - from mrec.sparse import fast_sparse_matrix from mrec.item_similarity.knn import CosineKNNRecommender from mrec.mf.warp import WARPMFRecommender from mrec.reranking_recommender import RerankingRecommender @@ -210,16 +215,16 @@ def main(): outfile = sys.argv[3] # load training set as scipy sparse matrix - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) item_sim_model = CosineKNNRecommender(k=100) - mf_model = WARPMFRecommender(d=80,gamma=0.01,C=100.0,max_iters=25000,validation_iters=1000,batch_size=10) - recommender = RerankingRecommender(item_sim_model,mf_model,num_candidates=100) + mf_model = WARPMFRecommender(d=80, gamma=0.01, C=100.0, max_iters=25000, validation_iters=1000, batch_size=10) + recommender = RerankingRecommender(item_sim_model, mf_model, num_candidates=100) recommender.fit(train) - save_recommender(recommender,outfile) + save_recommender(recommender, outfile) + if __name__ == '__main__': main() - diff --git a/mrec/sparse.py b/mrec/sparse.py index 4fccffc..f7884ae 100644 --- a/mrec/sparse.py +++ b/mrec/sparse.py @@ -3,11 +3,13 @@ """ import random + import numpy as np -from scipy.sparse import csr_matrix, coo_matrix from scipy.io import mmread +from scipy.sparse import csr_matrix, coo_matrix -def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_offset=1): + +def loadtxt(filepath, comments='#', delimiter=None, skiprows=0, usecols=None, index_offset=1): """ Load a scipy sparse matrix from simply formatted data such as TSV, handles similar input to numpy.loadtxt(). @@ -36,16 +38,17 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o mat : scipy.sparse.csr_matrix The sparse matrix. """ - d = np.loadtxt(filepath,comments=comments,delimiter=delimiter,skiprows=skiprows,usecols=usecols) + d = np.loadtxt(filepath, comments=comments, delimiter=delimiter, skiprows=skiprows, usecols=usecols) if d.shape[1] < 3: raise ValueError('invalid number of columns in input') - row = d[:,0]-index_offset - col = d[:,1]-index_offset - data = d[:,2] - shape = (max(row)+1,max(col)+1) - return csr_matrix((data,(row,col)),shape=shape) + row = d[:, 0] - index_offset + col = d[:, 1] - index_offset + data = d[:, 2] + shape = (max(row) + 1, max(col) + 1) + return csr_matrix((data, (row, col)), shape=shape) + -def savez(d,file): +def savez(d, file): """ Save a sparse matrix to file in numpy binary format. @@ -58,7 +61,8 @@ def savez(d,file): where the matrix will be saved. If file is a string, the ``.npz`` extension will be appended to the file name if it is not already there. """ - np.savez(file,row=d.row,col=d.col,data=d.data,shape=d.shape) + np.savez(file, row=d.row, col=d.col, data=d.data, shape=d.shape) + def loadz(file): """ @@ -75,7 +79,8 @@ def loadz(file): The sparse matrix. """ y = np.load(file) - return coo_matrix((y['data'],(y['row'],y['col'])),shape=y['shape']) + return coo_matrix((y['data'], (y['row'], y['col'])), shape=y['shape']) + class fast_sparse_matrix(object): """ @@ -95,7 +100,8 @@ class fast_sparse_matrix(object): >>> col = fsm.fast_get_col(2) # get a column quickly >>> row = fsm.X[1] # get a row as usual """ - def __init__(self,X,col_view=None): + + def __init__(self, X, col_view=None): """ Create a fast_sparse_matrix from a csr_matrix X. Note that X is not copied and its values will be modified by @@ -126,7 +132,7 @@ def shape(self): """ return self.X.shape - def fast_get_col(self,j): + def fast_get_col(self, j): """ Return column j of the underlying matrix. @@ -140,11 +146,11 @@ def fast_get_col(self,j): col : scipy.sparse.csc_matrix Copy of column j of the matrix. """ - col = self.col_view[:,j].copy() + col = self.col_view[:, j].copy() col.data = self.X.data[col.data] return col - def fast_update_col(self,j,vals): + def fast_update_col(self, j, vals): """ Update values of existing non-zeros in column of the underlying matrix. @@ -159,10 +165,10 @@ def fast_update_col(self,j,vals): only change the value of existing non-zero entries of column j, it cannot add new ones. """ - dataptr = self.col_view[:,j].data + dataptr = self.col_view[:, j].data self.X.data[dataptr] = vals - def ensure_sparse_cols(self,max_density,remove_lowest=True): + def ensure_sparse_cols(self, max_density, remove_lowest=True): """ Ensure that no column of the matrix excess the specified density, setting excess entries to zero where necessary. @@ -191,7 +197,7 @@ def ensure_sparse_cols(self,max_density,remove_lowest=True): if max_density >= 1: max_nnz = int(max_density) else: - max_nnz = int(max_density*self.shape[0]) + max_nnz = int(max_density * self.shape[0]) for j in range(self.shape[1]): col = self.fast_get_col(j) excess = col.nnz - max_nnz @@ -199,11 +205,11 @@ def ensure_sparse_cols(self,max_density,remove_lowest=True): if remove_lowest: zero_entries = np.argsort(col.data)[:excess] else: - zero_entries = random.sample(range(col.nnz),excess) + zero_entries = random.sample(range(col.nnz), excess) col.data[zero_entries] = 0 - self.fast_update_col(j,col.data) + self.fast_update_col(j, col.data) - def save(self,filepath): + def save(self, filepath): """ Save to file as arrays in numpy binary format. @@ -214,8 +220,8 @@ def save(self,filepath): """ d = self.X.tocoo(copy=False) v = self.col_view.tocoo(copy=False) - np.savez(filepath,row=d.row,col=d.col,data=d.data,shape=d.shape, - v_row=v.row,v_col=v.col,v_data=v.data,v_shape=v.shape) + np.savez(filepath, row=d.row, col=d.col, data=d.data, shape=d.shape, + v_row=v.row, v_col=v.col, v_data=v.data, v_shape=v.shape) @staticmethod def load(filepath): @@ -227,13 +233,13 @@ def load(filepath): filepath : str The filepath to load. """ - y = np.load(filepath,mmap_mode='r') - X = coo_matrix((y['data'],(y['row'],y['col'])),shape=y['shape']) - col_view = coo_matrix((y['v_data'],(y['v_row'],y['v_col'])),shape=y['v_shape']) - return fast_sparse_matrix(X,col_view.tocsc()) + y = np.load(filepath, mmap_mode='r') + X = coo_matrix((y['data'], (y['row'], y['col'])), shape=y['shape']) + col_view = coo_matrix((y['v_data'], (y['v_row'], y['v_col'])), shape=y['v_shape']) + return fast_sparse_matrix(X, col_view.tocsc()) @staticmethod - def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_offset=1): + def loadtxt(filepath, comments='#', delimiter=None, skiprows=0, usecols=None, index_offset=1): """ Create a fast_sparse_matrix from simply formatted data such as TSV, handles similar input to numpy.loadtxt(). @@ -262,7 +268,7 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o mat : mrec.sparse.fast_sparse_matrix A fast_sparse_matrix holding the data in the file. """ - X = loadtxt(filepath,comments=comments,delimiter=delimiter,skiprows=skiprows,usecols=usecols) + X = loadtxt(filepath, comments=comments, delimiter=delimiter, skiprows=skiprows, usecols=usecols) return fast_sparse_matrix(X) @staticmethod @@ -282,4 +288,3 @@ def loadmm(filepath): """ X = mmread(filepath) return fast_sparse_matrix(X) - diff --git a/mrec/testing.py b/mrec/testing.py index 22d3f59..41885b1 100644 --- a/mrec/testing.py +++ b/mrec/testing.py @@ -1,21 +1,23 @@ import random + import numpy as np from scipy.sparse import coo_matrix from sklearn.utils.testing import assert_array_equal -def get_random_coo_matrix(rows=3,cols=10,nnz=20): - row_col = random.sample(range(rows*cols),nnz) # ensure are unique + +def get_random_coo_matrix(rows=3, cols=10, nnz=20): + row_col = random.sample(range(rows * cols), nnz) # ensure are unique row = [i // cols for i in row_col] col = [i % cols for i in row_col] - data = np.random.randint(0,nnz*5,nnz) - return coo_matrix((data,(row,col)),shape=(rows,cols)) + data = np.random.randint(0, nnz * 5, nnz) + return coo_matrix((data, (row, col)), shape=(rows, cols)) -def assert_sparse_matrix_equal(X,Y): + +def assert_sparse_matrix_equal(X, Y): expected = X.toarray() actual = Y.toarray() # it's possible that we had trailing empty columns in X # - there's no way we can know about these sometimes e.g. # when reading back from file - expected = expected[:actual.shape[0],:actual.shape[1]] - assert_array_equal(expected,actual) - + expected = expected[:actual.shape[0], :actual.shape[1]] + assert_array_equal(expected, actual) diff --git a/mrec/tests/test_base_recommender.py b/mrec/tests/test_base_recommender.py index 37b37a6..34c6eff 100644 --- a/mrec/tests/test_base_recommender.py +++ b/mrec/tests/test_base_recommender.py @@ -3,68 +3,76 @@ except ImportError: import pickle import tempfile -import os + import numpy as np from nose.tools import assert_less_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_raises +from mrec.base_recommender import BaseRecommender from mrec.testing import get_random_coo_matrix -from mrec.base_recommender import BaseRecommender class MyRecommender(BaseRecommender): def __init__(self): self.foo = np.ndarray(range(10)) self.description = 'my recommender' + def _create_archive(self): tmp = self.foo self.foo = None m = pickle.dumps(self) self.foo = tmp - return {'model':m,'foo':self.foo} - def _load_archive(self,archive): + return {'model': m, 'foo': self.foo} + + def _load_archive(self, archive): self.foo = archive['foo'] + def save_load(r): - f,path = tempfile.mkstemp(suffix='.npz') + f, path = tempfile.mkstemp(suffix='.npz') r.save(path) return BaseRecommender.load(path) + def check_read_description(r): - f,path = tempfile.mkstemp(suffix='.npz') + f, path = tempfile.mkstemp(suffix='.npz') r.save(path) d = BaseRecommender.read_recommender_description(path) - assert_equal(str(r),d) + assert_equal(str(r), d) + def test_save_filepath_condition(): r = BaseRecommender() invalid_filepath = 'no suffix' - assert_raises(ValueError,r.save,invalid_filepath) + assert_raises(ValueError, r.save, invalid_filepath) + def test_save_load(): r = save_load(BaseRecommender()) - assert_equal(type(r),BaseRecommender) + assert_equal(type(r), BaseRecommender) r = MyRecommender() r2 = save_load(r) - assert_equal(type(r2),type(r)) - assert_array_equal(r2.foo,r.foo) - assert_equal(r2.description,r.description) + assert_equal(type(r2), type(r)) + assert_array_equal(r2.foo, r.foo) + assert_equal(r2.description, r.description) + def test_read_recommender_description(): check_read_description(BaseRecommender()) check_read_description(MyRecommender()) + def test_zero_known_item_scores(): train = get_random_coo_matrix().tocsr() predictions = np.random.random_sample(train.shape) r = BaseRecommender() - safe = r._zero_known_item_scores(predictions,train) - num_users,num_items = predictions.shape + safe = r._zero_known_item_scores(predictions, train) + num_users, num_items = predictions.shape for u in range(num_users): for i in range(num_items): if i in train[u].indices: - assert_less_equal(safe[u,i],0) + assert_less_equal(safe[u, i], 0) else: - assert_equal(safe[u,i],predictions[u,i]) + assert_equal(safe[u, i], predictions[u, i]) diff --git a/mrec/tests/test_mrec.py b/mrec/tests/test_mrec.py index 09291b5..669194e 100644 --- a/mrec/tests/test_mrec.py +++ b/mrec/tests/test_mrec.py @@ -1,23 +1,23 @@ -import tempfile import os - -from mrec.testing import get_random_coo_matrix -from mrec.testing import assert_sparse_matrix_equal +import tempfile from mrec import load_sparse_matrix from mrec import save_sparse_matrix +from mrec.testing import assert_sparse_matrix_equal +from mrec.testing import get_random_coo_matrix + def test_save_load_sparse_matrix(): X = get_random_coo_matrix() - for fmt in ['tsv','csv','npz','mm','fsm']: + for fmt in ['tsv', 'csv', 'npz', 'mm', 'fsm']: if fmt == 'mm': suffix = '.mtx' elif fmt == 'npz' or fmt == 'fsm': suffix = '.npz' else: suffix = '' - f,path = tempfile.mkstemp(suffix=suffix) - save_sparse_matrix(X,fmt,path) - Y = load_sparse_matrix(fmt,path) - assert_sparse_matrix_equal(X,Y) + f, path = tempfile.mkstemp(suffix=suffix) + save_sparse_matrix(X, fmt, path) + Y = load_sparse_matrix(fmt, path) + assert_sparse_matrix_equal(X, Y) os.remove(path) diff --git a/mrec/tests/test_sparse.py b/mrec/tests/test_sparse.py index 6081913..470eb93 100644 --- a/mrec/tests/test_sparse.py +++ b/mrec/tests/test_sparse.py @@ -1,49 +1,54 @@ -import tempfile import os -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_array_equal +import tempfile -from mrec.testing import get_random_coo_matrix -from mrec.testing import assert_sparse_matrix_equal +from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from mrec.sparse import fast_sparse_matrix from mrec.sparse import loadtxt -from mrec.sparse import savez from mrec.sparse import loadz -from mrec.sparse import fast_sparse_matrix +from mrec.sparse import savez +from mrec.testing import assert_sparse_matrix_equal +from mrec.testing import get_random_coo_matrix + def test_loadtxt(): X = get_random_coo_matrix() - f,path = tempfile.mkstemp(suffix='.npz') - with open(path,'w') as f: - for i,j,v in zip(X.row,X.col,X.data): - print('{0}\t{1}\t{2}'.format(i+1,j+1,v), file=f) + f, path = tempfile.mkstemp(suffix='.npz') + with open(path, 'w') as f: + for i, j, v in zip(X.row, X.col, X.data): + print('{0}\t{1}\t{2}'.format(i + 1, j + 1, v), file=f) Y = loadtxt(path) os.remove(path) - assert_sparse_matrix_equal(X,Y) + assert_sparse_matrix_equal(X, Y) + def test_savez_loadz(): m = get_random_coo_matrix() - f,path = tempfile.mkstemp(suffix='.npz') - savez(m,path) + f, path = tempfile.mkstemp(suffix='.npz') + savez(m, path) n = loadz(path) os.remove(path) - assert_array_equal(n.toarray(),m.toarray()) + assert_array_equal(n.toarray(), m.toarray()) + def test_init_fast_sparse_matrix(): X = get_random_coo_matrix() Y = X.tocsr() Z = X.tocsc() - for M in [X,Y,Z]: + for M in [X, Y, Z]: m = fast_sparse_matrix(M) - assert_array_equal(m.X.toarray(),M.toarray()) - assert_equal(m.shape,M.shape) + assert_array_equal(m.X.toarray(), M.toarray()) + assert_equal(m.shape, M.shape) + def test_fast_get_col(): X = get_random_coo_matrix().tocsc() m = fast_sparse_matrix(X) - rows,cols = X.shape + rows, cols = X.shape for j in range(cols): - assert_array_equal(m.fast_get_col(j).toarray(),X[:,j].toarray()) + assert_array_equal(m.fast_get_col(j).toarray(), X[:, j].toarray()) + def test_fast_update_col(): X = get_random_coo_matrix().tocsc() @@ -51,25 +56,25 @@ def test_fast_update_col(): cols = X.shape[1] for j in range(cols): vals = m.fast_get_col(j).data - if (vals==0).all(): + if (vals == 0).all(): continue - vals[vals!=0] += 1 - m.fast_update_col(j,vals) - expected = X[:,j].toarray() + vals[vals != 0] += 1 + m.fast_update_col(j, vals) + expected = X[:, j].toarray() for i in range(expected.shape[0]): if expected[i] != 0: expected[i] += 1 - assert_array_equal(m.fast_get_col(j).toarray(),expected) + assert_array_equal(m.fast_get_col(j).toarray(), expected) + def test_save_load(): """Save to file as arrays in numpy binary format.""" X = get_random_coo_matrix() m = fast_sparse_matrix(X) - f,path = tempfile.mkstemp(suffix='.npz') + f, path = tempfile.mkstemp(suffix='.npz') m.save(path) n = fast_sparse_matrix.load(path) os.remove(path) - assert_equal(m.shape,n.shape) - assert_array_equal(m.X.toarray(),n.X.toarray()) - assert_array_equal(m.col_view.toarray(),n.col_view.toarray()) - + assert_equal(m.shape, n.shape) + assert_array_equal(m.X.toarray(), n.X.toarray()) + assert_array_equal(m.col_view.toarray(), n.col_view.toarray()) From 294bafa74b00b078d3b3b7bc28266a0d2a59b923 Mon Sep 17 00:00:00 2001 From: inpefess Date: Sun, 29 Oct 2017 21:02:52 +0300 Subject: [PATCH 11/17] remove all cPickle mentions --- mrec/__init__.py | 7 +------ mrec/base_recommender.py | 6 ++---- mrec/item_similarity/recommender.py | 5 +---- mrec/mf/recommender.py | 6 ++---- mrec/reranking_recommender.py | 6 ++---- mrec/tests/test_base_recommender.py | 5 +---- 6 files changed, 9 insertions(+), 26 deletions(-) diff --git a/mrec/__init__.py b/mrec/__init__.py index bcc1e4d..0492a66 100644 --- a/mrec/__init__.py +++ b/mrec/__init__.py @@ -1,12 +1,7 @@ from scipy.io import mmread, mmwrite -try: - import cPickle as pickle -except ImportError: - import pickle - -from mrec.sparse import fast_sparse_matrix, loadtxt, loadz, savez from mrec.base_recommender import BaseRecommender +from mrec.sparse import fast_sparse_matrix, loadtxt, loadz, savez __version__ = '0.3.1' diff --git a/mrec/base_recommender.py b/mrec/base_recommender.py index 1321257..fc9bf11 100644 --- a/mrec/base_recommender.py +++ b/mrec/base_recommender.py @@ -1,7 +1,5 @@ -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle + import numpy as np from scipy.sparse import csr_matrix diff --git a/mrec/item_similarity/recommender.py b/mrec/item_similarity/recommender.py index 0e35ce3..94f8aac 100644 --- a/mrec/item_similarity/recommender.py +++ b/mrec/item_similarity/recommender.py @@ -2,10 +2,7 @@ Base class for item similarity recommenders. """ -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle from operator import itemgetter import numpy as np diff --git a/mrec/mf/recommender.py b/mrec/mf/recommender.py index f5303b4..13bcfc4 100644 --- a/mrec/mf/recommender.py +++ b/mrec/mf/recommender.py @@ -3,10 +3,8 @@ by matrix factorization. """ -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle + import numpy as np from mrec.base_recommender import BaseRecommender diff --git a/mrec/reranking_recommender.py b/mrec/reranking_recommender.py index 50be902..86ada35 100644 --- a/mrec/reranking_recommender.py +++ b/mrec/reranking_recommender.py @@ -3,10 +3,8 @@ and then reranks them using a matrix factorization model. """ -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle + import numpy as np from mrec.base_recommender import BaseRecommender diff --git a/mrec/tests/test_base_recommender.py b/mrec/tests/test_base_recommender.py index 34c6eff..f61e408 100644 --- a/mrec/tests/test_base_recommender.py +++ b/mrec/tests/test_base_recommender.py @@ -1,7 +1,4 @@ -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle import tempfile import numpy as np From 09d228222ed9041aad3f240245d9d62cdbfb36ac Mon Sep 17 00:00:00 2001 From: inpefess Date: Sun, 29 Oct 2017 21:09:37 +0300 Subject: [PATCH 12/17] move to newer scikit-learn version --- mrec/examples/tune_slim.py | 5 +---- mrec/mf/evaluate.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/mrec/examples/tune_slim.py b/mrec/examples/tune_slim.py index 1f82ce2..4e72d8d 100644 --- a/mrec/examples/tune_slim.py +++ b/mrec/examples/tune_slim.py @@ -9,11 +9,8 @@ from operator import itemgetter from optparse import OptionParser -try: - from sklearn.grid_search import ParameterGrid -except ImportError: - from sklearn.grid_search import IterGrid as ParameterGrid from ipyparallel import Client +from sklearn.model_selection import ParameterGrid from mrec import load_fast_sparse_matrix diff --git a/mrec/mf/evaluate.py b/mrec/mf/evaluate.py index eec3067..c22ef63 100644 --- a/mrec/mf/evaluate.py +++ b/mrec/mf/evaluate.py @@ -4,10 +4,7 @@ def retrain_recommender(model, dataset): if __name__ == '__main__': - try: - from sklearn.grid_search import ParameterGrid - except ImportError: - from sklearn.grid_search import IterGrid as ParameterGrid + from sklearn.model_selection import ParameterGrid from optparse import OptionParser from mrec.mf.warp import WARPMFRecommender From 1195c9e9044ebf195be551a659a77686135b0d89 Mon Sep 17 00:00:00 2001 From: inpefess Date: Sun, 29 Oct 2017 21:15:06 +0300 Subject: [PATCH 13/17] refresh requirements --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2447a0f..a054a66 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,8 @@ install_requires=['numpy', 'scipy', 'scikit-learn', - 'ipython <= 4.0.0', + 'nose', + 'ipyparallel', 'cython', 'psutil'], entry_points={ From 0fab30f0ad57482ac04b60c2db14ab098a407744 Mon Sep 17 00:00:00 2001 From: inpefess Date: Sun, 29 Oct 2017 21:20:21 +0300 Subject: [PATCH 14/17] various little style enhancement --- mrec/evaluation/preprocessing.py | 2 +- mrec/examples/predict.py | 2 +- mrec/examples/prepare.py | 2 +- mrec/mf/climf.py | 4 ++-- mrec/reranking_recommender.py | 2 -- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/mrec/evaluation/preprocessing.py b/mrec/evaluation/preprocessing.py index c9ef7b6..423ef72 100644 --- a/mrec/evaluation/preprocessing.py +++ b/mrec/evaluation/preprocessing.py @@ -54,7 +54,7 @@ class SplitCreator(object): sample_before_thresholding : bool (default: False) If True then consider any item seen by the user for inclusion in the test group, even though only items - with positive scrore will be selected. If the input + with positive score will be selected. If the input includes items with zero scores this means that the test set may be smaller than the requested size for some users, even though they have apparently seen diff --git a/mrec/examples/predict.py b/mrec/examples/predict.py index 07440fd..bbba9f3 100644 --- a/mrec/examples/predict.py +++ b/mrec/examples/predict.py @@ -244,7 +244,7 @@ def main(): description = ' AND '.join(descriptions) if len(descriptions) > 1: - logging.warn('You are aggregating metrics from different models! {}'.format(description)) + logging.warning('You are aggregating metrics from different models! {}'.format(description)) print_report([description], [all_metrics]) diff --git a/mrec/examples/prepare.py b/mrec/examples/prepare.py index a7bc8ee..ec1ac45 100644 --- a/mrec/examples/prepare.py +++ b/mrec/examples/prepare.py @@ -95,7 +95,7 @@ def main(): processor.create_split(open(infile), open(trainfile, 'w'), open(testfile, 'w')) too_few_items = processor.get_too_few_items() - if (too_few_items): + if too_few_items: logging.info('skipped {0} users with less than {1} ratings'.format(too_few_items, opts.min_items_per_user)) logging.info('cleaning up...') diff --git a/mrec/mf/climf.py b/mrec/mf/climf.py index a9cb1f2..8729cb9 100644 --- a/mrec/mf/climf.py +++ b/mrec/mf/climf.py @@ -42,8 +42,8 @@ def fit(self, data): self.V = 0.01 * np.random.random_sample((data.shape[1], self.d)) # TODO: create a validation set - for iter in range(self.max_iters): - print('iteration {0}:'.format(iter + 1)) + for some_iter in range(self.max_iters): + print('iteration {0}:'.format(some_iter + 1)) print('objective = {0:.4f}'.format(self.objective(data))) self.update(data) # TODO: compute MRR on validation set, terminate if appropriate diff --git a/mrec/reranking_recommender.py b/mrec/reranking_recommender.py index 86ada35..ed262ff 100644 --- a/mrec/reranking_recommender.py +++ b/mrec/reranking_recommender.py @@ -144,8 +144,6 @@ def batch_recommend_items(self, Maximum number of recommended items to return. return_scores : bool If true return a score along with each recommended item. - show_progress: bool - If true print something to stdout to show progress. item_features : array_like, shape = [num_items, num_features] Features for items in training set, required by some recommenders. From 95b4a30f8c1c9f6d674537b8019d818daa7676a4 Mon Sep 17 00:00:00 2001 From: inpefess Date: Sun, 29 Oct 2017 21:33:24 +0300 Subject: [PATCH 15/17] README change --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index b8aeaf9..ec1e0cb 100644 --- a/README.rst +++ b/README.rst @@ -9,6 +9,7 @@ mrec recommender systems library Introduction ------------ +This fork is Python 3 only. `mrec` is a Python package developed at `Mendeley `_ to support recommender systems development and evaluation. The package currently focuses on item similarity and other methods that work well on implicit feedback, and on experimental evaluation. Why another package when there are already some really good software projects implementing recommender systems? From 71452b6fd19f4b529db770d8bb26c266fa71c598 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A8=D0=BC=D0=B8=D0=BD=D0=BA=D0=B5=20=D0=91=D0=BE=D1=80?= =?UTF-8?q?=D0=B8=D1=81=20=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=D0=B8?= =?UTF-8?q?=D1=87?= Date: Sun, 3 Feb 2019 10:17:57 +0300 Subject: [PATCH 16/17] python version update for Travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a502794..126084c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,7 @@ language: python python: - - "2.7" + - "3.7" sudo: false From aae5f32c9836a60cbef03ba21c3e7871c0064504 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A8=D0=BC=D0=B8=D0=BD=D0=BA=D0=B5=20=D0=91=D0=BE=D1=80?= =?UTF-8?q?=D0=B8=D1=81=20=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=D0=B8?= =?UTF-8?q?=D1=87?= Date: Sun, 3 Feb 2019 10:25:35 +0300 Subject: [PATCH 17/17] fixup! python version update for Travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 126084c..2afe9c4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,7 @@ language: python python: - - "3.7" + - "3.6" sudo: false