diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 5f649686e..b34b8fdae --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,47 @@ TAGS /build/ /dist/ *.egg-info/ + +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/ +.idea/workspace.xml +.idea/tasks.xml + +# Sensitive or high-churn files: +.idea/dataSources/ +.idea/dataSources.ids +.idea/dataSources.xml +.idea/dataSources.local.xml +.idea/sqlDataSources.xml +.idea/dynamic.xml +.idea/uiDesigner.xml + +# Gradle: +.idea/gradle.xml +.idea/libraries + +# Mongo Explorer plugin: +.idea/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties diff --git a/.nfs000000000f7c03bc0003551c b/.nfs000000000f7c03bc0003551c new file mode 100755 index 000000000..604f07af7 --- /dev/null +++ b/.nfs000000000f7c03bc0003551c @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + +set -e + +python2 -m digits $@ diff --git a/.travis.yml b/.travis.yml index 5a10e8c19..35cb219bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -121,7 +121,8 @@ before_install: - deactivate - virtualenv --system-site-packages ~/venv - source ~/venv/bin/activate - + - "sudo apt-get install libboost-filesystem1.55-dev +libboost-python1.55-dev libboost-system1.55-dev libboost-thread1.55-dev" install: - mkdir -p ~/.config/matplotlib - echo "backend:agg" > ~/.config/matplotlib/matplotlibrc diff --git a/digits/config/__init__.py b/digits/config/__init__.py old mode 100644 new mode 100755 index a903cec87..ee37dcac2 --- a/digits/config/__init__.py +++ b/digits/config/__init__.py @@ -5,6 +5,7 @@ option_list = {} from . import ( # noqa + system_type, caffe, gpu_list, jobs_dir, @@ -12,6 +13,7 @@ torch, server_name, store_option, + ) diff --git a/digits/config/jobs_dir.py b/digits/config/jobs_dir.py old mode 100644 new mode 100755 diff --git a/digits/config/system_type.py b/digits/config/system_type.py new file mode 100755 index 000000000..8cfaf032f --- /dev/null +++ b/digits/config/system_type.py @@ -0,0 +1,10 @@ +from __future__ import absolute_import +from . import option_list +from digits.extensions.cluster_management.cluster_factory import cluster_factory +if cluster_factory.use_cluster: + system_type = cluster_factory.selected_system + +else: + system_type = 'interactive' + +option_list['system_type'] = system_type diff --git a/digits/dataset/forms.py b/digits/dataset/forms.py old mode 100644 new mode 100755 index e8133ff75..f943eb38a --- a/digits/dataset/forms.py +++ b/digits/dataset/forms.py @@ -3,6 +3,7 @@ from flask.ext.wtf import Form from wtforms.validators import DataRequired +from wtforms import validators from digits import utils @@ -20,3 +21,13 @@ class DatasetForm(Form): group_name = utils.forms.StringField('Group Name', tooltip="An optional group name for organization on the main page." ) + + # slurm options + slurm_selector = utils.forms.BooleanField('Use slurm?') + slurm_time_limit = utils.forms.IntegerField('Task time limit', tooltip='in minutes', default=0, ) + slurm_cpu_count = utils.forms.IntegerField('Use this many cores', validators=[ + validators.NumberRange(min=1, max=128) + ], default=8, ) + slurm_mem = utils.forms.IntegerField('Use this much memory (GB)', validators=[ + validators.NumberRange(min=1, max=128) + ], default=10, ) diff --git a/digits/dataset/generic/job.py b/digits/dataset/generic/job.py old mode 100644 new mode 100755 index e40b0d44d..5ceb1ad86 --- a/digits/dataset/generic/job.py +++ b/digits/dataset/generic/job.py @@ -35,6 +35,15 @@ def __init__(self, self.extension_id = extension_id self.extension_userdata = extension_userdata + try: + self.time_limit = kwargs.pop('time_limit', None) + self.s_cpu_count = kwargs.pop('s_cpu_count', None) + self.s_mem = kwargs.pop('s_mem', None) + except: + self.time_limit + self.s_cpu_count + self.s_mem + super(GenericDatasetJob, self).__init__(**kwargs) self.pickver_job_dataset_extension = PICKLE_VERSION @@ -45,6 +54,9 @@ def __init__(self, job=self, backend=self.backend, stage=stage, + time_limit=self.time_limit, + s_cpu_count=self.s_cpu_count, + s_mem=self.s_mem, ) ) diff --git a/digits/dataset/generic/views.py b/digits/dataset/generic/views.py old mode 100644 new mode 100755 index 9ef3ac87e..9c488ebfa --- a/digits/dataset/generic/views.py +++ b/digits/dataset/generic/views.py @@ -3,6 +3,8 @@ import os # Find the best implementation available +from digits.config import config_value + try: from cStringIO import StringIO except ImportError: @@ -54,7 +56,8 @@ def new(extension_id): extension_title=extension.get_title(), extension_id=extension_id, extension_html=rendered_extension, - form=form + form=form, + system_type=config_value('system_type') ) @@ -96,7 +99,9 @@ def create(extension_id): extension_id=extension_id, extension_html=rendered_extension, form=form, - errors=errors), 400 + errors=errors, + system_type=config_value('system_type') + ), 400 # create instance of extension class extension = extension_class(**extension_form.data) @@ -104,6 +109,7 @@ def create(extension_id): job = None try: # create job + job = GenericDatasetJob( username=utils.auth.get_username(), name=form.dataset_name.data, @@ -116,6 +122,9 @@ def create(extension_id): force_same_shape=form.dsopts_force_same_shape.data, extension_id=extension_id, extension_userdata=extension.get_user_data(), + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) # Save form data with the job so we can easily clone it later. @@ -199,7 +208,7 @@ def explore(): return flask.render_template( 'datasets/images/explore.html', page=page, size=size, job=job, imgs=imgs, labels=None, - pages=pages, label=None, total_entries=total_entries, db=db) + pages=pages, label=None, total_entries=total_entries, db=db, system_type=config_value('system_type')) def show(job, related_jobs=None): diff --git a/digits/dataset/images/classification/views.py b/digits/dataset/images/classification/views.py old mode 100644 new mode 100755 index 42b201029..2f8423db6 --- a/digits/dataset/images/classification/views.py +++ b/digits/dataset/images/classification/views.py @@ -5,6 +5,8 @@ import shutil # Find the best implementation available +from digits.config import config_value + try: from cStringIO import StringIO except ImportError: @@ -23,7 +25,6 @@ from digits.utils.routing import request_wants_json, job_from_request from digits.webapp import scheduler - blueprint = flask.Blueprint(__name__, __name__) @@ -115,6 +116,9 @@ def from_folders(job, form): compression=compression, mean_file=utils.constants.MEAN_FILE_CAFFE, labels_file=job.labels_file, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) @@ -131,6 +135,9 @@ def from_folders(job, form): encoding=encoding, compression=compression, labels_file=job.labels_file, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) @@ -147,6 +154,9 @@ def from_folders(job, form): encoding=encoding, compression=compression, labels_file=job.labels_file, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) @@ -198,6 +208,10 @@ def from_files(job, form): mean_file=utils.constants.MEAN_FILE_CAFFE, labels_file=job.labels_file, shuffle=shuffle, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, + ) ) @@ -229,6 +243,9 @@ def from_files(job, form): compression=compression, labels_file=job.labels_file, shuffle=shuffle, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) @@ -260,6 +277,9 @@ def from_files(job, form): compression=compression, labels_file=job.labels_file, shuffle=shuffle, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) @@ -275,7 +295,8 @@ def new(): # Is there a request to clone a job with ?clone= fill_form_if_cloned(form) - return flask.render_template('datasets/images/classification/new.html', form=form) + return flask.render_template('datasets/images/classification/new.html', form=form, + system_type=config_value('system_type')) @blueprint.route('.json', methods=['POST']) @@ -296,7 +317,8 @@ def create(): if request_wants_json(): return flask.jsonify({'errors': form.errors}), 400 else: - return flask.render_template('datasets/images/classification/new.html', form=form), 400 + return flask.render_template('datasets/images/classification/new.html', form=form, + system_type=config_value('system_type')), 400 job = None try: @@ -309,7 +331,10 @@ def create(): int(form.resize_width.data), int(form.resize_channels.data), ), - resize_mode=form.resize_mode.data + resize_mode=form.resize_mode.data, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) if form.method.data == 'folder': diff --git a/digits/dataset/images/generic/test_lmdb_creator.py b/digits/dataset/images/generic/test_lmdb_creator.py index b7b0a2145..5b99a7875 100755 --- a/digits/dataset/images/generic/test_lmdb_creator.py +++ b/digits/dataset/images/generic/test_lmdb_creator.py @@ -209,3 +209,4 @@ def _save_mean(mean, filename): ) print 'Done after %s seconds' % (time.time() - start_time,) + diff --git a/digits/dataset/images/generic/views.py b/digits/dataset/images/generic/views.py old mode 100644 new mode 100755 index e7a0c944f..f8be3b131 --- a/digits/dataset/images/generic/views.py +++ b/digits/dataset/images/generic/views.py @@ -10,6 +10,7 @@ from digits.webapp import scheduler from digits.utils.forms import fill_form_if_cloned, save_form_to_job from digits.utils.routing import request_wants_json +from digits.config import config_value blueprint = flask.Blueprint(__name__, __name__) @@ -24,8 +25,7 @@ def new(): # Is there a request to clone a job with ?clone= fill_form_if_cloned(form) - - return flask.render_template('datasets/images/generic/new.html', form=form) + return flask.render_template('datasets/images/generic/new.html', form=form, system_type=config_value('system_type')) @blueprint.route('.json', methods=['POST']) @@ -46,7 +46,8 @@ def create(): if request_wants_json(): return flask.jsonify({'errors': form.errors}), 400 else: - return flask.render_template('datasets/images/generic/new.html', form=form), 400 + return flask.render_template('datasets/images/generic/new.html', form=form, + system_type=config_value('system_type')), 400 job = None try: @@ -55,6 +56,7 @@ def create(): name=form.dataset_name.data, group=form.group_name.data, mean_file=form.prebuilt_mean_file.data.strip(), + ) if form.method.data == 'prebuilt': @@ -70,6 +72,9 @@ def create(): database=form.prebuilt_train_images.data, purpose=form.prebuilt_train_images.label.text, force_same_shape=force_same_shape, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) @@ -80,6 +85,9 @@ def create(): database=form.prebuilt_train_labels.data, purpose=form.prebuilt_train_labels.label.text, force_same_shape=force_same_shape, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) @@ -90,6 +98,9 @@ def create(): database=form.prebuilt_val_images.data, purpose=form.prebuilt_val_images.label.text, force_same_shape=force_same_shape, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) if form.prebuilt_val_labels.data: @@ -99,6 +110,9 @@ def create(): database=form.prebuilt_val_labels.data, purpose=form.prebuilt_val_labels.label.text, force_same_shape=force_same_shape, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) diff --git a/digits/dataset/images/job.py b/digits/dataset/images/job.py old mode 100644 new mode 100755 index 6b351dce3..462811483 --- a/digits/dataset/images/job.py +++ b/digits/dataset/images/job.py @@ -20,6 +20,14 @@ def __init__(self, **kwargs): """ self.image_dims = kwargs.pop('image_dims', None) self.resize_mode = kwargs.pop('resize_mode', None) + try: + self.time_limit = kwargs.pop('time_limit', None) + self.s_cpu_count = kwargs.pop('s_cpu_count', None) + self.s_mem = kwargs.pop('s_mem', None) + except: + self.time_limit + self.s_cpu_count + self.s_mem super(ImageDatasetJob, self).__init__(**kwargs) self.pickver_job_dataset_image = PICKLE_VERSION diff --git a/digits/dataset/tasks/analyze_db.py b/digits/dataset/tasks/analyze_db.py old mode 100644 new mode 100755 index 360af2a40..793443c7d --- a/digits/dataset/tasks/analyze_db.py +++ b/digits/dataset/tasks/analyze_db.py @@ -28,6 +28,14 @@ def __init__(self, database, purpose, **kwargs): Keyword arguments: force_same_shape -- if True, enforce that every entry in the database has the same shape """ + try: + self.time_limit = kwargs.pop('time_limit', None) + self.s_cpu_count = kwargs.pop('s_cpu_count', None) + self.s_mem = kwargs.pop('s_mem', None) + except: + self.time_limit + self.s_cpu_count + self.s_mem self.force_same_shape = kwargs.pop('force_same_shape', False) super(AnalyzeDbTask, self).__init__(**kwargs) diff --git a/digits/dataset/tasks/create_db.py b/digits/dataset/tasks/create_db.py old mode 100644 new mode 100755 index 013757a4e..7255f150e --- a/digits/dataset/tasks/create_db.py +++ b/digits/dataset/tasks/create_db.py @@ -44,6 +44,15 @@ def __init__(self, input_file, db_name, backend, image_dims, **kwargs): self.mean_file = kwargs.pop('mean_file', None) self.labels_file = kwargs.pop('labels_file', None) + try: + self.time_limit = kwargs.pop('time_limit', None) + self.s_cpu_count = kwargs.pop('s_cpu_count', None) + self.s_mem = kwargs.pop('s_mem', None) + except: + self.time_limit + self.s_cpu_count + self.s_mem + super(CreateDbTask, self).__init__(**kwargs) self.pickver_task_createdb = PICKLE_VERSION diff --git a/digits/dataset/tasks/create_generic_db.py b/digits/dataset/tasks/create_generic_db.py old mode 100644 new mode 100755 index 1ec8a96f8..070a58a30 --- a/digits/dataset/tasks/create_generic_db.py +++ b/digits/dataset/tasks/create_generic_db.py @@ -32,6 +32,14 @@ def __init__(self, job, backend, stage, **kwargs): self.feature_shape = None self.label_shape = None self.mean_file = None + try: + self.time_limit = kwargs.pop('time_limit', None) + self.s_cpu_count = kwargs.pop('s_cpu_count', None) + self.s_mem = kwargs.pop('s_mem', None) + except: + self.time_limit + self.s_cpu_count + self.s_mem super(CreateGenericDbTask, self).__init__(**kwargs) self.pickver_task_create_generic_db = PICKLE_VERSION diff --git a/digits/dataset/tasks/parse_folder.py b/digits/dataset/tasks/parse_folder.py old mode 100644 new mode 100755 index 1450801a6..765a43866 --- a/digits/dataset/tasks/parse_folder.py +++ b/digits/dataset/tasks/parse_folder.py @@ -35,6 +35,15 @@ def __init__(self, folder, **kwargs): self.min_per_category = kwargs.pop('min_per_category', 2) self.max_per_category = kwargs.pop('max_per_category', None) + try: + self.time_limit = kwargs.pop('time_limit', None) + self.s_cpu_count = kwargs.pop('s_cpu_count', None) + self.s_mem = kwargs.pop('s_mem', None) + except: + self.time_limit + self.s_cpu_count + self.s_mem + super(ParseFolderTask, self).__init__(**kwargs) self.pickver_task_parsefolder = PICKLE_VERSION diff --git a/digits/device_query.py b/digits/device_query.py index 9f13a09cd..c996df26d 100755 --- a/digits/device_query.py +++ b/digits/device_query.py @@ -220,6 +220,8 @@ def get_devices(force_reload=False): rc = cudart.cudaDeviceGetPCIBusId(ctypes.c_char_p(pciBusID_str), 16, x) if rc == 0: properties.pciBusID_str = pciBusID_str + if rc != 0: + raise RuntimeError('cudaDeviceGetPCIBusId() failed with error #%s' % rc) devices.append(properties) else: print 'cudaGetDeviceProperties() failed with error #%s' % rc @@ -242,7 +244,6 @@ def get_nvml_info(device_id): device = get_device(device_id) if device is None: return None - nvml = get_nvml() if nvml is None: return None @@ -291,6 +292,7 @@ def get_nvml_info(device_id): if __name__ == '__main__': + parser = argparse.ArgumentParser(description='DIGITS Device Query') parser.add_argument('-v', '--verbose', action='store_true') args = parser.parse_args() diff --git a/digits/extensions/cluster_management/__init__.py b/digits/extensions/cluster_management/__init__.py new file mode 100755 index 000000000..e941c158a --- /dev/null +++ b/digits/extensions/cluster_management/__init__.py @@ -0,0 +1,7 @@ +import os +import cluster_factory +import slurm + +if os.environ.get('JENKINS_URL') is not None: + cluster_factory.cluster_factory.set_system('slurm') + slurm.get_digits_tmpdir() diff --git a/digits/extensions/cluster_management/cluster_factory.py b/digits/extensions/cluster_management/cluster_factory.py new file mode 100755 index 000000000..44b0e5f48 --- /dev/null +++ b/digits/extensions/cluster_management/cluster_factory.py @@ -0,0 +1,33 @@ +import slurm +from digits.config import option_list + + +class cluster_factory: + selected_system = 'interactive' + use_cluster = True + + def __init__(self): + pass + + def get_cluster_manager(self): + if cluster_factory.selected_system == 'slurm': + return slurm.slurm_manager() + + @staticmethod + def get_running_systems(): + running_systems = ['interactive'] + if slurm.test_if_slurm_system(): + running_systems.append('slurm') + # add more systems here + return running_systems + + @staticmethod + def set_system(system): + cluster_factory.selected_system = system + option_list['system_type'] = cluster_factory.selected_system + if cluster_factory.selected_system == 'slurm': + slurm.get_digits_tmpdir() + + # enter this to set system_type + # cluster_factory.cluster_factory.selected_system = "slurm" + # cluster_factory.cluster_factory.set_system("slurm") diff --git a/digits/extensions/cluster_management/cluster_manager.py b/digits/extensions/cluster_management/cluster_manager.py new file mode 100755 index 000000000..d909df8f0 --- /dev/null +++ b/digits/extensions/cluster_management/cluster_manager.py @@ -0,0 +1,9 @@ +class cluster_manager: + def __init__(self): + pass + + def pack_args(self, args, time_limit, cpu_count, mem, gpu_count, t_type): + pass + + def kill_task(self, job_num): + pass diff --git a/digits/extensions/cluster_management/slurm.py b/digits/extensions/cluster_management/slurm.py new file mode 100755 index 000000000..da8146408 --- /dev/null +++ b/digits/extensions/cluster_management/slurm.py @@ -0,0 +1,73 @@ +import os +import tempfile +import digits +from cluster_manager import cluster_manager +import subprocess + + +def get_digits_tmpdir(): + # users should set DIGITS_TMP to a dir that is available to all nodes + if os.environ.get('JENKINS_URL') is not None: + os.environ['DIGITS_SLURM_TMP'] = os.environ.get('WORKSPACE') + "/tmp" + if os.environ.get('DIGITS_SLURM_TMP') is None: + os.environ['DIGITS_SLURM_TMP'] = os.environ.get('HOME') + "/tmp" + os.environ['TMPDIR'] = os.path.abspath(os.environ.get('DIGITS_SLURM_TMP')) + os.environ['TEMP'] = os.path.abspath(os.environ.get('DIGITS_SLURM_TMP')) + os.environ['TMP'] = os.path.abspath(os.environ.get('DIGITS_SLURM_TMP')) + tempfile.tempdir = os.environ['TMPDIR'] + return os.environ['TMPDIR'] + + +def test_if_slurm_system(): + try: + if os.environ.get('SLURM_HOME'): + # get_digits_tmpdir() + return True + else: + return False + + except OSError: + return False + + +class slurm_manager(cluster_manager): + def __init__(self): + get_digits_tmpdir() + + def pack_args(self, args, time_limit, cpu_count, mem, gpu_count, t_type): + gpu_arg_idx = [i for i, arg in enumerate(args) if arg.startswith('--gpu')] + if gpu_arg_idx: + gpu_arg_idx = gpu_arg_idx[0] + gpus = gpu_count + print time_limit + print cpu_count + print mem + + if not time_limit or time_limit == 0: + time_limit = 30 + if not cpu_count: + cpu_count = 1 + if not mem: + mem = 4 + + # set caffe to use all available gpus + # This is assuming that $CUDA_VISIBLE_DEVICES is set for each task on the nodes\ + + if issubclass(t_type, digits.model.tasks.TrainTask): + if gpu_arg_idx: + args[gpu_arg_idx] = '--gpu=all' + + if gpus == 0: + args = ['salloc', '-t', str(time_limit), '-c', str(cpu_count), + '--mem=' + str(mem) + 'GB', 'srun'] + args + else: + args = ['salloc', '-t', str(time_limit), '-c', str(cpu_count), + '--mem=' + str(mem) + 'GB', + '--gres=gpu:' + str(gpus), 'srun'] + args + + return args + + def kill_task(self, job_num): + args = ['scancel', job_num] + subprocess.call(args) + return diff --git a/digits/inference/tasks/inference.py b/digits/inference/tasks/inference.py old mode 100644 new mode 100755 diff --git a/digits/job.py b/digits/job.py old mode 100644 new mode 100755 diff --git a/digits/model/forms.py b/digits/model/forms.py old mode 100644 new mode 100755 index 797e2e9a3..4fa167101 --- a/digits/model/forms.py +++ b/digits/model/forms.py @@ -1,382 +1,386 @@ -# Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved. - -import os - -import flask -from flask.ext.wtf import Form -import wtforms -from wtforms import validators - -from digits.config import config_value -from digits.device_query import get_device, get_nvml_info -from digits import utils -from digits.utils import sizeof_fmt -from digits.utils.forms import validate_required_iff -from digits import frameworks - - -class ModelForm(Form): - - # Methods - - def selection_exists_in_choices(form, field): - found = False - for choice in field.choices: - if choice[0] == field.data: - found = True - if not found: - raise validators.ValidationError("Selected job doesn't exist. Maybe it was deleted by another user.") - - def validate_NetParameter(form, field): - fw = frameworks.get_framework_by_id(form['framework'].data) - try: - # below function raises a BadNetworkException in case of validation error - fw.validate_network(field.data) - except frameworks.errors.BadNetworkError as e: - raise validators.ValidationError('Bad network: %s' % e.message) - - def validate_file_exists(form, field): - from_client = bool(form.python_layer_from_client.data) - - filename = '' - if not from_client and field.type == 'StringField': - filename = field.data - - if filename == '': - return - - if not os.path.isfile(filename): - raise validators.ValidationError('Server side file, %s, does not exist.' % filename) - - def validate_py_ext(form, field): - from_client = bool(form.python_layer_from_client.data) - - filename = '' - if from_client and field.type == 'FileField': - filename = flask.request.files[field.name].filename - elif not from_client and field.type == 'StringField': - filename = field.data - - if filename == '': - return - - (root, ext) = os.path.splitext(filename) - if ext != '.py' and ext != '.pyc': - raise validators.ValidationError('Python file, %s, needs .py or .pyc extension.' % filename) - - # Fields - - # The options for this get set in the view (since they are dynamic) - dataset = utils.forms.SelectField( - 'Select Dataset', - choices=[], - tooltip="Choose the dataset to use for this model." - ) - - python_layer_from_client = utils.forms.BooleanField( - u'Use client-side file', - default=False, - ) - - python_layer_client_file = utils.forms.FileField( - u'Client-side file', - validators=[ - validate_py_ext - ], - tooltip="Choose a Python file on the client containing layer definitions." - ) - python_layer_server_file = utils.forms.StringField( - u'Server-side file', - validators=[ - validate_file_exists, - validate_py_ext - ], - tooltip="Choose a Python file on the server containing layer definitions." - ) - - train_epochs = utils.forms.IntegerField( - 'Training epochs', - validators=[ - validators.NumberRange(min=1) - ], - default=30, - tooltip="How many passes through the training data?" - ) - - snapshot_interval = utils.forms.FloatField( - 'Snapshot interval (in epochs)', - default=1, - validators=[ - validators.NumberRange(min=0), - ], - tooltip="How many epochs of training between taking a snapshot?" - ) - - val_interval = utils.forms.FloatField( - 'Validation interval (in epochs)', - default=1, - validators=[ - validators.NumberRange(min=0) - ], - tooltip="How many epochs of training between running through one pass of the validation data?" - ) - - random_seed = utils.forms.IntegerField( - 'Random seed', - validators=[ - validators.NumberRange(min=0), - validators.Optional(), - ], - tooltip=('If you provide a random seed, then back-to-back runs with ' - 'the same model and dataset should give identical results.') - ) - - batch_size = utils.forms.MultiIntegerField( - 'Batch size', - validators=[ - utils.forms.MultiNumberRange(min=1), - utils.forms.MultiOptional(), - ], - tooltip="How many images to process at once. If blank, values are used from the network definition." - ) - - batch_accumulation = utils.forms.IntegerField( - 'Batch Accumulation', - validators=[ - validators.NumberRange(min=1), - validators.Optional(), - ], - tooltip=("Accumulate gradients over multiple batches (useful when you " - "need a bigger batch size for training but it doesn't fit in memory).") - ) - - # Solver types - - solver_type = utils.forms.SelectField( - 'Solver type', - choices=[ - ('SGD', 'Stochastic gradient descent (SGD)'), - ('NESTEROV', "Nesterov's accelerated gradient (NAG)"), - ('ADAGRAD', 'Adaptive gradient (AdaGrad)'), - ('RMSPROP', 'RMSprop'), - ('ADADELTA', 'AdaDelta'), - ('ADAM', 'Adam'), - ], - default='SGD', - tooltip="What type of solver will be used?", - ) - - def validate_solver_type(form, field): - fw = frameworks.get_framework_by_id(form.framework) - if fw is not None: - if not fw.supports_solver_type(field.data): - raise validators.ValidationError( - 'Solver type not supported by this framework') - - # Additional settings specific to selected solver - - rms_decay = utils.forms.FloatField( - 'RMS decay value', - default=0.99, - validators=[ - validators.NumberRange(min=0), - ], - tooltip=("If the gradient updates results in oscillations the gradient is reduced " - "by times 1-rms_decay. Otherwise it will be increased by rms_decay.") - ) - - # Learning rate - - learning_rate = utils.forms.MultiFloatField( - 'Base Learning Rate', - default=0.01, - validators=[ - utils.forms.MultiNumberRange(min=0), - ], - tooltip=("Affects how quickly the network learns. If you are getting " - "NaN for your loss, you probably need to lower this value.") - ) - - lr_policy = wtforms.SelectField( - 'Policy', - choices=[ - ('fixed', 'Fixed'), - ('step', 'Step Down'), - ('multistep', 'Step Down (arbitrary steps)'), - ('exp', 'Exponential Decay'), - ('inv', 'Inverse Decay'), - ('poly', 'Polynomial Decay'), - ('sigmoid', 'Sigmoid Decay'), - ], - default='step' - ) - - lr_step_size = wtforms.FloatField('Step Size', default=33) - lr_step_gamma = wtforms.FloatField('Gamma', default=0.1) - lr_multistep_values = wtforms.StringField('Step Values', default="50,85") - - def validate_lr_multistep_values(form, field): - if form.lr_policy.data == 'multistep': - for value in field.data.split(','): - try: - float(value) - except ValueError: - raise validators.ValidationError('invalid value') - - lr_multistep_gamma = wtforms.FloatField('Gamma', default=0.5) - lr_exp_gamma = wtforms.FloatField('Gamma', default=0.95) - lr_inv_gamma = wtforms.FloatField('Gamma', default=0.1) - lr_inv_power = wtforms.FloatField('Power', default=0.5) - lr_poly_power = wtforms.FloatField('Power', default=3) - lr_sigmoid_step = wtforms.FloatField('Step', default=50) - lr_sigmoid_gamma = wtforms.FloatField('Gamma', default=0.1) - - # Network - - # Use a SelectField instead of a HiddenField so that the default value - # is used when nothing is provided (through the REST API) - method = wtforms.SelectField( - u'Network type', - choices=[ - ('standard', 'Standard network'), - ('previous', 'Previous network'), - ('pretrained', 'Pretrained network'), - ('custom', 'Custom network'), - ], - default='standard', - ) - - # framework - hidden field, set by Javascript to the selected framework ID - framework = wtforms.HiddenField( - 'framework', - validators=[ - validators.AnyOf( - [fw.get_id() for fw in frameworks.get_frameworks()], - message='The framework you choose is not currently supported.' - ) - ], - default=frameworks.get_frameworks()[0].get_id() - ) - - # The options for this get set in the view (since they are dependent on the data type) - standard_networks = wtforms.RadioField( - 'Standard Networks', - validators=[ - validate_required_iff(method='standard'), - ], - ) - - previous_networks = wtforms.RadioField( - 'Previous Networks', - choices=[], - validators=[ - validate_required_iff(method='previous'), - selection_exists_in_choices, - ], - ) - - pretrained_networks = wtforms.RadioField( - 'Pretrained Networks', - choices=[], - validators=[ - validate_required_iff(method='pretrained'), - selection_exists_in_choices, - ], - ) - - custom_network = utils.forms.TextAreaField( - 'Custom Network', - validators=[ - validate_required_iff(method='custom'), - validate_NetParameter, - ], - ) - - custom_network_snapshot = utils.forms.TextField( - 'Pretrained model(s)', - tooltip=("Paths to pretrained model files, separated by '%s'. " - "Only edit this field if you understand how fine-tuning " - "works in caffe or torch." % os.path.pathsep) - ) - - def validate_custom_network_snapshot(form, field): - if form.method.data == 'custom': - for filename in field.data.strip().split(os.path.pathsep): - if filename and not os.path.exists(filename): - raise validators.ValidationError('File "%s" does not exist' % filename) - - # Select one of several GPUs - select_gpu = wtforms.RadioField( - 'Select which GPU you would like to use', - choices=[('next', 'Next available')] + [( - index, - '#%s - %s (%s memory)' % ( - index, - get_device(index).name, - sizeof_fmt( - get_nvml_info(index)['memory']['total'] - if get_nvml_info(index) and 'memory' in get_nvml_info(index) - else get_device(index).totalGlobalMem) - ), - ) for index in config_value('gpu_list').split(',') if index], - default='next', - ) - - # Select N of several GPUs - select_gpus = utils.forms.SelectMultipleField( - 'Select which GPU[s] you would like to use', - choices=[( - index, - '#%s - %s (%s memory)' % ( - index, - get_device(index).name, - sizeof_fmt( - get_nvml_info(index)['memory']['total'] - if get_nvml_info(index) and 'memory' in get_nvml_info(index) - else get_device(index).totalGlobalMem) - ), - ) for index in config_value('gpu_list').split(',') if index], - tooltip="The job won't start until all of the chosen GPUs are available." - ) - - # XXX For testing - # The Flask test framework can't handle SelectMultipleFields correctly - select_gpus_list = wtforms.StringField('Select which GPU[s] you would like to use (comma separated)') - - def validate_select_gpus(form, field): - if form.select_gpus_list.data: - field.data = form.select_gpus_list.data.split(',') - - # Use next available N GPUs - select_gpu_count = wtforms.IntegerField('Use this many GPUs (next available)', - validators=[ - validators.NumberRange(min=1, max=len( - config_value('gpu_list').split(','))) - ], - default=1, - ) - - def validate_select_gpu_count(form, field): - if field.data is None: - if form.select_gpus.data: - # Make this field optional - field.errors[:] = [] - raise validators.StopValidation() - - model_name = utils.forms.StringField('Model Name', - validators=[ - validators.DataRequired() - ], - tooltip="An identifier, later used to refer to this model in the Application." - ) - - group_name = utils.forms.StringField('Group Name', - tooltip="An optional group name for organization on the main page." - ) - - # allows shuffling data during training (for frameworks that support this, as indicated by - # their Framework.can_shuffle_data() method) - shuffle = utils.forms.BooleanField('Shuffle Train Data', - default=True, - tooltip='For every epoch, shuffle the data before training.' - ) +# Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved. + +import os + +import flask +from flask.ext.wtf import Form +import wtforms +from wtforms import validators + +from digits.config import config_value +from digits.device_query import get_device, get_nvml_info +from digits import utils +from digits.utils import sizeof_fmt +from digits.utils.forms import validate_required_iff +from digits import frameworks + + +class ModelForm(Form): + # Methods + + def selection_exists_in_choices(form, field): + found = False + for choice in field.choices: + if choice[0] == field.data: + found = True + if not found: + raise validators.ValidationError("Selected job doesn't exist. Maybe it was deleted by another user.") + + def validate_NetParameter(form, field): + fw = frameworks.get_framework_by_id(form['framework'].data) + try: + # below function raises a BadNetworkException in case of validation error + fw.validate_network(field.data) + except frameworks.errors.BadNetworkError as e: + raise validators.ValidationError('Bad network: %s' % e.message) + + def validate_file_exists(form, field): + from_client = bool(form.python_layer_from_client.data) + + filename = '' + if not from_client and field.type == 'StringField': + filename = field.data + + if filename == '': + return + + if not os.path.isfile(filename): + raise validators.ValidationError('Server side file, %s, does not exist.' % filename) + + def validate_py_ext(form, field): + from_client = bool(form.python_layer_from_client.data) + + filename = '' + if from_client and field.type == 'FileField': + filename = flask.request.files[field.name].filename + elif not from_client and field.type == 'StringField': + filename = field.data + + if filename == '': + return + + (root, ext) = os.path.splitext(filename) + if ext != '.py' and ext != '.pyc': + raise validators.ValidationError('Python file, %s, needs .py or .pyc extension.' % filename) + + # Fields + + # The options for this get set in the view (since they are dynamic) + dataset = utils.forms.SelectField( + 'Select Dataset', + choices=[], + tooltip="Choose the dataset to use for this model." + ) + + python_layer_from_client = utils.forms.BooleanField( + u'Use client-side file', + default=False, + ) + + python_layer_client_file = utils.forms.FileField( + u'Client-side file', + validators=[ + validate_py_ext + ], + tooltip="Choose a Python file on the client containing layer definitions." + ) + python_layer_server_file = utils.forms.StringField( + u'Server-side file', + validators=[ + validate_file_exists, + validate_py_ext + ], + tooltip="Choose a Python file on the server containing layer definitions." + ) + + train_epochs = utils.forms.IntegerField( + 'Training epochs', + validators=[ + validators.NumberRange(min=1) + ], + default=30, + tooltip="How many passes through the training data?" + ) + + snapshot_interval = utils.forms.FloatField( + 'Snapshot interval (in epochs)', + default=1, + validators=[ + validators.NumberRange(min=0), + ], + tooltip="How many epochs of training between taking a snapshot?" + ) + + val_interval = utils.forms.FloatField( + 'Validation interval (in epochs)', + default=1, + validators=[ + validators.NumberRange(min=0) + ], + tooltip="How many epochs of training between running through one pass of the validation data?" + ) + + random_seed = utils.forms.IntegerField( + 'Random seed', + validators=[ + validators.NumberRange(min=0), + validators.Optional(), + ], + tooltip=('If you provide a random seed, then back-to-back runs with ' + 'the same model and dataset should give identical results.') + ) + + batch_size = utils.forms.MultiIntegerField( + 'Batch size', + validators=[ + utils.forms.MultiNumberRange(min=1), + utils.forms.MultiOptional(), + ], + tooltip="How many images to process at once. If blank, values are used from the network definition." + ) + + batch_accumulation = utils.forms.IntegerField( + 'Batch Accumulation', + validators=[ + validators.NumberRange(min=1), + validators.Optional(), + ], + tooltip=("Accumulate gradients over multiple batches (useful when you " + "need a bigger batch size for training but it doesn't fit in memory).") + ) + + # Solver types + + solver_type = utils.forms.SelectField( + 'Solver type', + choices=[ + ('SGD', 'Stochastic gradient descent (SGD)'), + ('NESTEROV', "Nesterov's accelerated gradient (NAG)"), + ('ADAGRAD', 'Adaptive gradient (AdaGrad)'), + ('RMSPROP', 'RMSprop'), + ('ADADELTA', 'AdaDelta'), + ('ADAM', 'Adam'), + ], + default='SGD', + tooltip="What type of solver will be used?", + ) + + def validate_solver_type(form, field): + fw = frameworks.get_framework_by_id(form.framework) + if fw is not None: + if not fw.supports_solver_type(field.data): + raise validators.ValidationError( + 'Solver type not supported by this framework') + + # Additional settings specific to selected solver + + rms_decay = utils.forms.FloatField( + 'RMS decay value', + default=0.99, + validators=[ + validators.NumberRange(min=0), + ], + tooltip=("If the gradient updates results in oscillations the gradient is reduced " + "by times 1-rms_decay. Otherwise it will be increased by rms_decay.") + ) + + # Learning rate + + learning_rate = utils.forms.MultiFloatField( + 'Base Learning Rate', + default=0.01, + validators=[ + utils.forms.MultiNumberRange(min=0), + ], + tooltip=("Affects how quickly the network learns. If you are getting " + "NaN for your loss, you probably need to lower this value.") + ) + + lr_policy = wtforms.SelectField( + 'Policy', + choices=[ + ('fixed', 'Fixed'), + ('step', 'Step Down'), + ('multistep', 'Step Down (arbitrary steps)'), + ('exp', 'Exponential Decay'), + ('inv', 'Inverse Decay'), + ('poly', 'Polynomial Decay'), + ('sigmoid', 'Sigmoid Decay'), + ], + default='step' + ) + + lr_step_size = wtforms.FloatField('Step Size', default=33) + lr_step_gamma = wtforms.FloatField('Gamma', default=0.1) + lr_multistep_values = wtforms.StringField('Step Values', default="50,85") + + def validate_lr_multistep_values(form, field): + if form.lr_policy.data == 'multistep': + for value in field.data.split(','): + try: + float(value) + except ValueError: + raise validators.ValidationError('invalid value') + + lr_multistep_gamma = wtforms.FloatField('Gamma', default=0.5) + lr_exp_gamma = wtforms.FloatField('Gamma', default=0.95) + lr_inv_gamma = wtforms.FloatField('Gamma', default=0.1) + lr_inv_power = wtforms.FloatField('Power', default=0.5) + lr_poly_power = wtforms.FloatField('Power', default=3) + lr_sigmoid_step = wtforms.FloatField('Step', default=50) + lr_sigmoid_gamma = wtforms.FloatField('Gamma', default=0.1) + + # Network + + # Use a SelectField instead of a HiddenField so that the default value + # is used when nothing is provided (through the REST API) + method = wtforms.SelectField( + u'Network type', + choices=[ + ('standard', 'Standard network'), + ('previous', 'Previous network'), + ('pretrained', 'Pretrained network'), + ('custom', 'Custom network'), + ], + default='standard', + ) + + # framework - hidden field, set by Javascript to the selected framework ID + framework = wtforms.HiddenField( + 'framework', + validators=[ + validators.AnyOf( + [fw.get_id() for fw in frameworks.get_frameworks()], + message='The framework you choose is not currently supported.' + ) + ], + default=frameworks.get_frameworks()[0].get_id() + ) + + # The options for this get set in the view (since they are dependent on the data type) + standard_networks = wtforms.RadioField( + 'Standard Networks', + validators=[ + validate_required_iff(method='standard'), + ], + ) + + previous_networks = wtforms.RadioField( + 'Previous Networks', + choices=[], + validators=[ + validate_required_iff(method='previous'), + selection_exists_in_choices, + ], + ) + + pretrained_networks = wtforms.RadioField( + 'Pretrained Networks', + choices=[], + validators=[ + validate_required_iff(method='pretrained'), + selection_exists_in_choices, + ], + ) + + custom_network = utils.forms.TextAreaField( + 'Custom Network', + validators=[ + validate_required_iff(method='custom'), + validate_NetParameter, + ], + ) + + custom_network_snapshot = utils.forms.TextField( + 'Pretrained model(s)', + tooltip=("Paths to pretrained model files, separated by '%s'. " + "Only edit this field if you understand how fine-tuning " + "works in caffe or torch." % os.path.pathsep) + ) + + def validate_custom_network_snapshot(form, field): + if form.method.data == 'custom': + for filename in field.data.strip().split(os.path.pathsep): + if filename and not os.path.exists(filename): + raise validators.ValidationError('File "%s" does not exist' % filename) + + # Select one of several GPUs + select_gpu = wtforms.RadioField( + 'Select which GPU you would like to use', + choices=[('next', 'Next available')] + [(index, '#%s - %s (%s memory)' % ( + index, + get_device(index).name, + sizeof_fmt( + get_nvml_info(index)['memory']['total'] + if get_nvml_info(index) and 'memory' in get_nvml_info(index) + else get_device(index).totalGlobalMem) + ),) for index in config_value('gpu_list').split(',') if index], default='next', ) + # slurm options + slurm_selector = utils.forms.BooleanField('Use slurm?') + slurm_time_limit = utils.forms.IntegerField('Task time limit', tooltip='in minutes', default=0, ) + slurm_cpu_count = utils.forms.IntegerField('Use this many cores', validators=[ + validators.NumberRange(min=1, max=128) + ], default=8, ) + slurm_mem = utils.forms.IntegerField('Use this much memory (GB)', validators=[ + validators.NumberRange(min=1, max=128) + ], default=30, ) + + # Select N of several GPUs + select_gpus = utils.forms.SelectMultipleField( + 'Select which GPU[s] you would like to use', + choices=[(index, '#%s - %s (%s memory)' % ( + index, get_device(index).name, + sizeof_fmt(get_nvml_info(index)['memory']['total'] + if get_nvml_info(index) and 'memory' in get_nvml_info(index) + else get_device(index).totalGlobalMem)), + ) for index in config_value('gpu_list').split(',') if index], + tooltip="The job won't start until all of the chosen GPUs are available.") + + # XXX For testing + # The Flask test framework can't handle SelectMultipleFields correctly + select_gpus_list = wtforms.StringField('Select which GPU[s] you would like to use (comma separated)') + + def validate_select_gpus(form, field): + if form.select_gpus_list.data: + field.data = form.select_gpus_list.data.split(',') + + # Use next available N GPUs + select_gpu_count_slurm = wtforms.IntegerField('Use this many GPUs (next available)', + validators=[ + validators.NumberRange(min=1) + ], + default=1, + ) + + select_gpu_count = wtforms.IntegerField('Use this many GPUs (next available)', + validators=[ + validators.NumberRange(min=1, max=len( + config_value('gpu_list').split(','))) + ], + default=1, + ) + + def validate_select_gpu_count(form, field): + if field.data is None: + if form.select_gpus.data: + # Make this field optional + field.errors[:] = [] + raise validators.StopValidation() + + model_name = utils.forms.StringField('Model Name', + validators=[ + validators.DataRequired() + ], + tooltip="An identifier, later used to refer to this model in the Application." + ) + + group_name = utils.forms.StringField('Group Name', + tooltip="An optional group name for organization on the main page." + ) + + # allows shuffling data during training (for frameworks that support this, as indicated by + # their Framework.can_shuffle_data() method) + shuffle = utils.forms.BooleanField('Shuffle Train Data', + default=True, + tooltip='For every epoch, shuffle the data before training.' + ) diff --git a/digits/model/images/classification/test_views.py b/digits/model/images/classification/test_views.py old mode 100644 new mode 100755 diff --git a/digits/model/images/classification/views.py b/digits/model/images/classification/views.py old mode 100644 new mode 100755 index 9d5d12012..2e7d3329a --- a/digits/model/images/classification/views.py +++ b/digits/model/images/classification/views.py @@ -22,7 +22,6 @@ from digits.utils.forms import fill_form_if_cloned, save_form_to_job from digits.utils.routing import request_wants_json, job_from_request from digits.webapp import scheduler - blueprint = flask.Blueprint(__name__, __name__) """ @@ -64,18 +63,23 @@ def new(): """ Return a form for a new ImageClassificationModelJob """ + # cluster_factory.cluster_factory.get_running_systems() + form = ImageClassificationModelForm() form.dataset.choices = get_datasets() form.standard_networks.choices = get_standard_networks() form.standard_networks.default = get_default_standard_network() form.previous_networks.choices = get_previous_networks() form.pretrained_networks.choices = get_pretrained_networks() + if config_value("system_type") == 'slurm': + form.select_gpu_count.validators = form.select_gpu_count_slurm.validators prev_network_snapshots = get_previous_network_snapshots() # Is there a request to clone a job with ?clone= fill_form_if_cloned(form) - + if config_value('system_type') == 'slurm': + config_value('caffe')['multi_gpu'] = True return flask.render_template('models/images/classification/new.html', form=form, frameworks=frameworks.get_frameworks(), @@ -83,6 +87,7 @@ def new(): previous_networks_fullinfo=get_previous_networks_fulldetails(), pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(), multi_gpu=config_value('caffe')['multi_gpu'], + system_type=config_value('system_type'), ) @@ -101,7 +106,8 @@ def create(): form.standard_networks.default = get_default_standard_network() form.previous_networks.choices = get_previous_networks() form.pretrained_networks.choices = get_pretrained_networks() - + if config_value("system_type") == 'slurm': + form.select_gpu_count.validators = form.select_gpu_count_slurm.validators prev_network_snapshots = get_previous_network_snapshots() # Is there a request to clone a job with ?clone= @@ -118,6 +124,7 @@ def create(): previous_networks_fullinfo=get_previous_networks_fulldetails(), pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(), multi_gpu=config_value('caffe')['multi_gpu'], + system_type=config_value('system_type'), ), 400 datasetJob = scheduler.get_job(form.dataset.data) @@ -303,6 +310,9 @@ def create(): rms_decay=form.rms_decay.data, shuffle=form.shuffle.data, data_aug=data_aug, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) @@ -333,6 +343,7 @@ def show(job, related_jobs=None): """ Called from digits.model.views.models_show() """ + return flask.render_template( 'models/images/classification/show.html', job=job, @@ -369,6 +380,7 @@ def classify_one(): image_path = flask.request.form['image_path'] elif 'image_file' in flask.request.files and flask.request.files['image_file']: outfile = tempfile.mkstemp(suffix='.png') + flask.request.files['image_file'].save(outfile[1]) image_path = outfile[1] os.close(outfile[0]) diff --git a/digits/model/images/generic/views.py b/digits/model/images/generic/views.py old mode 100644 new mode 100755 index 01a9270d7..715e37ae4 --- a/digits/model/images/generic/views.py +++ b/digits/model/images/generic/views.py @@ -39,6 +39,8 @@ def new(extension_id=None): form.pretrained_networks.choices = get_pretrained_networks() prev_network_snapshots = get_previous_network_snapshots() + if config_value("system_type") == 'slurm': + form.select_gpu_count.validators = form.select_gpu_count_slurm.validators # Is there a request to clone a job with ?clone= fill_form_if_cloned(form) @@ -52,6 +54,7 @@ def new(extension_id=None): previous_networks_fullinfo=get_previous_networks_fulldetails(), pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(), multi_gpu=config_value('caffe')['multi_gpu'], + system_type=config_value('system_type'), ) @@ -73,6 +76,8 @@ def create(extension_id=None): form.pretrained_networks.choices = get_pretrained_networks() prev_network_snapshots = get_previous_network_snapshots() + if config_value("system_type") == 'slurm': + form.select_gpu_count.validators = form.select_gpu_count_slurm.validators # Is there a request to clone a job with ?clone= fill_form_if_cloned(form) @@ -91,6 +96,7 @@ def create(extension_id=None): previous_networks_fullinfo=get_previous_networks_fulldetails(), pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(), multi_gpu=config_value('caffe')['multi_gpu'], + system_type=config_value('system_type'), ), 400 datasetJob = scheduler.get_job(form.dataset.data) @@ -262,10 +268,14 @@ def create(extension_id=None): rms_decay=form.rms_decay.data, shuffle=form.shuffle.data, data_aug=data_aug, + time_limit=form.slurm_time_limit.data, + s_cpu_count=form.slurm_cpu_count.data, + s_mem=form.slurm_mem.data, ) ) # Save form data with the job so we can easily clone it later. + save_form_to_job(job, form) jobs.append(job) diff --git a/digits/model/tasks/caffe_train.py b/digits/model/tasks/caffe_train.py old mode 100644 new mode 100755 index 6512e2f5a..247f7b234 --- a/digits/model/tasks/caffe_train.py +++ b/digits/model/tasks/caffe_train.py @@ -111,7 +111,7 @@ def upgrade_network(network): @staticmethod def set_mode(gpu): if gpu is not None: - caffe.set_device(gpu) + caffe.set_device(int(gpu)) caffe.set_mode_gpu() else: caffe.set_mode_cpu() diff --git a/digits/model/tasks/torch_train.py b/digits/model/tasks/torch_train.py old mode 100644 new mode 100755 index 64cedfa25..be4d7eae5 --- a/digits/model/tasks/torch_train.py +++ b/digits/model/tasks/torch_train.py @@ -603,7 +603,8 @@ def infer_one_image(self, image, snapshot_epoch=None, layers=None, gpu=None): if gpu is not None: args.append('--type=cuda') # make only the selected GPU visible - env['CUDA_VISIBLE_DEVICES'] = subprocess_visible_devices([gpu]) + if config_value('system_type') == 'interactive': + env['CUDA_VISIBLE_DEVICES'] = subprocess_visible_devices([gpu]) else: args.append('--type=float') diff --git a/digits/model/tasks/train.py b/digits/model/tasks/train.py old mode 100644 new mode 100755 index 5c4201df9..f80450d51 --- a/digits/model/tasks/train.py +++ b/digits/model/tasks/train.py @@ -64,6 +64,11 @@ def __init__(self, job, dataset, train_epochs, snapshot_interval, learning_rate, self.framework_id = kwargs.pop('framework_id', None) self.data_aug = kwargs.pop('data_aug', None) + # slurm job options + + self.time_limit = kwargs.pop('time_limit', None) + self.s_cpu_count = kwargs.pop('s_cpu_count', None) + self.s_mem = kwargs.pop('s_mem', None) super(TrainTask, self).__init__(job_dir=job.dir(), **kwargs) self.pickver_task_train = PICKLE_VERSION @@ -125,6 +130,9 @@ def __setstate__(self, state): @override def offer_resources(self, resources): + # gives non interactive tasks as many gpus as they want + if self.system_type != 'interactive': + return {'gpus': [(str(i), 1) for i in range(0, self.gpu_count)]} if 'gpus' not in resources: return None if not resources['gpus']: @@ -162,12 +170,13 @@ def offer_resources(self, resources): def before_run(self): # start a thread which sends SocketIO updates about hardware utilization gpus = None - if 'gpus' in self.current_resources: - gpus = [identifier for (identifier, value) in self.current_resources['gpus']] + if self.system_type == 'interactive': + if 'gpus' in self.current_resources: + gpus = [identifier for (identifier, value) in self.current_resources['gpus']] - self._hw_socketio_thread = gevent.spawn( - self.hw_socketio_updater, - gpus) + self._hw_socketio_thread = gevent.spawn( + self.hw_socketio_updater, + gpus) def hw_socketio_updater(self, gpus): """ diff --git a/digits/pretrained_model/test_views.py b/digits/pretrained_model/test_views.py old mode 100644 new mode 100755 index a0f103b0f..baed325a8 --- a/digits/pretrained_model/test_views.py +++ b/digits/pretrained_model/test_views.py @@ -16,6 +16,7 @@ # May be too short on a slow system + TIMEOUT_DATASET = 45 TIMEOUT_MODEL = 60 diff --git a/digits/scheduler.py b/digits/scheduler.py old mode 100644 new mode 100755 index a706bb0bc..bf6af8ed5 --- a/digits/scheduler.py +++ b/digits/scheduler.py @@ -102,8 +102,8 @@ def __init__(self, gpu_list=None, verbose=False): """ self.jobs = OrderedDict() self.verbose = verbose - # Keeps track of resource usage + self.resources = { # TODO: break this into CPU cores, memory usage, IO usage, etc. 'parse_folder_task_pool': [Resource()], @@ -397,12 +397,14 @@ def start_this_job(job): # try to start the task if task.ready_to_queue(): requested_resources = task.offer_resources(self.resources) - if requested_resources is None: + if requested_resources is None and task.system_type == 'interactive': task.status = Status.WAIT else: - if self.reserve_resources(task, requested_resources): - gevent.spawn(self.run_task, - task, requested_resources) + # This stops digits from repeatedly spawning slurm jobs when waiting + if task.system_type == 'interactive' or task.status != Status.WAIT: + if self.reserve_resources(task, requested_resources): + gevent.spawn(self.run_task, + task, requested_resources) elif task.status == Status.RUN: # job is not done alldone = False @@ -467,19 +469,21 @@ def reserve_resources(self, task, resources): """ try: # reserve resources - for resource_type, requests in resources.iteritems(): - for identifier, value in requests: - found = False - for resource in self.resources[resource_type]: - if resource.identifier == identifier: - resource.allocate(task, value) - self.emit_gpus_available() - found = True - break - if not found: - raise RuntimeError('Resource "%s" with identifier="%s" not found' % ( - resource_type, identifier)) - task.current_resources = resources + # no need to do this for non interactive systems as they should be running their own scheduling + if task.system_type == 'interactive': + for resource_type, requests in resources.iteritems(): + for identifier, value in requests: + found = False + for resource in self.resources[resource_type]: + if resource.identifier == identifier: + resource.allocate(task, value) + self.emit_gpus_available() + found = True + break + if not found: + raise RuntimeError('Resource "%s" with identifier="%s" not found' % ( + resource_type, identifier)) + task.current_resources = resources return True except Exception as e: self.task_error(task, e) diff --git a/digits/task.py b/digits/task.py old mode 100644 new mode 100755 index be36ac370..1c1b49d2a --- a/digits/task.py +++ b/digits/task.py @@ -16,7 +16,9 @@ from .config import config_value from .status import Status, StatusCls import digits.log +from digits.extensions.cluster_management import cluster_factory +# from digits.extensions.cluster_management.slurm import pack_slurm_args # NOTE: Increment this every time the pickled version changes PICKLE_VERSION = 1 @@ -29,9 +31,19 @@ class Task(StatusCls): """ def __init__(self, job_dir, parents=None): + # Detect if slurm is available + # This should be moved to a better location that contains system infor + # as this should contain only job based information + # TODO add other systems to the detection + self.system_type = config_value('system_type') + super(Task, self).__init__() self.pickver_task = PICKLE_VERSION + # vars for slurm job details + self.node = "" + self.job_num = "" + self.job_dir = job_dir self.job_id = os.path.basename(job_dir) @@ -101,6 +113,8 @@ def on_status_update(self): 'css': self.status.css, 'show': (self.status in [Status.RUN, Status.ERROR]), 'running': self.status.is_running(), + 'node': self.node, + 'job_num': self.job_num } with app.app_context(): message['html'] = flask.render_template('status_updates.html', @@ -180,6 +194,7 @@ def before_run(self): pass def run(self, resources): + """ Execute the task @@ -198,14 +213,38 @@ def run(self, resources): args = [str(x) for x in args] self.logger.info('%s task started.' % self.name()) - self.status = Status.RUN - unrecognized_output = [] - import sys env['PYTHONPATH'] = os.pathsep.join(['.', self.job_dir, env.get('PYTHONPATH', '')] + sys.path) - - # https://docs.python.org/2/library/subprocess.html#converting-argument-sequence + # SLURM PROCESSING + print type(self) + if self.system_type != 'interactive': + cf = cluster_factory.cluster_factory() + cm = cf.get_cluster_manager() + # Check for arguments if missing fill with defaults + try: + self.gpu_count + except: + self.gpu_count = 1 + try: + self.time_limit + except: + self.time_limit = 0 + try: + self.s_cpu_count + except: + self.s_cpu_count = 0 + try: + self.s_mem + except: + self.s_mem = 0 + # Create a slurm command + args = cm.pack_args(args, self.time_limit, + self.s_cpu_count, self.s_mem, self.gpu_count, type(self)) + self.status = Status.WAIT + else: + self.status = Status.RUN + # del args[len(args) - 1] if platform.system() == 'Windows': args = ' '.join(args) self.logger.info('Task subprocess args: "{}"'.format(args)) @@ -215,7 +254,7 @@ def run(self, resources): self.p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - cwd=self.job_dir, + cwd=os.path.relpath(self.job_dir), close_fds=False if platform.system() == 'Windows' else True, env=env, ) @@ -227,10 +266,16 @@ def run(self, resources): for line in utils.nonblocking_readlines(self.p.stdout): if self.aborted.is_set(): if sigterm_time is None: + print "graceful shutdown" # Attempt graceful shutdown - self.p.send_signal(signal.SIGTERM) - sigterm_time = time.time() - self.status = Status.ABORT + if self.job_num: + cm.kill_task(self.job_num) + sigterm_time = time.time() + self.status = Status.ABORT + else: + self.p.send_signal(signal.SIGTERM) + sigterm_time = time.time() + self.status = Status.ABORT break if line is not None: @@ -238,17 +283,27 @@ def run(self, resources): line = line.strip() if line: + print line + + if not self.job_num and line.find('allocation') > 1: + jobNums = [int(s) for s in line.split() if s.isdigit()] + self.job_num = str(jobNums[0]) + # self.on_status_update() + if self.status != Status.RUN and line.find('Granted') >= 0: + self.status = Status.RUN if not self.process_output(line): self.logger.warning('%s unrecognized output: %s' % (self.name(), line.strip())) unrecognized_output.append(line) else: time.sleep(0.05) if sigterm_time is not None and (time.time() - sigterm_time > sigterm_timeout): + print "sending sigterm" self.p.send_signal(signal.SIGKILL) self.logger.warning('Sent SIGKILL to task "%s"' % self.name()) time.sleep(0.1) time.sleep(0.01) except: + print "exception" self.p.terminate() self.after_run() raise @@ -288,7 +343,11 @@ def preprocess_output_digits(self, line): """ # NOTE: This must change when the logging format changes # YYYY-MM-DD HH:MM:SS [LEVEL] message + if line.find('allocation') > 1: + jobNums = [int(s) for s in line.split() if s.isdigit()] + self.job_num = str(jobNums[0]) match = re.match(r'(\S{10} \S{8}) \[(\w+)\s*\] (.*)$', line) + if match: timestr = match.group(1) timestamp = time.mktime(time.strptime(timestr, digits.log.DATE_FORMAT)) @@ -309,6 +368,7 @@ def preprocess_output_digits(self, line): return (None, None, None) def process_output(self, line): + """ Process a line of output from the task Returns True if the output was able to be processed diff --git a/digits/templates/datasets/generic/new.html b/digits/templates/datasets/generic/new.html old mode 100644 new mode 100755 index 9cd54b9cc..3000b4ce0 --- a/digits/templates/datasets/generic/new.html +++ b/digits/templates/datasets/generic/new.html @@ -32,6 +32,7 @@

New {{ extension_title }} Dataset

+ {% include '/partials/slurm_options.html' %}
{{form.dsopts_feature_encoding.label}} @@ -70,6 +71,7 @@

New {{ extension_title }} Dataset

{{ form.dataset_name.label }} {{ form.dataset_name(class='form-control') }}
+
diff --git a/digits/templates/datasets/images/classification/new.html b/digits/templates/datasets/images/classification/new.html old mode 100644 new mode 100755 index bbea816b7..3d89592d5 --- a/digits/templates/datasets/images/classification/new.html +++ b/digits/templates/datasets/images/classification/new.html @@ -375,6 +375,7 @@

New Image Classification Dataset

+ {% include '/partials/slurm_options.html' %}
{{ form.backend.label }} diff --git a/digits/templates/datasets/images/classification/show.html b/digits/templates/datasets/images/classification/show.html old mode 100644 new mode 100755 diff --git a/digits/templates/datasets/images/generic/new.html b/digits/templates/datasets/images/generic/new.html old mode 100644 new mode 100755 index d0dc68ba1..7087175af --- a/digits/templates/datasets/images/generic/new.html +++ b/digits/templates/datasets/images/generic/new.html @@ -97,6 +97,7 @@

New Image Dataset

+ {% include '/partials/slurm_options.html' %}
{{ form.group_name.label }} diff --git a/digits/templates/job.html b/digits/templates/job.html old mode 100644 new mode 100755 index 305a79c00..7b337d9ba --- a/digits/templates/job.html +++ b/digits/templates/job.html @@ -1,331 +1,338 @@ -{# Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved. #} - -{% from "helper.html" import print_flashes, print_exception %} - -{% extends "layout.html" %} - -{% block title %} -{{job.name()}} -{% endblock %} - -{% block head %} - {% if job.status.is_running() %} - {% with namespace = "/jobs" %} - {% set room = job.id() %} - {% include 'socketio.html' %} - {% endwith %} - - {% endif %} -{% endblock %} - -{% block nav %} -
  • {{job.job_type()}}
  • -{% endblock %} - -{% block content %} -
    -
    -
    -
    -

    {{ job.name() }}

    - {% if job | has_permission('edit') %} - - {% endif %} -
    - -
    - {% if job.username %} - Owner: {{job.username}} - {% endif %} -
    - {% if job.form_data is defined %} - Clone Job - {% else %} - Clone Job - {% endif %} - Abort Job - Delete Job -
    -
    - -
    - -{{ print_flashes() }} - -
    - -
    - {% block job_content %} - {% endblock %} -
    - - -
    -
    -
    -
    -

    - Job Status - {{job.status.name}} -

    -
    -
    -
    -
    - {% with updates = job.status_history %} - {% include "status_updates.html" %} - {% endwith %} -
    -
    - {% for task in job.tasks %} -
    - -
    -
    - -
    - {% with updates = task.status_history %} - {% if task.status == 'E' %} - {% set exception = task.exception %} - {% set traceback = task.traceback %} - {% endif %} - {% include "status_updates.html" %} - {% endwith %} -
    -
    -
    -
    - {% endfor %} -
    -
    -
    -
    -
    - - - - - {% if related_jobs is not none and related_jobs|length > 0 %} -
    -
    -

    - Related jobs -

    -
    -
    -
    -
    - - {% for r_job in related_jobs %} - {% if prev_job_model != r_job.job_type() %} -

    {{r_job.job_type()}}

    - {% set prev_job_model = r_job.job_type() %} - {% endif %} -
    - -
    -
    -
    - {% with updates = r_job.status_history %} - {% include "status_updates.html" %} - {% endwith %} -
    -
    -
    -
    - {% endfor %} -
    -
    -
    -
    - {% endif %} - -
    -
    -

    Notes

    -
    -
    -
    -

    {{ job.notes() }}

    - {% if job | has_permission('edit') %} - - {% endif %} -
    - - -
    - -
    -
    - -
    -
    - {% block job_content_details %} - {% endblock %} -
    - -{% endblock %} - +{# Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved. #} + +{% from "helper.html" import print_flashes, print_exception %} + +{% extends "layout.html" %} + +{% block title %} +{{job.name()}} +{% endblock %} + +{% block head %} + {% if job.status.is_running() %} + {% with namespace = "/jobs" %} + {% set room = job.id() %} + {% include 'socketio.html' %} + {% endwith %} + + {% endif %} +{% endblock %} + +{% block nav %} +
  • {{job.job_type()}}
  • +{% endblock %} + +{% block content %} +
    +
    + +
    + + +
    +

    {{ job.name() }}

    + {% if job | has_permission('edit') %} + + {% endif %} +
    + +
    + {% if job.username %} + Owner: {{job.username}} + {% endif %} +
    + {% if job.form_data is defined %} + Clone Job + {% else %} + Clone Job + {% endif %} + Abort Job + Delete Job +
    +
    + +
    + +{{ print_flashes() }} + +
    + +
    + {% block job_content %} + {% endblock %} +
    + + +
    +
    +
    +
    +

    + Job Status + {{job.status.name}} +

    +
    +
    +
    +
    + {% with updates = job.status_history %} + {% include "status_updates.html" %} + {% endwith %} +
    +
    + {% for task in job.tasks %} +
    + + {%if task.job_num %}Slurm job number: {{task.job_num}} {% endif %} +
    +
    + +
    + {% with updates = task.status_history %} + {% if task.status == 'E' %} + {% set exception = task.exception %} + {% set traceback = task.traceback %} + {% endif %} + {% include "status_updates.html" %} + {% endwith %} +
    +
    +
    +
    + {% endfor %} +
    +
    +
    +
    +
    + + + + + {% if related_jobs is not none and related_jobs|length > 0 %} +
    +
    +

    + Related jobs +

    +
    +
    +
    +
    + + {% for r_job in related_jobs %} + {% if prev_job_model != r_job.job_type() %} +

    {{r_job.job_type()}}

    + {% set prev_job_model = r_job.job_type() %} + {% endif %} +
    + +
    +
    + +
    + {% with updates = r_job.status_history %} + {% include "status_updates.html" %} + {% endwith %} +
    +
    +
    +
    + {% endfor %} +
    +
    +
    +
    + {% endif %} + +
    +
    +

    Notes

    +
    +
    +
    +

    {{ job.notes() }}

    + {% if job | has_permission('edit') %} + + {% endif %} +
    + + +
    + +
    +
    + +
    +
    + {% block job_content_details %} + {% endblock %} +
    + +{% endblock %} diff --git a/digits/templates/layout.html b/digits/templates/layout.html old mode 100644 new mode 100755 index 7157da232..fb7be5527 --- a/digits/templates/layout.html +++ b/digits/templates/layout.html @@ -50,6 +50,13 @@ {% endif %}

    + {% if system_type %} +
  • + +
  • + {% endif %}
  • New Model
  • -{% endblock %} - -{% block content %} - - - - -
    - {{ form.hidden_tag() }} - - {{ print_errors(form) }} - -
    -
    -
    -
    - {{form.dataset.label}} - {{form.dataset.tooltip}} - {{form.dataset(class='form-control', size=5)}} -
    -
    -
    - -
    -
    -

    Python Layers

    - {{form.python_layer_from_client.explanation(file='/models/python_layer_explanation.html')}} -
    -
    - {{form.python_layer_client_file.label}} - {{form.python_layer_client_file.tooltip}} - {{form.python_layer_client_file(class='form-control')}} -
    -
    - {{form.python_layer_server_file.label}} - {{form.python_layer_server_file.tooltip}} - {{form.python_layer_server_file(class='form-control autocomplete_path')}} -
    - -
    -
    -
    - - -
    -
    -

    Solver Options

    - - - - -
    - {{form.train_epochs.label}} - {{form.train_epochs.tooltip}} - {{form.train_epochs(class='form-control')}} -
    -
    - {{form.snapshot_interval.label}} - {{form.snapshot_interval.tooltip}} - {{form.snapshot_interval(class='form-control')}} -
    -
    - {{form.val_interval.label}} - {{form.val_interval.tooltip}} - {{form.val_interval(class='form-control')}} -
    - {# TODO: neat progress bar #} -
    - {{form.random_seed.label}} - {{form.random_seed.tooltip}} - {{form.random_seed(class='form-control', placeholder='[none]')}} -
    -
    - {{form.batch_size.label}} - {{form.batch_size.tooltip}} - - {{form.batch_size.small_text}} - - {{form.batch_size(class='form-control', placeholder='[network defaults]')}} -
    - -
    - {{form.solver_type.label}} - {{form.solver_type.tooltip}} - {{form.solver_type(class='form-control')}} -
    -
    - {{form.rms_decay.label}} - {{form.rms_decay.tooltip}} - {{form.rms_decay(class='form-control')}} -
    -
    - {{form.learning_rate.label}} - {{form.learning_rate.tooltip}} - - {{form.learning_rate.small_text}} - - {{form.learning_rate(class='form-control learning-rate-option')}} -
    - -

    - -

    - - - -
    -
    - -
    -
    -

    Data Transformations

    -
    - {{form.use_mean.label}} - {{form.use_mean.tooltip}} - {{form.use_mean(class='form-control')}} -
    -
    - {{form.crop_size.label}} - {{form.crop_size.tooltip}} - {{form.crop_size(class='form-control', placeholder='none')}} -
    -
    - -
    -
    - -
    - -
    -
    - - - -
    -
    {{form.framework(class='form-control')}}
    -
    - {% include "models/images/classification/partials/new/network_tab_standard.html" %} -
    -
    - {% include "models/images/classification/partials/new/network_tab_previous.html" %} -
    -
    - {% include "models/images/classification/partials/new/network_tab_pretrained.html" %} -
    - - -
    - -
    - -
    - -
    - {{form.custom_network.label}} - {{form.custom_network.explanation(file='/models/images/classification/custom_network_explanation.html')}} -
    {{form.custom_network(class='form-control', rows=10)}}
    - Visualize -
    -
    -
    - - - -
    - {{form.custom_network_snapshot.label}} - {{form.custom_network_snapshot.tooltip}} - {{form.custom_network_snapshot(class='form-control autocomplete_path')}} -
    -
    -
    - {{ form.method(style="display:none;") }} - -
    -
    - -
    -
    - {% if form.select_gpu.choices|length > 2 and not multi_gpu %} -
    - {{form.select_gpu.label.text}}
    - {% for choice in form.select_gpu %} -
    - -
    - {% endfor %} -
    - {% endif %} - {% if form.select_gpus.choices| length > 1 and multi_gpu %} -
    - {{form.select_gpu_count.label}} - {{form.select_gpu_count(class='form-control')}} -
    -

    or

    -
    - {{form.select_gpus.label}} - {{form.select_gpus.tooltip}} - {{form.select_gpus(class='form-control', size=4)}} -
    - - {% endif %} -
    - {{form.group_name.label}} - {{form.group_name.tooltip}} - {{form.group_name(class='form-control')}} -
    -
    - {{form.model_name.label}} - {{form.model_name.tooltip}} - {{form.model_name(class='form-control')}} -
    - -
    -
    - -
    - - - -{% endblock %} + +{# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. #} + +{% from "helper.html" import print_flashes %} +{% from "helper.html" import print_errors %} +{% from "helper.html" import mark_errors %} + +{% extends "layout.html" %} + +{% block title %} +New Image Classification Model +{% endblock %} + +{% block nav %} +
  • New Model
  • +{% endblock %} + +{% block content %} + + + + +
    + {{ form.hidden_tag() }} + + {{ print_errors(form) }} + +
    +
    +
    +
    + {{form.dataset.label}} + {{form.dataset.tooltip}} + {{form.dataset(class='form-control', size=5)}} +
    +
    +
    + +
    +
    +

    Python Layers

    + {{form.python_layer_from_client.explanation(file='/models/python_layer_explanation.html')}} +
    +
    + {{form.python_layer_client_file.label}} + {{form.python_layer_client_file.tooltip}} + {{form.python_layer_client_file(class='form-control')}} +
    +
    + {{form.python_layer_server_file.label}} + {{form.python_layer_server_file.tooltip}} + {{form.python_layer_server_file(class='form-control autocomplete_path')}} +
    + +
    +
    +
    + + +
    +
    +

    Solver Options

    + + + + +
    + {{form.train_epochs.label}} + {{form.train_epochs.tooltip}} + {{form.train_epochs(class='form-control')}} +
    +
    + {{form.snapshot_interval.label}} + {{form.snapshot_interval.tooltip}} + {{form.snapshot_interval(class='form-control')}} +
    +
    + {{form.val_interval.label}} + {{form.val_interval.tooltip}} + {{form.val_interval(class='form-control')}} +
    + {# TODO: neat progress bar #} +
    + {{form.random_seed.label}} + {{form.random_seed.tooltip}} + {{form.random_seed(class='form-control', placeholder='[none]')}} +
    +
    + {{form.batch_size.label}} + {{form.batch_size.tooltip}} + + {{form.batch_size.small_text}} + + {{form.batch_size(class='form-control', placeholder='[network defaults]')}} +
    + +
    + {{form.solver_type.label}} + {{form.solver_type.tooltip}} + {{form.solver_type(class='form-control')}} +
    +
    + {{form.rms_decay.label}} + {{form.rms_decay.tooltip}} + {{form.rms_decay(class='form-control')}} +
    +
    + {{form.learning_rate.label}} + {{form.learning_rate.tooltip}} + + {{form.learning_rate.small_text}} + + {{form.learning_rate(class='form-control learning-rate-option')}} +
    + +

    + +

    + + + +
    +
    + +
    +
    +

    Data Transformations

    +
    + {{form.use_mean.label}} + {{form.use_mean.tooltip}} + {{form.use_mean(class='form-control')}} +
    +
    + {{form.crop_size.label}} + {{form.crop_size.tooltip}} + {{form.crop_size(class='form-control', placeholder='none')}} +
    +
    + +
    +
    + +
    + +
    +
    + + + +
    +
    {{form.framework(class='form-control')}}
    +
    + {% include "models/images/classification/partials/new/network_tab_standard.html" %} +
    +
    + {% include "models/images/classification/partials/new/network_tab_previous.html" %} +
    +
    + {% include "models/images/classification/partials/new/network_tab_pretrained.html" %} +
    + + +
    + +
    + +
    + +
    + {{form.custom_network.label}} + {{form.custom_network.explanation(file='/models/images/classification/custom_network_explanation.html')}} +
    {{form.custom_network(class='form-control', rows=10)}}
    + Visualize +
    +
    +
    + + + +
    + {{form.custom_network_snapshot.label}} + {{form.custom_network_snapshot.tooltip}} + {{form.custom_network_snapshot(class='form-control autocomplete_path')}} +
    +
    +
    + {{ form.method(style="display:none;") }} + +
    +
    + {% if system_type == 'slurm' %} + +
    +
    + {{form.slurm_time_limit.label}} + {{form.slurm_time_limit.tooltip}} + {{form.slurm_time_limit(class='form-control')}} +
    +
    + {{form.slurm_cpu_count.label}} + {{form.slurm_cpu_count.tooltip}} + {{form.slurm_cpu_count(class='form-control')}} +
    +
    + {{form.slurm_mem.label}} + {{form.slurm_mem.tooltip}} + {{form.slurm_mem(class='form-control')}} +
    +
    + {% endif %} + +
    + +
    + {% if form.select_gpu.choices|length > 2 and not multi_gpu %} +
    + {{form.select_gpu.label.text}}
    + {% for choice in form.select_gpu %} +
    + +
    + {% endfor %} +
    + {% endif %} + +
    + {{form.select_gpu_count.label}} + {{form.select_gpu_count(class='form-control')}} +
    + + {% if form.select_gpu.choices|length > 1 and multi_gpu and system_type != 'slurm' %} +

    or

    +
    + {{form.select_gpus.label}} + {{form.select_gpus.tooltip}} + {{form.select_gpus(class='form-control', size=4)}} +
    + {% endif %} + + + +
    + {{form.group_name.label}} + {{form.group_name.tooltip}} + {{form.group_name(class='form-control')}} +
    +
    + {{form.model_name.label}} + {{form.model_name.tooltip}} + {{form.model_name(class='form-control')}} +
    + +
    + +
    + +
    + + + +{% endblock %} diff --git a/digits/templates/models/images/classification/show.html b/digits/templates/models/images/classification/show.html old mode 100644 new mode 100755 index da2f3f971..915f6a468 --- a/digits/templates/models/images/classification/show.html +++ b/digits/templates/models/images/classification/show.html @@ -10,9 +10,11 @@ {% set task = job.train_task() %}
    +
    +
    Job Directory
    {{ job.dir() }}
    Disk Size
    diff --git a/digits/templates/models/images/generic/new.html b/digits/templates/models/images/generic/new.html old mode 100644 new mode 100755 index 8a99ec1c4..dfb00c531 --- a/digits/templates/models/images/generic/new.html +++ b/digits/templates/models/images/generic/new.html @@ -1,680 +1,708 @@ -{# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. #} - -{% from "helper.html" import print_flashes %} -{% from "helper.html" import print_errors %} -{% from "helper.html" import mark_errors %} - -{% extends "layout.html" %} - -{% block title %} -New {% if extension_title %}{{ extension_title }}{% else %}Image{% endif %} Model -{% endblock %} - -{% block nav %} -
  • New Model
  • -{% endblock %} - -{% block content %} - - - - -
    - {{ form.hidden_tag() }} - - {{ print_errors(form) }} - -
    -
    -
    -
    - {{form.dataset.label}} - {{form.dataset.tooltip}} - {{form.dataset(class='form-control', size=5)}} -
    -
    -
    -
    -
    -

    Python Layers

    - {{form.python_layer_from_client.explanation(file='/models/python_layer_explanation.html')}} -
    -
    - {{form.python_layer_client_file.label}} - {{form.python_layer_client_file.tooltip}} - {{form.python_layer_client_file(class='form-control')}} -
    -
    - {{form.python_layer_server_file.label}} - {{form.python_layer_server_file.tooltip}} - {{form.python_layer_server_file(class='form-control autocomplete_path')}} -
    - -
    -
    -
    - -
    -
    -

    Solver Options

    - - - - - -
    - {{form.train_epochs.label}} - {{form.train_epochs.tooltip}} - {{form.train_epochs(class='form-control')}} -
    -
    - {{form.snapshot_interval.label}} - {{form.snapshot_interval.tooltip}} - {{form.snapshot_interval(class='form-control')}} -
    -
    - {{form.val_interval.label}} - {{form.val_interval.tooltip}} - {{form.val_interval(class='form-control')}} -
    - {# TODO: neat progress bar #} -
    - {{form.random_seed.label}} - {{form.random_seed.tooltip}} - {{form.random_seed(class='form-control', placeholder='[none]')}} -
    -
    - {{form.batch_size.label}} - {{form.batch_size.tooltip}} - - {{form.batch_size.small_text}} - - {{form.batch_size(class='form-control', placeholder='[network defaults]')}} -
    - -
    - {{form.solver_type.label}} - {{form.solver_type.tooltip}} - {{form.solver_type(class='form-control')}} -
    -
    - {{form.rms_decay.label}} - {{form.rms_decay.tooltip}} - {{form.rms_decay(class='form-control')}} -
    -
    - {{form.learning_rate.label}} - {{form.learning_rate.tooltip}} - - {{form.learning_rate.small_text}} - - {{form.learning_rate(class='form-control learning-rate-option')}} -
    - -

    - -

    - - - -
    -
    -
    -
    -

    Data Transformations

    -
    - {{form.use_mean.label}} - {{form.use_mean.tooltip}} - {{form.use_mean(class='form-control')}} -
    -
    - {{form.crop_size.label}} - {{form.crop_size.tooltip}} - {{form.crop_size(class='form-control', placeholder='none')}} -
    -
    - -
    -
    - -
    - -
    -
    - - - -
    -
    - {% include "models/images/generic/partials/new/network_tab_standard.html" %} -
    -
    - {% include "models/images/generic/partials/new/network_tab_previous.html" %} -
    -
    - {% include "models/images/generic/partials/new/network_tab_pretrained.html" %} -
    - -
    - - -
    - {{form.custom_network.label}} - {{form.custom_network.explanation(file='/models/images/generic/custom_network_explanation.html')}} -
    {{form.custom_network(class='form-control', rows=10)}}
    - Visualize -
    -
    - - -
    - {{form.custom_network_snapshot.label}} - {{form.custom_network_snapshot.tooltip}} - {{form.custom_network_snapshot(class='form-control autocomplete_path')}} -
    -
    -
    - {{ form.method(style="display:none;") }} - -
    -
    - -
    -
    - {% if form.select_gpu.choices|length > 2 and not multi_gpu %} -
    - {{form.select_gpu.label.text}}
    - {% for choice in form.select_gpu %} -
    - -
    - {% endfor %} -
    - {% endif %} - {% if form.select_gpus.choices| length > 1 and multi_gpu %} -
    - {{form.select_gpu_count.label}} - {{form.select_gpu_count(class='form-control')}} -
    -

    or

    -
    - {{form.select_gpus.label}} - {{form.select_gpus.tooltip}} - {{form.select_gpus(class='form-control', size=4)}} -
    - - {% endif %} -
    - {{form.group_name.label}} - {{form.group_name.tooltip}} - {{form.group_name(class='form-control')}} -
    -
    - {{form.model_name.label}} - {{form.model_name.tooltip}} - {{form.model_name(class='form-control')}} -
    - -
    -
    - -
    - -{% endblock %} +{# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. #} + +{% from "helper.html" import print_flashes %} +{% from "helper.html" import print_errors %} +{% from "helper.html" import mark_errors %} + +{% extends "layout.html" %} + +{% block title %} +New {% if extension_title %}{{ extension_title }}{% else %}Image{% endif %} Model +{% endblock %} + +{% block nav %} +
  • New Model
  • +{% endblock %} + +{% block content %} + + + + +
    + {{ form.hidden_tag() }} + + {{ print_errors(form) }} + +
    +
    +
    +
    + {{form.dataset.label}} + {{form.dataset.tooltip}} + {{form.dataset(class='form-control', size=5)}} +
    +
    +
    +
    +
    +

    Python Layers

    + {{form.python_layer_from_client.explanation(file='/models/python_layer_explanation.html')}} +
    +
    + {{form.python_layer_client_file.label}} + {{form.python_layer_client_file.tooltip}} + {{form.python_layer_client_file(class='form-control')}} +
    +
    + {{form.python_layer_server_file.label}} + {{form.python_layer_server_file.tooltip}} + {{form.python_layer_server_file(class='form-control autocomplete_path')}} +
    + +
    +
    +
    + +
    +
    +

    Solver Options

    + + + + + +
    + {{form.train_epochs.label}} + {{form.train_epochs.tooltip}} + {{form.train_epochs(class='form-control')}} +
    +
    + {{form.snapshot_interval.label}} + {{form.snapshot_interval.tooltip}} + {{form.snapshot_interval(class='form-control')}} +
    +
    + {{form.val_interval.label}} + {{form.val_interval.tooltip}} + {{form.val_interval(class='form-control')}} +
    + {# TODO: neat progress bar #} +
    + {{form.random_seed.label}} + {{form.random_seed.tooltip}} + {{form.random_seed(class='form-control', placeholder='[none]')}} +
    +
    + {{form.batch_size.label}} + {{form.batch_size.tooltip}} + + {{form.batch_size.small_text}} + + {{form.batch_size(class='form-control', placeholder='[network defaults]')}} +
    + +
    + {{form.solver_type.label}} + {{form.solver_type.tooltip}} + {{form.solver_type(class='form-control')}} +
    +
    + {{form.rms_decay.label}} + {{form.rms_decay.tooltip}} + {{form.rms_decay(class='form-control')}} +
    +
    + {{form.learning_rate.label}} + {{form.learning_rate.tooltip}} + + {{form.learning_rate.small_text}} + + {{form.learning_rate(class='form-control learning-rate-option')}} +
    + +

    + +

    + + + +
    +
    +
    +
    +

    Data Transformations

    +
    + {{form.use_mean.label}} + {{form.use_mean.tooltip}} + {{form.use_mean(class='form-control')}} +
    +
    + {{form.crop_size.label}} + {{form.crop_size.tooltip}} + {{form.crop_size(class='form-control', placeholder='none')}} +
    +
    + +
    +
    + +
    + +
    +
    + + + +
    +
    + {% include "models/images/generic/partials/new/network_tab_standard.html" %} +
    +
    + {% include "models/images/generic/partials/new/network_tab_previous.html" %} +
    +
    + {% include "models/images/generic/partials/new/network_tab_pretrained.html" %} +
    + +
    + + +
    + {{form.custom_network.label}} + {{form.custom_network.explanation(file='/models/images/generic/custom_network_explanation.html')}} +
    {{form.custom_network(class='form-control', rows=10)}}
    + Visualize +
    +
    + + +
    + {{form.custom_network_snapshot.label}} + {{form.custom_network_snapshot.tooltip}} + {{form.custom_network_snapshot(class='form-control autocomplete_path')}} +
    +
    +
    + {{ form.method(style="display:none;") }} + +
    +
    + {% if system_type == 'slurm' %} + +
    +
    + {{form.slurm_time_limit.label}} + {{form.slurm_time_limit.tooltip}} + {{form.slurm_time_limit(class='form-control')}} +
    +
    + {{form.slurm_cpu_count.label}} + {{form.slurm_cpu_count.tooltip}} + {{form.slurm_cpu_count(class='form-control')}} +
    +
    + {{form.slurm_mem.label}} + {{form.slurm_mem.tooltip}} + {{form.slurm_mem(class='form-control')}} +
    +
    + {% endif %} + + +
    +
    + {% if form.select_gpu.choices|length > 2 and not multi_gpu %} +
    + {{form.select_gpu.label.text}}
    + {% for choice in form.select_gpu %} +
    + +
    + {% endfor %} +
    + {% endif %} + +
    + {{form.select_gpu_count.label}} + {{form.select_gpu_count(class='form-control')}} +
    + {% if form.select_gpu.choices|length > 1 and multi_gpu and system_type != 'slurm' %} + +

    or

    +
    + {{form.select_gpus.label}} + {{form.select_gpus.tooltip}} + {{form.select_gpus(class='form-control', size=4)}} +
    + {% endif %} + + + +
    + {{form.group_name.label}} + {{form.group_name.tooltip}} + {{form.group_name(class='form-control')}} +
    +
    + {{form.model_name.label}} + {{form.model_name.tooltip}} + {{form.model_name(class='form-control')}} +
    + +
    +
    + +
    + +{% endblock %} + + + diff --git a/digits/templates/partials/slurm_options.html b/digits/templates/partials/slurm_options.html new file mode 100755 index 000000000..29210d724 --- /dev/null +++ b/digits/templates/partials/slurm_options.html @@ -0,0 +1,20 @@ +{% if system_type == 'slurm' %} + +
    +
    + {{form.slurm_time_limit.label}} + {{form.slurm_time_limit.tooltip}} + {{form.slurm_time_limit(class='form-control')}} +
    +
    + {{form.slurm_cpu_count.label}} + {{form.slurm_cpu_count.tooltip}} + {{form.slurm_cpu_count(class='form-control')}} +
    +
    + {{form.slurm_mem.label}} + {{form.slurm_mem.tooltip}} + {{form.slurm_mem(class='form-control')}} +
    +
    +{% endif %} \ No newline at end of file diff --git a/digits/test_views.py b/digits/test_views.py old mode 100644 new mode 100755 index e964a3acb..444af0e29 --- a/digits/test_views.py +++ b/digits/test_views.py @@ -98,6 +98,7 @@ def job_info_html(cls, job_id, job_type='jobs'): """ url = '/%s/%s' % (job_type, job_id) rv = cls.app.get(url) + print url assert rv.status_code == 200, 'Cannot get info from job %s. "%s" returned %s' % (job_id, url, rv.status_code) return rv.data diff --git a/digits/tools/inference.py b/digits/tools/inference.py index ce026ddb2..8b915d64f 100755 --- a/digits/tools/inference.py +++ b/digits/tools/inference.py @@ -9,6 +9,7 @@ import PIL.Image import os import sys + try: from cStringIO import StringIO except ImportError: @@ -21,13 +22,12 @@ from digits.inference.errors import InferenceError # noqa from digits.job import Job # noqa from digits.utils.lmdbreader import DbReader # noqa - +import os # Import digits.config before caffe to set the path import caffe_pb2 # noqa logger = logging.getLogger('digits.tools.inference') - """ Perform inference on a list of images using the specified model """ @@ -46,15 +46,20 @@ def infer(input_list, """ Perform inference on a list of images using the specified model """ + # Get the gpu that slurm as set for the task + print gpu + logger.info("++++++++++++ inference.py ++++++++++++++++") + logger.info(gpu) + # if digits.config.config_value('system_type') != 'interactive': + # gpu = os.environ.get('CUDA_VISIBLE_DEVICES') + logger.info(gpu) # job directory defaults to that defined in DIGITS config if jobs_dir == 'none': jobs_dir = digits.config.config_value('jobs_dir') - # load model job model_dir = os.path.join(jobs_dir, model_id) assert os.path.isdir(model_dir), "Model dir %s does not exist" % model_dir model = Job.load(model_dir) - # load dataset job dataset_dir = os.path.join(jobs_dir, model.dataset_id) assert os.path.isdir(dataset_dir), "Dataset dir %s does not exist" % dataset_dir @@ -86,8 +91,8 @@ def infer(input_list, resize_mode = dataset.resize_mode if hasattr(dataset, 'resize_mode') else 'squash' n_input_samples = 0 # number of samples we were able to load - input_ids = [] # indices of samples within file list - input_data = [] # sample data + input_ids = [] # indices of samples within file list + input_data = [] # sample data if input_is_db: # load images from database @@ -122,6 +127,7 @@ def infer(input_list, paths = None with open(input_list) as infile: paths = infile.readlines() + # load and resize images for idx, path in enumerate(paths): path = path.strip() @@ -205,6 +211,7 @@ def infer(input_list, db.close() logger.info('Saved data to %s', db_path) + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Inference tool - DIGITS') diff --git a/digits/tools/test_create_generic_db.py b/digits/tools/test_create_generic_db.py old mode 100644 new mode 100755 diff --git a/digits/tools/torch/main.lua b/digits/tools/torch/main.lua old mode 100644 new mode 100755 diff --git a/digits/views.py b/digits/views.py old mode 100644 new mode 100755 index 6068cecc5..6f320f1d5 --- a/digits/views.py +++ b/digits/views.py @@ -18,6 +18,7 @@ from digits import dataset, extensions, model, utils, pretrained_model from digits.log import logger from digits.utils.routing import request_wants_json +from digits.extensions.cluster_management.cluster_factory import cluster_factory blueprint = flask.Blueprint(__name__, __name__) @@ -317,6 +318,7 @@ def group(): return 'Jobs regrouped.' + # Authentication/login @@ -604,6 +606,7 @@ def clone_job(clone): else: raise werkzeug.exceptions.BadRequest('Invalid job type') + # Error handling @@ -641,12 +644,14 @@ def handle_error(e): trace=trace, ), status_code + # Register this handler for all error codes # Necessary for flask<=0.10.1 for code in HTTP_STATUS_CODES: if code not in [301]: app.register_error_handler(code, handle_error) + # File serving @@ -661,6 +666,7 @@ def serve_file(path): jobs_dir = config_value('jobs_dir') return flask.send_from_directory(jobs_dir, path) + # Path Completion @@ -710,6 +716,7 @@ def extension_static(extension_type, extension_id, filename): rootdir = os.path.join(digits_root, *['extensions', 'view', extension.get_dirname(), 'static']) return flask.send_from_directory(rootdir, filename) + # SocketIO functions # /home @@ -730,6 +737,7 @@ def on_disconnect_home(): """ pass + # /jobs @@ -769,3 +777,17 @@ def on_leave_jobs(): del flask.session['room'] # print '>>> Somebody left room %s' % room leave_room(room) + + +@blueprint.route('/system/') +def change_system_type(system_type): + """ + change scheduling system + """ + print system_type + # Get the URL to redirect to after logging in + next_url = utils.routing.get_request_arg('next') or \ + flask.request.referrer or flask.url_for('.home') + cluster_factory.set_system(system_type) + response = flask.make_response(flask.redirect(next_url)) + return response diff --git a/digits/webapp.py b/digits/webapp.py old mode 100644 new mode 100755 index caea4f6dc..a904b0ab9 --- a/digits/webapp.py +++ b/digits/webapp.py @@ -13,7 +13,7 @@ from digits.utils import filesystem as fs # noqa from digits.utils.store import StoreCache # noqa import digits.scheduler # noqa - +from digits.extensions.cluster_management.cluster_factory import cluster_factory # Create Flask, Scheduler and SocketIO objects app = flask.Flask(__name__) @@ -33,6 +33,7 @@ app.jinja_env.globals['server_name'] = config_value('server_name') app.jinja_env.globals['server_version'] = digits.__version__ app.jinja_env.globals['caffe_version'] = config_value('caffe')['version'] +app.jinja_env.globals['system_types'] = cluster_factory.get_running_systems() app.jinja_env.globals['caffe_flavor'] = config_value('caffe')['flavor'] app.jinja_env.globals['dir_hash'] = fs.dir_hash( os.path.join(os.path.dirname(digits.__file__), 'static')) diff --git a/examples/classification/use_archive.py b/examples/classification/use_archive.py index 96fafada5..9e98ae9ff 100755 --- a/examples/classification/use_archive.py +++ b/examples/classification/use_archive.py @@ -95,3 +95,4 @@ def classify_with_archive(archive, image_files, batch_size=None, use_gpu=True): ) print 'Script took %f seconds.' % (time.time() - script_start_time,) + diff --git a/examples/object-detection/prepare_kitti_data.py b/examples/object-detection/prepare_kitti_data.py index f24cf9e6a..a6c5f64bc 100755 --- a/examples/object-detection/prepare_kitti_data.py +++ b/examples/object-detection/prepare_kitti_data.py @@ -196,3 +196,4 @@ def split_for_training(split_dir, train_dir, val_dir, use_symlinks=(not args.no_symlinks), ) print 'Done.' + diff --git a/examples/semantic-segmentation/net_surgery.py b/examples/semantic-segmentation/net_surgery.py index 1719a76bd..654a57769 100755 --- a/examples/semantic-segmentation/net_surgery.py +++ b/examples/semantic-segmentation/net_surgery.py @@ -63,3 +63,4 @@ def transplant(new_net, net, suffix=''): if __name__ == '__main__': generate_fcn() + diff --git a/examples/siamese/create_db.py b/examples/siamese/create_db.py index 651a22fd2..9698743c2 100755 --- a/examples/siamese/create_db.py +++ b/examples/siamese/create_db.py @@ -251,3 +251,4 @@ def _save_mean(mean, filename): ) print 'Done after %s seconds' % (time.time() - start_time,) + diff --git a/packaging/deb/templates/digits.config b/packaging/deb/templates/digits.config index 2259c89ed..a8710c639 100755 --- a/packaging/deb/templates/digits.config +++ b/packaging/deb/templates/digits.config @@ -1,67 +1,67 @@ -#!/bin/sh -set -e - -# Source debconf library -. /usr/share/debconf/confmodule - -port_is_valid() { - case "$1" in - '-1') return 0 ;; - [0-9]*) - if [ "$1" -le "65535" ] 2>/dev/null - then - return 0 - fi - ;; - esac - return 1 -} - -# Get current value for port (or default) -# Since the priority is medium, this won't show up by default -db_input medium digits/port || true -db_go -db_get digits/port -PORT="$RET" - - -if port_is_valid $PORT -then - PORT_VALID=true -else - PORT_VALID=false -fi - -# loop until we have a valid port, or the same value is entered twice -while [ "$PORT_VALID" = false ] -do - # Add error messages - if [ "$PORT_VALID" = false ] - then - db_reset digits/port-invalid - db_input critical digits/port-invalid || true - fi - - # Ask for the port - db_fset digits/port seen false - db_input critical digits/port || true - db_go - db_get digits/port - NEW_PORT="$RET" - - # Exit if value unchanged - if [ "$NEW_PORT" = "$PORT" ] - then - echo Port unchanged. Aborting. - exit 1 - fi - - # Recalculate valid/bound - PORT="$NEW_PORT" - if port_is_valid $PORT - then - PORT_VALID=true - else - PORT_VALID=false - fi -done +#!/bin/sh +set -e + +# Source debconf library +. /usr/share/debconf/confmodule + +port_is_valid() { + case "$1" in + '-1') return 0 ;; + [0-9]*) + if [ "$1" -le "65535" ] 2>/dev/null + then + return 0 + fi + ;; + esac + return 1 +} + +# Get current value for port (or default) +# Since the priority is medium, this won't show up by default +db_input medium digits/port || true +db_go +db_get digits/port +PORT="$RET" + + +if port_is_valid $PORT +then + PORT_VALID=true +else + PORT_VALID=false +fi + +# loop until we have a valid port, or the same value is entered twice +while [ "$PORT_VALID" = false ] +do + # Add error messages + if [ "$PORT_VALID" = false ] + then + db_reset digits/port-invalid + db_input critical digits/port-invalid || true + fi + + # Ask for the port + db_fset digits/port seen false + db_input critical digits/port || true + db_go + db_get digits/port + NEW_PORT="$RET" + + # Exit if value unchanged + if [ "$NEW_PORT" = "$PORT" ] + then + echo Port unchanged. Aborting. + exit 1 + fi + + # Recalculate valid/bound + PORT="$NEW_PORT" + if port_is_valid $PORT + then + PORT_VALID=true + else + PORT_VALID=false + fi +done diff --git a/packaging/deb/templates/digits.postinst b/packaging/deb/templates/digits.postinst index cff9cef8d..40d9c7f46 100755 --- a/packaging/deb/templates/digits.postinst +++ b/packaging/deb/templates/digits.postinst @@ -1,51 +1,51 @@ -#!/bin/sh -set -e - -# Read port from debconf -. /usr/share/debconf/confmodule -db_get digits/port -PORT="$RET" -db_stop - -#DEBHELPER# - -case "$1" in - configure) - # Update permissions on new directories - chown www-data /var/lib/digits/jobs /var/log/digits - - # Disable default NGINX site - NGINX_NEEDS_RELOAD=false - DEFAULT_SITE=/etc/nginx/sites-enabled/default - if [ -L "$DEFAULT_SITE" ] - then - echo WARNING - Disabling default nginx site at $DEFAULT_SITE - rm -f $DEFAULT_SITE - NGINX_NEEDS_RELOAD=true - fi - - # Enable nginx site - SITE_FILE=/etc/nginx/sites-available/digits.nginx-site - SITE_LINK=/etc/nginx/sites-enabled/digits.nginx-site - if [ "$PORT" != "-1" ] - then - sed "s/.*AUTOCONFIG port.*/ listen ${PORT}; #AUTOCONFIG port (DO NOT DELETE THIS LINE)/" $SITE_FILE -i - rm -f $SITE_LINK - ln -s $SITE_FILE $SITE_LINK - NGINX_NEEDS_RELOAD=true - fi - if [ "$NGINX_NEEDS_RELOAD" = true ] - then - service nginx reload - fi - ;; - - abort-upgrade|abort-remove|abort-deconfigure) - ;; - - *) - echo "postinst called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - +#!/bin/sh +set -e + +# Read port from debconf +. /usr/share/debconf/confmodule +db_get digits/port +PORT="$RET" +db_stop + +#DEBHELPER# + +case "$1" in + configure) + # Update permissions on new directories + chown www-data /var/lib/digits/jobs /var/log/digits + + # Disable default NGINX site + NGINX_NEEDS_RELOAD=false + DEFAULT_SITE=/etc/nginx/sites-enabled/default + if [ -L "$DEFAULT_SITE" ] + then + echo WARNING - Disabling default nginx site at $DEFAULT_SITE + rm -f $DEFAULT_SITE + NGINX_NEEDS_RELOAD=true + fi + + # Enable nginx site + SITE_FILE=/etc/nginx/sites-available/digits.nginx-site + SITE_LINK=/etc/nginx/sites-enabled/digits.nginx-site + if [ "$PORT" != "-1" ] + then + sed "s/.*AUTOCONFIG port.*/ listen ${PORT}; #AUTOCONFIG port (DO NOT DELETE THIS LINE)/" $SITE_FILE -i + rm -f $SITE_LINK + ln -s $SITE_FILE $SITE_LINK + NGINX_NEEDS_RELOAD=true + fi + if [ "$NGINX_NEEDS_RELOAD" = true ] + then + service nginx reload + fi + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + diff --git a/packaging/deb/templates/digits.postrm b/packaging/deb/templates/digits.postrm index 98edcc7b3..03f283e87 100755 --- a/packaging/deb/templates/digits.postrm +++ b/packaging/deb/templates/digits.postrm @@ -1,19 +1,19 @@ -#!/bin/sh -set -e - -case "$1" in - purge) - # Remove all job data - rm -rf /var/lib/digits/jobs /var/log/digits - ;; - - remove|upgrade|disappear|failed-upgrade|abort-install|abort-upgrade) - ;; - - *) - echo "postrm called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - -#DEBHELPER# +#!/bin/sh +set -e + +case "$1" in + purge) + # Remove all job data + rm -rf /var/lib/digits/jobs /var/log/digits + ;; + + remove|upgrade|disappear|failed-upgrade|abort-install|abort-upgrade) + ;; + + *) + echo "postrm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +#DEBHELPER# diff --git a/packaging/deb/templates/digits.preinst b/packaging/deb/templates/digits.preinst index 97588e380..5905445c9 100755 --- a/packaging/deb/templates/digits.preinst +++ b/packaging/deb/templates/digits.preinst @@ -1,22 +1,22 @@ -#!/bin/sh -set -e - -case "$1" in - install|upgrade) - if [ -d /usr/share/digits/digits/jobs ]; then - # Copy job data from previous install - mkdir -p /var/lib/digits - mv /usr/share/digits/digits/jobs /var/lib/digits/ - fi - ;; - - abort-upgrade) - ;; - - *) - echo "preinst called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - -#DEBHELPER# +#!/bin/sh +set -e + +case "$1" in + install|upgrade) + if [ -d /usr/share/digits/digits/jobs ]; then + # Copy job data from previous install + mkdir -p /var/lib/digits + mv /usr/share/digits/digits/jobs /var/lib/digits/ + fi + ;; + + abort-upgrade) + ;; + + *) + echo "preinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +#DEBHELPER# diff --git a/packaging/deb/templates/digits.prerm b/packaging/deb/templates/digits.prerm index fd4d4ff93..81e7127fe 100755 --- a/packaging/deb/templates/digits.prerm +++ b/packaging/deb/templates/digits.prerm @@ -1,25 +1,25 @@ -#!/bin/sh -set -e - -case "$1" in - remove) - # Disable NGINX site - SITE_LINK=/etc/nginx/sites-enabled/digits-nginx.site - if [ -L "$SITE_LINK" ] - then - rm -f $SITE_LINK - service nginx reload - fi - ;; - - upgrade|deconfigure|failed-upgrade) - ;; - - *) - echo "prerm called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - - -#DEBHELPER# +#!/bin/sh +set -e + +case "$1" in + remove) + # Disable NGINX site + SITE_LINK=/etc/nginx/sites-enabled/digits-nginx.site + if [ -L "$SITE_LINK" ] + then + rm -f $SITE_LINK + service nginx reload + fi + ;; + + upgrade|deconfigure|failed-upgrade) + ;; + + *) + echo "prerm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + + +#DEBHELPER# diff --git a/packaging/deb/templates/rules b/packaging/deb/templates/rules index caecb2111..f6505aa49 100755 --- a/packaging/deb/templates/rules +++ b/packaging/deb/templates/rules @@ -1,7 +1,7 @@ -#!/usr/bin/make -f - -%: - dh $@ --parallel --with=python2,systemd --buildsystem=pybuild - -override_dh_auto_test: - : +#!/usr/bin/make -f + +%: + dh $@ --parallel --with=python2,systemd --buildsystem=pybuild + +override_dh_auto_test: + : diff --git a/seg.png b/seg.png new file mode 100644 index 000000000..6d52b8861 Binary files /dev/null and b/seg.png differ diff --git a/tmp/caffe.INFO b/tmp/caffe.INFO new file mode 120000 index 000000000..5fb5b200f --- /dev/null +++ b/tmp/caffe.INFO @@ -0,0 +1 @@ +caffe.g008.zim021.log.INFO.20161209-120550.17366 \ No newline at end of file