Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions algoperf/workloads/criteo1tb/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,11 @@ def train_stddev(self):

@property
def max_allowed_runtime_sec(self) -> int:
return 7_703 # ~2.1 hours.
return 8_915 # ~2.4 hours.

@property
def eval_period_time_sec(self) -> int:
return 2 * 60 # 2 mins.
return 356 # approx 25 evals

def _build_input_queue(
self,
Expand Down
4 changes: 2 additions & 2 deletions algoperf/workloads/fastmri/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,11 @@ def accelerations(self):

@property
def max_allowed_runtime_sec(self) -> int:
return 4_430 # ~1.2 hours
return 2_745 # ~0.7 hours

@property
def eval_period_time_sec(self) -> int:
return 80
return 110 # approx 25 evals

@property
def step_hint(self) -> int:
Expand Down
4 changes: 2 additions & 2 deletions algoperf/workloads/imagenet_resnet/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,11 @@ def resize_size(self) -> int:

@property
def max_allowed_runtime_sec(self) -> int:
return 66_159 # ~18.4 hours
return 49_918 # ~13.8 hours

@property
def eval_period_time_sec(self) -> int:
return 510 # 8.5 minutes.
return 1_996 # approx 25 evals

def _build_dataset(
self,
Expand Down
4 changes: 2 additions & 2 deletions algoperf/workloads/imagenet_vit/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,11 @@ def eval_batch_size(self) -> int:

@property
def max_allowed_runtime_sec(self) -> int:
return 69_768 # ~19.4 hours
return 64_292 # ~17.8 hours

@property
def eval_period_time_sec(self) -> int:
return 7 * 60 # 7 mins.
return 2_571 # ~43 mins, approx 25 evals

def _build_dataset(
self,
Expand Down
4 changes: 2 additions & 2 deletions algoperf/workloads/librispeech_conformer/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,11 @@ def train_stddev(self):

@property
def max_allowed_runtime_sec(self) -> int:
return 58_015 # ~16.1 hours
return 43_680 # ~12.1 hours

@property
def eval_period_time_sec(self) -> int:
return 24 * 60
return 1747 # approx 25 evals

@property
def step_hint(self) -> int:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,11 @@ def step_hint(self) -> int:

@property
def max_allowed_runtime_sec(self) -> int:
return 44_405 # ~12.3 hours
return 36_949 # ~10.3 hours

@property
def eval_period_time_sec(self) -> int:
return 1447 # approx 25 evals

@property
def use_tanh(self) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,11 @@ def step_hint(self) -> int:

@property
def max_allowed_runtime_sec(self) -> int:
return 44_405 # ~12.3 hours
return 36_949 # 10.3 hours

@property
def eval_period_time_sec(self) -> int:
return 1447 # approx 25 evals

@property
def use_tanh(self) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions algoperf/workloads/ogbg/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,11 @@ def train_stddev(self):

@property
def max_allowed_runtime_sec(self) -> int:
return 12_011 # ~3.3 hours
return 11_303 # ~3.1 hours

@property
def eval_period_time_sec(self) -> int:
return 4 * 60
return 452 # approx 25 evals

def _build_input_queue(
self,
Expand Down
4 changes: 2 additions & 2 deletions algoperf/workloads/wmt/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,11 @@ def train_stddev(self):

@property
def max_allowed_runtime_sec(self) -> int:
return 43_336 # ~12.0 hours
return 16_114 # ~4.5 hours

@property
def eval_period_time_sec(self) -> int:
return 14 * 60
return 644

@property
def step_hint(self) -> int:
Expand Down
2 changes: 1 addition & 1 deletion docker/build_docker_images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ then
GIT_BRANCH='main' # Set default argument
fi

FRAMEWORKS=( "jax" "pythorch" "both" )
FRAMEWORKS=( "jax" "pytorch")

if [[ -n "$FRAMEWORK" ]];
then
Expand Down
1 change: 1 addition & 0 deletions scoring/performance_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
'wer',
'l1_loss',
'loss',
'ppl'
]

MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu']
Expand Down
4 changes: 3 additions & 1 deletion scoring/score_submissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ def get_summary_df(workload, workload_df, include_test_split=False):
workload_df['accumulated_submission_time'] / workload_df['global_step']
).iloc[-1][-1]

summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)

# test metrics
if include_test_split:
test_metric, test_target = scoring_utils.get_workload_metrics_and_targets(
Expand Down Expand Up @@ -157,7 +159,7 @@ def get_summary_df(workload, workload_df, include_test_split=False):
return summary_df


def get_submission_summary(df, include_test_split=True):
def get_submission_summary(df, include_test_split=False):
"""Summarizes the submission results into metric and time tables
organized by workload.
"""
Expand Down
20 changes: 20 additions & 0 deletions scoring/scoring_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,23 @@ def get_workload_metrics_and_targets(workload, split='validation'):
metric = f'test/{metric_name}'
target = workload_obj.test_target_value
return metric, target


def get_workload_stephint(workload):
  """Return the ``step_hint`` of the workload identified by ``workload``.

  Args:
    workload: Workload identifier matched against ``WORKLOAD_NAME_PATTERN``.
      Group 1 of the match is used as the key into ``WORKLOADS`` and group 2
      is appended to the workload path as the framework suffix.

  Returns:
    The ``step_hint`` attribute of the imported workload object.

  Raises:
    ValueError: If ``workload`` does not match ``WORKLOAD_NAME_PATTERN``.
    KeyError: If the parsed workload name is not a key of ``WORKLOADS``.
  """
  # Match once and fail with a readable message instead of an
  # AttributeError raised by calling .group() on None.
  name_match = re.match(WORKLOAD_NAME_PATTERN, workload)
  if name_match is None:
    raise ValueError(
      f'Workload name {workload!r} does not match WORKLOAD_NAME_PATTERN.'
    )
  workload_name = name_match.group(1)
  framework = name_match.group(2)
  workload_metadata = WORKLOADS[workload_name]

  # Extend path according to framework. The path is kept in a local so the
  # shared WORKLOADS entry is never mutated.
  workload_path = os.path.join(
    BASE_WORKLOADS_DIR,
    workload_metadata['workload_path'] + f'{framework}',
    'workload.py',
  )
  workload_obj = workloads_registry.import_workload(
    workload_path=workload_path,
    workload_class_name=workload_metadata['workload_class_name'],
    workload_init_kwargs={},
  )
  return workload_obj.step_hint
4 changes: 3 additions & 1 deletion scoring/utils/run_workloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@ def main(_):

# For each runnable workload check if there are any containers running and if not launch next container command
for workload in workloads:
run_key = prng.fold_in(rng_subkey, hash(workload))
workload_foldin = hash(workload) % 9
run_key = prng.fold_in(rng_subkey, workload_foldin)
run_seed = run_key[0] # arbitrary
base_workload_name = get_base_workload_name(workload)
wait_until_container_not_running()
Expand Down Expand Up @@ -270,6 +271,7 @@ def main(_):
'docker run -t -d -v /home/kasimbeg/data/:/data/ '
'-v /home/kasimbeg/experiment_runs/:/experiment_runs '
'-v /home/kasimbeg/experiment_runs/logs:/logs '
'-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
f'{mount_repo_flag}'
'--gpus all --ipc=host '
f'{docker_image_url} '
Expand Down
2 changes: 1 addition & 1 deletion scoring/utils/workload_metadata_external_tuning.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"dataset": "librispeech"
},
"criteo1tb": {
"max_steps": 10666,
"max_steps": 15666,
"dataset": "criteo1tb"
},
"librispeech_conformer": {
Expand Down
Loading