Skip to content
Open
140 changes: 139 additions & 1 deletion mlperf_logging/result_summarizer/result_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
import itertools
import pandas as pd
import yaml
import hashlib
import math
import operator
import uuid as uuidlib

from ..compliance_checker import mlp_compliance
from ..compliance_checker.mlp_compliance import usage_choices, rule_choices
Expand Down Expand Up @@ -691,6 +695,130 @@ def _fill_empty_benchmark_scores(
benchmark_scores[benchmark] = None


def _add_id_to_summary(summary):
"""Add private ids to the summary file based on the json in id_json_path, which is a list of sha256 ids where the position in the list is the id. If id_json_path is specified but the file does not exist, it is created from scratch.

Args:
summary (pd.DataFrame): Summary dataframe.
"""


# Code from humanhash3, which is public domain.
DEFAULT_WORDLIST = (
'ack', 'alabama', 'alanine', 'alaska', 'alpha', 'angel', 'apart', 'april',
'arizona', 'arkansas', 'artist', 'asparagus', 'aspen', 'august', 'autumn',
'avocado', 'bacon', 'bakerloo', 'batman', 'beer', 'berlin', 'beryllium',
'black', 'blossom', 'blue', 'bluebird', 'bravo', 'bulldog', 'burger',
'butter', 'california', 'carbon', 'cardinal', 'carolina', 'carpet', 'cat',
'ceiling', 'charlie', 'chicken', 'coffee', 'cola', 'cold', 'colorado',
'comet', 'connecticut', 'crazy', 'cup', 'dakota', 'december', 'delaware',
'delta', 'diet', 'don', 'double', 'early', 'earth', 'east', 'echo',
'edward', 'eight', 'eighteen', 'eleven', 'emma', 'enemy', 'equal',
'failed', 'fanta', 'fifteen', 'fillet', 'finch', 'fish', 'five', 'fix',
'floor', 'florida', 'football', 'four', 'fourteen', 'foxtrot', 'freddie',
'friend', 'fruit', 'gee', 'georgia', 'glucose', 'golf', 'green', 'grey',
'hamper', 'happy', 'harry', 'hawaii', 'helium', 'high', 'hot', 'hotel',
'hydrogen', 'idaho', 'illinois', 'india', 'indigo', 'ink', 'iowa',
'island', 'item', 'jersey', 'jig', 'johnny', 'juliet', 'july', 'jupiter',
'kansas', 'kentucky', 'kilo', 'king', 'kitten', 'lactose', 'lake', 'lamp',
'lemon', 'leopard', 'lima', 'lion', 'lithium', 'london', 'louisiana',
'low', 'magazine', 'magnesium', 'maine', 'mango', 'march', 'mars',
'maryland', 'massachusetts', 'may', 'mexico', 'michigan', 'mike',
'minnesota', 'mirror', 'mississippi', 'missouri', 'mobile', 'mockingbird',
'monkey', 'montana', 'moon', 'mountain', 'muppet', 'music', 'nebraska',
'neptune', 'network', 'nevada', 'nine', 'nineteen', 'nitrogen', 'north',
'november', 'nuts', 'october', 'ohio', 'oklahoma', 'one', 'orange',
'oranges', 'oregon', 'oscar', 'oven', 'oxygen', 'papa', 'paris', 'pasta',
'pennsylvania', 'pip', 'pizza', 'pluto', 'potato', 'princess', 'purple',
'quebec', 'queen', 'quiet', 'red', 'river', 'robert', 'robin', 'romeo',
'rugby', 'sad', 'salami', 'saturn', 'september', 'seven', 'seventeen',
'shade', 'sierra', 'single', 'sink', 'six', 'sixteen', 'skylark', 'snake',
'social', 'sodium', 'solar', 'south', 'spaghetti', 'speaker', 'spring',
'stairway', 'steak', 'stream', 'summer', 'sweet', 'table', 'tango', 'ten',
'tennessee', 'tennis', 'texas', 'thirteen', 'three', 'timing', 'triple',
'twelve', 'twenty', 'two', 'uncle', 'undress', 'uniform', 'uranus', 'utah',
'vegan', 'venus', 'vermont', 'victor', 'video', 'violet', 'virginia',
'washington', 'west', 'whiskey', 'white', 'william', 'winner', 'winter',
'wisconsin', 'wolfram', 'wyoming', 'xray', 'yankee', 'yellow', 'zebra',
'zulu')

class HumanHasher(object):

def __init__(self, wordlist=DEFAULT_WORDLIST):
self.wordlist = wordlist

def humanize_list(self, hexdigest, words=4):
# Gets a list of byte values between 0-255.
bytes_ = map(lambda x: int(x, 16),
map(''.join, zip(hexdigest[::2], hexdigest[1::2])))
# Compress an arbitrary number of bytes to `words`.
compressed = self.compress(bytes_, words)

return [str(self.wordlist[byte]) for byte in compressed]

def humanize(self, hexdigest, words=4, separator='-'):
# Map the compressed byte values through the word list.
return separator.join(self.humanize_list(hexdigest, words))

@staticmethod
def compress(bytes_, target):
bytes_list = list(bytes_)

length = len(bytes_list)
# If there are less than the target number bytes, return input bytes
if target >= length:
return bytes_

# Split `bytes` evenly into `target` segments
# Each segment hashes `seg_size` bytes, rounded down for some
seg_size = float(length) / float(target)
# Initialize `target` number of segments
segments = [0] * target
seg_num = 0

# Use a simple XOR checksum-like function for compression
for i, byte in enumerate(bytes_list):
# Divide the byte index by the segment size to assign its segment
# Floor to create a valid segment index
# Min to ensure the index is within `target`
seg_num = min(int(math.floor(i / seg_size)), target-1)
# Apply XOR to the existing segment and the byte
segments[seg_num] = operator.xor(segments[seg_num], byte)

return segments

def uuid(self, **params):
digest = str(uuidlib.uuid4()).replace('-', '')
return self.humanize(digest, **params), digest



def get_hash(row):
columns_for_hashing = [
'division',
'submitter',
'system',
'number_of_nodes',
'host_processor_model_name',
'host_processors_count',
'accelerator_model_name',
'accelerators_count',
'framework'
]
to_hash = ''.join(str(row[c]) for c in columns_for_hashing)
return hashlib.sha256(to_hash.encode('utf-8')).hexdigest()

summary['hash'] = summary.apply(get_hash, axis=1)

humanhasha = HumanHasher()

summary['id'] = summary['hash'].apply(lambda x: humanhasha.humanize(x))

return summary




def summarize_results(folder, usage, ruleset, csv_file=None, **kwargs):
"""Summarizes a set of results.

Expand Down Expand Up @@ -837,6 +965,8 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None):
return strong_scaling_summary, weak_scaling_summary, power_summary, power_weak_scaling_summary




def get_parser():
parser = argparse.ArgumentParser(
prog='mlperf_logging.result_summarizer',
Expand All @@ -857,6 +987,11 @@ def get_parser():
type=str,
choices=rule_choices(),
help='the ruleset such as 0.6.0, 0.7.0, or 1.0.0')

parser.add_argument('--generate_private_ids',
action='store_true',
help='Generate private IDs for each run.')

parser.add_argument('--werror',
action='store_true',
help='Treat warnings as errors')
Expand All @@ -874,6 +1009,7 @@ def get_parser():
'--xlsx',
type=str,
help='Exports a xlsx of the results to the path specified')


return parser

Expand Down Expand Up @@ -1042,7 +1178,9 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False):

# Sort rows by their values
summaries = summaries.sort_values(by=cols)
print(summaries)
if args.generate_private_ids:
summaries = _add_id_to_summary(summaries)

if args.csv is not None:
csv = args.csv
assert csv.endswith(".csv")
Expand Down
Loading