Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions config_offload.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
RSBench:
fetch: 'git clone https://github.com/ANL-CESAR/RSBench.git;
cd RSBench; git checkout 5b795bc1e11e2c9f22f17974d971be084d96096c;
git apply ../../patches/RSBench.patch'
tags: ['proxy']
build_dir: 'RSBench'
build: {
omp-offload-clang: [ 'cd openmp-offload; make COMPILER=clang
CFLAGS="-v -Ofast -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_70 -fsave-optimization-record -save-stats";
cp rsbench ../' ],
omp-offload-ibm: [ 'cd openmp-offload; make COMPILER=ibm OPTIMIZE=yes
CFLAGS="-Xnvcc --resource-usage -Ofast -std=gnu99 -Wall -qsmp=omp -qoffload -qtgtarch=sm_70" COMPILER=ibm;
cp rsbench ../' ],
#CUDA compilation statistics are not parsed at the moment.
#The comparison will not be an apples-to-apples comparison unless
#the implementations are equivalent.
#Namely, the same regions are executed on the GPU.
cuda: [ 'cd cuda; make SM_VERSION="70";
cp rsbench ../' ]
}
copy: [ 'rsbench' ]
bin: 'rsbench'
run: 'env OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./rsbench'
input: '-s small -m event -t 1'
measure: 'Runtime. *(\d+\.\d+) .*seconds'
clean: [ 'git clean -fx' ]

XSBench:
fetch: 'git clone https://github.com/ANL-CESAR/XSBench.git;
cd XSBench; git checkout 9921857305836963dcfebb9d6006e7260dfe7eb3;
git apply ../../patches/XSBench.patch'
tags: ['proxy']
build_dir: 'XSBench'
build: {
omp-offload-clang: [ 'cd openmp-offload; make COMPILER=clang OPTIMIZE=yes
CFLAGS="-v -Ofast -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_70";
cp XSBench ../' ],
omp-offload-ibm: [ 'cd openmp-offload; make COMPILER=ibm OPTIMIZE=yes
CFLAGS="-Xnvcc --resource-usage -Ofast -std=gnu99 -Wall -qsmp=omp -qoffload -qtgtarch=sm_70" COMPILER=ibm;
cp XSBench ../' ],
#CUDA compilation statistics are not parsed at the moment.
#The comparison will not be an apples-to-apples comparison unless
#the implementations are equivalent.
#Namely, the same regions are executed on the GPU.
cuda: [ 'cd cuda; make SM_VERSION="70";
cp XSBench ../' ]
}
copy: [ 'XSBench' ]
bin: 'XSBench'
run: 'env OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./XSBench'
input: '-t 1 -m event -s small'
measure: 'Runtime. *(\d+\.\d+) .*seconds'
clean: [ 'git clean -fx' ]

miniFE:
fetch: 'git clone https://github.com/Mantevo/miniFE.git;
cd miniFE; git checkout c043cd1bafebad7fad58904625768024ddb33b73'
tags: ['proxy']
build_dir: 'miniFE'
build: {
omp-offload-clang: [ 'cd openmp45/src;
make CC=clang CXX=clang++
CFLAGS="-v -O3 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
--cuda-path=${CUDA_HOME} -ffp-contract=fast -I. -I../utils
-I../fem -DMINIFE_SCALAR=double -DMINIFE_LOCAL_ORDINAL=int
-DMINIFE_GLOBAL_ORDINAL=int -DMINIFE_CSR_MATRIX
-DMINIFE_INFO=1 -DMINIFE_KERNELS=0"; cp miniFE.x ../../'
],
omp-offload-ibm: [ 'cd openmp45/src;
make CC=xlc CXX=xlC CFLAGS="-Xnvcc --resource-usage -O3 -qsmp=omp -qoffload -qtgtarch=sm_70
--cuda-path=${CUDA_HOME} -ffp-contract=fast -I. -I../utils -I../fem -DMINIFE_SCALAR=double
-DMINIFE_LOCAL_ORDINAL=int -DMINIFE_GLOBAL_ORDINAL=int -DMINIFE_CSR_MATRIX
-DMINIFE_INFO=0 -DMINIFE_KERNELS=0"; cp miniFE.x ../../'
]
}
copy: [ 'miniFE.x' ]
bin: 'miniFE.x'
run: 'env OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./miniFE.x'
#input is way too small for a GPU application. Maybe try something larger.
input: '-nx 64'
measure: ''
clean: [ 'git clean -fx' ]

miniQMC:
fetch: 'git clone https://github.com/QMCPACK/miniqmc.git;
cd miniqmc; git checkout 5bc6dd7086f85a2905628e7d1eab739c77c88c1e;'
tags: ['proxy']
build_dir: 'miniqmc'
build: {
omp-offload-clang: ['rm -r build/; mkdir build; cd build;
cmake -DCMAKE_C_COMPILER=clang -DENABLE_OFFLOAD=1
-DUSE_OBJECT_TARGET=ON
-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_LINKER=clang++
-DCMAKE_EXE_LINKER_FLAGS=-v ..; make miniqmc; cp ./bin/miniqmc ../'],
omp-offload-ibm: ['rm -r build/; mkdir build; cd build;
cmake -DCMAKE_C_COMPILER=xlc_r -DENABLE_OFFLOAD=1
-DCMAKE_CXX_COMPILER=xlC_r -DCMAKE_LINKER=xlC
-DCMAKE_EXE_LINKER_FLAGS="-Xnvcc --resource-usage" ..;
make miniqmc; cp ./bin/miniqmc ../']
}
copy: [ 'miniqmc' ]
bin: 'miniqmc'
run: 'env OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./miniqmc'
input: '-g "2 2 1"'
measure: 'Total\s+ *(\d+\.\d+) '
clean: [ 'git reset --hard; git clean -fx' ]

108 changes: 101 additions & 7 deletions harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,64 @@
import itertools
import math

# This is a "pretty" fix for when we dump yaml files: mangled function names can be quite long, and when
# yaml uses such names as keys, it adds a "?" by default, which is not nice to read. This variable will define the maximum width.

def write_compile_logs(output_directory, stdout, stderr):
    """Persist a build's captured output under *output_directory*.

    Writes (or overwrites) two files:
      - ``build_log.out``: the build's standard output
      - ``build_log.err``: the build's standard error

    Args:
        output_directory: Existing directory in which to write the logs
            (callers create it with ``os.makedirs`` beforehand).
        stdout: Decoded standard-output text of the build.
        stderr: Decoded standard-error text of the build.
    """
    # os.path.join avoids doubled separators: callers pass directories with
    # a trailing slash (e.g. 'build_logs/<program>/<mode>/').
    with open(os.path.join(output_directory, 'build_log.out'), 'w') as f:
        f.write(stdout)
    with open(os.path.join(output_directory, 'build_log.err'), 'w') as f:
        f.write(stderr)

def get_nvlink_info(error_str):
    """Parse nvlink resource-usage lines from a build's stderr.

    When compiling with ``--resource-usage``, nvlink emits a global gmem
    line, then for each GPU function a "Function properties" line followed
    by one statistics line (registers, stack, smem, cmem[N], lmem).  A
    small three-state machine walks those lines in order.

    Args:
        error_str: stderr text of the build, possibly containing nvlink
            output interleaved with other compiler diagnostics.

    Returns:
        dict: ``{'global gmem bytes': int,
        '<func>': {'registers': int, 'stack': int, 'smem': int,
        'cmem[N]': int, 'lmem': int}, ...}`` — only the keys actually
        present in the output are filled in; empty dict if no nvlink
        output was found.
    """
    gpu_compile_info = {}
    current_func = None
    # State machine: first the global gmem line, then alternating
    # function-name / function-stats lines.
    global_info = 0
    function_name = 1
    function_stats = 2
    current_state = global_info
    for l in error_str.split("\n"):
        if "nvlink" not in l:
            continue
        if current_state == global_info:
            gmem = re.findall(r'nvlink info\s+:\s+([0-9]+) bytes gmem', l)
            if len(gmem) != 0:
                gpu_compile_info['global gmem bytes'] = int(gmem[0])
                current_state = function_name
        elif current_state == function_name:
            func_name = re.findall(r'nvlink info\s+: Function properties for \'(.*?)\':', l)
            if len(func_name) != 0:
                current_func = func_name[0]
                current_state = function_stats
                if current_func in gpu_compile_info:
                    # A repeated mangled name would silently clobber the
                    # earlier function's stats; treat it as fatal.
                    print('Should never happen')
                    print('Same function name occurs more than once in object file...')
                    print('%s' % str(current_func))
                    print('Exit ...')
                    # Non-zero exit: this is an error condition, not success.
                    sys.exit(1)
                gpu_compile_info[str(current_func)] = {}
        elif current_state == function_stats:
            # nvlink prints one stats line per function; go back to
            # looking for the next function header afterwards.
            current_state = function_name
            if 'registers' in l:
                regs = re.findall(r'nvlink info\s+:\s+used ([0-9]+) registers,', l)
                gpu_compile_info[current_func]['registers'] = int(regs[0])
            if 'stack' in l:
                stack = re.findall(r'([0-9]+) stack,', l)
                gpu_compile_info[current_func]["stack"] = int(stack[0])
            if 'bytes smem' in l:
                smem = re.findall(r'([0-9]+) bytes smem,', l)
                gpu_compile_info[current_func]["smem"] = int(smem[0])
            if 'bytes cmem' in l:
                # cmem is reported per constant bank, e.g. 'cmem[0]'.
                cmem = re.findall(r'([0-9]+) bytes cmem\[([0-9]+)\]', l)
                for v in cmem:
                    key = 'cmem[%s]' % v[1]
                    gpu_compile_info[current_func][key] = int(v[0])
            if 'bytes lmem' in l:
                lmem = re.findall(r'([0-9]+) bytes lmem', l)
                gpu_compile_info[current_func]['lmem'] = int(lmem[0])
            current_func = None
    return gpu_compile_info

def run(config, program, reps, dry):
print('Launching program', program, 'with modes', config[program]['build'])
exe = config[program]['run'] + ' ' + config[program]['input']
Expand All @@ -46,11 +104,12 @@ def run(config, program, reps, dry):
try:
if (not mode in results[program]) or (not results[program][mode]):
start = 0
results[program][mode] = []
elif len(results[program][mode]) < reps:
start = len(results[program][mode])
results[program][mode] = {}
results[program][mode]['exec_time'] = []
elif len(results[program][mode]['exec_time']) < reps:
start = len(results[program][mode]['exec_time'])
else:
print('FOUND', program, mode, 'runs', len(results[program][mode]) )
print('FOUND', program, mode, 'runs', len(results[program][mode]['exec_time']) )
continue
except Exception as e:
print('ERROR', e, 'running', program, 'mode', mode)
Expand Down Expand Up @@ -92,10 +151,10 @@ def run(config, program, reps, dry):
else:
runtime = t2 - t1

results[program][mode].append(runtime)
results[program][mode]['exec_time'].append(runtime)

with open('./results/results-%s.yaml'%(program), 'w') as f:
yaml.dump( results, f )
yaml.dump( results, f)

#def merge_nested_dict(d1, d2):
# for k in d1:
Expand Down Expand Up @@ -193,16 +252,51 @@ def compile_and_install(config, program, repo_dir, mode):
return

os.makedirs( bin_dir, exist_ok=True )

#Create directory to store build logs
build_log_dir = 'build_logs/%s/%s/' % (program, mode)

print('Making Build Log Directory...')
os.makedirs( build_log_dir, exist_ok=True )

print('Making Results Directory...')
os.makedirs( './results', exist_ok=True)

print('Clean...')
subprocess.run( config[program]['clean'], cwd=build_dir, shell=True)
print('===> Build...program %s mode %s\n%s' % (program, mode, config[program]['build'][mode]) )
try:
subprocess.run( config[program]['build'][mode], cwd=build_dir, shell=True )
p = subprocess.run( config[program]['build'][mode], cwd=build_dir, shell=True, capture_output=True)
except Exception as e:
print('building %s mode %s failed'%(program, mode), e)
write_compile_logs(build_log_dir, p.stdout.decode('utf-8'), p.stderr.decode('utf-8'))
input('key...')
sys.exit(1)

print('Storing compilation outputs')
write_compile_logs(build_log_dir, p.stdout.decode('utf-8'), p.stderr.decode('utf-8'))

print('Getting compilation statistics from nvinfo')
nv_info = get_nvlink_info(p.stderr.decode('utf-8'))

print('Storing compilation statistis')
results = { program: {} }
try:
with open('./results/results-%s.yaml'%(program), 'r') as f:
results = yaml.load(f, Loader=CLoader)
except FileNotFoundError as e:
pass

if (not mode in results[program]) or (not results[program][mode]):
results[program][mode] = {}
results[program][mode]['exec_time'] = []
results[program][mode]['compile_stats'] = {}

results[program][mode]['compile_stats'] = nv_info

with open('./results/results-%s.yaml'%(program), 'w') as f:
yaml.dump( results, f)

print('Merge stats and reports...')
merge_stats_reports( program, build_dir, mode )
if mode == 'omp':
Expand Down
38 changes: 38 additions & 0 deletions patches/RSBench.patch
Original file line number Diff line number Diff line change
@@ -1,3 +1,41 @@
diff --git a/cuda/io.cu b/cuda/io.cu
index 26f132f..f4ac121 100644
--- a/cuda/io.cu
+++ b/cuda/io.cu
@@ -194,6 +194,13 @@ Input read_CLI( int argc, char * argv[] )
else
print_CLI_error();
}
+ else if( strcmp(arg, "-t") == 0 )
+ {
+ if( ++i < argc )
+ input.nthreads = 1;
+ else
+ print_CLI_error();
+ }
else
print_CLI_error();
}
diff --git a/openmp-offload/io.c b/openmp-offload/io.c
index 1c13494..0d04e56 100644
--- a/openmp-offload/io.c
+++ b/openmp-offload/io.c
@@ -194,7 +194,14 @@ Input read_CLI( int argc, char * argv[] )
else
print_CLI_error();
}
- else
+ else if( strcmp(arg, "-t") == 0 )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this to enable multi-threaded target creation? Is an option on the number of threads correct? Shouldn't be 1 for correct program execution?

+ {
+ if( ++i < argc )
+ input.nthreads = atoi(argv[i]);
+ else
+ print_CLI_error();
+ }
+ else
print_CLI_error();
}

diff --git a/openmp-threading/io.c b/openmp-threading/io.c
index dc5201f..a179d97 100644
--- a/openmp-threading/io.c
Expand Down
22 changes: 22 additions & 0 deletions patches/XSBench.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
diff --git a/openmp-offload/io.c b/openmp-offload/io.c
index 5c981ee..c2fd0b7 100644
--- a/openmp-offload/io.c
+++ b/openmp-offload/io.c
@@ -275,8 +275,16 @@ Inputs read_CLI( int argc, char * argv[] )
{
char * arg = argv[i];

+ // This will be ignored
+ if( strcmp(arg, "-t") == 0 )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is an option on the number of threads correct? Shouldn't be 1 for correct program execution?

+ {
+ if( ++i < argc )
+ input.nthreads = atoi(argv[i]);
+ else
+ print_CLI_error();
+ }
// n_gridpoints (-g)
- if( strcmp(arg, "-g") == 0 )
+ else if( strcmp(arg, "-g") == 0 )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change to if, otherwise -t precludes -g

{
if( ++i < argc )
{