Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions config_offload.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
RSBench:
fetch: 'git clone https://github.com/ANL-CESAR/RSBench.git;
cd RSBench; git checkout 5b795bc1e11e2c9f22f17974d971be084d96096c;
git apply ../../patches/RSBench.patch'
tags: ['proxy']
build_dir: 'RSBench'
build: {
omp-offload-clang: [ 'cd openmp-offload; make COMPILER=clang
CFLAGS="-v -Ofast -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_70 -fsave-optimization-record -save-stats";
cp rsbench ../' ],
omp-offload-ibm: [ 'cd openmp-offload; make COMPILER=ibm OPTIMIZE=yes
CFLAGS="-Xnvcc --resource-usage -Ofast -std=gnu99 -Wall -qsmp=omp -qoffload -qtgtarch=sm_70" COMPILER=ibm;
cp rsbench ../' ],
#CUDA compilation statistics are not parsed at the moment.
#The comparison will not be an apples-to-apples comparison unless
#the implementations are equivalent.
#Namely, the same regions are executed on the GPU.
cuda: [ 'cd cuda; make SM_VERSION="70";
cp rsbench ../' ]
}
copy: [ 'rsbench' ]
bin: 'rsbench'
run: 'env OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./rsbench'
input: '-s small -m event -t 1'
measure: 'Runtime. *(\d+\.\d+) .*seconds'
clean: [ 'git clean -fx' ]

XSBench:
fetch: 'git clone https://github.com/ANL-CESAR/XSBench.git;
cd XSBench; git checkout 9921857305836963dcfebb9d6006e7260dfe7eb3;
git apply ../../patches/XSBench.patch'
tags: ['proxy']
build_dir: 'XSBench'
build: {
omp-offload-clang: [ 'cd openmp-offload; make COMPILER=clang OPTIMIZE=yes
CFLAGS="-v -Ofast -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_70";
cp XSBench ../' ],
omp-offload-ibm: [ 'cd openmp-offload; make COMPILER=ibm OPTIMIZE=yes
CFLAGS="-Xnvcc --resource-usage -Ofast -std=gnu99 -Wall -qsmp=omp -qoffload -qtgtarch=sm_70" COMPILER=ibm;
cp XSBench ../' ],
#CUDA compilation statistics are not parsed at the moment.
#The comparison will not be an apples-to-apples comparison unless
#the implementations are equivalent.
#Namely, the same regions are executed on the GPU.
cuda: [ 'cd cuda; make SM_VERSION="70";
cp XSBench ../' ]
}
copy: [ 'XSBench' ]
bin: 'XSBench'
run: 'env OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./XSBench'
input: '-t 1 -m event -s small'
measure: 'Runtime. *(\d+\.\d+) .*seconds'
clean: [ 'git clean -fx' ]

miniFE:
fetch: 'git clone https://github.com/Mantevo/miniFE.git;
cd miniFE; git checkout c043cd1bafebad7fad58904625768024ddb33b73'
tags: ['proxy']
build_dir: 'miniFE'
build: {
omp-offload-clang: [ 'cd openmp45/src;
make CC=clang CXX=clang++
CFLAGS="-v -O3 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
--cuda-path=${CUDA_HOME} -ffp-contract=fast -I. -I../utils
-I../fem -DMINIFE_SCALAR=double -DMINIFE_LOCAL_ORDINAL=int
-DMINIFE_GLOBAL_ORDINAL=int -DMINIFE_CSR_MATRIX
-DMINIFE_INFO=1 -DMINIFE_KERNELS=0"; cp miniFE.x ../../'
],
omp-offload-ibm: [ 'cd openmp45/src;
make CC=xlc CXX=xlC CFLAGS="-Xnvcc --resource-usage -O3 -qsmp=omp -qoffload -qtgtarch=sm_70
--cuda-path=${CUDA_HOME} -ffp-contract=fast -I. -I../utils -I../fem -DMINIFE_SCALAR=double
-DMINIFE_LOCAL_ORDINAL=int -DMINIFE_GLOBAL_ORDINAL=int -DMINIFE_CSR_MATRIX
-DMINIFE_INFO=0 -DMINIFE_KERNELS=0"; cp miniFE.x ../../'
]
}
copy: [ 'miniFE.x' ]
bin: 'miniFE.x'
run: 'env OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./miniFE.x'
#input is way too small for a GPU application. Maybe try something larger.
input: '-nx 64'
measure: ''
clean: [ 'git clean -fx' ]

miniQMC:
fetch: 'git clone https://github.com/QMCPACK/miniqmc.git;
cd miniqmc; git checkout 5bc6dd7086f85a2905628e7d1eab739c77c88c1e;'
tags: ['proxy']
build_dir: 'miniqmc'
build: {
omp-offload-clang: ['rm -r build/; mkdir build; cd build;
cmake -DCMAKE_C_COMPILER=clang -DENABLE_OFFLOAD=1
-DUSE_OBJECT_TARGET=ON
-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_LINKER=clang++
-DCMAKE_EXE_LINKER_FLAGS=-v ..; make miniqmc; cp ./bin/miniqmc ../'],
omp-offload-ibm: ['rm -r build/; mkdir build; cd build;
cmake -DCMAKE_C_COMPILER=xlc_r -DENABLE_OFFLOAD=1
-DCMAKE_CXX_COMPILER=xlC_r -DCMAKE_LINKER=xlC
-DCMAKE_EXE_LINKER_FLAGS="-Xnvcc --resource-usage" ..;
make miniqmc; cp ./bin/miniqmc ../']
}
copy: [ 'miniqmc' ]
bin: 'miniqmc'
run: 'env OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./miniqmc'
input: '-g "2 2 1"'
measure: 'Total\s+ *(\d+\.\d+) '
clean: [ 'git reset --hard; git clean -fx' ]

108 changes: 101 additions & 7 deletions harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,64 @@
import itertools
import math

# This is a "pretty" fix for when we dump yaml files: mangled function names can be quite long, and when
# yaml uses such names as keys, it adds a "?" by default, which is not nice to read. This variable will define the maximum width.

def write_compile_logs(output_directory, stdout, stderr):
    """Persist a build's captured output under *output_directory*.

    Writes (or overwrites) two files:
      - ``build_log.out``: the build's standard output
      - ``build_log.err``: the build's standard error

    Args:
        output_directory: Existing directory in which to write the logs
            (callers create it with ``os.makedirs`` beforehand).
        stdout: Decoded standard-output text of the build.
        stderr: Decoded standard-error text of the build.
    """
    # os.path.join avoids doubled separators: callers pass directories with
    # a trailing slash (e.g. 'build_logs/<program>/<mode>/').
    with open(os.path.join(output_directory, 'build_log.out'), 'w') as f:
        f.write(stdout)
    with open(os.path.join(output_directory, 'build_log.err'), 'w') as f:
        f.write(stderr)

def get_nvlink_info(error_str):
    """Parse nvlink resource-usage lines from a build's stderr.

    When compiling with ``--resource-usage``, nvlink emits a global gmem
    line, then for each GPU function a "Function properties" line followed
    by one statistics line (registers, stack, smem, cmem[N], lmem).  A
    small three-state machine walks those lines in order.

    Args:
        error_str: stderr text of the build, possibly containing nvlink
            output interleaved with other compiler diagnostics.

    Returns:
        dict: ``{'global gmem bytes': int,
        '<func>': {'registers': int, 'stack': int, 'smem': int,
        'cmem[N]': int, 'lmem': int}, ...}`` — only the keys actually
        present in the output are filled in; empty dict if no nvlink
        output was found.
    """
    gpu_compile_info = {}
    current_func = None
    # State machine: first the global gmem line, then alternating
    # function-name / function-stats lines.
    global_info = 0
    function_name = 1
    function_stats = 2
    current_state = global_info
    for l in error_str.split("\n"):
        if "nvlink" not in l:
            continue
        if current_state == global_info:
            gmem = re.findall(r'nvlink info\s+:\s+([0-9]+) bytes gmem', l)
            if len(gmem) != 0:
                gpu_compile_info['global gmem bytes'] = int(gmem[0])
                current_state = function_name
        elif current_state == function_name:
            func_name = re.findall(r'nvlink info\s+: Function properties for \'(.*?)\':', l)
            if len(func_name) != 0:
                current_func = func_name[0]
                current_state = function_stats
                if current_func in gpu_compile_info:
                    # A repeated mangled name would silently clobber the
                    # earlier function's stats; treat it as fatal.
                    print('Should never happen')
                    print('Same function name occurs more than once in object file...')
                    print('%s' % str(current_func))
                    print('Exit ...')
                    # Non-zero exit: this is an error condition, not success.
                    sys.exit(1)
                gpu_compile_info[str(current_func)] = {}
        elif current_state == function_stats:
            # nvlink prints one stats line per function; go back to
            # looking for the next function header afterwards.
            current_state = function_name
            if 'registers' in l:
                regs = re.findall(r'nvlink info\s+:\s+used ([0-9]+) registers,', l)
                gpu_compile_info[current_func]['registers'] = int(regs[0])
            if 'stack' in l:
                stack = re.findall(r'([0-9]+) stack,', l)
                gpu_compile_info[current_func]["stack"] = int(stack[0])
            if 'bytes smem' in l:
                smem = re.findall(r'([0-9]+) bytes smem,', l)
                gpu_compile_info[current_func]["smem"] = int(smem[0])
            if 'bytes cmem' in l:
                # cmem is reported per constant bank, e.g. 'cmem[0]'.
                cmem = re.findall(r'([0-9]+) bytes cmem\[([0-9]+)\]', l)
                for v in cmem:
                    key = 'cmem[%s]' % v[1]
                    gpu_compile_info[current_func][key] = int(v[0])
            if 'bytes lmem' in l:
                lmem = re.findall(r'([0-9]+) bytes lmem', l)
                gpu_compile_info[current_func]['lmem'] = int(lmem[0])
            current_func = None
    return gpu_compile_info

def run(config, program, reps, dry):
print('Launching program', program, 'with modes', config[program]['build'])
exe = config[program]['run'] + ' ' + config[program]['input']
Expand All @@ -46,11 +104,12 @@ def run(config, program, reps, dry):
try:
if (not mode in results[program]) or (not results[program][mode]):
start = 0
results[program][mode] = []
elif len(results[program][mode]) < reps:
start = len(results[program][mode])
results[program][mode] = {}
results[program][mode]['exec_time'] = []
elif len(results[program][mode]['exec_time']) < reps:
start = len(results[program][mode]['exec_time'])
else:
print('FOUND', program, mode, 'runs', len(results[program][mode]) )
print('FOUND', program, mode, 'runs', len(results[program][mode]['exec_time']) )
continue
except Exception as e:
print('ERROR', e, 'running', program, 'mode', mode)
Expand Down Expand Up @@ -92,10 +151,10 @@ def run(config, program, reps, dry):
else:
runtime = t2 - t1

results[program][mode].append(runtime)
results[program][mode]['exec_time'].append(runtime)

with open('./results/results-%s.yaml'%(program), 'w') as f:
yaml.dump( results, f )
yaml.dump( results, f)

#def merge_nested_dict(d1, d2):
# for k in d1:
Expand Down Expand Up @@ -193,16 +252,51 @@ def compile_and_install(config, program, repo_dir, mode):
return

os.makedirs( bin_dir, exist_ok=True )

#Create directory to store build logs
build_log_dir = 'build_logs/%s/%s/' % (program, mode)

print('Making Build Log Directory...')
os.makedirs( build_log_dir, exist_ok=True )

print('Making Results Directory...')
os.makedirs( './results', exist_ok=True)

print('Clean...')
subprocess.run( config[program]['clean'], cwd=build_dir, shell=True)
print('===> Build...program %s mode %s\n%s' % (program, mode, config[program]['build'][mode]) )
try:
subprocess.run( config[program]['build'][mode], cwd=build_dir, shell=True )
p = subprocess.run( config[program]['build'][mode], cwd=build_dir, shell=True, capture_output=True)
except Exception as e:
print('building %s mode %s failed'%(program, mode), e)
write_compile_logs(build_log_dir, p.stdout.decode('utf-8'), p.stderr.decode('utf-8'))
input('key...')
sys.exit(1)

print('Storing compilation outputs')
write_compile_logs(build_log_dir, p.stdout.decode('utf-8'), p.stderr.decode('utf-8'))

print('Getting compilation statistics from nvinfo')
nv_info = get_nvlink_info(p.stderr.decode('utf-8'))

print('Storing compilation statistis')
results = { program: {} }
try:
with open('./results/results-%s.yaml'%(program), 'r') as f:
results = yaml.load(f, Loader=CLoader)
except FileNotFoundError as e:
pass

if (not mode in results[program]) or (not results[program][mode]):
results[program][mode] = {}
results[program][mode]['exec_time'] = []
results[program][mode]['compile_stats'] = {}

results[program][mode]['compile_stats'] = nv_info

with open('./results/results-%s.yaml'%(program), 'w') as f:
yaml.dump( results, f)

print('Merge stats and reports...')
merge_stats_reports( program, build_dir, mode )
if mode == 'omp':
Expand Down
38 changes: 38 additions & 0 deletions patches/RSBench.patch
Original file line number Diff line number Diff line change
@@ -1,3 +1,41 @@
diff --git a/cuda/io.cu b/cuda/io.cu
index 26f132f..f4ac121 100644
--- a/cuda/io.cu
+++ b/cuda/io.cu
@@ -194,6 +194,13 @@ Input read_CLI( int argc, char * argv[] )
else
print_CLI_error();
}
+ else if( strcmp(arg, "-t") == 0 )
+ {
+ if( ++i < argc )
+ input.nthreads = 1;
+ else
+ print_CLI_error();
+ }
else
print_CLI_error();
}
diff --git a/openmp-offload/io.c b/openmp-offload/io.c
index 1c13494..0d04e56 100644
--- a/openmp-offload/io.c
+++ b/openmp-offload/io.c
@@ -194,7 +194,14 @@ Input read_CLI( int argc, char * argv[] )
else
print_CLI_error();
}
- else
+ else if( strcmp(arg, "-t") == 0 )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this to enable multi-threaded target creation? Is an option on the number of threads correct? Shouldn't be 1 for correct program execution?

+ {
+ if( ++i < argc )
+ input.nthreads = atoi(argv[i]);
+ else
+ print_CLI_error();
+ }
+ else
print_CLI_error();
}

diff --git a/openmp-threading/io.c b/openmp-threading/io.c
index dc5201f..a179d97 100644
--- a/openmp-threading/io.c
Expand Down
22 changes: 22 additions & 0 deletions patches/XSBench.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
diff --git a/openmp-offload/io.c b/openmp-offload/io.c
index 5c981ee..c2fd0b7 100644
--- a/openmp-offload/io.c
+++ b/openmp-offload/io.c
@@ -275,8 +275,16 @@ Inputs read_CLI( int argc, char * argv[] )
{
char * arg = argv[i];

+ // This will be ignored
+ if( strcmp(arg, "-t") == 0 )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is an option on the number of threads correct? Shouldn't be 1 for correct program execution?

+ {
+ if( ++i < argc )
+ input.nthreads = atoi(argv[i]);
+ else
+ print_CLI_error();
+ }
// n_gridpoints (-g)
- if( strcmp(arg, "-g") == 0 )
+ else if( strcmp(arg, "-g") == 0 )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change to if, otherwise -t precludes -g

{
if( ++i < argc )
{