import os
import re
import subprocess
import time
from collections import Counter
from itertools import repeat
from os import listdir
from os.path import join
from typing import List, Optional

import pandas as pd
import parallelbar
import psutil
import ray

from opt_analysis_tools import parse_pass_analysis_exec

num_cpus = psutil.cpu_count(logical=False)

# Base `opt` invocation that loads the RemoveFunctionBody plugin and runs the
# `remove-fn-body` pass; callers append `-index <i>` plus input/output options.
opt_load_args = [
    "opt",
    "-load",
    "RemoveFunctionBodyPass/build/libRemoveFunctionBody.so",
    "-load-pass-plugin=RemoveFunctionBodyPass/build/libRemoveFunctionBody.so",
    "-passes=remove-fn-body",
]


def _pipe_through_opt(extra_args, bitcode):
    """Run `opt` with `opt_load_args + extra_args`, feeding `bitcode` on stdin.

    Returns the raw bytes written by the process. stderr is merged into
    stdout, so diagnostics and regular output arrive in the same stream.
    """
    with subprocess.Popen(
        opt_load_args + list(extra_args),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        stdin=subprocess.PIPE,
    ) as proc:
        return proc.communicate(input=bitcode)[0]


def _tmp_path_for(file_path: str, dir_name: str) -> str:
    """Return `file_path` with its parent directory component replaced.

    `a/b/c.bc` with `dir_name="_tmp"` becomes `a/_tmp/c.bc`. A bare file name
    (no `/`) is placed directly under `dir_name`.
    """
    parts = file_path.split("/")
    if len(parts) < 2:
        parts.insert(0, dir_name)
    else:
        parts[-2] = dir_name
    return "/".join(parts)


def _load_ref_data(opt: str) -> pd.DataFrame:
    """Read the default reference CSV for optimization pipeline `opt`."""
    # TODO: filename needs to be abstracted away! Currently hard-codes the
    # transformation-pass data location.
    return pd.read_csv(f"pass_runtime/transformations/{opt.lower()}_cpp.csv")


def remove_fn(i: int, src: str, dst: str):
    """Remove the body of the i-th function of `src` and write the result to `dst`.

    Inputs:
    - i: index of function whose function body is to be removed
    - src: path to source bitcode/IR file
    - dst: path to destination bitcode file

    Output: None. If `src` is not a file or the directory of `dst` does not
    exist, a message is printed and `opt` is not run.
    """
    # A bare file name has an empty dirname; treat it as the current directory.
    dst_dir = os.path.dirname(dst) or "."
    # Both paths must be valid; rejecting only when *both* are invalid would
    # let a missing source or a missing destination directory reach `opt`.
    if not (os.path.isfile(src) and os.path.exists(dst_dir)):
        print("invalid file path in either src or dst argument")
        return None
    command = opt_load_args + [
        "-index",
        f"{i}",
        src,
        "-o",
        dst,
    ]

    subprocess.run(command)


def remove_fn_bc(i: int, bc):
    """Remove the body of the i-th function of an in-memory bitcode module.

    Inputs:
    - i: index of function whose function body is to be removed
    - bc: bitcode module (binary format)

    Output:
    - modified bitcode module (binary format), or None if `i` is negative or
      the `opt` process could not be started because of an invalid argument
      (ValueError).
    """
    if i < 0:
        print("No negative index!")
        return None
    try:
        return _pipe_through_opt(["-index", f"{i}"], bc)
    except ValueError as e:
        print(e)
        return None


def get_n_functions(file_path: str):
    """Get the number of functions in the module at `file_path`.

    Runs `opt` with the remove-fn-body pass and index -1, then parses the
    first line of the output, skipping its first two characters.

    Output: the parsed integer, or -1 if the output cannot be decoded or
    parsed as an integer.
    """
    with open(file_path, mode="rb") as f:
        bc = f.read()
    try:
        output = _pipe_through_opt(["-index", "-1", "--disable-output"], bc)
        first_line = output.decode("utf-8").split("\n")[0]
        return int(first_line[2:])
    except ValueError as e:
        # Covers both a non-numeric first line and undecodable output
        # (UnicodeDecodeError is a ValueError subclass).
        print(e)
        return -1


def get_ir(bitcode_module):
    """Convert a bitcode module to textual LLVM IR.

    Input: bitcode module (binary format)
    Output: LLVM IR (string), produced by `opt -S` with index -1.
    """
    return _pipe_through_opt(["-index", "-1", "-S"], bitcode_module).decode("utf-8")


def get_outliers(file_path: str, opt: str, outlier_threshold=1, quantile=0.95):
    """Non-optimized version of get outliers.

    For every function index, the function body is removed in memory and the
    pass timings are re-measured; if the module is still an outlier, the
    removal is persisted to the temporary file so later iterations build on it.

    Inputs:
    - file_path: path to bitcode file
    - opt: O1, O2, O3, Oz,... optimization options (case-sensitive)
    - outlier_threshold: ratio of number of outlier passes over total number
      of passes
    - quantile: percentile of passes given runtime data to be considered
      outlier for every pass

    Output: tuple (number of functions removed, total number of functions,
    mean outlier fraction over all iterations). The mean is 0.0 when no
    iteration ran (module without functions or function count unavailable).
    """
    n_functions = get_n_functions(file_path=file_path)

    tmp_path = _tmp_path_for(file_path, "_tmp")
    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)

    ref_data = _load_ref_data(opt)
    groups = ref_data.groupby("pass").quantile(q=quantile)

    n_removed = 0
    src_path = file_path
    fraction_outliers = []
    for i in range(n_functions):
        with open(src_path, mode="rb") as f:
            bc = f.read()

        # delete (non-save) function ith
        tmp = remove_fn_bc(i, bc)

        # this ignores the src_path and only takes the bitcode module
        time_analysis_tmp = parse_pass_analysis_exec(src_path, True, True, opt, tmp)

        # if removed function still results in module being in outlier range,
        # delete that function because it doesn't affect the outliers
        # (minimize functions in module such that outliers are preserved)
        outlier_status, fraction_outlier = is_outlier(
            time_analysis_tmp, ref_data=groups, threshold=outlier_threshold
        )

        if outlier_status:
            remove_fn(i, src_path, tmp_path)
            n_removed += 1
            # From now on keep reducing the already-reduced module.
            src_path = tmp_path

        print(f"i: {i}. n_removed: {n_removed}. src_path={src_path}")
        fraction_outliers.append(fraction_outlier)

    # Guard the mean: range(n_functions) is empty for 0 or -1 functions.
    if fraction_outliers:
        mean_fraction = sum(fraction_outliers) / len(fraction_outliers)
    else:
        mean_fraction = 0.0
    return (n_removed, n_functions, mean_fraction)


def check_outliers(time, pass_name, ref_data, time_col="fraction_total_time"):
    """Helper function for is_outlier.

    Inputs:
    - time: time elapsed for pass_name execution
    - pass_name: name of pass to check
    - ref_data: pandas DataFrame of reference data on distributions of all
      passes, indexed by pass name
    - time_col: column name that specifies time in DataFrame

    Output: whether `time` is greater than or equal to the reference value of
    `pass_name` in column `time_col`. A pass name missing from `ref_data`
    raises KeyError.
    """
    return time >= ref_data.loc[pass_name][time_col]


def is_outlier(
    data,
    ref_data: pd.DataFrame,
    time_col="fraction_total_time",
    threshold=1,
):
    """Check whether enough passes of a module are outliers.

    Inputs:
    - data: analysis result whose "pass-exec" entry is a sequence of items
      with the time at position 0 and the pass name at position 1
    - ref_data: reference thresholds indexed by pass name
    - time_col: column of `ref_data` holding the threshold
    - threshold: fraction of passes being outliers to return True.
      range: [0,1]

    Output: tuple (fraction >= threshold, fraction), where fraction is the
    share of passes flagged by check_outliers. With no passes at all the
    fraction is 0.0.
    """
    entries = data["pass-exec"]
    data_len = len(entries)
    if data_len == 0:
        # Nothing to compare; avoid dividing by zero below.
        return (0.0 >= threshold, 0.0)

    times = [entries[i][0] for i in range(data_len)]
    pass_names = [entries[i][1] for i in range(data_len)]

    outliers = parallelbar.progress_starmap(
        check_outliers,
        zip(times, pass_names, repeat(ref_data), repeat(time_col)),
        total=data_len,
    )

    # Normalise to plain bools so numpy booleans are counted as well.
    fraction = Counter(bool(flag) for flag in outliers)[True] / len(outliers)
    print(fraction)
    return (fraction >= threshold, fraction)


def preserve_outliers_dir(dir_path: str, opt: str, outlier_threshold=1):
    """Same as get_outliers but works on a directory of bitcode module files.

    Inputs:
    - dir_path: absolute path to directory containing bitcode module files
    - opt: optimization option
    - outlier_threshold: threshold of fraction of function being outlier for
      all pass to classify as outlier

    Output: list with one get_outliers result per file in `dir_path`.
    """
    data = []
    for name in listdir(dir_path):
        path = join(dir_path, name)
        data.append(get_outliers(path, opt, outlier_threshold=outlier_threshold))
        print(path)

    return data


@ray.remote
def get_outliers_pass_specific(
    file_path: str,
    opt: str,
    pass_name: str,
    quantile: float = 0.95,
    ref_data: pd.DataFrame = None,
    dst="",
    abs_threshold=0.005,
):
    """Write a reduced copy of a module that keeps it an outlier for one pass.

    Given path to bitcode module file and a pass name, write new file with
    outlier functions extracted.

    Inputs:
    - file_path: absolute path to bitcode module file.
    - opt: optimization pipeline in {O1,O2,O3,Oz}.
    - pass_name: pass name in which outlier threshold used to extract
      outlier functions.
    - quantile (optional): percentile used as outlier threshold with range
      [0,1].
    - ref_data (optional): pandas.DataFrame object used as reference for
      outlier extraction. Read from the default CSV when None.
    - dst (optional): If empty, write file to a directory '_tmp'. Else, write
      file to directory dst.
    - abs_threshold (optional): In seconds, absolute wall time threshold to
      consider for outlier extraction. Used to minimize noise.

    Output:
    - If bitcode module file is an outlier, return a tuple
      (number of functions removed, total number of functions in module).
    - If the initial pass analysis of the file is unavailable, return None.
    - If bitcode module file is an outlier but unexpected error from
      retrieving outlier data, returns (-2, -2), or (-1, -1) on KeyError.
    - If pass_name is not in opt time-pass analysis of the file, return
      (-3,-3).
    - If bitcode module file is not outlier, return (0, number of functions).
    """
    tmp_path = _tmp_path_for(file_path, "_tmp" if dst == "" else dst)
    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)

    if ref_data is None:
        ref_data = _load_ref_data(opt)

    n_removed = 0
    src_path = file_path

    analysis = parse_pass_analysis_exec(
        src_path, True, True, opt, bitcode_module=None, dict_format=True
    )
    if analysis is None:
        return None
    analysis = analysis["pass-exec"]

    groups = ref_data.groupby("pass").quantile(
        q=quantile
    )  # TODO: replace "pass" with parameter

    if pass_name not in analysis:
        return (-3, -3)  # TODO: come up with better error signals
    abs_time, rel_time = analysis[pass_name]

    n_functions = get_n_functions(file_path=file_path)

    is_candidate = (
        check_outliers(rel_time, pass_name, groups, time_col="rel_time")
        and abs_time >= abs_threshold
    )
    if not is_candidate:
        return (0, n_functions)

    try:
        for i in range(n_functions):
            with open(src_path, mode="rb") as f:
                bc = f.read()

            # Try the removal in memory first and re-measure the pass.
            tmp = remove_fn_bc(i, bc)
            analysis = parse_pass_analysis_exec(
                src_path, True, True, opt, tmp, dict_format=True
            )
            if analysis is None:
                return (-2, -2)  # deprecated. should never happen.
            analysis = analysis["pass-exec"]

            if pass_name in analysis and check_outliers(
                analysis[pass_name][1], pass_name, groups, time_col="rel_time"
            ):
                # Still an outlier without this function: persist the removal
                # and continue from the reduced module.
                remove_fn(i, src_path, tmp_path)
                n_removed += 1
                src_path = tmp_path
        return (n_removed, n_functions)
    except KeyError:
        # deprecated since the membership check above covers this condition
        return (-1, -1)


@ray.remote
def preserve_outliers_dir_pass_specific(
    dir_path: str,
    opt: str,
    pass_name: str,
    quantile: float = 0.95,
    fp_list: Optional[List] = None,
    ref_data: pd.DataFrame = None,
    dst: str = "",
):
    """Run get_outliers_pass_specific over many modules for one pass.

    Given a directory of all bitcode module files and a pass name,
    output all outlier modules with outlier functions being preserved.

    Inputs:
    - dir_path: path to directory containing bitcode module files.
    - opt: optimization pipeline option in {O1,O2,O3,Oz}.
    - pass_name: name of pass.
    - quantile (optional): threshold to consider which time being an outlier.
    - fp_list (optional): List of bitcode module files to process outlier
      preservation. Default: get all files in dir_path directory.
    - ref_data (optional): pandas.DataFrame object as a reference for
      getting outlier data from pass_name.
    - dst (optional): destination path to write module file with
      outlier functions preserved to.

    Output: dictionary type with key being pass_name and value being a list
    of per-module results of get_outliers_pass_specific, in completion order.
    Each tuple represents number of functions removed (1st element) and total
    number of functions (2nd element) of a module.
    """
    if fp_list is not None and len(fp_list) > 0:
        fp = fp_list
    else:
        fp = [join(dir_path, name) for name in listdir(dir_path)]

    if ref_data is not None:
        df = ref_data
    else:
        df = _load_ref_data(opt)  # TODO: change this

    data = []
    s = time.time()
    unfinished = [
        get_outliers_pass_specific.remote(
            path, opt, pass_name, quantile, ref_data=df, dst=dst
        )
        for path in fp
    ]

    # Collect results one at a time as the remote tasks complete.
    while unfinished:
        finished, unfinished = ray.wait(unfinished, num_returns=1)
        data.extend(ray.get(finished))
    print(f"{pass_name}: time elapsed={time.time() - s}", flush=True)

    return {pass_name: data}


def _module_was_modified(result) -> bool:
    """Return True if a get_outliers_pass_specific result removed functions.

    None (analysis unavailable), the negative error tuples and
    (0, n_functions) all mean that no reduced outlier module was produced.
    """
    return result is not None and result[0] > 0


def check_correctness(
    dir_path: str,
    ref_data: pd.DataFrame,
    opt: str,
    quantile: float = 0.95,
    pass_col="pass",
    pass_list: Optional[List] = None,
):
    """Compute, per pass, the fraction of modules reduced as outliers.

    Check if the number of outlier modules modified fit with the number of
    outlier modules given by `quantile`.

    Inputs:
    - dir_path: absolute path to source directory.
    - ref_data: pandas.DataFrame
    - opt: optimization option.
    - quantile (optional): threshold to be considered as outlier for relative
      time.
    - pass_col (optional): label of categorical column storing all pass names
      in `ref_data`.
    - pass_list (optional): passes to check. If empty, check all passes in
      `ref_data`.

    Output: dictionary mapping each pass name to the fraction of files in
    `dir_path` for which at least one function was removed. The fraction is
    0.0 when the directory is empty.
    """
    if pass_list is not None and len(pass_list) != 0:
        passes = pass_list
    else:
        passes = ref_data[pass_col].unique()  # numpy array

    files = listdir(dir_path)
    n_files = len(files)
    fp = [join(dir_path, name) for name in files]

    # Strip template punctuation so the pass name is usable as a directory.
    separators = re.compile("<|,|>|llvm::|, ")

    # NOTE(review): `ref_data` is only used for the pass list here; the remote
    # tasks load the default CSV themselves -- confirm this is intended.
    unfinished = [
        preserve_outliers_dir_pass_specific.remote(
            dir_path,
            opt,
            pass_name,
            quantile,
            fp_list=fp,
            dst=f"tmp_{''.join(separators.split(pass_name))}",
        )
        for pass_name in passes
    ]

    data = {}
    while unfinished:
        finished, unfinished = ray.wait(unfinished, num_returns=1)
        result = ray.get(finished[0])
        pass_name = next(iter(result.keys()))
        # Non-outlier modules come back as (0, n_functions) or None, so count
        # by "functions removed > 0" rather than matching literal tuples.
        n_modified = sum(1 for r in result[pass_name] if _module_was_modified(r))
        data[pass_name] = n_modified / n_files if n_files else 0.0

    return data


### RELATIVE TIME DATA
@ray.remote
def check_outliers2(
    file_path: str,
    pass_name: str,
    opt: str,
    rel_threshold_value: float,
    abs_threshold_value: float,
):
    """Check a module against both relative and absolute time thresholds.

    Same as check_outliers, but return if both relative and absolute wall
    time of bitcode module file meet the outlier threshold.

    Inputs:
    - file_path: path to bitcode module file.
    - pass_name: pass name.
    - opt: optimization pipeline.
    - rel_threshold_value: relative wall time threshold.
    - abs_threshold_value: absolute wall time threshold.

    Output: True if `pass_name` of bitcode module file in `opt` pipeline meet
    both relative and absolute wall time threshold. False otherwise. None if
    either analysis is unavailable or does not contain `pass_name`.
    """
    analysis = parse_pass_analysis_exec(
        file_path, True, True, opt, bitcode_module=None, dict_format=True
    )
    analysis_abs = parse_pass_analysis_exec(
        file_path,
        relative=False,
        bitcode_file=True,
        opt=opt,
        bitcode_module=None,
        dict_format=True,
    )
    if analysis is None or analysis_abs is None:
        return None
    analysis = analysis["pass-exec"]
    analysis_abs = analysis_abs["pass-exec"]

    if pass_name not in analysis or pass_name not in analysis_abs:
        return None

    # NOTE(review): get_outliers_pass_specific unpacks the same relative
    # analysis entry as (abs_time, rel_time); here it is compared directly to
    # a float -- confirm the value shape returned by parse_pass_analysis_exec.
    time_rel = analysis[pass_name]
    time_abs = analysis_abs[pass_name]

    return time_rel >= rel_threshold_value and time_abs >= abs_threshold_value


def get_outliers_pass_specific2(
    file_dir: str,
    pass_name: str,
    opt: str,
    rel_threshold_value: float,
    abs_threshold_value: float,
    fp: Optional[List] = None,
):
    """Return list of bitcode module files that are outliers for a pass.

    Inputs:
    - file_dir: directory to source module files.
    - pass_name: pass name.
    - opt: optimization pipeline.
    - rel_threshold_value: relative wall time threshold.
    - abs_threshold_value: absolute wall time threshold.
    - fp (optional): list of bitcode module files. If empty, function
      processes all files in `file_dir`.

    Output: the entries of `fp` (or of the directory listing) for which
    check_outliers2 reported True, in input order.
    """
    files = fp if fp else listdir(file_dir)

    flags = ray.get(
        [
            check_outliers2.remote(
                join(file_dir, file),
                pass_name,
                opt,
                rel_threshold_value,
                abs_threshold_value,
            )
            for file in files
        ]
    )

    # `is True` would silently drop numpy booleans; None means "no data".
    return [
        file for file, flag in zip(files, flags) if flag is not None and bool(flag)
    ]