From 9904a5d07a6e3fb3392e9d4de9a0786d27bd6f1c Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Sat, 10 Aug 2024 06:07:57 -0400 Subject: [PATCH 01/11] Adding python files for graphing and processing csv data. --- .../python_processing/fitting_and_plotting.py | 35 +++++++++++++++++++ .../python_processing/read_column.py | 16 +++++++++ 2 files changed, 51 insertions(+) create mode 100644 llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py create mode 100644 llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py new file mode 100644 index 0000000..6284ccc --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py @@ -0,0 +1,35 @@ +# -*- coding: ascii -*- +import numpy as np +import matplotlib.pyplot as plt +from read_column import open_and_load +# Usage: +# from fitting_and_plotting import plot_functionality + + +def plot_functionality(lang: str, show: bool = False) -> None: + ''' + Function to graph csv data for text segment size and instructions counts. + ''' + textseg_data, inst_data = open_and_load(lang) + c, b, a = np.polyfit(textseg_data, inst_data, 2) + + x_axis = range(min(textseg_data), max(textseg_data), 10) + z = np.polyval([c, b, a], x_axis) + + plt.scatter(textseg_data, inst_data) + plt.xscale("log") + plt.yscale("log") + plt.gca().set_ylim([10**8, 10**13]) + plt.xlabel("text_segment_size (bytes)") + plt.ylabel("compiler_cpu_instructions_count") + if (lang == "cpp"): + plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")") + else: + plt.title("Clang Compiler Instructions vs. 
Text Segment Size ("+lang+")") + plt.plot(x_axis, z, 'r') + equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$" + plt.legend([f"fit: {equation}", "original"]) + if (show): + plt.show() + else: + plt.savefig(fname=lang+"_instvtext.pdf", format="pdf") diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py new file mode 100644 index 0000000..e3e1910 --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -0,0 +1,16 @@ +# -*- coding: ascii -*- +import csv + + +def open_and_load(lang: str, STORAGE: str = '/tmp') -> [int]: + ''' + Function to read csv files containing text segment size and instruction counts data. + ''' + textseg_data: [int] = [] + inst_data: [int] = [] + with open(STORAGE+lang+"_combined_results.csv", mode='r', newline='') as file: + for x in csv.DictReader(file): + textseg_data.append(int(x[" text_segment_size"])) + + inst_data.append(int(x[" instructions"])) + return textseg_data, inst_data From 94b86ad61d538eae4c68d34dfd10e16a6149445a Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Fri, 23 Aug 2024 15:42:46 -0400 Subject: [PATCH 02/11] Adding changes to utlities to plot and process csv files. Adding initial jupyter notebook file to display results. 
--- .../python_processing/fitting_and_plotting.py | 74 +++++++-- .../plot_and_display_utlities.ipynb | 154 ++++++++++++++++++ .../python_processing/read_column.py | 31 ++-- 3 files changed, 233 insertions(+), 26 deletions(-) create mode 100644 llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py index 6284ccc..9fdc72d 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py @@ -1,16 +1,22 @@ -# -*- coding: ascii -*- -import numpy as np -import matplotlib.pyplot as plt -from read_column import open_and_load -# Usage: -# from fitting_and_plotting import plot_functionality +""" +Utilies for plotting instruction counts & text segment size scatter plots, as well as instruction counts histograms. + +plot_instruction_counts_v_textseg + Returns: None + Example Usage: plot_instruction_counts_v_textseg("c", "/tmp", show=True) + plot_instruction_counts_histograms + Returns: None + Example Usage: plot_instruction_counts_histograms("c", "/tmp", show=True) +""" +import matplotlib.pyplot as plt +from read_column import pandas_df_with_outlier_scores -def plot_functionality(lang: str, show: bool = False) -> None: - ''' - Function to graph csv data for text segment size and instructions counts. 
- ''' - textseg_data, inst_data = open_and_load(lang) +def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = False) -> None: + import numpy as np + df = pandas_df_with_outlier_scores(lang, storage) + textseg_data = df["text_segment"] + inst_data = df["instruction"] c, b, a = np.polyfit(textseg_data, inst_data, 2) x_axis = range(min(textseg_data), max(textseg_data), 10) @@ -20,8 +26,9 @@ def plot_functionality(lang: str, show: bool = False) -> None: plt.xscale("log") plt.yscale("log") plt.gca().set_ylim([10**8, 10**13]) - plt.xlabel("text_segment_size (bytes)") - plt.ylabel("compiler_cpu_instructions_count") + plt.gca().set_xlim([10**(-1), 10**9]) + plt.xlabel("Text Segment Size (bytes)") + plt.ylabel("Compiler CPU Instructions Count") if (lang == "cpp"): plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")") else: @@ -30,6 +37,47 @@ def plot_functionality(lang: str, show: bool = False) -> None: equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$" plt.legend([f"fit: {equation}", "original"]) if (show): + print(len(textseg_data)) plt.show() else: plt.savefig(fname=lang+"_instvtext.pdf", format="pdf") + plt.close() + + +def plot_instruction_counts_histograms(lang: str, storage: str, show: bool = False) -> None: + df = pandas_df_with_outlier_scores(lang, storage) + inst_data = df["instruction"] + plt.hist(inst_data, bins='auto', alpha=1, color='b') + plt.title("Histogram of Compiler Instructions ("+lang+")") + plt.xscale("log") + plt.yscale("log") + plt.gca().set_ylim([10**(-1), 10**5]) + plt.gca().set_xlim([10**8, 10**13]) + plt.xlabel('Compiler CPU Instructions Count') + plt.ylabel('No. 
of IR Files') + + plt.text(max(inst_data), 1.1, format(max(inst_data), '.2e'), ha='center') + + if (show): + print(len(inst_data)) + plt.show() + else: + plt.savefig(fname=lang+"_hist.pdf", format="pdf") + plt.close() + +def print_outlier_rows(lang: str, storage: str, show) -> None: + df = pandas_df_with_outlier_scores(lang, storage) + outl = df.nlargest(10, "outlier_scores") +# print(outl.to_string(index=False)) + outl.style.hide_index() + print(outl) +# cols = outl.loc[:, "BasicBlockCount":"CallWithPointerArgumentCount"] + +# for col in cols: +# non_zero_values = outl[col][outl[col] != 0] +# if not non_zero_values.empty: +# print(f"Non-zero values in column {col}:") +# print(non_zero_values) + + + diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb new file mode 100644 index 0000000..e2dee94 --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " file text_segment instruction BasicBlockCount \\\n", + "4477 file4504 4102289 639943870091 146673 \n", + "364 file369 897703 517186177449 30629 \n", + "13629 file13687 251257 507769720840 9335 \n", + "3007 file3026 782770 458695637213 27226 \n", + "16940 file17015 31982373 438351265829 380796 \n", + "20175 file20265 1977670 317035646729 72528 \n", + "29662 file29786 1868172 300746094074 69423 \n", + "10381 file10427 1868172 300720888123 69423 \n", + "1667 file1680 1534476 214007426790 55480 \n", + "20841 file20931 1519168 211214208675 46363 \n", + "\n", + " BlocksReachedFromConditionalInstruction Uses \\\n", + "4477 122900 161756 \n", + "364 24080 15219 \n", + "13629 7167 4459 \n", + "3007 22377 15349 \n", + "16940 
276328 208458 \n", + "20175 60702 27729 \n", + "29662 57219 27998 \n", + "10381 57219 27998 \n", + "1667 43498 20981 \n", + "20841 37508 18374 \n", + "\n", + " DirectCallsToDefinedFunctions LoadInstCount StoreInstCount \\\n", + "4477 161745 1204668 842106 \n", + "364 14710 81272 56693 \n", + "13629 4309 22582 15908 \n", + "3007 15116 70912 49911 \n", + "16940 208412 1644587 1427034 \n", + "20175 26870 159768 95017 \n", + "29662 27139 150196 87140 \n", + "10381 27139 150196 87140 \n", + "1667 20175 127244 77850 \n", + "20841 17383 110236 67690 \n", + "\n", + " MaxLoopDepth ... CallReturnsPointerCount \\\n", + "4477 953 ... 2 \n", + "364 200 ... 5226 \n", + "13629 56 ... 1572 \n", + "3007 90 ... 4424 \n", + "16940 5208 ... 9739 \n", + "20175 286 ... 8509 \n", + "29662 282 ... 8292 \n", + "10381 282 ... 8292 \n", + "1667 392 ... 7683 \n", + "20841 280 ... 6749 \n", + "\n", + " CallReturnsVectorIntCount CallReturnsVectorFloatCount \\\n", + "4477 0 0 \n", + "364 0 0 \n", + "13629 0 0 \n", + "3007 0 0 \n", + "16940 0 0 \n", + "20175 0 0 \n", + "29662 0 0 \n", + "10381 0 0 \n", + "1667 0 0 \n", + "20841 0 16 \n", + "\n", + " CallReturnsVectorPointerCount CallWithManyArgumentsCount \\\n", + "4477 0 4948 \n", + "364 0 1088 \n", + "13629 0 248 \n", + "3007 0 181 \n", + "16940 0 20707 \n", + "20175 0 2365 \n", + "29662 0 2364 \n", + "10381 0 2364 \n", + "1667 0 2375 \n", + "20841 0 2141 \n", + "\n", + " CallWithPointerArgumentCount percentage pass_name \\\n", + "4477 266915 96.5 VerifierPass \n", + "364 32818 93.3 VerifierPass \n", + "13629 9572 93.4 VerifierPass \n", + "3007 27721 94.7 VerifierPass \n", + "16940 568566 90.3 VerifierPass \n", + "20175 55524 93.9 VerifierPass \n", + "29662 54821 93.4 VerifierPass \n", + "10381 54821 91.7 VerifierPass \n", + "1667 54131 94.8 VerifierPass \n", + "20841 47185 94.8 VerifierPass \n", + "\n", + " normalized_instruction outlier_scores \n", + "4477 100.000000 196.500000 \n", + "364 80.800201 174.100201 \n", + "13629 79.327429 
172.727429 \n", + "3007 71.652044 166.352044 \n", + "16940 68.470102 158.770102 \n", + "20175 49.495849 143.395849 \n", + "29662 46.948097 140.348097 \n", + "10381 46.944155 138.644155 \n", + "1667 33.381819 128.181819 \n", + "20841 32.944948 127.744948 \n", + "\n", + "[10 rows x 51 columns]\n" + ] + } + ], + "source": [ + "%matplotlib inline\n", + "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms, print_outlier_rows\n", + "\n", + "#lot_instruction_counts_v_textseg(\"c\", \"/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/\", show=True)\n", + "#lot_instruction_counts_histograms(\"c\", \"/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/\", show=True)\n", + "#rint_outlier(\"c\")\n", + "\n", + "#for i in [\"c\", \"cpp\", \"rust\", \"swift\"]:\n", + "# plot_histogram(i, show=True)\n", + "# plot_histogram(i,show=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index e3e1910..fd7107d 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -1,16 +1,21 @@ -# -*- coding: ascii -*- -import csv +"""Utilities for reading and processing csv data with features for outlier analysis. 
+pandas_df_with_outlier_scores + Returns: pandas.core.frame.DataFrame + Example usage: pandas_df_with_outlier_scores('c', '/tmp', file_name='_other_suffix.csv', write_to_csv=True) +""" -def open_and_load(lang: str, STORAGE: str = '/tmp') -> [int]: - ''' - Function to read csv files containing text segment size and instruction counts data. - ''' - textseg_data: [int] = [] - inst_data: [int] = [] - with open(STORAGE+lang+"_combined_results.csv", mode='r', newline='') as file: - for x in csv.DictReader(file): - textseg_data.append(int(x[" text_segment_size"])) - inst_data.append(int(x[" instructions"])) - return textseg_data, inst_data +def pandas_df_with_outlier_scores(lang: str, storage: str, file_name_suffix: str = '_combined.csv', write_to_csv: bool = False): + import pandas as pd + + df = pd.read_csv(storage+lang+file_name_suffix, skipinitialspace=True) + data = df["instruction"] + min_value = data.min() + max_value = data.max() + df['normalized_instruction'] = ( + data - min_value) / (max_value - min_value) * 100 + df['outlier_scores'] = df["percentage"] + df['normalized_instruction'] + if (write_to_csv): + df.to_csv('normalized_file.csv', index=False) + return df From b44c4f92cefe3cae42c1f34d71f4ab3d55a93ccd Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Sat, 24 Aug 2024 19:21:33 -0400 Subject: [PATCH 03/11] Updates to plotting, read csv file, and jupyter notebook. 
--- .../python_processing/fitting_and_plotting.py | 27 +--- .../plot_and_display_utlities.ipynb | 153 +++++------------- .../python_processing/read_column.py | 26 +-- 3 files changed, 59 insertions(+), 147 deletions(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py index 9fdc72d..5dd2185 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py @@ -1,5 +1,4 @@ -""" -Utilies for plotting instruction counts & text segment size scatter plots, as well as instruction counts histograms. +"""Utilies for plotting instruction counts & text segment size scatter plots, as well as instruction counts histograms. plot_instruction_counts_v_textseg Returns: None @@ -10,11 +9,12 @@ Example Usage: plot_instruction_counts_histograms("c", "/tmp", show=True) """ import matplotlib.pyplot as plt -from read_column import pandas_df_with_outlier_scores +from read_column import csv_to_pandas_df + def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = False) -> None: import numpy as np - df = pandas_df_with_outlier_scores(lang, storage) + df = csv_to_pandas_df(lang, storage) textseg_data = df["text_segment"] inst_data = df["instruction"] c, b, a = np.polyfit(textseg_data, inst_data, 2) @@ -45,7 +45,7 @@ def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = Fals def plot_instruction_counts_histograms(lang: str, storage: str, show: bool = False) -> None: - df = pandas_df_with_outlier_scores(lang, storage) + df = csv_to_pandas_df(lang, storage) inst_data = df["instruction"] plt.hist(inst_data, bins='auto', alpha=1, color='b') plt.title("Histogram of Compiler Instructions ("+lang+")") @@ -64,20 +64,3 @@ def plot_instruction_counts_histograms(lang: str, storage: 
str, show: bool = Fal else: plt.savefig(fname=lang+"_hist.pdf", format="pdf") plt.close() - -def print_outlier_rows(lang: str, storage: str, show) -> None: - df = pandas_df_with_outlier_scores(lang, storage) - outl = df.nlargest(10, "outlier_scores") -# print(outl.to_string(index=False)) - outl.style.hide_index() - print(outl) -# cols = outl.loc[:, "BasicBlockCount":"CallWithPointerArgumentCount"] - -# for col in cols: -# non_zero_values = outl[col][outl[col] != 0] -# if not non_zero_values.empty: -# print(f"Non-zero values in column {col}:") -# print(non_zero_values) - - - diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb index e2dee94..aff403d 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb @@ -2,124 +2,49 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " file text_segment instruction BasicBlockCount \\\n", - "4477 file4504 4102289 639943870091 146673 \n", - "364 file369 897703 517186177449 30629 \n", - "13629 file13687 251257 507769720840 9335 \n", - "3007 file3026 782770 458695637213 27226 \n", - "16940 file17015 31982373 438351265829 380796 \n", - "20175 file20265 1977670 317035646729 72528 \n", - "29662 file29786 1868172 300746094074 69423 \n", - "10381 file10427 1868172 300720888123 69423 \n", - "1667 file1680 1534476 214007426790 55480 \n", - "20841 file20931 1519168 211214208675 46363 \n", - "\n", - " BlocksReachedFromConditionalInstruction Uses \\\n", - "4477 122900 161756 \n", - "364 24080 15219 \n", - "13629 7167 4459 \n", - "3007 22377 15349 \n", - "16940 276328 208458 \n", - "20175 60702 
27729 \n", - "29662 57219 27998 \n", - "10381 57219 27998 \n", - "1667 43498 20981 \n", - "20841 37508 18374 \n", - "\n", - " DirectCallsToDefinedFunctions LoadInstCount StoreInstCount \\\n", - "4477 161745 1204668 842106 \n", - "364 14710 81272 56693 \n", - "13629 4309 22582 15908 \n", - "3007 15116 70912 49911 \n", - "16940 208412 1644587 1427034 \n", - "20175 26870 159768 95017 \n", - "29662 27139 150196 87140 \n", - "10381 27139 150196 87140 \n", - "1667 20175 127244 77850 \n", - "20841 17383 110236 67690 \n", - "\n", - " MaxLoopDepth ... CallReturnsPointerCount \\\n", - "4477 953 ... 2 \n", - "364 200 ... 5226 \n", - "13629 56 ... 1572 \n", - "3007 90 ... 4424 \n", - "16940 5208 ... 9739 \n", - "20175 286 ... 8509 \n", - "29662 282 ... 8292 \n", - "10381 282 ... 8292 \n", - "1667 392 ... 7683 \n", - "20841 280 ... 6749 \n", - "\n", - " CallReturnsVectorIntCount CallReturnsVectorFloatCount \\\n", - "4477 0 0 \n", - "364 0 0 \n", - "13629 0 0 \n", - "3007 0 0 \n", - "16940 0 0 \n", - "20175 0 0 \n", - "29662 0 0 \n", - "10381 0 0 \n", - "1667 0 0 \n", - "20841 0 16 \n", - "\n", - " CallReturnsVectorPointerCount CallWithManyArgumentsCount \\\n", - "4477 0 4948 \n", - "364 0 1088 \n", - "13629 0 248 \n", - "3007 0 181 \n", - "16940 0 20707 \n", - "20175 0 2365 \n", - "29662 0 2364 \n", - "10381 0 2364 \n", - "1667 0 2375 \n", - "20841 0 2141 \n", - "\n", - " CallWithPointerArgumentCount percentage pass_name \\\n", - "4477 266915 96.5 VerifierPass \n", - "364 32818 93.3 VerifierPass \n", - "13629 9572 93.4 VerifierPass \n", - "3007 27721 94.7 VerifierPass \n", - "16940 568566 90.3 VerifierPass \n", - "20175 55524 93.9 VerifierPass \n", - "29662 54821 93.4 VerifierPass \n", - "10381 54821 91.7 VerifierPass \n", - "1667 54131 94.8 VerifierPass \n", - "20841 47185 94.8 VerifierPass \n", - "\n", - " normalized_instruction outlier_scores \n", - "4477 100.000000 196.500000 \n", - "364 80.800201 174.100201 \n", - "13629 79.327429 172.727429 \n", - "3007 71.652044 
166.352044 \n", - "16940 68.470102 158.770102 \n", - "20175 49.495849 143.395849 \n", - "29662 46.948097 140.348097 \n", - "10381 46.944155 138.644155 \n", - "1667 33.381819 128.181819 \n", - "20841 32.944948 127.744948 \n", - "\n", - "[10 rows x 51 columns]\n" - ] - } - ], + "outputs": [], "source": [ "%matplotlib inline\n", - "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms, print_outlier_rows\n", + "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms\n", + "from read_column import outlier_rows\n", + "import csv\n", + "from IPython.display import HTML, display\n", + "import pandas \n", "\n", - "#lot_instruction_counts_v_textseg(\"c\", \"/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/\", show=True)\n", - "#lot_instruction_counts_histograms(\"c\", \"/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/\", show=True)\n", - "#rint_outlier(\"c\")\n", - "\n", - "#for i in [\"c\", \"cpp\", \"rust\", \"swift\"]:\n", - "# plot_histogram(i, show=True)\n", - "# plot_histogram(i,show=False)\n" + "langs = pandas.read_csv('../dataset_download/indices.csv')[\"language\"]\n", + "CSV_STORAGE=\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in langs:\n", + " plot_instruction_counts_histograms(i, CSV_STORAGE, show=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in langs:\n", + " plot_instruction_counts_v_textseg(i, CSV_STORAGE, show=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in langs:\n", + " display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))\n" ] }, { diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py 
b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index fd7107d..8c08096 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -1,21 +1,25 @@ """Utilities for reading and processing csv data with features for outlier analysis. -pandas_df_with_outlier_scores +csv_to_pandas_df Returns: pandas.core.frame.DataFrame - Example usage: pandas_df_with_outlier_scores('c', '/tmp', file_name='_other_suffix.csv', write_to_csv=True) + Example usage: csv_to_pandas('c', '/tmp', file_name='_other_suffix.csv', write_to_csv=True) +outlier_rows + Returns: pandas.core.frame.DataFrame + Example usage: outlier_rows('c', '/tmp') """ -def pandas_df_with_outlier_scores(lang: str, storage: str, file_name_suffix: str = '_combined.csv', write_to_csv: bool = False): +def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined.csv', write_to_csv: bool = False): import pandas as pd - df = pd.read_csv(storage+lang+file_name_suffix, skipinitialspace=True) - data = df["instruction"] - min_value = data.min() - max_value = data.max() - df['normalized_instruction'] = ( - data - min_value) / (max_value - min_value) * 100 - df['outlier_scores'] = df["percentage"] + df['normalized_instruction'] + df = pd.read_csv(storage+lang+'_'+file_name_suffix, skipinitialspace=True) if (write_to_csv): - df.to_csv('normalized_file.csv', index=False) + df.to_csv(lang+'_normalized_'+file_name_suffix, index=False) return df + + +def outlier_rows(lang: str, storage: str) -> None: + df = csv_to_pandas_df(lang, storage) + outl = df.nlargest(df.shape[0]//2, "percentage") + # print(outl[outl.instruction > outl["instruction"].quantile(q=.75, interpolation='lower')]) + return outl.nlargest(10, "instruction") From 4f55b57ebd30e4daf9fbe457cfd5fdcad34c851d Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Sat, 24 Aug 2024 19:31:18 -0400 Subject: 
[PATCH 04/11] Removed unnecessary code. --- .../compile_time_analysis_tools/python_processing/read_column.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index 8c08096..412fe78 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -21,5 +21,4 @@ def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined. def outlier_rows(lang: str, storage: str) -> None: df = csv_to_pandas_df(lang, storage) outl = df.nlargest(df.shape[0]//2, "percentage") - # print(outl[outl.instruction > outl["instruction"].quantile(q=.75, interpolation='lower')]) return outl.nlargest(10, "instruction") From 6d1f68f45a08ce744adcdfe27079616ccd75d3c2 Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Sat, 24 Aug 2024 19:34:34 -0400 Subject: [PATCH 05/11] extra removals --- .../python_processing/read_column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index 412fe78..8b656a2 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -18,7 +18,7 @@ def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined. 
return df -def outlier_rows(lang: str, storage: str) -> None: +def outlier_rows(lang: str, storage: str): df = csv_to_pandas_df(lang, storage) outl = df.nlargest(df.shape[0]//2, "percentage") return outl.nlargest(10, "instruction") From a585725df731ee6ee838f8b6e8c44bc83a39df81 Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Fri, 30 Aug 2024 13:20:32 -0400 Subject: [PATCH 06/11] Fixed code formatting, modified docstrings, and improved outlier analysis process in read_column.py. --- .../python_processing/fitting_and_plotting.py | 100 ++++++++++-------- .../python_processing/read_column.py | 31 ++++-- 2 files changed, 73 insertions(+), 58 deletions(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py index 5dd2185..14076e3 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py @@ -1,4 +1,4 @@ -"""Utilies for plotting instruction counts & text segment size scatter plots, as well as instruction counts histograms. +"""plot_instruction_counts_v_textseg is used to make compiler instruction counts & text segment size scatter plots, and plot_instruction_counts_histograms is used to make compiler instruction counts histograms. 
plot_instruction_counts_v_textseg Returns: None @@ -12,55 +12,61 @@ from read_column import csv_to_pandas_df -def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = False) -> None: - import numpy as np - df = csv_to_pandas_df(lang, storage) - textseg_data = df["text_segment"] - inst_data = df["instruction"] - c, b, a = np.polyfit(textseg_data, inst_data, 2) +def plot_instruction_counts_v_textseg(lang: str, + storage: str, + show: bool = False) -> None: + import numpy as np + df = csv_to_pandas_df(lang, storage) + textseg_data = df["text_segment"] + inst_data = df["instruction"] + c, b, a = np.polyfit(textseg_data, inst_data, 2) - x_axis = range(min(textseg_data), max(textseg_data), 10) - z = np.polyval([c, b, a], x_axis) + x_axis = range(min(textseg_data), max(textseg_data), 10) + z = np.polyval([c, b, a], x_axis) - plt.scatter(textseg_data, inst_data) - plt.xscale("log") - plt.yscale("log") - plt.gca().set_ylim([10**8, 10**13]) - plt.gca().set_xlim([10**(-1), 10**9]) - plt.xlabel("Text Segment Size (bytes)") - plt.ylabel("Compiler CPU Instructions Count") - if (lang == "cpp"): - plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")") - else: - plt.title("Clang Compiler Instructions vs. Text Segment Size ("+lang+")") - plt.plot(x_axis, z, 'r') - equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$" - plt.legend([f"fit: {equation}", "original"]) - if (show): - print(len(textseg_data)) - plt.show() - else: - plt.savefig(fname=lang+"_instvtext.pdf", format="pdf") - plt.close() + plt.scatter(textseg_data, inst_data) + plt.xscale("log") + plt.yscale("log") + plt.gca().set_ylim([10**8, 10**13]) + plt.gca().set_xlim([10**(-1), 10**9]) + plt.xlabel("Text Segment Size (bytes)") + plt.ylabel("Compiler CPU Instructions Count") + if (lang == "cpp"): + plt.title("Clang++ Compiler Instructions vs. Text Segment Size (" + lang + + ")") + else: + plt.title("Clang Compiler Instructions vs. 
Text Segment Size (" + lang + + ")") + plt.plot(x_axis, z, 'r') + equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$" + plt.legend([f"fit: {equation}", "original"]) + if (show): + print(len(textseg_data)) + plt.show() + else: + plt.savefig(fname=lang + "_instvtext.pdf", format="pdf") + plt.close() -def plot_instruction_counts_histograms(lang: str, storage: str, show: bool = False) -> None: - df = csv_to_pandas_df(lang, storage) - inst_data = df["instruction"] - plt.hist(inst_data, bins='auto', alpha=1, color='b') - plt.title("Histogram of Compiler Instructions ("+lang+")") - plt.xscale("log") - plt.yscale("log") - plt.gca().set_ylim([10**(-1), 10**5]) - plt.gca().set_xlim([10**8, 10**13]) - plt.xlabel('Compiler CPU Instructions Count') - plt.ylabel('No. of IR Files') +def plot_instruction_counts_histograms(lang: str, + storage: str, + show: bool = False) -> None: + df = csv_to_pandas_df(lang, storage) + inst_data = df["instruction"] + plt.hist(inst_data, bins='auto', alpha=1, color='b') + plt.title("Histogram of Compiler Instructions (" + lang + ")") + plt.xscale("log") + plt.yscale("log") + plt.gca().set_ylim([10**(-1), 10**5]) + plt.gca().set_xlim([10**8, 10**13]) + plt.xlabel('Compiler CPU Instructions Count') + plt.ylabel('No. 
of IR Files') - plt.text(max(inst_data), 1.1, format(max(inst_data), '.2e'), ha='center') + plt.text(max(inst_data), 1.1, format(max(inst_data), '.2e'), ha='center') - if (show): - print(len(inst_data)) - plt.show() - else: - plt.savefig(fname=lang+"_hist.pdf", format="pdf") - plt.close() + if (show): + print(len(inst_data)) + plt.show() + else: + plt.savefig(fname=lang + "_hist.pdf", format="pdf") + plt.close() diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index 8b656a2..06cf587 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -1,4 +1,4 @@ -"""Utilities for reading and processing csv data with features for outlier analysis. +"""csv_to_pandas_df is used to read a IR features CSV file and create a Pandas dataframe for it, and outlier_rows is used to filter the Pandas dataframe to the IR files which can be classified as outliers. 
csv_to_pandas_df Returns: pandas.core.frame.DataFrame @@ -9,16 +9,25 @@ """ -def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined.csv', write_to_csv: bool = False): - import pandas as pd +def csv_to_pandas_df(lang: str, + storage: str, + file_name_suffix: str = 'combined.csv'): + import pandas as pd - df = pd.read_csv(storage+lang+'_'+file_name_suffix, skipinitialspace=True) - if (write_to_csv): - df.to_csv(lang+'_normalized_'+file_name_suffix, index=False) - return df + df = pd.read_csv( + storage + lang + '_' + file_name_suffix, skipinitialspace=True) + return df -def outlier_rows(lang: str, storage: str): - df = csv_to_pandas_df(lang, storage) - outl = df.nlargest(df.shape[0]//2, "percentage") - return outl.nlargest(10, "instruction") +def outlier_rows(lang: str, + storage: str, + outlier_num: int = 10, + write_to_csv: bool = False): + df = csv_to_pandas_df(lang, storage) + outl = df.nlargest(df.shape[0] // 2, "percentage") + outl = outl[outl.instruction > outl["instruction"].quantile( + q=.75, interpolation='lower')] + outl = outl.nlargest(outlier_num, "percentage") + if (write_to_csv): + df.to_csv(lang + '_outliers.csv', index=False) + return outl From cd06e34f01606484dacf9d2b8c124f565f0b849f Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Fri, 30 Aug 2024 13:27:45 -0400 Subject: [PATCH 07/11] Fixed docstring consistency. 
--- .../python_processing/read_column.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index 06cf587..e4b0e99 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -2,10 +2,10 @@ csv_to_pandas_df Returns: pandas.core.frame.DataFrame - Example usage: csv_to_pandas('c', '/tmp', file_name='_other_suffix.csv', write_to_csv=True) + Example usage: csv_to_pandas('c', '/tmp', file_name_suffix='_other_suffix.csv') outlier_rows Returns: pandas.core.frame.DataFrame - Example usage: outlier_rows('c', '/tmp') + Example usage: outlier_rows('c', '/tmp', outlier_num=20, write_to_csv=True) """ From 42578100e42f673c2881a3fa86711c5fce0b4a33 Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Sun, 1 Sep 2024 16:43:00 -0400 Subject: [PATCH 08/11] Removed unnecessary code from notebook.
Fixed bug with read_column.py --- .../python_processing/plot_and_display_utlities.ipynb | 5 ++--- .../python_processing/read_column.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb index aff403d..b6f3ba9 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb @@ -9,11 +9,10 @@ "%matplotlib inline\n", "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms\n", "from read_column import outlier_rows\n", - "import csv\n", "from IPython.display import HTML, display\n", - "import pandas \n", + "from pandas import read_csv\n", "\n", - "langs = pandas.read_csv('../dataset_download/indices.csv')[\"language\"]\n", + "langs = read_csv('../dataset_download/indices.csv')[\"language\"]\n", "CSV_STORAGE=\"\"\n" ] }, diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index e4b0e99..72efefb 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -15,7 +15,7 @@ def csv_to_pandas_df(lang: str, import pandas as pd df = pd.read_csv( - storage + lang + '_' + file_name_suffix, skipinitialspace=True) + storage + '/' + lang + '_' + file_name_suffix, skipinitialspace=True) return df From 723e89b4b30719621056f8c0f7a0d563600374b7 Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Sun, 1 Sep 2024 17:41:34 -0400 Subject: [PATCH 09/11] Modified notebook for clarity. 
--- .../python_processing/plot_and_display_utlities.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb index b6f3ba9..710dcbb 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb @@ -13,6 +13,9 @@ "from pandas import read_csv\n", "\n", "langs = read_csv('../dataset_download/indices.csv')[\"language\"]\n", + "#filter out a language\n", + "#langs = [x for x in langs if x != \"julia\"]\n", + "#location of CSV for each language\n", "CSV_STORAGE=\"\"\n" ] }, @@ -43,7 +46,7 @@ "outputs": [], "source": [ "for i in langs:\n", - " display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))\n" + " display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))" ] }, { From 8ee3277ffbe4dbbde27b556372d19178b60f32b7 Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Sat, 14 Sep 2024 18:31:56 -0400 Subject: [PATCH 10/11] Updated docstrings. 
--- .../python_processing/fitting_and_plotting.py | 46 +++++++++++++++---- .../python_processing/read_column.py | 46 +++++++++++++++---- 2 files changed, 76 insertions(+), 16 deletions(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py index 14076e3..d2b42ac 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py @@ -1,21 +1,35 @@ -"""plot_instruction_counts_v_textseg is used to make compiler instruction counts & text segment size scatter plots, and plot_instruction_counts_histograms is used to make compiler instruction counts histograms. +"""Functions for plotting instruction counts and text segment size data. -plot_instruction_counts_v_textseg - Returns: None - Example Usage: plot_instruction_counts_v_textseg("c", "/tmp", show=True) +This script contains two functions. One function makes a scatter plot of +compiler instruction counts and text segment size data. The other makes +a histogram of the compiler instruction counts. - plot_instruction_counts_histograms - Returns: None - Example Usage: plot_instruction_counts_histograms("c", "/tmp", show=True) +Example Usage: from fitting_and_plotting import * """ import matplotlib.pyplot as plt from read_column import csv_to_pandas_df +import numpy as np def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = False) -> None: - import numpy as np + """Display a scatter plot of the data using matplotlib pyplot. + + The function internally creates a pandas dataframe from the provided + CSV file location. The columns pertaining to text segment size and + compiler CPU instruction counts are plotted in a scatter plot with + logarithmic axis. 
The x and y bounds of the plot are limited to + provide a standard range of values for all plots. + + Args: + lang: A string which represents the type of IR file data being + accessed + storage: A string which is the path to the IR CSV data + show: A boolean which if set to True will print the number of data + points and show the scatter plot using pyplot.show(), otherwise + the plot is saved to a .pdf file + """ df = csv_to_pandas_df(lang, storage) textseg_data = df["text_segment"] inst_data = df["instruction"] @@ -51,6 +65,22 @@ def plot_instruction_counts_v_textseg(lang: str, def plot_instruction_counts_histograms(lang: str, storage: str, show: bool = False) -> None: + """Display a histogram of the compile time data using matplotlib pyplot. + + The function internally creates a pandas dataframe from the provided + CSV file location. The columns pertaining to compiler CPU instruction + counts are plotted in a histogram with logarithmic axis. The x and y + bounds of the plot are limited to provide a standard range of values + for all plots. 
+ + Args: + lang: A string which represents the type of IR file data being + accessed + storage: A string which is the path to the IR CSV data + show: A boolean which if set to True will print the number of data + points and display the histogram using pyplot.show(), otherwise + the plot is saved to a .pdf file + """ df = csv_to_pandas_df(lang, storage) inst_data = df["instruction"] plt.hist(inst_data, bins='auto', alpha=1, color='b') diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index 72efefb..dbb53a7 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -1,18 +1,29 @@ -"""csv_to_pandas_df is used to read a IR features CSV file and create a Pandas dataframe for it, and outlier_rows is used to filter the Pandas dataframe to the IR files which can be classified as outliers. +"""Functions to create a dataframe from a CSV and extract the outlying rows. -csv_to_pandas_df - Returns: pandas.core.frame.DataFrame - Example usage: csv_to_pandas('c', '/tmp', file_name_suffix='_other_suffix.csv') -outlier_rows - Returns: pandas.core.frame.DataFrame - Example usage: outlier_rows('c', '/tmp', outlier_num=20, write_to_csv=True) +The functions in this script can be used to create a Pandas dataframe +for the CSV data extracted from the IR dataset and analyze it. Using a +simple outlier extraction method, rows which constitute file data can be +chosen on a basis which finds IR files to be outliers. + +Example usage: from read_column import * """ +import pandas as pd def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined.csv'): - import pandas as pd + """Creates a Pandas dataframe from the specified CSV file. 
+ + Args: + lang: A string which represents the type of IR file data being + accessed + storage: A string which is the path to the IR CSV data + file_name_suffix: The suffix present on each CSV file with prefix + as lang + + Returns: pandas.core.frame.DataFrame + """ df = pd.read_csv( storage + '/' + lang + '_' + file_name_suffix, skipinitialspace=True) @@ -23,6 +34,25 @@ def outlier_rows(lang: str, storage: str, outlier_num: int = 10, write_to_csv: bool = False): + """Creates a Pandas dataframe from the specified CSV file. + + The function creates a dataframe and initially filters out half of the + rows which do not fall into the 50th percentile for the "percentage" + column. Then, any row which does not fall into the 75th percentile for + the "instruction" column is filtered out. Using outlier_num, the + largest n rows for "percentage" are returned as the outlying files + dataframe. + + Args: + lang: A string which represents the type of IR file data being + accessed + storage: A string which is the path to the IR CSV data + outlier_num: The number of IR file outliers to display + write_to_csv: If True, the resulting dataframe will be written to a + CSV file + + Returns: pandas.core.frame.DataFrame + """ df = csv_to_pandas_df(lang, storage) outl = df.nlargest(df.shape[0] // 2, "percentage") outl = outl[outl.instruction > outl["instruction"].quantile( From ed7255bea2eaddbc53fd97a8afe5eecd63a87ccd Mon Sep 17 00:00:00 2001 From: Andrew Kallai Date: Fri, 24 Jan 2025 19:54:22 -0500 Subject: [PATCH 11/11] Adding a more clear variable name for the outliers and fixing the csv output to have only the outliers data. 
--- .../python_processing/read_column.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py index dbb53a7..fc47dc0 100644 --- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py @@ -54,10 +54,10 @@ def outlier_rows(lang: str, Returns: pandas.core.frame.DataFrame """ df = csv_to_pandas_df(lang, storage) - outl = df.nlargest(df.shape[0] // 2, "percentage") - outl = outl[outl.instruction > outl["instruction"].quantile( - q=.75, interpolation='lower')] - outl = outl.nlargest(outlier_num, "percentage") + outlier_df = df.nlargest(df.shape[0] // 2, "percentage") + outlier_df = outlier_df[outlier_df.instruction > outlier_df["instruction"] + .quantile(q=.75, interpolation='lower')] + outlier_df = outlier_df.nlargest(outlier_num, "percentage") if (write_to_csv): - df.to_csv(lang + '_outliers.csv', index=False) - return outl + outlier_df.to_csv(lang + '_outliers.csv', index=False) + return outlier_df