diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
new file mode 100644
index 0000000..d2b42ac
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
@@ -0,0 +1,104 @@
+"""Functions for plotting instruction counts and text segment size data.
+
+This script contains two functions. One function makes a scatter plot of
+compiler instruction counts and text segment size data. The other makes
+a histogram of the compiler instruction counts.
+
+Example Usage: from fitting_and_plotting import *
+"""
+import matplotlib.pyplot as plt
+import numpy as np
+
+from read_column import csv_to_pandas_df
+
+
+def plot_instruction_counts_v_textseg(lang: str,
+                                      storage: str,
+                                      show: bool = False) -> None:
+  """Plot compiler instruction counts against text segment size.
+
+  The CSV data for `lang` is loaded into a pandas dataframe and its
+  "text_segment" column is plotted against its "instruction" column in a
+  scatter plot. A second-degree polynomial fitted to the data with
+  numpy.polyfit is drawn on top. Both axes are logarithmic and their
+  bounds are fixed to provide a standard range of values for all plots.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    show: A boolean which if set to True will print the number of data
+      points and show the scatter plot using pyplot.show(), otherwise
+      the plot is saved to a .pdf file
+  """
+  df = csv_to_pandas_df(lang, storage)
+  textseg_data = df["text_segment"]
+  inst_data = df["instruction"]
+  c, b, a = np.polyfit(textseg_data, inst_data, 2)
+
+  x_axis = range(min(textseg_data), max(textseg_data), 10)
+  z = np.polyval([c, b, a], x_axis)
+
+  # Each artist carries its own label: a bare list of labels passed to
+  # plt.legend() is matched to the artists purely by order, which can pair
+  # the fit label with the scatter points.
+  plt.scatter(textseg_data, inst_data, label="original")
+  plt.xscale("log")
+  plt.yscale("log")
+  plt.gca().set_ylim([10**8, 10**13])
+  plt.gca().set_xlim([10**(-1), 10**9])
+  plt.xlabel("Text Segment Size (bytes)")
+  plt.ylabel("Compiler CPU Instructions Count")
+  compiler = "Clang++" if lang == "cpp" else "Clang"
+  plt.title(f"{compiler} Compiler Instructions vs. Text Segment Size ({lang})")
+  equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
+  plt.plot(x_axis, z, 'r', label=f"fit: {equation}")
+  plt.legend()
+  if show:
+    print(len(textseg_data))
+    plt.show()
+  else:
+    plt.savefig(fname=lang + "_instvtext.pdf", format="pdf")
+    plt.close()
+
+
+def plot_instruction_counts_histograms(lang: str,
+                                       storage: str,
+                                       show: bool = False) -> None:
+  """Plot a histogram of the compiler instruction counts.
+
+  The CSV data for `lang` is loaded into a pandas dataframe and its
+  "instruction" column is plotted in a histogram with logarithmic axes.
+  The x and y bounds of the plot are fixed to provide a standard range of
+  values for all plots, and the largest instruction count is annotated at
+  its position on the x axis.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    show: A boolean which if set to True will print the number of data
+      points and display the histogram using pyplot.show(), otherwise
+      the plot is saved to a .pdf file
+  """
+  df = csv_to_pandas_df(lang, storage)
+  inst_data = df["instruction"]
+  plt.hist(inst_data, bins='auto', alpha=1, color='b')
+  plt.title(f"Histogram of Compiler Instructions ({lang})")
+  plt.xscale("log")
+  plt.yscale("log")
+  plt.gca().set_ylim([10**(-1), 10**5])
+  plt.gca().set_xlim([10**8, 10**13])
+  plt.xlabel('Compiler CPU Instructions Count')
+  plt.ylabel('No. of IR Files')
+
+  # Annotate the largest instruction count at its position on the x axis.
+  largest = max(inst_data)
+  plt.text(largest, 1.1, format(largest, '.2e'), ha='center')
+
+  if show:
+    print(len(inst_data))
+    plt.show()
+  else:
+    plt.savefig(fname=lang + "_hist.pdf", format="pdf")
+    plt.close()
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
new file mode 100644
index 0000000..710dcbb
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
@@ -0,0 +1,81 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms\n",
+    "from read_column import outlier_rows\n",
+    "from IPython.display import HTML, display\n",
+    "from pandas import read_csv\n",
+    "\n",
+    "langs = read_csv('../dataset_download/indices.csv')[\"language\"]\n",
+    "#filter out a language\n",
+    "#langs = [x for x in langs if x != \"julia\"]\n",
+    "#location of CSV for each language\n",
+    "CSV_STORAGE=\"\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    plot_instruction_counts_histograms(i, CSV_STORAGE, show=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    plot_instruction_counts_v_textseg(i, CSV_STORAGE, show=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
new file mode 100644
index 0000000..fc47dc0
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -0,0 +1,72 @@
+"""Functions to create a dataframe from a CSV and extract the outlying rows.
+
+The functions in this script can be used to create a Pandas dataframe
+for the CSV data extracted from the IR dataset and analyze it. Using a
+simple outlier extraction method, rows which constitute file data can be
+chosen on a basis which finds IR files to be outliers.
+
+Example usage: from read_column import *
+"""
+import os
+
+import pandas as pd
+
+
+def csv_to_pandas_df(lang: str,
+                     storage: str,
+                     file_name_suffix: str = 'combined.csv') -> pd.DataFrame:
+  """Creates a Pandas dataframe from the specified CSV file.
+
+  The file read is "{lang}_{file_name_suffix}" inside the `storage`
+  directory.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data; an empty
+      string refers to the current working directory
+    file_name_suffix: The suffix present on each CSV file with prefix
+      as lang
+
+  Returns:
+    A pandas DataFrame holding the contents of the CSV file.
+  """
+  # os.path.join copes with an empty or slash-terminated `storage`; plain
+  # concatenation with '/' turned an empty `storage` into a path at the
+  # filesystem root.
+  csv_path = os.path.join(storage, lang + '_' + file_name_suffix)
+  return pd.read_csv(csv_path, skipinitialspace=True)
+
+
+def outlier_rows(lang: str,
+                 storage: str,
+                 outlier_num: int = 10,
+                 write_to_csv: bool = False) -> pd.DataFrame:
+  """Selects the rows of the CSV data which are considered outliers.
+
+  The function creates a dataframe and keeps the half of the rows with the
+  largest values in the "percentage" column. Of those, only the rows whose
+  "instruction" value is above the 75th percentile of the remaining
+  "instruction" values are kept. Finally, the outlier_num rows with the
+  largest "percentage" values are returned as the outlying files
+  dataframe.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    outlier_num: The number of IR file outliers to display
+    write_to_csv: If True, the resulting dataframe will be written to
+      "{lang}_outliers.csv"
+
+  Returns:
+    A pandas DataFrame with at most outlier_num rows.
+  """
+  df = csv_to_pandas_df(lang, storage)
+  top_half = df.nlargest(df.shape[0] // 2, "percentage")
+  threshold = top_half["instruction"].quantile(q=.75, interpolation='lower')
+  outlier_df = top_half[top_half["instruction"] > threshold]
+  outlier_df = outlier_df.nlargest(outlier_num, "percentage")
+  if write_to_csv:
+    outlier_df.to_csv(lang + '_outliers.csv', index=False)
+  return outlier_df