llvm-ml · andrewkallai · Aug 10, 2024 · Aug 23, 2024 · Aug 24, 2024 · Aug 24, 2024
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
@@ -0,0 +1,102 @@
+"""Functions for plotting instruction counts and text segment size data.
+
+This script contains two functions. One function makes a scatter plot of
+compiler instruction counts and text segment size data. The other makes
+a histogram of the compiler instruction counts.
+
+Example Usage: from fitting_and_plotting import *
+"""
+import matplotlib.pyplot as plt
+from read_column import csv_to_pandas_df
+import numpy as np
+
+
+def plot_instruction_counts_v_textseg(lang: str,
+                                      storage: str,
+                                      show: bool = False) -> None:
+  """Plot compiler instruction counts against text segment size.
+
+  A pandas dataframe is built from the CSV for lang via
+  csv_to_pandas_df. Its "text_segment" and "instruction" columns are
+  drawn as a scatter plot on logarithmic axes, and a quadratic
+  least-squares fit (numpy.polyfit, degree 2) is overlaid in red. The
+  x and y bounds of the plot are fixed to provide a standard range of
+  values for all plots.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed; "cpp" selects the "Clang++" title, any other value
+      selects the "Clang" title
+    storage: A string which is the path to the IR CSV data
+    show: A boolean which if set to True will print the number of data
+      points and show the scatter plot using pyplot.show(), otherwise
+      the plot is saved to "<lang>_instvtext.pdf"
+  """
+  df = csv_to_pandas_df(lang, storage)
+  textseg_data = df["text_segment"]
+  inst_data = df["instruction"]
+
+  # polyfit returns the coefficients highest power first: c*x^2 + b*x + a.
+  c, b, a = np.polyfit(textseg_data, inst_data, 2)
+
+  # Sample the fitted curve every 10 bytes across the observed sizes.
+  # np.arange yields the same points as range() for integer sizes and,
+  # unlike range(), also accepts non-integer bounds.
+  x_axis = np.arange(min(textseg_data), max(textseg_data), 10)
+  z = np.polyval([c, b, a], x_axis)
+
+  scatter_points = plt.scatter(textseg_data, inst_data, label="original")
+  plt.xscale("log")
+  plt.yscale("log")
+  plt.gca().set_ylim([10**8, 10**13])
+  plt.gca().set_xlim([10**(-1), 10**9])
+  plt.xlabel("Text Segment Size (bytes)")
+  plt.ylabel("Compiler CPU Instructions Count")
+
+  compiler = "Clang++" if lang == "cpp" else "Clang"
+  plt.title(f"{compiler} Compiler Instructions vs. Text Segment Size ({lang})")
+
+  equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
+  fit_line, = plt.plot(x_axis, z, 'r', label=f"fit: {equation}")
+  # Pass the handles explicitly so each label stays attached to its own
+  # artist instead of depending on the order in which they were drawn.
+  plt.legend(handles=[fit_line, scatter_points])
+
+  if show:
+    print(len(textseg_data))
+    plt.show()
+  else:
+    plt.savefig(fname=lang + "_instvtext.pdf", format="pdf")
+  plt.close()
+
+
+def plot_instruction_counts_histograms(lang: str,
+                                       storage: str,
+                                       show: bool = False) -> None:
+  """Draw a log-log histogram of compiler instruction counts.
+
+  The "instruction" column of the CSV for lang is loaded through
+  csv_to_pandas_df and binned with matplotlib's 'auto' rule. Both axes
+  are logarithmic and clamped to fixed limits so that the plots for
+  every language share one range. The largest instruction count is
+  written on the plot as a text annotation at its x position.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    show: A boolean which if set to True will print the number of data
+      points and display the histogram using pyplot.show(), otherwise
+      the plot is saved to "<lang>_hist.pdf"
+  """
+  inst_data = csv_to_pandas_df(lang, storage)["instruction"]
+
+  plt.hist(inst_data, bins='auto', alpha=1, color='b')
+  axes = plt.gca()
+  axes.set_title(f"Histogram of Compiler Instructions ({lang})")
+  axes.set_xscale("log")
+  axes.set_yscale("log")
+  axes.set_ylim([10**(-1), 10**5])
+  axes.set_xlim([10**8, 10**13])
+  axes.set_xlabel('Compiler CPU Instructions Count')
+  axes.set_ylabel('No. of IR Files')
+
+  # Label the largest count, centered on its x position at y = 1.1.
+  largest_count = max(inst_data)
+  axes.text(largest_count, 1.1, f"{largest_count:.2e}", ha='center')
+
+  if show:
+    print(len(inst_data))
+    plt.show()
+  else:
+    plt.savefig(fname=lang + "_hist.pdf", format="pdf")
+  plt.close()
diff --git a/...taset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/...taset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
@@ -0,0 +1,81 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms\n",
+    "from read_column import outlier_rows\n",
+    "from IPython.display import HTML, display\n",
+    "from pandas import read_csv\n",
+    "\n",
+    "langs = read_csv('../dataset_download/indices.csv')[\"language\"]\n",
+    "#filter out a language\n",
+    "#langs = [x for x in langs if x != \"julia\"]\n",
+    "#location of CSV for each language\n",
+    "CSV_STORAGE=\"\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    plot_instruction_counts_histograms(i, CSV_STORAGE, show=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    plot_instruction_counts_v_textseg(i, CSV_STORAGE, show=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -0,0 +1,63 @@
+"""Functions to create a dataframe from a CSV and extract the outlying rows.
+
+The functions in this script create a Pandas dataframe from the CSV
+data extracted from the IR dataset and analyze it. A simple
+percentile-based filter then selects the rows (one per IR file) whose
+values mark them as outliers.
+
+Example usage: from read_column import *
+"""
+import pandas as pd
+
+
+def csv_to_pandas_df(lang: str,
+                     storage: str,
+                     file_name_suffix: str = 'combined.csv'):
+  """Load the per-language CSV file into a Pandas dataframe.
+
+  The file that is read is "<storage>/<lang>_<file_name_suffix>". The
+  CSV is parsed with skipinitialspace=True.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed; it forms the prefix of the CSV file name
+    storage: A string which is the path to the directory holding the
+      IR CSV data
+    file_name_suffix: The suffix present on each CSV file name, joined
+      to lang with an underscore
+
+  Returns: pandas.core.frame.DataFrame
+  """
+  csv_name = '_'.join((lang, file_name_suffix))
+  csv_path = '/'.join((storage, csv_name))
+  return pd.read_csv(csv_path, skipinitialspace=True)
+
+
+def outlier_rows(lang: str,
+                 storage: str,
+                 outlier_num: int = 10,
+                 write_to_csv: bool = False):
+  """Selects the outlying rows of the CSV data for one language.
+
+  The dataframe for lang is loaded with csv_to_pandas_df and narrowed
+  in three steps:
+    1. Keep the half of the rows with the largest "percentage" values.
+    2. Of those, keep the rows whose "instruction" value is strictly
+       greater than the 75th percentile of "instruction" computed over
+       the rows kept in step 1 (using 'lower' interpolation).
+    3. Of those, keep the outlier_num rows with the largest
+       "percentage" values.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    outlier_num: The maximum number of IR file outliers to return
+    write_to_csv: If True, the resulting dataframe will also be written
+      to "<lang>_outliers.csv" without the index column
+
+  Returns: pandas.core.frame.DataFrame
+  """
+  full_df = csv_to_pandas_df(lang, storage)
+
+  # Step 1: top half of the rows by "percentage".
+  top_half = full_df.nlargest(full_df.shape[0] // 2, "percentage")
+
+  # Step 2: rows above the 75th percentile of the remaining counts.
+  instruction_counts = top_half.instruction
+  cutoff = instruction_counts.quantile(q=.75, interpolation='lower')
+  above_cutoff = top_half[instruction_counts > cutoff]
+
+  # Step 3: the outlier_num largest "percentage" rows among those.
+  outlier_df = above_cutoff.nlargest(outlier_num, "percentage")
+
+  if write_to_csv:
+    outlier_df.to_csv(lang + '_outliers.csv', index=False)
+  return outlier_df