From 9904a5d07a6e3fb3392e9d4de9a0786d27bd6f1c Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Sat, 10 Aug 2024 06:07:57 -0400
Subject: [PATCH 01/11] Adding python files for graphing and processing csv
 data.

---
 .../python_processing/fitting_and_plotting.py | 35 +++++++++++++++++++
 .../python_processing/read_column.py          | 16 +++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
 create mode 100644 llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
new file mode 100644
index 0000000..6284ccc
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
@@ -0,0 +1,35 @@
+# -*- coding: ascii -*-
+import numpy as np
+import matplotlib.pyplot as plt
+from read_column import open_and_load
+# Usage:
+# from fitting_and_plotting import plot_functionality
+
+
+def plot_functionality(lang: str, show: bool = False) -> None:
+    '''
+    Function to graph csv data for text segment size and instructions counts.
+    '''
+    textseg_data, inst_data = open_and_load(lang)
+    c, b, a = np.polyfit(textseg_data, inst_data, 2)
+
+    x_axis = range(min(textseg_data), max(textseg_data), 10)
+    z = np.polyval([c, b, a], x_axis)
+
+    plt.scatter(textseg_data, inst_data)
+    plt.xscale("log")
+    plt.yscale("log")
+    plt.gca().set_ylim([10**8, 10**13])
+    plt.xlabel("text_segment_size (bytes)")
+    plt.ylabel("compiler_cpu_instructions_count")
+    if (lang == "cpp"):
+        plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")")
+    else:
+        plt.title("Clang Compiler Instructions vs. Text Segment Size ("+lang+")")
+    plt.plot(x_axis, z, 'r')
+    equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
+    plt.legend([f"fit: {equation}", "original"])
+    if (show):
+        plt.show()
+    else:
+        plt.savefig(fname=lang+"_instvtext.pdf", format="pdf")
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
new file mode 100644
index 0000000..e3e1910
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -0,0 +1,16 @@
+# -*- coding: ascii -*-
+import csv
+
+
+def open_and_load(lang: str, STORAGE: str = '/tmp') -> [int]:
+    '''
+    Function to read csv files containing text segment size and instruction counts data.
+    '''
+    textseg_data: [int] = []
+    inst_data: [int] = []
+    with open(STORAGE+lang+"_combined_results.csv", mode='r', newline='') as file:
+        for x in csv.DictReader(file):
+            textseg_data.append(int(x[" text_segment_size"]))
+
+            inst_data.append(int(x[" instructions"]))
+    return textseg_data, inst_data

From 94b86ad61d538eae4c68d34dfd10e16a6149445a Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Fri, 23 Aug 2024 15:42:46 -0400
Subject: [PATCH 02/11] Adding changes to utilities to plot and process csv
 files. Adding initial jupyter notebook file to display results.

---
 .../python_processing/fitting_and_plotting.py |  74 +++++++--
 .../plot_and_display_utlities.ipynb           | 154 ++++++++++++++++++
 .../python_processing/read_column.py          |  31 ++--
 3 files changed, 233 insertions(+), 26 deletions(-)
 create mode 100644 llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
index 6284ccc..9fdc72d 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
@@ -1,16 +1,22 @@
-# -*- coding: ascii -*-
-import numpy as np
-import matplotlib.pyplot as plt
-from read_column import open_and_load
-# Usage:
-# from fitting_and_plotting import plot_functionality
+"""
+Utilies for plotting instruction counts & text segment size scatter plots, as well as instruction counts histograms.
+
+plot_instruction_counts_v_textseg
+  Returns: None
+  Example Usage: plot_instruction_counts_v_textseg("c", "/tmp", show=True)
 
+ plot_instruction_counts_histograms
+  Returns: None
+  Example Usage: plot_instruction_counts_histograms("c", "/tmp", show=True)
+"""
+import matplotlib.pyplot as plt
+from read_column import pandas_df_with_outlier_scores
 
-def plot_functionality(lang: str, show: bool = False) -> None:
-    '''
-    Function to graph csv data for text segment size and instructions counts.
-    '''
-    textseg_data, inst_data = open_and_load(lang)
+def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = False) -> None:
+    import numpy as np
+    df = pandas_df_with_outlier_scores(lang, storage)
+    textseg_data = df["text_segment"]
+    inst_data = df["instruction"]
     c, b, a = np.polyfit(textseg_data, inst_data, 2)
 
     x_axis = range(min(textseg_data), max(textseg_data), 10)
@@ -20,8 +26,9 @@ def plot_functionality(lang: str, show: bool = False) -> None:
     plt.xscale("log")
     plt.yscale("log")
     plt.gca().set_ylim([10**8, 10**13])
-    plt.xlabel("text_segment_size (bytes)")
-    plt.ylabel("compiler_cpu_instructions_count")
+    plt.gca().set_xlim([10**(-1), 10**9])
+    plt.xlabel("Text Segment Size (bytes)")
+    plt.ylabel("Compiler CPU Instructions Count")
     if (lang == "cpp"):
         plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")")
     else:
@@ -30,6 +37,47 @@ def plot_functionality(lang: str, show: bool = False) -> None:
     equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
     plt.legend([f"fit: {equation}", "original"])
     if (show):
+        print(len(textseg_data))
         plt.show()
     else:
         plt.savefig(fname=lang+"_instvtext.pdf", format="pdf")
+    plt.close()
+
+
+def plot_instruction_counts_histograms(lang: str, storage: str, show: bool = False) -> None:
+    df = pandas_df_with_outlier_scores(lang, storage)
+    inst_data = df["instruction"]
+    plt.hist(inst_data, bins='auto', alpha=1, color='b')
+    plt.title("Histogram of Compiler Instructions ("+lang+")")
+    plt.xscale("log")
+    plt.yscale("log")
+    plt.gca().set_ylim([10**(-1), 10**5])
+    plt.gca().set_xlim([10**8, 10**13])
+    plt.xlabel('Compiler CPU Instructions Count')
+    plt.ylabel('No. of IR Files')
+
+    plt.text(max(inst_data), 1.1, format(max(inst_data), '.2e'), ha='center')
+
+    if (show):
+        print(len(inst_data))
+        plt.show()
+    else:
+        plt.savefig(fname=lang+"_hist.pdf", format="pdf")
+    plt.close()
+
+def print_outlier_rows(lang: str, storage: str, show) -> None:
+  df = pandas_df_with_outlier_scores(lang, storage)
+  outl = df.nlargest(10, "outlier_scores")
+#  print(outl.to_string(index=False))
+  outl.style.hide_index()
+  print(outl)
+#  cols = outl.loc[:, "BasicBlockCount":"CallWithPointerArgumentCount"]
+
+#  for col in cols:
+#        non_zero_values = outl[col][outl[col] != 0]
+#        if not non_zero_values.empty:
+#          print(f"Non-zero values in column {col}:")
+#          print(non_zero_values)
+
+
+
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
new file mode 100644
index 0000000..e2dee94
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
@@ -0,0 +1,154 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "            file  text_segment   instruction  BasicBlockCount  \\\n",
+      "4477    file4504       4102289  639943870091           146673   \n",
+      "364      file369        897703  517186177449            30629   \n",
+      "13629  file13687        251257  507769720840             9335   \n",
+      "3007    file3026        782770  458695637213            27226   \n",
+      "16940  file17015      31982373  438351265829           380796   \n",
+      "20175  file20265       1977670  317035646729            72528   \n",
+      "29662  file29786       1868172  300746094074            69423   \n",
+      "10381  file10427       1868172  300720888123            69423   \n",
+      "1667    file1680       1534476  214007426790            55480   \n",
+      "20841  file20931       1519168  211214208675            46363   \n",
+      "\n",
+      "       BlocksReachedFromConditionalInstruction    Uses  \\\n",
+      "4477                                    122900  161756   \n",
+      "364                                      24080   15219   \n",
+      "13629                                     7167    4459   \n",
+      "3007                                     22377   15349   \n",
+      "16940                                   276328  208458   \n",
+      "20175                                    60702   27729   \n",
+      "29662                                    57219   27998   \n",
+      "10381                                    57219   27998   \n",
+      "1667                                     43498   20981   \n",
+      "20841                                    37508   18374   \n",
+      "\n",
+      "       DirectCallsToDefinedFunctions  LoadInstCount  StoreInstCount  \\\n",
+      "4477                          161745        1204668          842106   \n",
+      "364                            14710          81272           56693   \n",
+      "13629                           4309          22582           15908   \n",
+      "3007                           15116          70912           49911   \n",
+      "16940                         208412        1644587         1427034   \n",
+      "20175                          26870         159768           95017   \n",
+      "29662                          27139         150196           87140   \n",
+      "10381                          27139         150196           87140   \n",
+      "1667                           20175         127244           77850   \n",
+      "20841                          17383         110236           67690   \n",
+      "\n",
+      "       MaxLoopDepth       ...        CallReturnsPointerCount  \\\n",
+      "4477            953       ...                              2   \n",
+      "364             200       ...                           5226   \n",
+      "13629            56       ...                           1572   \n",
+      "3007             90       ...                           4424   \n",
+      "16940          5208       ...                           9739   \n",
+      "20175           286       ...                           8509   \n",
+      "29662           282       ...                           8292   \n",
+      "10381           282       ...                           8292   \n",
+      "1667            392       ...                           7683   \n",
+      "20841           280       ...                           6749   \n",
+      "\n",
+      "       CallReturnsVectorIntCount  CallReturnsVectorFloatCount  \\\n",
+      "4477                           0                            0   \n",
+      "364                            0                            0   \n",
+      "13629                          0                            0   \n",
+      "3007                           0                            0   \n",
+      "16940                          0                            0   \n",
+      "20175                          0                            0   \n",
+      "29662                          0                            0   \n",
+      "10381                          0                            0   \n",
+      "1667                           0                            0   \n",
+      "20841                          0                           16   \n",
+      "\n",
+      "       CallReturnsVectorPointerCount  CallWithManyArgumentsCount  \\\n",
+      "4477                               0                        4948   \n",
+      "364                                0                        1088   \n",
+      "13629                              0                         248   \n",
+      "3007                               0                         181   \n",
+      "16940                              0                       20707   \n",
+      "20175                              0                        2365   \n",
+      "29662                              0                        2364   \n",
+      "10381                              0                        2364   \n",
+      "1667                               0                        2375   \n",
+      "20841                              0                        2141   \n",
+      "\n",
+      "       CallWithPointerArgumentCount  percentage     pass_name  \\\n",
+      "4477                         266915        96.5  VerifierPass   \n",
+      "364                           32818        93.3  VerifierPass   \n",
+      "13629                          9572        93.4  VerifierPass   \n",
+      "3007                          27721        94.7  VerifierPass   \n",
+      "16940                        568566        90.3  VerifierPass   \n",
+      "20175                         55524        93.9  VerifierPass   \n",
+      "29662                         54821        93.4  VerifierPass   \n",
+      "10381                         54821        91.7  VerifierPass   \n",
+      "1667                          54131        94.8  VerifierPass   \n",
+      "20841                         47185        94.8  VerifierPass   \n",
+      "\n",
+      "       normalized_instruction  outlier_scores  \n",
+      "4477               100.000000      196.500000  \n",
+      "364                 80.800201      174.100201  \n",
+      "13629               79.327429      172.727429  \n",
+      "3007                71.652044      166.352044  \n",
+      "16940               68.470102      158.770102  \n",
+      "20175               49.495849      143.395849  \n",
+      "29662               46.948097      140.348097  \n",
+      "10381               46.944155      138.644155  \n",
+      "1667                33.381819      128.181819  \n",
+      "20841               32.944948      127.744948  \n",
+      "\n",
+      "[10 rows x 51 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "%matplotlib inline\n",
+    "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms, print_outlier_rows\n",
+    "\n",
+    "#lot_instruction_counts_v_textseg(\"c\", \"/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/\", show=True)\n",
+    "#lot_instruction_counts_histograms(\"c\", \"/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/\", show=True)\n",
+    "#rint_outlier(\"c\")\n",
+    "\n",
+    "#for i in [\"c\", \"cpp\", \"rust\", \"swift\"]:\n",
+    "#    plot_histogram(i, show=True)\n",
+    "#    plot_histogram(i,show=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index e3e1910..fd7107d 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -1,16 +1,21 @@
-# -*- coding: ascii -*-
-import csv
+"""Utilities for reading and processing csv data with features for outlier analysis.
 
+pandas_df_with_outlier_scores
+  Returns: pandas.core.frame.DataFrame
+  Example usage: pandas_df_with_outlier_scores('c', '/tmp', file_name='_other_suffix.csv', write_to_csv=True)
+"""
 
-def open_and_load(lang: str, STORAGE: str = '/tmp') -> [int]:
-    '''
-    Function to read csv files containing text segment size and instruction counts data.
-    '''
-    textseg_data: [int] = []
-    inst_data: [int] = []
-    with open(STORAGE+lang+"_combined_results.csv", mode='r', newline='') as file:
-        for x in csv.DictReader(file):
-            textseg_data.append(int(x[" text_segment_size"]))
 
-            inst_data.append(int(x[" instructions"]))
-    return textseg_data, inst_data
+def pandas_df_with_outlier_scores(lang: str, storage: str, file_name_suffix: str = '_combined.csv', write_to_csv: bool = False):
+    import pandas as pd
+
+    df = pd.read_csv(storage+lang+file_name_suffix, skipinitialspace=True)
+    data = df["instruction"]
+    min_value = data.min()
+    max_value = data.max()
+    df['normalized_instruction'] = (
+        data - min_value) / (max_value - min_value) * 100
+    df['outlier_scores'] = df["percentage"] + df['normalized_instruction']
+    if (write_to_csv):
+        df.to_csv('normalized_file.csv', index=False)
+    return df

From b44c4f92cefe3cae42c1f34d71f4ab3d55a93ccd Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Sat, 24 Aug 2024 19:21:33 -0400
Subject: [PATCH 03/11] Updates to plotting, read csv file, and jupyter
 notebook.

---
 .../python_processing/fitting_and_plotting.py |  27 +---
 .../plot_and_display_utlities.ipynb           | 153 +++++-------------
 .../python_processing/read_column.py          |  26 +--
 3 files changed, 59 insertions(+), 147 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
index 9fdc72d..5dd2185 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
@@ -1,5 +1,4 @@
-"""
-Utilies for plotting instruction counts & text segment size scatter plots, as well as instruction counts histograms.
+"""Utilies for plotting instruction counts & text segment size scatter plots, as well as instruction counts histograms.
 
 plot_instruction_counts_v_textseg
   Returns: None
@@ -10,11 +9,12 @@
   Example Usage: plot_instruction_counts_histograms("c", "/tmp", show=True)
 """
 import matplotlib.pyplot as plt
-from read_column import pandas_df_with_outlier_scores
+from read_column import csv_to_pandas_df
+
 
 def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = False) -> None:
     import numpy as np
-    df = pandas_df_with_outlier_scores(lang, storage)
+    df = csv_to_pandas_df(lang, storage)
     textseg_data = df["text_segment"]
     inst_data = df["instruction"]
     c, b, a = np.polyfit(textseg_data, inst_data, 2)
@@ -45,7 +45,7 @@ def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = Fals
 
 
 def plot_instruction_counts_histograms(lang: str, storage: str, show: bool = False) -> None:
-    df = pandas_df_with_outlier_scores(lang, storage)
+    df = csv_to_pandas_df(lang, storage)
     inst_data = df["instruction"]
     plt.hist(inst_data, bins='auto', alpha=1, color='b')
     plt.title("Histogram of Compiler Instructions ("+lang+")")
@@ -64,20 +64,3 @@ def plot_instruction_counts_histograms(lang: str, storage: str, show: bool = Fal
     else:
         plt.savefig(fname=lang+"_hist.pdf", format="pdf")
     plt.close()
-
-def print_outlier_rows(lang: str, storage: str, show) -> None:
-  df = pandas_df_with_outlier_scores(lang, storage)
-  outl = df.nlargest(10, "outlier_scores")
-#  print(outl.to_string(index=False))
-  outl.style.hide_index()
-  print(outl)
-#  cols = outl.loc[:, "BasicBlockCount":"CallWithPointerArgumentCount"]
-
-#  for col in cols:
-#        non_zero_values = outl[col][outl[col] != 0]
-#        if not non_zero_values.empty:
-#          print(f"Non-zero values in column {col}:")
-#          print(non_zero_values)
-
-
-
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
index e2dee94..aff403d 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
@@ -2,124 +2,49 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "            file  text_segment   instruction  BasicBlockCount  \\\n",
-      "4477    file4504       4102289  639943870091           146673   \n",
-      "364      file369        897703  517186177449            30629   \n",
-      "13629  file13687        251257  507769720840             9335   \n",
-      "3007    file3026        782770  458695637213            27226   \n",
-      "16940  file17015      31982373  438351265829           380796   \n",
-      "20175  file20265       1977670  317035646729            72528   \n",
-      "29662  file29786       1868172  300746094074            69423   \n",
-      "10381  file10427       1868172  300720888123            69423   \n",
-      "1667    file1680       1534476  214007426790            55480   \n",
-      "20841  file20931       1519168  211214208675            46363   \n",
-      "\n",
-      "       BlocksReachedFromConditionalInstruction    Uses  \\\n",
-      "4477                                    122900  161756   \n",
-      "364                                      24080   15219   \n",
-      "13629                                     7167    4459   \n",
-      "3007                                     22377   15349   \n",
-      "16940                                   276328  208458   \n",
-      "20175                                    60702   27729   \n",
-      "29662                                    57219   27998   \n",
-      "10381                                    57219   27998   \n",
-      "1667                                     43498   20981   \n",
-      "20841                                    37508   18374   \n",
-      "\n",
-      "       DirectCallsToDefinedFunctions  LoadInstCount  StoreInstCount  \\\n",
-      "4477                          161745        1204668          842106   \n",
-      "364                            14710          81272           56693   \n",
-      "13629                           4309          22582           15908   \n",
-      "3007                           15116          70912           49911   \n",
-      "16940                         208412        1644587         1427034   \n",
-      "20175                          26870         159768           95017   \n",
-      "29662                          27139         150196           87140   \n",
-      "10381                          27139         150196           87140   \n",
-      "1667                           20175         127244           77850   \n",
-      "20841                          17383         110236           67690   \n",
-      "\n",
-      "       MaxLoopDepth       ...        CallReturnsPointerCount  \\\n",
-      "4477            953       ...                              2   \n",
-      "364             200       ...                           5226   \n",
-      "13629            56       ...                           1572   \n",
-      "3007             90       ...                           4424   \n",
-      "16940          5208       ...                           9739   \n",
-      "20175           286       ...                           8509   \n",
-      "29662           282       ...                           8292   \n",
-      "10381           282       ...                           8292   \n",
-      "1667            392       ...                           7683   \n",
-      "20841           280       ...                           6749   \n",
-      "\n",
-      "       CallReturnsVectorIntCount  CallReturnsVectorFloatCount  \\\n",
-      "4477                           0                            0   \n",
-      "364                            0                            0   \n",
-      "13629                          0                            0   \n",
-      "3007                           0                            0   \n",
-      "16940                          0                            0   \n",
-      "20175                          0                            0   \n",
-      "29662                          0                            0   \n",
-      "10381                          0                            0   \n",
-      "1667                           0                            0   \n",
-      "20841                          0                           16   \n",
-      "\n",
-      "       CallReturnsVectorPointerCount  CallWithManyArgumentsCount  \\\n",
-      "4477                               0                        4948   \n",
-      "364                                0                        1088   \n",
-      "13629                              0                         248   \n",
-      "3007                               0                         181   \n",
-      "16940                              0                       20707   \n",
-      "20175                              0                        2365   \n",
-      "29662                              0                        2364   \n",
-      "10381                              0                        2364   \n",
-      "1667                               0                        2375   \n",
-      "20841                              0                        2141   \n",
-      "\n",
-      "       CallWithPointerArgumentCount  percentage     pass_name  \\\n",
-      "4477                         266915        96.5  VerifierPass   \n",
-      "364                           32818        93.3  VerifierPass   \n",
-      "13629                          9572        93.4  VerifierPass   \n",
-      "3007                          27721        94.7  VerifierPass   \n",
-      "16940                        568566        90.3  VerifierPass   \n",
-      "20175                         55524        93.9  VerifierPass   \n",
-      "29662                         54821        93.4  VerifierPass   \n",
-      "10381                         54821        91.7  VerifierPass   \n",
-      "1667                          54131        94.8  VerifierPass   \n",
-      "20841                         47185        94.8  VerifierPass   \n",
-      "\n",
-      "       normalized_instruction  outlier_scores  \n",
-      "4477               100.000000      196.500000  \n",
-      "364                 80.800201      174.100201  \n",
-      "13629               79.327429      172.727429  \n",
-      "3007                71.652044      166.352044  \n",
-      "16940               68.470102      158.770102  \n",
-      "20175               49.495849      143.395849  \n",
-      "29662               46.948097      140.348097  \n",
-      "10381               46.944155      138.644155  \n",
-      "1667                33.381819      128.181819  \n",
-      "20841               32.944948      127.744948  \n",
-      "\n",
-      "[10 rows x 51 columns]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%matplotlib inline\n",
-    "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms, print_outlier_rows\n",
+    "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms\n",
+    "from read_column import outlier_rows\n",
+    "import csv\n",
+    "from IPython.display import HTML, display\n",
+    "import pandas   \n",
     "\n",
-    "#lot_instruction_counts_v_textseg(\"c\", \"/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/\", show=True)\n",
-    "#lot_instruction_counts_histograms(\"c\", \"/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/\", show=True)\n",
-    "#rint_outlier(\"c\")\n",
-    "\n",
-    "#for i in [\"c\", \"cpp\", \"rust\", \"swift\"]:\n",
-    "#    plot_histogram(i, show=True)\n",
-    "#    plot_histogram(i,show=False)\n"
+    "langs = pandas.read_csv('../dataset_download/indices.csv')[\"language\"]\n",
+    "CSV_STORAGE=\"\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    plot_instruction_counts_histograms(i, CSV_STORAGE, show=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    plot_instruction_counts_v_textseg(i, CSV_STORAGE, show=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in langs:\n",
+    "    display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))\n"
    ]
   },
   {
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index fd7107d..8c08096 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -1,21 +1,25 @@
 """Utilities for reading and processing csv data with features for outlier analysis.
 
-pandas_df_with_outlier_scores
+csv_to_pandas_df
   Returns: pandas.core.frame.DataFrame
-  Example usage: pandas_df_with_outlier_scores('c', '/tmp', file_name='_other_suffix.csv', write_to_csv=True)
+  Example usage: csv_to_pandas('c', '/tmp', file_name='_other_suffix.csv', write_to_csv=True)
+outlier_rows
+  Returns: pandas.core.frame.DataFrame
+  Example usage: outlier_rows('c', '/tmp')
 """
 
 
-def pandas_df_with_outlier_scores(lang: str, storage: str, file_name_suffix: str = '_combined.csv', write_to_csv: bool = False):
+def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined.csv', write_to_csv: bool = False):
     import pandas as pd
 
-    df = pd.read_csv(storage+lang+file_name_suffix, skipinitialspace=True)
-    data = df["instruction"]
-    min_value = data.min()
-    max_value = data.max()
-    df['normalized_instruction'] = (
-        data - min_value) / (max_value - min_value) * 100
-    df['outlier_scores'] = df["percentage"] + df['normalized_instruction']
+    df = pd.read_csv(storage+lang+'_'+file_name_suffix, skipinitialspace=True)
     if (write_to_csv):
-        df.to_csv('normalized_file.csv', index=False)
+        df.to_csv(lang+'_normalized_'+file_name_suffix, index=False)
     return df
+
+
+def outlier_rows(lang: str, storage: str) -> None:
+    df = csv_to_pandas_df(lang, storage)
+    outl = df.nlargest(df.shape[0]//2, "percentage")
+    #  print(outl[outl.instruction > outl["instruction"].quantile(q=.75, interpolation='lower')])
+    return outl.nlargest(10, "instruction")

From 4f55b57ebd30e4daf9fbe457cfd5fdcad34c851d Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Sat, 24 Aug 2024 19:31:18 -0400
Subject: [PATCH 04/11] Removed unnecessary code.

---
 .../compile_time_analysis_tools/python_processing/read_column.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index 8c08096..412fe78 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -21,5 +21,4 @@ def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined.
 def outlier_rows(lang: str, storage: str) -> None:
     df = csv_to_pandas_df(lang, storage)
     outl = df.nlargest(df.shape[0]//2, "percentage")
-    #  print(outl[outl.instruction > outl["instruction"].quantile(q=.75, interpolation='lower')])
     return outl.nlargest(10, "instruction")

From 6d1f68f45a08ce744adcdfe27079616ccd75d3c2 Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Sat, 24 Aug 2024 19:34:34 -0400
Subject: [PATCH 05/11] extra removals

---
 .../python_processing/read_column.py                            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index 412fe78..8b656a2 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -18,7 +18,7 @@ def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined.
     return df
 
 
-def outlier_rows(lang: str, storage: str) -> None:
+def outlier_rows(lang: str, storage: str):
     df = csv_to_pandas_df(lang, storage)
     outl = df.nlargest(df.shape[0]//2, "percentage")
     return outl.nlargest(10, "instruction")

From a585725df731ee6ee838f8b6e8c44bc83a39df81 Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Fri, 30 Aug 2024 13:20:32 -0400
Subject: [PATCH 06/11] Fixed code formatting, modified docstrings, and
 improved outlier analysis process in read_column.py.

---
 .../python_processing/fitting_and_plotting.py | 100 ++++++++++--------
 .../python_processing/read_column.py          |  31 ++++--
 2 files changed, 73 insertions(+), 58 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
index 5dd2185..14076e3 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
@@ -1,4 +1,4 @@
-"""Utilies for plotting instruction counts & text segment size scatter plots, as well as instruction counts histograms.
+"""plot_instruction_counts_v_textseg is used to make compiler instruction counts & text segment size scatter plots, and plot_instruction_counts_histograms is used to make compiler instruction counts histograms.
 
 plot_instruction_counts_v_textseg
   Returns: None
@@ -12,55 +12,61 @@
 from read_column import csv_to_pandas_df
 
 
-def plot_instruction_counts_v_textseg(lang: str, storage: str, show: bool = False) -> None:
-    import numpy as np
-    df = csv_to_pandas_df(lang, storage)
-    textseg_data = df["text_segment"]
-    inst_data = df["instruction"]
-    c, b, a = np.polyfit(textseg_data, inst_data, 2)
+def plot_instruction_counts_v_textseg(lang: str,
+                                      storage: str,
+                                      show: bool = False) -> None:
+  import numpy as np
+  df = csv_to_pandas_df(lang, storage)
+  textseg_data = df["text_segment"]
+  inst_data = df["instruction"]
+  c, b, a = np.polyfit(textseg_data, inst_data, 2)
 
-    x_axis = range(min(textseg_data), max(textseg_data), 10)
-    z = np.polyval([c, b, a], x_axis)
+  x_axis = range(min(textseg_data), max(textseg_data), 10)
+  z = np.polyval([c, b, a], x_axis)
 
-    plt.scatter(textseg_data, inst_data)
-    plt.xscale("log")
-    plt.yscale("log")
-    plt.gca().set_ylim([10**8, 10**13])
-    plt.gca().set_xlim([10**(-1), 10**9])
-    plt.xlabel("Text Segment Size (bytes)")
-    plt.ylabel("Compiler CPU Instructions Count")
-    if (lang == "cpp"):
-        plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")")
-    else:
-        plt.title("Clang Compiler Instructions vs. Text Segment Size ("+lang+")")
-    plt.plot(x_axis, z, 'r')
-    equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
-    plt.legend([f"fit: {equation}", "original"])
-    if (show):
-        print(len(textseg_data))
-        plt.show()
-    else:
-        plt.savefig(fname=lang+"_instvtext.pdf", format="pdf")
-    plt.close()
+  plt.scatter(textseg_data, inst_data)
+  plt.xscale("log")
+  plt.yscale("log")
+  plt.gca().set_ylim([10**8, 10**13])
+  plt.gca().set_xlim([10**(-1), 10**9])
+  plt.xlabel("Text Segment Size (bytes)")
+  plt.ylabel("Compiler CPU Instructions Count")
+  if (lang == "cpp"):
+    plt.title("Clang++ Compiler Instructions vs. Text Segment Size (" + lang +
+              ")")
+  else:
+    plt.title("Clang Compiler Instructions vs. Text Segment Size (" + lang +
+              ")")
+  plt.plot(x_axis, z, 'r')
+  equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
+  plt.legend([f"fit: {equation}", "original"])
+  if (show):
+    print(len(textseg_data))
+    plt.show()
+  else:
+    plt.savefig(fname=lang + "_instvtext.pdf", format="pdf")
+  plt.close()
 
 
-def plot_instruction_counts_histograms(lang: str, storage: str, show: bool = False) -> None:
-    df = csv_to_pandas_df(lang, storage)
-    inst_data = df["instruction"]
-    plt.hist(inst_data, bins='auto', alpha=1, color='b')
-    plt.title("Histogram of Compiler Instructions ("+lang+")")
-    plt.xscale("log")
-    plt.yscale("log")
-    plt.gca().set_ylim([10**(-1), 10**5])
-    plt.gca().set_xlim([10**8, 10**13])
-    plt.xlabel('Compiler CPU Instructions Count')
-    plt.ylabel('No. of IR Files')
+def plot_instruction_counts_histograms(lang: str,
+                                       storage: str,
+                                       show: bool = False) -> None:
+  df = csv_to_pandas_df(lang, storage)
+  inst_data = df["instruction"]
+  plt.hist(inst_data, bins='auto', alpha=1, color='b')
+  plt.title("Histogram of Compiler Instructions (" + lang + ")")
+  plt.xscale("log")
+  plt.yscale("log")
+  plt.gca().set_ylim([10**(-1), 10**5])
+  plt.gca().set_xlim([10**8, 10**13])
+  plt.xlabel('Compiler CPU Instructions Count')
+  plt.ylabel('No. of IR Files')
 
-    plt.text(max(inst_data), 1.1, format(max(inst_data), '.2e'), ha='center')
+  plt.text(max(inst_data), 1.1, format(max(inst_data), '.2e'), ha='center')
 
-    if (show):
-        print(len(inst_data))
-        plt.show()
-    else:
-        plt.savefig(fname=lang+"_hist.pdf", format="pdf")
-    plt.close()
+  if (show):
+    print(len(inst_data))
+    plt.show()
+  else:
+    plt.savefig(fname=lang + "_hist.pdf", format="pdf")
+  plt.close()
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index 8b656a2..06cf587 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -1,4 +1,4 @@
-"""Utilities for reading and processing csv data with features for outlier analysis.
+"""csv_to_pandas_df is used to read a IR features CSV file and create a Pandas dataframe for it, and outlier_rows is used to filter the Pandas dataframe to the IR files which can be classified as outliers. 
 
 csv_to_pandas_df
   Returns: pandas.core.frame.DataFrame
@@ -9,16 +9,25 @@
 """
 
 
-def csv_to_pandas_df(lang: str, storage: str, file_name_suffix: str = 'combined.csv', write_to_csv: bool = False):
-    import pandas as pd
+def csv_to_pandas_df(lang: str,
+                     storage: str,
+                     file_name_suffix: str = 'combined.csv'):
+  import pandas as pd
 
-    df = pd.read_csv(storage+lang+'_'+file_name_suffix, skipinitialspace=True)
-    if (write_to_csv):
-        df.to_csv(lang+'_normalized_'+file_name_suffix, index=False)
-    return df
+  df = pd.read_csv(
+      storage + lang + '_' + file_name_suffix, skipinitialspace=True)
+  return df
 
 
-def outlier_rows(lang: str, storage: str):
-    df = csv_to_pandas_df(lang, storage)
-    outl = df.nlargest(df.shape[0]//2, "percentage")
-    return outl.nlargest(10, "instruction")
+def outlier_rows(lang: str,
+                 storage: str,
+                 outlier_num: int = 10,
+                 write_to_csv: bool = False):
+  df = csv_to_pandas_df(lang, storage)
+  outl = df.nlargest(df.shape[0] // 2, "percentage")
+  outl = outl[outl.instruction > outl["instruction"].quantile(
+      q=.75, interpolation='lower')]
+  outl = outl.nlargest(outlier_num, "percentage")
+  if (write_to_csv):
+    df.to_csv(lang + '_outliers.csv', index=False)
+  return outl

From cd06e34f01606484dacf9d2b8c124f565f0b849f Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Fri, 30 Aug 2024 13:27:45 -0400
Subject: [PATCH 07/11] Fixed docstring consistency.

---
 .../python_processing/read_column.py                          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index 06cf587..e4b0e99 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -2,10 +2,10 @@
 
 csv_to_pandas_df
   Returns: pandas.core.frame.DataFrame
-  Example usage: csv_to_pandas('c', '/tmp', file_name='_other_suffix.csv', write_to_csv=True)
+  Example usage: csv_to_pandas('c', '/tmp', file_name_suffix='_other_suffix.csv')
 outlier_rows
   Returns: pandas.core.frame.DataFrame
-  Example usage: outlier_rows('c', '/tmp')
+  Example usage: outlier_rows('c', '/tmp', outlier_num=20, write_to_csv=True)
 """
 
 

From 42578100e42f673c2881a3fa86711c5fce0b4a33 Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Sun, 1 Sep 2024 16:43:00 -0400
Subject: [PATCH 08/11] Removed unnecessary code from notebook. Fixed bug with
 read_column.py

---
 .../python_processing/plot_and_display_utlities.ipynb        | 5 ++---
 .../python_processing/read_column.py                         | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
index aff403d..b6f3ba9 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
@@ -9,11 +9,10 @@
     "%matplotlib inline\n",
     "from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms\n",
     "from read_column import outlier_rows\n",
-    "import csv\n",
     "from IPython.display import HTML, display\n",
-    "import pandas   \n",
+    "from pandas import read_csv\n",
     "\n",
-    "langs = pandas.read_csv('../dataset_download/indices.csv')[\"language\"]\n",
+    "langs = read_csv('../dataset_download/indices.csv')[\"language\"]\n",
     "CSV_STORAGE=\"\"\n"
    ]
   },
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index e4b0e99..72efefb 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -15,7 +15,7 @@ def csv_to_pandas_df(lang: str,
   import pandas as pd
 
   df = pd.read_csv(
-      storage + lang + '_' + file_name_suffix, skipinitialspace=True)
+      storage + '/' + lang + '_' + file_name_suffix, skipinitialspace=True)
   return df
 
 

From 723e89b4b30719621056f8c0f7a0d563600374b7 Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Sun, 1 Sep 2024 17:41:34 -0400
Subject: [PATCH 09/11] Modified notebook for clarity.

---
 .../python_processing/plot_and_display_utlities.ipynb        | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
index b6f3ba9..710dcbb 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/plot_and_display_utlities.ipynb
@@ -13,6 +13,9 @@
     "from pandas import read_csv\n",
     "\n",
     "langs = read_csv('../dataset_download/indices.csv')[\"language\"]\n",
+    "#filter out a language\n",
+    "#langs = [x for x in langs if x != \"julia\"]\n",
+    "#location of CSV for each language\n",
     "CSV_STORAGE=\"\"\n"
    ]
   },
@@ -43,7 +46,7 @@
    "outputs": [],
    "source": [
     "for i in langs:\n",
-    "    display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))\n"
+    "    display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))"
    ]
   },
   {

From 8ee3277ffbe4dbbde27b556372d19178b60f32b7 Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Sat, 14 Sep 2024 18:31:56 -0400
Subject: [PATCH 10/11] Updated docstrings.

---
 .../python_processing/fitting_and_plotting.py | 46 +++++++++++++++----
 .../python_processing/read_column.py          | 46 +++++++++++++++----
 2 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
index 14076e3..d2b42ac 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/fitting_and_plotting.py
@@ -1,21 +1,35 @@
-"""plot_instruction_counts_v_textseg is used to make compiler instruction counts & text segment size scatter plots, and plot_instruction_counts_histograms is used to make compiler instruction counts histograms.
+"""Functions for plotting instruction counts and text segment size data.
 
-plot_instruction_counts_v_textseg
-  Returns: None
-  Example Usage: plot_instruction_counts_v_textseg("c", "/tmp", show=True)
+This script contains two functions. One function makes a scatter plot of
+compiler instruction counts and text segment size data. The other makes
+a histogram of the compiler instruction counts.
 
- plot_instruction_counts_histograms
-  Returns: None
-  Example Usage: plot_instruction_counts_histograms("c", "/tmp", show=True)
+Example Usage: from fitting_and_plotting import *
 """
 import matplotlib.pyplot as plt
 from read_column import csv_to_pandas_df
+import numpy as np
 
 
 def plot_instruction_counts_v_textseg(lang: str,
                                       storage: str,
                                       show: bool = False) -> None:
-  import numpy as np
+  """Display a scatter plot of the data using matplotlib pyplot.
+
+  The function internally creates a pandas dataframe from the provided
+  CSV file location. The columns pertaining to text segment size and
+  compiler CPU instruction counts are plotted in a scatter plot with
+  logarithmic axis. The x and y bounds of the plot are limited to
+  provide a standard range of values for all plots.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    show: A boolean which if set to True will print the number of data
+      points and show the scatter plot using pyplot.show(), otherwise
+      the plot is saved to a .pdf file
+  """
   df = csv_to_pandas_df(lang, storage)
   textseg_data = df["text_segment"]
   inst_data = df["instruction"]
@@ -51,6 +65,22 @@ def plot_instruction_counts_v_textseg(lang: str,
 def plot_instruction_counts_histograms(lang: str,
                                        storage: str,
                                        show: bool = False) -> None:
+  """Display a histogram of the compile time data using matplotlib pyplot.
+
+  The function internally creates a pandas dataframe from the provided
+  CSV file location. The columns pertaining to compiler CPU instruction
+  counts are plotted in a histogram with logarithmic axis. The x and y
+  bounds of the plot are limited to provide a standard range of values
+  for all plots.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    show: A boolean which if set to True will print the number of data
+      points and display the histogram using pyplot.show(), otherwise
+      the plot is saved to a .pdf file
+  """
   df = csv_to_pandas_df(lang, storage)
   inst_data = df["instruction"]
   plt.hist(inst_data, bins='auto', alpha=1, color='b')
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index 72efefb..dbb53a7 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -1,18 +1,29 @@
-"""csv_to_pandas_df is used to read a IR features CSV file and create a Pandas dataframe for it, and outlier_rows is used to filter the Pandas dataframe to the IR files which can be classified as outliers. 
+"""Functions to create a dataframe from a CSV and extract the outlying rows.
 
-csv_to_pandas_df
-  Returns: pandas.core.frame.DataFrame
-  Example usage: csv_to_pandas('c', '/tmp', file_name_suffix='_other_suffix.csv')
-outlier_rows
-  Returns: pandas.core.frame.DataFrame
-  Example usage: outlier_rows('c', '/tmp', outlier_num=20, write_to_csv=True)
+The functions in this script can be used to create a Pandas dataframe
+for the CSV data extracted from the IR dataset and analyze it. Using a
+simple outlier extraction method, rows which constitute file data can be
+chosen on a basis which finds IR files to be outliers.
+
+Example usage: from read_column import *
 """
+import pandas as pd
 
 
 def csv_to_pandas_df(lang: str,
                      storage: str,
                      file_name_suffix: str = 'combined.csv'):
-  import pandas as pd
+  """Creates a Pandas dataframe from the specified CSV file.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    file_name_suffix: The suffix present on each CSV file with prefix
+      as lang
+
+  Returns: pandas.core.frame.DataFrame
+  """
 
   df = pd.read_csv(
       storage + '/' + lang + '_' + file_name_suffix, skipinitialspace=True)
@@ -23,6 +34,25 @@ def outlier_rows(lang: str,
                  storage: str,
                  outlier_num: int = 10,
                  write_to_csv: bool = False):
+  """Returns the rows of the CSV data which are classified as outliers.
+
+  The function creates a dataframe and initially filters out half of the
+  rows which do not fall into the 50th percentile for the "percentage"
+  column. Then, any row which does not fall into the 75th percentile for
+  the "instruction" column is filtered out. Using outlier_num, the
+  largest n rows for "percentage" are returned as the outlying files
+  dataframe.
+
+  Args:
+    lang: A string which represents the type of IR file data being
+      accessed
+    storage: A string which is the path to the IR CSV data
+    outlier_num: The number of IR file outliers to display
+    write_to_csv: If True, the resulting dataframe will be written to a
+      CSV file
+  
+  Returns: pandas.core.frame.DataFrame
+  """
   df = csv_to_pandas_df(lang, storage)
   outl = df.nlargest(df.shape[0] // 2, "percentage")
   outl = outl[outl.instruction > outl["instruction"].quantile(

From ed7255bea2eaddbc53fd97a8afe5eecd63a87ccd Mon Sep 17 00:00:00 2001
From: Andrew Kallai <kallai.andrew@gmail.com>
Date: Fri, 24 Jan 2025 19:54:22 -0500
Subject: [PATCH 11/11] Adding a clearer variable name for the outliers and
 fixing the CSV output to contain only the outlier data.

---
 .../python_processing/read_column.py                 | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
index dbb53a7..fc47dc0 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/python_processing/read_column.py
@@ -54,10 +54,10 @@ def outlier_rows(lang: str,
   Returns: pandas.core.frame.DataFrame
   """
   df = csv_to_pandas_df(lang, storage)
-  outl = df.nlargest(df.shape[0] // 2, "percentage")
-  outl = outl[outl.instruction > outl["instruction"].quantile(
-      q=.75, interpolation='lower')]
-  outl = outl.nlargest(outlier_num, "percentage")
+  outlier_df = df.nlargest(df.shape[0] // 2, "percentage")
+  outlier_df = outlier_df[outlier_df.instruction > outlier_df["instruction"]
+                          .quantile(q=.75, interpolation='lower')]
+  outlier_df = outlier_df.nlargest(outlier_num, "percentage")
   if (write_to_csv):
-    df.to_csv(lang + '_outliers.csv', index=False)
-  return outl
+    outlier_df.to_csv(lang + '_outliers.csv', index=False)
+  return outlier_df