Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Functions for plotting instruction counts and text segment size data.

This script contains two functions. One function makes a scatter plot of
compiler instruction counts and text segment size data. The other makes
a histogram of the compiler instruction counts.

Example Usage: from fitting_and_plotting import *
"""
import matplotlib.pyplot as plt
from read_column import csv_to_pandas_df
import numpy as np


def plot_instruction_counts_v_textseg(lang: str,
                                      storage: str,
                                      show: bool = False) -> None:
    """Display a scatter plot of the data using matplotlib pyplot.

    The function internally creates a pandas dataframe from the provided
    CSV file location. The "text_segment" and "instruction" columns are
    plotted against each other in a scatter plot with logarithmic axes,
    together with a second-degree polynomial fit of the data (drawn in
    red). The x and y bounds of the plot are limited to provide a standard
    range of values for all plots.

    Args:
      lang: A string which represents the type of IR file data being
        accessed
      storage: A string which is the path to the IR CSV data
      show: A boolean which if set to True will print the number of data
        points and show the scatter plot using pyplot.show(), otherwise
        the plot is saved to "<lang>_instvtext.pdf" and the figure is
        closed
    """
    df = csv_to_pandas_df(lang, storage)
    textseg_data = df["text_segment"]
    inst_data = df["instruction"]
    c, b, a = np.polyfit(textseg_data, inst_data, 2)

    x_lower, x_upper = 10**(-1), 10**9

    # Evaluate the fit on a fixed-size, log-spaced grid. The x axis is
    # logarithmic, so log spacing gives an evenly resolved curve, and the
    # grid size no longer grows with the data range (a step-10 range over
    # large text segments needs ~10^8 samples). Clamping the start to the
    # lower x limit keeps the grid strictly positive, as geomspace requires;
    # anything below that limit is outside the visible axes anyway.
    fit_start = max(float(textseg_data.min()), x_lower)
    fit_stop = max(float(textseg_data.max()), fit_start)
    x_axis = np.geomspace(fit_start, fit_stop, num=500)
    z = np.polyval([c, b, a], x_axis)
    equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"

    # Labels are attached to the artists themselves so the legend cannot
    # mix them up; a bare list of labels is matched to artists purely by
    # order, which differs between matplotlib versions.
    plt.scatter(textseg_data, inst_data, label="original")
    plt.xscale("log")
    plt.yscale("log")
    plt.gca().set_ylim([10**8, 10**13])
    plt.gca().set_xlim([x_lower, x_upper])
    plt.xlabel("Text Segment Size (bytes)")
    plt.ylabel("Compiler CPU Instructions Count")
    compiler = "Clang++" if lang == "cpp" else "Clang"
    plt.title(
        f"{compiler} Compiler Instructions vs. Text Segment Size ({lang})")
    plt.plot(x_axis, z, 'r', label=f"fit: {equation}")
    plt.legend()
    if show:
        print(len(textseg_data))
        plt.show()
    else:
        plt.savefig(fname=lang + "_instvtext.pdf", format="pdf")
        plt.close()


def plot_instruction_counts_histograms(lang: str,
                                       storage: str,
                                       show: bool = False) -> None:
    """Draw a histogram of the compiler instruction counts with pyplot.

    A pandas dataframe is loaded from the CSV file for the given language
    and its "instruction" column is binned into a histogram. Both axes
    are logarithmic and use fixed bounds so that the plots of different
    languages share the same range. The largest instruction count is
    written onto the plot, centered at its x position.

    Args:
      lang: A string which represents the type of IR file data being
        accessed
      storage: A string which is the path to the IR CSV data
      show: A boolean which if set to True will print the number of data
        points and display the histogram using pyplot.show(), otherwise
        the plot is saved to "<lang>_hist.pdf" and the figure is closed
    """
    inst_data = csv_to_pandas_df(lang, storage)["instruction"]

    plt.hist(inst_data, bins='auto', alpha=1, color='b')
    plt.title(f"Histogram of Compiler Instructions ({lang})")

    axes = plt.gca()
    axes.set_xscale("log")
    axes.set_yscale("log")
    axes.set_ylim([10**(-1), 10**5])
    axes.set_xlim([10**8, 10**13])
    axes.set_xlabel('Compiler CPU Instructions Count')
    axes.set_ylabel('No. of IR Files')

    # Annotate the largest instruction count just above y = 1.
    largest = max(inst_data)
    axes.text(largest, 1.1, f"{largest:.2e}", ha='center')

    if not show:
        plt.savefig(fname=f"{lang}_hist.pdf", format="pdf")
        plt.close()
        return

    print(len(inst_data))
    plt.show()
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from fitting_and_plotting import plot_instruction_counts_v_textseg, plot_instruction_counts_histograms\n",
"from read_column import outlier_rows\n",
"from IPython.display import HTML, display\n",
"from pandas import read_csv\n",
"\n",
"langs = read_csv('../dataset_download/indices.csv')[\"language\"]\n",
"#filter out a language\n",
"#langs = [x for x in langs if x != \"julia\"]\n",
"#location of CSV for each language\n",
"CSV_STORAGE=\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i in langs:\n",
" plot_instruction_counts_histograms(i, CSV_STORAGE, show=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i in langs:\n",
" plot_instruction_counts_v_textseg(i, CSV_STORAGE, show=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i in langs:\n",
" display(HTML(outlier_rows(i, CSV_STORAGE).to_html(index=False)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Functions to create a dataframe from a CSV and extract the outlying rows.

The functions in this script can be used to create a Pandas dataframe
from the CSV data extracted from the IR dataset and to analyze it. Using
a simple outlier extraction method, the rows (each holding the data for
one IR file) that stand out as outliers can be selected.

Example usage: from read_column import *
"""
import os

import pandas as pd


def csv_to_pandas_df(lang: str,
                     storage: str,
                     file_name_suffix: str = 'combined.csv') -> pd.DataFrame:
    """Create a Pandas dataframe from the specified CSV file.

    The file read is "<lang>_<file_name_suffix>" inside the storage
    directory. Whitespace following each delimiter in the CSV is skipped.

    Args:
      lang: A string which represents the type of IR file data being
        accessed
      storage: A string which is the path to the directory holding the IR
        CSV data. An empty string refers to the current working directory.
      file_name_suffix: The suffix present on each CSV file with prefix
        as lang

    Returns: pandas.core.frame.DataFrame
    """
    # os.path.join handles an empty storage path (stays relative instead of
    # pointing at the filesystem root), trailing separators and path-like
    # objects, none of which plain string concatenation does.
    csv_path = os.path.join(storage, f"{lang}_{file_name_suffix}")
    return pd.read_csv(csv_path, skipinitialspace=True)


def outlier_rows(lang: str,
                 storage: str,
                 outlier_num: int = 10,
                 write_to_csv: bool = False):
    """Select the outlying rows from the CSV data of a language.

    The selection is done in three steps:
      1. Only the half of the rows with the largest "percentage" values
         is kept.
      2. Of those, only rows whose "instruction" value is strictly above
         the 75th percentile (computed over the rows kept in step 1,
         using 'lower' interpolation) are kept.
      3. The outlier_num rows with the largest "percentage" values among
         the remaining rows are returned.

    Args:
      lang: A string which represents the type of IR file data being
        accessed
      storage: A string which is the path to the IR CSV data
      outlier_num: The number of IR file outliers to display
      write_to_csv: If True, the resulting dataframe will be written to
        "<lang>_outliers.csv" (without the index column)

    Returns: pandas.core.frame.DataFrame
    """
    all_rows = csv_to_pandas_df(lang, storage)

    # Step 1: top half by "percentage".
    top_half = all_rows.nlargest(len(all_rows) // 2, "percentage")

    # Step 2: instruction counts above the 75th percentile of that half.
    threshold = top_half["instruction"].quantile(q=.75,
                                                 interpolation='lower')
    above_threshold = top_half[top_half["instruction"] > threshold]

    # Step 3: the requested number of rows with the largest "percentage".
    outliers = above_threshold.nlargest(outlier_num, "percentage")

    if write_to_csv:
        outliers.to_csv(f"{lang}_outliers.csv", index=False)
    return outliers