Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ llvm_ir_dataset_utils.egg-info/
*.tar
*.sif
*.swp
*~
31 changes: 31 additions & 0 deletions llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
SHELL := /bin/bash

# Expand the inclusive numeric range [begin, end] (passed on the make command
# line as lang=/begin=/end=) into a word list "begin begin+1 ... end" via bash
# brace expansion. Each number becomes one target below.
WILD := $(shell echo {$(begin)..$(end)})

# Force the compiler to clang (or clang++ when lang=cpp): the inputs are LLVM
# bitcode files, so a non-clang CC cannot consume them.
ifneq ($(CC), clang)
$(warning WARNING: SETTING CC TO clang OR clang++)
override CC := clang
ifeq ($(lang), cpp)
override CC := clang++
endif
endif

all: $(WILD)

# One target per file index ($@ is the numeric index). For each
# $(lang)/bc_files/file<N>.bc:
#   1. compile at -O3 under `perf stat`, recording user-space instruction
#      counts into $(lang)/perf_stat_files/,
#   2. scrape the instruction count out of the perf output into a per-file csv,
#   3. record the object's text segment size via llvm-size (NR==2 picks the
#      data row of llvm-size's output, $1 its "text" column),
#   4. record IR feature counts produced by the write_ir_counts helper module,
#   5. recompile with -ftime-report and record the pass with the largest
#      relative compile time plus its percentage (NR==7 assumes a fixed
#      -ftime-report layout — TODO confirm across clang versions; the sed
#      strips the trailing "%)" from the percentage field).
$(WILD):
@perf stat --no-big-num -e instructions:u -o \
$(lang)/perf_stat_files/[email protected] \
$(CC) -O3 -c $(lang)/bc_files/[email protected] \
-o $(lang)/object_files/[email protected]
@instruct=$$(awk '/instructions/ {print $$1}' \
$(lang)/perf_stat_files/[email protected]); \
echo "file$@, $$instruct" >> $(lang)/instruction_counts/[email protected]
@size=$$(llvm-size $(lang)/object_files/[email protected] | awk 'NR==2 {print $$1}'); \
echo "file$@, $$size" >> $(lang)/text_segment_counts/[email protected]
@ir_features=$$(python3 -m llvm_ir_dataset_utils.compile_time_analysis_tools.write_ir_counts \
$(lang)/bc_files/[email protected]); \
echo "file$@, $$ir_features" >> $(lang)/ir_features_counts/[email protected]
@max_pass=$$($(CC) -w -c -ftime-report $(lang)/bc_files/[email protected] -o /dev/null 2>&1 | \
awk '!/ignoring feature/' | awk 'NR==7 {print $$(NF-1) ", " $$NF}' | sed 's/%)//'); \
echo "file$@, $$max_pass" >> $(lang)/max_pass_counts/[email protected]

40 changes: 40 additions & 0 deletions llvm_ir_dataset_utils/compile_time_analysis_tools/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Compile Time Analysis Tools

- SLURM_files/batch_main_body.sh
- SLURM_files/create_batch_files.sh
- SLURM_files/job_template.sh
- Makefile
- combine_outputs.sh
- write_ir_counts.py

**create_batch_files.sh** must be provided a storage path argument, a \
temporary directory argument, and the path to the directory containing \
the Makefile. Optional additional args are the number of threads used \
by the make command in the batch scripts, and the maximum number of \
SLURM jobs a user may have in the SLURM queue at once.

Examples:
`./create_batch_files.sh /lustre /tmp
path_to_ir_dataset_repo/llvm_ir_dataset_utils/compile_time_analysis_tools`

`./create_batch_files.sh /lustre /tmp
path_to_ir_dataset_repo/llvm_ir_dataset_utils/compile_time_analysis_tools
32 400`

When executed, settings will be configured for batch job scripts which are \
created in the format [_language_extension_]batch.sh. The content of those \
scripts is a combination of **job_template.sh** (from SLURM config) and \
**batch_main_body.sh** (for the main execution).

**write_ir_counts.py** is automatically used by **Makefile** to generate \
IR features count data for each IR module being processed.

**combine_outputs.sh** takes two args: `<language_extension>` and \
`<storage_location>`. Example: `./combine_outputs.sh c /lustre`. The \
script combines all the temporary result folders made in \
<storage_location>/<language_extension>/. It removes the folders and \
creates temporary files from which the final csv files are constructed \
for the given language. Current data collected includes text segment \
size, user CPU instruction counts during compile time, IR feature \
counts sourced from the LLVM pass `print<func-properties>`, and maximum \
relative time pass names and percentage counts.
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

# Main execution body appended to each generated <lang>_batch.sh by
# create_batch_files.sh. No shebang on purpose — it is concatenated after
# job_template.sh. Expects START, SIZE, TYPE, STORAGE, TEMP_DIR, MAKE_PATH
# and THREADS to have been written into the batch script by
# create_batch_files.sh; SLURM_* variables come from the scheduler when
# running as an array job.
set -o errexit
DATA_NAMES=("text_segment" "instruction" "ir_features" "max_pass")
# Compute the inclusive file-index window [I, STOP] this invocation handles.
# Outside SLURM the full range is processed in one go; inside a SLURM array
# job the range is split evenly across tasks, with the last task taking the
# remainder (SIZE % SLURM_ARRAY_TASK_MAX).
if [ -z "$SLURM_JOB_ID" ]; then
I=$((${START}+1))
STOP=$(($I+${SIZE}-1))
else
BATCH=$(($SIZE/$SLURM_ARRAY_TASK_MAX))
I=$((${SLURM_ARRAY_TASK_ID}*${BATCH}+1+${START}))
STOP=$(($I+${BATCH}-1))
if [ $SLURM_ARRAY_TASK_ID -eq $SLURM_ARRAY_TASK_MAX ]; then
STOP=$(($I+${SIZE}%$SLURM_ARRAY_TASK_MAX-1))
fi
fi

# Scratch layout: ${TEMP_DIR}/ir_bc_files/ps_<I>/<lang>/{bc_files,...,object_files}
cd ${TEMP_DIR}
mkdir -p ir_bc_files/ps_$I/${TYPE}
cd ir_bc_files/ps_$I/${TYPE}
mkdir -p bc_files ${DATA_NAMES[1]}_counts perf_stat_files \
${DATA_NAMES[0]}_counts ${DATA_NAMES[2]}_counts \
${DATA_NAMES[3]}_counts object_files

# eval is required so the {$I..$STOP} brace range expands after the variables
# are substituted; extracts only this task's slice of the tar.
eval tar --extract --file=${STORAGE}/${TYPE}/${TYPE}_bc_files.tar \
bc_files/file{$I..$STOP}.bc

# --ignore-errors: a module that fails to compile must not sink the batch.
cd ..
make --ignore-errors --makefile=${MAKE_PATH}/Makefile \
--jobs=${THREADS} lang="${TYPE}" begin="$I" end="$STOP"

# Collect the per-file csv fragments into one csv per metric on persistent
# storage; combine_outputs.sh later merges the ps_<I> folders.
TARGET_DIR="${STORAGE}/${TYPE}/ps_$I"
mkdir -p $TARGET_DIR

for element in "${DATA_NAMES[@]}"; do
> ${TARGET_DIR}/${element}.csv
eval cat ${TYPE}/${element}_counts/${element}{$I..$STOP}.csv \
>> ${TARGET_DIR}/${element}.csv
done

# Scratch is no longer needed once results are on persistent storage.
cd ..
rm -r ps_$I
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash
set -o errexit

# Generate one SLURM batch script per language listed in
# ../dataset_download/indices.csv (header row skipped). Each generated script
# is named <language>_batch.sh and is the concatenation of job_template.sh
# (static SLURM directives), the per-language settings written below, and
# batch_main_body.sh (the main execution body).
#
# USAGE
# ./create_batch_files.sh <STORAGE_PATH> <TEMP_DIR> <MAKEFILE_PATH> [THREADS] \
#                         [SLURM_MAX]
#   STORAGE_PATH   persistent storage root for bitcode tars and results
#   TEMP_DIR       scratch directory used by the jobs
#   MAKEFILE_PATH  directory containing the analysis Makefile
#   THREADS        make --jobs parallelism inside each job (default 8)
#   SLURM_MAX      maximum number of SLURM jobs a user may queue
#                  (default 400; stored internally as the highest task id)

if [ -z "$1" ]; then
  echo "Missing storage argument."
  exit 1
else
  STORAGE="$1"
fi
if [ -z "$2" ]; then
  echo "Missing temporary directory argument."
  exit 1
else
  TEMP_DIR="$2"
fi
if [ -z "$3" ]; then
  echo "Missing makefile location argument."
  exit 1
else
  MAKE_PATH="$3"
fi
if [ -z "$4" ]; then
  THREADS=8
else
  THREADS="$4"
fi
if [ -z "$5" ]; then
  SLURM_MAX=399
else
  # Convert the job-count limit into the highest zero-based array task id.
  SLURM_MAX="$5"
  SLURM_MAX=$((SLURM_MAX-1))
fi

lang=()
start_ids=()
sizes=()

# indices.csv rows: language,start_index,end_index (first line is a header).
# Quote the appends so fields can never word-split or glob.
while IFS=',' read -r language start_index end_index; do
  lang+=("$language")
  start_ids+=("$start_index")
  sizes+=("$((end_index - start_index))")
done < <(tail -n +2 "../dataset_download/indices.csv")

length=${#lang[@]}

for (( i=0; i<length; i++ )); do
  mkdir -p "${STORAGE}/${lang[$i]}/job_results"
  js="${lang[$i]}_batch.sh"
  cp job_template.sh "$js"
  # Cap the array size at the site limit; batch_main_body.sh splits the file
  # range across however many tasks actually run.
  if [ "${sizes[$i]}" -le "$SLURM_MAX" ]; then
    echo "#SBATCH --array=0-$((sizes[$i] - 1))" >> "$js"
  else
    echo "#SBATCH --array=0-${SLURM_MAX}" >> "$js"
  fi

  echo "#SBATCH --output=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out" >> "$js"
  echo "#SBATCH --error=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out" >> "$js"

  # Per-language configuration consumed by batch_main_body.sh.
  echo "START=${start_ids[$i]}" >> "$js"
  echo "TYPE=${lang[$i]}" >> "$js"
  echo "SIZE=${sizes[$i]}" >> "$js"
  echo "STORAGE=${STORAGE}" >> "$js"
  echo "TEMP_DIR=${TEMP_DIR}" >> "$js"
  echo "MAKE_PATH=${MAKE_PATH}" >> "$js"
  echo "THREADS=${THREADS}" >> "$js"
  # Capture the generation-time PYTHONPATH so compute nodes (started with
  # --export=NONE) can still import llvm_ir_dataset_utils.
  echo "export PYTHONPATH=\"${PYTHONPATH}\"" >> "$js"
  cat batch_main_body.sh >> "$js"
  chmod 744 "$js"
done

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash -l
#
# Static SLURM directives shared by every generated <lang>_batch.sh.
# create_batch_files.sh appends the per-language --array/--output/--error
# directives after this template, so only settings common to all languages
# belong here. --export=NONE keeps the submitting shell's environment out of
# the job; batch_main_body.sh receives its configuration via the variables
# create_batch_files.sh writes into the generated script.
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=8
#SBATCH --job-name=compiler_batch
#SBATCH --partition=standard
#SBATCH --time=0-00:10:00
#SBATCH --export=NONE
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash
set -o errexit

# Merge the per-batch result folders produced by the SLURM jobs into final
# per-language csv files.
#
# Usage:
# ./combine_outputs.sh <language> <storage>
#
# Reads  <storage>/<language>/ps_<index>/<metric>.csv for every batch folder,
# writes <storage>/<language>/results/<language>_combined.csv, and removes
# the per-batch folders afterwards.

if [ -z "$1" ]; then
  echo "Missing language argument."
  exit 1
else
  LANGUAGE="$1"
fi

if [ -z "$2" ]; then
  echo "Missing storage argument."
  exit 1
else
  STORAGE="$2"
fi


cd "${STORAGE}"

mkdir -p "${LANGUAGE}/results"
TARGET_PREFIX="${LANGUAGE}/results/${LANGUAGE}"

first_pass=1
DATA_NAMES=("text_segment" "instruction" "ir_features" "max_pass")

for element in "${DATA_NAMES[@]}"; do
  # Write the csv header. The ir_features column names come from the
  # write_ir_counts helper itself (passing /dev/null makes it print field
  # names); max_pass rows carry a percentage and a pass name.
  if [[ ${element} == "${DATA_NAMES[2]}" ]]; then
    OUTPUT=$(python3 -m llvm_ir_dataset_utils.compile_time_analysis_tools.write_ir_counts "/dev/null")
    echo "file, $OUTPUT" \
      > "${TARGET_PREFIX}_${element}.csv"
  elif [[ ${element} == "${DATA_NAMES[3]}" ]]; then
    echo "file, percentage, pass_name" \
      > "${TARGET_PREFIX}_${element}.csv"
  else
    echo "file, ${element}" \
      > "${TARGET_PREFIX}_${element}.csv"
  fi
  # Concatenate every batch folder's csv via the glob directly instead of
  # parsing `ls` output (SC2012: breaks on unusual path names).
  cat "${LANGUAGE}"/ps_[0-9]*/"${element}.csv" \
    >> "${TARGET_PREFIX}_${element}.csv"

  # Numeric sort on the digits following the "file" prefix (character 5 of
  # field 1) so file2 sorts before file10; the header row has no digit there
  # and sorts first.
  sort -nk1.5 "${TARGET_PREFIX}_${element}.csv" \
    -o "${TARGET_PREFIX}_${element}.csv"
  if [ "$first_pass" -eq 1 ]; then
    # Seed the combined file with the shared "file" key column.
    awk -F',' '{print $1}' "${TARGET_PREFIX}_${DATA_NAMES[0]}.csv" > "${TARGET_PREFIX}_combined.csv"
    first_pass=0
  fi
  # Paste this metric's value columns (fields 2..NF) onto the combined csv
  # row by row.
  awk -F',' -v OFS=',' 'NR==FNR {for (i=2; i<=NF; i++) cols[FNR]=(cols[FNR]?cols[FNR] OFS:"") $i; next} {print $0, cols[FNR]}' \
    "${TARGET_PREFIX}_${element}.csv" \
    "${TARGET_PREFIX}_combined.csv" \
    > "${TARGET_PREFIX}_temp.csv"
  mv "${TARGET_PREFIX}_temp.csv" "${TARGET_PREFIX}_combined.csv"
  rm "${TARGET_PREFIX}_${element}.csv"
done

# Drop rows containing empty fields (", ,") — presumably modules that failed
# a step, since make runs with --ignore-errors. TODO confirm.
sed -n -i '/, ,/!p' "${TARGET_PREFIX}_combined.csv"

rm -r "${LANGUAGE}"/ps_*
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Script to print function analysis properties for IR bitcode files.

The script uses argparse to expect a bitcode file path as argument. The
main function invokes a print() to display the imported function's
output.

Example usage:
  python write_ir_counts.py /tmp/c/file1.bc
  python write_ir_counts.py /dev/null
"""
import argparse

from datasets import load_dataset, parallel

from llvm_ir_dataset_utils.util.bitcode_module import get_function_properties_total


def main() -> None:
  """Takes a bitcode filename from argparse and prints the function
  analysis properties counts for that file.

  If a valid bitcode file is provided to argparse, the counts are
  printed using
  llvm_ir_dataset_utils.util.bitcode_module.get_function_properties_total;
  the bitcode file is loaded into memory as a bytes object and passed in
  as an argument. If "/dev/null" is provided, the Hugging Face IR
  dataset is loaded instead and scanned from its first row until a
  bitcode module with valid function analysis property field names is
  found, and those names are printed (used by combine_outputs.sh to
  build csv headers). Each output is comma separated values.
  """
  parser = argparse.ArgumentParser(
      description="Process a bitcode file and print field counts.")
  parser.add_argument(
      'filename', type=str, help="Path to the bitcode (.bc) file.")
  filename = parser.parse_args().filename
  if filename == "/dev/null":
    with parallel.parallel_backend('spark'):
      dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)
      for i in range(dataset.num_rows):
        bc_file = dataset[i]["content"]
        output = get_function_properties_total(bc_file, names=True)
        # `is not None` instead of `!= None`; None signals extraction failure.
        if output is not None:
          print(', '.join(output))
          break
  else:
    # Read via a context manager so the file handle is not leaked.
    with open(filename, "rb") as bc_file:
      contents = bc_file.read()
    print(', '.join(str(x) for x in get_function_properties_total(contents)))


if __name__ == '__main__':
  main()
18 changes: 18 additions & 0 deletions llvm_ir_dataset_utils/util/bitcode_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,24 @@ def get_instruction_counts(bitcode_module, additional_passes=''):
]


def get_function_properties_total(bitcode_module,
                                  additional_passes='',
                                  names=False):
  """Aggregates per-function properties over a whole bitcode module.

  Args:
    bitcode_module: The bitcode module to analyze, passed through to
      get_function_properties_module.
    additional_passes: Extra passes forwarded to
      get_function_properties_module.
    names: When True, return the property field names instead of summed
      values (useful for building csv headers).

  Returns:
    None when property extraction reports an error, the list of property
    names when names=True, or a list with the integer sum of each
    property's per-function values otherwise.
  """
  properties_or_error = get_function_properties_module(bitcode_module,
                                                       additional_passes)
  # Index 0 carries the error indicator, index 1 the mapping of
  # {property name: per-function values} — presumably matching
  # get_function_properties_module's (error, properties) shape; confirm if
  # that helper ever returns extra elements.
  if properties_or_error[0]:
    return None
  properties = properties_or_error[1]
  if names:
    return list(properties.keys())
  return [
      sum(int(value) for value in values) for values in properties.values()
  ]


def get_instruction_histogram(bitcode_module, additional_passes=''):
if additional_passes != '':
additional_passes += ','
Expand Down