diff --git a/.gitignore b/.gitignore index 4828cef..0b551d8 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ llvm_ir_dataset_utils.egg-info/ *.tar *.sif *.swp +*~ diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile b/llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile new file mode 100644 index 0000000..0385a53 --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile @@ -0,0 +1,31 @@ +SHELL := /bin/bash + +WILD := $(shell echo {$(begin)..$(end)}) + +ifneq ($(CC), clang) + $(warning WARNING: SETTING CC TO clang OR clang++) + override CC := clang + ifeq ($(lang), cpp) + override CC := clang++ + endif +endif + +all: $(WILD) + +$(WILD): + @perf stat --no-big-num -e instructions:u -o \ + $(lang)/perf_stat_files/file$@.txt \ + $(CC) -O3 -c $(lang)/bc_files/file$@.bc \ + -o $(lang)/object_files/file$@.o + @instruct=$$(awk '/instructions/ {print $$1}' \ + $(lang)/perf_stat_files/file$@.txt); \ + echo "file$@, $$instruct" >> $(lang)/instruction_counts/instruction$@.csv + @size=$$(llvm-size $(lang)/object_files/file$@.o | awk 'NR==2 {print $$1}'); \ + echo "file$@, $$size" >> $(lang)/text_segment_counts/text_segment$@.csv + @ir_features=$$(python3 -m llvm_ir_dataset_utils.compile_time_analysis_tools.write_ir_counts \ + $(lang)/bc_files/file$@.bc); \ + echo "file$@, $$ir_features" >> $(lang)/ir_features_counts/ir_features$@.csv + @max_pass=$$($(CC) -w -c -ftime-report $(lang)/bc_files/file$@.bc -o /dev/null 2>&1 | \ + awk '!/ignoring feature/' | awk 'NR==7 {print $$(NF-1) ", " $$NF}' | sed 's/%)//'); \ + echo "file$@, $$max_pass" >> $(lang)/max_pass_counts/max_pass$@.csv + diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/README.md b/llvm_ir_dataset_utils/compile_time_analysis_tools/README.md new file mode 100644 index 0000000..25152da --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/README.md @@ -0,0 +1,40 @@ +# Compile Time Analysis Tools + +- SLURM_files/batch_main_body.sh +- 
SLURM_files/create_batch_files.sh +- SLURM_files/job_template.sh +- Makefile +- combine_outputs.sh +- write_ir_counts.py + +**create_batch_files.sh** must be provided a storage argument, temporary \ +directory argument, and a path to the relevant makefile (where \ +Makefile is). Other configurable args are the number of threads used \ +by the make command in the batch scripts, and the maximum number of \ +SLURM jobs which a user can have in the SLURM queue. + +Examples: + `./create_batch_files.sh /lustre /tmp + path_to_ir_dataset_repo/llvm_ir_dataset_utils/compile_time_analysis_tools` + + `./create_batch_files.sh /lustre /tmp + path_to_ir_dataset_repo/llvm_ir_dataset_utils/compile_time_analysis_tools + 32 400` + +When executed, settings will be configured for batch job scripts which are \ +created in the format [_language_extension_]batch.sh. The content of those \ +scripts is a combination of **job_template.sh** (from SLURM config) and \ +**batch_main_body.sh** (for the main execution). + +**write_ir_counts.py** is automatically used by **Makefile** to generate \ +IR features count data for each IR module being processed. + +**combine_outputs.sh** takes two args: \<language_extension\> and \ +\<storage_location\>. Example: `./combine_outputs.sh c /lustre`. The \ +script combines all the temporary result folders made in \ +\<storage_location\>/\<language_extension\>/\<ps_result_folders\>. It removes the folders and \ +creates temporary files from which the final csv files are constructed \ +for the given language. Current data collected includes text segment \ +size, user CPU instruction counts during compile time, IR feature \ +counts sourced from the LLVM pass `print<func-properties>`, and maximum \ +relative time pass names and percentage counts. 
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/batch_main_body.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/batch_main_body.sh new file mode 100644 index 0000000..6ae6286 --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/batch_main_body.sh @@ -0,0 +1,40 @@ + +set -o errexit +DATA_NAMES=("text_segment" "instruction" "ir_features" "max_pass") +if [ -z "$SLURM_JOB_ID" ]; then + I=$((${START}+1)) + STOP=$(($I+${SIZE}-1)) +else + BATCH=$(($SIZE/$SLURM_ARRAY_TASK_MAX)) + I=$((${SLURM_ARRAY_TASK_ID}*${BATCH}+1+${START})) + STOP=$(($I+${BATCH}-1)) + if [ $SLURM_ARRAY_TASK_ID -eq $SLURM_ARRAY_TASK_MAX ]; then + STOP=$(($I+${SIZE}%$SLURM_ARRAY_TASK_MAX-1)) + fi +fi + +cd ${TEMP_DIR} +mkdir -p ir_bc_files/ps_$I/${TYPE} +cd ir_bc_files/ps_$I/${TYPE} +mkdir -p bc_files ${DATA_NAMES[1]}_counts perf_stat_files \ + ${DATA_NAMES[0]}_counts ${DATA_NAMES[2]}_counts \ + ${DATA_NAMES[3]}_counts object_files + +eval tar --extract --file=${STORAGE}/${TYPE}/${TYPE}_bc_files.tar \ + bc_files/file{$I..$STOP}.bc + +cd .. +make --ignore-errors --makefile=${MAKE_PATH}/Makefile \ + --jobs=${THREADS} lang="${TYPE}" begin="$I" end="$STOP" + +TARGET_DIR="${STORAGE}/${TYPE}/ps_$I" +mkdir -p $TARGET_DIR + +for element in "${DATA_NAMES[@]}"; do + > ${TARGET_DIR}/${element}.csv + eval cat ${TYPE}/${element}_counts/${element}{$I..$STOP}.csv \ + >> ${TARGET_DIR}/${element}.csv +done + +cd .. +rm -r ps_$I diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/create_batch_files.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/create_batch_files.sh new file mode 100755 index 0000000..685f1af --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/create_batch_files.sh @@ -0,0 +1,75 @@ +#!/bin/bash +set -o errexit + +#USAGE +#./create_batch_files.sh [THREADS] \ +# [SLURM_MAX] + +if [ -z "$1" ]; then + echo "Missing storage argument." 
+ exit 1 +else + STORAGE="$1" +fi +if [ -z "$2" ]; then + echo "Missing temporary directory argument." + exit 1 +else + TEMP_DIR="$2" +fi +if [ -z "$3" ]; then + echo "Missing makefile location argument." + exit 1 +else + MAKE_PATH="$3" +fi +if [ -z "$4" ]; then + THREADS=8 +else + THREADS="$4" +fi +if [ -z "$5" ]; then + SLURM_MAX=399 +else + SLURM_MAX="$5" + SLURM_MAX=$((SLURM_MAX-1)) +fi + +lang=() +start_ids=() +sizes=() + +while IFS=',' read -r language start_index end_index; do + lang+=($language) + start_ids+=($start_index) + sizes+=($((${end_index}-${start_index}))) +done < <(tail -n +2 "../dataset_download/indices.csv") + +length=${#lang[@]} + +for (( i=0; i<$length; i++ )) +do + mkdir -p ${STORAGE}/${lang[$i]}/job_results + js="${lang[$i]}_batch.sh" + cp job_template.sh $js + if [ ${sizes[$i]} -le $SLURM_MAX ]; then + echo "#SBATCH --array=0-$((${sizes[$i]}-1))" >> $js + else + echo "#SBATCH --array=0-${SLURM_MAX}" >> $js + fi + + echo "#SBATCH --output=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out" >> $js + echo "#SBATCH --error=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out" >> $js + + echo "START=${start_ids[$i]}" >> $js + echo "TYPE=${lang[$i]}" >> $js + echo "SIZE=${sizes[$i]}" >> $js + echo "STORAGE=${STORAGE}" >> $js + echo "TEMP_DIR=${TEMP_DIR}" >> $js + echo "MAKE_PATH=${MAKE_PATH}" >> $js + echo "THREADS=${THREADS}" >> $js + echo "export PYTHONPATH=\"${PYTHONPATH}\"" >> $js + cat batch_main_body.sh >> $js + chmod 744 $js +done + diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/job_template.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/job_template.sh new file mode 100644 index 0000000..b8d20fe --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/job_template.sh @@ -0,0 +1,7 @@ +#!/bin/bash -l +# +#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=8 +#SBATCH --job-name=compiler_batch +#SBATCH --partition=standard +#SBATCH --time=0-00:10:00 +#SBATCH --export=NONE diff 
--git a/llvm_ir_dataset_utils/compile_time_analysis_tools/combine_outputs.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/combine_outputs.sh new file mode 100755 index 0000000..a00a2ba --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/combine_outputs.sh @@ -0,0 +1,61 @@ +#!/bin/bash +set -o errexit + +#Usage: +#./combine_outputs.sh + +if [ -z "$1" ]; then + echo "Missing language argument." + exit 1 +else + LANGUAGE="$1" +fi + +if [ -z "$2" ]; then + echo "Missing storage argument." + exit 1 +else + STORAGE="$2" +fi + + +cd ${STORAGE} + +mkdir -p ${LANGUAGE}/results +TARGET_PREFIX="${LANGUAGE}/results/${LANGUAGE}" + +boolean=1 +DATA_NAMES=("text_segment" "instruction" "ir_features" "max_pass") + +for element in "${DATA_NAMES[@]}"; do + if [[ ${element} == ${DATA_NAMES[2]} ]]; then + OUTPUT=$(python3 -m llvm_ir_dataset_utils.compile_time_analysis_tools.write_ir_counts "/dev/null") + echo "file, $OUTPUT" \ + > ${TARGET_PREFIX}_${element}.csv + elif [[ ${element} == ${DATA_NAMES[3]} ]]; then + echo "file, percentage, pass_name" \ + > ${TARGET_PREFIX}_${element}.csv + else + echo "file, ${element}" \ + > ${TARGET_PREFIX}_${element}.csv + fi + ls ${LANGUAGE}/ps_[0-9]*/${element}.csv | xargs cat \ + >> ${TARGET_PREFIX}_${element}.csv + + sort -nk1.5 ${TARGET_PREFIX}_${element}.csv \ + -o ${TARGET_PREFIX}_${element}.csv + if [ $boolean -eq 1 ]; then + awk -F',' '{print $1}' ${TARGET_PREFIX}_${DATA_NAMES[0]}.csv > ${TARGET_PREFIX}_combined.csv + boolean=0 + fi + awk -F',' -v OFS=',' 'NR==FNR {for (i=2; i<=NF; i++) cols[FNR]=(cols[FNR]?cols[FNR] OFS:"") $i; next} {print $0, cols[FNR]}' \ + ${TARGET_PREFIX}_${element}.csv \ + ${TARGET_PREFIX}_combined.csv \ + > ${TARGET_PREFIX}_temp.csv + mv ${TARGET_PREFIX}_temp.csv ${TARGET_PREFIX}_combined.csv + rm ${TARGET_PREFIX}_${element}.csv +done + +sed -n -i '/, ,/!p' ${TARGET_PREFIX}_combined.csv + +rm -r ${LANGUAGE}/ps_* diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/write_ir_counts.py 
b/llvm_ir_dataset_utils/compile_time_analysis_tools/write_ir_counts.py new file mode 100644 index 0000000..d6da0d0 --- /dev/null +++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/write_ir_counts.py @@ -0,0 +1,50 @@ +"""Script to print function analysis properties for IR bitcode files. + +The script uses argparse to expect a bitcode file path as argument. The +main function invokes a print() to display the imported function's +output. + +Example usage: + python write_ir_counts.py /tmp/c/file1.bc + python write_ir_counts.py /dev/null +""" +from llvm_ir_dataset_utils.util.bitcode_module import get_function_properties_total +import argparse +from datasets import load_dataset, parallel + + +def main() -> None: + """Function takes a bitcode filename from argparse and prints the function + analysis properties counts for that file. + + If a valid bitcode file is provided to argparse, the counts will be + printed using llvm_ir_dataset_utils.util.bitcode_module.get_function_p + roperties_total. The bitcode file is loaded into memory as a bytes + object to pass in as an argument. If "/dev/null" is provided to + argparse, the Hugging Face IR dataset will be loaded and the loop will + start from the first index of the dataset to obtain a bitcode file + with valid function analysis properties field names. Each output is + comma separated values. 
+ """ + parser = argparse.ArgumentParser( + description="Process a bitcode file and print field counts.") + parser.add_argument( + 'filename', type=str, help="Path to the bitcode (.bc) file.") + filename = parser.parse_args().filename + if (filename == "/dev/null"): + with parallel.parallel_backend('spark'): + dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2) + for i in range(0, dataset.num_rows): + bc_file = dataset[i]["content"] + output = get_function_properties_total(bc_file, names=True) + if (output != None): + print(', '.join(output)) + break + else: + print(', '.join( + str(x) + for x in get_function_properties_total(open(filename, "rb").read()))) + + +if __name__ == '__main__': + main() diff --git a/llvm_ir_dataset_utils/util/bitcode_module.py b/llvm_ir_dataset_utils/util/bitcode_module.py index f7f98bd..167e1c8 100644 --- a/llvm_ir_dataset_utils/util/bitcode_module.py +++ b/llvm_ir_dataset_utils/util/bitcode_module.py @@ -232,6 +232,24 @@ def get_instruction_counts(bitcode_module, additional_passes=''): ] +def get_function_properties_total(bitcode_module, + additional_passes='', + names=False): + properties_or_error = get_function_properties_module(bitcode_module, + additional_passes) + if properties_or_error[0]: + return None + else: + if (names): + return [key[0] for key in properties_or_error[1].items()] + else: + return [ + sum([int(i) + for i in key[1]]) + for key in properties_or_error[1].items() + ] + + def get_instruction_histogram(bitcode_module, additional_passes=''): if additional_passes != '': additional_passes += ','