llvm-ml · andrewkallai · Jul 17, 2024 · Jul 27, 2024 · Aug 2, 2024 · Aug 9, 2024
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ llvm_ir_dataset_utils.egg-info/
 *.tar
 *.sif
 *.swp
+*~
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile b/llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile
@@ -0,0 +1,31 @@
+# Collects per-module compile statistics for the bitcode files
+# $(lang)/bc_files/file<N>.bc, one make target per index N in [begin, end].
+#
+# Required command-line variables:
+#   lang   directory holding bc_files/ and the per-metric output directories
+#   begin  first file index (inclusive)
+#   end    last file index (inclusive)
+#
+# bash is required because the index list is built with brace expansion.
+SHELL := /bin/bash
+
+ifeq ($(strip $(lang)),)
+  $(error lang is not set; usage: make lang=<dir> begin=<N> end=<M>)
+endif
+ifeq ($(strip $(begin)),)
+  $(error begin is not set; usage: make lang=<dir> begin=<N> end=<M>)
+endif
+ifeq ($(strip $(end)),)
+  $(error end is not set; usage: make lang=<dir> begin=<N> end=<M>)
+endif
+
+# Target list: the integers begin..end, expanded once at parse time.
+WILD := $(shell echo {$(begin)..$(end)})
+
+ifneq ($(CC), clang)
+  $(warning WARNING: SETTING CC TO clang OR clang++)
+  override CC := clang
+  ifeq ($(lang), cpp)
+  override CC := clang++
+  endif
+endif
+
+# The index targets are names, not files: always run them, even if a file
+# called e.g. "1" happens to exist in the working directory.
+.PHONY: all $(WILD)
+
+all: $(WILD)
+
+$(WILD):
+# Compile at -O3 under perf stat, recording user-space instructions.
+	@perf stat --no-big-num -e instructions:u -o \
+		$(lang)/perf_stat_files/file$@.txt \
+		$(CC) -O3 -c $(lang)/bc_files/file$@.bc \
+		-o $(lang)/object_files/file$@.o
+# First field of the "instructions" line in the perf report.
+	@instruct=$$(awk '/instructions/ {print $$1}' \
+		$(lang)/perf_stat_files/file$@.txt); \
+		echo "file$@, $$instruct" >> $(lang)/instruction_counts/instruction$@.csv
+# First column of the second line printed by llvm-size for the object file.
+	@size=$$(llvm-size $(lang)/object_files/file$@.o | awk 'NR==2 {print $$1}'); \
+		echo "file$@, $$size" >> $(lang)/text_segment_counts/text_segment$@.csv
+# Comma-separated IR feature counts printed by write_ir_counts.
+	@ir_features=$$(python3 -m llvm_ir_dataset_utils.compile_time_analysis_tools.write_ir_counts \
+		$(lang)/bc_files/file$@.bc); \
+		echo "file$@, $$ir_features" >> $(lang)/ir_features_counts/ir_features$@.csv
+# Last two fields of line 7 of the -ftime-report output (after dropping
+# "ignoring feature" lines), with the trailing "%)" removed.
+	@max_pass=$$($(CC) -w -c -ftime-report $(lang)/bc_files/file$@.bc -o /dev/null 2>&1 | \
+		awk '!/ignoring feature/' | awk 'NR==7 {print $$(NF-1) ", " $$NF}' | sed 's/%)//'); \
+		echo "file$@, $$max_pass" >> $(lang)/max_pass_counts/max_pass$@.csv
+
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/README.md b/llvm_ir_dataset_utils/compile_time_analysis_tools/README.md
@@ -0,0 +1,40 @@
+# Compile Time Analysis Tools
+
+- SLURM_files/batch_main_body.sh
+- SLURM_files/create_batch_files.sh
+- SLURM_files/job_template.sh
+- Makefile
+- combine_outputs.sh
+- write_ir_counts.py
+
+**create_batch_files.sh** must be given a storage path, a temporary \
+directory, and the path to the directory that contains **Makefile**. \
+Optional arguments are the number of threads used by the make command \
+in the batch scripts (default 8) and the maximum number of SLURM jobs \
+a user can have in the SLURM queue (default 400). Run the script from \
+the SLURM_files directory, since it reads **job_template.sh** and \
+**batch_main_body.sh** through relative paths.
+
+Examples:
+  `./create_batch_files.sh /lustre /tmp 
+  path_to_ir_dataset_repo/llvm_ir_dataset_utils/compile_time_analysis_tools`
+
+  `./create_batch_files.sh /lustre /tmp
+    path_to_ir_dataset_repo/llvm_ir_dataset_utils/compile_time_analysis_tools
+    32 400`
+
+When executed, the script writes one batch job script per language, \
+named `<language_extension>_batch.sh`. Each of those scripts is a \
+combination of **job_template.sh** (the SLURM configuration) and \
+**batch_main_body.sh** (the main execution).
+
+**write_ir_counts.py** is automatically used by **Makefile** to generate \
+IR features count data for each IR module being processed.
+
+**combine_outputs.sh** takes two args: <language_extension> and \
+<storage_location>. Example: `./combine_outputs.sh c /lustre`. The \
+script combines all the temporary result folders made in \
+<storage_location>/<language_extension>/. It removes the folders and \
+creates temporary files from which the final csv files are constructed \
+for the given language. The data currently collected are the text \
+segment size, the user CPU instruction count during compilation, IR \
+feature counts sourced from the LLVM pass `print<func-properties>`, and \
+the name and percentage of the pass with the maximum relative time.
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/batch_main_body.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/batch_main_body.sh
@@ -0,0 +1,40 @@
+
+set -o errexit
+DATA_NAMES=("text_segment" "instruction" "ir_features" "max_pass")
+if [ -z "$SLURM_JOB_ID" ]; then
+  I=$((${START}+1))
+  STOP=$(($I+${SIZE}-1))
+else
+  BATCH=$(($SIZE/$SLURM_ARRAY_TASK_MAX))
+  I=$((${SLURM_ARRAY_TASK_ID}*${BATCH}+1+${START}))
+  STOP=$(($I+${BATCH}-1))
+  if [ $SLURM_ARRAY_TASK_ID -eq $SLURM_ARRAY_TASK_MAX ]; then
+    STOP=$(($I+${SIZE}%$SLURM_ARRAY_TASK_MAX-1))
+  fi
+fi
+
+cd ${TEMP_DIR}
+mkdir -p ir_bc_files/ps_$I/${TYPE}
+cd ir_bc_files/ps_$I/${TYPE}
+mkdir -p bc_files ${DATA_NAMES[1]}_counts perf_stat_files \
+  ${DATA_NAMES[0]}_counts ${DATA_NAMES[2]}_counts \
+  ${DATA_NAMES[3]}_counts object_files
+
+eval tar --extract --file=${STORAGE}/${TYPE}/${TYPE}_bc_files.tar \
+  bc_files/file{$I..$STOP}.bc
+
+cd ..
+make --ignore-errors --makefile=${MAKE_PATH}/Makefile \
+  --jobs=${THREADS} lang="${TYPE}" begin="$I" end="$STOP"
+
+TARGET_DIR="${STORAGE}/${TYPE}/ps_$I"
+mkdir -p $TARGET_DIR
+
+for element in "${DATA_NAMES[@]}"; do
+  > ${TARGET_DIR}/${element}.csv
+  eval cat ${TYPE}/${element}_counts/${element}{$I..$STOP}.csv \
+  >> ${TARGET_DIR}/${element}.csv
+done
+
+cd ..
+rm -r ps_$I
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/create_batch_files.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/create_batch_files.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+set -o errexit
+
+#USAGE
+#./create_batch_files.sh <STORAGE_PATH> <TEMP_DIR> <MAKEFILE_PATH> [THREADS] \
+#  [SLURM_MAX]
+#
+# Reads ../dataset_download/indices.csv (first line skipped, then rows of
+# language,start_index,end_index) and writes one <language>_batch.sh per row
+# into the current directory. Each script is job_template.sh, followed by
+# generated SBATCH directives and variable assignments, followed by
+# batch_main_body.sh. All three inputs are read relative to the current
+# directory.
+
+if [ -z "$1" ]; then
+  echo "Missing storage argument."
+  exit 1
+else
+  STORAGE="$1"
+fi
+if [ -z "$2" ]; then
+  echo "Missing temporary directory argument."
+  exit 1
+else
+  TEMP_DIR="$2"
+fi
+if [ -z "$3" ]; then
+  echo "Missing makefile location argument."
+  exit 1
+else
+  MAKE_PATH="$3"
+fi
+if [ -z "$4" ]; then
+  THREADS=8
+else
+  THREADS="$4"
+fi
+# SLURM_MAX holds the highest array index, i.e. (maximum job count - 1).
+if [ -z "$5" ]; then
+  SLURM_MAX=399
+else
+  SLURM_MAX="$5"
+  SLURM_MAX=$((SLURM_MAX-1))
+fi
+
+lang=()
+start_ids=()
+sizes=()
+
+# "|| [ -n ... ]" keeps a final row that lacks a trailing newline; blank rows
+# are skipped so the three arrays stay aligned.
+while IFS=',' read -r language start_index end_index || [ -n "$language" ]; do
+  [ -n "$language" ] || continue
+  lang+=("$language")
+  start_ids+=("$start_index")
+  sizes+=($((end_index - start_index)))
+done < <(tail -n +2 "../dataset_download/indices.csv")
+
+length=${#lang[@]}
+
+for (( i=0; i<length; i++ ))
+do
+  # An empty range would produce the invalid directive "--array=0--1".
+  if [ "${sizes[$i]}" -le 0 ]; then
+    echo "Skipping ${lang[$i]}: no files in its index range." >&2
+    continue
+  fi
+
+  mkdir -p "${STORAGE}/${lang[$i]}/job_results"
+  js="${lang[$i]}_batch.sh"
+  cp job_template.sh "$js"
+
+  # One array task per file, capped at the allowed number of queued jobs.
+  if [ "${sizes[$i]}" -le "$SLURM_MAX" ]; then
+    array_max=$((sizes[i] - 1))
+  else
+    array_max=$SLURM_MAX
+  fi
+
+  {
+    echo "#SBATCH --array=0-${array_max}"
+    echo "#SBATCH --output=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out"
+    echo "#SBATCH --error=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out"
+
+    # %q shell-escapes the values so paths containing spaces survive being
+    # re-read by the generated script; plain values are written unchanged.
+    printf 'START=%q\n' "${start_ids[$i]}"
+    printf 'TYPE=%q\n' "${lang[$i]}"
+    printf 'SIZE=%q\n' "${sizes[$i]}"
+    printf 'STORAGE=%q\n' "${STORAGE}"
+    printf 'TEMP_DIR=%q\n' "${TEMP_DIR}"
+    printf 'MAKE_PATH=%q\n' "${MAKE_PATH}"
+    printf 'THREADS=%q\n' "${THREADS}"
+    echo "export PYTHONPATH=\"${PYTHONPATH}\""
+    cat batch_main_body.sh
+  } >> "$js"
+  chmod 744 "$js"
+done
+
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/job_template.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/SLURM_files/job_template.sh
@@ -0,0 +1,7 @@
+#!/bin/bash -l
+#
+#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=8
+#SBATCH --job-name=compiler_batch
+#SBATCH --partition=standard
+#SBATCH --time=0-00:10:00
+#SBATCH --export=NONE
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/combine_outputs.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/combine_outputs.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -o errexit
+
+#Usage:
+#./combine_outputs.sh <language> <storage>
+#
+# Merges every <storage>/<language>/ps_<N>/<metric>.csv into a single
+# <storage>/<language>/results/<language>_combined.csv with one column group
+# per metric, then deletes the ps_* directories.
+
+if [ -z "$1" ]; then
+  echo "Missing language argument."
+  exit 1
+else
+  LANGUAGE="$1"
+fi
+
+if [ -z "$2" ]; then
+  echo "Missing storage argument."
+  exit 1
+else
+  STORAGE="$2"
+fi
+
+
+cd "${STORAGE}"
+
+mkdir -p "${LANGUAGE}/results"
+TARGET_PREFIX="${LANGUAGE}/results/${LANGUAGE}"
+COMBINED="${TARGET_PREFIX}_combined.csv"
+
+first=1
+DATA_NAMES=("text_segment" "instruction" "ir_features" "max_pass")
+
+for element in "${DATA_NAMES[@]}"; do
+  merged="${TARGET_PREFIX}_${element}.csv"
+
+  # Fail loudly when there is nothing to combine instead of silently
+  # producing a header-only file.
+  parts=("${LANGUAGE}"/ps_[0-9]*/"${element}.csv")
+  if [ ! -e "${parts[0]}" ]; then
+    echo "No ${element}.csv files found in ${STORAGE}/${LANGUAGE}/ps_*." >&2
+    exit 1
+  fi
+
+  # Header row for this metric.
+  if [[ ${element} == "${DATA_NAMES[2]}" ]]; then
+    # write_ir_counts prints the feature names when given /dev/null.
+    OUTPUT=$(python3 -m llvm_ir_dataset_utils.compile_time_analysis_tools.write_ir_counts "/dev/null")
+    echo "file, $OUTPUT" > "${merged}"
+  elif [[ ${element} == "${DATA_NAMES[3]}" ]]; then
+    echo "file, percentage, pass_name" > "${merged}"
+  else
+    echo "file, ${element}" > "${merged}"
+  fi
+  cat "${parts[@]}" >> "${merged}"
+
+  # Numeric sort on the digits that follow the 4-character "file" prefix of
+  # the first field; the header has no digits there, so its key is 0.
+  sort -nk1.5 "${merged}" -o "${merged}"
+
+  # Seed the combined file with the "file" column of the first metric.
+  if [ "$first" -eq 1 ]; then
+    awk -F',' '{print $1}' "${TARGET_PREFIX}_${DATA_NAMES[0]}.csv" > "${COMBINED}"
+    first=0
+  fi
+
+  # Append this metric's value columns to the combined file, pairing rows by
+  # line number.
+  awk -F',' -v OFS=',' 'NR==FNR {for (i=2; i<=NF; i++) cols[FNR]=(cols[FNR]?cols[FNR] OFS:"") $i; next} {print $0, cols[FNR]}' \
+    "${merged}" \
+    "${COMBINED}" \
+    > "${TARGET_PREFIX}_temp.csv"
+  mv "${TARGET_PREFIX}_temp.csv" "${COMBINED}"
+  rm "${merged}"
+done
+
+# Drop rows with an empty value, whether in the middle (", ,") or in the
+# last column (a trailing ", ").
+sed -i -e '/, ,/d' -e '/,[[:space:]]*$/d' "${COMBINED}"
+
+rm -r "${LANGUAGE}"/ps_*
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/write_ir_counts.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/write_ir_counts.py
@@ -0,0 +1,50 @@
+"""Script to print function analysis properties for IR bitcode files.
+
+The script uses argparse to expect a bitcode file path as argument. The
+main function invokes a print() to display the imported function's
+output.
+
+Example usage:
+  python write_ir_counts.py /tmp/c/file1.bc 
+  python write_ir_counts.py /dev/null
+"""
+from llvm_ir_dataset_utils.util.bitcode_module import get_function_properties_total
+import argparse
+from datasets import load_dataset, parallel
+
+
+def main() -> None:
+  """Function takes a bitcode filename from argparse and prints the function
+  analysis properties counts for that file.
+
+  If a valid bitcode file is provided to argparse, the counts will be
+  printed using llvm_ir_dataset_utils.util.bitcode_module.get_function_p
+  roperties_total. The bitcode file is loaded into memory as a bytes
+  object to pass in as an argument. If "/dev/null" is provided to
+  argparse, the Hugging Face IR dataset will be loaded and the loop will
+  start from the first index of the dataset to obtain a bitcode file
+  with valid function analysis properties field names. Each output is
+  comma seperated values.
+  """
+  parser = argparse.ArgumentParser(
+      description="Process a bitcode file and print field counts.")
+  parser.add_argument(
+      'filename', type=str, help="Path to the bitcode (.bc) file.")
+  filename = parser.parse_args().filename
+  if (filename == "/dev/null"):
+    with parallel.parallel_backend('spark'):
+      dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)
+    for i in range(0, dataset.num_rows):
+      bc_file = dataset[i]["content"]
+      output = get_function_properties_total(bc_file, names=True)
+      if (output != None):
+        print(', '.join(output))
+        break
+  else:
+    print(', '.join(
+        str(x)
+        for x in get_function_properties_total(open(filename, "rb").read())))
+
+
+if __name__ == '__main__':
+  main()
diff --git a/llvm_ir_dataset_utils/util/bitcode_module.py b/llvm_ir_dataset_utils/util/bitcode_module.py
@@ -232,6 +232,24 @@ def get_instruction_counts(bitcode_module, additional_passes=''):
     ]
 
 
+def get_function_properties_total(bitcode_module,
+                                  additional_passes='',
+                                  names=False):
+  properties_or_error = get_function_properties_module(bitcode_module,
+                                                       additional_passes)
+  if properties_or_error[0]:
+    return None
+  else:
+    if (names):
+      return [key[0] for key in properties_or_error[1].items()]
+    else:
+      return [
+          sum([int(i)
+               for i in key[1]])
+          for key in properties_or_error[1].items()
+      ]
+
+
 def get_instruction_histogram(bitcode_module, additional_passes=''):
   if additional_passes != '':
     additional_passes += ','
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ llvm_ir_dataset_utils.egg-info/ @@
     *.tar
     *.sif
     *.swp
+    *~