From 986538b8107f84cd57ba075759fa13711800db7d Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sat, 10 Aug 2024 05:55:29 -0400
Subject: [PATCH 1/6] Added dataset download files to write the dataset from the HF api to disk.

Also added bash script to create tar files from the IR files on disk.
---
 .../datatset_download/create_tar.sh       | 29 +++++++++
 .../datatset_download/write_data_files.py | 60 +++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100755 llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
 create mode 100644 llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
new file mode 100755
index 0000000..b525601
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+set -o errexit
+#Usage:
+#./create_tar.sh [storage]
+
+if [ -z "$1" ]; then
+  echo "Missing language argument."
+  exit 1
+else
+  LANGUAGE="$1"
+fi
+
+if [ -z "$2" ]; then
+  STORAGE="/tmp"
+else
+  STORAGE="$2"
+fi
+
+cd ${STORAGE}/${LANGUAGE}
+
+for dir in [0-9]*_temp; do
+  cd $dir
+  tar --append --file="../${LANGUAGE}_bc_files.tar" \
+      --transform=s,^,bc_files/, file[0-9]*.bc
+  cd ..
+
+  rm -r "${dir}"
+done
+
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
new file mode 100644
index 0000000..bc0a199
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
@@ -0,0 +1,60 @@
+# -*- coding: ascii -*-
+from datasets import load_dataset, parallel
+import os
+import multiprocessing
+import csv
+from sys import argv
+
+# Usage:
+# python write_data_files.py [STORAGE]
+
+STORAGE: str
+if len(argv) > 1:
+  STORAGE = argv[1]
+else:
+  STORAGE = '/tmp'
+
+lang_list: [str]
+global j
+global dir_name
+j: int
+dir_name: str
+BATCH_SIZE: int = 15000
+file_indices: [dict] = []
+
+
+def write_file(index: [int], bytes_item: [bytes]):
+  filename = f'{dir_name}/file{index+j+1}.bc'
+  with open(filename, 'wb') as file:
+    file.write(bytes_item)
+
+
+with parallel.parallel_backend('spark'):
+  dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)
+
+lang_list = dataset["language"]
+langs = dataset.unique("language")
+pool = multiprocessing.pool.ThreadPool(processes=multiprocessing.cpu_count())
+
+for i in range(0, len(langs)):
+  start_index = lang_list.index(langs[i])
+  if (i+1 != len(langs)):
+    end_index = lang_list.index(langs[i+1])
+  else:
+    end_index = len(lang_list)
+  file_indices.append(
+      {"language": langs[i], "start_index": start_index, "end_index": end_index})
+  for j in range(start_index, end_index, BATCH_SIZE):
+    dir_name = os.path.join(STORAGE, f'{STORAGE}/{langs[i]}/{j}_temp')
+    os.makedirs(dir_name, exist_ok=True)
+    bytes_enumeration = enumerate(
+        dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content'])
+    pool.starmap(write_file, bytes_enumeration)
+
+pool.close()
+
+with open('indices.csv', mode='w', newline='') as file:
+  writer = csv.DictWriter(file, fieldnames=[
+      "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE)
+  writer.writeheader()
+  writer.writerows(file_indices)
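Not part of the patches themselves: a minimal Python sketch of how the artifacts
described in PATCH 1 could be read back, assuming the layout the two tools create
(an indices.csv with language/start_index/end_index columns, and a per-language
tar whose members are named bc_files/file<N>.bc). The '/tmp/c/c_bc_files.tar'
path and the 'c' language value are placeholders for this example only.

# Illustrative consumer of the outputs produced by write_data_files.py and
# create_tar.sh; not part of the patch series.
import csv
import tarfile

with open('indices.csv', newline='') as f:
  for row in csv.DictReader(f):
    # end_index is exclusive, so the difference is the module count.
    count = int(row['end_index']) - int(row['start_index'])
    print(f"{row['language']}: {count} bitcode modules")

# create_tar.sh appends members under bc_files/ into
# <storage>/<language>/<language>_bc_files.tar; assumed example path below.
with tarfile.open('/tmp/c/c_bc_files.tar') as tar:
  member = tar.getmembers()[0]
  bitcode = tar.extractfile(member).read()
  print(member.name, len(bitcode), 'bytes')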
From cab77f3692ee6e4a9ba484414be142c24f108e21 Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sun, 25 Aug 2024 14:26:10 -0400
Subject: [PATCH 2/6] Adding changes to file for writing dataset files.

These changes include argparse functionality, elimination of global variables,
and script execution layout.
---
 .../datatset_download/create_tar.sh        |   5 +-
 .../datatset_download/write_data_files.py  | 124 ++++++++++--------
 2 files changed, 70 insertions(+), 59 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
index b525601..da4db3b 100755
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -o errexit
 #Usage:
-#./create_tar.sh [storage]
+#./create_tar.sh
 
 if [ -z "$1" ]; then
   echo "Missing language argument."
@@ -11,7 +11,8 @@ else
 fi
 
 if [ -z "$2" ]; then
-  STORAGE="/tmp"
+  echo "Missing storage argument."
+  exit 1
 else
   STORAGE="$2"
 fi
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
index bc0a199..04b2081 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
@@ -1,60 +1,70 @@
-# -*- coding: ascii -*-
+"""Script to write ir dataset files to a storage location.
+
+First the index counts for each language are written to a file "indices.csv".
+Then each bitcode file is written to the specified storage location, from which
+the files can be written to a tar file.
+""" + from datasets import load_dataset, parallel -import os +from os import makedirs import multiprocessing import csv -from sys import argv - -# Usage: -# python write_data_files.py [STORAGE] - -STORAGE: str -if len(argv) > 1: - STORAGE = argv[1] -else: - STORAGE = '/tmp' - -lang_list: [str] -global j -global dir_name -j: int -dir_name: str -BATCH_SIZE: int = 15000 -file_indices: [dict] = [] - - -def write_file(index: [int], bytes_item: [bytes]): - filename = f'{dir_name}/file{index+j+1}.bc' - with open(filename, 'wb') as file: - file.write(bytes_item) - - -with parallel.parallel_backend('spark'): - dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2) - -lang_list = dataset["language"] -langs = dataset.unique("language") -pool = multiprocessing.pool.ThreadPool(processes=multiprocessing.cpu_count()) - -for i in range(0, len(langs)): - start_index = lang_list.index(langs[i]) - if (i+1 != len(langs)): - end_index = lang_list.index(langs[i+1]) - else: - end_index = len(lang_list) - file_indices.append( - {"language": langs[i], "start_index": start_index, "end_index": end_index}) - for j in range(start_index, end_index, BATCH_SIZE): - dir_name = os.path.join(STORAGE, f'{STORAGE}/{langs[i]}/{j}_temp') - os.makedirs(dir_name, exist_ok=True) - bytes_enumeration = enumerate( - dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content']) - pool.starmap(write_file, bytes_enumeration) - -pool.close() - -with open('indices.csv', mode='w', newline='') as file: - writer = csv.DictWriter(file, fieldnames=[ - "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE) - writer.writeheader() - writer.writerows(file_indices) +import argparse + +parser = argparse.ArgumentParser( + description="Configure path to store bitcode files, and configure batch size.") +parser.add_argument('storage', type=str, + help='Path to the storage location.') +parser.add_argument('-b', '--batchsize', nargs='?', type=int, default=15000, + help='Number of files to be written per pool of threads. 
+                    help='Number of files to be written per pool of threads. Default value is 15000.')
+args = parser.parse_args()
+
+
+def write_dataset_files_and_index_info() -> None:
+  STORAGE: str = args.storage
+  BATCH_SIZE: int = args.batchsize
+
+  def write_file(index: [int], bytes_and_dir: (bytes, str)):
+    filename = f'{bytes_and_dir[1]}/file{index+1}.bc'
+    with open(filename, 'wb') as file:
+      file.write(bytes_and_dir[0])
+
+  with parallel.parallel_backend('spark'):
+    dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)
+
+  lang_list: [str] = dataset["language"]
+  langs = dataset.unique("language")
+  file_indices: [dict] = []
+
+  for i in range(0, len(langs)):
+    start_index = lang_list.index(langs[i])
+    if (i+1 != len(langs)):
+      end_index = lang_list.index(langs[i+1])
+    else:
+      end_index = len(lang_list)
+    file_indices.append(
+        {"language": langs[i], "start_index": start_index, "end_index": end_index})
+  with open('indices.csv', mode='w', newline='') as file:
+    writer = csv.DictWriter(file, fieldnames=[
+        "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE)
+    writer.writeheader()
+    writer.writerows(file_indices)
+
+  pool = multiprocessing.pool.ThreadPool(
+      processes=multiprocessing.cpu_count())
+
+  for i in range(0, len(file_indices)):
+    start_index = file_indices[i]["start_index"]
+    end_index = file_indices[i]["end_index"]
+    for j in range(start_index, end_index, BATCH_SIZE):
+      dir_name = f'{STORAGE}/{file_indices[i]["language"]}/{j}_temp'
+      makedirs(dir_name, exist_ok=True)
+      bytes_enumeration = enumerate(
+          [(bytes_item, dir_name) for bytes_item in dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content']], start=j)
+      pool.starmap(write_file, bytes_enumeration)
+
+  pool.close()
+
+
+if __name__ == '__main__':
+  write_dataset_files_and_index_info()
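Not part of the patch series: a toy illustration of the start_index/end_index
bookkeeping used in write_dataset_files_and_index_info above. It only works
because the dataset rows are assumed to be grouped by language, so list.index()
returns the first row of each group; the language names below are made up for
the example.

# Toy sketch of the per-language index range computation.
lang_list = ['c', 'c', 'c', 'c++', 'c++', 'rust']
langs = ['c', 'c++', 'rust']  # unique values in order of first appearance

ranges = []
for i in range(len(langs)):
  start = lang_list.index(langs[i])
  end = lang_list.index(langs[i + 1]) if i + 1 != len(langs) else len(lang_list)
  ranges.append({'language': langs[i], 'start_index': start, 'end_index': end})

print(ranges)
# [{'language': 'c', 'start_index': 0, 'end_index': 3},
#  {'language': 'c++', 'start_index': 3, 'end_index': 5},
#  {'language': 'rust', 'start_index': 5, 'end_index': 6}]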
+"""Script to write ir dataset files to a specified storage location. -First the index counts for each language are written to a file "indices.csv". -Then each bitcode file is written to the specified storage location, from which -the files can be written to a tar file. +In write_dataset_files_and_index_info, the index counts for each language are written to a file "indices.csv". +Then each bitcode file is written using available threads into a tar file corresponding to the IR language. + + +get_args + Returns: argparse.Namespace + Example usage: get_args() + +write_dataset_files_and_index_info + Returns: None + Example usage: write_dataset_files_and_index_info("/tmp") """ -from datasets import load_dataset, parallel -from os import makedirs -import multiprocessing -import csv -import argparse - -parser = argparse.ArgumentParser( - description="Configure path to store bitcode files, and configure batch size.") -parser.add_argument('storage', type=str, - help='Path to the storage location.') -parser.add_argument('-b', '--batchsize', nargs='?', type=int, default=15000, - help='Number of files to be written per pool of threads. Default value is 15000.') -args = parser.parse_args() - - -def write_dataset_files_and_index_info() -> None: - STORAGE: str = args.storage - BATCH_SIZE: int = args.batchsize - - def write_file(index: [int], bytes_and_dir: (bytes, str)): - filename = f'{bytes_and_dir[1]}/file{index+1}.bc' - with open(filename, 'wb') as file: - file.write(bytes_and_dir[0]) - - with parallel.parallel_backend('spark'): - dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2) - - lang_list: [str] = dataset["language"] - langs = dataset.unique("language") - file_indices: [dict] = [] - - for i in range(0, len(langs)): - start_index = lang_list.index(langs[i]) - if (i+1 != len(langs)): - end_index = lang_list.index(langs[i+1]) - else: - end_index = len(lang_list) - file_indices.append( - {"language": langs[i], "start_index": start_index, "end_index": end_index}) - with open('indices.csv', mode='w', newline='') as file: - writer = csv.DictWriter(file, fieldnames=[ - "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE) - writer.writeheader() - writer.writerows(file_indices) - - pool = multiprocessing.pool.ThreadPool( - processes=multiprocessing.cpu_count()) - - for i in range(0, len(file_indices)): - start_index = file_indices[i]["start_index"] - end_index = file_indices[i]["end_index"] - for j in range(start_index, end_index, BATCH_SIZE): - dir_name = f'{STORAGE}/{file_indices[i]["language"]}/{j}_temp' - makedirs(dir_name, exist_ok=True) - bytes_enumeration = enumerate( - [(bytes_item, dir_name) for bytes_item in dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content']], start=j) - pool.starmap(write_file, bytes_enumeration) - - pool.close() + +def get_args(): + import argparse + + parser = argparse.ArgumentParser( + description="Configure path to store bitcode files, and configure batch size." + ) + parser.add_argument('storage', type=str, help='Path to the storage location.') + parser.add_argument( + '-b', + '--batchsize', + nargs='?', + type=int, + default=15000, + help='Number of files to be written per pool of threads. Default value is 15000.' 
+  )
+  return parser.parse_args()
+
+
+def write_dataset_files_and_index_info(storage: str) -> None:
+  from datasets import load_dataset, parallel
+  from os import makedirs, listdir
+  import threading
+  import csv
+  import tarfile
+  from io import BytesIO
+  from time import time
+
+  def create_tar(dataset_subset, start_index: int, dir_name: str,
+                 language: str):
+    with tarfile.open(dir_name + '/' + language + '_bc_files.tar', 'a:') as tar:
+      for x in enumerate((dataset_subset[i]["content"]
+                          for i in range(0, dataset_subset.num_rows))):
+        tarinfo = tarfile.TarInfo(name=f'bc_files/file{x[0]+1+start_index}.bc')
+        file_obj = BytesIO(x[1])
+        tarinfo.size = file_obj.getbuffer().nbytes
+        tarinfo.mtime = time()
+        tar.addfile(tarinfo, fileobj=file_obj)
+
+  with parallel.parallel_backend('spark'):
+    dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)
+
+  lang_list: [str] = dataset["language"]
+  langs = dataset.unique("language")
+  file_indices: [dict] = []
+
+  for i in range(0, len(langs)):
+    start_index = lang_list.index(langs[i])
+    if (i + 1 != len(langs)):
+      end_index = lang_list.index(langs[i + 1])
+    else:
+      end_index = len(lang_list)
+    file_indices.append({
+        "language": langs[i],
+        "start_index": start_index,
+        "end_index": end_index
+    })
+  with open('indices.csv', mode='w', newline='') as file:
+    writer = csv.DictWriter(
+        file,
+        fieldnames=["language", "start_index", "end_index"],
+        dialect='unix',
+        quoting=csv.QUOTE_NONE)
+    writer.writeheader()
+    writer.writerows(file_indices)
+
+  threads = []
+  for i in range(0, len(file_indices)):
+    start_index = file_indices[i]["start_index"]
+    end_index = file_indices[i]["end_index"]
+    dir_name = f'{storage}/{file_indices[i]["language"]}'
+    makedirs(dir_name, exist_ok=True)
+    thread = threading.Thread(
+        target=create_tar,
+        args=(dataset.select(range(start_index, end_index)), start_index,
+              dir_name, file_indices[i]["language"]))
+    threads.append(thread)
+  for thread in threads:
+    thread.start()
+  for thread in threads:
+    thread.join()
 
 
 if __name__ == '__main__':
-  write_dataset_files_and_index_info()
+  args = get_args()
+  write_dataset_files_and_index_info(storage=args.storage)

From 5fa24ffe1fe11fdab89b1c2173925fd364c704bf Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sun, 1 Sep 2024 10:33:46 -0400
Subject: [PATCH 4/6] Removed unecessary batch size argument.

---
 .../datatset_download/write_data_files.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
index f974cfd..d2241a7 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
@@ -1,7 +1,7 @@
 """Script to write ir dataset files to a specified storage location.
 
 In write_dataset_files_and_index_info, the index counts for each language are written to a file "indices.csv".
-Then each bitcode file is written using available threads into a tar file corresponding to the IR language.
+Then, each bitcode file is written using available threads into respective tar files named [lang]_bc_files.tar. Each file added to a tar file is named bc_files/file[index].bc, where index is a number that is between the range of start_index and end_index (not including end) in "indices.csv" and is incremented by one (smallest index is 1).
 
 
 get_args
@@ -21,14 +21,6 @@ def get_args():
       description="Configure path to store bitcode files, and configure batch size."
   )
   parser.add_argument('storage', type=str, help='Path to the storage location.')
-  parser.add_argument(
-      '-b',
-      '--batchsize',
-      nargs='?',
-      type=int,
-      default=15000,
-      help='Number of files to be written per pool of threads. Default value is 15000.'
-  )
   return parser.parse_args()
 
 

From 5b8ee64388d272ff8c8aeccf6118bd75ab0d6043 Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sun, 1 Sep 2024 11:55:15 -0400
Subject: [PATCH 5/6] Fixed spelling mistake in directory name.

---
 .../{datatset_download => dataset_download}/write_data_files.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm_ir_dataset_utils/compile_time_analysis_tools/{datatset_download => dataset_download}/write_data_files.py (100%)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py
similarity index 100%
rename from llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
rename to llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py

From 1b936eee677325f73df16b54f1f2d03c18abdde7 Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sun, 15 Sep 2024 18:25:31 -0400
Subject: [PATCH 6/6] Modified docstrings and import statement locations.

---
 .../dataset_download/write_data_files.py | 68 ++++++++++++++-----
 1 file changed, 50 insertions(+), 18 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py
index d2241a7..711f37e 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py
@@ -1,21 +1,35 @@
-"""Script to write ir dataset files to a specified storage location.
+"""Script to write IR dataset files to a specified storage location.
 
-In write_dataset_files_and_index_info, the index counts for each language are written to a file "indices.csv".
-Then, each bitcode file is written using available threads into respective tar files named [lang]_bc_files.tar. Each file added to a tar file is named bc_files/file[index].bc, where index is a number that is between the range of start_index and end_index (not including end) in "indices.csv" and is incremented by one (smallest index is 1).
+Two functions are contained in the script. One function retrieves a
+storage location argument for the script. The other function accesses
+the ComPile dataset using the Hugging Face API and writes the bitcode
+files in the dataset to tar files corresponding to their IR language.
+the index counts for each language are written to a file "indices.csv".
+Then, each bitcode file is written using available threads into
+respective tar files named [lang]_bc_files.tar. Each file added to a tar
+file is named bc_files/file[index].bc, where index is a number that is
+between the range of start_index and end_index (not including end) in
+"indices.csv" and is incremented by one (smallest index is 1).
 
-
-get_args
-  Returns: argparse.Namespace
-  Example usage: get_args()
-
-write_dataset_files_and_index_info
-  Returns: None
-  Example usage: write_dataset_files_and_index_info("/tmp")
+
+Example usage: python write_data_files.py /tmp
 """
+
+import argparse
+from datasets import load_dataset, parallel
+from os import makedirs, listdir
+import threading
+import csv
+import tarfile
+from io import BytesIO
+from time import time
+
 
 def get_args():
+  """Function to return the provided storage argument for the script.
+
+  Returns: argparse.Namespace
+  """
 
   parser = argparse.ArgumentParser(
       description="Configure path to store bitcode files, and configure batch size."
@@ -25,13 +39,31 @@ def get_args():
 
 
 def write_dataset_files_and_index_info(storage: str) -> None:
+  """Function to write each IR bitcode file to a tar archive.
+
+  The function first loads the ComPile dataset into a HF datasets
+  Dataset object. It does this using an experimental parallel backend to
+  slightly speed up load times. Then, a list of dictionaries is made,
+  where each dict contains the starting and ending index for each IR
+  file type based on language. For example, if the entire dataset
+  consisted of C and C++ IR modules, then the dictionary for C would
+  note language='c', starting_index=0, ending_index=(C++ starting
+  index). The +1 for the ending_index allows for direct use in range(a,
+  b) syntax, but is NOT suitable for right inclusive syntax. The entries
+  of each dictionary are then written to a CSV file name 'indices.csv'
+  for further use by other scripts. The contents of each dictionary are
+  used to provide information to the n number of threads, where n is the
+  number of languages in the dataset. Each thread when started calls the
+  create_tar() sub-function. The sub-function uses a generator
+  expression to access the bitcode files from a subset (taken by
+  language) of the original Dataset object. This allows for low memory
+  usage while performing in-memory writing of each bitcode file to a tar
+  archive which is named according to the given language (i.e.,
+  c_bc_files.tar).
+
+  Args:
+    storage: Storage location for the tar archives
+  """
 
   def create_tar(dataset_subset, start_index: int, dir_name: str,
                  language: str):