From 986538b8107f84cd57ba075759fa13711800db7d Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sat, 10 Aug 2024 05:55:29 -0400
Subject: [PATCH 1/6] Added dataset download files to write the dataset from the HF api to disk.

Also added bash script to create tar files from the IR files on disk.
---
 .../datatset_download/create_tar.sh       | 29 +++++++++
 .../datatset_download/write_data_files.py | 60 +++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100755 llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
 create mode 100644 llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
new file mode 100755
index 0000000..b525601
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+set -o errexit
+#Usage:
+#./create_tar.sh [storage]
+
+if [ -z "$1" ]; then
+  echo "Missing language argument."
+  exit 1
+else
+  LANGUAGE="$1"
+fi
+
+if [ -z "$2" ]; then
+  STORAGE="/tmp"
+else
+  STORAGE="$2"
+fi
+
+cd ${STORAGE}/${LANGUAGE}
+
+for dir in [0-9]*_temp; do
+  cd $dir
+  tar --append --file="../${LANGUAGE}_bc_files.tar" \
+      --transform=s,^,bc_files/, file[0-9]*.bc
+  cd ..
+
+  rm -r "${dir}"
+done
+
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
new file mode 100644
index 0000000..bc0a199
--- /dev/null
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
@@ -0,0 +1,60 @@
+# -*- coding: ascii -*-
+from datasets import load_dataset, parallel
+import os
+import multiprocessing
+import csv
+from sys import argv
+
+# Usage:
+# python write_data_files.py [STORAGE]
+
+STORAGE: str
+if len(argv) > 1:
+  STORAGE = argv[1]
+else:
+  STORAGE = '/tmp'
+
+lang_list: [str]
+global j
+global dir_name
+j: int
+dir_name: str
+BATCH_SIZE: int = 15000
+file_indices: [dict] = []
+
+
+def write_file(index: [int], bytes_item: [bytes]):
+  filename = f'{dir_name}/file{index+j+1}.bc'
+  with open(filename, 'wb') as file:
+    file.write(bytes_item)
+
+
+with parallel.parallel_backend('spark'):
+  dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)
+
+lang_list = dataset["language"]
+langs = dataset.unique("language")
+pool = multiprocessing.pool.ThreadPool(processes=multiprocessing.cpu_count())
+
+for i in range(0, len(langs)):
+  start_index = lang_list.index(langs[i])
+  if (i+1 != len(langs)):
+    end_index = lang_list.index(langs[i+1])
+  else:
+    end_index = len(lang_list)
+  file_indices.append(
+      {"language": langs[i], "start_index": start_index, "end_index": end_index})
+  for j in range(start_index, end_index, BATCH_SIZE):
+    dir_name = os.path.join(STORAGE, f'{STORAGE}/{langs[i]}/{j}_temp')
+    os.makedirs(dir_name, exist_ok=True)
+    bytes_enumeration = enumerate(
+        dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content'])
+    pool.starmap(write_file, bytes_enumeration)
+
+pool.close()
+
+with open('indices.csv', mode='w', newline='') as file:
+  writer = csv.DictWriter(file, fieldnames=[
+      "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE)
+  writer.writeheader()
+  writer.writerows(file_indices)
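Not part of the patches themselves: a minimal Python sketch of how the artifacts
described in PATCH 1 could be read back, assuming the layout the two tools create
(an indices.csv with language/start_index/end_index columns, and a per-language
tar whose members are named bc_files/file<N>.bc). The '/tmp/c/c_bc_files.tar'
path and the 'c' language value are placeholders for this example only.

# Illustrative consumer of the outputs produced by write_data_files.py and
# create_tar.sh; not part of the patch series.
import csv
import tarfile

with open('indices.csv', newline='') as f:
  for row in csv.DictReader(f):
    # end_index is exclusive, so the difference is the module count.
    count = int(row['end_index']) - int(row['start_index'])
    print(f"{row['language']}: {count} bitcode modules")

# create_tar.sh appends members under bc_files/ into
# <storage>/<language>/<language>_bc_files.tar; assumed example path below.
with tarfile.open('/tmp/c/c_bc_files.tar') as tar:
  member = tar.getmembers()[0]
  bitcode = tar.extractfile(member).read()
  print(member.name, len(bitcode), 'bytes')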
From cab77f3692ee6e4a9ba484414be142c24f108e21 Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sun, 25 Aug 2024 14:26:10 -0400
Subject: [PATCH 2/6] Adding changes to file for writing dataset files.

These changes include argparse functionality, elimination of global variables,
and script execution layout.
---
 .../datatset_download/create_tar.sh        |   5 +-
 .../datatset_download/write_data_files.py  | 124 ++++++++++--------
 2 files changed, 70 insertions(+), 59 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
index b525601..da4db3b 100755
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/create_tar.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -o errexit
 #Usage:
-#./create_tar.sh [storage]
+#./create_tar.sh
 
 if [ -z "$1" ]; then
   echo "Missing language argument."
@@ -11,7 +11,8 @@ else
 fi
 
 if [ -z "$2" ]; then
-  STORAGE="/tmp"
+  echo "Missing storage argument."
+  exit 1
 else
   STORAGE="$2"
 fi
diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
index bc0a199..04b2081 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
@@ -1,60 +1,70 @@
-# -*- coding: ascii -*-
+"""Script to write ir dataset files to a storage location.
+
+First the index counts for each language are written to a file "indices.csv".
+Then each bitcode file is written to the specified storage location, from which
+the files can be written to a tar file.
+""" + from datasets import load_dataset, parallel -import os +from os import makedirs import multiprocessing import csv -from sys import argv - -# Usage: -# python write_data_files.py [STORAGE] - -STORAGE: str -if len(argv) > 1: - STORAGE = argv[1] -else: - STORAGE = '/tmp' - -lang_list: [str] -global j -global dir_name -j: int -dir_name: str -BATCH_SIZE: int = 15000 -file_indices: [dict] = [] - - -def write_file(index: [int], bytes_item: [bytes]): - filename = f'{dir_name}/file{index+j+1}.bc' - with open(filename, 'wb') as file: - file.write(bytes_item) - - -with parallel.parallel_backend('spark'): - dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2) - -lang_list = dataset["language"] -langs = dataset.unique("language") -pool = multiprocessing.pool.ThreadPool(processes=multiprocessing.cpu_count()) - -for i in range(0, len(langs)): - start_index = lang_list.index(langs[i]) - if (i+1 != len(langs)): - end_index = lang_list.index(langs[i+1]) - else: - end_index = len(lang_list) - file_indices.append( - {"language": langs[i], "start_index": start_index, "end_index": end_index}) - for j in range(start_index, end_index, BATCH_SIZE): - dir_name = os.path.join(STORAGE, f'{STORAGE}/{langs[i]}/{j}_temp') - os.makedirs(dir_name, exist_ok=True) - bytes_enumeration = enumerate( - dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content']) - pool.starmap(write_file, bytes_enumeration) - -pool.close() - -with open('indices.csv', mode='w', newline='') as file: - writer = csv.DictWriter(file, fieldnames=[ - "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE) - writer.writeheader() - writer.writerows(file_indices) +import argparse + +parser = argparse.ArgumentParser( + description="Configure path to store bitcode files, and configure batch size.") +parser.add_argument('storage', type=str, + help='Path to the storage location.') +parser.add_argument('-b', '--batchsize', nargs='?', type=int, default=15000, + help='Number of files to be written per pool of threads. 
+                    help='Number of files to be written per pool of threads. Default value is 15000.')
+args = parser.parse_args()
+
+
+def write_dataset_files_and_index_info() -> None:
+  STORAGE: str = args.storage
+  BATCH_SIZE: int = args.batchsize
+
+  def write_file(index: [int], bytes_and_dir: (bytes, str)):
+    filename = f'{bytes_and_dir[1]}/file{index+1}.bc'
+    with open(filename, 'wb') as file:
+      file.write(bytes_and_dir[0])
+
+  with parallel.parallel_backend('spark'):
+    dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)
+
+  lang_list: [str] = dataset["language"]
+  langs = dataset.unique("language")
+  file_indices: [dict] = []
+
+  for i in range(0, len(langs)):
+    start_index = lang_list.index(langs[i])
+    if (i+1 != len(langs)):
+      end_index = lang_list.index(langs[i+1])
+    else:
+      end_index = len(lang_list)
+    file_indices.append(
+        {"language": langs[i], "start_index": start_index, "end_index": end_index})
+  with open('indices.csv', mode='w', newline='') as file:
+    writer = csv.DictWriter(file, fieldnames=[
+        "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE)
+    writer.writeheader()
+    writer.writerows(file_indices)
+
+  pool = multiprocessing.pool.ThreadPool(
+      processes=multiprocessing.cpu_count())
+
+  for i in range(0, len(file_indices)):
+    start_index = file_indices[i]["start_index"]
+    end_index = file_indices[i]["end_index"]
+    for j in range(start_index, end_index, BATCH_SIZE):
+      dir_name = f'{STORAGE}/{file_indices[i]["language"]}/{j}_temp'
+      makedirs(dir_name, exist_ok=True)
+      bytes_enumeration = enumerate(
+          [(bytes_item, dir_name) for bytes_item in dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content']], start=j)
+      pool.starmap(write_file, bytes_enumeration)
+
+  pool.close()
+
+
+if __name__ == '__main__':
+  write_dataset_files_and_index_info()
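Not part of the patch series: a toy illustration of the start_index/end_index
bookkeeping used in write_dataset_files_and_index_info above. It only works
because the dataset rows are assumed to be grouped by language, so list.index()
returns the first row of each group; the language names below are made up for
the example.

# Toy sketch of the per-language index range computation.
lang_list = ['c', 'c', 'c', 'c++', 'c++', 'rust']
langs = ['c', 'c++', 'rust']  # unique values in order of first appearance

ranges = []
for i in range(len(langs)):
  start = lang_list.index(langs[i])
  end = lang_list.index(langs[i + 1]) if i + 1 != len(langs) else len(lang_list)
  ranges.append({'language': langs[i], 'start_index': start, 'end_index': end})

print(ranges)
# [{'language': 'c', 'start_index': 0, 'end_index': 3},
#  {'language': 'c++', 'start_index': 3, 'end_index': 5},
#  {'language': 'rust', 'start_index': 5, 'end_index': 6}]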
+"""Script to write ir dataset files to a specified storage location. -First the index counts for each language are written to a file "indices.csv". -Then each bitcode file is written to the specified storage location, from which -the files can be written to a tar file. +In write_dataset_files_and_index_info, the index counts for each language are written to a file "indices.csv". +Then each bitcode file is written using available threads into a tar file corresponding to the IR language. + + +get_args + Returns: argparse.Namespace + Example usage: get_args() + +write_dataset_files_and_index_info + Returns: None + Example usage: write_dataset_files_and_index_info("/tmp") """ -from datasets import load_dataset, parallel -from os import makedirs -import multiprocessing -import csv -import argparse - -parser = argparse.ArgumentParser( - description="Configure path to store bitcode files, and configure batch size.") -parser.add_argument('storage', type=str, - help='Path to the storage location.') -parser.add_argument('-b', '--batchsize', nargs='?', type=int, default=15000, - help='Number of files to be written per pool of threads. Default value is 15000.') -args = parser.parse_args() - - -def write_dataset_files_and_index_info() -> None: - STORAGE: str = args.storage - BATCH_SIZE: int = args.batchsize - - def write_file(index: [int], bytes_and_dir: (bytes, str)): - filename = f'{bytes_and_dir[1]}/file{index+1}.bc' - with open(filename, 'wb') as file: - file.write(bytes_and_dir[0]) - - with parallel.parallel_backend('spark'): - dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2) - - lang_list: [str] = dataset["language"] - langs = dataset.unique("language") - file_indices: [dict] = [] - - for i in range(0, len(langs)): - start_index = lang_list.index(langs[i]) - if (i+1 != len(langs)): - end_index = lang_list.index(langs[i+1]) - else: - end_index = len(lang_list) - file_indices.append( - {"language": langs[i], "start_index": start_index, "end_index": end_index}) - with open('indices.csv', mode='w', newline='') as file: - writer = csv.DictWriter(file, fieldnames=[ - "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE) - writer.writeheader() - writer.writerows(file_indices) - - pool = multiprocessing.pool.ThreadPool( - processes=multiprocessing.cpu_count()) - - for i in range(0, len(file_indices)): - start_index = file_indices[i]["start_index"] - end_index = file_indices[i]["end_index"] - for j in range(start_index, end_index, BATCH_SIZE): - dir_name = f'{STORAGE}/{file_indices[i]["language"]}/{j}_temp' - makedirs(dir_name, exist_ok=True) - bytes_enumeration = enumerate( - [(bytes_item, dir_name) for bytes_item in dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content']], start=j) - pool.starmap(write_file, bytes_enumeration) - - pool.close() + +def get_args(): + import argparse + + parser = argparse.ArgumentParser( + description="Configure path to store bitcode files, and configure batch size." + ) + parser.add_argument('storage', type=str, help='Path to the storage location.') + parser.add_argument( + '-b', + '--batchsize', + nargs='?', + type=int, + default=15000, + help='Number of files to be written per pool of threads. Default value is 15000.' 
+  )
+  return parser.parse_args()
+
+
+def write_dataset_files_and_index_info(storage: str) -> None:
+  from datasets import load_dataset, parallel
+  from os import makedirs, listdir
+  import threading
+  import csv
+  import tarfile
+  from io import BytesIO
+  from time import time
+
+  def create_tar(dataset_subset, start_index: int, dir_name: str,
+                 language: str):
+    with tarfile.open(dir_name + '/' + language + '_bc_files.tar', 'a:') as tar:
+      for x in enumerate((dataset_subset[i]["content"]
+                          for i in range(0, dataset_subset.num_rows))):
+        tarinfo = tarfile.TarInfo(name=f'bc_files/file{x[0]+1+start_index}.bc')
+        file_obj = BytesIO(x[1])
+        tarinfo.size = file_obj.getbuffer().nbytes
+        tarinfo.mtime = time()
+        tar.addfile(tarinfo, fileobj=file_obj)
+
+  with parallel.parallel_backend('spark'):
+    dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)
+
+  lang_list: [str] = dataset["language"]
+  langs = dataset.unique("language")
+  file_indices: [dict] = []
+
+  for i in range(0, len(langs)):
+    start_index = lang_list.index(langs[i])
+    if (i + 1 != len(langs)):
+      end_index = lang_list.index(langs[i + 1])
+    else:
+      end_index = len(lang_list)
+    file_indices.append({
+        "language": langs[i],
+        "start_index": start_index,
+        "end_index": end_index
+    })
+  with open('indices.csv', mode='w', newline='') as file:
+    writer = csv.DictWriter(
+        file,
+        fieldnames=["language", "start_index", "end_index"],
+        dialect='unix',
+        quoting=csv.QUOTE_NONE)
+    writer.writeheader()
+    writer.writerows(file_indices)
+
+  threads = []
+  for i in range(0, len(file_indices)):
+    start_index = file_indices[i]["start_index"]
+    end_index = file_indices[i]["end_index"]
+    dir_name = f'{storage}/{file_indices[i]["language"]}'
+    makedirs(dir_name, exist_ok=True)
+    thread = threading.Thread(
+        target=create_tar,
+        args=(dataset.select(range(start_index, end_index)), start_index,
+              dir_name, file_indices[i]["language"]))
+    threads.append(thread)
+  for thread in threads:
+    thread.start()
+  for thread in threads:
+    thread.join()
 
 
 if __name__ == '__main__':
-  write_dataset_files_and_index_info()
+  args = get_args()
+  write_dataset_files_and_index_info(storage=args.storage)

From 5fa24ffe1fe11fdab89b1c2173925fd364c704bf Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sun, 1 Sep 2024 10:33:46 -0400
Subject: [PATCH 4/6] Removed unecessary batch size argument.

---
 .../datatset_download/write_data_files.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
index f974cfd..d2241a7 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
@@ -1,7 +1,7 @@
 """Script to write ir dataset files to a specified storage location.
 
 In write_dataset_files_and_index_info, the index counts for each language are written to a file "indices.csv".
-Then each bitcode file is written using available threads into a tar file corresponding to the IR language.
+Then, each bitcode file is written using available threads into respective tar files named [lang]_bc_files.tar. Each file added to a tar file is named bc_files/file[index].bc, where index is a number that is between the range of start_index and end_index (not including end) in "indices.csv" and is incremented by one (smallest index is 1).
 
 
 get_args
@@ -21,14 +21,6 @@ def get_args():
       description="Configure path to store bitcode files, and configure batch size."
   )
   parser.add_argument('storage', type=str, help='Path to the storage location.')
-  parser.add_argument(
-      '-b',
-      '--batchsize',
-      nargs='?',
-      type=int,
-      default=15000,
-      help='Number of files to be written per pool of threads. Default value is 15000.'
-  )
   return parser.parse_args()
 
 

From 5b8ee64388d272ff8c8aeccf6118bd75ab0d6043 Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sun, 1 Sep 2024 11:55:15 -0400
Subject: [PATCH 5/6] Fixed spelling mistake in directory name.

---
 .../{datatset_download => dataset_download}/write_data_files.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm_ir_dataset_utils/compile_time_analysis_tools/{datatset_download => dataset_download}/write_data_files.py (100%)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py
similarity index 100%
rename from llvm_ir_dataset_utils/compile_time_analysis_tools/datatset_download/write_data_files.py
rename to llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py

From 1b936eee677325f73df16b54f1f2d03c18abdde7 Mon Sep 17 00:00:00 2001
From: Andrew Kallai
Date: Sun, 15 Sep 2024 18:25:31 -0400
Subject: [PATCH 6/6] Modified docstrings and import statement locations.

---
 .../dataset_download/write_data_files.py | 68 ++++++++++++++-----
 1 file changed, 50 insertions(+), 18 deletions(-)

diff --git a/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py b/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py
index d2241a7..711f37e 100644
--- a/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py
+++ b/llvm_ir_dataset_utils/compile_time_analysis_tools/dataset_download/write_data_files.py
@@ -1,21 +1,35 @@
-"""Script to write ir dataset files to a specified storage location.
+"""Script to write IR dataset files to a specified storage location.
 
-In write_dataset_files_and_index_info, the index counts for each language are written to a file "indices.csv".
-Then, each bitcode file is written using available threads into respective tar files named [lang]_bc_files.tar. Each file added to a tar file is named bc_files/file[index].bc, where index is a number that is between the range of start_index and end_index (not including end) in "indices.csv" and is incremented by one (smallest index is 1).
+Two functions are contained in the script. One function retrieves a
+storage location argument for the script. The other function accesses
+the ComPile dataset using the Hugging Face API and writes the bitcode
+files in the dataset to tar files corresponding to their IR language.
+the index counts for each language are written to a file "indices.csv".
+Then, each bitcode file is written using available threads into
+respective tar files named [lang]_bc_files.tar. Each file added to a tar
+file is named bc_files/file[index].bc, where index is a number that is
+between the range of start_index and end_index (not including end) in
+"indices.csv" and is incremented by one (smallest index is 1).
 
-
-get_args
-  Returns: argparse.Namespace
-  Example usage: get_args()
-
-write_dataset_files_and_index_info
-  Returns: None
-  Example usage: write_dataset_files_and_index_info("/tmp")
+
+Example usage: python write_data_files.py /tmp
 """
+
+import argparse
+from datasets import load_dataset, parallel
+from os import makedirs, listdir
+import threading
+import csv
+import tarfile
+from io import BytesIO
+from time import time
+
 
 def get_args():
+  """Function to return the provided storage argument for the script.
+
+  Returns: argparse.Namespace
+  """
 
   parser = argparse.ArgumentParser(
       description="Configure path to store bitcode files, and configure batch size."
@@ -25,13 +39,31 @@ def get_args():
 
 
 def write_dataset_files_and_index_info(storage: str) -> None:
+  """Function to write each IR bitcode file to a tar archive.
+
+  The function first loads the ComPile dataset into a HF datasets
+  Dataset object. It does this using an experimental parallel backend to
+  slightly speed up load times. Then, a list of dictionaries is made,
+  where each dict contains the starting and ending index for each IR
+  file type based on language. For example, if the entire dataset
+  consisted of C and C++ IR modules, then the dictionary for C would
+  note language='c', starting_index=0, ending_index=(C++ starting
+  index). The +1 for the ending_index allows for direct use in range(a,
+  b) syntax, but is NOT suitable for right inclusive syntax. The entries
+  of each dictionary are then written to a CSV file name 'indices.csv'
+  for further use by other scripts. The contents of each dictionary are
+  used to provide information to the n number of threads, where n is the
+  number of languages in the dataset. Each thread when started calls the
+  create_tar() sub-function. The sub-function uses a generator
+  expression to access the bitcode files from a subset (taken by
+  language) of the original Dataset object. This allows for low memory
+  usage while performing in-memory writing of each bitcode file to a tar
+  archive which is named according to the given language (i.e.,
+  c_bc_files.tar).
+
+  Args:
+    storage: Storage location for the tar archives
+  """
 
   def create_tar(dataset_subset, start_index: int, dir_name: str,
                  language: str):