From 88d2ff3297104b89b6c2fe59819cf7095375dd68 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sun, 16 Jun 2024 19:38:46 -0700
Subject: [PATCH 1/3] Add tool to gather self contained sources

---
 .../tools/gather_self_contained_sources.py | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 llvm_ir_dataset_utils/tools/gather_self_contained_sources.py

diff --git a/llvm_ir_dataset_utils/tools/gather_self_contained_sources.py b/llvm_ir_dataset_utils/tools/gather_self_contained_sources.py
new file mode 100644
index 0000000..1fb2164
--- /dev/null
+++ b/llvm_ir_dataset_utils/tools/gather_self_contained_sources.py
@@ -0,0 +1,56 @@
+"""Tool for gathering self-contained source files so they can be built later."""
+
+import os
+import shutil
+import logging
+import json
+import glob
+
+from absl import app
+from absl import flags
+
+from llvm_ir_dataset_utils.util import parallel
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('folder', None, 'The folder to look for source files in.')
+flags.DEFINE_string('output_file', None,
+                    'The path to a JSON file to dump the output into.')
+
+flags.mark_flag_as_required('folder')
+flags.mark_flag_as_required('output_file')
+
+SOURCE_EXTENSION = 'c'
+SOURCE_FILES_PER_BATCH = 64
+
+
+def find_files(folder_path):
+  file_names = []
+
+  # Recursively collect every file under folder_path with the source extension.
+  folder_glob = os.path.join(folder_path, f'**/*.{SOURCE_EXTENSION}')
+  for file_name in glob.glob(folder_glob, recursive=True):
+    file_names.append(file_name)
+
+  return file_names
+
+
+def main(_):
+  source_files = find_files(FLAGS.folder)
+
+  logging.info(f'Done collecting source files, found {len(source_files)}')
+
+  batches = parallel.split_batches(source_files, SOURCE_FILES_PER_BATCH)
+
+  logging.info(f'Done creating batches, have {len(batches)}')
+
+  output_spec = {'batches': batches}
+
+  with open(FLAGS.output_file, 'w') as output_file_handle:
+    json.dump(output_spec, output_file_handle, indent=4)
+
+  logging.info('Finished outputting batches')
+
+
+if __name__ == '__main__':
+  app.run(main)
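
For reference, the gather tool above writes a JSON file shaped roughly like the sketch below: a single 'batches' key holding lists of at most SOURCE_FILES_PER_BATCH (64) source paths each, pretty-printed with an indent of 4. The paths are hypothetical placeholders, not output from a real run.

    {
        "batches": [
            ["/data/sources/pkg_a/foo.c", "/data/sources/pkg_a/bar.c"],
            ["/data/sources/pkg_b/baz.c"]
        ]
    }
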
From e597ec9712d1939632e7abfc222038fcfe6cf04d Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sun, 16 Jun 2024 19:52:46 -0700
Subject: [PATCH 2/3] Add self contained builder

---
 llvm_ir_dataset_utils/builders/builder.py |  7 ++-
 .../builders/self_contained_builder.py    | 43 +++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 llvm_ir_dataset_utils/builders/self_contained_builder.py

diff --git a/llvm_ir_dataset_utils/builders/builder.py b/llvm_ir_dataset_utils/builders/builder.py
index dd70924..835950a 100644
--- a/llvm_ir_dataset_utils/builders/builder.py
+++ b/llvm_ir_dataset_utils/builders/builder.py
@@ -14,7 +14,8 @@
 from llvm_ir_dataset_utils.builders import (autoconf_builder, cargo_builder,
                                             cmake_builder, julia_builder,
                                             manual_builder, spack_builder,
-                                            swift_builder, portage_builder)
+                                            swift_builder, portage_builder,
+                                            self_contained_builder)
 
 from llvm_ir_dataset_utils.sources import source
 from llvm_ir_dataset_utils.util import file, licenses
@@ -235,6 +236,10 @@
         threads,
         corpus_description["package_name"],
     )
+  elif corpus_description["build_system"] == "self_contained":
+    build_log = self_contained_builder.perform_build(
+        corpus_description["source_file_list"], build_dir, corpus_dir)
+    self_contained_builder.extract_ir(build_dir, corpus_dir, threads)
   else:
     raise ValueError(
         f"Build system {corpus_description['build_system']} is not supported")
diff --git a/llvm_ir_dataset_utils/builders/self_contained_builder.py b/llvm_ir_dataset_utils/builders/self_contained_builder.py
new file mode 100644
index 0000000..1f1618f
--- /dev/null
+++ b/llvm_ir_dataset_utils/builders/self_contained_builder.py
@@ -0,0 +1,43 @@
+"""Module for building and extracting IR from a set of self-contained
+C/C++ files."""
+
+import subprocess
+import os
+
+from mlgo.corpus import extract_ir_lib
+from mlgo.corpus import make_corpus_lib
+
+
+def compile_file(source_file, object_file):
+  command_vector = [
+      'clang', '-Xclang', '-fembed-bitcode=all', source_file, '-o', object_file
+  ]
+  compile_process = subprocess.run(
+      command_vector, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
+  assert (compile_process.returncode == 0)
+
+
+def perform_build(source_file_list, build_dir, corpus_dir):
+  for source_file in source_file_list:
+    object_file = os.path.join(build_dir, os.path.basename(source_file) + '.o')
+    compile_file(source_file, object_file)
+
+  return {
+      'targets': [{
+          'success': True,
+          'build_log': None,
+          'name': 'self_contained'
+      }]
+  }
+
+
+# TODO(boomanaiden154): This is duplicated with extract_ir in the manual builder.
+# We might want to look into refactoring to consolidate the two functions at some
+# point.
+def extract_ir(build_dir, corpus_dir, threads):
+  objects = extract_ir_lib.load_from_directory(build_dir, corpus_dir)
+  relative_output_paths = extract_ir_lib.run_extraction(objects, threads,
+                                                        "llvm-objcopy", None,
+                                                        None, ".llvmcmd",
+                                                        ".llvmbc")
+  extract_ir_lib.write_corpus_manifest(None, relative_output_paths, corpus_dir)
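
A sketch of the corpus description that the new 'self_contained' branch in builder.py consumes. It mirrors the dictionary that the build script in the next patch constructs; the source paths are hypothetical placeholders:

    corpus_description = {
        'sources': [],
        'folder_name': 'batch-0',
        'build_system': 'self_contained',
        'package_name': 'batch-0',
        'license': 'UNKNOWN',
        'license_source': None,
        'source_file_list': ['/data/sources/foo.c', '/data/sources/bar.c']
    }

Only 'build_system' and 'source_file_list' are read by this branch; the remaining keys match what the build script in the next patch supplies.
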
From 97b3c812ea4acb51fe11b1d2823c28b819f2dd8b Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sun, 16 Jun 2024 20:23:14 -0700
Subject: [PATCH 3/3] Add build script

---
 .../builders/self_contained_builder.py    |  7 ++-
 .../tools/build_self_contained_sources.py | 63 +++++++++++++++++++
 2 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 llvm_ir_dataset_utils/tools/build_self_contained_sources.py

diff --git a/llvm_ir_dataset_utils/builders/self_contained_builder.py b/llvm_ir_dataset_utils/builders/self_contained_builder.py
index 1f1618f..84ae392 100644
--- a/llvm_ir_dataset_utils/builders/self_contained_builder.py
+++ b/llvm_ir_dataset_utils/builders/self_contained_builder.py
@@ -3,6 +3,7 @@
 
 import subprocess
 import os
+import logging
 
 from mlgo.corpus import extract_ir_lib
 from mlgo.corpus import make_corpus_lib
@@ -10,11 +11,13 @@
 
 def compile_file(source_file, object_file):
   command_vector = [
-      'clang', '-Xclang', '-fembed-bitcode=all', source_file, '-o', object_file
+      'clang', '-Xclang', '-fembed-bitcode=all', '-c', source_file, '-o',
+      object_file
   ]
   compile_process = subprocess.run(
       command_vector, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
-  assert (compile_process.returncode == 0)
+  if compile_process.returncode != 0:
+    logging.warning('Compiler returned non-zero exit code')
 
 
 def perform_build(source_file_list, build_dir, corpus_dir):
diff --git a/llvm_ir_dataset_utils/tools/build_self_contained_sources.py b/llvm_ir_dataset_utils/tools/build_self_contained_sources.py
new file mode 100644
index 0000000..29e3ba6
--- /dev/null
+++ b/llvm_ir_dataset_utils/tools/build_self_contained_sources.py
@@ -0,0 +1,63 @@
+"""Tool for building a list of self-contained sources."""
+
+import logging
+import json
+
+from absl import app
+from absl import flags
+
+import ray
+
+from llvm_ir_dataset_utils.builders import builder
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('batch_list', None, 'The path to the batch list.')
+flags.DEFINE_string('source_dir', '/tmp/source',
+                    'The path to the source dir. Not used by this builder.')
+flags.DEFINE_string('build_dir', None, 'The path to the build dir.')
+flags.DEFINE_string('corpus_dir', None, 'The directory to place the corpus in.')
+flags.DEFINE_bool(
+    'archive_corpus', False,
+    'Whether or not to put the output corpus into an archive to reduce inode usage'
+)
+
+
+def main(_):
+  ray.init()
+
+  with open(FLAGS.batch_list) as batch_list_handle:
+    batch_list = json.load(batch_list_handle)
+
+  batch_futures = []
+
+  for index, batch_info in enumerate(batch_list['batches']):
+    corpus_description = {
+        'sources': [],
+        'folder_name': f'batch-{index}',
+        'build_system': 'self_contained',
+        'package_name': f'batch-{index}',
+        'license': 'UNKNOWN',
+        'license_source': None,
+        'source_file_list': batch_info
+    }
+
+    batch_futures.append(
+        builder.get_build_future(
+            corpus_description,
+            FLAGS.source_dir,
+            FLAGS.build_dir,
+            FLAGS.corpus_dir,
+            1, {},
+            cleanup=True,
+            archive_corpus=FLAGS.archive_corpus))
+
+  while len(batch_futures) > 0:
+    finished, batch_futures = ray.wait(batch_futures, timeout=5.0)
+    finished_data = ray.get(finished)
+    logging.info(
+        f'Just finished {len(finished_data)}, {len(batch_futures)} remaining.')
+
+
+if __name__ == '__main__':
+  app.run(main)
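
Putting the three patches together, a possible end-to-end invocation looks like the following. The paths are placeholders, and the commands assume llvm_ir_dataset_utils is importable (installed or on PYTHONPATH):

    python3 llvm_ir_dataset_utils/tools/gather_self_contained_sources.py \
        --folder=/data/sources --output_file=/data/batches.json
    python3 llvm_ir_dataset_utils/tools/build_self_contained_sources.py \
        --batch_list=/data/batches.json --build_dir=/data/build \
        --corpus_dir=/data/corpus --archive_corpus

The first command batches the .c files it finds under --folder; the second hands each batch to builder.get_build_future as its own Ray task, which compiles the files with embedded bitcode and extracts the IR into --corpus_dir.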