7 changes: 6 additions & 1 deletion llvm_ir_dataset_utils/builders/builder.py
@@ -14,7 +14,8 @@
 from llvm_ir_dataset_utils.builders import (autoconf_builder, cargo_builder,
                                              cmake_builder, julia_builder,
                                              manual_builder, spack_builder,
-                                             swift_builder, portage_builder)
+                                             swift_builder, portage_builder,
+                                             self_contained_builder)
 from llvm_ir_dataset_utils.sources import source
 from llvm_ir_dataset_utils.util import file, licenses

@@ -235,6 +236,10 @@ def parse_and_build_from_description(
         threads,
         corpus_description["package_name"],
     )
+  elif corpus_description["build_system"] == "self_contained":
+    build_log = self_contained_builder.perform_build(
+        corpus_description["source_file_list"], build_dir, corpus_dir)
+    self_contained_builder.extract_ir(build_dir, corpus_dir, threads)
   else:
     raise ValueError(
         f"Build system {corpus_description['build_system']} is not supported")
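For reference, a corpus description that routes to this new branch might look like the following sketch (not part of the patch). The source file paths are hypothetical placeholders; the keys mirror the ones that build_self_contained_sources.py below emits and that the dispatch and the new builder read.

corpus_description = {
    'sources': [],
    'folder_name': 'self-contained-batch-0',
    'build_system': 'self_contained',
    'package_name': 'self-contained-batch-0',
    'license': 'UNKNOWN',
    'license_source': None,
    'source_file_list': ['/path/to/foo.c', '/path/to/bar.c']
}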
46 changes: 46 additions & 0 deletions llvm_ir_dataset_utils/builders/self_contained_builder.py
@@ -0,0 +1,46 @@
"""Module for building and extracting builder from a set of self-contained
C/c++ files."""

import subprocess
import os
import logging

from mlgo.corpus import extract_ir_lib
from mlgo.corpus import make_corpus_lib


def compile_file(source_file, object_file):
command_vector = [
'clang', '-Xclang', '-fembed-bitcode=all', '-c', source_file, '-o',
object_file
]
compile_process = subprocess.run(
command_vector, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
if compile_process.returncode != 0:
logging.warning('Compiler returned non-zero exit code')


def perform_build(source_file_list, build_dir, corpus_dir):
for source_file in source_file_list:
object_file = os.path.join(build_dir, os.path.basename(source_file) + '.o')
compile_file(source_file, object_file)

return {
'targets': [{
'success': True,
'build_log': None,
'name': 'self_contained'
}]
}


# TODO(boomanaiden154): This duplicates extract_ir in the manual builder. We
# might want to refactor to consolidate the two functions at some point.
def extract_ir(build_dir, corpus_dir, threads):
objects = extract_ir_lib.load_from_directory(build_dir, corpus_dir)
relative_output_paths = extract_ir_lib.run_extraction(objects, threads,
"llvm-objcopy", None,
None, ".llvmcmd",
".llvmbc")
extract_ir_lib.write_corpus_manifest(None, relative_output_paths, corpus_dir)
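As a usage sketch (not part of this patch), the new builder can also be driven directly. The paths below are hypothetical placeholders, and the build and corpus directories are assumed to already exist or be creatable.

import os

from llvm_ir_dataset_utils.builders import self_contained_builder

build_dir = '/tmp/self_contained_build'
corpus_dir = '/tmp/self_contained_corpus'
os.makedirs(build_dir, exist_ok=True)
os.makedirs(corpus_dir, exist_ok=True)

# Compile each source file with clang -fembed-bitcode=all into build_dir.
build_log = self_contained_builder.perform_build(
    ['/path/to/foo.c', '/path/to/bar.c'], build_dir, corpus_dir)

# Pull the embedded bitcode out of the object files with llvm-objcopy and
# write a corpus manifest into corpus_dir.
self_contained_builder.extract_ir(build_dir, corpus_dir, threads=4)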
63 changes: 63 additions & 0 deletions llvm_ir_dataset_utils/tools/build_self_contained_sources.py
@@ -0,0 +1,63 @@
"""Tool for building a list of self contained sources."""

import logging
import json

from absl import app
from absl import flags

import ray

from llvm_ir_dataset_utils.builders import builder

FLAGS = flags.FLAGS

flags.DEFINE_string('batch_list', None, 'The path to the batch list.')
flags.DEFINE_string('source_dir', '/tmp/source',
'The path to the source dir. Not used by this builder.')
flags.DEFINE_string('build_dir', None, 'The path to the build dir.')
flags.DEFINE_string('corpus_dir', None, 'The directory to place the corpus in.')
flags.DEFINE_bool(
'archive_corpus', False,
    'Whether or not to put the output corpus into an archive to reduce inode usage'
)


def main(_):
ray.init()

with open(FLAGS.batch_list) as batch_list_handle:
batch_list = json.load(batch_list_handle)

batch_futures = []

for index, batch_info in enumerate(batch_list['batches']):
corpus_description = {
'sources': [],
'folder_name': f'batch-{index}',
'build_system': 'self_contained',
        'package_name': f'batch-{index}',
'license': 'UNKNOWN',
'license_source': None,
'source_file_list': batch_info
}

batch_futures.append(
builder.get_build_future(
corpus_description,
FLAGS.source_dir,
FLAGS.build_dir,
FLAGS.corpus_dir,
1, {},
cleanup=True,
archive_corpus=FLAGS.archive_corpus))

while len(batch_futures) > 0:
finished, batch_futures = ray.wait(batch_futures, timeout=5.0)
finished_data = ray.get(finished)
logging.info(
f'Just finished {len(finished_data)}, {len(batch_futures)} remaining.')


if __name__ == '__main__':
app.run(main)
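The batch list this tool expects is the JSON file produced by gather_self_contained_sources.py below: an object whose 'batches' key holds lists of source file paths. A minimal hand-written sketch, with placeholder paths, could be generated like so.

import json

batch_list = {
    'batches': [
        ['/data/sources/a.c', '/data/sources/b.c'],
        ['/data/sources/c.c'],
    ]
}

with open('/tmp/batch_list.json', 'w') as batch_list_handle:
  json.dump(batch_list, batch_list_handle, indent=4)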
56 changes: 56 additions & 0 deletions llvm_ir_dataset_utils/tools/gather_self_contained_sources.py
@@ -0,0 +1,56 @@
"""Tool for gathering self-contained source files so they can be built later."""

import os
import shutil
import logging
import json
import glob

from absl import app
from absl import flags

from llvm_ir_dataset_utils.util import parallel

FLAGS = flags.FLAGS

flags.DEFINE_string('folder', None, 'The folder to look for source files in.')
flags.DEFINE_string('output_file', None,
'The path to a JSON file to dump the output into.')

flags.mark_flag_as_required('folder')
flags.mark_flag_as_required('output_file')

SOURCE_EXTENSION = 'c'
SOURCE_FILES_PER_BATCH = 64


def find_files(folder_path):
  file_names = []

  folder_glob = os.path.join(folder_path, f'**/*.{SOURCE_EXTENSION}')
  for file_name in glob.glob(folder_glob, recursive=True):
    file_names.append(file_name)

  return file_names


def main(_):
source_files = find_files(FLAGS.folder)

logging.info(f'Done collecting source files, found {len(source_files)}')

batches = parallel.split_batches(source_files, SOURCE_FILES_PER_BATCH)

logging.info(f'Done creating batches, have {len(batches)}')

output_spec = {'batches': batches}

with open(FLAGS.output_file, 'w') as output_file_handle:
json.dump(output_spec, output_file_handle, indent=4)

logging.info('Finished outputting batches')


if __name__ == '__main__':
app.run(main)
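parallel.split_batches is assumed here to chunk the gathered file list into consecutive batches of at most SOURCE_FILES_PER_BATCH paths; a rough stand-in (not the project's actual helper) would look like the following.

def split_batches(items, batch_size):
  # Chunk a flat list into consecutive batches of at most batch_size items.
  return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

# e.g. split_batches(['a.c', 'b.c', 'c.c'], 2) -> [['a.c', 'b.c'], ['c.c']]

The resulting JSON file can then be passed to build_self_contained_sources.py via --batch_list to build each batch and extract the corpus.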