Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions alphapulldown/scripts/parse_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from absl import flags, app, logging
import json
from alphapulldown.utils.modelling_setup import parse_fold, create_custom_info
from alphapulldown.utils.create_combinations import process_files
import io
from alphapulldown_input_parser import generate_fold_specifications

logging.set_verbosity(logging.INFO)

Expand All @@ -23,16 +22,12 @@
FLAGS = flags.FLAGS

def main(argv):
buffer = io.StringIO()
_ = process_files(
specifications = generate_fold_specifications(
input_files=FLAGS.input_list,
output_path=buffer,
exclude_permutations = True
delimiter=FLAGS.protein_delimiter,
exclude_permutations=True,
)
buffer.seek(0)
all_folds = buffer.readlines()
all_folds = [x.strip() for x in all_folds]
parsed = parse_fold(all_folds, FLAGS.features_directory, FLAGS.protein_delimiter)
parsed = parse_fold(specifications, FLAGS.features_directory, FLAGS.protein_delimiter)
data = create_custom_info(parsed)

with open(FLAGS.output_prefix + "data.json", 'w') as out_f:
Expand Down
15 changes: 5 additions & 10 deletions alphapulldown/scripts/run_multimer_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

Authors: Dingquan Yu, Valentin Maurer <[email protected]>
"""
import io
import warnings
import subprocess
from absl import app, logging, flags
Expand All @@ -14,7 +13,7 @@
import jax
gpus = jax.local_devices(backend='gpu')
from alphapulldown.scripts.run_structure_prediction import FLAGS
from alphapulldown.utils.create_combinations import process_files
from alphapulldown_input_parser import generate_fold_specifications

logging.set_verbosity(logging.INFO)

Expand Down Expand Up @@ -55,16 +54,12 @@ def main(argv):
DeprecationWarning,
)

buffer = io.StringIO()
_ = process_files(
specifications = generate_fold_specifications(
input_files=protein_lists,
output_path=buffer,
exclude_permutations = True
delimiter="+",
exclude_permutations=True,
)
buffer.seek(0)
all_folds = buffer.readlines()
all_folds = [x.strip().replace(",", ":") for x in all_folds]
all_folds = [x.strip().replace(";", "+") for x in all_folds]
all_folds = [spec.replace(",", ":").replace(";", "+") for spec in specifications]
if FLAGS.dry_run:
logging.info(f"Dry run: the total number of jobs to be run: {len(all_folds)}")
sys.exit(0)
Expand Down
16 changes: 8 additions & 8 deletions alphapulldown/scripts/split_jobs_into_clusters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import argparse
import io
from alphapulldown.utils.create_combinations import process_files
from alphapulldown_input_parser import generate_fold_specifications
from alphapulldown.utils.modelling_setup import parse_fold, create_custom_info, create_interactors
from alphapulldown.objects import MultimericObject
import pandas as pd
Expand Down Expand Up @@ -132,20 +131,21 @@ def main():
protein_lists = args.protein_lists
if args.mode == "all_vs_all":
protein_lists = [args.protein_lists[0], args.protein_lists[0]]
# buffer = io.StringIO()
import time
start = time.time()
all_combinations = process_files(input_files=protein_lists)
specifications = generate_fold_specifications(
input_files=protein_lists,
delimiter=args.protein_delimiter,
exclude_permutations=True,
)

all_folds = ["+".join(combo) for combo in all_combinations]
all_folds = [x.strip().replace(",", ":") for x in all_folds]
all_folds = [x.strip().replace(";", "+") for x in all_folds]
all_folds = [spec.replace(",", ":").replace(";", "+") for spec in specifications]
end = time.time()
diff1 = end - start
cluster_jobs(all_folds, args)
end = time.time()
diff2 = end - start
logger.info(f"process_files steps takes {diff1}s and total time is: {diff2}")
logger.info(f"generate_fold_specifications step takes {diff1}s and total time is: {diff2}")


if __name__ == "__main__":
Expand Down
62 changes: 0 additions & 62 deletions alphapulldown/utils/create_combinations.py

This file was deleted.

126 changes: 28 additions & 98 deletions alphapulldown/utils/modelling_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,123 +4,53 @@
#

import os
import sys
import pickle
import lzma
from typing import List,Dict,Union
from typing import Dict, List, Tuple, Union
import numpy as np
from alphafold.data.tools import jackhmmer
from alphafold.data import templates
from alphapulldown_input_parser import RegionSelection
from alphapulldown_input_parser import parse_fold as _external_parse_fold
from alphapulldown.objects import MonomericObject
from os.path import exists,join
from alphapulldown.objects import ChoppedObject
from alphapulldown.utils.file_handling import make_dir_monomer_dictionary
from absl import logging
logging.set_verbosity(logging.INFO)



def parse_fold(input_list, features_directory, protein_delimiter):
"""
Parses a list of protein fold specifications and returns structured folding jobs.

Args:
input_list (list): List of protein fold specifications as strings.
features_directory (list): List of directories to search for protein feature files.
protein_delimiter (str): Delimiter used to separate different protein folds.

Returns:
list: A list of folding jobs, each represented by a list of dictionaries.

Raises:
FileNotFoundError: If any required protein features are missing.
ValueError: If the format of the input specifications is incorrect.
"""
def format_error(spec):
print(f"Your format: {spec} is wrong. The program will terminate.")
sys.exit(1)

def extract_copy_and_regions(tokens, spec):
# try head copy then tail copy, default to 1
if len(tokens) > 1:
try:
return int(tokens[1]), tokens[2:]
except ValueError:
pass
try:
return int(tokens[-1]), tokens[1:-1]
except ValueError:
pass
return 1, tokens[1:]

def parse_regions(region_tokens, spec):
if not region_tokens:
return "all"
regions = []
for tok in region_tokens:
parts = tok.split("-")
if len(parts) != 2:
format_error(spec)
try:
regions.append(tuple(map(int, parts)))
except ValueError:
format_error(spec)
return regions

def feature_exists(name):
return any(
exists(join(dirpath, f"{name}{ext}"))
for dirpath in features_directory
for ext in (".pkl", ".pkl.xz")
)

def json_exists(name):
return any(
exists(join(dirpath, name))
for dirpath in features_directory
)

all_folding_jobs = []
missing_features = set()

for spec in input_list:
formatted_folds = []
for pf in spec.split(protein_delimiter):
# Handle JSON input
if pf.endswith('.json'):
json_name = pf
if json_exists(json_name):
for d in features_directory:
path = join(d, json_name)
if exists(path):
formatted_folds.append({'json_input': path})
break
else:
missing_features.add(json_name)
continue

# Handle protein input
tokens = pf.split(":")
if not tokens or not tokens[0]:
format_error(spec)
def _normalise_fold_entry(entry: Dict[str, Union[str, RegionSelection]]) -> Dict[str, Union[str, List[Tuple[int, int]]]]:
"""Convert entries from alphapulldown-input-parser into legacy AlphaPulldown format."""
if "json_input" in entry:
return {"json_input": entry["json_input"]}

name = tokens[0]
number, region_tokens = extract_copy_and_regions(tokens, spec)
regions = parse_regions(region_tokens, spec)
if len(entry) != 1:
return entry

if not feature_exists(name):
missing_features.add(name)
continue
name, selection = next(iter(entry.items()))
if isinstance(selection, RegionSelection):
if selection.is_all:
value: Union[str, List[Tuple[int, int]]] = "all"
else:
value = [(region.start, region.end) for region in selection.regions]
return {name: value}

formatted_folds += [{name: regions} for _ in range(number)]
return {name: selection}

if formatted_folds:
all_folding_jobs.append(formatted_folds)

if missing_features:
raise FileNotFoundError(f"{sorted(missing_features)} not found in {features_directory}")
def parse_fold(input_list, features_directory, protein_delimiter):
"""Parse fold specifications using alphapulldown-input-parser and normalise the output."""
parsed_jobs = _external_parse_fold(
input_list=input_list,
features_directory=features_directory,
protein_delimiter=protein_delimiter,
)

return all_folding_jobs
normalised_jobs: List[List[Dict[str, Union[str, List[Tuple[int, int]]]]]] = []
for job in parsed_jobs:
normalised_jobs.append([_normalise_fold_entry(entry) for entry in job])
return normalised_jobs

def pad_input_features(feature_dict: dict,
desired_num_res : int, desired_num_msa : int) -> None:
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ include_package_data = True
python_requires = >=3.8
install_requires =
absl-py >= 0.13.0
alphapulldown-input-parser
dm-haiku
dm-tree>=0.1.6
h5py>=3.1.0
Expand Down
27 changes: 11 additions & 16 deletions test/check_alphafold2_predictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"""
from __future__ import annotations

import io
import os
import json
import pickle
Expand All @@ -19,7 +18,7 @@
from absl.testing import absltest, parameterized

import alphapulldown
from alphapulldown.utils.create_combinations import process_files
from alphapulldown_input_parser import generate_fold_specifications

# --------------------------------------------------------------------------- #
# configuration / logging #
Expand Down Expand Up @@ -161,16 +160,14 @@ def _args(self, *, plist, mode, script):
) + f"={self.test_protein_lists_dir / plist}",
]
else:
buffer = io.StringIO()
_ = process_files(
specifications = generate_fold_specifications(
input_files=[str(self.test_protein_lists_dir / plist)],
output_path=buffer,
exclude_permutations=True
delimiter="+",
exclude_permutations=True,
)
buffer.seek(0)
lines = [
x.strip().replace(",", ":").replace(";", "+")
for x in buffer.readlines() if x.strip()
spec.replace(",", ":").replace(";", "+")
for spec in specifications if spec.strip()
]
formatted_input = lines[0] if lines else ""
return [
Expand Down Expand Up @@ -321,16 +318,14 @@ def test_dropout_increases_diversity(self):
no_dropout_output_dir.mkdir(parents=True, exist_ok=True)

# Use simple test input
buffer = io.StringIO()
_ = process_files(
specifications = generate_fold_specifications(
input_files=[str(self.protein_lists)],
output_path=buffer,
exclude_permutations=True
delimiter="+",
exclude_permutations=True,
)
buffer.seek(0)
lines = [
x.strip().replace(",", ":").replace(";", "+")
for x in buffer.readlines() if x.strip()
spec.replace(",", ":").replace(";", "+")
for spec in specifications if spec.strip()
]
formatted_input = lines[0] if lines else ""

Expand Down
Loading
Loading