Skip to content

Commit 3e68921

Browse files
author
Tobias Hofmann
committed
bugfixes and introduced keep_paralogs flag
1 parent df6c5e9 commit 3e68921

File tree

4 files changed

+217
-188
lines changed

4 files changed

+217
-188
lines changed

docs/notebook/.ipynb_checkpoints/extract_contigs-checkpoint.ipynb

Lines changed: 100 additions & 99 deletions
Large diffs are not rendered by default.

docs/notebook/extract_contigs.ipynb

Lines changed: 94 additions & 83 deletions
Large diffs are not rendered by default.

secapr/align_sequences.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040

4141
from __future__ import print_function
4242
import os
43+
import re
4344
import sys
4445
import copy
4546
import tempfile
@@ -243,7 +244,8 @@ def main(args):
243244
locus_name = t.description.split('|')[-1]
244245
rest_of_string = '|'.join(t.description.split('|')[:-1])
245246
string_to_replace = '%s_'%str(locus_name)
246-
t.id = rest_of_string.replace(string_to_replace, '')
247+
new_string = t.id
248+
t.id = re.sub(string_to_replace, '', new_string)
247249
t.name = t.id
248250
t.description = ''
249251
write_alignments_to_outdir(log, args.output, alignments, args.output_format)

secapr/find_target_contigs.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,12 @@ def add_arguments(parser):
6060
default=False,
6161
help="Use this flag in case you want to keep those contigs that span across multiple exons.",
6262
)
63+
parser.add_argument(
64+
"--keep-paralogs",
65+
action='store_true',
66+
default=False,
67+
help="Use this flag in case you want to keep loci with signs of paralogy (multiple contig matches). One randomely selected contig will be selected for these loci.",
68+
)
6369
parser.add_argument(
6470
'--disable_stats',
6571
action='store_true',
@@ -125,21 +131,30 @@ def find_duplicates(exon_contig_dict,contig_exon_dict):
125131
return invalid_exon_loci, exons_with_multiple_hits, contigs_matching_multiple_exons
126132

127133

128-
def get_list_of_valid_exons_and_contigs(exon_contig_dict,duplicate_loci,exons_with_multiple_hits,contigs_matching_multiple_exon_dict,keep_duplicates_boolean,outdir):
134+
def get_list_of_valid_exons_and_contigs(exon_contig_dict,duplicate_loci,exons_with_multiple_hits,contigs_matching_multiple_exon_dict,keep_duplicates_boolean,keep_paralogs_boolean,outdir):
129135
# summarize all exons that should be excluded form further processing (duplicates)
130136
if keep_duplicates_boolean:
131137
# then only mark the list exons_with_multiple_hits as bad exons
132-
invalid_exons_unique = list(set(exons_with_multiple_hits))
138+
invalid_exons_temp = list(set(exons_with_multiple_hits))
133139
valid_contigs_matching_multiple_exon_dict = {}
134140
for exon in contigs_matching_multiple_exon_dict.keys():
135-
if not exon in invalid_exons_unique:
141+
if not exon in invalid_exons_temp:
136142
valid_contigs_matching_multiple_exon_dict.setdefault(exon,contigs_matching_multiple_exon_dict[exon])
137143
dupl_info = pd.DataFrame.from_dict(valid_contigs_matching_multiple_exon_dict, orient='index')
138144
dupl_info.to_csv(os.path.join(outdir,'info_contigs_matching_multiple_exons.txt'),header=False,sep="\t")
145+
if keep_paralogs_boolean:
146+
invalid_exons_unique = []
147+
invalid_exons_temp = list(set(duplicate_loci))
148+
paralogous_exons = {}
149+
for exon in exons_with_multiple_hits:
150+
paralogous_exons.setdefault(exon,exon_contig_dict[exon])
151+
paralog_info = pd.DataFrame.from_dict(paralogous_exons, orient='index')
152+
paralog_info.to_csv(os.path.join(outdir,'info_paralogous_loci.txt'),header=False,sep="\t")
153+
print('Warning: Found %i paralogous loci. One randomely selected copy of the respective contigs for each paralogous locus will be kept, due to the use of the --keep_paralogs flag. It is not recommendable to use paralogous loci for phylogenetic inference!'%len(invalid_exons_temp))
139154
else:
140155
# remove all duplicates
141156
invalid_exons_unique = list(set(duplicate_loci))
142-
print(len(invalid_exons_unique), 'possibly paralogous exons detected - excluded from processing')
157+
print(len(invalid_exons_unique), 'possibly paralogous exons detected - excluded from processing')
143158
# get list of valid contig names
144159
valid_contig_names = []
145160
for exon in exon_contig_dict:
@@ -265,7 +280,7 @@ def main(args):
265280
# mark duplicate loci
266281
duplicate_loci, possible_paralogous, contigs_covering_several_loci = find_duplicates(exon_contig_dict,contig_exon_dict)
267282
# remove duplicate loci from the list of targeted loci and contigs
268-
target_contigs = get_list_of_valid_exons_and_contigs(exon_contig_dict,duplicate_loci,possible_paralogous,contig_multi_exon_dict,args.keep_duplicates,subfolder)
283+
target_contigs = get_list_of_valid_exons_and_contigs(exon_contig_dict,duplicate_loci,possible_paralogous,contig_multi_exon_dict,args.keep_duplicates,args.keep_paralogs,subfolder)
269284
# load the actual contig sequences
270285
contig_sequences = SeqIO.parse(open(contig_file),'fasta')
271286
# write those contigs that match the reference library to the file

0 commit comments

Comments
 (0)