Skip to content

Commit a3fc104

Browse files
author
Tobias Andermann
committed
bugfix clean_reads
1 parent 1dd1190 commit a3fc104

File tree

6 files changed

+104
-75
lines changed

6 files changed

+104
-75
lines changed

build/lib/secapr/_version.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88

99
version_json = '''
1010
{
11-
"date": "2022-03-31T18:44:33+0200",
12-
"dirty": false,
11+
"date": "2022-03-31T19:57:19+0200",
12+
"dirty": true,
1313
"error": null,
14-
"full-revisionid": "96068e14956985bb98c4f73a64dfcf124b3f7086",
15-
"version": "2.2.3+12.g96068e1"
14+
"full-revisionid": "1dd1190c2db1975140823836653618bcbfc812e8",
15+
"version": "2.2.3+13.g1dd1190.dirty"
1616
}
1717
''' # END VERSION_JSON
1818

build/lib/secapr/clean_reads.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,10 @@ def main(args):
258258
shared_stems = []
259259
for i in filenamelist:
260260
longest_common_substrings = [longest_common_substring(i,j) for j in filenamelist]
261-
longest_common_substrings_length = np.array([len(i) for i in longest_common_substrings])
262-
best_match = np.sort(longest_common_substrings_length)[-2] # pick the second longest, because longest is match with itself
263-
shared_file_stem = longest_common_substrings[np.where(longest_common_substrings_length==best_match)[0][0]]
261+
selected_longest_common_substrings = [i for i in longest_common_substrings if not any([i.endswith(ext) for ext in included_extenstions])]
262+
longest_common_substrings_length = np.array([len(i) for i in selected_longest_common_substrings])
263+
best_match = np.sort(longest_common_substrings_length)[-1] # pick the longest, because the match with itself is already filtered out in previous step
264+
shared_file_stem = selected_longest_common_substrings[np.where(longest_common_substrings_length==best_match)[0][0]]
264265
shared_stems.append(shared_file_stem)
265266
paired_filename_list = []
266267
for namestem in np.unique(shared_stems):

build/lib/secapr/reference_assembly.py

Lines changed: 90 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -190,47 +190,90 @@ def create_sample_reference_fasta(reference_folder,sample_id,alignments):
190190
return reference
191191

192192

193-
def mapping_bwa(forward,backward,reference,sample_id,sample_output_folder, args, log):
193+
def mapping_bwa(subfolder_path,reference,sample_id,sample_output_folder, args):
194+
log = os.path.join(sample_output_folder, 'log')
195+
if not os.path.exists(log):
196+
os.makedirs(log)
194197
#Indexing
195198
command1 = ["bwa","index",reference]
196199
#print(command1)
197200
bwa_out = os.path.join(log, "bwa_screen_out.txt")
198-
try:
199-
with open(bwa_out, 'w') as logfile:
200-
sp1 = subprocess.Popen(command1, shell=False, stderr = subprocess.STDOUT, stdout=logfile)
201-
sp1.wait()
202-
except:
203-
print(("Running bwa (%s) caused an error." %bwa))
204-
sys.exit()
205-
201+
with open(bwa_out, 'w') as logfile:
202+
sp1 = subprocess.Popen(command1, shell=False, stderr = subprocess.STDOUT, stdout=logfile)
203+
sp1.wait()
206204
#Mapping
207-
command2 = ["bwa","mem","-t",str(args.cores),"-k",str(args.k),"-w",str(args.w),"-d",str(args.d),"-r",str(args.r),"-c",str(args.c),"-A",str(args.a),"-B",str(args.b),"-O",str(args.o),"-E",str(args.e),"-L",str(args.l),"-U",str(args.u),"-M",reference,forward,backward]
208-
"""
209-
Copied from bwa manual (http://bio-bwa.sourceforge.net/bwa.shtml#3):
210-
-k INT Minimum seed length. Matches shorter than INT will be missed. The alignment speed is usually insensitive to this value unless it significantly deviates 20. [19]
211-
-w INT Band width. Essentially, gaps longer than INT will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100]
212-
-d INT Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between the best and the current extension score is above |i-j|*A+INT, where i and j are the current positions of the query and reference, respectively, and A is the matching score. Z-dropoff is similar to BLAST’s X-dropoff except that it doesn’t penalize gaps in one of the sequences in the alignment. Z-dropoff not only avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. [100]
213-
-r FLOAT Trigger re-seeding for a MEM longer than minSeedLen*FLOAT. This is a key heuristic parameter for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
214-
-c INT Discard a MEM if it has more than INT occurence in the genome. This is an insensitive parameter. [10000]
215-
-A INT Matching score. [1]
216-
-B INT Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
217-
-O INT Gap open penalty. [6]
218-
-E INT Gap extension penalty. A gap of length k costs O + k*E (i.e. -O is for opening a zero-length gap). [1]
219-
-L INT Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not deducted. [5]
220-
-U INT Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these two scores to determine whether we should force pairing. [9]
221-
-M Mark shorter split hits as secondary (for Picard compatibility).
222-
"""
223-
sam_name = os.path.join(sample_output_folder,'%s.sam'%sample_id)
224-
print ("Mapping...")
225-
#print(command2)
226-
with open(sam_name, 'w') as out, open(bwa_out, 'a') as err:
227-
sp2 = subprocess.Popen(command2, stderr = err, stdout=out)
228-
sp2.wait()
205+
read1_files = sorted(glob.glob(os.path.join(subfolder_path, '_clean-READ1.fastq.gz')))
206+
read2_files = sorted(glob.glob(os.path.join(subfolder_path, '_clean-READ2.fastq.gz')))
207+
samfiles = []
208+
for i, forward in enumerate(read1_files):
209+
backward = read2_files[i]
210+
command2 = ["bwa",
211+
"mem",
212+
"-t",
213+
str(args.cores),
214+
"-k",
215+
str(args.k),
216+
"-w",
217+
str(args.w),
218+
"-d",
219+
str(args.d),
220+
"-r",
221+
str(args.r),
222+
"-c",
223+
str(args.c),
224+
"-A",
225+
str(args.a),
226+
"-B",
227+
str(args.b),
228+
"-O",
229+
str(args.o),
230+
"-E",
231+
str(args.e),
232+
"-L",
233+
str(args.l),
234+
"-U",
235+
str(args.u),
236+
"-M",
237+
reference,
238+
forward,
239+
backward]
240+
"""
241+
Copied from bwa manual (http://bio-bwa.sourceforge.net/bwa.shtml#3):
242+
-k INT Minimum seed length. Matches shorter than INT will be missed. The alignment speed is usually insensitive to this value unless it significantly deviates 20. [19]
243+
-w INT Band width. Essentially, gaps longer than INT will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100]
244+
-d INT Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between the best and the current extension score is above |i-j|*A+INT, where i and j are the current positions of the query and reference, respectively, and A is the matching score. Z-dropoff is similar to BLAST’s X-dropoff except that it doesn’t penalize gaps in one of the sequences in the alignment. Z-dropoff not only avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. [100]
245+
-r FLOAT Trigger re-seeding for a MEM longer than minSeedLen*FLOAT. This is a key heuristic parameter for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
246+
-c INT Discard a MEM if it has more than INT occurence in the genome. This is an insensitive parameter. [10000]
247+
-A INT Matching score. [1]
248+
-B INT Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
249+
-O INT Gap open penalty. [6]
250+
-E INT Gap extension penalty. A gap of length k costs O + k*E (i.e. -O is for opening a zero-length gap). [1]
251+
-L INT Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not deducted. [5]
252+
-U INT Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these two scores to determine whether we should force pairing. [9]
253+
-M Mark shorter split hits as secondary (for Picard compatibility).
254+
"""
255+
sam_name = os.path.join(sample_output_folder, '%s_%i.sam' %(sample_id,i))
256+
print("Mapping...")
257+
# print(command2)
258+
with open(sam_name, 'w') as out, open(bwa_out, 'a') as err:
259+
sp2 = subprocess.Popen(command2, stderr=err, stdout=out)
260+
sp2.wait()
261+
samfiles.append(sam_name)
262+
if len(samfiles) > 1:
263+
# merge SAM files in case there were multiple fastq files per sample
264+
final_sam_name = os.path.join(sample_output_folder, '%s.sam' %sample_id)
265+
command_merge = ['samtools',
266+
'merge',
267+
final_sam_name]
268+
command_merge += samfiles
269+
sp3 = subprocess.Popen(command_merge, stderr=subprocess.PIPE)
270+
else:
271+
final_sam_name = samfiles[0]
229272

230273
#Converting to bam-format with samtools
231274
print ("Converting to bam...")
232275
raw_bam = os.path.join(sample_output_folder,"%s_raw.bam" %sample_id)
233-
command3 = ["samtools","view","-b","-o",raw_bam,"-S",sam_name]
276+
command3 = ["samtools","view","-b","-o",raw_bam,"-S",final_sam_name]
234277
#print(command3)
235278
sp3 = subprocess.Popen(command3,stderr=subprocess.PIPE)
236279
sp3.wait()
@@ -246,9 +289,9 @@ def mapping_bwa(forward,backward,reference,sample_id,sample_output_folder, args,
246289
#print(command5)
247290
sp5 = subprocess.Popen(command5)
248291
sp5.wait()
249-
292+
250293
#Remove some big and unnecessary intermediate files
251-
os.remove(sam_name)
294+
[os.remove(sam_name) for sam_name in samfiles]
252295
os.remove(raw_bam)
253296

254297
return sorted_bam
@@ -283,7 +326,6 @@ def mapping_clc(forward,backward,reference,sample_id,sample_output_folder):
283326

284327

285328
def clean_with_samtools(sample_output_folder,sample_id,sorted_bam,log):
286-
287329
samtools_out = "%s/%s_no_dupls_sorted.bam" %(sample_output_folder,sample_id)
288330
dupl_log = "%s/%s_dupls.log" %(log,sample_id)
289331
run_samtools_rmdup = [
@@ -779,34 +821,19 @@ def main(args):
779821
os.makedirs(tmp_folder)
780822
pickle_path = os.path.join(tmp_folder,'%s_reference.txt' %sample_id)
781823
np.savetxt(pickle_path,np.array([reference]),fmt='%s')
782-
# with open(pickle_path, 'wb') as handle:
783-
# pickle.dump(reference, handle, protocol=pickle.HIGHEST_PROTOCOL)
784-
785-
786-
# Loop through each sample-folder and find read-files
787-
read1_files = sorted(glob.glob(os.path.join(subfolder_path,'_clean-READ1.fastq.gz')))
788-
read2_files = sorted(glob.glob(os.path.join(subfolder_path, '_clean-READ2.fastq.gz')))
789-
for i,forward in enumerate(read1_files):
790-
backward = read2_files[i]
791-
792-
793-
# samtools merge joined.sam tpella5.sam tpella9.sam
794-
795-
796-
sorted_bam = ""
797-
log = os.path.join(sample_output_folder,'log')
798-
if not os.path.exists(log):
799-
os.makedirs(log)
800-
if mapper == "bwa":
801-
sorted_bam = mapping_bwa(forward,backward,reference,sample_id,sample_output_folder,args,log)
802-
if not args.keep_duplicates:
803-
sorted_bam, dupl_bam = clean_with_samtools(sample_output_folder,sample_id,sorted_bam,log)
804-
name_stem = '%s_bam_consensus' %sample_id
805-
bam_consensus_file = bam_consensus(reference,sorted_bam,name_stem,sample_output_folder,min_cov)
806-
if not args.keep_duplicates:
807-
dupl_output_folder = ('/').join(dupl_bam.split('/')[:-1])
808-
dupl_name_stem = '%s_with_duplicates_bam_consensus' %sample_id
809-
bam_consensus_with_duplicates = bam_consensus(reference,dupl_bam,dupl_name_stem,dupl_output_folder,min_cov)
824+
825+
sorted_bam = mapping_bwa(subfolder_path,reference,sample_id,sample_output_folder,args)
826+
827+
if not args.keep_duplicates:
828+
sorted_bam, dupl_bam = clean_with_samtools(sample_output_folder,sample_id,sorted_bam,log)
829+
name_stem = '%s_bam_consensus' %sample_id
830+
bam_consensus_file = bam_consensus(reference,sorted_bam,name_stem,sample_output_folder,min_cov)
831+
if not args.keep_duplicates:
832+
dupl_output_folder = ('/').join(dupl_bam.split('/')[:-1])
833+
dupl_name_stem = '%s_with_duplicates_bam_consensus' %sample_id
834+
bam_consensus_with_duplicates = bam_consensus(reference,dupl_bam,dupl_name_stem,dupl_output_folder,min_cov)
835+
836+
810837
join_fastas(out_dir,sample_out_list)
811838
# create file with read-coverage overview
812839
print(('\n'+"#" * 50))

recipe/install_secapr_env.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ conda install -y bwa
2222
conda install -y samtools==1.3.1
2323
conda install -y trimal
2424
conda install -y secapr
25-
#pip install https://github.com/AntonelliLab/seqcap_processor/archive/refs/tags/v2.2.3.tar.gz
25+
#pip install https://github.com/AntonelliLab/seqcap_processor/archive/refs/tags/v2.2.5.tar.gz

secapr.egg-info/PKG-INFO

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Metadata-Version: 2.1
22
Name: secapr
3-
Version: 2.2.3+12.g96068e1
3+
Version: 2.2.3+13.g1dd1190.dirty
44
Summary: Process sequence-capture fastq files into alignments for phylogenetic analyses
55
Home-page: https://github.com/AntonelliLab/seqcap_processor
66
Author: Tobias Andermann

secapr/clean_reads.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,10 @@ def main(args):
258258
shared_stems = []
259259
for i in filenamelist:
260260
longest_common_substrings = [longest_common_substring(i,j) for j in filenamelist]
261-
longest_common_substrings_length = np.array([len(i) for i in longest_common_substrings])
262-
best_match = np.sort(longest_common_substrings_length)[-2] # pick the second longest, because longest is match with itself
263-
shared_file_stem = longest_common_substrings[np.where(longest_common_substrings_length==best_match)[0][0]]
261+
selected_longest_common_substrings = [i for i in longest_common_substrings if not any([i.endswith(ext) for ext in included_extenstions])]
262+
longest_common_substrings_length = np.array([len(i) for i in selected_longest_common_substrings])
263+
best_match = np.sort(longest_common_substrings_length)[-1] # pick the longest, because the match with itself is already filtered out in previous step
264+
shared_file_stem = selected_longest_common_substrings[np.where(longest_common_substrings_length==best_match)[0][0]]
264265
shared_stems.append(shared_file_stem)
265266
paired_filename_list = []
266267
for namestem in np.unique(shared_stems):

0 commit comments

Comments (0)