Skip to content

Commit 4ada682

Browse files
author
Tobias Hofmann
committed
bug fix for clean_reads function
1 parent e31aab9 commit 4ada682

File tree

7 files changed

+28
-38
lines changed

7 files changed

+28
-38
lines changed

docs/notebook/.ipynb_checkpoints/contig_assembly-checkpoint.ipynb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,7 @@
3737
{
3838
"cell_type": "code",
3939
"execution_count": 2,
40-
"metadata": {
41-
"collapsed": false
42-
},
40+
"metadata": {},
4341
"outputs": [
4442
{
4543
"name": "stdout",
@@ -119,7 +117,7 @@
119117
"name": "python",
120118
"nbconvert_exporter": "python",
121119
"pygments_lexer": "ipython3",
122-
"version": "3.6.0"
120+
"version": "3.6.4"
123121
}
124122
},
125123
"nbformat": 4,

docs/notebook/align_contigs.ipynb

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@
1010
{
1111
"cell_type": "code",
1212
"execution_count": 3,
13-
"metadata": {
14-
"collapsed": false
15-
},
13+
"metadata": {},
1614
"outputs": [
1715
{
1816
"name": "stdout",
@@ -113,9 +111,7 @@
113111
{
114112
"cell_type": "code",
115113
"execution_count": 3,
116-
"metadata": {
117-
"collapsed": false
118-
},
114+
"metadata": {},
119115
"outputs": [
120116
{
121117
"data": {
@@ -183,7 +179,7 @@
183179
"name": "python",
184180
"nbconvert_exporter": "python",
185181
"pygments_lexer": "ipython3",
186-
"version": "3.6.0"
182+
"version": "3.6.4"
187183
}
188184
},
189185
"nbformat": 4,

docs/notebook/extract_contigs.ipynb

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,7 @@
3030
{
3131
"cell_type": "code",
3232
"execution_count": 3,
33-
"metadata": {
34-
"collapsed": false
35-
},
33+
"metadata": {},
3634
"outputs": [
3735
{
3836
"name": "stdout",
@@ -67,9 +65,7 @@
6765
{
6866
"cell_type": "code",
6967
"execution_count": 3,
70-
"metadata": {
71-
"collapsed": false
72-
},
68+
"metadata": {},
7369
"outputs": [
7470
{
7571
"name": "stdout",
@@ -131,9 +127,7 @@
131127
{
132128
"cell_type": "code",
133129
"execution_count": 1,
134-
"metadata": {
135-
"collapsed": false
136-
},
130+
"metadata": {},
137131
"outputs": [
138132
{
139133
"data": {
@@ -320,9 +314,7 @@
320314
{
321315
"cell_type": "code",
322316
"execution_count": 59,
323-
"metadata": {
324-
"collapsed": false
325-
},
317+
"metadata": {},
326318
"outputs": [
327319
{
328320
"name": "stdout",
@@ -369,9 +361,7 @@
369361
{
370362
"cell_type": "code",
371363
"execution_count": 1,
372-
"metadata": {
373-
"collapsed": false
374-
},
364+
"metadata": {},
375365
"outputs": [
376366
{
377367
"data": {
@@ -429,7 +419,7 @@
429419
"name": "python",
430420
"nbconvert_exporter": "python",
431421
"pygments_lexer": "ipython3",
432-
"version": "3.6.0"
422+
"version": "3.6.4"
433423
}
434424
},
435425
"nbformat": 4,

recipe/meta.yaml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{% set version = "1.1.7" %}
1+
{% set version = "1.1.8" %}
22

33
package:
44
name: secapr
@@ -7,7 +7,7 @@ package:
77
source:
88
fn: secapr_{{ version }}.tar.gz
99
url: https://github.com/AntonelliLab/seqcap_processor/archive/v{{ version }}.tar.gz
10-
sha256: 8c0b49798af310493236964455e5f5168270045be7fd7884feae3ed0994783ab
10+
sha256: 88cb3ea4ddb5e77175aedba0419855274fc8a624e8c5a856e6e5d107ca9d3cbe
1111

1212
build:
1313
skip: True # [not py27]
@@ -33,16 +33,18 @@ requirements:
3333
- picard ==1.126
3434
- seqtk >=1.0.82,<=1.2
3535
- bwa >=0.7
36-
- lastz ==1.0.2
36+
- lastz
3737
- mafft >=7.2
3838
- muscle ==3.8.31
3939
- trimmomatic ==0.33
40-
- abyss ==1.5.2
40+
- abyss
41+
- trinity
4142
- fastqc 0.11*
4243
- pandas ==0.22.0
4344
- numpy ==1.14
4445
- cogent ==1.5.3
4546
- r-base
47+
- r-tidyverse
4648

4749
test:
4850
imports:

secapr.egg-info/PKG-INFO

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Metadata-Version: 1.1
22
Name: secapr
3-
Version: 1.1.0+42.gac74d29.dirty
3+
Version: 1.1.0+45.ge31aab9.dirty
44
Summary: Process sequence-capture fastq files into alignments for phylogenetic analyses
55
Home-page: https://github.com/AntonelliLab/seqcap_processor
66
Author: Tobias Hofmann

secapr/clean_reads.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ def main(args):
166166
os.makedirs(out_dir)
167167
# Find samples for which both reads exist
168168
read_pairs = find_fastq_pairs(name_pattern, work_dir)
169+
if len(read_pairs) ==0:
170+
sys.exit('***SECAPR-ERROR: No FASTQ files were found. Check if correct path is provided for --input flag and if all FASTQ files are unzipped')
169171
# For each pair execute the quality_trim command (trimmomatic)
170172
#read_count_file = open("%s/read_count_overview.txt" %out_dir, "w")
171173
#countlog=csv.writer(read_count_file, delimiter='\t')
@@ -201,9 +203,9 @@ def main(args):
201203
# Remove the delimiter after the sample name in case it is part of the key
202204
if key.endswith(delimiter[0]):
203205
clean_key = rchop(key,delimiter[0])
204-
stats_df = quality_trim(r1,r2,clean_key,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict)
206+
stats_df = quality_trim(r1,r2,clean_key,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict,cores)
205207
else:
206-
stats_df = quality_trim(r1,r2,key,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict)
208+
stats_df = quality_trim(r1,r2,key,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict,cores)
207209
stats_df.to_csv(os.path.join(out_dir,'sample_stats.txt'),sep = '\t',index=False)
208210

209211
def find_barcode(direction,sample_id,barcodes):
@@ -301,7 +303,7 @@ def find_fastq_pairs(name_pattern,work_dir):
301303

302304
return rev_file_info
303305

304-
def quality_trim(r1,r2,sample_id,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict):
306+
def quality_trim(r1,r2,sample_id,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict,cores):
305307
print ('#' * 50)
306308
print ("Processing %s...\n" %sample_id)
307309
# Forward and backward read file paths

secapr/quality_check.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,16 @@ def main(args):
5757
p.communicate()
5858

5959
# write the r-plotting script to file
60-
r_plotting_script = 'if (!require("pacman")) install.packages("pacman",repos = "http://cran.us.r-project.org")\npacman::p_load(optparse,tidyverse)\n##optparser options\noption_list <- list(\n make_option(c("-i", "--input_folder"), type="character", default=getwd(),\n help="The path to thefolder with the fastqc results"),\n make_option(c("-o", "--output_file"), type="character", default="QC_plots.pdf",\n help="Give the name of the pdf file where the plots are to be saved."),\n make_option(c("-p", "--print"), type="logical", default=TRUE,\n help="Print sample ids of samples that failed QC.")\n \n)\n\nopt <- parse_args(OptionParser(option_list=option_list))\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n "summary.txt", sep = "/"), delim = "\t")\n out <- as_data_frame(t(inpu[, 1])) %>%\n mutate(sample.id = names(inpu)[3])\n names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n recursive = T, force = T)\n \n return(out)\n})\n\noutp <- do.call("rbind.data.frame", fastqc_results)%>%\n select(ID = sample_id,\n PBQ = Per_base_sequence_quality,\n PTQ = Per_tile_sequence_quality,\n PSQ = Per_sequence_quality_scores,\n PBC = Per_base_sequence_content,\n SGC = Per_sequence_GC_content,\n PBN = Per_base_N_content,\n SLD = Sequence_Length_Distribution,\n SDL = Sequence_Duplication_Levels,\n ORS = Overrepresented_sequences,\n AdC = Adapter_Content,\n KmC = Kmer_Content)\n\n#change table format\nret <- outp %>% \n group_by(ID) %>%\n gather(test, status, PBQ:KmC)\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n scale_fill_discrete(name = "status")+\n xlab("FastQC test")+\n ylab("Samples")+\n theme_bw()+\n theme(\n axis.text.y = element_blank()\n )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n filter(!duplicated(ID)) %>%\n select(ID)%>%\n unlist() %>%\n parse_number()%>%\n unique() %>%\n sort()\n\nif(opt$print){\n write(sprintf("The following sample failed at least one test: %s \n", fail.samp), stdout())\n}\n'
60+
r_plotting_script = 'opt <- c()\nopt$input_folder = workdir\nopt$output_file =paste0(workdir, "/QC_plots.pdf")\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n "summary.txt", sep = "/"), delim = "\t")\n out <- as_data_frame(t(inpu[, 1])) %>%\n mutate(sample.id = names(inpu)[3])\n names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n recursive = T, force = T)\n \n return(out)\n})\n\noutp <- do.call("rbind.data.frame", fastqc_results)%>%\n select(ID = sample_id,\n PBQ = Per_base_sequence_quality,\n PTQ = Per_tile_sequence_quality,\n PSQ = Per_sequence_quality_scores,\n PBC = Per_base_sequence_content,\n SGC = Per_sequence_GC_content,\n PBN = Per_base_N_content,\n SLD = Sequence_Length_Distribution,\n SDL = Sequence_Duplication_Levels,\n ORS = Overrepresented_sequences,\n AdC = Adapter_Content)\n\n#change table format\nret <- outp %>% \n group_by(ID) %>%\n gather(test, status, PBQ:AdC)\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n scale_fill_discrete(name = "status")+\n xlab("FastQC test")+\n ylab("Samples")+\n theme_bw()+\n theme(\n axis.text.y = element_blank()\n )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n filter(!duplicated(ID)) %>%\n select(ID)%>%\n unlist() %>%\n parse_number()%>%\n unique() %>%\n sort()'
61+
add_to_script = 'library(tidyverse)\nworkdir = "%s"\n' %out_folder
62+
new_r_plotting_script = add_to_script + r_plotting_script
6163
r_script_path = os.path.join(out_folder,'fastqc_visualization.r')
6264
text_file = open(r_script_path, "w")
63-
text_file.write(r_plotting_script)
65+
text_file.write(new_r_plotting_script)
6466
text_file.close()
6567

6668
# execute r-plotting script
67-
print('Running R-code for plotting: This step can take several minutes when executed for the first time, since R needs to install all dependencies.')
69+
print('Running R-code for plotting...')
6870
final_plot = os.path.join(out_folder,'quality_summary_all_samples.pdf')
6971
plotting_cmd = [
7072
'Rscript %s -i %s -o %s' %(r_script_path,out_folder,final_plot)

0 commit comments

Comments
 (0)