bug fix for clean_reads function

Tobias Hofmann · Tobias Hofmann · commit 4ada682e2e28 · 2018-05-17T18:39:48.000-04:00
diff --git a/docs/notebook/.ipynb_checkpoints/contig_assembly-checkpoint.ipynb b/docs/notebook/.ipynb_checkpoints/contig_assembly-checkpoint.ipynb
@@ -37,9 +37,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -119,7 +117,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.0"
+   "version": "3.6.4"
   }
  },
  "nbformat": 4,
diff --git a/docs/notebook/align_contigs.ipynb b/docs/notebook/align_contigs.ipynb
@@ -10,9 +10,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -113,9 +111,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -183,7 +179,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.0"
+   "version": "3.6.4"
   }
  },
  "nbformat": 4,
diff --git a/docs/notebook/extract_contigs.ipynb b/docs/notebook/extract_contigs.ipynb
@@ -30,9 +30,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -67,9 +65,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -131,9 +127,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -320,9 +314,7 @@
   {
    "cell_type": "code",
    "execution_count": 59,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -369,9 +361,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -429,7 +419,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.0"
+   "version": "3.6.4"
   }
  },
  "nbformat": 4,
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "1.1.7" %}
+{% set version = "1.1.8" %}
 
 package:
   name: secapr
@@ -7,7 +7,7 @@ package:
 source:
   fn: secapr_{{ version }}.tar.gz
   url: https://github.com/AntonelliLab/seqcap_processor/archive/v{{ version }}.tar.gz
-  sha256: 8c0b49798af310493236964455e5f5168270045be7fd7884feae3ed0994783ab
+  sha256: 88cb3ea4ddb5e77175aedba0419855274fc8a624e8c5a856e6e5d107ca9d3cbe
 
 build:
   skip: True  # [not py27]
@@ -33,16 +33,18 @@ requirements:
     - picard ==1.126
     - seqtk >=1.0.82,<=1.2
     - bwa >=0.7
-    - lastz ==1.0.2
+    - lastz
     - mafft >=7.2
     - muscle ==3.8.31
     - trimmomatic ==0.33
-    - abyss ==1.5.2
+    - abyss
+    - trinity
     - fastqc 0.11*
     - pandas ==0.22.0
     - numpy ==1.14
     - cogent ==1.5.3
     - r-base
+    - r-tidyverse
     
 test:
   imports:
diff --git a/secapr.egg-info/PKG-INFO b/secapr.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: secapr
-Version: 1.1.0+42.gac74d29.dirty
+Version: 1.1.0+45.ge31aab9.dirty
 Summary: Process sequence-capture fastq files into alignments for phylogenetic analyses
 Home-page: https://github.com/AntonelliLab/seqcap_processor
 Author: Tobias Hofmann
diff --git a/secapr/clean_reads.py b/secapr/clean_reads.py
@@ -166,6 +166,8 @@ def main(args):
         os.makedirs(out_dir)
     # Find samples for which both reads exist
     read_pairs = find_fastq_pairs(name_pattern, work_dir)
+    if len(read_pairs) ==0:
+        sys.exit('***SECAPR-ERROR: No FASTQ files were found. Check if correct path is provided for --input flag and if all FASTQ files are unzipped')
     # For each pair execute the quality_trim command (trimmomatic)
     #read_count_file = open("%s/read_count_overview.txt" %out_dir, "w")
     #countlog=csv.writer(read_count_file, delimiter='\t')
@@ -201,9 +203,9 @@ def main(args):
                 # Remove the delimiter after the sample name in case it is part of the key
                 if key.endswith(delimiter[0]):
                     clean_key = rchop(key,delimiter[0])
-                    stats_df = quality_trim(r1,r2,clean_key,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict)
+                    stats_df = quality_trim(r1,r2,clean_key,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict,cores)
                 else:
-                    stats_df = quality_trim(r1,r2,key,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict)
+                    stats_df = quality_trim(r1,r2,key,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict,cores)
     stats_df.to_csv(os.path.join(out_dir,'sample_stats.txt'),sep = '\t',index=False)
 
 def find_barcode(direction,sample_id,barcodes):
@@ -301,7 +303,7 @@ def find_fastq_pairs(name_pattern,work_dir):
 
     return rev_file_info
 
-def quality_trim(r1,r2,sample_id,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict):
+def quality_trim(r1,r2,sample_id,work_dir,out_dir,barcodes,conf,adapt_index,seed_mismatches,palindrome_clip_threshold,simple_clip_threshold,window_size,required_quality,leading,trailing,tail_crop,head_crop,min_length,stats_dict,cores):
     print ('#' * 50)
     print ("Processing %s...\n" %sample_id)
     # Forward and backward read file paths
diff --git a/secapr/quality_check.py b/secapr/quality_check.py
@@ -57,14 +57,16 @@ def main(args):
         p.communicate()
 
     # write the r-plotting script to file
-    r_plotting_script = 'if (!require("pacman")) install.packages("pacman",repos = "http://cran.us.r-project.org")\npacman::p_load(optparse,tidyverse)\n##optparser options\noption_list <- list(\n  make_option(c("-i", "--input_folder"), type="character", default=getwd(),\n              help="The path to thefolder with the fastqc results"),\n  make_option(c("-o", "--output_file"), type="character", default="QC_plots.pdf",\n              help="Give the name of the pdf file where the plots are to be saved."),\n  make_option(c("-p", "--print"), type="logical", default=TRUE,\n              help="Print sample ids of samples that failed QC.")\n  \n)\n\nopt <- parse_args(OptionParser(option_list=option_list))\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n  unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n  inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n                           "summary.txt", sep = "/"), delim = "\t")\n  out <- as_data_frame(t(inpu[, 1])) %>%\n    mutate(sample.id = names(inpu)[3])\n  names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n  unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n         recursive = T, force = T)\n  \n  return(out)\n})\n\noutp <- do.call("rbind.data.frame", fastqc_results)%>%\n  select(ID = sample_id,\n         PBQ = Per_base_sequence_quality,\n         PTQ = Per_tile_sequence_quality,\n         PSQ = Per_sequence_quality_scores,\n         PBC = Per_base_sequence_content,\n         SGC = Per_sequence_GC_content,\n         PBN = Per_base_N_content,\n         SLD = Sequence_Length_Distribution,\n         SDL = Sequence_Duplication_Levels,\n         ORS = Overrepresented_sequences,\n         AdC = Adapter_Content,\n         KmC = Kmer_Content)\n\n#change table format\nret <- outp %>% \n  group_by(ID) %>%\n  gather(test, status, PBQ:KmC)\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n  geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n  theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n  geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n  scale_fill_discrete(name = "status")+\n  xlab("FastQC test")+\n  ylab("Samples")+\n  theme_bw()+\n  theme(\n    axis.text.y = element_blank()\n  )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n  filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n  filter(!duplicated(ID)) %>%\n  select(ID)%>%\n  unlist() %>%\n  parse_number()%>%\n  unique() %>%\n  sort()\n\nif(opt$print){\n  write(sprintf("The following sample failed at least one test: %s \n", fail.samp), stdout())\n}\n'
+    r_plotting_script = 'opt <- c()\nopt$input_folder = workdir\nopt$output_file =paste0(workdir, "/QC_plots.pdf")\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n  unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n  inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n                           "summary.txt", sep = "/"), delim = "\t")\n  out <- as_data_frame(t(inpu[, 1])) %>%\n    mutate(sample.id = names(inpu)[3])\n  names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n  unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n         recursive = T, force = T)\n  \n  return(out)\n})\n\noutp <- do.call("rbind.data.frame", fastqc_results)%>%\n  select(ID = sample_id,\n         PBQ = Per_base_sequence_quality,\n         PTQ = Per_tile_sequence_quality,\n         PSQ = Per_sequence_quality_scores,\n         PBC = Per_base_sequence_content,\n         SGC = Per_sequence_GC_content,\n         PBN = Per_base_N_content,\n         SLD = Sequence_Length_Distribution,\n         SDL = Sequence_Duplication_Levels,\n         ORS = Overrepresented_sequences,\n         AdC = Adapter_Content)\n\n#change table format\nret <- outp %>% \n  group_by(ID) %>%\n  gather(test, status, PBQ:AdC)\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n  geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n  theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n  geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n  scale_fill_discrete(name = "status")+\n  xlab("FastQC test")+\n  ylab("Samples")+\n  theme_bw()+\n  theme(\n    axis.text.y = element_blank()\n  )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n  filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n  filter(!duplicated(ID)) %>%\n  select(ID)%>%\n  unlist() %>%\n  parse_number()%>%\n  unique() %>%\n  sort()'
+    add_to_script = 'library(tidyverse)\nworkdir = "%s"\n' %out_folder 
+    new_r_plotting_script = add_to_script + r_plotting_script
     r_script_path = os.path.join(out_folder,'fastqc_visualization.r')
     text_file = open(r_script_path, "w")
-    text_file.write(r_plotting_script)
+    text_file.write(new_r_plotting_script)
     text_file.close()
     
     # execute r-plotting script
-    print('Running R-code for plotting: This step can take several minutes when executed for the first time, since R needs to install all dependencies.')
+    print('Running R-code for plotting...')
     final_plot = os.path.join(out_folder,'quality_summary_all_samples.pdf')
     plotting_cmd = [
         'Rscript %s -i %s -o %s' %(r_script_path,out_folder,final_plot)

Original file line number	Diff line number	Diff line change
`@@ -37,9 +37,7 @@`
`37`	`37`	`{`
`38`	`38`	`"cell_type": "code",`
`39`	`39`	`"execution_count": 2,`
`40`		`- "metadata": {`
`41`		`- "collapsed": false`
`42`		`- },`
	`40`	`+ "metadata": {},`
`43`	`41`	`"outputs": [`
`44`	`42`	`{`
`45`	`43`	`"name": "stdout",`
`@@ -119,7 +117,7 @@`
`119`	`117`	`"name": "python",`
`120`	`118`	`"nbconvert_exporter": "python",`
`121`	`119`	`"pygments_lexer": "ipython3",`
`122`		`- "version": "3.6.0"`
	`120`	`+ "version": "3.6.4"`
`123`	`121`	`}`
`124`	`122`	`},`
`125`	`123`	`"nbformat": 4,`
Original file line number	Diff line number	Diff line change
`@@ -10,9 +10,7 @@`
`10`	`10`	`{`
`11`	`11`	`"cell_type": "code",`
`12`	`12`	`"execution_count": 3,`
`13`		`- "metadata": {`
`14`		`- "collapsed": false`
`15`		`- },`
	`13`	`+ "metadata": {},`
`16`	`14`	`"outputs": [`
`17`	`15`	`{`
`18`	`16`	`"name": "stdout",`
`@@ -113,9 +111,7 @@`
`113`	`111`	`{`
`114`	`112`	`"cell_type": "code",`
`115`	`113`	`"execution_count": 3,`
`116`		`- "metadata": {`
`117`		`- "collapsed": false`
`118`		`- },`
	`114`	`+ "metadata": {},`
`119`	`115`	`"outputs": [`
`120`	`116`	`{`
`121`	`117`	`"data": {`
`@@ -183,7 +179,7 @@`
`183`	`179`	`"name": "python",`
`184`	`180`	`"nbconvert_exporter": "python",`
`185`	`181`	`"pygments_lexer": "ipython3",`
`186`		`- "version": "3.6.0"`
	`182`	`+ "version": "3.6.4"`
`187`	`183`	`}`
`188`	`184`	`},`
`189`	`185`	`"nbformat": 4,`
Original file line number	Diff line number	Diff line change
`@@ -30,9 +30,7 @@`
`30`	`30`	`{`
`31`	`31`	`"cell_type": "code",`
`32`	`32`	`"execution_count": 3,`
`33`		`- "metadata": {`
`34`		`- "collapsed": false`
`35`		`- },`
	`33`	`+ "metadata": {},`
`36`	`34`	`"outputs": [`
`37`	`35`	`{`
`38`	`36`	`"name": "stdout",`
`@@ -67,9 +65,7 @@`
`67`	`65`	`{`
`68`	`66`	`"cell_type": "code",`
`69`	`67`	`"execution_count": 3,`
`70`		`- "metadata": {`
`71`		`- "collapsed": false`
`72`		`- },`
	`68`	`+ "metadata": {},`
`73`	`69`	`"outputs": [`
`74`	`70`	`{`
`75`	`71`	`"name": "stdout",`
`@@ -131,9 +127,7 @@`
`131`	`127`	`{`
`132`	`128`	`"cell_type": "code",`
`133`	`129`	`"execution_count": 1,`
`134`		`- "metadata": {`
`135`		`- "collapsed": false`
`136`		`- },`
	`130`	`+ "metadata": {},`
`137`	`131`	`"outputs": [`
`138`	`132`	`{`
`139`	`133`	`"data": {`
`@@ -320,9 +314,7 @@`
`320`	`314`	`{`
`321`	`315`	`"cell_type": "code",`
`322`	`316`	`"execution_count": 59,`
`323`		`- "metadata": {`
`324`		`- "collapsed": false`
`325`		`- },`
	`317`	`+ "metadata": {},`
`326`	`318`	`"outputs": [`
`327`	`319`	`{`
`328`	`320`	`"name": "stdout",`
`@@ -369,9 +361,7 @@`
`369`	`361`	`{`
`370`	`362`	`"cell_type": "code",`
`371`	`363`	`"execution_count": 1,`
`372`		`- "metadata": {`
`373`		`- "collapsed": false`
`374`		`- },`
	`364`	`+ "metadata": {},`
`375`	`365`	`"outputs": [`
`376`	`366`	`{`
`377`	`367`	`"data": {`
`@@ -429,7 +419,7 @@`
`429`	`419`	`"name": "python",`
`430`	`420`	`"nbconvert_exporter": "python",`
`431`	`421`	`"pygments_lexer": "ipython3",`
`432`		`- "version": "3.6.0"`
	`422`	`+ "version": "3.6.4"`
`433`	`423`	`}`
`434`	`424`	`},`
`435`	`425`	`"nbformat": 4,`