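Update README citation, bump conda recipe to 1.1.11, and handle the optional FastQC Kmer_Content column in the QC plotting script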

Tobias Hofmann · Tobias Hofmann · commit 375c9b0538bb · 2018-07-17T12:54:22.000+02:00
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # <img src="images/secapr_logo.png" width="100"> SEquence CApture PRocessor (SECAPR)
 
-**Original Publication: https://doi.org/10.7287/peerj.preprints.26477v3**
+**Original Publication: https://doi.org/10.7717/peerj.5175**
 
 ___
 
@@ -23,6 +23,6 @@ Some functions in this pipeline are inspired by the scripts from the [Phyluce pi
 
 #### Please cite:
 
-**Andermann T**, Cano Á, Zizka A, Bacon C, Antonelli A. (2018) SECAPR - A bioinformatics pipeline for the rapid and user-friendly processing of Illumina sequences, from raw reads to alignments. PeerJ Preprints. doi: 10.7287/peerj.preprints.26477v3
+**Andermann T.**, Cano Á., Zizka A., Bacon C., Antonelli A. 2018. SECAPR — A bioinformatics pipeline for the rapid and user-friendly processing of targeted enriched Illumina sequences, from raw reads to alignments. PeerJ 6:e5175. DOI: 10.7717/peerj.5175.
 
 Faircloth BC. 2015. PHYLUCE is a software package for the analysis of conserved genomic loci. bioRxiv. doi: 10.1101/027904.
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "1.1.10" %}
+{% set version = "1.1.11" %}
 
 package:
   name: secapr
@@ -7,45 +7,46 @@ package:
 source:
   fn: secapr_{{ version }}.tar.gz
   url: https://github.com/AntonelliLab/seqcap_processor/archive/v{{ version }}.tar.gz
-  sha256: 175830fa23619a374719c12340504fb50f9b00d6136b49c2a925873ecda687d1
+  sha256: a14cea6d58154759da84c9626b17b6010626b907980e9e5bbde5ee9c2be5a3ca
 
 build:
   skip: True  # [not py27]
-  number: 0
-  script: $PYTHON setup.py install --single-version-externally-managed --record=record.txt
+  number: 2
+  script: python -m pip install --no-deps --ignore-installed .
   entry_points:
     - secapr = secapr.__main__:main
 
 requirements:
-  build:
+  host:
     - python
+    - pip
     - setuptools
     - biopython
     - pandas
     - r-base
 
   run:
     - python
-    - biopython ==1.68
-    - emboss ==6.5.7
+    - biopython
+    - emboss
     - bcftools ==1.8
-    - samtools ==0.1.19
-    - picard ==1.126
+    - samtools 0.1.19
+    - picard
     - seqtk >=1.0.82,<=1.2
     - bwa >=0.7
     - lastz
     - mafft >=7.2
-    - muscle ==3.8.31
-    - trimmomatic ==0.33
+    - muscle
+    - trimmomatic
     - abyss
     - trinity
-    - fastqc 0.11*
-    - pandas ==0.22.0
-    - numpy ==1.14
-    - cogent ==1.5.3
+    - fastqc
+    - pandas
+    - numpy
+    - cogent
     - r-base
     - r-tidyverse
-    
+
 test:
   imports:
     - secapr
@@ -56,4 +57,4 @@ about:
   home: 'https://github.com/AntonelliLab/seqcap_processor'
   license: MIT
   license_file: LICENSE
-  summary: 'Process sequence-capture FASTQ files into alignments for phylogenetic analyses. Integrates allele phasing, producing haplotype alignments.'
+  summary: 'Process sequence-capture FASTQ files into alignments for phylogenetic analyses. Integrates allele phasing, producing haplotype alignments.'
diff --git a/secapr/quality_check.py b/secapr/quality_check.py
@@ -57,7 +57,8 @@ def main(args):
         p.communicate()
 
     # write the r-plotting script to file
-    r_plotting_script = 'opt <- c()\nopt$input_folder = workdir\nopt$output_file =paste0(workdir, "/QC_plots.pdf")\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n  unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n  inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n                           "summary.txt", sep = "/"), delim = "\t")\n  out <- as_data_frame(t(inpu[, 1])) %>%\n    mutate(sample.id = names(inpu)[3])\n  names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n  unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n         recursive = T, force = T)\n  \n  return(out)\n})\n\noutp <- do.call("rbind.data.frame", fastqc_results)%>%\n  select(ID = sample_id,\n         PBQ = Per_base_sequence_quality,\n         PTQ = Per_tile_sequence_quality,\n         PSQ = Per_sequence_quality_scores,\n         PBC = Per_base_sequence_content,\n         SGC = Per_sequence_GC_content,\n         PBN = Per_base_N_content,\n         SLD = Sequence_Length_Distribution,\n         SDL = Sequence_Duplication_Levels,\n         ORS = Overrepresented_sequences,\n         AdC = Adapter_Content)\n\n#change table format\nret <- outp %>% \n  group_by(ID) %>%\n  gather(test, status, PBQ:AdC)\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n  geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n  theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n  geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n  scale_fill_discrete(name = "status")+\n  xlab("FastQC test")+\n  ylab("Samples")+\n  theme_bw()+\n  theme(\n    axis.text.y = element_blank()\n  )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", 
opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n  filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n  filter(!duplicated(ID)) %>%\n  select(ID)%>%\n  unlist() %>%\n  parse_number()%>%\n  unique() %>%\n  sort()'
+    r_plotting_script = 'opt <- c()\nopt$input_folder = workdir\nopt$output_file =paste0(workdir, "/QC_plots.pdf")\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n  unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n  inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n                           "summary.txt", sep = "/"), delim = "\t")\n  out <- as_data_frame(t(inpu[, 1])) %>%\n    mutate(sample.id = names(inpu)[3])\n  names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n  unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n         recursive = T, force = T)\n  \n  return(out)\n})\n\nn_cols = length(fastqc_results[[1]])\nif (n_cols==11){\n  outp <- do.call("rbind.data.frame", fastqc_results)%>%\n    select(ID = sample_id,\n           PBQ = Per_base_sequence_quality,\n           PTQ = Per_tile_sequence_quality,\n           PSQ = Per_sequence_quality_scores,\n           PBC = Per_base_sequence_content,\n           SGC = Per_sequence_GC_content,\n           PBN = Per_base_N_content,\n           SLD = Sequence_Length_Distribution,\n           SDL = Sequence_Duplication_Levels,\n           ORS = Overrepresented_sequences,\n           AdC = Adapter_Content)\n  #change table format\n  ret <- outp %>% \n    group_by(ID) %>%\n    gather(test, status, PBQ:AdC)\n\n}\nif (n_cols==12){\n  outp <- do.call("rbind.data.frame", fastqc_results)%>%\n    select(ID = sample_id,\n           PBQ = Per_base_sequence_quality,\n           PTQ = Per_tile_sequence_quality,\n           PSQ = Per_sequence_quality_scores,\n           PBC = Per_base_sequence_content,\n           SGC = Per_sequence_GC_content,\n           PBN = Per_base_N_content,\n           SLD = Sequence_Length_Distribution,\n           SDL = Sequence_Duplication_Levels,\n           ORS = Overrepresented_sequences,\n           AdC = 
Adapter_Content,\n           KmC = Kmer_Content)\n  #change table format\n  ret <- outp %>% \n    group_by(ID) %>%\n    gather(test, status, PBQ:KmC)\n}\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n  geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n  theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n  geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n  scale_fill_discrete(name = "status")+\n  xlab("FastQC test")+\n  ylab("Samples")+\n  theme_bw()+\n  theme(\n    axis.text.y = element_blank()\n  )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n  filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n  filter(!duplicated(ID)) %>%\n  select(ID)%>%\n  unlist() %>%\n  parse_number()%>%\n  unique() %>%\n  sort()'
+    
     add_to_script = 'library(tidyverse)\nworkdir = "%s"\n' %out_folder 
     new_r_plotting_script = add_to_script + r_plotting_script
     r_script_path = os.path.join(out_folder,'fastqc_visualization.r')