implemented python fastqc plot

Tobias Hofmann · Tobias Hofmann · commit 64814e5575be · 2019-02-26T13:08:58.000+01:00
diff --git a/documentation.ipynb b/documentation.ipynb
@@ -24,7 +24,7 @@
     "***\n",
     "\n",
     "## Installation & Setup\n",
-    "SeCaPr is available as a conda package on the bioconda channel. This makes installation very simple. Follow the instructions on this page to get the SECAPR pipeline set up and ready to use:\n",
+    "SECAPR is available as a conda package on the bioconda channel. This makes installation very simple. Follow the instructions on this page to get the SECAPR pipeline set up and ready to use:\n",
     "\n",
     "<div class=\"alert alert-block alert-info\">\n",
     "**INFO:** Commands in blue boxes have to be executed from a bash-command line terminal.\n",
@@ -106,15 +106,20 @@
     "<div class=\"alert alert-block alert-warning\">IMPORTANT : When you are using the SECAPR pipeline, make sure the secapr_env is activated. Activate with **source activate secapr_env**\n",
     "</div>\n",
     "\n",
-    "***"
+    "***\n",
+    "\n",
+    "### 5. Install SECAPR development version\n",
+    "\n",
+    "The development version of SECAPR is stored on this GitHub page and contains the newest updates, which might not yet be available through the conda version. However, you need to install the SECAPR environment with conda first by following the steps above. Once the environment is installed, you can update SECAPR to the development version by following these steps:\n",
+    "\n",
+    "1. Connect to your secapr environment (`source activate secapr_env`)\n",
+    "2. Remove the current secapr installation (`conda remove secapr`)\n",
+    "3. Download the new version from github (`wget https://github.com/AntonelliLab/seqcap_processor/archive/master.zip`)\n",
+    "4. Unzip the downloaded file (`unzip master.zip`)\n",
+    "5. Move the unzipped directory to a permanent location on your computer (i.e. not your Desktop or Downloads folder), since secapr will be executed from this path in the future\n",
+    "6. Enter the unzipped secapr directory (`cd seqcap_processor-master`)\n",
+    "7. Install secapr from the folder (`python -m pip install -e .`)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
@@ -1,12 +1,12 @@
-{% set version = "1.1.12" %}
+{% set version = "1.1.13" %}
 
 package:
   name: secapr
   version: {{ version }}
 
 source:
   url: https://github.com/AntonelliLab/seqcap_processor/archive/v{{ version }}.tar.gz
-  sha256: 0e6693940aaea1e43adc52aaf03581a9d8f448fc3771acdf9ca3fa3c8b7588eb
+  sha256: d4d90767c5ca1ba28906685b456f1b726b0386b84f4938c633f7ffeff8e6c08e
 
 build:
   skip: True  # [not py27] 
diff --git a/secapr/quality_check.py b/secapr/quality_check.py
@@ -14,6 +14,11 @@
 import subprocess
 import pandas as pd
 import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import colors
+import zipfile
+import collections
+
 
 def add_arguments(parser):
     parser.add_argument(
@@ -31,6 +36,80 @@ def add_arguments(parser):
         help='The output directory where quality-test results will be saved'
     )
 
+def get_test_results(fastqc_log_content):
+    """Return (test names, test results) read from the '>>' module header lines of a FastQC log."""
+    module_lines = [line for line in fastqc_log_content if line.startswith('>>') and line != '>>END_MODULE']
+    test_names = [line.split('\t')[0].replace('>>','') for line in module_lines]
+    return test_names, [line.split('\t')[-1] for line in module_lines]
+
+def plot_fastqc_results(fastqc_out_folder):
+    """Plot a pass/warn/fail overview of all FastQC reports below `fastqc_out_folder`.
+
+    Reads `fastqc_data.txt` from every `*.zip` archive found and saves into the folder
+    `quality_summary_all_samples_1.pdf` (sample x test heatmap) and `quality_summary_all_samples_2.pdf`
+    (pass/warn/fail counts per test). Prints a notice and returns early if no archive is found.
+    """
+    zip_files = [os.path.join(root, filename) for root, dirnames, filenames in os.walk(fastqc_out_folder)
+                 for filename in fnmatch.filter(filenames, '*.zip')]
+    if not zip_files:
+        print('No FastQC zip archives found in %s, skipping the summary plots.' % fastqc_out_folder)
+        return
+    status_codes = {'pass': 0, 'warn': 1, 'fail': 2}
+    sample_test_results_dict = {}
+    for file in zip_files:
+        # basename instead of split('/') so the path separator does not matter
+        sample_name = os.path.basename(file).replace('_fastqc.zip','')
+        with zipfile.ZipFile(file,'r') as archive:
+            target_file = [i for i in archive.namelist() if i.endswith('fastqc_data.txt')][0]
+            fastqc_log = archive.read(target_file)
+        # decode the bytes: str() of bytes is their repr on Python 3 and contains no real newlines
+        labels,results = get_test_results(fastqc_log.decode('utf-8').splitlines())
+        sample_test_results_dict[sample_name] = [status_codes.get(i, i) for i in results]
+
+    # abbreviate each test name to its word initials (names taken from the last archive read)
+    label_abbrevations = [''.join(word[0] for word in i.split()) for i in labels]
+    # plot the sample overview
+    ordered_dict = collections.OrderedDict(sorted(sample_test_results_dict.items()))
+    samples = list(ordered_dict.keys())
+    values = np.array(list(ordered_dict.values()))
+
+    fig = plt.figure(figsize=(8,len(samples)))
+    # fixed colour limits: otherwise imshow rescales to the statuses present and e.g. warn is drawn red
+    plt.imshow(values, interpolation='nearest', cmap=colors.ListedColormap(['green','yellow','red']), vmin=0, vmax=2)
+    plt.yticks(range(values.shape[0]), samples)
+    plt.xticks(range(values.shape[1]), label_abbrevations)
+    plt.xlabel('FastQC test (abbrevated names)')
+    plt.ylabel('Sample name')
+    plt.title('FastQC results by sample')
+    fig.savefig(os.path.join(fastqc_out_folder,'quality_summary_all_samples_1.pdf'), dpi = 500,transparent=True,bbox_inches='tight')
+    plt.close(fig)
+
+    # plot the test overview
+    all_pass_counts = [list(col).count(0) for col in values.T]
+    all_warn_counts = [list(col).count(1) for col in values.T]
+    all_fail_counts = [list(col).count(2) for col in values.T]
+
+    barWidth=0.3
+    r2 = np.arange(len(all_pass_counts))
+    r1 = [x - barWidth for x in r2]
+    r3 = [x + barWidth for x in r2]
+
+    fig = plt.figure(figsize=(8,len(samples)))
+    plt.bar(r1, all_pass_counts, color='green', width=barWidth, edgecolor='black', label='pass')
+    plt.bar(r2, all_warn_counts, color='yellow', width=barWidth, edgecolor='black', label='warn')
+    plt.bar(r3, all_fail_counts, color='red', width=barWidth, edgecolor='black', label='fail')
+    plt.xticks(range(values.shape[1]), label_abbrevations)
+    for border in np.array(r3)+0.66*barWidth:
+        plt.axvline(border,color='black',linestyle='--',alpha=0.5)
+    plt.yticks(range(len(samples)+1), range(len(samples)+1))
+    plt.xlim(0-barWidth-0.75*barWidth,)
+    plt.xlabel('FastQC test (abbrevated names)')
+    plt.ylabel('number of samples')
+    plt.title('FastQC results by test type')
+    fig.savefig(os.path.join(fastqc_out_folder,'quality_summary_all_samples_2.pdf'), dpi = 500,transparent=True,bbox_inches='tight')
+    plt.close(fig)
+
+
 def main(args):
     # Set working directory
     out_folder = args.output
@@ -56,22 +135,4 @@ def main(args):
         p = subprocess.Popen(fastqc_cmd, stdout=log_err_file, stderr=log_err_file, shell=True)
         p.communicate()
 
-    # write the r-plotting script to file
-    r_plotting_script = 'opt <- c()\nopt$input_folder = workdir\nopt$output_file =paste0(workdir, "/QC_plots.pdf")\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n  unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n  inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n                           "summary.txt", sep = "/"), delim = "\t")\n  out <- as_data_frame(t(inpu[, 1])) %>%\n    mutate(sample.id = names(inpu)[3])\n  names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n  unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n         recursive = T, force = T)\n  \n  return(out)\n})\n\nn_cols = length(fastqc_results[[1]])\nif (n_cols==11){\n  outp <- do.call("rbind.data.frame", fastqc_results)%>%\n    select(ID = sample_id,\n           PBQ = Per_base_sequence_quality,\n           PTQ = Per_tile_sequence_quality,\n           PSQ = Per_sequence_quality_scores,\n           PBC = Per_base_sequence_content,\n           SGC = Per_sequence_GC_content,\n           PBN = Per_base_N_content,\n           SLD = Sequence_Length_Distribution,\n           SDL = Sequence_Duplication_Levels,\n           ORS = Overrepresented_sequences,\n           AdC = Adapter_Content)\n  #change table format\n  ret <- outp %>% \n    group_by(ID) %>%\n    gather(test, status, PBQ:AdC)\n\n}\nif (n_cols==12){\n  outp <- do.call("rbind.data.frame", fastqc_results)%>%\n    select(ID = sample_id,\n           PBQ = Per_base_sequence_quality,\n           PTQ = Per_tile_sequence_quality,\n           PSQ = Per_sequence_quality_scores,\n           PBC = Per_base_sequence_content,\n           SGC = Per_sequence_GC_content,\n           PBN = Per_base_N_content,\n           SLD = Sequence_Length_Distribution,\n           SDL = Sequence_Duplication_Levels,\n           ORS = Overrepresented_sequences,\n           AdC = 
Adapter_Content,\n           KmC = Kmer_Content)\n  #change table format\n  ret <- outp %>% \n    group_by(ID) %>%\n    gather(test, status, PBQ:KmC)\n}\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n  geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n  theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n  geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n  scale_fill_discrete(name = "status")+\n  xlab("FastQC test")+\n  ylab("Samples")+\n  theme_bw()+\n  theme(\n    axis.text.y = element_blank()\n  )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n  filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n  filter(!duplicated(ID)) %>%\n  select(ID)%>%\n  unlist() %>%\n  parse_number()%>%\n  unique() %>%\n  sort()'
-    
-    add_to_script = 'library(tidyverse)\nworkdir = "%s"\n' %out_folder 
-    new_r_plotting_script = add_to_script + r_plotting_script
-    r_script_path = os.path.join(out_folder,'fastqc_visualization.r')
-    text_file = open(r_script_path, "w")
-    text_file.write(new_r_plotting_script)
-    text_file.close()
-    
-    # execute r-plotting script
-    print('Running R-code for plotting...')
-    final_plot = os.path.join(out_folder,'quality_summary_all_samples.pdf')
-    plotting_cmd = [
-        'Rscript %s -i %s -o %s' %(r_script_path,out_folder,final_plot)
-    ]
-    with open(os.path.join(out_folder, "r_plotting_screen_out.txt"), 'w') as log_err_file:
-        p = subprocess.Popen(plotting_cmd, stdout=log_err_file, stderr=log_err_file, shell=True)
-        p.communicate()
+    plot_fastqc_results(out_folder)
diff --git a/src/plot_quality_test_results.py b/src/plot_quality_test_results.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 26 09:39:51 2019
+
+@author: Tobias Andermann (tobias.andermann@bioenv.gu.se)
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import colors
+import glob
+import zipfile
+import collections
+
+def get_test_results(fastqc_log_content):
+    """Return (test names, test results) read from the '>>' module header lines of a FastQC log."""
+    module_lines = [line for line in fastqc_log_content if line.startswith('>>') and line != '>>END_MODULE']
+    test_names = [line.split('\t')[0].replace('>>','') for line in module_lines]
+    return test_names, [line.split('\t')[-1] for line in module_lines]
+
+
+# NOTE: the input and output paths are hard-coded for the author's machine; adjust them before running
+input_dir = '/Users/tobias/GitHub/seqcap_processor/data/raw/test_folder_quality_check/'
+zip_files = glob.glob('%s*.zip'%input_dir)
+if not zip_files:
+    # without archives `labels` would never be assigned and there is nothing to plot
+    raise SystemExit('No FastQC zip archives found in %s' % input_dir)
+status_codes = {'pass': 0, 'warn': 1, 'fail': 2}
+sample_test_results_dict = {}
+for file in zip_files:
+    sample_name = file.split('/')[-1].replace('_fastqc.zip','')
+    # the with-block closes the archive as soon as the report has been read
+    with zipfile.ZipFile(file,'r') as archive:
+        target_file = [i for i in archive.namelist() if i.endswith('fastqc_data.txt')][0]
+        fastqc_log = archive.read(target_file)
+    # decode the bytes instead of parsing their str() repr, so real line breaks are what gets split
+    labels,results = get_test_results(fastqc_log.decode('utf-8').splitlines())
+    # statuses other than pass/warn/fail are stored unchanged
+    sample_test_results_dict[sample_name] = [status_codes.get(i, i) for i in results]
+
+# abbreviate each test name to its word initials (names taken from the last archive read)
+label_abbrevations = [''.join(word[0] for word in i.split()) for i in labels]
+
+
+# plot the sample overview
+ordered_dict = collections.OrderedDict(sorted(sample_test_results_dict.items()))
+samples = list(ordered_dict.keys())
+values = np.array(list(ordered_dict.values()))
+
+fig = plt.figure(figsize=(8,len(samples)))
+# fixed colour limits: otherwise imshow rescales to the statuses present and e.g. warn is drawn red
+plt.imshow(values, interpolation='nearest', cmap=colors.ListedColormap(['green','yellow','red']), vmin=0, vmax=2)
+plt.yticks(range(values.shape[0]), samples)
+plt.xticks(range(values.shape[1]), label_abbrevations)
+plt.xlabel('FastQC test (abbrevated names)')
+plt.ylabel('Sample name')
+plt.title('FastQC results by sample')
+fig.savefig('/Users/tobias/Desktop/test1.pdf', dpi = 500,transparent=True)#bbox_inches='tight',
+
+# plot the test overview
+all_pass_counts = [list(col).count(0) for col in values.T]
+all_warn_counts = [list(col).count(1) for col in values.T]
+all_fail_counts = [list(col).count(2) for col in values.T]
+
+barWidth=0.3
+# three bars per test: pass left of the tick position, warn on it, fail right of it
+r2 = np.arange(len(all_pass_counts))
+r1 = [x - barWidth for x in r2]
+r3 = [x + barWidth for x in r2]
+
+fig = plt.figure(figsize=(8,len(samples)))
+plt.bar(r1, all_pass_counts, color='green', width=barWidth, edgecolor='black', label='pass')
+plt.bar(r2, all_warn_counts, color='yellow', width=barWidth, edgecolor='black', label='warn')
+plt.bar(r3, all_fail_counts, color='red', width=barWidth, edgecolor='black', label='fail')
+plt.xticks(range(values.shape[1]), label_abbrevations)
+# dashed vertical line to the right of each test's group of bars
+for border in np.array(r3)+0.66*barWidth:
+    plt.axvline(border,color='black',linestyle='--',alpha=0.5)
+plt.yticks(range(len(samples)+1), range(len(samples)+1))
+plt.xlim(0-barWidth-0.75*barWidth,)
+plt.xlabel('FastQC test (abbrevated names)')
+plt.ylabel('number of samples')
+plt.title('FastQC results by test type')
+plt.legend()
+fig.savefig('/Users/tobias/Desktop/test.pdf', dpi = 500,transparent=True)#bbox_inches='tight',
+
+
+
+
+