Skip to content

Commit 64814e5

Browse files
author
Tobias Hofmann
committed
implemented python fastqc plot
1 parent 3e68921 commit 64814e5

File tree

4 files changed

+187
-30
lines changed

4 files changed

+187
-30
lines changed

documentation.ipynb

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"***\n",
2525
"\n",
2626
"## Installation & Setup\n",
27-
"SeCaPr is available as a conda package on the bioconda channel. This makes installation very simple. Follow the instructions on this page to get the SECAPR pipeline set up and ready to use:\n",
27+
"SECAPR is available as a conda package on the bioconda channel. This makes installation very simple. Follow the instructions on this page to get the SECAPR pipeline set up and ready to use:\n",
2828
"\n",
2929
"<div class=\"alert alert-block alert-info\">\n",
3030
"**INFO:** Commands in blue boxes have to be executed from a bash-command line terminal.\n",
@@ -106,15 +106,20 @@
106106
"<div class=\"alert alert-block alert-warning\">IMPORTANT : When you are using the SECAPR pipeline, make sure the secapr_env is activated. Activate with **source activate secapr_env**\n",
107107
"</div>\n",
108108
"\n",
109-
"***"
109+
"***\n",
110+
"\n",
111+
"### 5. Install SECAPR development version\n",
112+
"\n",
113+
"The development version of SECAPR is stored on this GitHub page and contains the newest updates, which might not yet be available through the conda version. However you need to install the SECAPR environment with conda first by following the steps above. Once the environment is installed, you can update SECAPR to the development version by following these steps:\n",
114+
"\n",
115+
"1. Connect to your secapr environment (`source activate secapr_env`)\n",
116+
"2. Remove the current secapr installation (`conda remove secapr`)\n",
117+
"3. Download the new version from github (`wget https://github.com/AntonelliLab/seqcap_processor/archive/master.zip`)\n",
118+
"4. Unzip the downloaded file (`unzip master.zip`)\n",
119+
"5. Move the unzipped directory to a safe location on your computer, i.e. not on your Desktop or Download folder, since this will be the path where secapr will be executed from in the future\n",
120+
"6. Enter the unzipped secapr directory (`cd seqcap_processor-master`)\n",
121+
"7. Install secapr from the folder (`python -m pip install -e .`)"
110122
]
111-
},
112-
{
113-
"cell_type": "code",
114-
"execution_count": null,
115-
"metadata": {},
116-
"outputs": [],
117-
"source": []
118123
}
119124
],
120125
"metadata": {

recipe/meta.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
{% set version = "1.1.12" %}
1+
{% set version = "1.1.13" %}
22

33
package:
44
name: secapr
55
version: {{ version }}
66

77
source:
88
url: https://github.com/AntonelliLab/seqcap_processor/archive/v{{ version }}.tar.gz
9-
sha256: 0e6693940aaea1e43adc52aaf03581a9d8f448fc3771acdf9ca3fa3c8b7588eb
9+
sha256: d4d90767c5ca1ba28906685b456f1b726b0386b84f4938c633f7ffeff8e6c08e
1010

1111
build:
1212
skip: True # [not py27]

secapr/quality_check.py

Lines changed: 80 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
import subprocess
1515
import pandas as pd
1616
import numpy as np
17+
import matplotlib.pyplot as plt
18+
from matplotlib import colors
19+
import zipfile
20+
import collections
21+
1722

1823
def add_arguments(parser):
1924
parser.add_argument(
@@ -31,6 +36,80 @@ def add_arguments(parser):
3136
help='The output directory where quality-test results will be saved'
3237
)
3338

39+
def get_test_results(fastqc_log_content):
40+
test_results = [i for i in fastqc_log_content if i.startswith('>>')]
41+
test_names = [string.split('\t')[0].replace('>>','') for string in test_results if not string == '>>END_MODULE']
42+
test_results = [string.split('\t')[-1] for string in test_results if not string == '>>END_MODULE']
43+
return test_names,test_results
44+
45+
def plot_fastqc_results(fastqc_out_folder):
46+
zip_files = []
47+
for root, dirnames, filenames in os.walk(fastqc_out_folder):
48+
for filename in fnmatch.filter(filenames, '*.zip'):
49+
zip_files.append(os.path.join(root, filename))
50+
sample_test_results_dict = {}
51+
for file in zip_files:
52+
sample_name = file.split('/')[-1].replace('_fastqc.zip','')
53+
archive = zipfile.ZipFile(file,'r')
54+
target_file = [i for i in archive.namelist() if i.endswith('fastqc_data.txt')][0]
55+
fastqc_log = archive.read(target_file)
56+
fastqc_log_formatted = str(fastqc_log).replace('\\t','\t').split('\n')
57+
labels,results = get_test_results(fastqc_log_formatted)
58+
#print(results)
59+
num_results = [0 if i == 'pass' else i for i in results]
60+
num_results = [1 if i == 'warn' else i for i in num_results]
61+
num_results = [2 if i == 'fail' else i for i in num_results]
62+
sample_test_results_dict[sample_name] = num_results
63+
64+
label_abbrevations = []
65+
for i in labels:
66+
split_string = i.split(' ')
67+
abbrevation = []
68+
for j in split_string:
69+
letter = j[0]
70+
abbrevation.append(letter)
71+
abbrevation = ''.join(abbrevation)
72+
label_abbrevations.append(abbrevation)
73+
# plot the sample overview
74+
ordered_dict = collections.OrderedDict(sorted(sample_test_results_dict.items()))
75+
samples = list(ordered_dict.keys())
76+
values = np.array(list(ordered_dict.values()))
77+
78+
fig = plt.figure(figsize=(8,len(samples)))
79+
plt.imshow(values, interpolation='nearest', cmap=colors.ListedColormap(['green','yellow','red']))
80+
plt.yticks(range(values.shape[0]), samples)
81+
plt.xticks(range(values.shape[1]), label_abbrevations)
82+
plt.xlabel('FastQC test (abbrevated names)')
83+
plt.ylabel('Sample name')
84+
plt.title('FastQC results by sample')
85+
fig.savefig(os.path.join(fastqc_out_folder,'quality_summary_all_samples_1.pdf'), dpi = 500,transparent=True,bbox_inches='tight')
86+
87+
# plot the test overview
88+
all_pass_counts = [list(col).count(0) for col in values.T]
89+
all_warn_counts = [list(col).count(1) for col in values.T]
90+
all_fail_counts = [list(col).count(2) for col in values.T]
91+
92+
barWidth=0.3
93+
r2 = np.arange(len(all_pass_counts))
94+
r1 = [x - barWidth for x in r2]
95+
r3 = [x + barWidth for x in r2]
96+
97+
fig = plt.figure(figsize=(8,len(samples)))
98+
plt.bar(r1, all_pass_counts, color='green', width=barWidth, edgecolor='black', label='pass')
99+
plt.bar(r2, all_warn_counts, color='yellow', width=barWidth, edgecolor='black', label='warn')
100+
plt.bar(r3, all_fail_counts, color='red', width=barWidth, edgecolor='black', label='fail')
101+
plt.xticks(range(values.shape[1]), label_abbrevations)
102+
for border in np.array(r3)+0.66*barWidth:
103+
plt.axvline(border,color='black',linestyle='--',alpha=0.5)
104+
plt.yticks(range(len(samples)+1), range(len(samples)+1))
105+
plt.xlim(0-barWidth-0.75*barWidth,)
106+
plt.xlabel('FastQC test (abbrevated names)')
107+
plt.ylabel('number of samples')
108+
plt.title('FastQC results by test type')
109+
#plt.legend()
110+
fig.savefig(os.path.join(fastqc_out_folder,'quality_summary_all_samples_2.pdf'), dpi = 500,transparent=True,bbox_inches='tight')
111+
112+
34113
def main(args):
35114
# Set working directory
36115
out_folder = args.output
@@ -56,22 +135,4 @@ def main(args):
56135
p = subprocess.Popen(fastqc_cmd, stdout=log_err_file, stderr=log_err_file, shell=True)
57136
p.communicate()
58137

59-
# write the r-plotting script to file
60-
r_plotting_script = 'opt <- c()\nopt$input_folder = workdir\nopt$output_file =paste0(workdir, "/QC_plots.pdf")\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n "summary.txt", sep = "/"), delim = "\t")\n out <- as_data_frame(t(inpu[, 1])) %>%\n mutate(sample.id = names(inpu)[3])\n names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n recursive = T, force = T)\n \n return(out)\n})\n\nn_cols = length(fastqc_results[[1]])\nif (n_cols==11){\n outp <- do.call("rbind.data.frame", fastqc_results)%>%\n select(ID = sample_id,\n PBQ = Per_base_sequence_quality,\n PTQ = Per_tile_sequence_quality,\n PSQ = Per_sequence_quality_scores,\n PBC = Per_base_sequence_content,\n SGC = Per_sequence_GC_content,\n PBN = Per_base_N_content,\n SLD = Sequence_Length_Distribution,\n SDL = Sequence_Duplication_Levels,\n ORS = Overrepresented_sequences,\n AdC = Adapter_Content)\n #change table format\n ret <- outp %>% \n group_by(ID) %>%\n gather(test, status, PBQ:AdC)\n\n}\nif (n_cols==12){\n outp <- do.call("rbind.data.frame", fastqc_results)%>%\n select(ID = sample_id,\n PBQ = Per_base_sequence_quality,\n PTQ = Per_tile_sequence_quality,\n PSQ = Per_sequence_quality_scores,\n PBC = Per_base_sequence_content,\n SGC = Per_sequence_GC_content,\n PBN = Per_base_N_content,\n SLD = Sequence_Length_Distribution,\n SDL = Sequence_Duplication_Levels,\n ORS = Overrepresented_sequences,\n AdC = Adapter_Content,\n KmC = Kmer_Content)\n #change table format\n ret <- outp %>% \n group_by(ID) %>%\n gather(test, status, PBQ:KmC)\n}\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n scale_fill_discrete(name = "status")+\n xlab("FastQC test")+\n ylab("Samples")+\n theme_bw()+\n theme(\n axis.text.y = element_blank()\n )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n filter(!duplicated(ID)) %>%\n select(ID)%>%\n unlist() %>%\n parse_number()%>%\n unique() %>%\n sort()'
61-
62-
add_to_script = 'library(tidyverse)\nworkdir = "%s"\n' %out_folder
63-
new_r_plotting_script = add_to_script + r_plotting_script
64-
r_script_path = os.path.join(out_folder,'fastqc_visualization.r')
65-
text_file = open(r_script_path, "w")
66-
text_file.write(new_r_plotting_script)
67-
text_file.close()
68-
69-
# execute r-plotting script
70-
print('Running R-code for plotting...')
71-
final_plot = os.path.join(out_folder,'quality_summary_all_samples.pdf')
72-
plotting_cmd = [
73-
'Rscript %s -i %s -o %s' %(r_script_path,out_folder,final_plot)
74-
]
75-
with open(os.path.join(out_folder, "r_plotting_screen_out.txt"), 'w') as log_err_file:
76-
p = subprocess.Popen(plotting_cmd, stdout=log_err_file, stderr=log_err_file, shell=True)
77-
p.communicate()
138+
plot_fastqc_results(out_folder)

src/plot_quality_test_results.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Tue Feb 26 09:39:51 2019
5+
6+
@author: Tobias Andermann ([email protected])
7+
"""
8+
9+
import numpy as np
10+
import matplotlib.pyplot as plt
11+
from matplotlib import colors
12+
import glob
13+
import zipfile
14+
import collections
15+
16+
def get_test_results(fastqc_log_content):
17+
test_results = [i for i in fastqc_log_content if i.startswith('>>')]
18+
test_names = [string.split('\t')[0].replace('>>','') for string in test_results if not string == '>>END_MODULE']
19+
test_results = [string.split('\t')[-1] for string in test_results if not string == '>>END_MODULE']
20+
return test_names,test_results
21+
22+
23+
input_dir = '/Users/tobias/GitHub/seqcap_processor/data/raw/test_folder_quality_check/'
24+
zip_files = glob.glob('%s*.zip'%input_dir)
25+
sample_test_results_dict = {}
26+
for file in zip_files:
27+
sample_name = file.split('/')[-1].replace('_fastqc.zip','')
28+
archive = zipfile.ZipFile(file,'r')
29+
target_file = [i for i in archive.namelist() if i.endswith('fastqc_data.txt')][0]
30+
fastqc_log = archive.read(target_file)
31+
fastqc_log_formatted = str(fastqc_log).replace('\\t','\t').split('\\n')
32+
labels,results = get_test_results(fastqc_log_formatted)
33+
num_results = [0 if i == 'pass' else i for i in results]
34+
num_results = [1 if i == 'warn' else i for i in num_results]
35+
num_results = [2 if i == 'fail' else i for i in num_results]
36+
sample_test_results_dict[sample_name] = num_results
37+
38+
label_abbrevations = []
39+
for i in labels:
40+
split_string = i.split(' ')
41+
abbrevation = []
42+
for j in split_string:
43+
letter = j[0]
44+
abbrevation.append(letter)
45+
abbrevation = ''.join(abbrevation)
46+
label_abbrevations.append(abbrevation)
47+
48+
49+
# plot the sampel overview
50+
ordered_dict = collections.OrderedDict(sorted(sample_test_results_dict.items()))
51+
samples = list(ordered_dict.keys())
52+
values = np.array(list(ordered_dict.values()))
53+
54+
fig = plt.figure(figsize=(8,len(samples)))
55+
plt.imshow(values, interpolation='nearest', cmap=colors.ListedColormap(['green','yellow','red']))
56+
plt.yticks(range(values.shape[0]), samples)
57+
plt.xticks(range(values.shape[1]), label_abbrevations)
58+
plt.xlabel('FastQC test (abbrevated names)')
59+
plt.ylabel('Sample name')
60+
plt.title('FastQC results by sample')
61+
fig.savefig('/Users/tobias/Desktop/test1.pdf', dpi = 500,transparent=True)#bbox_inches='tight',
62+
63+
# plot the test overview
64+
all_pass_counts = [list(col).count(0) for col in values.T]
65+
all_warn_counts = [list(col).count(1) for col in values.T]
66+
all_fail_counts = [list(col).count(2) for col in values.T]
67+
68+
barWidth=0.3
69+
r2 = np.arange(len(all_pass_counts))
70+
r1 = [x - barWidth for x in r2]
71+
r3 = [x + barWidth for x in r2]
72+
73+
fig = plt.figure(figsize=(8,len(samples)))
74+
plt.bar(r1, all_pass_counts, color='green', width=barWidth, edgecolor='black', label='pass')
75+
plt.bar(r2, all_warn_counts, color='yellow', width=barWidth, edgecolor='black', label='warn')
76+
plt.bar(r3, all_fail_counts, color='red', width=barWidth, edgecolor='black', label='fail')
77+
plt.xticks(range(values.shape[1]), label_abbrevations)
78+
for border in np.array(r3)+0.66*barWidth:
79+
plt.axvline(border,color='black',linestyle='--',alpha=0.5)
80+
plt.yticks(range(len(samples)+1), range(len(samples)+1))
81+
plt.xlim(0-barWidth-0.75*barWidth,)
82+
plt.xlabel('FastQC test (abbrevated names)')
83+
plt.ylabel('number of samples')
84+
plt.title('FastQC results by test type')
85+
plt.legend()
86+
fig.savefig('/Users/tobias/Desktop/test.pdf', dpi = 500,transparent=True)#bbox_inches='tight',
87+
88+
89+
90+
91+

0 commit comments

Comments
 (0)