Skip to content

Commit ac77360

Browse files
author
Tobias Hofmann
committed
min contig length for abyss assembly
1 parent 64814e5 commit ac77360

File tree

8 files changed

+95
-36
lines changed

8 files changed

+95
-36
lines changed

docs/notebook/.ipynb_checkpoints/cleaning_trimming-checkpoint.ipynb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
"# Cleaning and trimming of fastq files\n",
88
"\n",
99
"\n",
10+
"<div class=\"alert alert-block alert-warning\">**Please check:** Is `secapr_env` activated? You can test with `conda info --envs`. Activate the correct environment with `source activate secapr_env`</div>\n",
11+
"\n",
1012
"<div class=\"alert alert-block alert-info\">All data operations in this tutorial are executed from within the jupyter notebooks that were used to generate this documentation. All jupyter notebooks are stored in the folder `docs/notebook` of the `secapr` GitHub project. That means that **all file- and script-paths are in relation to the notebook directory** (`docs/notebook`). When following the tutorial you may have to adjust the paths, either using absolute paths or paths relative to your working directory.</div>\n",
1113
"\n",
1214
"\n",

docs/notebook/.ipynb_checkpoints/contig_assembly-checkpoint.ipynb

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -36,43 +36,33 @@
3636
},
3737
{
3838
"cell_type": "code",
39-
"execution_count": 2,
39+
"execution_count": 5,
4040
"metadata": {},
4141
"outputs": [
4242
{
43-
"name": "stdout",
43+
"name": "stderr",
4444
"output_type": "stream",
4545
"text": [
46-
"usage: secapr assemble_reads [-h] --input INPUT --output OUTPUT\n",
47-
" [--assembler {trinity,abyss}] [--kmer KMER]\n",
48-
" [--contig_length CONTIG_LENGTH] [--single_reads]\n",
49-
" [--cores CORES]\n",
50-
"\n",
51-
"Assemble trimmed Illumina read files (fastq)\n",
52-
"\n",
53-
"optional arguments:\n",
54-
" -h, --help show this help message and exit\n",
55-
" --input INPUT Call the folder that contains the trimmed reads,\n",
56-
" organized in a separate subfolder for each sample. The\n",
57-
" name of the subfolder has to start with the sample\n",
58-
" name, delimited with an underscore [_]\n",
59-
" --output OUTPUT The output directory where results will be saved\n",
60-
" --assembler {trinity,abyss}\n",
61-
" The assembler to use.\n",
62-
" --kmer KMER Set the kmer value\n",
63-
" --contig_length CONTIG_LENGTH\n",
64-
" Set the minimum contig length for Trinity assembly.\n",
65-
" Contigs that are shorter than this threshold will be\n",
66-
" discarded.\n",
67-
" --single_reads Use this flag if you additionally want to use single\n",
68-
" reads for the assembly\n",
69-
" --cores CORES For parallel processing you can set the number of\n",
70-
" cores you want to run Trinity on.\n"
46+
"Traceback (most recent call last):\n",
47+
" File \"/Users/tobias/anaconda3/envs/secapr_env/bin/secapr\", line 11, in <module>\n",
48+
" load_entry_point('secapr', 'console_scripts', 'secapr')()\n",
49+
" File \"/Users/tobias/GitHub/seqcap_processor/secapr/__main__.py\", line 42, in main\n",
50+
" module = importlib.import_module('.' + command_name, 'secapr')\n",
51+
" File \"/Users/tobias/anaconda3/envs/secapr_env/lib/python2.7/importlib/__init__.py\", line 37, in import_module\n",
52+
" __import__(name)\n",
53+
" File \"/Users/tobias/GitHub/seqcap_processor/secapr/quality_check.py\", line 17, in <module>\n",
54+
" import matplotlib.pyplot as plt\n",
55+
" File \"/Users/tobias/anaconda3/envs/secapr_env/lib/python2.7/site-packages/matplotlib/pyplot.py\", line 113, in <module>\n",
56+
" _backend_mod, new_figure_manager, draw_if_interactive, _show = pylab_setup()\n",
57+
" File \"/Users/tobias/anaconda3/envs/secapr_env/lib/python2.7/site-packages/matplotlib/backends/__init__.py\", line 60, in pylab_setup\n",
58+
" [backend_name], 0)\n",
59+
"ImportError: No module named ipykernel.pylab.backend_inline\n"
7160
]
7261
}
7362
],
7463
"source": [
7564
"%%bash\n",
65+
"source activate secapr_env\n",
7666
"secapr assemble_reads -h"
7767
]
7868
},

docs/notebook/cleaning_trimming.ipynb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
"# Cleaning and trimming of fastq files\n",
88
"\n",
99
"\n",
10+
"<div class=\"alert alert-block alert-warning\">**Please check:** Is `secapr_env` activated? You can test with `conda info --envs`. Activate the correct environment with `source activate secapr_env`</div>\n",
11+
"\n",
1012
"<div class=\"alert alert-block alert-info\">All data operations in this tutorial are executed from within the jupyter notebooks that were used to generate this documentation. All jupyter notebooks are stored in the folder `docs/notebook` of the `secapr` GitHub project. That means that **all file- and script-paths are in relation to the notebook directory** (`docs/notebook`). When following the tutorial you may have to adjust the paths, either using absolute paths or paths relative to your working directory.</div>\n",
1113
"\n",
1214
"\n",

docs/notebook/contig_assembly.ipynb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
},
3737
{
3838
"cell_type": "code",
39-
"execution_count": 2,
39+
"execution_count": 1,
4040
"metadata": {},
4141
"outputs": [
4242
{
@@ -73,6 +73,7 @@
7373
],
7474
"source": [
7575
"%%bash\n",
76+
"\n",
7677
"secapr assemble_reads -h"
7778
]
7879
},

secapr/assemble_reads.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def add_arguments(parser):
5555
'--contig_length',
5656
type=int,
5757
default=200,
58-
help='[Option only for Trinity assembler] Set the minimum contig length for the assembly. Contigs that are shorter than this threshold will be discarded.'
58+
help='Set the minimum contig length for the assembly. Contigs that are shorter than this threshold will be discarded.'
5959
)
6060
parser.add_argument(
6161
'--max_memory',
@@ -152,9 +152,10 @@ def main(args):
152152
#mv_cmd2 = "mv %s/coverage.hist %s" %(home_dir,sample_output_folder)
153153
#os.system(mv_cmd2)
154154
#_______________________________________________________________________________________________________________________________________
155-
contig_count_df = get_stats_abyss(sample_output_folder,sample_id,sample_contig_count_dict)
155+
contig_count_df,contig_file = get_stats_abyss(sample_output_folder,sample_id,sample_contig_count_dict)
156+
remove_short_contigs(contig_file,min_length)
156157
else:
157-
print ("Error: Read-files for sample %s could not be found.Please check if fastq file names end with 'READ1.fastq' and 'READ2.fastq' respectively." %sample_id)
158+
print ("Error: Read-files for sample %s could not be found.Please check if fastq file names end with 'READ1.fastq' and 'READ2.fastq' respectively and if all files are unzipped." %sample_id)
158159
raise SystemExit
159160
if not args.disable_stats:
160161
try:
@@ -278,22 +279,45 @@ def edit_trinity_headers(contig_file,new_contig_file):
278279
new_fasta.write(line)
279280
new_fasta.close()
280281

281-
def count_contigs(contig):
282+
def count_contigs(contig_file):
282283
"""Return a count of contigs from a fasta file"""
283-
return sum([1 for line in open(contig, 'rU').readlines() if line.startswith('>')])
284+
return sum([1 for line in open(contig_file, 'rU').readlines() if line.startswith('>')])
285+
286+
def remove_short_contigs(contig_file,min_length):
287+
fasta = open(contig_file,'r')
288+
fasta_content = list(fasta)
289+
counter = 0
290+
indeces_to_keep = []
291+
for i,line in enumerate(fasta_content):
292+
if not line.startswith('>'):
293+
contig_length = len(line.replace('\n',''))
294+
if contig_length < min_length:
295+
pass
296+
else:
297+
# line number of header
298+
indeces_to_keep.append(i-1)
299+
# line number of sequence
300+
indeces_to_keep.append(i)
301+
new_fasta_content = list(np.array(fasta_content)[indeces_to_keep])
302+
new_fasta = open(contig_file,'w')
303+
for line in new_fasta_content:
304+
new_fasta.write(line)
305+
new_fasta.close()
306+
284307

285308
def get_stats_abyss(sample_output_folder,sample_id,sample_contig_count_dict):
286309
#contig_count_cmd = subprocess.Popen(["tail", "-n", "2", "%s/%s.fa" %('/'.join(sample_output_folder.split('/')[:-2]),sample_id)], stdout=subprocess.PIPE)
287310
#contig_count_pre = contig_count_cmd.communicate()[0]
288311
contig_file = "%s/%s.fa" %('/'.join(sample_output_folder.split('/')[:-2]),sample_id)
289312
contig_count = count_contigs(contig_file)
313+
290314
#contig_count = contig_count_pre.split(' ')[0].replace('>','')
291315
sample_contig_count_dict.setdefault(sample_id,contig_count)
292316
stats_df=pd.DataFrame.from_dict(sample_contig_count_dict, orient='index').reset_index()
293317
stats_df.columns = ['sample', 'total_contig_count']
294318
print('#'*50)
295319
print(stats_df)
296-
return(stats_df)
320+
return(stats_df,contig_file)
297321
#contig_count, header, percent, sequence = contig_count_pre.split("\t")
298322

299323
def cleanup_trinity_assembly_folder(sample_output_folder, sample_id):

secapr/quality_check.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ def plot_fastqc_results(fastqc_out_folder):
109109
#plt.legend()
110110
fig.savefig(os.path.join(fastqc_out_folder,'quality_summary_all_samples_2.pdf'), dpi = 500,transparent=True,bbox_inches='tight')
111111

112-
113112
def main(args):
114113
# Set working directory
115114
out_folder = args.output
@@ -122,6 +121,9 @@ def main(args):
122121
for root, dirnames, filenames in os.walk(input_folder):
123122
for filename in fnmatch.filter(filenames, '*.fastq'):
124123
matches.append(os.path.join(root, filename))
124+
if len(matches) == 0:
125+
print('No files with the ending .fastq found in input folder. Please check path and ensure that all readfiles are unzipped and have the filending ".fastq"')
126+
sys.exit()
125127
fastq_df = pd.DataFrame(index=np.arange(0,len(matches)), columns=['filepaths'])
126128
fastq_df['filepaths'] = matches
127129
fastq_list_path = os.path.join(out_folder,'fastq_file_list.txt')

src/plot_quality_test_results.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,3 @@ def get_test_results(fastqc_log_content):
8888

8989

9090

91-

src/remove_short_contigs.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Tue Feb 26 15:21:03 2019
5+
6+
@author: Tobias Andermann ([email protected])
7+
"""
8+
9+
import numpy as np
10+
import pandas as pd
11+
import matplotlib.pyplot as plt
12+
13+
14+
15+
contig_file = '/Users/tobias/Desktop/1063.fa'
16+
#contig_file_new = '/Users/tobias/GitHub/seqcap_processor/data/processed/contigs/1063_removed_short_contigs.fa'
17+
min_length = 200
18+
fasta = open(contig_file,'r')
19+
fasta_content = list(fasta)
20+
counter = 0
21+
indeces_to_keep = []
22+
for i,line in enumerate(fasta_content):
23+
if not line.startswith('>'):
24+
contig_length = len(line.replace('\n',''))
25+
if contig_length < min_length:
26+
pass
27+
else:
28+
# line number of header
29+
indeces_to_keep.append(i-1)
30+
# line number of sequence
31+
indeces_to_keep.append(i)
32+
33+
new_fasta_content = list(np.array(fasta_content)[indeces_to_keep])
34+
new_fasta = open(contig_file,'w')
35+
for line in new_fasta_content:
36+
new_fasta.write(line)
37+
new_fasta.close()
38+
39+

0 commit comments

Comments
 (0)