min contig length for abyss assembly

Tobias Hofmann · Tobias Hofmann · commit ac773604f445 · 2019-02-26T17:51:59.000+01:00
diff --git a/docs/notebook/.ipynb_checkpoints/cleaning_trimming-checkpoint.ipynb b/docs/notebook/.ipynb_checkpoints/cleaning_trimming-checkpoint.ipynb
@@ -7,6 +7,8 @@
     "# Cleaning and trimming of fastq files\n",
     "\n",
     "\n",
+    "<div class=\"alert alert-block alert-warning\">**Please check:** Is `secapr_env` activated? You can test with `conda info --envs`. Activate the correct environment with `source activate secapr_env`</div>\n",
+    "\n",
     "<div class=\"alert alert-block alert-info\">All data operations in this tutorial are executed from within the jupyter notebooks that were used to generate this documentation. All jupyter notebooks are stored in the folder `docs/notebook` of the `secapr` GitHub project. That means that **all file- and script-paths are in relation to the notebook directory** (`docs/notebook`). When following the tutorial you may have to adjust the paths, either using absolute paths or paths relative to your working directory.</div>\n",
     "\n",
     "\n",
diff --git a/docs/notebook/.ipynb_checkpoints/contig_assembly-checkpoint.ipynb b/docs/notebook/.ipynb_checkpoints/contig_assembly-checkpoint.ipynb
@@ -36,43 +36,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "usage: secapr assemble_reads [-h] --input INPUT --output OUTPUT\n",
-      "                             [--assembler {trinity,abyss}] [--kmer KMER]\n",
-      "                             [--contig_length CONTIG_LENGTH] [--single_reads]\n",
-      "                             [--cores CORES]\n",
-      "\n",
-      "Assemble trimmed Illumina read files (fastq)\n",
-      "\n",
-      "optional arguments:\n",
-      "  -h, --help            show this help message and exit\n",
-      "  --input INPUT         Call the folder that contains the trimmed reads,\n",
-      "                        organized in a separate subfolder for each sample. The\n",
-      "                        name of the subfolder has to start with the sample\n",
-      "                        name, delimited with an underscore [_]\n",
-      "  --output OUTPUT       The output directory where results will be saved\n",
-      "  --assembler {trinity,abyss}\n",
-      "                        The assembler to use.\n",
-      "  --kmer KMER           Set the kmer value\n",
-      "  --contig_length CONTIG_LENGTH\n",
-      "                        Set the minimum contig length for Trinity assembly.\n",
-      "                        Contigs that are shorter than this threshold will be\n",
-      "                        discarded.\n",
-      "  --single_reads        Use this flag if you additionally want to use single\n",
-      "                        reads for the assembly\n",
-      "  --cores CORES         For parallel processing you can set the number of\n",
-      "                        cores you want to run Trinity on.\n"
+      "Traceback (most recent call last):\n",
+      "  File \"/Users/tobias/anaconda3/envs/secapr_env/bin/secapr\", line 11, in <module>\n",
+      "    load_entry_point('secapr', 'console_scripts', 'secapr')()\n",
+      "  File \"/Users/tobias/GitHub/seqcap_processor/secapr/__main__.py\", line 42, in main\n",
+      "    module = importlib.import_module('.' + command_name, 'secapr')\n",
+      "  File \"/Users/tobias/anaconda3/envs/secapr_env/lib/python2.7/importlib/__init__.py\", line 37, in import_module\n",
+      "    __import__(name)\n",
+      "  File \"/Users/tobias/GitHub/seqcap_processor/secapr/quality_check.py\", line 17, in <module>\n",
+      "    import matplotlib.pyplot as plt\n",
+      "  File \"/Users/tobias/anaconda3/envs/secapr_env/lib/python2.7/site-packages/matplotlib/pyplot.py\", line 113, in <module>\n",
+      "    _backend_mod, new_figure_manager, draw_if_interactive, _show = pylab_setup()\n",
+      "  File \"/Users/tobias/anaconda3/envs/secapr_env/lib/python2.7/site-packages/matplotlib/backends/__init__.py\", line 60, in pylab_setup\n",
+      "    [backend_name], 0)\n",
+      "ImportError: No module named ipykernel.pylab.backend_inline\n"
      ]
     }
    ],
    "source": [
     "%%bash\n",
+    "source activate secapr_env\n",
     "secapr assemble_reads -h"
    ]
   },
diff --git a/docs/notebook/cleaning_trimming.ipynb b/docs/notebook/cleaning_trimming.ipynb
@@ -7,6 +7,8 @@
     "# Cleaning and trimming of fastq files\n",
     "\n",
     "\n",
+    "<div class=\"alert alert-block alert-warning\">**Please check:** Is `secapr_env` activated? You can test with `conda info --envs`. Activate the correct environment with `source activate secapr_env`</div>\n",
+    "\n",
     "<div class=\"alert alert-block alert-info\">All data operations in this tutorial are executed from within the jupyter notebooks that were used to generate this documentation. All jupyter notebooks are stored in the folder `docs/notebook` of the `secapr` GitHub project. That means that **all file- and script-paths are in relation to the notebook directory** (`docs/notebook`). When following the tutorial you may have to adjust the paths, either using absolute paths or paths relative to your working directory.</div>\n",
     "\n",
     "\n",
diff --git a/docs/notebook/contig_assembly.ipynb b/docs/notebook/contig_assembly.ipynb
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -73,6 +73,7 @@
    ],
    "source": [
     "%%bash\n",
+    "\n",
     "secapr assemble_reads -h"
    ]
   },
diff --git a/secapr/assemble_reads.py b/secapr/assemble_reads.py
@@ -55,7 +55,7 @@ def add_arguments(parser):
         '--contig_length',
         type=int,
         default=200,
-        help='[Option only for Trinity assembler] Set the minimum contig length for the assembly. Contigs that are shorter than this threshold will be discarded.'
+        help='Set the minimum contig length for the assembly. Contigs that are shorter than this threshold will be discarded.'
     )
     parser.add_argument(
         '--max_memory',
@@ -152,9 +152,10 @@ def main(args):
                         #mv_cmd2 = "mv %s/coverage.hist %s" %(home_dir,sample_output_folder)
                         #os.system(mv_cmd2)
 #_______________________________________________________________________________________________________________________________________
-                        contig_count_df = get_stats_abyss(sample_output_folder,sample_id,sample_contig_count_dict)
+                        contig_count_df,contig_file = get_stats_abyss(sample_output_folder,sample_id,sample_contig_count_dict)
+                        remove_short_contigs(contig_file,min_length)
                 else:
-                    print ("Error: Read-files for sample %s could not be found.Please check if fastq file names end with 'READ1.fastq' and 'READ2.fastq' respectively." %sample_id)
+                    print ("Error: Read-files for sample %s could not be found.Please check if fastq file names end with 'READ1.fastq' and 'READ2.fastq' respectively and if all files are unzipped." %sample_id)
                     raise SystemExit
     if not args.disable_stats:
         try:
@@ -278,22 +279,45 @@ def edit_trinity_headers(contig_file,new_contig_file):
             new_fasta.write(line)
     new_fasta.close()
 
-def count_contigs(contig):
+def count_contigs(contig_file):
     """Return a count of contigs from a fasta file"""
-    return sum([1 for line in open(contig, 'rU').readlines() if line.startswith('>')])
+    # Every fasta record starts with exactly one '>' header line, so counting
+    # header lines counts contigs. The file is streamed inside a context manager
+    # so the handle is closed deterministically; plain 'r' is used because the
+    # 'U' mode flag is no longer accepted by newer Python versions.
+    with open(contig_file, 'r') as fasta:
+        return sum(1 for line in fasta if line.startswith('>'))
+
+def remove_short_contigs(contig_file,min_length):
+    """Remove all contigs shorter than min_length from a fasta file, in place.
+
+    contig_file: path of the fasta file; it is read completely and then
+        overwritten with the filtered content.
+    min_length: minimum number of sequence characters a contig needs in order
+        to be kept (contigs of exactly min_length are kept).
+
+    A contig is one '>' header line plus all sequence lines up to the next
+    header, so sequences wrapped over several lines are measured as a whole.
+    Kept records are written back unchanged and in their original order.
+    """
+    # Read everything up front, because the same path is rewritten below
+    with open(contig_file,'r') as fasta:
+        fasta_content = fasta.readlines()
+    # Group the lines into records: [header, sequence line, sequence line, ...]
+    records = []
+    for line in fasta_content:
+        if line.startswith('>'):
+            records.append([line])
+        elif records and line.strip():
+            records[-1].append(line)
+        # blank lines and sequence lines before the first header belong to
+        # no contig and are dropped
+    with open(contig_file,'w') as new_fasta:
+        for record in records:
+            contig_length = sum(len(seq_line.strip()) for seq_line in record[1:])
+            if contig_length >= min_length:
+                new_fasta.writelines(record)
+
 
 def get_stats_abyss(sample_output_folder,sample_id,sample_contig_count_dict):
+    # Count the contigs in the fasta file of one sample, record the count in
+    # sample_contig_count_dict and print the table of all counts collected so far.
+    # Returns a tuple: (dataframe with the columns 'sample' and
+    # 'total_contig_count', path of the contig fasta file that was counted).
     #contig_count_cmd = subprocess.Popen(["tail", "-n", "2", "%s/%s.fa" %('/'.join(sample_output_folder.split('/')[:-2]),sample_id)], stdout=subprocess.PIPE)
     #contig_count_pre = contig_count_cmd.communicate()[0]
+    # The contig file path is sample_output_folder without its last two
+    # '/'-separated parts, followed by '<sample_id>.fa'
     contig_file = "%s/%s.fa" %('/'.join(sample_output_folder.split('/')[:-2]),sample_id)
     contig_count = count_contigs(contig_file)
+
     #contig_count = contig_count_pre.split(' ')[0].replace('>','')
+    # setdefault stores the count only if sample_id has no entry yet
     sample_contig_count_dict.setdefault(sample_id,contig_count)
     stats_df=pd.DataFrame.from_dict(sample_contig_count_dict, orient='index').reset_index()
     stats_df.columns = ['sample', 'total_contig_count']
     print('#'*50)
     print(stats_df)
-    return(stats_df)
+    return(stats_df,contig_file)
     #contig_count, header, percent, sequence = contig_count_pre.split("\t")    
 
 def cleanup_trinity_assembly_folder(sample_output_folder, sample_id):
diff --git a/secapr/quality_check.py b/secapr/quality_check.py
@@ -109,7 +109,6 @@ def plot_fastqc_results(fastqc_out_folder):
     #plt.legend()
     fig.savefig(os.path.join(fastqc_out_folder,'quality_summary_all_samples_2.pdf'), dpi = 500,transparent=True,bbox_inches='tight')
 
-
 def main(args):
     # Set working directory
     out_folder = args.output
@@ -122,6 +121,9 @@ def main(args):
     for root, dirnames, filenames in os.walk(input_folder):
         for filename in fnmatch.filter(filenames, '*.fastq'):
             matches.append(os.path.join(root, filename))
+    if len(matches) == 0:
+        print('No files with the ending .fastq found in input folder. Please check path and ensure that all readfiles are unzipped and have the filending ".fastq"')
+        sys.exit()
     fastq_df = pd.DataFrame(index=np.arange(0,len(matches)), columns=['filepaths'])
     fastq_df['filepaths'] = matches
     fastq_list_path = os.path.join(out_folder,'fastq_file_list.txt')
diff --git a/src/plot_quality_test_results.py b/src/plot_quality_test_results.py
@@ -88,4 +88,3 @@ def get_test_results(fastqc_log_content):
 
 
 
-
diff --git a/src/remove_short_contigs.py b/src/remove_short_contigs.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 26 15:21:03 2019
+
+@author: Tobias Andermann (tobias.andermann@bioenv.gu.se)
+"""
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+
+# Fasta file that is filtered in place, and the minimum number of sequence
+# characters a contig needs in order to be kept.
+contig_file = '/Users/tobias/Desktop/1063.fa'
+#contig_file_new = '/Users/tobias/GitHub/seqcap_processor/data/processed/contigs/1063_removed_short_contigs.fa'
+min_length = 200
+
+# Read everything up front, because the same path is rewritten below
+with open(contig_file,'r') as fasta:
+    fasta_content = fasta.readlines()
+
+# Group the lines into records: [header, sequence line, sequence line, ...],
+# so that sequences wrapped over several lines are measured as a whole
+records = []
+for line in fasta_content:
+    if line.startswith('>'):
+        records.append([line])
+    elif records and line.strip():
+        records[-1].append(line)
+    # blank lines and sequence lines before the first header belong to
+    # no contig and are dropped
+
+# Write back only the records that reach the minimum length, unchanged and
+# in their original order
+with open(contig_file,'w') as new_fasta:
+    for record in records:
+        contig_length = sum(len(seq_line.strip()) for seq_line in record[1:])
+        if contig_length >= min_length:
+            new_fasta.writelines(record)
+
+

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@`
`36`	`36`	`},`
`37`	`37`	`{`
`38`	`38`	`"cell_type": "code",`
`39`		`- "execution_count": 2,`
	`39`	`+ "execution_count": 1,`
`40`	`40`	`"metadata": {},`
`41`	`41`	`"outputs": [`
`42`	`42`	`{`
`@@ -73,6 +73,7 @@`
`73`	`73`	`],`
`74`	`74`	`"source": [`
`75`	`75`	`"%%bash\n",`
	`76`	`+ "\n",`
`76`	`77`	`"secapr assemble_reads -h"`
`77`	`78`	`]`
`78`	`79`	`},`
Original file line number	Diff line number	Diff line change
`@@ -88,4 +88,3 @@ def get_test_results(fastqc_log_content):`
`88`	`88`
`89`	`89`
`90`	`90`
`91`		`-`