
Commit 6ff4a5e

Updating seqweaver (#188)
* init get_new_training_data script and strand spec
* refactor main script, fix strand-spec
* debugging and testing update_seqweaver
* fixed h5 file output
* added class modules for training seqweaver
* added validation/training strat
* debugging main update seqweaver module
* strand backward compatibility
* further fixes to backward compatibility
* val partition fix
* indexing fix for backward compatibility
* addressing kathy's comments
* fixed relative paths in update_seqweaver
* handling strand=. as None
1 parent 86f4df3 commit 6ff4a5e


3 files changed: +244 / -9 lines changed


models/seqweaver.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
"""
Seqweaver architecture (Park & Troyanskaya, 2021).
"""
import torch
import torch.nn as nn


class LambdaBase(nn.Sequential):
    def __init__(self, fn, *args):
        super(LambdaBase, self).__init__(*args)
        self.lambda_func = fn

    def forward_prepare(self, input):
        output = []
        for module in self._modules.values():
            output.append(module(input))
        return output if output else input


class Lambda(LambdaBase):
    def forward(self, input):
        return self.lambda_func(self.forward_prepare(input))


class Seqweaver(nn.Module):

    def __init__(self, n_classes):  # 217 human, 43 mouse
        super(Seqweaver, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(4, 160, (1, 8)),
            nn.ReLU(),
            nn.MaxPool2d((1, 4), (1, 4)),
            nn.Dropout(0.1),
            nn.Conv2d(160, 320, (1, 8)),
            nn.ReLU(),
            nn.MaxPool2d((1, 4), (1, 4)),
            nn.Dropout(0.1),
            nn.Conv2d(320, 480, (1, 8)),
            nn.ReLU(),
            nn.Dropout(0.3))
        self.fc = nn.Sequential(
            Lambda(lambda x: torch.reshape(x, (x.size(0), 25440))),
            nn.Sequential(
                Lambda(lambda x: x.reshape(1, -1)
                       if 1 == len(x.size()) else x),
                nn.Linear(25440, n_classes)
            ),
            nn.ReLU(),
            nn.Sequential(
                Lambda(lambda x: x.view(1, -1)
                       if 1 == len(x.size()) else x),
                nn.Linear(n_classes, n_classes)
            ),
            nn.Sigmoid(),
        )

    def forward(self, x):
        x = x.unsqueeze(2)
        x = self.model(x)
        x = self.fc(x)
        return x


def criterion():
    return nn.BCELoss()


def get_optimizer(lr):
    return (torch.optim.SGD,
            {"lr": lr, "weight_decay": 1e-6, "momentum": 0.9})
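A quick smoke test of the new module (not part of this commit). It assumes the repository root is on `sys.path` so that `models/seqweaver.py` imports as `models.seqweaver`; the input shape follows from the architecture above (4 channels x 1,000 bp, which flattens to 480 x 53 = 25,440 features after the third convolution).

import torch

from models.seqweaver import Seqweaver, criterion, get_optimizer

model = Seqweaver(217)              # 217 human classes, per the comment in __init__
x = torch.rand(8, 4, 1000)          # random stand-in for a one-hot encoded batch
y = model(x)                        # forward() inserts the height dim via unsqueeze(2)
print(y.shape)                      # torch.Size([8, 217]); sigmoid outputs in [0, 1]

loss_fn = criterion()               # BCELoss over the multi-label targets
opt_class, opt_kwargs = get_optimizer(lr=0.1)
optimizer = opt_class(model.parameters(), **opt_kwargs)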

scripts/update_seqweaver.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
"""
This module provides the `UpdateSeqweaver` class, which wraps the master bed file
containing all of the features' binding sites parsed from CLIP-seq.
It supports new dataset construction and training for Seqweaver.

"""
import h5py
import gzip
import numpy as np
import sys

from selene_sdk.sequences.genome import Genome
from selene_sdk.targets.genomic_features import GenomicFeatures
from selene_sdk.samplers.dataloader import H5DataLoader
from selene_sdk.train_model import TrainModel
from selene_sdk.utils.config import load_path
from selene_sdk.utils.config_utils import parse_configs_and_run


class UpdateSeqweaver():
    """
    Stores a dataset specifying sequence regions and features.
    Accepts a tabix-indexed `*.bed` file with the following columns,
    in order:
        [chrom, start, end, feature, strand]

    Parameters
    ----------
    input_path : str
        Path to the tabix-indexed dataset. Note that for the file to
        be tabix-indexed, it must have been compressed with `bgzip`.
        Thus, `input_path` should be a `*.gz` file with a
        corresponding `*.tbi` file in the same directory.
    train_path : str
        Path to the output file for the constructed training data.
    validate_path : str
        Path to the output file for the constructed validation data.
    feature_path : str
        Path to a newline-delimited .txt file containing feature names.
    hg_fasta : str
        Path to an indexed FASTA file -- a `*.fasta` file with
        a corresponding `*.fai` file in the same directory. This file
        should contain the target organism's genome sequence.
    yaml_path : str
        Path to the YAML configuration file used to train the model.
    val_prop : float, optional
        Default is 0.1. Proportion of the constructed examples held out
        for validation.
    sequence_len : int, optional
        Default is 1000. Length of the sequence window centered on each
        peak midpoint.

    """
    def __init__(self, input_path, train_path, validate_path, feature_path,
                 hg_fasta, yaml_path, val_prop=0.1, sequence_len=1000):
        """
        Constructs a new `UpdateSeqweaver` object.
        """
        self.input_path = input_path
        self.train_path = train_path
        self.validate_path = validate_path
        self.feature_path = feature_path
        self.yaml_path = yaml_path
        self.val_prop = val_prop

        self.hg_fasta = hg_fasta

        self.sequence_len = sequence_len

        with open(self.feature_path, 'r') as handle:
            self.feature_set = [line.split('\n')[0] for line in handle.readlines()]

    def _from_midpoint(self, start, end):
        """
        Computes the start and end of the sequence window centered on the
        peak midpoint.

        Parameters
        ----------
        start : int
            The 0-based first position in the region.
        end : int
            One past the 0-based last position in the region.

        Returns
        -------
        seq_start : int
            Sequence start position about the peak midpoint.
        seq_end : int
            Sequence end position about the peak midpoint.
        """
        region_len = end - start
        midpoint = start + region_len // 2
        seq_start = midpoint - np.floor(self.sequence_len / 2.)
        seq_end = midpoint + np.ceil(self.sequence_len / 2.)

        return int(seq_start), int(seq_end)

    def construct_training_data(self):
        """
        Constructs the training and validation datasets from the bed file
        and writes them to `self.train_path` and `self.validate_path`.
        """
        list_of_regions = []
        with gzip.open(self.input_path) as f:
            for line in f:
                line = [str(data, 'utf-8') for data in line.strip().split()]
                list_of_regions.append(line)

        seqs = Genome(self.hg_fasta, blacklist_regions='hg19')
        targets = GenomicFeatures(self.input_path,
                                  features=self.feature_set,
                                  feature_thresholds=0.5)

        data_seqs = []
        data_labels = []
        for r in list_of_regions:
            chrom, start, end, target, strand = r
            start, end = int(start), int(end)
            sstart, ssend = self._from_midpoint(start, end)

            # 1 x 4 x 1000 bp
            # get_encoding_from_coords_check_unk: converts the sequence to a
            # one-hot encoding over the 4 bases and flags unknown bases
            dna_seq, has_unk = seqs.get_encoding_from_coords_check_unk(
                chrom, sstart, ssend, strand=strand)
            if has_unk:
                continue
            if len(dna_seq) != self.sequence_len:
                continue

            # 1 x n_features
            # get_feature_data: computes which features overlap with the given region
            labels = targets.get_feature_data(chrom, start, end, strand=strand)

            data_seqs.append(dna_seq)
            data_labels.append(labels)

        # partition some examples to validation before writing
        val_count = int(np.floor(self.val_prop * len(data_seqs)))
        validate_seqs = data_seqs[:val_count]
        validate_labels = data_labels[:val_count]
        training_seqs = data_seqs[val_count:]
        training_labels = data_labels[val_count:]

        with h5py.File(self.validate_path, "w") as fh:
            fh.create_dataset("valid_sequences", data=np.array(validate_seqs, dtype=np.int64))
            fh.create_dataset("valid_targets", data=np.array(validate_labels, dtype=np.int64))

        with h5py.File(self.train_path, "w") as fh:
            fh.create_dataset("train_sequences", data=np.array(training_seqs, dtype=np.int64))
            fh.create_dataset("train_targets", data=np.array(training_labels, dtype=np.int64))

    def _load_yaml(self):
        # load the YAML configuration
        return load_path(self.yaml_path)

    def train_model(self):
        # load the configuration file and train the model
        yaml_config = self._load_yaml()
        parse_configs_and_run(yaml_config)
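A hypothetical end-to-end driver for the new script (every path below is a placeholder, not taken from the commit). It sketches the intended flow, assuming `scripts/update_seqweaver.py` is importable from the working directory: build the HDF5 training/validation sets from the CLIP-seq bed file, then hand the YAML config to selene's `parse_configs_and_run`.

from update_seqweaver import UpdateSeqweaver

updater = UpdateSeqweaver(
    input_path="clip_peaks.sorted.bed.gz",    # bgzip-compressed, tabix-indexed master bed file (placeholder)
    train_path="seqweaver_train.h5",          # HDF5 written by construct_training_data()
    validate_path="seqweaver_validate.h5",
    feature_path="rbp_features.txt",          # newline-delimited feature names (placeholder)
    hg_fasta="hg19.fa",                       # indexed FASTA (*.fa with matching *.fai)
    yaml_path="seqweaver_train_config.yml",   # selene training configuration (placeholder)
    val_prop=0.1,                             # hold out 10% of examples for validation
    sequence_len=1000)                        # window centered on each peak midpoint

updater.construct_training_data()    # one-hot sequences + multi-label targets -> train/validate HDF5
updater.train_model()                # loads the YAML and calls parse_configs_and_run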

selene_sdk/targets/genomic_features.py

Lines changed: 22 additions & 9 deletions
@@ -102,7 +102,7 @@ def _is_positive_row(start, end,
 
 
 def _get_feature_data(chrom, start, end,
-                      thresholds, feature_index_dict, get_feature_rows):
+                      thresholds, feature_index_dict, get_feature_rows, strand=None):
     """
     Generates a target vector for the given query region.
 
@@ -125,6 +125,9 @@ def _get_feature_data(chrom, start, end,
     get_feature_rows : types.FunctionType
         A function that takes coordinates and returns rows
         (`list(tuple(int, int, str))`).
+    strand : {'+', '-'}, optional
+        The strand the sequence is located on. Default is None (no strand provided).
+        If '+' or '-' is passed in, only retrieve rows with the correct matching strand.
 
     Returns
     -------
@@ -133,7 +136,7 @@ def _get_feature_data(chrom, start, end,
         `i`th feature is positive, and zero otherwise.
 
     """
-    rows = get_feature_rows(chrom, start, end)
+    rows = get_feature_rows(chrom, start, end, strand=strand)
     return _fast_get_feature_data(
         start, end, thresholds, feature_index_dict, rows)
 
@@ -303,7 +306,7 @@ def dfunc(self, *args, **kwargs):
             return func(self, *args, **kwargs)
         return dfunc
 
-    def _query_tabix(self, chrom, start, end):
+    def _query_tabix(self, chrom, start, end, strand=None):
         """
         Queries a tabix-indexed `*.bed` file for features falling into
         the specified region.
@@ -317,6 +320,9 @@ def _query_tabix(self, chrom, start, end):
             The 0-based start position of the query coordinates.
         end : int
             One past the last position of the query coordinates.
+        strand : {'+', '-'}, optional
+            The strand the sequence is located on. Default is None (no strand provided).
+            If '+' or '-' is passed in, only retrieve rows with the correct matching strand.
 
         Returns
         -------
@@ -329,12 +335,16 @@ def _query_tabix(self, chrom, start, end):
 
         """
         try:
-            return self.data.query(chrom, start, end)
+            tabix_query = self.data.query(chrom, start, end)
+            if strand == '+' or strand == '-':
+                return [line for line in tabix_query if str(line[4]) == strand]  # strand specificity
+            else:  # not strand specific
+                return tabix_query
         except tabix.TabixError:
             return None
 
     @init
-    def is_positive(self, chrom, start, end):
+    def is_positive(self, chrom, start, end, strand=None):
         """
         Determines whether the query the `chrom` queried contains any
         genomic features within the :math:`[start, end)` region. If so,
@@ -357,11 +367,11 @@ def is_positive(self, chrom, start, end):
             assume the error was the result of no features being present
             in the queried region and return `False`.
         """
-        rows = self._query_tabix(chrom, start, end)
+        rows = self._query_tabix(chrom, start, end, strand=strand)
         return _any_positive_rows(rows, start, end, self.feature_thresholds)
 
     @init
-    def get_feature_data(self, chrom, start, end):
+    def get_feature_data(self, chrom, start, end, strand=None):
         """
         Computes which features overlap with the given region.
 
@@ -373,6 +383,9 @@ def get_feature_data(self, chrom, start, end):
             The 0-based first position in the region.
         end : int
             One past the 0-based last position in the region.
+        strand : {'+', '-'}, optional
+            The strand the sequence is located on. Default is None (no strand provided).
+            If '+' or '-' is passed in, only retrieve rows with the correct matching strand.
 
         Returns
         -------
@@ -388,7 +401,7 @@ def get_feature_data(self, chrom, start, end):
         """
         if self._feature_thresholds_vec is None:
             features = np.zeros(self.n_features)
-            rows = self._query_tabix(chrom, start, end)
+            rows = self._query_tabix(chrom, start, end, strand=strand)  # strand specificity
             if not rows:
                 return features
             for r in rows:
@@ -398,4 +411,4 @@ def get_feature_data(self, chrom, start, end):
             return features
         return _get_feature_data(
             chrom, start, end, self._feature_thresholds_vec,
-            self.feature_index_dict, self._query_tabix)
+            self.feature_index_dict, self._query_tabix, strand=strand)
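To illustrate the behavioral change (the file name and coordinates below are hypothetical): with the default `strand=None` the query is unchanged from the previous release, while passing `'+'` or `'-'` filters the tabix rows on the bed file's fifth column before the feature vector is computed.

from selene_sdk.targets.genomic_features import GenomicFeatures

targets = GenomicFeatures("clip_peaks.sorted.bed.gz",              # placeholder tabix-indexed bed file
                          features=["RBP_A", "RBP_B"],             # placeholder feature names
                          feature_thresholds=0.5)

both = targets.get_feature_data("chr1", 1000, 2000)                # strand=None: same rows as before
plus = targets.get_feature_data("chr1", 1000, 2000, strand='+')    # only rows whose strand column is '+'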
