def shift(df, amount, along=None, drop_invalid=False, cols=None):
    """
    Translate the bounds of each genomic interval.

    Different shift amounts can be applied to leading and trailing bounds, and
    can be applied in a strand-aware manner. Negative values indicate a shift
    leftwards or upstream.

    Parameters
    ----------
    df : pandas.DataFrame

    amount : int, array-like, or pair of int or array-like
        The amount(s) by which the bounds are linearly shifted. A ``list`` or
        ``tuple`` is always interpreted as a pair ``(x, y)``: the leading
        bound is shifted by ``x`` and the trailing bound by ``y``. Any other
        object (a scalar, a numpy array, a Series, ...) is applied to both
        bounds. Negative and positive values shift in the upstream and
        downstream directions, respectively. Features are taken to assume the
        reference orientation unless ``along`` is specified.

    along : str, array-like, or None
        Name of the column, or an array-like holding one value per interval
        (matched to the rows of ``df`` by position), that sets the
        up/downstream orientation of each feature. Values must be compliant
        strand values ("+", "-", "."). Unstranded (".") features are left
        unchanged.

    drop_invalid : bool, optional [default: False]
        Remove any intervals having negative length after shifting bounds.
        By default, they will not be removed but a warning will be raised.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. Default values are 'chrom', 'start', 'end'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with shifted start and end columns. Rows keep their
        original index labels; rows removed by ``drop_invalid`` are simply
        absent.

    Raises
    ------
    ValueError
        If ``along`` names a column that is not in ``df``, is an array-like
        whose shape is not ``(len(df),)``, or contains values other than
        "+", "-" or "."; or if ``amount`` is a list/tuple whose length is
        not 2.

    Warns
    -----
    UserWarning
        If the shift produces intervals with negative length and
        ``drop_invalid`` is False.

    Notes
    -----
    See :func:`bioframe.trim` for trimming intervals after expansion or shift.
    """
    ck, sk, ek = _get_default_colnames() if cols is None else cols
    checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek])

    strands = None
    if along is not None:
        if isinstance(along, str):
            if along not in df.columns:
                raise ValueError(
                    f'Cannot do strand-aware operation: {along} column is missing.'
                )
            strand_values = df[along]
        else:
            strand_values = along

        # Coerce to a plain object array so that lists, tuples, numpy arrays
        # and Series (whatever their index) are all handled positionally.
        strands = np.asarray(strand_values, dtype=object)
        if strands.shape != (df.shape[0],):
            raise ValueError(
                'Cannot do strand-aware operation: expected one strand value '
                f'per interval ({df.shape[0]}), got shape {strands.shape}.'
            )

        # Validate through pandas so that missing values (NaN/None/NA) are
        # reported as non-compliant instead of breaking the comparison.
        is_compliant = pd.Series(strands).isin(['+', '-', '.'])
        if not is_compliant.all():
            missing_strand = (~is_compliant).sum()
            raise ValueError(
                'Cannot do strand-aware operation: strand information missing '
                f'for {missing_strand}/{df.shape[0]} ranges.'
            )

    if not isinstance(amount, (list, tuple)):
        amount = (amount, amount)
    elif len(amount) != 2:
        raise ValueError(
            "`amount` should be a single object or a sequence of length 2; "
            f"got length {len(amount)}."
        )
    lead, trail = amount

    out = df.copy()
    if strands is None:
        out[sk] = df[sk] + lead
        out[ek] = df[ek] + trail
    else:
        is_fwd = strands == '+'
        is_rev = strands == '-'
        # On the reverse strand the leading bound is the interval end, so the
        # amounts swap roles and change sign. "." rows fall through unchanged.
        out[sk] = np.where(
            is_fwd,
            df[sk] + lead,
            np.where(is_rev, df[sk] - trail, df[sk]),
        )
        out[ek] = np.where(
            is_fwd,
            df[ek] + trail,
            np.where(is_rev, df[ek] - lead, df[ek]),
        )

    is_neglen = (out[ek] - out[sk]) < 0
    if is_neglen.any():
        if drop_invalid:
            out = out.loc[~is_neglen]
        else:
            # stacklevel=2 attributes the warning to the caller of shift().
            warnings.warn(
                f"Operation produced {is_neglen.sum()}/{out.shape[0]} "
                "intervals with negative length.",
                stacklevel=2,
            )

    return out
def _shift_frame(rows, index=None):
    """Build a (chrom, start, end, strand) frame from a list of row lists."""
    return pd.DataFrame(
        rows,
        columns=["chrom", "start", "end", "strand"],
        index=index,
    )


def test_shift():
    """Strand-agnostic shifts move every interval in reference orientation."""
    df = _shift_frame(
        [
            ["chr1", 1000, 1200, "+"],
            ["chr1", 800, 1200, "-"],
            ["chrX", 1000, 1500, "+"],
        ]
    )

    # A single positive amount moves both bounds to the right.
    result = bioframe.shift(df, 10)
    expected = _shift_frame(
        [
            ["chr1", 1000 + 10, 1200 + 10, "+"],
            ["chr1", 800 + 10, 1200 + 10, "-"],
            ["chrX", 1000 + 10, 1500 + 10, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)

    # A single negative amount moves both bounds to the left.
    result = bioframe.shift(df, -10)
    expected = _shift_frame(
        [
            ["chr1", 1000 - 10, 1200 - 10, "+"],
            ["chr1", 800 - 10, 1200 - 10, "-"],
            ["chrX", 1000 - 10, 1500 - 10, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)

    # A pair shifts start and end independently (here: widen).
    result = bioframe.shift(df, (-10, 20))
    expected = _shift_frame(
        [
            ["chr1", 1000 - 10, 1200 + 20, "+"],
            ["chr1", 800 - 10, 1200 + 20, "-"],
            ["chrX", 1000 - 10, 1500 + 20, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)

    # A pair shifts start and end independently (here: narrow).
    result = bioframe.shift(df, (10, -20))
    expected = _shift_frame(
        [
            ["chr1", 1000 + 10, 1200 - 20, "+"],
            ["chr1", 800 + 10, 1200 - 20, "-"],
            ["chrX", 1000 + 10, 1500 - 20, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)

    # The first interval ends up with negative length and is dropped; the
    # surviving rows keep their original index labels.
    result = bioframe.shift(df, (10, -200), drop_invalid=True)
    expected = _shift_frame(
        [
            ["chr1", 800 + 10, 1200 - 200, "-"],
            ["chrX", 1000 + 10, 1500 - 200, "+"],
        ],
        index=[1, 2],
    )
    pd.testing.assert_frame_equal(result, expected)


def test_shift_strandaware():
    """With ``along``, "-" intervals shift in the opposite genomic direction."""
    df = _shift_frame(
        [
            ["chr1", 1000, 1200, "+"],
            ["chr1", 800, 1200, "-"],
            ["chrX", 1000, 1500, "+"],
        ]
    )

    result = bioframe.shift(df, 10, along="strand")
    expected = _shift_frame(
        [
            ["chr1", 1000 + 10, 1200 + 10, "+"],
            ["chr1", 800 - 10, 1200 - 10, "-"],
            ["chrX", 1000 + 10, 1500 + 10, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)

    result = bioframe.shift(df, -10, along="strand")
    expected = _shift_frame(
        [
            ["chr1", 1000 - 10, 1200 - 10, "+"],
            ["chr1", 800 + 10, 1200 + 10, "-"],
            ["chrX", 1000 - 10, 1500 - 10, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)

    # On the "-" strand the leading bound is the end coordinate, so the pair
    # is applied swapped and negated.
    result = bioframe.shift(df, (-10, 20), along="strand")
    expected = _shift_frame(
        [
            ["chr1", 1000 - 10, 1200 + 20, "+"],
            ["chr1", 800 - 20, 1200 + 10, "-"],
            ["chrX", 1000 - 10, 1500 + 20, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)

    result = bioframe.shift(df, (10, -20), along="strand")
    expected = _shift_frame(
        [
            ["chr1", 1000 + 10, 1200 - 20, "+"],
            ["chr1", 800 + 20, 1200 - 10, "-"],
            ["chrX", 1000 + 10, 1500 - 20, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)

    # Only the first "+" interval collapses to negative length here.
    result = bioframe.shift(df, (10, -200), along="strand", drop_invalid=True)
    expected = _shift_frame(
        [
            ["chr1", 800 + 200, 1200 - 10, "-"],
            ["chrX", 1000 + 10, 1500 - 200, "+"],
        ],
        index=[1, 2],
    )
    pd.testing.assert_frame_equal(result, expected)


def test_shift_strandaware_unstranded():
    """Intervals marked "." are left untouched by a strand-aware shift."""
    df = _shift_frame(
        [
            ["chr1", 1000, 1200, "+"],
            ["chr1", 800, 1200, "."],
            ["chrX", 1000, 1500, "+"],
        ]
    )

    result = bioframe.shift(df, (10, -20), along="strand")
    expected = _shift_frame(
        [
            ["chr1", 1000 + 10, 1200 - 20, "+"],
            ["chr1", 800, 1200, "."],
            ["chrX", 1000 + 10, 1500 - 20, "+"],
        ]
    )
    pd.testing.assert_frame_equal(result, expected)