
Commit ec906a3

Allow passing flexible positions to positional embedding layers (#2369)
1 parent fef3bc3 commit ec906a3

File tree: 7 files changed, +137 -24 lines

keras_hub/src/layers/modeling/position_embedding.py

Lines changed: 21 additions & 6 deletions
@@ -31,6 +31,11 @@ class PositionEmbedding(keras.layers.Layer):
         start_index: An integer or integer tensor. The starting position to
             compute the position embedding from. This is useful during cached
             decoding, where each position is predicted separately in a loop.
+        positions: Tensor of shape `(sequence_length,)` or
+            `(batch_size, sequence_length)`. Custom positions for the input
+            sequence. If specified, this tensor will be used to
+            compute the position embedding, and the `start_index` argument will
+            be ignored. This is useful for cases with non-standard positions.

     Example:
@@ -91,18 +96,28 @@ def build(self, inputs_shape):
         )
         self.built = True

-    def call(self, inputs, start_index=0):
+    def call(self, inputs, start_index=0, positions=None):
         shape = ops.shape(inputs)
         feature_length = shape[-1]
         sequence_length = shape[-2]
         # trim to match the length of the input sequence, which might be less
         # than the sequence_length of the layer.
         position_embeddings = ops.convert_to_tensor(self.position_embeddings)
-        position_embeddings = ops.slice(
-            position_embeddings,
-            (start_index, 0),
-            (sequence_length, feature_length),
-        )
+        if positions is None:
+            position_embeddings = ops.slice(
+                position_embeddings,
+                (start_index, 0),
+                (sequence_length, feature_length),
+            )
+        else:
+            # Take care of unbatched `positions`.
+            if len(ops.shape(positions)) == 1:
+                positions = ops.expand_dims(positions, axis=0)
+
+            position_embeddings = ops.take(
+                position_embeddings, positions, axis=0
+            )
+
         return ops.broadcast_to(position_embeddings, shape)

     def compute_output_shape(self, input_shape):
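
For quick reference, a minimal usage sketch (not part of the diff) of the new argument; it assumes `PositionEmbedding` is importable from `keras_hub.layers`:

```python
import numpy as np

from keras_hub.layers import PositionEmbedding  # assumed public export

# Token features with shape (batch_size=2, sequence_length=4, feature_size=8).
inputs = np.random.uniform(size=(2, 4, 8)).astype("float32")
layer = PositionEmbedding(sequence_length=4)

# Default behavior: consecutive positions starting at `start_index`.
default_out = layer(inputs)

# Custom per-example positions (they may repeat or be reordered); when
# `positions` is given, `start_index` is ignored.
positions = np.array([[0, 0, 1, 2], [1, 2, 3, 0]])
custom_out = layer(inputs, positions=positions)
```

Per the docstring above, a 1D `positions` tensor of shape `(sequence_length,)` is also accepted and is broadcast across the batch.
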
keras_hub/src/layers/modeling/position_embedding_test.py

Lines changed: 21 additions & 0 deletions
@@ -141,3 +141,24 @@ def test_start_index(self):
                 sequential_output, (0, i, 0), parial_output
             )
         self.assertAllClose(full_output, sequential_output)
+
+    def test_positions(self):
+        batch_size, seq_length, feature_size = 2, 4, 5
+        data = random.uniform(shape=(batch_size, seq_length, feature_size))
+        positions = np.array([[0, 0, 1, 2], [1, 2, 3, 0]])
+
+        layer = PositionEmbedding(seq_length)
+        output = layer(data, positions=positions)
+
+        expected_output = []
+        for b_idx in range(batch_size):
+            for s_idx in range(seq_length):
+                actual_position = positions[b_idx, s_idx]
+                expected_output.append(
+                    layer.position_embeddings.numpy()[actual_position]
+                )
+
+        expected_output = np.reshape(
+            np.array(expected_output), (batch_size, seq_length, feature_size)
+        )
+        self.assertAllClose(output, expected_output, rtol=1e-5, atol=1e-5)
keras_hub/src/layers/modeling/rotary_embedding.py

Lines changed: 16 additions & 6 deletions
@@ -37,6 +37,11 @@ class RotaryEmbedding(keras.layers.Layer):
         start_index: An integer or integer tensor. The starting position to
             compute the rotary embedding from. This is useful during cached
             decoding, where each position is predicted separately in a loop.
+        positions: Tensor of shape `(sequence_length,)` or
+            `(batch_size, sequence_length)`. Custom positions for the input
+            sequence. If specified, this tensor will be used to
+            compute the rotary embedding, and the `start_index` argument will
+            be ignored. This is useful for cases with non-standard positions.

     Examples:
@@ -76,6 +81,11 @@ def __init__(
         self.built = True

     def call(self, inputs, start_index=0, positions=None):
+        # Take care of unbatched `positions`.
+        if positions is not None:
+            if len(ops.shape(positions)) == 1:
+                positions = ops.expand_dims(positions, axis=0)
+
         inputs = ops.moveaxis(
             inputs, (self.feature_axis, self.sequence_axis), (-1, 1)
         )
@@ -103,6 +113,7 @@ def _compute_positions(self, inputs, start_index=0):
         return positions + ops.cast(start_index, dtype="float32")

     def _compute_cos_sin_embedding(self, inputs, start_index=0, positions=None):
+        batch_axis = 0
         feature_axis = len(inputs.shape) - 1
         sequence_axis = 1

@@ -111,21 +122,20 @@ def _compute_cos_sin_embedding(self, inputs, start_index=0, positions=None):

         if positions is None:
             positions = self._compute_positions(inputs, start_index)
+            positions = ops.expand_dims(positions, axis=batch_axis)
         else:
             positions = ops.cast(positions, "float32")
-
         positions = positions / ops.cast(self.scaling_factor, "float32")
-        freq = ops.einsum("i,j->ij", positions, inverse_freq)
+
+        freq = ops.einsum("bi,j->bij", positions, inverse_freq)
+
         embedding = ops.stack((freq, freq), axis=-2)
         embedding = ops.reshape(
             embedding, (*ops.shape(freq)[:-1], ops.shape(freq)[-1] * 2)
         )

-        # Reshape the embedding to be broadcastable with input shape.
-        if feature_axis < sequence_axis:
-            embedding = ops.transpose(embedding)
         for axis in range(len(inputs.shape)):
-            if axis != sequence_axis and axis != feature_axis:
+            if axis not in (batch_axis, sequence_axis, feature_axis):
                 embedding = ops.expand_dims(embedding, axis)

         cos_emb = ops.cast(ops.cos(embedding), self.compute_dtype)
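
For illustration only (not part of the commit), a minimal sketch of passing batched positions to the rotary layer; it assumes the layer is exposed as `keras_hub.layers.RotaryEmbedding`:

```python
import numpy as np

from keras_hub.layers import RotaryEmbedding  # assumed public export

# Query/key tensor with shape
# (batch_size=2, sequence_length=2, num_heads=1, head_dim=16).
x = np.random.standard_normal(size=(2, 2, 1, 16)).astype("float32")
layer = RotaryEmbedding()

# A distinct position per batch element and timestep; a 1D tensor of shape
# (sequence_length,) is also accepted and broadcast over the batch.
positions = np.array([[0.0, 0.0], [0.0, 1.0]], dtype="float32")
rotated = layer(x, positions=positions)
```
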
keras_hub/src/layers/modeling/rotary_embedding_test.py

Lines changed: 43 additions & 3 deletions
@@ -107,7 +107,7 @@ def test_float16_dtype(self):
         # output dtype for this layer should be float16.
         self.assertEqual(outputs.dtype, "float16")

-    def test_positions_array(self):
+    def test_positions_1d_array(self):
         rng = np.random.default_rng(0)
         x = rng.standard_normal(size=(1, 2, 1, 16)).astype(np.float32)
         positions = ops.cast([0, 0], "float32")
@@ -152,9 +152,49 @@ def test_positions_array(self):
         # fmt: on

         layer = RotaryEmbedding()
-        got = layer(x, positions=positions)
+        output = layer(x, positions=positions)

-        np.testing.assert_allclose(expected, ops.convert_to_numpy(got))
+        np.testing.assert_allclose(expected, ops.convert_to_numpy(output))
+
+    def test_positions_2d_array(self):
+        rng = np.random.default_rng(0)
+        x = rng.standard_normal(size=(2, 2, 1, 16)).astype(np.float32)
+        positions = ops.cast([[0, 0], [0, 1]], "float32")
+
+        # fmt: off
+        expected = np.array(
+            [
+                [
+                    [[0.12573022, -0.13210486, 0.64042264, 0.10490011,
+                      -0.5356694, 0.36159506, 1.304, 0.94708097,
+                      -0.70373523, -1.2654215, -0.62327445, 0.04132598,
+                      -2.3250308, -0.21879166, -1.245911, -0.7322674]],
+                    [[-0.544259, -0.31630015, 0.41163054, 1.0425134,
+                      -0.12853466, 1.3664634, -0.6651947, 0.35151008,
+                      0.90347016, 0.0940123, -0.7434993, -0.9217254,
+                      -0.45772582, 0.22019513, -1.0096182, -0.20917557]
+                    ]
+                ],
+                [
+                    [[-0.159225017, 0.540845573, 0.214659125, 0.355372697,
+                      -0.653828621, -0.129613638, 0.783975482, 1.49343109,
+                      -1.25906551, 1.51392376, 1.34587538, 0.781311393,
+                      0.264455616, -0.313922822, 1.45802069, 1.96025836]],
+                    [[0.611709595, 1.03343689, 0.47380957, -1.18679309,
+                      -8.96309502e-05, 0.660170913, -1.29010022, 0.395278841,
+                      1.74827969, 1.07050526, -1.14252377, -0.699575782,
+                      -0.436457992, -1.1677202, 1.73807859, -0.495785743]
+                    ]
+                ]
+            ],
+            dtype=np.float32
+        )  # noqa
+        # fmt: on
+
+        layer = RotaryEmbedding()
+        output = layer(x, positions=positions)
+
+        self.assertAllClose(output, expected, rtol=1e-5, atol=1e-5)

     def test_rope_scaling(self):
         # Reference values computed from Huggingface llama implementation
keras_hub/src/layers/modeling/sine_position_encoding.py

Lines changed: 21 additions & 8 deletions
@@ -30,6 +30,11 @@ class SinePositionEncoding(keras.layers.Layer):
         start_index: An integer or integer tensor. The starting position to
             compute the encoding from. This is useful during cached decoding,
             where each position is predicted separately in a loop.
+        positions: Tensor of shape `(sequence_length,)` or
+            `(batch_size, sequence_length)`. Custom positions for the input
+            sequence. If specified, this tensor will be used to
+            compute the position embedding, and the `start_index` argument will
+            be ignored. This is useful for cases with non-standard positions.

     Example:
@@ -58,27 +63,35 @@ def __init__(
         self.max_wavelength = max_wavelength
         self.built = True

-    def call(self, inputs, start_index=0):
+    def call(self, inputs, start_index=0, positions=None):
         shape = ops.shape(inputs)
         seq_length = shape[-2]
         hidden_size = shape[-1]
-        positions = ops.arange(seq_length)
-        positions = ops.cast(positions + start_index, self.compute_dtype)
+
+        if positions is None:
+            positions = ops.arange(seq_length)
+            positions = ops.cast(positions + start_index, self.compute_dtype)
+
+        # Take care of unbatched `positions`.
+        if len(ops.shape(positions)) == 1:
+            positions = ops.expand_dims(positions, axis=0)
+
         min_freq = ops.cast(1 / self.max_wavelength, dtype=self.compute_dtype)
         timescales = ops.power(
             min_freq,
             ops.cast(2 * (ops.arange(hidden_size) // 2), self.compute_dtype)
             / ops.cast(hidden_size, self.compute_dtype),
         )
-        angles = ops.expand_dims(positions, 1) * ops.expand_dims(timescales, 0)
+        angles = ops.einsum("bi,j->bij", positions, timescales)
+
         # even indices are sine, odd are cosine
         cos_mask = ops.cast(ops.arange(hidden_size) % 2, self.compute_dtype)
         sin_mask = 1 - cos_mask
-        # embedding shape is [seq_length, hidden_size]
-        positional_encodings = (
-            ops.sin(angles) * sin_mask + ops.cos(angles) * cos_mask
-        )

+        # embedding shape is `[bsz (or 1), seq_length, hidden_size]`.
+        positional_encodings = ops.einsum(
+            "bij,j->bij", ops.sin(angles), sin_mask
+        ) + ops.einsum("bij,j->bij", ops.cos(angles), cos_mask)
         return ops.broadcast_to(positional_encodings, shape)

     def get_config(self):
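
For illustration only (not part of the commit), a minimal sketch of passing custom positions to the sinusoidal encoding layer; it assumes the layer is exposed as `keras_hub.layers.SinePositionEncoding` and mirrors the shapes used in the new test:

```python
import numpy as np

from keras_hub.layers import SinePositionEncoding  # assumed public export

# Inputs with shape (batch_size=2, sequence_length=2, feature_size=4).
inputs = np.random.uniform(size=(2, 2, 4)).astype("float32")
layer = SinePositionEncoding()

# Row 0 is encoded at positions [0, 1]; row 1 at [1, 0]. When `positions`
# is given, `start_index` is ignored.
positions = np.array([[0, 1], [1, 0]], dtype="float32")
encodings = layer(inputs, positions=positions)
```
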
keras_hub/src/layers/modeling/sine_position_encoding_test.py

Lines changed: 13 additions & 0 deletions
@@ -94,3 +94,16 @@ def test_start_index(self):
                 sequential_output, (0, i, 0), parial_output
             )
         self.assertAllClose(full_output, sequential_output)
+
+    def test_positions(self):
+        batch_size, seq_length, feature_size = 2, 2, 4
+        data = random.uniform(shape=(batch_size, seq_length, feature_size))
+        positions = ops.array([[0, 1], [1, 0]])
+
+        layer = SinePositionEncoding()
+        output = layer(data, positions=positions)
+
+        pos_0 = [0.0, 1.0, 0.0, 1.0]
+        pos_1 = [0.84147, 0.54030, 0.009999, 0.99995]
+        expected = [[pos_0, pos_1], [pos_1, pos_0]]
+        self.assertAllClose(expected, output, rtol=1e-5, atol=1e-5)
keras_hub/src/layers/modeling/token_and_position_embedding.py

Lines changed: 2 additions & 1 deletion
@@ -120,11 +120,12 @@ def get_config(self):
         )
         return config

-    def call(self, inputs, start_index=0):
+    def call(self, inputs, start_index=0, positions=None):
         embedded_tokens = self.token_embedding(inputs)
         embedded_positions = self.position_embedding(
             embedded_tokens,
             start_index=start_index,
+            positions=positions,
         )
         outputs = embedded_tokens + embedded_positions
         return outputs
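
For illustration only (not part of the commit), a minimal sketch showing that `positions` is simply forwarded to the wrapped `PositionEmbedding`; it assumes the layer is exposed as `keras_hub.layers.TokenAndPositionEmbedding` with its usual `vocabulary_size`, `sequence_length`, and `embedding_dim` arguments:

```python
import numpy as np

from keras_hub.layers import TokenAndPositionEmbedding  # assumed public export

# Token ids with shape (batch_size=2, sequence_length=4).
token_ids = np.array([[3, 8, 8, 1], [5, 2, 0, 0]])

layer = TokenAndPositionEmbedding(
    vocabulary_size=10,
    sequence_length=4,
    embedding_dim=16,
)

# Custom positions are passed straight through to the position embedding,
# overriding `start_index` for each sequence.
positions = np.array([[0, 1, 2, 3], [0, 1, 0, 0]])
outputs = layer(token_ids, positions=positions)
```
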