Convert TF and Numpy ops in whisper_audio_convert.py to Keras Ops #2225

Open · wants to merge 3 commits into base: master
292 changes: 204 additions & 88 deletions keras_hub/src/models/whisper/whisper_audio_converter.py
@@ -1,4 +1,5 @@
import numpy as np
import keras
import keras.ops as ops

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.layers.preprocessing.audio_converter import AudioConverter
@@ -33,23 +34,6 @@ class WhisperAudioConverter(AudioConverter):
max_audio_length: int. The length of each audio chunk in
seconds. The input audio tensor will be padded/trimmed to
`max_audio_length * sampling_rate`. Defaults to `30`.

Examples:
```python
audio_tensor = tf.ones((8000,), dtype="float32")

# Compute the log-mel spectrogram.
audio_converter = keras_hub.layers.WhisperAudioConverter.from_preset(
"whisper_base_en",
)
audio_converter(audio_tensor)

# Compute the log-mel spectrogram for a batch of audio tensors.
audio_tensor_1 = tf.ones((8000,), dtype="float32")
audio_tensor_2 = tf.ones((10000,), dtype="float32")
audio_tensor = tf.ragged.stack([audio_tensor_1, audio_tensor_2], axis=0)
audio_converter(audio_tensor)
```
"""

backbone_cls = WhisperBackbone
@@ -84,33 +68,34 @@ def audio_shape(self):
"""Returns the preprocessed size of a single audio sample."""
return (self.max_audio_length, self.num_mels)

def _get_rfftfreq_keras(self):
"""Keras-ops equivalent of `np.fft.rfftfreq(n, d)`."""
n = self.num_fft_bins
d = 1.0 / self.sampling_rate

if n % 2 == 0:
freqs = ops.arange(0, n // 2 + 1, dtype="float32") / (d * n)
else:
freqs = ops.arange(0, (n - 1) // 2 + 1, dtype="float32") / (d * n)

return freqs
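This helper mirrors `np.fft.rfftfreq`; a quick standalone check against the NumPy reference, using Whisper's usual `n=400` at 16 kHz as an assumed example:

```python
import numpy as np
from keras import ops

n, sampling_rate = 400, 16000
d = 1.0 / sampling_rate
# Same formula as the even branch above: k / (d * n) for k = 0..n//2.
keras_freqs = ops.arange(0, n // 2 + 1, dtype="float32") / (d * n)
np.testing.assert_allclose(
    ops.convert_to_numpy(keras_freqs),
    np.fft.rfftfreq(n=n, d=d).astype("float32"),
    rtol=1e-6,
)
```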

def _get_mel_filters(self):
"""
Adapted from Hugging Face
(https://github.com/huggingface/transformers/blob/v4.27.1/src/transformers/models/whisper/feature_extraction_whisper.py#L86)
"""

# TODO: Convert to TensorFlow ops (if possible).

dtype = np.float32
dtype = self.compute_dtype # Use the class's dtype
# Initialize the weights
weights = np.zeros(
weights = ops.zeros(
(self.num_mels, int(1 + self.num_fft_bins // 2)), dtype=dtype
)

# Center freqs of each FFT bin
fftfreqs = np.fft.rfftfreq(
n=self.num_fft_bins, d=1.0 / self.sampling_rate
)

fftfreqs = self._get_rfftfreq_keras()
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = 0.0
max_mel = 45.245640471924965

mels = np.linspace(min_mel, max_mel, self.num_mels + 2)

mels = np.asanyarray(mels)

mels = ops.linspace(min_mel, max_mel, self.num_mels + 2)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
@@ -119,118 +104,249 @@ def _get_mel_filters(self):
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region

logstep = ops.log(6.4) / 27.0 # step size for log region
# If we have vector data, vectorize
log_t = mels >= min_log_mel
freqs[log_t] = min_log_hz * np.exp(
logstep * (mels[log_t] - min_log_mel)
freqs = ops.where(
log_t, min_log_hz * ops.exp(logstep * (mels - min_log_mel)), freqs
)

mel_f = freqs

fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
fdiff = ops.diff(mel_f)
# Outer difference via broadcasting, replacing `np.subtract.outer`.
ramps = ops.expand_dims(mel_f, axis=1) - fftfreqs

weights_list = []
for i in range(self.num_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]

# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
weights_i = ops.maximum(0, ops.minimum(lower, upper))
weights_list.append(weights_i)

weights = ops.stack(weights_list)

# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2 : self.num_mels + 2] - mel_f[: self.num_mels])
weights *= enorm[:, np.newaxis]
weights *= ops.expand_dims(enorm, axis=1)

weights = np.transpose(weights)
return tf.constant(weights, dtype=self.compute_dtype)
weights = ops.transpose(weights)
return weights
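The ported filter bank should still match the Slaney-style reference that the removed NumPy code implemented. One way to sanity-check it, assuming Whisper-like constructor arguments and that `librosa` is installed:

```python
import librosa
import numpy as np
import keras_hub
from keras import ops

converter = keras_hub.layers.WhisperAudioConverter(
    num_mels=80, num_fft_bins=400, stride=160, sampling_rate=16000
)
ref = librosa.filters.mel(
    sr=16000, n_fft=400, n_mels=80, htk=False, norm="slaney"
)
# The layer keeps the bank transposed so a power spectrogram of shape
# (frames, fft_bins) can be right-multiplied into (frames, mels).
np.testing.assert_allclose(
    ops.convert_to_numpy(converter._get_mel_filters()), ref.T, atol=1e-5
)
```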

def _extract_audio_features(self, audio):
audio = tf.cast(audio, self.compute_dtype)
audio = ops.cast(audio, self.compute_dtype)
# Use "reflection" padding - `tf.signal.stft` uses symmetric padding
# internally.
audio = tf.pad(
audio = ops.pad(
audio,
paddings=[[0, 0], [self.num_fft_bins // 2, self.num_fft_bins // 2]],
mode="REFLECT",
pad_width=[
[0, 0],
[self.num_fft_bins // 2, self.num_fft_bins // 2],
],
mode="reflect",
)

# Compute the mel spectrogram.
stft = tf.signal.stft(
stft = ops.stft(
audio,
frame_length=self.num_fft_bins,
frame_step=self.stride,
sequence_length=self.num_fft_bins,
sequence_stride=self.stride,
fft_length=self.num_fft_bins,
center=False,
)
magnitudes = tf.square(tf.abs(stft[:, :-1, :]))
real, imag = stft  # `ops.stft` returns a (real, imag) pair.
# Power spectrogram: real**2 + imag**2, dropping the final frame to
# match the previous `tf.signal.stft`-based output length.
magnitudes = ops.square(real[:, :-1, :]) + ops.square(imag[:, :-1, :])
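For reference, `keras.ops.stft` returns a `(real, imag)` pair rather than a complex tensor, which is why the magnitude is assembled from the two parts; a minimal standalone sketch:

```python
import numpy as np
from keras import ops

x = np.random.randn(1, 1600).astype("float32")
real, imag = ops.stft(
    x, sequence_length=400, sequence_stride=160, fft_length=400, center=False
)
# Power spectrogram: |STFT|^2 = real^2 + imag^2.
power = ops.square(real) + ops.square(imag)
print(ops.shape(power))  # (1, 8, 201): 8 frames, 400 // 2 + 1 bins.
```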

mel_spec = tf.matmul(
mel_spec = ops.matmul(
magnitudes,
self.mel_filters,
)

def tf_log10(x):
"""Computes log base 10 of input tensor using TensorFlow."""
numerator = tf.math.log(x)
denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
numerator = ops.log(x)
denominator = ops.log(
ops.cast(ops.array(10), dtype=numerator.dtype)
)
return numerator / denominator

# Clamp the values to a minimum value of 1e-10. This is done to avoid
# taking the log of 0, i.e., for numerical stability.
mel_spec = tf.maximum(mel_spec, 1e-10)
mel_spec = ops.maximum(mel_spec, 1e-10)

# Calculate the log mel spectrogram.
log_spec = tf_log10(mel_spec)
# Dynamic range compression.
log_spec_shape = tf.shape(log_spec)
max_value_minus_eight = tf.math.subtract(
tf.math.reduce_max(log_spec, axis=[1, 2]),
tf.cast(8, dtype=log_spec.dtype),
log_spec_shape = ops.shape(log_spec)
max_value_minus_eight = ops.subtract(
ops.max(log_spec, axis=[1, 2]),
ops.cast(8, dtype=log_spec.dtype),
)
max_value_minus_eight = tf.expand_dims(max_value_minus_eight, axis=1)
max_value_minus_eight = tf.repeat(
max_value_minus_eight = ops.expand_dims(max_value_minus_eight, axis=1)
max_value_minus_eight = ops.repeat(
max_value_minus_eight,
repeats=log_spec_shape[1] * log_spec_shape[2],
axis=1,
)
max_value_minus_eight = tf.reshape(
max_value_minus_eight, shape=log_spec_shape
max_value_minus_eight = ops.reshape(
max_value_minus_eight, newshape=log_spec_shape
)
log_spec = tf.maximum(log_spec, max_value_minus_eight)
log_spec = ops.maximum(log_spec, max_value_minus_eight)
# Normalization.
type_cast_four = tf.cast(4, dtype=log_spec.dtype)
log_spec = tf.math.divide(
tf.math.add(log_spec, type_cast_four),
type_cast_four = ops.cast(4, dtype=log_spec.dtype)
log_spec = ops.divide(
ops.add(log_spec, type_cast_four),
type_cast_four,
)

return log_spec
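The expand/repeat/reshape sequence above broadcasts the per-sample maximum by hand. An equivalent, arguably simpler formulation using a `keepdims` reduction would be the following sketch (not part of this diff):

```python
from keras import ops

def dynamic_range_compress(log_spec):
    # Clamp to within 8 of each sample's maximum, then apply Whisper's
    # affine normalization (x + 4) / 4, relying on broadcasting.
    floor = ops.max(log_spec, axis=(1, 2), keepdims=True) - 8.0
    return (ops.maximum(log_spec, floor) + 4.0) / 4.0
```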

def call(self, audio):
if not isinstance(audio, (tf.Tensor, tf.RaggedTensor)):
audio = tf.convert_to_tensor(audio)
def call(
self,
inputs,
padding=None,
max_length=None,
pad_to_multiple_of=None,
):
input_shape = keras.ops.shape(inputs)
input_rank = (
len(input_shape)
if isinstance(input_shape, (list, tuple))
else input_shape.rank
)
rank_1_input = input_rank == 1

rank_1_input = audio.shape.rank == 1
if rank_1_input:
audio = tf.expand_dims(audio, 0)

# Convert the tensor to a Ragged Tensor.
if isinstance(audio, tf.Tensor):
audio = tf.RaggedTensor.from_tensor(audio)

# Pad audio.
audio_shape = audio.shape.as_list()
audio_shape[-1] = self.num_samples
audio = audio.to_tensor(shape=audio_shape)

# Find the log mel spectrogram.
log_spec = self._extract_audio_features(audio)
inputs = ops.expand_dims(inputs, 0)
# Convert to dense tensor with proper padding/truncation
processed_inputs = self.variable_length_inputs(
inputs, padding, max_length, pad_to_multiple_of
)
# Extract features
log_spec = self._extract_audio_features(processed_inputs)
if rank_1_input:
log_spec = tf.squeeze(log_spec, 0)
log_spec = ops.squeeze(log_spec, 0)

return log_spec
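With the new signature, padding and truncation can be requested per call; a usage sketch, assuming Whisper-like constructor defaults and that the layer defines the `padding_value` used in the padding branch below:

```python
import numpy as np
import keras_hub

converter = keras_hub.layers.WhisperAudioConverter(
    num_mels=80, num_fft_bins=400, stride=160, sampling_rate=16000
)
audio = np.ones((2, 50_000), dtype="float32")

# Default: pad or trim each clip to max_audio_length * sampling_rate.
features = converter(audio)

# Explicit target length, rounded up to a multiple of 1600 samples.
features = converter(
    audio, padding="max_length", max_length=60_000, pad_to_multiple_of=1600
)
```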

# Handles variable-length inputs via padding or truncation.
def variable_length_inputs(
self, inputs, padding=None, max_length=None, pad_to_multiple_of=None
):
"""Handles variable length inputs with padding or truncation."""

# Determine the appropriate target length
if padding == "max_length" and max_length is not None:
target_length = max_length
else:
# Use default max_audio_length
target_length = self.num_samples

if pad_to_multiple_of:
target_length = (
(target_length + pad_to_multiple_of - 1) // pad_to_multiple_of
) * pad_to_multiple_of

# Get current shape and length
audio_shape = keras.ops.shape(inputs)
audio_length = audio_shape[1]

if padding == "max_length" and max_length is not None:
is_padding_required = keras.ops.less(audio_length, target_length)
is_trunc_required = keras.ops.greater(audio_length, target_length)

def pad_fn():
padding_amount = target_length - audio_length
paddings = [[0, 0], [0, padding_amount]]
return keras.ops.pad(
inputs,
paddings,
mode="constant",
constant_values=self.padding_value,
)

def trunc_fn():
return keras.ops.slice(
inputs,
[0, 0],
[-1, target_length],
)

# Check if we're in symbolic execution
is_tf_symbolic = (
tf is not None
and hasattr(inputs, "graph")
and hasattr(inputs.graph, "as_graph_def")
)
use_tf_graph_ops = tf is not None and is_tf_symbolic

if use_tf_graph_ops and keras.config.backend() != "torch":
processed_inputs = tf.cond(
is_padding_required,
pad_fn,
lambda: tf.cond(
is_trunc_required, trunc_fn, lambda: inputs
),
)
else:
is_padding_bool = keras.ops.convert_to_numpy(
is_padding_required
)
is_trunc_bool = keras.ops.convert_to_numpy(is_trunc_required)

if is_padding_bool:
padding_amount = target_length - audio_length
paddings = [[0, 0], [0, padding_amount]]
processed_inputs = keras.ops.pad(
inputs,
paddings,
mode="constant",
constant_values=self.padding_value,
)
elif is_trunc_bool:
processed_inputs = inputs[:, :target_length]
else:
processed_inputs = inputs
else:
# No explicit padding - just pad/truncate to default max length
is_padding_required = keras.ops.less(audio_length, target_length)
is_trunc_required = keras.ops.greater(audio_length, target_length)

# Use eager execution approach for simplicity
is_padding_bool = keras.ops.convert_to_numpy(is_padding_required)
is_trunc_bool = keras.ops.convert_to_numpy(is_trunc_required)

if is_padding_bool:
padding_amount = target_length - audio_length
paddings = [[0, 0], [0, padding_amount]]
processed_inputs = keras.ops.pad(
inputs,
paddings,
mode="constant",
constant_values=self.padding_value,
)
elif is_trunc_bool:
processed_inputs = inputs[:, :target_length]
else:
processed_inputs = inputs

return processed_inputs
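The `pad_to_multiple_of` adjustment above is plain ceil division; for example:

```python
# Round 60_000 samples up to the nearest multiple of 1_600.
target_length, pad_to_multiple_of = 60_000, 1_600
rounded = (
    (target_length + pad_to_multiple_of - 1) // pad_to_multiple_of
) * pad_to_multiple_of
assert rounded == 60_800
```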

def compute_output_shape(self, input_shape):
"""Compute output shape for variable-length inputs."""

if len(input_shape) == 1:
# For single audio sample - returns 2D shape (frames, mels)
num_frames = (self.num_samples + self.stride - 1) // self.stride
return (num_frames, self.num_mels)
elif len(input_shape) == 2:
# For batch of audio samples - returns 3D shape (batch, frames, mels)
batch_size = input_shape[0]
num_frames = (self.num_samples + self.stride - 1) // self.stride
return (batch_size, num_frames, self.num_mels)
else:
raise ValueError("Input shape must be rank 1 or 2.")
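With Whisper's usual configuration (30 s clips at 16 kHz, stride 160, 80 mel bins), this works out to the familiar `(3000, 80)` feature shape:

```python
num_samples, stride, num_mels = 30 * 16_000, 160, 80
num_frames = (num_samples + stride - 1) // stride  # Ceil division.
assert (num_frames, num_mels) == (3_000, 80)
```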

def get_config(self):
config = super().get_config()
config.update(