Convert TF and Numpy ops in whisper_audio_convert.py to Keras Ops #2225

Open · wants to merge 3 commits into base: master
292 changes: 204 additions & 88 deletions keras_hub/src/models/whisper/whisper_audio_converter.py
@@ -1,4 +1,5 @@
import numpy as np
import keras
import keras.ops as ops

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.layers.preprocessing.audio_converter import AudioConverter
@@ -33,23 +34,6 @@ class WhisperAudioConverter(AudioConverter):
max_audio_length: int. The length of each audio chunk in
seconds. The input audio tensor will be padded/trimmed to
`max_audio_length * sampling_rate`. Defaults to `30`.

Examples:
```python
audio_tensor = tf.ones((8000,), dtype="float32")

# Compute the log-mel spectrogram.
audio_converter = keras_hub.layers.WhisperAudioConverter.from_preset(
"whisper_base_en",
)
audio_converter(audio_tensor)

# Compute the log-mel spectrogram for a batch of audio tensors.
audio_tensor_1 = tf.ones((8000,), dtype="float32")
audio_tensor_2 = tf.ones((10000,), dtype="float32")
audio_tensor = tf.ragged.stack([audio_tensor_1, audio_tensor_2], axis=0)
audio_converter(audio_tensor)
```
"""

backbone_cls = WhisperBackbone
@@ -84,33 +68,34 @@ def audio_shape(self):
"""Returns the preprocessed size of a single audio sample."""
return (self.max_audio_length, self.num_mels)

def _get_rfftfreq_keras(self):
"""Keras-ops equivalent of `np.fft.rfftfreq(n, d)`."""
n = self.num_fft_bins
d = 1.0 / self.sampling_rate

if n % 2 == 0:
freqs = ops.arange(0, n // 2 + 1, dtype="float32") / (d * n)
else:
freqs = ops.arange(0, (n - 1) // 2 + 1, dtype="float32") / (d * n)

return freqs
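This helper mirrors `np.fft.rfftfreq`; a quick standalone check against the NumPy reference, using Whisper's usual `n=400` at 16 kHz as an assumed example:

```python
import numpy as np
from keras import ops

n, sampling_rate = 400, 16000
d = 1.0 / sampling_rate
# Same formula as the even branch above: k / (d * n) for k = 0..n//2.
keras_freqs = ops.arange(0, n // 2 + 1, dtype="float32") / (d * n)
np.testing.assert_allclose(
    ops.convert_to_numpy(keras_freqs),
    np.fft.rfftfreq(n=n, d=d).astype("float32"),
    rtol=1e-6,
)
```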

def _get_mel_filters(self):
"""
Adapted from Hugging Face
(https://github.com/huggingface/transformers/blob/v4.27.1/src/transformers/models/whisper/feature_extraction_whisper.py#L86)
"""

# TODO: Convert to TensorFlow ops (if possible).

dtype = np.float32
dtype = self.compute_dtype # Use the class's dtype
# Initialize the weights
weights = np.zeros(
weights = ops.zeros(
(self.num_mels, int(1 + self.num_fft_bins // 2)), dtype=dtype
)

# Center freqs of each FFT bin
fftfreqs = np.fft.rfftfreq(
n=self.num_fft_bins, d=1.0 / self.sampling_rate
)

fftfreqs = self._get_rfftfreq_keras()
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = 0.0
max_mel = 45.245640471924965

mels = np.linspace(min_mel, max_mel, self.num_mels + 2)

mels = np.asanyarray(mels)

mels = ops.linspace(min_mel, max_mel, self.num_mels + 2)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
@@ -119,118 +104,249 @@ def _get_mel_filters(self):
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region

logstep = ops.log(6.4) / 27.0 # step size for log region
# If we have vector data, vectorize
log_t = mels >= min_log_mel
freqs[log_t] = min_log_hz * np.exp(
logstep * (mels[log_t] - min_log_mel)
freqs = ops.where(
log_t, min_log_hz * ops.exp(logstep * (mels - min_log_mel)), freqs
)

mel_f = freqs

fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
fdiff = ops.diff(mel_f)
# Outer difference via broadcasting, replacing `np.subtract.outer`.
ramps = ops.expand_dims(mel_f, axis=1) - fftfreqs

weights_list = []
for i in range(self.num_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]

# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
weights_i = ops.maximum(0, ops.minimum(lower, upper))
weights_list.append(weights_i)

weights = ops.stack(weights_list)

# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2 : self.num_mels + 2] - mel_f[: self.num_mels])
weights *= enorm[:, np.newaxis]
weights *= ops.expand_dims(enorm, axis=1)

weights = np.transpose(weights)
return tf.constant(weights, dtype=self.compute_dtype)
weights = ops.transpose(weights)
return weights
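The ported filter bank should still match the Slaney-style reference that the removed NumPy code implemented. One way to sanity-check it, assuming Whisper-like constructor arguments and that `librosa` is installed:

```python
import librosa
import numpy as np
import keras_hub
from keras import ops

converter = keras_hub.layers.WhisperAudioConverter(
    num_mels=80, num_fft_bins=400, stride=160, sampling_rate=16000
)
ref = librosa.filters.mel(
    sr=16000, n_fft=400, n_mels=80, htk=False, norm="slaney"
)
# The layer keeps the bank transposed so a power spectrogram of shape
# (frames, fft_bins) can be right-multiplied into (frames, mels).
np.testing.assert_allclose(
    ops.convert_to_numpy(converter._get_mel_filters()), ref.T, atol=1e-5
)
```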

def _extract_audio_features(self, audio):
audio = tf.cast(audio, self.compute_dtype)
audio = ops.cast(audio, self.compute_dtype)
# Use "reflection" padding - `tf.signal.stft` uses symmetric padding
# internally.
audio = tf.pad(
audio = ops.pad(
audio,
paddings=[[0, 0], [self.num_fft_bins // 2, self.num_fft_bins // 2]],
mode="REFLECT",
pad_width=[
[0, 0],
[self.num_fft_bins // 2, self.num_fft_bins // 2],
],
mode="reflect",
)

# Compute the mel spectrogram.
stft = tf.signal.stft(
stft = ops.stft(
audio,
frame_length=self.num_fft_bins,
frame_step=self.stride,
sequence_length=self.num_fft_bins,
sequence_stride=self.stride,
fft_length=self.num_fft_bins,
center=False,
)
magnitudes = tf.square(tf.abs(stft[:, :-1, :]))
real, imag = stft  # `ops.stft` returns a (real, imag) pair.
# Power spectrogram: real**2 + imag**2, dropping the final frame to
# match the previous `tf.signal.stft`-based output length.
magnitudes = ops.square(real[:, :-1, :]) + ops.square(imag[:, :-1, :])
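For reference, `keras.ops.stft` returns a `(real, imag)` pair rather than a complex tensor, which is why the magnitude is assembled from the two parts; a minimal standalone sketch:

```python
import numpy as np
from keras import ops

x = np.random.randn(1, 1600).astype("float32")
real, imag = ops.stft(
    x, sequence_length=400, sequence_stride=160, fft_length=400, center=False
)
# Power spectrogram: |STFT|^2 = real^2 + imag^2.
power = ops.square(real) + ops.square(imag)
print(ops.shape(power))  # (1, 8, 201): 8 frames, 400 // 2 + 1 bins.
```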

mel_spec = tf.matmul(
mel_spec = ops.matmul(
magnitudes,
self.mel_filters,
)

def tf_log10(x):
"""Computes log base 10 of input tensor using TensorFlow."""
numerator = tf.math.log(x)
denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
numerator = ops.log(x)
denominator = ops.log(
ops.cast(ops.array(10), dtype=numerator.dtype)
)
return numerator / denominator

# Clamp the values to a minimum value of 1e-10. This is done to avoid
# taking the log of 0, i.e., for numerical stability.
mel_spec = tf.maximum(mel_spec, 1e-10)
mel_spec = ops.maximum(mel_spec, 1e-10)

# Calculate the log mel spectrogram.
log_spec = tf_log10(mel_spec)
# Dynamic range compression.
log_spec_shape = tf.shape(log_spec)
max_value_minus_eight = tf.math.subtract(
tf.math.reduce_max(log_spec, axis=[1, 2]),
tf.cast(8, dtype=log_spec.dtype),
log_spec_shape = ops.shape(log_spec)
max_value_minus_eight = ops.subtract(
ops.max(log_spec, axis=[1, 2]),
ops.cast(8, dtype=log_spec.dtype),
)
max_value_minus_eight = tf.expand_dims(max_value_minus_eight, axis=1)
max_value_minus_eight = tf.repeat(
max_value_minus_eight = ops.expand_dims(max_value_minus_eight, axis=1)
max_value_minus_eight = ops.repeat(
max_value_minus_eight,
repeats=log_spec_shape[1] * log_spec_shape[2],
axis=1,
)
max_value_minus_eight = tf.reshape(
max_value_minus_eight, shape=log_spec_shape
max_value_minus_eight = ops.reshape(
max_value_minus_eight, newshape=log_spec_shape
)
log_spec = tf.maximum(log_spec, max_value_minus_eight)
log_spec = ops.maximum(log_spec, max_value_minus_eight)
# Normalization.
type_cast_four = tf.cast(4, dtype=log_spec.dtype)
log_spec = tf.math.divide(
tf.math.add(log_spec, type_cast_four),
type_cast_four = ops.cast(4, dtype=log_spec.dtype)
log_spec = ops.divide(
ops.add(log_spec, type_cast_four),
type_cast_four,
)

return log_spec
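The expand/repeat/reshape sequence above broadcasts the per-sample maximum by hand. An equivalent, arguably simpler formulation using a `keepdims` reduction would be the following sketch (not part of this diff):

```python
from keras import ops

def dynamic_range_compress(log_spec):
    # Clamp to within 8 of each sample's maximum, then apply Whisper's
    # affine normalization (x + 4) / 4, relying on broadcasting.
    floor = ops.max(log_spec, axis=(1, 2), keepdims=True) - 8.0
    return (ops.maximum(log_spec, floor) + 4.0) / 4.0
```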

def call(self, audio):
if not isinstance(audio, (tf.Tensor, tf.RaggedTensor)):
audio = tf.convert_to_tensor(audio)
def call(
self,
inputs,
padding=None,
max_length=None,
pad_to_multiple_of=None,
):
input_shape = keras.ops.shape(inputs)
input_rank = (
len(input_shape)
if isinstance(input_shape, (list, tuple))
else input_shape.rank
)
rank_1_input = input_rank == 1

rank_1_input = audio.shape.rank == 1
if rank_1_input:
audio = tf.expand_dims(audio, 0)

# Convert the tensor to a Ragged Tensor.
if isinstance(audio, tf.Tensor):
audio = tf.RaggedTensor.from_tensor(audio)

# Pad audio.
audio_shape = audio.shape.as_list()
audio_shape[-1] = self.num_samples
audio = audio.to_tensor(shape=audio_shape)

# Find the log mel spectrogram.
log_spec = self._extract_audio_features(audio)
inputs = ops.expand_dims(inputs, 0)
# Convert to dense tensor with proper padding/truncation
processed_inputs = self.variable_length_inputs(
inputs, padding, max_length, pad_to_multiple_of
)
# Extract features
log_spec = self._extract_audio_features(processed_inputs)
if rank_1_input:
log_spec = tf.squeeze(log_spec, 0)
log_spec = ops.squeeze(log_spec, 0)

return log_spec
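With the new signature, padding and truncation can be requested per call; a usage sketch, assuming Whisper-like constructor defaults and that the layer defines the `padding_value` used in the padding branch below:

```python
import numpy as np
import keras_hub

converter = keras_hub.layers.WhisperAudioConverter(
    num_mels=80, num_fft_bins=400, stride=160, sampling_rate=16000
)
audio = np.ones((2, 50_000), dtype="float32")

# Default: pad or trim each clip to max_audio_length * sampling_rate.
features = converter(audio)

# Explicit target length, rounded up to a multiple of 1600 samples.
features = converter(
    audio, padding="max_length", max_length=60_000, pad_to_multiple_of=1600
)
```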

# Handles variable-length inputs via padding or truncation.
def variable_length_inputs(
self, inputs, padding=None, max_length=None, pad_to_multiple_of=None
):
"""Handles variable length inputs with padding or truncation."""

# Determine the appropriate target length
if padding == "max_length" and max_length is not None:
target_length = max_length
else:
# Use default max_audio_length
target_length = self.num_samples

if pad_to_multiple_of:
target_length = (
(target_length + pad_to_multiple_of - 1) // pad_to_multiple_of
) * pad_to_multiple_of

# Get current shape and length
audio_shape = keras.ops.shape(inputs)
audio_length = audio_shape[1]

if padding == "max_length" and max_length is not None:
is_padding_required = keras.ops.less(audio_length, target_length)
is_trunc_required = keras.ops.greater(audio_length, target_length)

def pad_fn():
padding_amount = target_length - audio_length
paddings = [[0, 0], [0, padding_amount]]
return keras.ops.pad(
inputs,
paddings,
mode="constant",
constant_values=self.padding_value,
)

def trunc_fn():
return keras.ops.slice(
inputs,
[0, 0],
[-1, target_length],
)

# Check if we're in symbolic execution
is_tf_symbolic = (
tf is not None
and hasattr(inputs, "graph")
and hasattr(inputs.graph, "as_graph_def")
)
use_tf_graph_ops = tf is not None and is_tf_symbolic

if use_tf_graph_ops and keras.config.backend() != "torch":
processed_inputs = tf.cond(
is_padding_required,
pad_fn,
lambda: tf.cond(
is_trunc_required, trunc_fn, lambda: inputs
),
)
else:
is_padding_bool = keras.ops.convert_to_numpy(
is_padding_required
)
is_trunc_bool = keras.ops.convert_to_numpy(is_trunc_required)

if is_padding_bool:
padding_amount = target_length - audio_length
paddings = [[0, 0], [0, padding_amount]]
processed_inputs = keras.ops.pad(
inputs,
paddings,
mode="constant",
constant_values=self.padding_value,
)
elif is_trunc_bool:
processed_inputs = inputs[:, :target_length]
else:
processed_inputs = inputs
else:
# No explicit padding - just pad/truncate to default max length
is_padding_required = keras.ops.less(audio_length, target_length)
is_trunc_required = keras.ops.greater(audio_length, target_length)

# Use eager execution approach for simplicity
is_padding_bool = keras.ops.convert_to_numpy(is_padding_required)
is_trunc_bool = keras.ops.convert_to_numpy(is_trunc_required)

if is_padding_bool:
padding_amount = target_length - audio_length
paddings = [[0, 0], [0, padding_amount]]
processed_inputs = keras.ops.pad(
inputs,
paddings,
mode="constant",
constant_values=self.padding_value,
)
elif is_trunc_bool:
processed_inputs = inputs[:, :target_length]
else:
processed_inputs = inputs

return processed_inputs
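The `pad_to_multiple_of` adjustment above is plain ceil division; for example:

```python
# Round 60_000 samples up to the nearest multiple of 1_600.
target_length, pad_to_multiple_of = 60_000, 1_600
rounded = (
    (target_length + pad_to_multiple_of - 1) // pad_to_multiple_of
) * pad_to_multiple_of
assert rounded == 60_800
```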

def compute_output_shape(self, input_shape):
"""Compute output shape for variable-length inputs."""

if len(input_shape) == 1:
# For single audio sample - returns 2D shape (frames, mels)
num_frames = (self.num_samples + self.stride - 1) // self.stride
return (num_frames, self.num_mels)
elif len(input_shape) == 2:
# For batch of audio samples - returns 3D shape (batch, frames, mels)
batch_size = input_shape[0]
num_frames = (self.num_samples + self.stride - 1) // self.stride
return (batch_size, num_frames, self.num_mels)
else:
raise ValueError("Input shape must be rank 1 or 2.")
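With Whisper's usual configuration (30 s clips at 16 kHz, stride 160, 80 mel bins), this works out to the familiar `(3000, 80)` feature shape:

```python
num_samples, stride, num_mels = 30 * 16_000, 160, 80
num_frames = (num_samples + stride - 1) // stride  # Ceil division.
assert (num_frames, num_mels) == (3_000, 80)
```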

def get_config(self):
config = super().get_config()
config.update(