From 2e25279e5382f591272f2b5d01286cc11d976f79 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 18 Jul 2025 19:57:20 +0000 Subject: [PATCH 01/25] Add torchcodec mock with wav loading and saving --- test/torchcodec/decoders.py | 17 +++++++++++++++++ test/torchcodec/encoders.py | 10 ++++++++++ 2 files changed, 27 insertions(+) create mode 100644 test/torchcodec/decoders.py create mode 100644 test/torchcodec/encoders.py diff --git a/test/torchcodec/decoders.py b/test/torchcodec/decoders.py new file mode 100644 index 0000000000..94f2d8c8c1 --- /dev/null +++ b/test/torchcodec/decoders.py @@ -0,0 +1,17 @@ +import test.torchaudio_unittest.common_utils.wav_utils as wav_utils + +class AudioDecoder: + def __init__(self, uri): + self.uri = uri + + def get_all_samples(self): + return wav_utils.load_wav(self.uri) + + +class AudioEncoder: + def __init__(self, data, sample_rate): + self.data = data + self.sample_rate = sample_rate + + def to_file(self, uri, bit_rate=None): + return wav_utils.save_wav(uri, self.data, self.sample_rate) diff --git a/test/torchcodec/encoders.py b/test/torchcodec/encoders.py new file mode 100644 index 0000000000..5e9cc54968 --- /dev/null +++ b/test/torchcodec/encoders.py @@ -0,0 +1,10 @@ +import torchaudio_unittest.common_utils.wav_utils as wav_utils + +class AudioEncoder: + def __init__(self, data, sample_rate): + print("BEING CALLED") + self.data = data + self.sample_rate = sample_rate + + def to_file(self, uri, bit_rate=None): + return wav_utils.save_wav(uri, self.data, self.sample_rate) From a3002211592397a4a4aa507f7ebd0626bd125231 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jul 2025 10:18:18 +0100 Subject: [PATCH 02/25] Let load and save rely on *_with_torchcodec --- src/torchaudio/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index e533cafe9d..1fde90b871 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -7,8 +7,6 @@ get_audio_backend as _get_audio_backend, info as _info, list_audio_backends as _list_audio_backends, - load, - save, set_audio_backend as _set_audio_backend, ) from ._torchcodec import load_with_torchcodec, save_with_torchcodec @@ -41,6 +39,13 @@ pass +def load(*args, **kwargs): + return load_with_torchcodec(*args, **kwargs) + +def save(*args, **kwargs): + return save_with_torchcodec(*args, **kwargs) + + __all__ = [ "AudioMetaData", "load", From 07e3b77f565d153ec3c8d6eb2cba3de93bd8c1dd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jul 2025 13:49:53 +0100 Subject: [PATCH 03/25] install torchcodec in doc job --- .github/workflows/build_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index e92c556218..f681e3b7ec 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From 92719d3abe1c206f8f3b0a6e3531a53e0ef30933 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 12 Aug 2025 19:53:00 +0000 Subject: [PATCH 04/25] Add docstring and arguments for load and save --- src/torchaudio/__init__.py | 177 ++++++++++++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 4 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 1fde90b871..ed4be65d6d 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -39,12 +39,181 @@ pass -def load(*args, **kwargs): - return load_with_torchcodec(*args, **kwargs) +def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. -def save(*args, **kwargs): - return save_with_torchcodec(*args, **kwargs) + .. note:: + This function supports the same API as :func:`~torchaudio.load`, and + relies on TorchCodec's decoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + In TorchAudio 2.9, :func:`~torchaudio.load` will be relying on + :func:`~torchaudio.load_with_torchcodec`. Note that some parameters of + :func:`~torchaudio.load`, like ``normalize``, ``buffer_size``, and + ``backend``, are ignored by :func:`~torchaudio.load_with_torchcodec`. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. + RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. + """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + +def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, +) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + This function supports the same API as :func:`~torchaudio.save`, and + relies on TorchCodec's encoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + In TorchAudio 2.9, :func:`~torchaudio.save` will be relying on + :func:`~torchaudio.save_with_torchcodec`. Note that some parameters of + :func:`~torchaudio.save`, like ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored by + are ignored by :func:`~torchaudio.save_with_torchcodec`. + + This function provides a TorchCodec-based alternative to torchaudio.save + with the same API. TorchCodec's AudioEncoder provides efficient encoding + with FFmpeg under the hood. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. + + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. + + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. + + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. + """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From 4a98ee5f36552ead8e3cf6bf143f7b4484dd897c Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 14:42:00 +0000 Subject: [PATCH 05/25] Revise docstring --- src/torchaudio/__init__.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index ed4be65d6d..37d20a76aa 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -53,16 +53,13 @@ def load( .. note:: - This function supports the same API as :func:`~torchaudio.load`, and - relies on TorchCodec's decoding capabilities under the hood. It is + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is provided for convenience, but we do recommend that you port your code to natively use ``torchcodec``'s ``AudioDecoder`` class for better performance: https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - In TorchAudio 2.9, :func:`~torchaudio.load` will be relying on - :func:`~torchaudio.load_with_torchcodec`. Note that some parameters of - :func:`~torchaudio.load`, like ``normalize``, ``buffer_size``, and - ``backend``, are ignored by :func:`~torchaudio.load_with_torchcodec`. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. Args: @@ -136,21 +133,14 @@ def save( .. note:: - This function supports the same API as :func:`~torchaudio.save`, and - relies on TorchCodec's encoding capabilities under the hood. It is - provided for convenience, but we do recommend that you port your code to + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to natively use ``torchcodec``'s ``AudioEncoder`` class for better performance: https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - In TorchAudio 2.9, :func:`~torchaudio.save` will be relying on - :func:`~torchaudio.save_with_torchcodec`. Note that some parameters of - :func:`~torchaudio.save`, like ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored by - are ignored by :func:`~torchaudio.save_with_torchcodec`. - - This function provides a TorchCodec-based alternative to torchaudio.save - with the same API. TorchCodec's AudioEncoder provides efficient encoding - with FFmpeg under the hood. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. Args: uri (path-like object): From 7b02754b407e42cca822d3d2ce5e7eeb60d2b01f Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 15:13:14 +0000 Subject: [PATCH 06/25] Add typing imports --- src/torchaudio/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 37d20a76aa..60c8ceb7fe 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -1,4 +1,7 @@ from torchaudio._internal.module_utils import dropping_io_support, dropping_class_io_support +from typing import Union, BinaryIO, Optional, Tuple +import os +import torch # Initialize extension and backend first from . import _extension # noqa # usort: skip From 74edc0a8dbe942aae3f04924d1743f4da49800cb Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 16:00:40 +0000 Subject: [PATCH 07/25] Try ffmpeg>4 --- .github/scripts/unittest-linux/install.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index a7ae9bfcf4..2163502b2e 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -86,8 +86,7 @@ pip install . -v --no-build-isolation # 3. Install Test tools printf "* Installing test tools\n" -# On this CI, for whatever reason, we're only able to install ffmpeg 4. -conda install -y "ffmpeg<5" +conda install -y "ffmpeg>4" python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" From 80f5eb7778afd5efc1a2c601583c84ffb5aa2401 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 16:22:24 +0000 Subject: [PATCH 08/25] Install conda deps before pip deps --- .github/scripts/unittest-linux/install.sh | 30 ++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 2163502b2e..6a347577d5 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,20 +74,7 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - - -# 2. Install torchaudio -conda install --quiet -y ninja cmake - -printf "* Installing torchaudio\n" -export BUILD_CPP_TEST=1 -pip install . -v --no-build-isolation -# 3. Install Test tools -printf "* Installing test tools\n" -conda install -y "ffmpeg>4" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then @@ -97,12 +84,27 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then fi ( set -x - conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} libvorbis parameterized 'requests>=2.20' + conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} "ffmpeg>4" libvorbis parameterized 'requests>=2.20' pip install SoundFile coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm # TODO: might be better to fix the single call to `pip install` above pip install pillow scipy "numpy>=1.26" ) + +pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + + +# 2. Install torchaudio +conda install --quiet -y ninja cmake + +printf "* Installing torchaudio\n" +export BUILD_CPP_TEST=1 +pip install . -v --no-build-isolation + +# 3. Install Test tools +printf "* Installing test tools\n" +python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" + # Install fairseq git clone https://github.com/pytorch/fairseq cd fairseq From 7f063a6ce08b442de93471f8891e88e65544e0b3 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 18:11:05 +0000 Subject: [PATCH 09/25] Add scipy hack for load and save --- src/torchaudio/__init__.py | 369 ++++++++++++++++++++----------------- 1 file changed, 203 insertions(+), 166 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 60c8ceb7fe..5910743607 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -2,6 +2,8 @@ from typing import Union, BinaryIO, Optional, Tuple import os import torch +from scipy.io import wavfile +import sys # Initialize extension and backend first from . import _extension # noqa # usort: skip @@ -41,172 +43,207 @@ except ImportError: pass - -def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Load audio data from source using TorchCodec's AudioDecoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is - provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioDecoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and - ``backend`` are ignored and accepted only for backwards compatibility. - - - Args: - uri (path-like object or file-like object): - Source of audio data. The following types are accepted: - - * ``path-like``: File path or URL. - * ``file-like``: Object with ``read(size: int) -> bytes`` method. - - frame_offset (int, optional): - Number of samples to skip before start reading data. - num_frames (int, optional): - Maximum number of samples to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - normalize (bool, optional): - TorchCodec always returns normalized float32 samples. This parameter - is ignored and a warning is issued if set to False. - Default: ``True``. - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Format hint for the decoder. May not be supported by all TorchCodec - decoders. (Default: ``None``) - buffer_size (int, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - backend (str or None, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - Always returns float32 tensors. If ``channels_first=True``, shape is - `[channel, time]`, otherwise `[time, channel]`. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If unsupported parameters are used. - RuntimeError: If TorchCodec fails to decode the audio. - - Note: - - TorchCodec always returns normalized float32 samples, so the ``normalize`` - parameter has no effect. - - The ``buffer_size`` and ``backend`` parameters are ignored. - - Not all audio formats supported by torchaudio backends may be supported - by TorchCodec. - """ - return load_with_torchcodec( - uri, - frame_offset=frame_offset, - num_frames=num_frames, - normalize=normalize, - channels_first=channels_first, - format=format, - buffer_size=buffer_size, - backend=backend - ) - -def save( - uri: Union[str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, -) -> None: - """Save audio data to file using TorchCodec's AudioEncoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. - It is provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioEncoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for - backwards compatibility. - - Args: - uri (path-like object): - Path to save the audio file. The file extension determines the format. - - src (torch.Tensor): - Audio data to save. Must be a 1D or 2D tensor with float32 values - in the range [-1, 1]. If 2D, shape should be [channel, time] when - channels_first=True, or [time, channel] when channels_first=False. - - sample_rate (int): - Sample rate of the audio data. - - channels_first (bool, optional): - Indicates whether the input tensor has channels as the first dimension. - If True, expects [channel, time]. If False, expects [time, channel]. - Default: True. - - format (str or None, optional): - Audio format hint. Not used by TorchCodec (format is determined by - file extension). A warning is issued if provided. - Default: None. - - encoding (str or None, optional): - Audio encoding. Not fully supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - bits_per_sample (int or None, optional): - Bits per sample. Not directly supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - buffer_size (int, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if not default value. Default: 4096. - - backend (str or None, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if provided. Default: None. - - compression (float, int or None, optional): - Compression level or bit rate. Maps to bit_rate parameter in - TorchCodec AudioEncoder. Default: None. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If input parameters are invalid. - RuntimeError: If TorchCodec fails to encode the audio. - - Note: - - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. - - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) - are not used by TorchCodec but are provided for API compatibility. - - The output format is determined by the file extension in the uri. - - TorchCodec uses FFmpeg under the hood for encoding. - """ - return save_with_torchcodec(uri, src, sample_rate, - channels_first=channels_first, - format=format, - encoding=encoding, - bits_per_sample=bits_per_sample, - buffer_size=buffer_size, - backend=backend, - compression=compression) +# CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack +# allows CI to build with ffmpeg4 and works around load/test bugginess. +if "pytest" in sys.modules: + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + ) -> Tuple[torch.Tensor, int]: + rate, data = wavfile.read(uri) + if data.ndim == 1: + data = data[:,None] + if num_frames == -1: + num_frames = data.shape[0] - frame_offset + data = data[frame_offset:frame_offset + num_frames] + if channels_first: + data = data.T + return data, rate + + def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, + ): + wavfile.write(uri, sample_rate, src.numpy()) +else: + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + ) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. + RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. + """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + + def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, + ) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. + + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. + + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. + + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. + """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From 700c6c9b0a36efc2a8bdeb8c348a84707e67edff Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:17:46 +0000 Subject: [PATCH 10/25] Only import scipy during testing --- .github/scripts/unittest-linux/install.sh | 1 - src/torchaudio/__init__.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 6a347577d5..e4fa67b1e5 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -93,7 +93,6 @@ fi pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - # 2. Install torchaudio conda install --quiet -y ninja cmake diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 5910743607..ca34b996cf 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -2,7 +2,6 @@ from typing import Union, BinaryIO, Optional, Tuple import os import torch -from scipy.io import wavfile import sys # Initialize extension and backend first @@ -46,6 +45,7 @@ # CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack # allows CI to build with ffmpeg4 and works around load/test bugginess. if "pytest" in sys.modules: + from scipy.io import wavfile def load( uri: Union[BinaryIO, str, os.PathLike], frame_offset: int = 0, From 6995b21ebacdb99f9952f6dead2b504284c63496 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:52:30 +0000 Subject: [PATCH 11/25] Revert "Install conda deps before pip deps" This reverts commit 80f5eb7778afd5efc1a2c601583c84ffb5aa2401. --- .github/scripts/unittest-linux/install.sh | 28 +++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index e4fa67b1e5..9f99fd1e98 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,7 +74,19 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" +pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + +# 2. Install torchaudio +conda install --quiet -y ninja cmake +printf "* Installing torchaudio\n" +export BUILD_CPP_TEST=1 +pip install . -v --no-build-isolation + +# 3. Install Test tools +printf "* Installing test tools\n" +conda install -y "ffmpeg>4" +python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then @@ -84,26 +96,12 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then fi ( set -x - conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} "ffmpeg>4" libvorbis parameterized 'requests>=2.20' + conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} libvorbis parameterized 'requests>=2.20' pip install SoundFile coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm # TODO: might be better to fix the single call to `pip install` above pip install pillow scipy "numpy>=1.26" ) - -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - -# 2. Install torchaudio -conda install --quiet -y ninja cmake - -printf "* Installing torchaudio\n" -export BUILD_CPP_TEST=1 -pip install . -v --no-build-isolation - -# 3. Install Test tools -printf "* Installing test tools\n" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" - # Install fairseq git clone https://github.com/pytorch/fairseq cd fairseq From 4ab5993566d2109b53c92b9b494ea27be5a555b9 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:52:35 +0000 Subject: [PATCH 12/25] Revert "Try ffmpeg>4" This reverts commit 74edc0a8dbe942aae3f04924d1743f4da49800cb. --- .github/scripts/unittest-linux/install.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 9f99fd1e98..15bf71e907 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -85,7 +85,8 @@ pip install . -v --no-build-isolation # 3. Install Test tools printf "* Installing test tools\n" -conda install -y "ffmpeg>4" +# On this CI, for whatever reason, we're only able to install ffmpeg 4. +conda install -y "ffmpeg<5" python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" From 43c460285b61eb4bc412005cad6536e3ac513a3b Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:53:21 +0000 Subject: [PATCH 13/25] Revert torchcodec installation changes --- .github/scripts/unittest-linux/install.sh | 1 + .github/workflows/build_docs.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 15bf71e907..a7ae9bfcf4 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -76,6 +76,7 @@ esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + # 2. Install torchaudio conda install --quiet -y ninja cmake diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index f681e3b7ec..e92c556218 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From f74f00423ade5d7c2a1f426193533a0772a7d40e Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 21:00:05 +0000 Subject: [PATCH 14/25] Use existing wav_utils --- src/torchaudio/__init__.py | 24 +++++-------------- .../torchaudio/utils}/wav_utils.py | 0 .../common_utils/__init__.py | 2 +- 3 files changed, 7 insertions(+), 19 deletions(-) rename {test/torchaudio_unittest/common_utils => src/torchaudio/utils}/wav_utils.py (100%) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index ca34b996cf..1ff3a530e4 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -45,28 +45,16 @@ # CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack # allows CI to build with ffmpeg4 and works around load/test bugginess. if "pytest" in sys.modules: - from scipy.io import wavfile + from torchaudio.utils import wav_utils def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, + uri: str, + normalize: bool = True, channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, ) -> Tuple[torch.Tensor, int]: - rate, data = wavfile.read(uri) - if data.ndim == 1: - data = data[:,None] - if num_frames == -1: - num_frames = data.shape[0] - frame_offset - data = data[frame_offset:frame_offset + num_frames] - if channels_first: - data = data.T - return data, rate + return wav_utils.load_wav(uri, normalize, channels_first) def save( - uri: Union[str, os.PathLike], + uri: str, src: torch.Tensor, sample_rate: int, channels_first: bool = True, @@ -77,7 +65,7 @@ def save( backend: Optional[str] = None, compression: Optional[Union[float, int]] = None, ): - wavfile.write(uri, sample_rate, src.numpy()) + wav_utils.save_wav(uri, src, sample_rate, channels_first=channels_first) else: def load( uri: Union[BinaryIO, str, os.PathLike], diff --git a/test/torchaudio_unittest/common_utils/wav_utils.py b/src/torchaudio/utils/wav_utils.py similarity index 100% rename from test/torchaudio_unittest/common_utils/wav_utils.py rename to src/torchaudio/utils/wav_utils.py diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py index 509d5208df..93ac7e0821 100644 --- a/test/torchaudio_unittest/common_utils/__init__.py +++ b/test/torchaudio_unittest/common_utils/__init__.py @@ -26,7 +26,7 @@ from .func_utils import torch_script from .image_utils import get_image, rgb_to_gray, rgb_to_yuv_ccir, save_image from .parameterized_utils import load_params, nested_params -from .wav_utils import get_wav_data, load_wav, normalize_wav, save_wav +from torchaudio.utils.wav_utils import get_wav_data, load_wav, normalize_wav, save_wav import pytest class RequestMixin: From 89ca133522d1d362070f9299b79469c3e10a72eb Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 21:32:05 +0000 Subject: [PATCH 15/25] Remove _backend folder --- src/torchaudio/__init__.py | 20 - src/torchaudio/_backend/__init__.py | 61 --- src/torchaudio/_backend/backend.py | 53 --- src/torchaudio/_backend/common.py | 52 --- src/torchaudio/_backend/ffmpeg.py | 334 -------------- src/torchaudio/_backend/soundfile.py | 54 --- src/torchaudio/_backend/soundfile_backend.py | 457 ------------------- src/torchaudio/_backend/sox.py | 91 ---- src/torchaudio/_backend/utils.py | 350 -------------- src/torchaudio/backend/__init__.py | 8 - src/torchaudio/backend/_no_backend.py | 25 - src/torchaudio/backend/_sox_io_backend.py | 294 ------------ src/torchaudio/backend/common.py | 13 - src/torchaudio/backend/no_backend.py | 14 - src/torchaudio/backend/soundfile_backend.py | 14 - src/torchaudio/backend/sox_io_backend.py | 14 - 16 files changed, 1854 deletions(-) delete mode 100644 src/torchaudio/_backend/__init__.py delete mode 100644 src/torchaudio/_backend/backend.py delete mode 100644 src/torchaudio/_backend/common.py delete mode 100644 src/torchaudio/_backend/ffmpeg.py delete mode 100644 src/torchaudio/_backend/soundfile.py delete mode 100644 src/torchaudio/_backend/soundfile_backend.py delete mode 100644 src/torchaudio/_backend/sox.py delete mode 100644 src/torchaudio/_backend/utils.py delete mode 100644 src/torchaudio/backend/__init__.py delete mode 100644 src/torchaudio/backend/_no_backend.py delete mode 100644 src/torchaudio/backend/_sox_io_backend.py delete mode 100644 src/torchaudio/backend/common.py delete mode 100644 src/torchaudio/backend/no_backend.py delete mode 100644 src/torchaudio/backend/soundfile_backend.py delete mode 100644 src/torchaudio/backend/sox_io_backend.py diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 1ff3a530e4..b226210547 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -6,21 +6,8 @@ # Initialize extension and backend first from . import _extension # noqa # usort: skip -from ._backend import ( # noqa # usort: skip - AudioMetaData as _AudioMetaData, - get_audio_backend as _get_audio_backend, - info as _info, - list_audio_backends as _list_audio_backends, - set_audio_backend as _set_audio_backend, -) from ._torchcodec import load_with_torchcodec, save_with_torchcodec -AudioMetaData = dropping_class_io_support(_AudioMetaData) -get_audio_backend = dropping_io_support(_get_audio_backend) -info = dropping_io_support(_info) -list_audio_backends = dropping_io_support(_list_audio_backends) -set_audio_backend = dropping_io_support(_set_audio_backend) - from . import ( # noqa: F401 compliance, datasets, @@ -34,8 +21,6 @@ utils, ) -# For BC -from . import backend # noqa # usort: skip try: from .version import __version__, git_version # noqa: F401 @@ -234,11 +219,9 @@ def save( compression=compression) __all__ = [ - "AudioMetaData", "load", "load_with_torchcodec", "save_with_torchcodec", - "info", "save", "io", "compliance", @@ -250,7 +233,4 @@ def save( "utils", "sox_effects", "transforms", - "list_audio_backends", - "get_audio_backend", - "set_audio_backend", ] diff --git a/src/torchaudio/_backend/__init__.py b/src/torchaudio/_backend/__init__.py deleted file mode 100644 index 27337013ff..0000000000 --- a/src/torchaudio/_backend/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import List, Optional - -from torchaudio._internal.module_utils import deprecated - -from . import utils -from .common import AudioMetaData - -__all__ = [ - "AudioMetaData", - "load", - "info", - "save", - "list_audio_backends", - "get_audio_backend", - "set_audio_backend", -] - - -info = utils.get_info_func() -load = utils.get_load_func() -save = utils.get_save_func() - - -def list_audio_backends() -> List[str]: - """List available backends - - Returns: - list of str: The list of available backends. - - The possible values are; ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``. - """ - - return list(utils.get_available_backends().keys()) - - -# Temporary until global backend is removed -@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.") -def get_audio_backend() -> Optional[str]: - """Get the name of the current global backend - - Returns: - str or None: - If dispatcher mode is enabled, returns ``None`` otherwise, - the name of current backend or ``None`` (no backend is set). - """ - return None - - -# Temporary until global backend is removed -@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.") -def set_audio_backend(backend: Optional[str]): # noqa - """Set the global backend. - - This is a no-op when dispatcher mode is enabled. - - Args: - backend (str or None): Name of the backend. - One of ``"sox_io"`` or ``"soundfile"`` based on availability - of the system. If ``None`` is provided the current backend is unassigned. - """ - pass diff --git a/src/torchaudio/_backend/backend.py b/src/torchaudio/_backend/backend.py deleted file mode 100644 index 579340962c..0000000000 --- a/src/torchaudio/_backend/backend.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -from abc import ABC, abstractmethod -from typing import BinaryIO, Optional, Tuple, Union - -from torch import Tensor -from torchaudio.io import CodecConfig - -from .common import AudioMetaData - - -class Backend(ABC): - @staticmethod - @abstractmethod - def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: - raise NotImplementedError - - @staticmethod - @abstractmethod - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[Tensor, int]: - raise NotImplementedError - - @staticmethod - @abstractmethod - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[CodecConfig, float, int]] = None, - ) -> None: - raise NotImplementedError - - @staticmethod - @abstractmethod - def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - raise NotImplementedError - - @staticmethod - @abstractmethod - def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - raise NotImplementedError diff --git a/src/torchaudio/_backend/common.py b/src/torchaudio/_backend/common.py deleted file mode 100644 index 804b18d461..0000000000 --- a/src/torchaudio/_backend/common.py +++ /dev/null @@ -1,52 +0,0 @@ -class AudioMetaData: - """AudioMetaData() - - Return type of ``torchaudio.info`` function. - - :ivar int sample_rate: Sample rate - :ivar int num_frames: The number of frames - :ivar int num_channels: The number of channels - :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats, - or when it cannot be accurately inferred. - :ivar str encoding: Audio encoding - The values encoding can take are one of the following: - - * ``PCM_S``: Signed integer linear PCM - * ``PCM_U``: Unsigned integer linear PCM - * ``PCM_F``: Floating point linear PCM - * ``FLAC``: Flac, Free Lossless Audio Codec - * ``ULAW``: Mu-law - * ``ALAW``: A-law - * ``MP3`` : MP3, MPEG-1 Audio Layer III - * ``VORBIS``: OGG Vorbis - * ``AMR_WB``: Adaptive Multi-Rate Wideband - * ``AMR_NB``: Adaptive Multi-Rate Narrowband - * ``OPUS``: Opus - * ``HTK``: Single channel 16-bit PCM - * ``UNKNOWN`` : None of above - """ - - def __init__( - self, - sample_rate: int, - num_frames: int, - num_channels: int, - bits_per_sample: int, - encoding: str, - ): - self.sample_rate = sample_rate - self.num_frames = num_frames - self.num_channels = num_channels - self.bits_per_sample = bits_per_sample - self.encoding = encoding - - def __str__(self): - return ( - f"AudioMetaData(" - f"sample_rate={self.sample_rate}, " - f"num_frames={self.num_frames}, " - f"num_channels={self.num_channels}, " - f"bits_per_sample={self.bits_per_sample}, " - f"encoding={self.encoding}" - f")" - ) diff --git a/src/torchaudio/_backend/ffmpeg.py b/src/torchaudio/_backend/ffmpeg.py deleted file mode 100644 index ca8374ea07..0000000000 --- a/src/torchaudio/_backend/ffmpeg.py +++ /dev/null @@ -1,334 +0,0 @@ -import os -import re -import sys -from typing import BinaryIO, Optional, Tuple, Union - -import torch -import torchaudio - -from .backend import Backend -from .common import AudioMetaData - -InputType = Union[BinaryIO, str, os.PathLike] - - -def info_audio( - src: InputType, - format: Optional[str], - buffer_size: int = 4096, -) -> AudioMetaData: - s = torchaudio.io.StreamReader(src, format, None, buffer_size) - sinfo = s.get_src_stream_info(s.default_audio_stream) - if sinfo.num_frames == 0: - waveform = _load_audio(s) - num_frames = waveform.size(1) - else: - num_frames = sinfo.num_frames - return AudioMetaData( - int(sinfo.sample_rate), - num_frames, - sinfo.num_channels, - sinfo.bits_per_sample, - sinfo.codec.upper(), - ) - - -def _get_load_filter( - frame_offset: int = 0, - num_frames: int = -1, - convert: bool = True, -) -> Optional[str]: - if frame_offset < 0: - raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset)) - if num_frames == 0 or num_frames < -1: - raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. Found: {}".format(num_frames)) - - # All default values -> no filter - if frame_offset == 0 and num_frames == -1 and not convert: - return None - # Only convert - aformat = "aformat=sample_fmts=fltp" - if frame_offset == 0 and num_frames == -1 and convert: - return aformat - # At least one of frame_offset or num_frames has non-default value - if num_frames > 0: - atrim = "atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames) - else: - atrim = "atrim=start_sample={}".format(frame_offset) - if not convert: - return atrim - return "{},{}".format(atrim, aformat) - - -def _load_audio( - s: "torchaudio.io.StreamReader", - filter: Optional[str] = None, - channels_first: bool = True, -) -> torch.Tensor: - s.add_audio_stream(-1, -1, filter_desc=filter) - s.process_all_packets() - chunk = s.pop_chunks()[0] - if chunk is None: - raise RuntimeError("Failed to decode audio.") - waveform = chunk._elem - return waveform.T if channels_first else waveform - - -def load_audio( - src: InputType, - frame_offset: int = 0, - num_frames: int = -1, - convert: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, -) -> Tuple[torch.Tensor, int]: - if hasattr(src, "read") and format == "vorbis": - format = "ogg" - s = torchaudio.io.StreamReader(src, format, None, buffer_size) - sample_rate = int(s.get_src_stream_info(s.default_audio_stream).sample_rate) - filter = _get_load_filter(frame_offset, num_frames, convert) - waveform = _load_audio(s, filter, channels_first) - return waveform, sample_rate - - -def _get_sample_format(dtype: torch.dtype) -> str: - dtype_to_format = { - torch.uint8: "u8", - torch.int16: "s16", - torch.int32: "s32", - torch.int64: "s64", - torch.float32: "flt", - torch.float64: "dbl", - } - format = dtype_to_format.get(dtype) - if format is None: - raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(dtype_to_format.keys())}.") - return format - - -def _native_endianness() -> str: - if sys.byteorder == "little": - return "le" - else: - return "be" - - -def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str: - if bits_per_sample not in {None, 8, 16, 24, 32, 64}: - raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.") - endianness = _native_endianness() - if not encoding: - if not bits_per_sample: - # default to PCM S16 - return f"pcm_s16{endianness}" - if bits_per_sample == 8: - return "pcm_u8" - return f"pcm_s{bits_per_sample}{endianness}" - if encoding == "PCM_S": - if not bits_per_sample: - bits_per_sample = 16 - if bits_per_sample == 8: - raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.") - return f"pcm_s{bits_per_sample}{endianness}" - if encoding == "PCM_U": - if bits_per_sample in (None, 8): - return "pcm_u8" - raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.") - if encoding == "PCM_F": - if not bits_per_sample: - bits_per_sample = 32 - if bits_per_sample in (32, 64): - return f"pcm_f{bits_per_sample}{endianness}" - raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "pcm_mulaw" - raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.") - if encoding == "ALAW": - if bits_per_sample in (None, 8): - return "pcm_alaw" - raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.") - raise ValueError(f"WAV encoding {encoding} is not supported.") - - -def _get_flac_sample_fmt(bps): - if bps is None or bps == 16: - return "s16" - if bps == 24: - return "s32" - raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).") - - -def _parse_save_args( - ext: Optional[str], - format: Optional[str], - encoding: Optional[str], - bps: Optional[int], -): - # torchaudio's save function accepts the followings, which do not 1to1 map - # to FFmpeg. - # - # - format: audio format - # - bits_per_sample: encoder sample format - # - encoding: such as PCM_U8. - # - # In FFmpeg, format is specified with the following three (and more) - # - # - muxer: could be audio format or container format. - # the one we passed to the constructor of StreamWriter - # - encoder: the audio encoder used to encode audio - # - encoder sample format: the format used by encoder to encode audio. - # - # If encoder sample format is different from source sample format, StreamWriter - # will insert a filter automatically. - # - def _type(spec): - # either format is exactly the specified one - # or extension matches to the spec AND there is no format override. - return format == spec or (format is None and ext == spec) - - if _type("wav") or _type("amb"): - # wav is special because it supports different encoding through encoders - # each encoder only supports one encoder format - # - # amb format is a special case originated from libsox. - # It is basically a WAV format, with slight modification. - # https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795 - # It is a format so that decoders will recognize it as ambisonic. - # https://www.ambisonia.com/Members/mleese/file-format-for-b-format/ - # FFmpeg does not recognize amb because it is basically a WAV format. - muxer = "wav" - encoder = _get_encoder_for_wav(encoding, bps) - sample_fmt = None - elif _type("vorbis"): - # FFpmeg does not recognize vorbis extension, while libsox used to do. - # For the sake of bakward compatibility, (and the simplicity), - # we support the case where users want to do save("foo.vorbis") - muxer = "ogg" - encoder = "vorbis" - sample_fmt = None - else: - muxer = format - encoder = None - sample_fmt = None - if _type("flac"): - sample_fmt = _get_flac_sample_fmt(bps) - if _type("ogg"): - sample_fmt = _get_flac_sample_fmt(bps) - return muxer, encoder, sample_fmt - - -def save_audio( - uri: InputType, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[torchaudio.io.CodecConfig] = None, -) -> None: - ext = None - if hasattr(uri, "write"): - if format is None: - raise RuntimeError("'format' is required when saving to file object.") - else: - uri = os.path.normpath(uri) - if tokens := str(uri).split(".")[1:]: - ext = tokens[-1].lower() - - muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample) - - if channels_first: - src = src.T - - s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size) - s.add_audio_stream( - sample_rate, - num_channels=src.size(-1), - format=_get_sample_format(src.dtype), - encoder=encoder, - encoder_format=enc_fmt, - codec_config=compression, - ) - with s.open(): - s.write_audio_chunk(0, src) - - -def _map_encoding(encoding: str) -> str: - for dst in ["PCM_S", "PCM_U", "PCM_F"]: - if dst in encoding: - return dst - if encoding == "PCM_MULAW": - return "ULAW" - elif encoding == "PCM_ALAW": - return "ALAW" - return encoding - - -def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str: - if m := re.search(r"PCM_\w(\d+)\w*", encoding): - return int(m.group(1)) - elif encoding in ["PCM_ALAW", "PCM_MULAW"]: - return 8 - return bits_per_sample - - -class FFmpegBackend(Backend): - @staticmethod - def info(uri: InputType, format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: - metadata = info_audio(uri, format, buffer_size) - metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample) - metadata.encoding = _map_encoding(metadata.encoding) - return metadata - - @staticmethod - def load( - uri: InputType, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[torch.Tensor, int]: - return load_audio(uri, frame_offset, num_frames, normalize, channels_first, format) - - @staticmethod - def save( - uri: InputType, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None, - ) -> None: - if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))): - raise ValueError( - "FFmpeg backend expects non-`None` value for argument `compression` to be of ", - f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}", - ) - save_audio( - uri, - src, - sample_rate, - channels_first, - format, - encoding, - bits_per_sample, - buffer_size, - compression, - ) - - @staticmethod - def can_decode(uri: InputType, format: Optional[str]) -> bool: - return True - - @staticmethod - def can_encode(uri: InputType, format: Optional[str]) -> bool: - return True diff --git a/src/torchaudio/_backend/soundfile.py b/src/torchaudio/_backend/soundfile.py deleted file mode 100644 index f4be1f7099..0000000000 --- a/src/torchaudio/_backend/soundfile.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from typing import BinaryIO, Optional, Tuple, Union - -import torch -from torchaudio.io import CodecConfig - -from . import soundfile_backend -from .backend import Backend -from .common import AudioMetaData - - -class SoundfileBackend(Backend): - @staticmethod - def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: - return soundfile_backend.info(uri, format) - - @staticmethod - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[torch.Tensor, int]: - return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format) - - @staticmethod - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[CodecConfig, float, int]] = None, - ) -> None: - if compression: - raise ValueError("soundfile backend does not support argument `compression`.") - - soundfile_backend.save( - uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample - ) - - @staticmethod - def can_decode(uri, format) -> bool: - return True - - @staticmethod - def can_encode(uri, format) -> bool: - return True diff --git a/src/torchaudio/_backend/soundfile_backend.py b/src/torchaudio/_backend/soundfile_backend.py deleted file mode 100644 index 9e7b0b13cd..0000000000 --- a/src/torchaudio/_backend/soundfile_backend.py +++ /dev/null @@ -1,457 +0,0 @@ -"""The new soundfile backend which will become default in 0.8.0 onward""" -import warnings -from typing import Optional, Tuple - -import torch -from torchaudio._internal import module_utils as _mod_utils - -from .common import AudioMetaData - - -_IS_SOUNDFILE_AVAILABLE = False - -# TODO: import soundfile only when it is used. -if _mod_utils.is_module_available("soundfile"): - try: - import soundfile - - _requires_soundfile = _mod_utils.no_op - _IS_SOUNDFILE_AVAILABLE = True - except Exception: - _requires_soundfile = _mod_utils.fail_with_message( - "requires soundfile, but we failed to import it. Please check the installation of soundfile." - ) -else: - _requires_soundfile = _mod_utils.fail_with_message( - "requires soundfile, but it is not installed. Please install soundfile." - ) - - -# Mapping from soundfile subtype to number of bits per sample. -# This is mostly heuristical and the value is set to 0 when it is irrelevant -# (lossy formats) or when it can't be inferred. -# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard: -# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony, -# the default seems to be 8 bits but it can be compressed further to 4 bits. -# The dict is inspired from -# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94 -_SUBTYPE_TO_BITS_PER_SAMPLE = { - "PCM_S8": 8, # Signed 8 bit data - "PCM_16": 16, # Signed 16 bit data - "PCM_24": 24, # Signed 24 bit data - "PCM_32": 32, # Signed 32 bit data - "PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only) - "FLOAT": 32, # 32 bit float data - "DOUBLE": 64, # 64 bit float data - "ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types - "ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types - "IMA_ADPCM": 0, # IMA ADPCM. - "MS_ADPCM": 0, # Microsoft ADPCM. - "GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate) - "VOX_ADPCM": 0, # OKI / Dialogix ADPCM - "G721_32": 0, # 32kbs G721 ADPCM encoding. - "G723_24": 0, # 24kbs G723 ADPCM encoding. - "G723_40": 0, # 40kbs G723 ADPCM encoding. - "DWVW_12": 12, # 12 bit Delta Width Variable Word encoding. - "DWVW_16": 16, # 16 bit Delta Width Variable Word encoding. - "DWVW_24": 24, # 24 bit Delta Width Variable Word encoding. - "DWVW_N": 0, # N bit Delta Width Variable Word encoding. - "DPCM_8": 8, # 8 bit differential PCM (XI only) - "DPCM_16": 16, # 16 bit differential PCM (XI only) - "VORBIS": 0, # Xiph Vorbis encoding. (lossy) - "ALAC_16": 16, # Apple Lossless Audio Codec (16 bit). - "ALAC_20": 20, # Apple Lossless Audio Codec (20 bit). - "ALAC_24": 24, # Apple Lossless Audio Codec (24 bit). - "ALAC_32": 32, # Apple Lossless Audio Codec (32 bit). -} - - -def _get_bit_depth(subtype): - if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE: - warnings.warn( - f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample " - "attribute will be set to 0. If you are seeing this warning, please " - "report by opening an issue on github (after checking for existing/closed ones). " - "You may otherwise ignore this warning." - ) - return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0) - - -_SUBTYPE_TO_ENCODING = { - "PCM_S8": "PCM_S", - "PCM_16": "PCM_S", - "PCM_24": "PCM_S", - "PCM_32": "PCM_S", - "PCM_U8": "PCM_U", - "FLOAT": "PCM_F", - "DOUBLE": "PCM_F", - "ULAW": "ULAW", - "ALAW": "ALAW", - "VORBIS": "VORBIS", -} - - -def _get_encoding(format: str, subtype: str): - if format == "FLAC": - return "FLAC" - return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN") - - -@_requires_soundfile -def info(filepath: str, format: Optional[str] = None) -> AudioMetaData: - """Get signal information of an audio file. - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - which has a restriction on type annotation due to TorchScript compiler compatiblity. - - Args: - filepath (path-like object or file-like object): - Source of audio data. - format (str or None, optional): - Not used. PySoundFile does not accept format hint. - - Returns: - AudioMetaData: meta data of the given audio. - - """ - sinfo = soundfile.info(filepath) - return AudioMetaData( - sinfo.samplerate, - sinfo.frames, - sinfo.channels, - bits_per_sample=_get_bit_depth(sinfo.subtype), - encoding=_get_encoding(sinfo.format, sinfo.subtype), - ) - - -_SUBTYPE2DTYPE = { - "PCM_S8": "int8", - "PCM_U8": "uint8", - "PCM_16": "int16", - "PCM_32": "int32", - "FLOAT": "float32", - "DOUBLE": "float64", -} - - -@_requires_soundfile -def load( - filepath: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Load audio data from file. - - Note: - The formats this function can handle depend on the soundfile installation. - This function is tested on the following formats; - - * WAV - - * 32-bit floating-point - * 32-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer - - * FLAC - * OGG/VORBIS - * SPHERE - - By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with - ``float32`` dtype, and the shape of `[channel, time]`. - - .. warning:: - - ``normalize`` argument does not perform volume normalization. - It only converts the sample type to `torch.float32` from the native sample - type. - - When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit - signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, - this function can return integer Tensor, where the samples are expressed within the whole range - of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, - ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not - support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. - - ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as - ``flac`` and ``mp3``. - - For these formats, this function always returns ``float32`` Tensor with values. - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - which has a restriction on type annotation due to TorchScript compiler compatiblity. - - Args: - filepath (path-like object or file-like object): - Source of audio data. - frame_offset (int, optional): - Number of frames to skip before start reading data. - num_frames (int, optional): - Maximum number of frames to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - This function may return the less number of frames if there is not enough - frames in the given file. - normalize (bool, optional): - When ``True``, this function converts the native sample type to ``float32``. - Default: ``True``. - - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Not used. PySoundFile does not accept format hint. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - If the input file has integer wav format and normalization is off, then it has - integer type, else ``float32`` type. If ``channels_first=True``, it has - `[channel, time]` else `[time, channel]`. - """ - with soundfile.SoundFile(filepath, "r") as file_: - if file_.format != "WAV" or normalize: - dtype = "float32" - elif file_.subtype not in _SUBTYPE2DTYPE: - raise ValueError(f"Unsupported subtype: {file_.subtype}") - else: - dtype = _SUBTYPE2DTYPE[file_.subtype] - - frames = file_._prepare_read(frame_offset, None, num_frames) - waveform = file_.read(frames, dtype, always_2d=True) - sample_rate = file_.samplerate - - waveform = torch.from_numpy(waveform) - if channels_first: - waveform = waveform.t() - return waveform, sample_rate - - -def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int): - if not encoding: - if not bits_per_sample: - subtype = { - torch.uint8: "PCM_U8", - torch.int16: "PCM_16", - torch.int32: "PCM_32", - torch.float32: "FLOAT", - torch.float64: "DOUBLE", - }.get(dtype) - if not subtype: - raise ValueError(f"Unsupported dtype for wav: {dtype}") - return subtype - if bits_per_sample == 8: - return "PCM_U8" - return f"PCM_{bits_per_sample}" - if encoding == "PCM_S": - if not bits_per_sample: - return "PCM_32" - if bits_per_sample == 8: - raise ValueError("wav does not support 8-bit signed PCM encoding.") - return f"PCM_{bits_per_sample}" - if encoding == "PCM_U": - if bits_per_sample in (None, 8): - return "PCM_U8" - raise ValueError("wav only supports 8-bit unsigned PCM encoding.") - if encoding == "PCM_F": - if bits_per_sample in (None, 32): - return "FLOAT" - if bits_per_sample == 64: - return "DOUBLE" - raise ValueError("wav only supports 32/64-bit float PCM encoding.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "ULAW" - raise ValueError("wav only supports 8-bit mu-law encoding.") - if encoding == "ALAW": - if bits_per_sample in (None, 8): - return "ALAW" - raise ValueError("wav only supports 8-bit a-law encoding.") - raise ValueError(f"wav does not support {encoding}.") - - -def _get_subtype_for_sphere(encoding: str, bits_per_sample: int): - if encoding in (None, "PCM_S"): - return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32" - if encoding in ("PCM_U", "PCM_F"): - raise ValueError(f"sph does not support {encoding} encoding.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "ULAW" - raise ValueError("sph only supports 8-bit for mu-law encoding.") - if encoding == "ALAW": - return "ALAW" - raise ValueError(f"sph does not support {encoding}.") - - -def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int): - if format == "wav": - return _get_subtype_for_wav(dtype, encoding, bits_per_sample) - if format == "flac": - if encoding: - raise ValueError("flac does not support encoding.") - if not bits_per_sample: - return "PCM_16" - if bits_per_sample > 24: - raise ValueError("flac does not support bits_per_sample > 24.") - return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}" - if format in ("ogg", "vorbis"): - if bits_per_sample: - raise ValueError("ogg/vorbis does not support bits_per_sample.") - if encoding is None or encoding == "vorbis": - return "VORBIS" - if encoding == "opus": - return "OPUS" - raise ValueError(f"Unexpected encoding: {encoding}") - if format == "mp3": - return "MPEG_LAYER_III" - if format == "sph": - return _get_subtype_for_sphere(encoding, bits_per_sample) - if format in ("nis", "nist"): - return "PCM_16" - raise ValueError(f"Unsupported format: {format}") - - -@_requires_soundfile -def save( - filepath: str, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - compression: Optional[float] = None, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, -): - """Save audio data to file. - - Note: - The formats this function can handle depend on the soundfile installation. - This function is tested on the following formats; - - * WAV - - * 32-bit floating-point - * 32-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer - - * FLAC - * OGG/VORBIS - * SPHERE - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - which has a restriction on type annotation due to TorchScript compiler compatiblity. - - Args: - filepath (str or pathlib.Path): Path to audio file. - src (torch.Tensor): Audio data to save. must be 2D tensor. - sample_rate (int): sampling rate - channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, - otherwise `[time, channel]`. - compression (float of None, optional): Not used. - It is here only for interface compatibility reson with "sox_io" backend. - format (str or None, optional): Override the audio format. - When ``filepath`` argument is path-like object, audio format is - inferred from file extension. If the file extension is missing or - different, you can specify the correct format with this argument. - - When ``filepath`` argument is file-like object, - this argument is required. - - Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``, - ``"flac"`` and ``"sph"``. - encoding (str or None, optional): Changes the encoding for supported formats. - This argument is effective only for supported formats, sush as - ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are; - - - ``"PCM_S"`` (signed integer Linear PCM) - - ``"PCM_U"`` (unsigned integer Linear PCM) - - ``"PCM_F"`` (floating point PCM) - - ``"ULAW"`` (mu-law) - - ``"ALAW"`` (a-law) - - bits_per_sample (int or None, optional): Changes the bit depth for the - supported formats. - When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``, - you can change the bit depth. - Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. - - Supported formats/encodings/bit depth/compression are: - - ``"wav"`` - - 32-bit floating-point PCM - - 32-bit signed integer PCM - - 24-bit signed integer PCM - - 16-bit signed integer PCM - - 8-bit unsigned integer PCM - - 8-bit mu-law - - 8-bit a-law - - Note: - Default encoding/bit depth is determined by the dtype of - the input Tensor. - - ``"flac"`` - - 8-bit - - 16-bit (default) - - 24-bit - - ``"ogg"``, ``"vorbis"`` - - Doesn't accept changing configuration. - - ``"sph"`` - - 8-bit signed integer PCM - - 16-bit signed integer PCM - - 24-bit signed integer PCM - - 32-bit signed integer PCM (default) - - 8-bit mu-law - - 8-bit a-law - - 16-bit a-law - - 24-bit a-law - - 32-bit a-law - - """ - if src.ndim != 2: - raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.") - if compression is not None: - warnings.warn( - '`save` function of "soundfile" backend does not support "compression" parameter. ' - "The argument is silently ignored." - ) - if hasattr(filepath, "write"): - if format is None: - raise RuntimeError("`format` is required when saving to file object.") - ext = format.lower() - else: - ext = str(filepath).split(".")[-1].lower() - - if bits_per_sample not in (None, 8, 16, 24, 32, 64): - raise ValueError("Invalid bits_per_sample.") - if bits_per_sample == 24: - warnings.warn( - "Saving audio with 24 bits per sample might warp samples near -1. " - "Using 16 bits per sample might be able to avoid this." - ) - subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample) - - # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format, - # so we extend the extensions manually here - if ext in ["nis", "nist", "sph"] and format is None: - format = "NIST" - - if channels_first: - src = src.t() - - soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format) diff --git a/src/torchaudio/_backend/sox.py b/src/torchaudio/_backend/sox.py deleted file mode 100644 index f26ce83ca0..0000000000 --- a/src/torchaudio/_backend/sox.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -from typing import BinaryIO, Optional, Tuple, Union - -import torch -import torchaudio - -from .backend import Backend -from .common import AudioMetaData - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - - -class SoXBackend(Backend): - @staticmethod - def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: - if hasattr(uri, "read"): - raise ValueError( - "SoX backend does not support reading from file-like objects. ", - "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.", - ) - else: - sinfo = sox_ext.get_info(uri, format) - if sinfo: - return AudioMetaData(*sinfo) - else: - raise RuntimeError(f"Failed to fetch metadata for {uri}.") - - @staticmethod - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[torch.Tensor, int]: - if hasattr(uri, "read"): - raise ValueError( - "SoX backend does not support loading from file-like objects. ", - "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.", - ) - else: - ret = sox_ext.load_audio_file(str(uri), frame_offset, num_frames, normalize, channels_first, format) - if not ret: - raise RuntimeError(f"Failed to load audio from {uri}.") - return ret - - @staticmethod - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None, - ) -> None: - if not isinstance(compression, (float, int, type(None))): - raise ValueError( - "SoX backend expects non-`None` value for argument `compression` to be of ", - f"type `float` or `int`, but received value of type {type(compression)}", - ) - if hasattr(uri, "write"): - raise ValueError( - "SoX backend does not support writing to file-like objects. ", - "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.", - ) - else: - sox_ext.save_audio_file( - str(uri), - src, - sample_rate, - channels_first, - compression, - format, - encoding, - bits_per_sample, - ) - - @staticmethod - def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - # i.e. not a file-like object. - return not hasattr(uri, "read") - - @staticmethod - def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - # i.e. not a file-like object. - return not hasattr(uri, "write") diff --git a/src/torchaudio/_backend/utils.py b/src/torchaudio/_backend/utils.py deleted file mode 100644 index eb7c51f0cb..0000000000 --- a/src/torchaudio/_backend/utils.py +++ /dev/null @@ -1,350 +0,0 @@ -import os -from functools import lru_cache -from typing import BinaryIO, Dict, Optional, Tuple, Type, Union -import warnings - -import torch - -from torchaudio._extension import lazy_import_sox_ext -from torchaudio.io import CodecConfig -from torio._extension import lazy_import_ffmpeg_ext - -from . import soundfile_backend - -from .backend import Backend -from .common import AudioMetaData -from .ffmpeg import FFmpegBackend -from .soundfile import SoundfileBackend -from .sox import SoXBackend - - -@lru_cache(None) -def get_available_backends() -> Dict[str, Type[Backend]]: - backend_specs: Dict[str, Type[Backend]] = {} - if lazy_import_ffmpeg_ext().is_available(): - backend_specs["ffmpeg"] = FFmpegBackend - if lazy_import_sox_ext().is_available(): - backend_specs["sox"] = SoXBackend - if soundfile_backend._IS_SOUNDFILE_AVAILABLE: - backend_specs["soundfile"] = SoundfileBackend - return backend_specs - - -def get_backend(backend_name, backends) -> Backend: - if backend := backends.get(backend_name): - return backend - else: - raise ValueError( - f"Unsupported backend '{backend_name}' specified; ", - f"please select one of {list(backends.keys())} instead.", - ) - - -def get_info_func(): - backends = get_available_backends() - - def dispatcher( - uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] - ) -> Backend: - if backend_name is not None: - return get_backend(backend_name, backends) - - for backend in backends.values(): - if backend.can_decode(uri, format): - return backend - raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") - - def info( - uri: Union[BinaryIO, str, os.PathLike], - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - ) -> AudioMetaData: - """Get signal information of an audio file. - - Note: - When the input type is file-like object, this function cannot - get the correct length (``num_samples``) for certain formats, - such as ``vorbis``. - In this case, the value of ``num_samples`` is ``0``. - - Args: - uri (path-like object or file-like object): - Source of audio data. The following types are accepted: - - * ``path-like``: File path or URL. - * ``file-like``: Object with ``read(size: int) -> bytes`` method, - which returns byte string of at most ``size`` length. - - format (str or None, optional): - If not ``None``, interpreted as hint that may allow backend to override the detected format. - (Default: ``None``) - - buffer_size (int, optional): - Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) - - backend (str or None, optional): - I/O backend to use. - If ``None``, function selects backend given input and available backends. - Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], - with the corresponding backend available. - (Default: ``None``) - - .. seealso:: - :ref:`backend` - - Returns: - AudioMetaData - """ - backend = dispatcher(uri, format, backend) - return backend.info(uri, format, buffer_size) - - return info - - -def get_load_func(): - backends = get_available_backends() - - def dispatcher( - uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] - ) -> Backend: - if backend_name is not None: - return get_backend(backend_name, backends) - - for backend in backends.values(): - if backend.can_decode(uri, format): - return backend - raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") - - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - ) -> Tuple[torch.Tensor, int]: - """Load audio data from source. - - .. warning:: - In 2.9, this function's implementation will be changed to use - :func:`~torchaudio.load_with_torchcodec` under the hood. Some - parameters like ``normalize``, ``format``, ``buffer_size``, and - ``backend`` will be ignored. We recommend that you port your code to - rely directly on TorchCodec's decoder instead: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder. - - By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with - ``float32`` dtype, and the shape of `[channel, time]`. - - Note: - The formats this function can handle depend on the availability of backends. - Please use the following functions to fetch the supported formats. - - - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders` - - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats` - - SoundFile: Refer to `the official document `__. - - .. warning:: - - ``normalize`` argument does not perform volume normalization. - It only converts the sample type to `torch.float32` from the native sample - type. - - When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit - signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, - this function can return integer Tensor, where the samples are expressed within the whole range - of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, - ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not - support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. - - ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as - ``flac`` and ``mp3``. - - For these formats, this function always returns ``float32`` Tensor with values. - - - Args: - uri (path-like object or file-like object): - Source of audio data. - frame_offset (int, optional): - Number of frames to skip before start reading data. - num_frames (int, optional): - Maximum number of frames to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - This function may return the less number of frames if there is not enough - frames in the given file. - normalize (bool, optional): - When ``True``, this function converts the native sample type to ``float32``. - Default: ``True``. - - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - - format (str or None, optional): - If not ``None``, interpreted as hint that may allow backend to override the detected format. - (Default: ``None``) - - buffer_size (int, optional): - Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) - - backend (str or None, optional): - I/O backend to use. - If ``None``, function selects backend given input and available backends. - Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], - with the corresponding backend being available. (Default: ``None``) - - .. seealso:: - :ref:`backend` - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - If the input file has integer wav format and normalization is off, then it has - integer type, else ``float32`` type. If ``channels_first=True``, it has - `[channel, time]` else `[time, channel]`. - """ - warnings.warn( - "In 2.9, this function's implementation will be changed to use " - "torchaudio.load_with_torchcodec` under the hood. Some " - "parameters like ``normalize``, ``format``, ``buffer_size``, and " - "``backend`` will be ignored. We recommend that you port your code to " - "rely directly on TorchCodec's decoder instead: " - "https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder." - ) - backend = dispatcher(uri, format, backend) - return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size) - - return load - - -def get_save_func(): - backends = get_available_backends() - - def dispatcher( - uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] - ) -> Backend: - if backend_name is not None: - return get_backend(backend_name, backends) - - for backend in backends.values(): - if backend.can_encode(uri, format): - return backend - raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") - - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[CodecConfig, float, int]] = None, - ): - """Save audio data to file. - - .. warning:: - In 2.9, this function's implementation will be changed to use - :func:`~torchaudio.save_with_torchcodec` under the hood. Some - parameters like format, encoding, bits_per_sample, buffer_size, and - ``backend`` will be ignored. We recommend that you port your code to - rely directly on TorchCodec's decoder instead: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder - - Note: - The formats this function can handle depend on the availability of backends. - Please use the following functions to fetch the supported formats. - - - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders` - - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats` - - SoundFile: Refer to `the official document `__. - - Args: - uri (str or pathlib.Path): Path to audio file. - src (torch.Tensor): Audio data to save. must be 2D tensor. - sample_rate (int): sampling rate - channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, - otherwise `[time, channel]`. - format (str or None, optional): Override the audio format. - When ``uri`` argument is path-like object, audio format is - inferred from file extension. If the file extension is missing or - different, you can specify the correct format with this argument. - - When ``uri`` argument is file-like object, - this argument is required. - - Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``. - encoding (str or None, optional): Changes the encoding for supported formats. - This argument is effective only for supported formats, i.e. - ``"wav"`` and ``""flac"```. Valid values are - - - ``"PCM_S"`` (signed integer Linear PCM) - - ``"PCM_U"`` (unsigned integer Linear PCM) - - ``"PCM_F"`` (floating point PCM) - - ``"ULAW"`` (mu-law) - - ``"ALAW"`` (a-law) - - bits_per_sample (int or None, optional): Changes the bit depth for the - supported formats. - When ``format`` is one of ``"wav"`` and ``"flac"``, - you can change the bit depth. - Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. - - buffer_size (int, optional): - Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) - - backend (str or None, optional): - I/O backend to use. - If ``None``, function selects backend given input and available backends. - Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], - with the corresponding backend being available. - (Default: ``None``) - - .. seealso:: - :ref:`backend` - - compression (CodecConfig, float, int, or None, optional): - Compression configuration to apply. - - If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided. - - Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the - ``sox`` command line interface must be provided. For instance: - - ``"mp3"`` - Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or - VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``. - - ``"flac"`` - Whole number from ``0`` to ``8``. ``8`` is default and highest compression. - - ``"ogg"``, ``"vorbis"`` - Number from ``-1`` to ``10``; ``-1`` is the highest compression - and lowest quality. Default: ``3``. - - Refer to http://sox.sourceforge.net/soxformat.html for more details. - - """ - warnings.warn( - "In 2.9, this function's implementation will be changed to use " - "torchaudio.save_with_torchcodec` under the hood. Some " - "parameters like format, encoding, bits_per_sample, buffer_size, and " - "``backend`` will be ignored. We recommend that you port your code to " - "rely directly on TorchCodec's encoder instead: " - "https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder" - ) - backend = dispatcher(uri, format, backend) - return backend.save( - uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression - ) - - return save diff --git a/src/torchaudio/backend/__init__.py b/src/torchaudio/backend/__init__.py deleted file mode 100644 index 84df7e7d69..0000000000 --- a/src/torchaudio/backend/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# NOTE: -# The entire `torchaudio.backend` module is deprecated. -# New things should be added to `torchaudio._backend`. -# Only things related to backward compatibility should be placed here. - -from . import common, no_backend, soundfile_backend, sox_io_backend # noqa - -__all__ = [] diff --git a/src/torchaudio/backend/_no_backend.py b/src/torchaudio/backend/_no_backend.py deleted file mode 100644 index fcbb2ad84a..0000000000 --- a/src/torchaudio/backend/_no_backend.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from typing import Callable, Optional, Tuple, Union - -from torch import Tensor -from torchaudio import AudioMetaData - - -def load( - filepath: Union[str, Path], - out: Optional[Tensor] = None, - normalization: Union[bool, float, Callable] = True, - channels_first: bool = True, - num_frames: int = 0, - offset: int = 0, - filetype: Optional[str] = None, -) -> Tuple[Tensor, int]: - raise RuntimeError("No audio I/O backend is available.") - - -def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None: - raise RuntimeError("No audio I/O backend is available.") - - -def info(filepath: str) -> AudioMetaData: - raise RuntimeError("No audio I/O backend is available.") diff --git a/src/torchaudio/backend/_sox_io_backend.py b/src/torchaudio/backend/_sox_io_backend.py deleted file mode 100644 index 6af267b17a..0000000000 --- a/src/torchaudio/backend/_sox_io_backend.py +++ /dev/null @@ -1,294 +0,0 @@ -import os -from typing import Optional, Tuple - -import torch -import torchaudio -from torchaudio import AudioMetaData - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - - -def info( - filepath: str, - format: Optional[str] = None, -) -> AudioMetaData: - """Get signal information of an audio file. - - Args: - filepath (str): - Source of audio data. - - format (str or None, optional): - Override the format detection with the given format. - Providing the argument might help when libsox can not infer the format - from header or extension. - - Returns: - AudioMetaData: Metadata of the given audio. - """ - if not torch.jit.is_scripting(): - if hasattr(filepath, "read"): - raise RuntimeError("sox_io backend does not support file-like object.") - filepath = os.fspath(filepath) - sinfo = sox_ext.get_info(filepath, format) - return AudioMetaData(*sinfo) - - -def load( - filepath: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Load audio data from file. - - Note: - This function can handle all the codecs that underlying libsox can handle, - however it is tested on the following formats; - - * WAV, AMB - - * 32-bit floating-point - * 32-bit signed integer - * 24-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer (WAV only) - - * MP3 - * FLAC - * OGG/VORBIS - * OPUS - * SPHERE - * AMR-NB - - To load ``MP3``, ``FLAC``, ``OGG/VORBIS``, ``OPUS`` and other codecs ``libsox`` does not - handle natively, your installation of ``torchaudio`` has to be linked to ``libsox`` - and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc. - - By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with - ``float32`` dtype, and the shape of `[channel, time]`. - - .. warning:: - - ``normalize`` argument does not perform volume normalization. - It only converts the sample type to `torch.float32` from the native sample - type. - - When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit - signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, - this function can return integer Tensor, where the samples are expressed within the whole range - of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, - ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not - support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. - - ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as - ``flac`` and ``mp3``. - - For these formats, this function always returns ``float32`` Tensor with values. - - Args: - filepath (path-like object): Source of audio data. - frame_offset (int): - Number of frames to skip before start reading data. - num_frames (int, optional): - Maximum number of frames to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - This function may return the less number of frames if there is not enough - frames in the given file. - normalize (bool, optional): - When ``True``, this function converts the native sample type to ``float32``. - Default: ``True``. - - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Override the format detection with the given format. - Providing the argument might help when libsox can not infer the format - from header or extension. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - If the input file has integer wav format and ``normalize=False``, then it has - integer type, else ``float32`` type. If ``channels_first=True``, it has - `[channel, time]` else `[time, channel]`. - """ - if not torch.jit.is_scripting(): - if hasattr(filepath, "read"): - raise RuntimeError("sox_io backend does not support file-like object.") - filepath = os.fspath(filepath) - return sox_ext.load_audio_file(filepath, frame_offset, num_frames, normalize, channels_first, format) - - -def save( - filepath: str, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - compression: Optional[float] = None, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, -): - """Save audio data to file. - - Args: - filepath (path-like object): Path to save file. - src (torch.Tensor): Audio data to save. must be 2D tensor. - sample_rate (int): sampling rate - channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, - otherwise `[time, channel]`. - compression (float or None, optional): Used for formats other than WAV. - This corresponds to ``-C`` option of ``sox`` command. - - ``"mp3"`` - Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or - VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``. - - ``"flac"`` - Whole number from ``0`` to ``8``. ``8`` is default and highest compression. - - ``"ogg"``, ``"vorbis"`` - Number from ``-1`` to ``10``; ``-1`` is the highest compression - and lowest quality. Default: ``3``. - - See the detail at http://sox.sourceforge.net/soxformat.html. - format (str or None, optional): Override the audio format. - When ``filepath`` argument is path-like object, audio format is infered from - file extension. If file extension is missing or different, you can specify the - correct format with this argument. - - When ``filepath`` argument is file-like object, this argument is required. - - Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``, - ``"amb"``, ``"flac"``, ``"sph"``, ``"gsm"``, and ``"htk"``. - - encoding (str or None, optional): Changes the encoding for the supported formats. - This argument is effective only for supported formats, such as ``"wav"``, ``""amb"`` - and ``"sph"``. Valid values are; - - - ``"PCM_S"`` (signed integer Linear PCM) - - ``"PCM_U"`` (unsigned integer Linear PCM) - - ``"PCM_F"`` (floating point PCM) - - ``"ULAW"`` (mu-law) - - ``"ALAW"`` (a-law) - - Default values - If not provided, the default value is picked based on ``format`` and ``bits_per_sample``. - - ``"wav"``, ``"amb"`` - - | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the - | Tensor is used to determine the default value. - - - ``"PCM_U"`` if dtype is ``uint8`` - - ``"PCM_S"`` if dtype is ``int16`` or ``int32`` - - ``"PCM_F"`` if dtype is ``float32`` - - - ``"PCM_U"`` if ``bits_per_sample=8`` - - ``"PCM_S"`` otherwise - - ``"sph"`` format; - - the default value is ``"PCM_S"`` - - bits_per_sample (int or None, optional): Changes the bit depth for the supported formats. - When ``format`` is one of ``"wav"``, ``"flac"``, ``"sph"``, or ``"amb"``, you can change the - bit depth. Valid values are ``8``, ``16``, ``32`` and ``64``. - - Default Value; - If not provided, the default values are picked based on ``format`` and ``"encoding"``; - - ``"wav"``, ``"amb"``; - - | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the - | Tensor is used. - - - ``8`` if dtype is ``uint8`` - - ``16`` if dtype is ``int16`` - - ``32`` if dtype is ``int32`` or ``float32`` - - - ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"`` - - ``16`` if ``encoding`` is ``"PCM_S"`` - - ``32`` if ``encoding`` is ``"PCM_F"`` - - ``"flac"`` format; - - the default value is ``24`` - - ``"sph"`` format; - - ``16`` if ``encoding`` is ``"PCM_U"``, ``"PCM_S"``, ``"PCM_F"`` or not provided. - - ``8`` if ``encoding`` is ``"ULAW"`` or ``"ALAW"`` - - ``"amb"`` format; - - ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"`` - - ``16`` if ``encoding`` is ``"PCM_S"`` or not provided. - - ``32`` if ``encoding`` is ``"PCM_F"`` - - Supported formats/encodings/bit depth/compression are; - - ``"wav"``, ``"amb"`` - - 32-bit floating-point PCM - - 32-bit signed integer PCM - - 24-bit signed integer PCM - - 16-bit signed integer PCM - - 8-bit unsigned integer PCM - - 8-bit mu-law - - 8-bit a-law - - Note: Default encoding/bit depth is determined by the dtype of the input Tensor. - - ``"mp3"`` - Fixed bit rate (such as 128kHz) and variable bit rate compression. - Default: VBR with high quality. - - ``"flac"`` - - 8-bit - - 16-bit - - 24-bit (default) - - ``"ogg"``, ``"vorbis"`` - - Different quality level. Default: approx. 112kbps - - ``"sph"`` - - 8-bit signed integer PCM - - 16-bit signed integer PCM - - 24-bit signed integer PCM - - 32-bit signed integer PCM (default) - - 8-bit mu-law - - 8-bit a-law - - 16-bit a-law - - 24-bit a-law - - 32-bit a-law - - ``"amr-nb"`` - Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s - - ``"gsm"`` - Lossy Speech Compression, CPU intensive. - - ``"htk"`` - Uses a default single-channel 16-bit PCM format. - - Note: - To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``, - ``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has - to be linked to ``libsox`` and corresponding codec libraries such as ``libmad`` - or ``libmp3lame`` etc. - """ - if not torch.jit.is_scripting(): - if hasattr(filepath, "write"): - raise RuntimeError("sox_io backend does not handle file-like object.") - filepath = os.fspath(filepath) - sox_ext.save_audio_file( - filepath, - src, - sample_rate, - channels_first, - compression, - format, - encoding, - bits_per_sample, - ) diff --git a/src/torchaudio/backend/common.py b/src/torchaudio/backend/common.py deleted file mode 100644 index 3f736bf401..0000000000 --- a/src/torchaudio/backend/common.py +++ /dev/null @@ -1,13 +0,0 @@ -def __getattr__(name: str): - if name == "AudioMetaData": - import warnings - - warnings.warn( - "`torchaudio.backend.common.AudioMetaData` has been moved to " - "`torchaudio.AudioMetaData`. Please update the import path.", - stacklevel=2, - ) - from torchaudio import AudioMetaData - - return AudioMetaData - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/torchaudio/backend/no_backend.py b/src/torchaudio/backend/no_backend.py deleted file mode 100644 index b5aad59a1c..0000000000 --- a/src/torchaudio/backend/no_backend.py +++ /dev/null @@ -1,14 +0,0 @@ -def __getattr__(name: str): - import warnings - - warnings.warn( - "Torchaudio's I/O functions now support per-call backend dispatch. " - "Importing backend implementation directly is no longer guaranteed to work. " - "Please use `backend` keyword with load/save/info function, instead of " - "calling the underlying implementation directly.", - stacklevel=2, - ) - - from . import _no_backend - - return getattr(_no_backend, name) diff --git a/src/torchaudio/backend/soundfile_backend.py b/src/torchaudio/backend/soundfile_backend.py deleted file mode 100644 index ef8612fc6e..0000000000 --- a/src/torchaudio/backend/soundfile_backend.py +++ /dev/null @@ -1,14 +0,0 @@ -def __getattr__(name: str): - import warnings - - warnings.warn( - "Torchaudio's I/O functions now support per-call backend dispatch. " - "Importing backend implementation directly is no longer guaranteed to work. " - "Please use `backend` keyword with load/save/info function, instead of " - "calling the underlying implementation directly.", - stacklevel=2, - ) - - from torchaudio._backend import soundfile_backend - - return getattr(soundfile_backend, name) diff --git a/src/torchaudio/backend/sox_io_backend.py b/src/torchaudio/backend/sox_io_backend.py deleted file mode 100644 index 7e83b8fbf4..0000000000 --- a/src/torchaudio/backend/sox_io_backend.py +++ /dev/null @@ -1,14 +0,0 @@ -def __getattr__(name: str): - import warnings - - warnings.warn( - "Torchaudio's I/O functions now support per-call backend dispatch. " - "Importing backend implementation directly is no longer guaranteed to work. " - "Please use `backend` keyword with load/save/info function, instead of " - "calling the underlying implementation directly.", - stacklevel=2, - ) - - from . import _sox_io_backend - - return getattr(_sox_io_backend, name) From 953fc6579960cb0339c41726e36e511aa31299c7 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 21:55:08 +0000 Subject: [PATCH 16/25] Support frame_offset and num_frames in load hack --- src/torchaudio/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 1ff3a530e4..592a2cbe6a 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -48,10 +48,18 @@ from torchaudio.utils import wav_utils def load( uri: str, + frame_offset: int = 0, + num_frames: int = -1, normalize: bool = True, channels_first: bool = True, ) -> Tuple[torch.Tensor, int]: - return wav_utils.load_wav(uri, normalize, channels_first) + data, sample_rate = wav_utils.load_wav(uri, normalize, channels_first=False) + if num_frames == -1: + num_frames = data.shape[0] - frame_offset + data = data[frame_offset:frame_offset+num_frames] + if channels_first: + data = data.transpose(0, 1) + return data, sample_rate def save( uri: str, From dd3ff90799685c8a98565d959c9204fba1cd5097 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 01:03:46 +0000 Subject: [PATCH 17/25] Use rand instead of randn for test_save_channels_first --- test/torchaudio_unittest/test_load_save_torchcodec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 3edb4c423b..90fcc15689 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -227,9 +227,9 @@ def test_save_channels_first(channels_first): """Test channels_first parameter.""" # Create test data if channels_first: - waveform = torch.randn(2, 16000) # [channel, time] + waveform = torch.rand(2, 16000) # [channel, time] else: - waveform = torch.randn(16000, 2) # [time, channel] + waveform = torch.rand(16000, 2) # [time, channel] sample_rate = 16000 From c94e011ecc5a64f0a550034011157f6cdee34f2d Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 14:38:27 +0000 Subject: [PATCH 18/25] Remove pytest-aware code in src --- src/torchaudio/__init__.py | 364 +++++++++++++++++-------------------- 1 file changed, 166 insertions(+), 198 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 592a2cbe6a..0c321c96d2 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -42,204 +42,172 @@ except ImportError: pass -# CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack -# allows CI to build with ffmpeg4 and works around load/test bugginess. -if "pytest" in sys.modules: - from torchaudio.utils import wav_utils - def load( - uri: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - ) -> Tuple[torch.Tensor, int]: - data, sample_rate = wav_utils.load_wav(uri, normalize, channels_first=False) - if num_frames == -1: - num_frames = data.shape[0] - frame_offset - data = data[frame_offset:frame_offset+num_frames] - if channels_first: - data = data.transpose(0, 1) - return data, sample_rate - - def save( - uri: str, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, - ): - wav_utils.save_wav(uri, src, sample_rate, channels_first=channels_first) -else: - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - ) -> Tuple[torch.Tensor, int]: - """Load audio data from source using TorchCodec's AudioDecoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is - provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioDecoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and - ``backend`` are ignored and accepted only for backwards compatibility. - - - Args: - uri (path-like object or file-like object): - Source of audio data. The following types are accepted: - - * ``path-like``: File path or URL. - * ``file-like``: Object with ``read(size: int) -> bytes`` method. - - frame_offset (int, optional): - Number of samples to skip before start reading data. - num_frames (int, optional): - Maximum number of samples to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - normalize (bool, optional): - TorchCodec always returns normalized float32 samples. This parameter - is ignored and a warning is issued if set to False. - Default: ``True``. - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Format hint for the decoder. May not be supported by all TorchCodec - decoders. (Default: ``None``) - buffer_size (int, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - backend (str or None, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - Always returns float32 tensors. If ``channels_first=True``, shape is - `[channel, time]`, otherwise `[time, channel]`. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If unsupported parameters are used. - RuntimeError: If TorchCodec fails to decode the audio. - - Note: - - TorchCodec always returns normalized float32 samples, so the ``normalize`` - parameter has no effect. - - The ``buffer_size`` and ``backend`` parameters are ignored. - - Not all audio formats supported by torchaudio backends may be supported - by TorchCodec. - """ - return load_with_torchcodec( - uri, - frame_offset=frame_offset, - num_frames=num_frames, - normalize=normalize, - channels_first=channels_first, - format=format, - buffer_size=buffer_size, - backend=backend - ) - - def save( - uri: Union[str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, - ) -> None: - """Save audio data to file using TorchCodec's AudioEncoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. - It is provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioEncoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for - backwards compatibility. - - Args: - uri (path-like object): - Path to save the audio file. The file extension determines the format. - - src (torch.Tensor): - Audio data to save. Must be a 1D or 2D tensor with float32 values - in the range [-1, 1]. If 2D, shape should be [channel, time] when - channels_first=True, or [time, channel] when channels_first=False. - - sample_rate (int): - Sample rate of the audio data. - - channels_first (bool, optional): - Indicates whether the input tensor has channels as the first dimension. - If True, expects [channel, time]. If False, expects [time, channel]. - Default: True. - - format (str or None, optional): - Audio format hint. Not used by TorchCodec (format is determined by - file extension). A warning is issued if provided. - Default: None. - - encoding (str or None, optional): - Audio encoding. Not fully supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - bits_per_sample (int or None, optional): - Bits per sample. Not directly supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - buffer_size (int, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if not default value. Default: 4096. - - backend (str or None, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if provided. Default: None. - - compression (float, int or None, optional): - Compression level or bit rate. Maps to bit_rate parameter in - TorchCodec AudioEncoder. Default: None. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If input parameters are invalid. - RuntimeError: If TorchCodec fails to encode the audio. - - Note: - - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. - - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) - are not used by TorchCodec but are provided for API compatibility. - - The output format is determined by the file extension in the uri. - - TorchCodec uses FFmpeg under the hood for encoding. - """ - return save_with_torchcodec(uri, src, sample_rate, - channels_first=channels_first, - format=format, - encoding=encoding, - bits_per_sample=bits_per_sample, - buffer_size=buffer_size, - backend=backend, - compression=compression) + +def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. + RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. + """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + +def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, +) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. + + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. + + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. + + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. + """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From b622d8209299382dbd40d14adaa069cf217c0df4 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 15:08:06 +0000 Subject: [PATCH 19/25] Remove torchcodec version check --- .github/scripts/unittest-linux/install.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index a7ae9bfcf4..c8f47e63ab 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -88,7 +88,6 @@ pip install . -v --no-build-isolation printf "* Installing test tools\n" # On this CI, for whatever reason, we're only able to install ffmpeg 4. conda install -y "ffmpeg<5" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then From 93351a24194727341be4b203f6618c9baadbccc7 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 15:58:18 +0000 Subject: [PATCH 20/25] Fix bugs in torchcodec mock --- test/conftest.py | 4 + .../common_utils/__init__.py | 2 +- .../common_utils/wav_utils.py | 92 +++++++++++++++++++ test/torchcodec/decoders.py | 17 ++-- test/torchcodec/encoders.py | 6 +- 5 files changed, 106 insertions(+), 15 deletions(-) create mode 100644 test/conftest.py create mode 100644 test/torchaudio_unittest/common_utils/wav_utils.py diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000000..35f7ae81ee --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,4 @@ +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).parent.resolve())) diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py index 93ac7e0821..509d5208df 100644 --- a/test/torchaudio_unittest/common_utils/__init__.py +++ b/test/torchaudio_unittest/common_utils/__init__.py @@ -26,7 +26,7 @@ from .func_utils import torch_script from .image_utils import get_image, rgb_to_gray, rgb_to_yuv_ccir, save_image from .parameterized_utils import load_params, nested_params -from torchaudio.utils.wav_utils import get_wav_data, load_wav, normalize_wav, save_wav +from .wav_utils import get_wav_data, load_wav, normalize_wav, save_wav import pytest class RequestMixin: diff --git a/test/torchaudio_unittest/common_utils/wav_utils.py b/test/torchaudio_unittest/common_utils/wav_utils.py new file mode 100644 index 0000000000..db15494dca --- /dev/null +++ b/test/torchaudio_unittest/common_utils/wav_utils.py @@ -0,0 +1,92 @@ +from typing import Optional + +import scipy.io.wavfile +import torch + + +def normalize_wav(tensor: torch.Tensor) -> torch.Tensor: + if tensor.dtype == torch.float32: + pass + elif tensor.dtype == torch.int32: + tensor = tensor.to(torch.float32) + tensor[tensor > 0] /= 2147483647.0 + tensor[tensor < 0] /= 2147483648.0 + elif tensor.dtype == torch.int16: + tensor = tensor.to(torch.float32) + tensor[tensor > 0] /= 32767.0 + tensor[tensor < 0] /= 32768.0 + elif tensor.dtype == torch.uint8: + tensor = tensor.to(torch.float32) - 128 + tensor[tensor > 0] /= 127.0 + tensor[tensor < 0] /= 128.0 + return tensor + + +def get_wav_data( + dtype: str, + num_channels: int, + *, + num_frames: Optional[int] = None, + normalize: bool = True, + channels_first: bool = True, +): + """Generate linear signal of the given dtype and num_channels + + Data range is + [-1.0, 1.0] for float32, + [-2147483648, 2147483647] for int32 + [-32768, 32767] for int16 + [0, 255] for uint8 + + num_frames allow to change the linear interpolation parameter. + Default values are 256 for uint8, else 1 << 16. + 1 << 16 as default is so that int16 value range is completely covered. + """ + dtype_ = getattr(torch, dtype) + + if num_frames is None: + if dtype == "uint8": + num_frames = 256 + else: + num_frames = 1 << 16 + + if dtype == "uint8": + base = torch.linspace(0, 255, num_frames, dtype=dtype_) + elif dtype == "int8": + base = torch.linspace(-128, 127, num_frames, dtype=dtype_) + elif dtype == "float32": + base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "float64": + base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "int32": + base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) + elif dtype == "int16": + base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_) + else: + raise NotImplementedError(f"Unsupported dtype {dtype}") + data = base.repeat([num_channels, 1]) + if not channels_first: + data = data.transpose(1, 0) + if normalize: + data = normalize_wav(data) + return data + + +def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor: + """Load wav file without torchaudio""" + sample_rate, data = scipy.io.wavfile.read(path) + data = torch.from_numpy(data.copy()) + if data.ndim == 1: + data = data.unsqueeze(1) + if normalize: + data = normalize_wav(data) + if channels_first: + data = data.transpose(1, 0) + return data, sample_rate + + +def save_wav(path, data, sample_rate, channels_first=True): + """Save wav file without torchaudio""" + if channels_first: + data = data.transpose(1, 0) + scipy.io.wavfile.write(path, sample_rate, data.numpy()) diff --git a/test/torchcodec/decoders.py b/test/torchcodec/decoders.py index 94f2d8c8c1..8b2a7a3071 100644 --- a/test/torchcodec/decoders.py +++ b/test/torchcodec/decoders.py @@ -1,17 +1,12 @@ -import test.torchaudio_unittest.common_utils.wav_utils as wav_utils +import torchaudio_unittest.common_utils.wav_utils as wav_utils +from types import SimpleNamespace class AudioDecoder: def __init__(self, uri): self.uri = uri - - def get_all_samples(self): - return wav_utils.load_wav(self.uri) - - -class AudioEncoder: - def __init__(self, data, sample_rate): + data, sample_rate = wav_utils.load_wav(self.uri) + self.metadata = SimpleNamespace(sample_rate=sample_rate) self.data = data - self.sample_rate = sample_rate - def to_file(self, uri, bit_rate=None): - return wav_utils.save_wav(uri, self.data, self.sample_rate) + def get_all_samples(self): + return SimpleNamespace(data=self.data) diff --git a/test/torchcodec/encoders.py b/test/torchcodec/encoders.py index 5e9cc54968..cef6953824 100644 --- a/test/torchcodec/encoders.py +++ b/test/torchcodec/encoders.py @@ -1,10 +1,10 @@ import torchaudio_unittest.common_utils.wav_utils as wav_utils +from types import SimpleNamespace class AudioEncoder: def __init__(self, data, sample_rate): - print("BEING CALLED") self.data = data - self.sample_rate = sample_rate + self.metadata = SimpleNamespace(sample_rate=sample_rate) def to_file(self, uri, bit_rate=None): - return wav_utils.save_wav(uri, self.data, self.sample_rate) + return wav_utils.save_wav(uri, self.data, self.metadata.sample_rate) From 54071630c957e3eab5dc271f5e9bb5dd25e3d67c Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:01:18 +0000 Subject: [PATCH 21/25] Skip test_load_save_torchcodec --- .../test_load_save_torchcodec.py | 152 +++++++++--------- 1 file changed, 78 insertions(+), 74 deletions(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 90fcc15689..28d316952e 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -12,6 +12,10 @@ from torchaudio import load_with_torchcodec, save_with_torchcodec from torchaudio_unittest.common_utils import get_asset_path +# Now, load/save_torchcodec are the same as torchaudio.load/save, so +# there is no need to test this. +pytest.skip() + def get_ffmpeg_version(): """Get FFmpeg version to check for compatibility issues.""" try: @@ -48,25 +52,25 @@ def test_basic_load(filename): # Skip problematic files on FFmpeg4 due to known compatibility issues if is_ffmpeg4() and filename != "sinewave.wav": pytest.skip("FFmpeg4 has known compatibility issues with some audio files") - + file_path = get_asset_path(*filename.split("/")) - + # Load with torchaudio waveform_ta, sample_rate_ta = torchaudio.load(file_path) - + # Load with torchcodec waveform_tc, sample_rate_tc = load_with_torchcodec(file_path) - + # Check sample rates match assert sample_rate_ta == sample_rate_tc - + # Check shapes match assert waveform_ta.shape == waveform_tc.shape - + # Check data types (should both be float32) assert waveform_ta.dtype == torch.float32 assert waveform_tc.dtype == torch.float32 - + # Check values are close (allowing for small differences in decoders) torch.testing.assert_close(waveform_ta, waveform_tc) @@ -79,17 +83,17 @@ def test_basic_load(filename): def test_frame_offset_and_num_frames(frame_offset, num_frames): """Test frame_offset and num_frames parameters.""" file_path = get_asset_path("sinewave.wav") - + # Load with torchaudio waveform_ta, sample_rate_ta = torchaudio.load( file_path, frame_offset=frame_offset, num_frames=num_frames ) - + # Load with torchcodec waveform_tc, sample_rate_tc = load_with_torchcodec( file_path, frame_offset=frame_offset, num_frames=num_frames ) - + # Check results match assert sample_rate_ta == sample_rate_tc assert waveform_ta.shape == waveform_tc.shape @@ -98,21 +102,21 @@ def test_frame_offset_and_num_frames(frame_offset, num_frames): def test_channels_first(): """Test channels_first parameter.""" file_path = get_asset_path("sinewave.wav") # Use sinewave.wav for compatibility - + # Test channels_first=True (default) waveform_cf_true, sample_rate = load_with_torchcodec(file_path, channels_first=True) - + # Test channels_first=False waveform_cf_false, _ = load_with_torchcodec(file_path, channels_first=False) - + # Check that transpose relationship holds assert waveform_cf_true.shape == waveform_cf_false.transpose(0, 1).shape torch.testing.assert_close(waveform_cf_true, waveform_cf_false.transpose(0, 1)) - + # Compare with torchaudio waveform_ta_true, _ = torchaudio.load(file_path, channels_first=True) waveform_ta_false, _ = torchaudio.load(file_path, channels_first=False) - + assert waveform_cf_true.shape == waveform_ta_true.shape assert waveform_cf_false.shape == waveform_ta_false.shape torch.testing.assert_close(waveform_cf_true, waveform_ta_true) @@ -121,18 +125,18 @@ def test_channels_first(): def test_normalize_parameter_warning(): """Test that normalize=False produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="normalize=False.*ignored"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, normalize=False) - + # Result should still be float32 (normalized) assert waveform.dtype == torch.float32 def test_buffer_size_parameter_warning(): """Test that non-default buffer_size produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="buffer_size.*not used"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, buffer_size=8192) @@ -141,7 +145,7 @@ def test_buffer_size_parameter_warning(): def test_backend_parameter_warning(): """Test that specifying backend produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="backend.*not used"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, backend="ffmpeg") @@ -156,10 +160,10 @@ def test_invalid_file(): def test_format_parameter(): """Test that format parameter produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="format.*not supported"): waveform, sample_rate = load_with_torchcodec(file_path, format="wav") - + # Check basic properties assert waveform.dtype == torch.float32 assert sample_rate > 0 @@ -168,17 +172,17 @@ def test_format_parameter(): def test_multiple_warnings(): """Test that multiple unsupported parameters produce multiple warnings.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns() as warning_list: # This should produce multiple warnings waveform, sample_rate = load_with_torchcodec( - file_path, - normalize=False, - buffer_size=8192, + file_path, + normalize=False, + buffer_size=8192, backend="ffmpeg" ) - - + + # Check that expected warnings are present messages = [str(w.message) for w in warning_list] assert any("normalize=False" in msg for msg in messages) @@ -194,30 +198,30 @@ def test_save_basic_save(filename): # Load a test file first file_path = get_asset_path(*filename.split("/")) waveform, sample_rate = torchaudio.load(file_path) - + with tempfile.TemporaryDirectory() as temp_dir: # Save with torchaudio ta_path = os.path.join(temp_dir, "ta_output.wav") torchaudio.save(ta_path, waveform, sample_rate) - + # Save with torchcodec tc_path = os.path.join(temp_dir, "tc_output.wav") save_with_torchcodec(tc_path, waveform, sample_rate) - + # Load both back and compare waveform_ta, sample_rate_ta = torchaudio.load(ta_path) waveform_tc, sample_rate_tc = torchaudio.load(tc_path) - + # Check sample rates match assert sample_rate_ta == sample_rate_tc - + # Check shapes match assert waveform_ta.shape == waveform_tc.shape - + # Check data types (should both be float32) assert waveform_ta.dtype == torch.float32 assert waveform_tc.dtype == torch.float32 - + # Check values are close (allowing for small differences in encoders) torch.testing.assert_close(waveform_ta, waveform_tc, atol=1e-3, rtol=1e-3) @@ -230,22 +234,22 @@ def test_save_channels_first(channels_first): waveform = torch.rand(2, 16000) # [channel, time] else: waveform = torch.rand(16000, 2) # [time, channel] - + sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: # Save with torchaudio ta_path = os.path.join(temp_dir, "ta_output.wav") torchaudio.save(ta_path, waveform, sample_rate, channels_first=channels_first) - + # Save with torchcodec tc_path = os.path.join(temp_dir, "tc_output.wav") save_with_torchcodec(tc_path, waveform, sample_rate, channels_first=channels_first) - + # Load both back and compare waveform_ta, sample_rate_ta = torchaudio.load(ta_path) waveform_tc, sample_rate_tc = torchaudio.load(tc_path) - + # Check results match assert sample_rate_ta == sample_rate_tc assert waveform_ta.shape == waveform_tc.shape @@ -256,15 +260,15 @@ def test_save_compression_parameter(): """Test compression parameter (maps to bit_rate).""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: # Test with compression (bit_rate) output_path = os.path.join(temp_dir, "output.wav") save_with_torchcodec(output_path, waveform, sample_rate, compression=128000) - + # Should not raise an error and file should exist assert os.path.exists(output_path) - + # Load back and check basic properties waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) assert sample_rate_loaded == sample_rate @@ -275,13 +279,13 @@ def test_save_format_parameter_warning(): """Test that format parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="format.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, format="wav") - + # Should still work despite warning assert os.path.exists(output_path) @@ -290,13 +294,13 @@ def test_save_encoding_parameter_warning(): """Test that encoding parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="encoding.*not fully supported"): save_with_torchcodec(output_path, waveform, sample_rate, encoding="PCM_16") - + # Should still work despite warning assert os.path.exists(output_path) @@ -305,13 +309,13 @@ def test_save_bits_per_sample_parameter_warning(): """Test that bits_per_sample parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="bits_per_sample.*not directly supported"): save_with_torchcodec(output_path, waveform, sample_rate, bits_per_sample=16) - + # Should still work despite warning assert os.path.exists(output_path) @@ -320,13 +324,13 @@ def test_save_buffer_size_parameter_warning(): """Test that non-default buffer_size produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="buffer_size.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, buffer_size=8192) - + # Should still work despite warning assert os.path.exists(output_path) @@ -335,13 +339,13 @@ def test_save_backend_parameter_warning(): """Test that specifying backend produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="backend.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, backend="ffmpeg") - + # Should still work despite warning assert os.path.exists(output_path) @@ -350,16 +354,16 @@ def test_save_edge_cases(): """Test edge cases and error conditions.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + # Test with very small waveform small_waveform = torch.randn(1, 10) save_with_torchcodec(output_path, small_waveform, sample_rate) waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) assert sample_rate_loaded == sample_rate - + # Test with different sample rates for sr in [8000, 22050, 44100]: sr_path = os.path.join(temp_dir, f"output_{sr}.wav") @@ -372,19 +376,19 @@ def test_save_invalid_inputs(): """Test that invalid inputs raise appropriate errors.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + # Test with invalid sample rate with pytest.raises(ValueError, match="sample_rate must be positive"): save_with_torchcodec(output_path, waveform, -1) - + # Test with invalid tensor dimensions with pytest.raises(ValueError, match="Expected 1D or 2D tensor"): invalid_waveform = torch.randn(1, 2, 16000) # 3D tensor save_with_torchcodec(output_path, invalid_waveform, sample_rate) - + # Test with non-tensor input with pytest.raises(ValueError, match="Expected src to be a torch.Tensor"): save_with_torchcodec(output_path, [1, 2, 3], sample_rate) @@ -394,14 +398,14 @@ def test_save_multiple_warnings(): """Test that multiple unsupported parameters produce multiple warnings.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns() as warning_list: save_with_torchcodec( - output_path, - waveform, + output_path, + waveform, sample_rate, format="wav", encoding="PCM_16", @@ -409,7 +413,7 @@ def test_save_multiple_warnings(): buffer_size=8192, backend="ffmpeg" ) - + # Check that expected warnings are present messages = [str(w.message) for w in warning_list] assert any("format" in msg for msg in messages) @@ -417,7 +421,7 @@ def test_save_multiple_warnings(): assert any("bits_per_sample" in msg for msg in messages) assert any("buffer_size" in msg for msg in messages) assert any("backend" in msg for msg in messages) - + # Should still work despite warnings assert os.path.exists(output_path) @@ -426,17 +430,17 @@ def test_save_different_formats(): """Test saving to different audio formats.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: # Test common formats formats = ["wav", "mp3", "flac"] - + for fmt in formats: output_path = os.path.join(temp_dir, f"output.{fmt}") try: save_with_torchcodec(output_path, waveform, sample_rate) assert os.path.exists(output_path) - + # Try to load back (may not work for all formats with all backends) try: waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) @@ -446,4 +450,4 @@ def test_save_different_formats(): pass except Exception as e: # Some formats might not be supported by torchcodec - pytest.skip(f"Format {fmt} not supported: {e}") \ No newline at end of file + pytest.skip(f"Format {fmt} not supported: {e}") From bd7eb5239badb3a4858c5820ff606bf691dcaeff Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:33:48 +0000 Subject: [PATCH 22/25] Correct call to pytest skip --- test/torchaudio_unittest/test_load_save_torchcodec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 28d316952e..4a89123939 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -14,7 +14,7 @@ # Now, load/save_torchcodec are the same as torchaudio.load/save, so # there is no need to test this. -pytest.skip() +pytest.skip(allow_module_level=True) def get_ffmpeg_version(): """Get FFmpeg version to check for compatibility issues.""" From c3d0cc2bca81a9815e0592683347048562d33c16 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:57:21 +0000 Subject: [PATCH 23/25] Remove torchcodec installation --- .github/scripts/unittest-linux/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index c8f47e63ab..68ed032bbb 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,7 +74,7 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" +pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" # 2. Install torchaudio From d10fc1925e38c5f1abec5753c5f11987e338e2e9 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 15 Aug 2025 15:57:04 +0000 Subject: [PATCH 24/25] Add torchcodec to build installation --- .github/workflows/build_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index e92c556218..f681e3b7ec 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From 92fee5133bd585b43f96bcf3985a61806fee6f33 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 15 Aug 2025 16:48:41 +0000 Subject: [PATCH 25/25] Remove redundant wav_utils --- src/torchaudio/utils/wav_utils.py | 92 ------------------------------- 1 file changed, 92 deletions(-) delete mode 100644 src/torchaudio/utils/wav_utils.py diff --git a/src/torchaudio/utils/wav_utils.py b/src/torchaudio/utils/wav_utils.py deleted file mode 100644 index db15494dca..0000000000 --- a/src/torchaudio/utils/wav_utils.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Optional - -import scipy.io.wavfile -import torch - - -def normalize_wav(tensor: torch.Tensor) -> torch.Tensor: - if tensor.dtype == torch.float32: - pass - elif tensor.dtype == torch.int32: - tensor = tensor.to(torch.float32) - tensor[tensor > 0] /= 2147483647.0 - tensor[tensor < 0] /= 2147483648.0 - elif tensor.dtype == torch.int16: - tensor = tensor.to(torch.float32) - tensor[tensor > 0] /= 32767.0 - tensor[tensor < 0] /= 32768.0 - elif tensor.dtype == torch.uint8: - tensor = tensor.to(torch.float32) - 128 - tensor[tensor > 0] /= 127.0 - tensor[tensor < 0] /= 128.0 - return tensor - - -def get_wav_data( - dtype: str, - num_channels: int, - *, - num_frames: Optional[int] = None, - normalize: bool = True, - channels_first: bool = True, -): - """Generate linear signal of the given dtype and num_channels - - Data range is - [-1.0, 1.0] for float32, - [-2147483648, 2147483647] for int32 - [-32768, 32767] for int16 - [0, 255] for uint8 - - num_frames allow to change the linear interpolation parameter. - Default values are 256 for uint8, else 1 << 16. - 1 << 16 as default is so that int16 value range is completely covered. - """ - dtype_ = getattr(torch, dtype) - - if num_frames is None: - if dtype == "uint8": - num_frames = 256 - else: - num_frames = 1 << 16 - - if dtype == "uint8": - base = torch.linspace(0, 255, num_frames, dtype=dtype_) - elif dtype == "int8": - base = torch.linspace(-128, 127, num_frames, dtype=dtype_) - elif dtype == "float32": - base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) - elif dtype == "float64": - base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) - elif dtype == "int32": - base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) - elif dtype == "int16": - base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_) - else: - raise NotImplementedError(f"Unsupported dtype {dtype}") - data = base.repeat([num_channels, 1]) - if not channels_first: - data = data.transpose(1, 0) - if normalize: - data = normalize_wav(data) - return data - - -def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor: - """Load wav file without torchaudio""" - sample_rate, data = scipy.io.wavfile.read(path) - data = torch.from_numpy(data.copy()) - if data.ndim == 1: - data = data.unsqueeze(1) - if normalize: - data = normalize_wav(data) - if channels_first: - data = data.transpose(1, 0) - return data, sample_rate - - -def save_wav(path, data, sample_rate, channels_first=True): - """Save wav file without torchaudio""" - if channels_first: - data = data.transpose(1, 0) - scipy.io.wavfile.write(path, sample_rate, data.numpy())