From 0ca50ec3b9f5def60b27a8cc2416c5be45b514b3 Mon Sep 17 00:00:00 2001
From: 98440 <984400286@qq.com>
Date: Fri, 17 Jan 2025 02:30:57 +0800
Subject: [PATCH 1/2] support XPU

---
 README.md                  | 15 +++++++++++----
 cosyvoice/cli/cosyvoice.py |  6 ++++++
 cosyvoice/cli/frontend.py  |  3 ++-
 cosyvoice/cli/model.py     |  8 ++++----
 requirements.txt           |  3 +--
 5 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index bd02df76..b113b85c 100644
--- a/README.md
+++ b/README.md
@@ -53,20 +53,27 @@
 - Clone the repo
 ``` sh
-git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
+git clone --recursive https://github.com/DDXDB/CosyVoice-XPU.git
 # If you failed to clone submodule due to network failures, please run following command until success
-cd CosyVoice
+cd CosyVoice-XPU
 git submodule update --init --recursive
 ```

+- Install `Intel® Deep Learning Essentials` or `Intel® oneAPI Base Toolkit`
+- Please see: https://pytorch.org/docs/main/notes/get_start_xpu.html
 - Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
 - Create Conda env:

 ``` sh
-conda create -n cosyvoice -y python=3.10
-conda activate cosyvoice
+conda create -n cosyvoice-XPU -y python=3.10
+conda activate cosyvoice-XPU
 # pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform.
 conda install -y -c conda-forge pynini==2.1.5
+# Start the oneAPI environment and install PyTorch with XPU support
+call C:\Program Files (x86)\Intel\oneAPI\compiler\2025.0\env\vars.bat
+call C:\Program Files (x86)\Intel\oneAPI\ocloc\2025.0\env\vars.bat
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
+
 pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com

 # If you encounter sox compatibility issues
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index 2da3d0a1..5c6aef64 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -44,6 +44,9 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        if torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+            load_jit, load_trt, fp16 = False, False, False
+            logging.warning('no xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
@@ -144,6 +147,9 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        if torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+            load_jit, load_trt, fp16 = False, False, False
+            logging.warning('no xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index ab59a936..41066837 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -45,13 +45,14 @@ def __init__(self,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         option = onnxruntime.SessionOptions()
         option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         option.intra_op_num_threads = 1
         self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
         self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
                                                                      providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
+                                                                                "CUDAExecutionProvider" if torch.xpu.is_available() else
                                                                                 "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 9995c44c..a64208fa 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -30,7 +30,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -56,7 +56,7 @@ def __init__(self,
         # rtf and decoding related
         self.stream_scale_factor = 1
         assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
@@ -275,7 +275,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -296,7 +296,7 @@ def __init__(self,
         self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
         self.stream_scale_factor = 1
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
diff --git a/requirements.txt b/requirements.txt
index f304aed8..365d0542 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,8 +28,7 @@ tensorboard==2.14.0
 tensorrt-cu12==10.0.1; sys_platform == 'linux'
 tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
 tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
-torch==2.3.1
-torchaudio==2.3.1
+
 transformers==4.40.1
 uvicorn==0.30.0
 wget==3.2
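The hunks in PATCH 1/2 all use the same cuda → xpu → cpu fallback for device and stream selection. Below is a minimal standalone sketch of that pattern, not part of the patch itself; it assumes a PyTorch build that exposes the `torch.xpu` backend (e.g. the nightly XPU wheels installed in the README step), and the variable names are illustrative only.

``` python
# Sketch of the cuda -> xpu -> cpu fallback used in PATCH 1/2 (illustrative, not part of the patch).
# Assumes a PyTorch build that exposes torch.xpu, e.g. the nightly XPU wheels.
from contextlib import nullcontext

import torch

device = torch.device('cuda' if torch.cuda.is_available() else
                      'xpu' if torch.xpu.is_available() else 'cpu')

# Mirrors self.llm_context: run LLM decoding in a side stream when an accelerator exists.
if torch.cuda.is_available():
    llm_context = torch.cuda.stream(torch.cuda.Stream(device))
elif torch.xpu.is_available():
    llm_context = torch.xpu.stream(torch.xpu.Stream(device))
else:
    llm_context = nullcontext()

print('device:', device, '| llm_context:', type(llm_context).__name__)
```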
From b724f196efc2fbc525ab4a15c19dcd3e2c5f9895 Mon Sep 17 00:00:00 2001
From: 98440 <984400286@qq.com>
Date: Fri, 14 Feb 2025 22:47:11 +0800
Subject: [PATCH 2/2] support IPEX

---
 cosyvoice/cli/cosyvoice.py | 5 +++++
 cosyvoice/cli/frontend.py  | 5 +++++
 cosyvoice/cli/model.py     | 5 +++++
 requirements.txt           | 2 +-
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index 3c8fde1f..c83d0c74 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -18,6 +18,11 @@
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+except Exception:
+    pass
+
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
 from cosyvoice.utils.file_utils import logging
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index c7b597f8..87e55ec2 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -16,6 +16,11 @@
 import json
 import onnxruntime
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+except Exception:
+    pass
+
 import numpy as np
 import whisper
 from typing import Callable
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 66a272e1..ccd998b5 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -14,6 +14,11 @@
 import os
 from typing import Generator
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+except Exception:
+    pass
+
 import numpy as np
 import threading
 import time
diff --git a/requirements.txt b/requirements.txt
index aeff6ecb..7cdcced5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,4 +35,4 @@ uvicorn==0.30.0
 wget==3.2
 fastapi==0.115.6
 fastapi-cli==0.0.4
-WeTextProcessing==1.0.3
+WeTextProcessing==1.0.3
\ No newline at end of file
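PATCH 2/2 only adds a guarded `import intel_extension_for_pytorch as ipex` ahead of the other imports, so IPEX is used when installed and silently skipped otherwise. Below is a small sketch of that optional-dependency pattern, not part of the patch; it assumes the packages expose a `__version__` attribute.

``` python
# Sketch of the optional-IPEX import added in PATCH 2/2 (illustrative, not part of the patch).
import torch

try:
    import intel_extension_for_pytorch as ipex  # bare import, mirroring the patch; skipped if not installed
except Exception:
    ipex = None

print('torch:', torch.__version__)
print('ipex:', ipex.__version__ if ipex is not None else 'not installed')  # assumes ipex exposes __version__
print('xpu available:', hasattr(torch, 'xpu') and torch.xpu.is_available())
```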