From 0ca50ec3b9f5def60b27a8cc2416c5be45b514b3 Mon Sep 17 00:00:00 2001
From: 98440 <984400286@qq.com>
Date: Fri, 17 Jan 2025 02:30:57 +0800
Subject: [PATCH 1/2] support XPU

---
 README.md                  | 15 +++++++++++----
 cosyvoice/cli/cosyvoice.py |  6 ++++++
 cosyvoice/cli/frontend.py  |  3 ++-
 cosyvoice/cli/model.py     |  8 ++++----
 requirements.txt           |  3 +--
 5 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index bd02df76..b113b85c 100644
--- a/README.md
+++ b/README.md
@@ -53,20 +53,27 @@
 - Clone the repo
 ``` sh
-git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
+git clone --recursive https://github.com/DDXDB/CosyVoice-XPU.git
 # If you failed to clone submodule due to network failures, please run following command until success
-cd CosyVoice
+cd CosyVoice-XPU
 git submodule update --init --recursive
 ```

+- Install `Intel® Deep Learning Essentials` or `Intel® oneAPI Base Toolkit`
+- Please see: https://pytorch.org/docs/main/notes/get_start_xpu.html
 - Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
 - Create Conda env:

 ``` sh
-conda create -n cosyvoice -y python=3.10
-conda activate cosyvoice
+conda create -n cosyvoice-XPU -y python=3.10
+conda activate cosyvoice-XPU
 # pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform.
 conda install -y -c conda-forge pynini==2.1.5
+# Start the oneAPI environment and install PyTorch with XPU support
+call C:\Program Files (x86)\Intel\oneAPI\compiler\2025.0\env\vars.bat
+call C:\Program Files (x86)\Intel\oneAPI\ocloc\2025.0\env\vars.bat
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
+
 pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com

 # If you encounter sox compatibility issues
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index 2da3d0a1..5c6aef64 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -44,6 +44,9 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        if torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+            load_jit, load_trt, fp16 = False, False, False
+            logging.warning('no xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
@@ -144,6 +147,9 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        if torch.xpu.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
+            load_jit, load_trt, fp16 = False, False, False
+            logging.warning('no xpu device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index ab59a936..41066837 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -45,13 +45,14 @@ def __init__(self,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         option = onnxruntime.SessionOptions()
         option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         option.intra_op_num_threads = 1
         self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
         self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
                                                                      providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
+                                                                                "CUDAExecutionProvider" if torch.xpu.is_available() else
                                                                                 "CPUExecutionProvider"])
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 9995c44c..a64208fa 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -30,7 +30,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -56,7 +56,7 @@ def __init__(self,
         # rtf and decoding related
         self.stream_scale_factor = 1
         assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
@@ -275,7 +275,7 @@ def __init__(self,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
                  fp16: bool):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'xpu' if torch.xpu.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -296,7 +296,7 @@ def __init__(self,
         self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
         self.stream_scale_factor = 1
-        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else torch.xpu.stream(torch.xpu.Stream(self.device)) if torch.xpu.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
diff --git a/requirements.txt b/requirements.txt
index f304aed8..365d0542 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,8 +28,7 @@ tensorboard==2.14.0
 tensorrt-cu12==10.0.1; sys_platform == 'linux'
 tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
 tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
-torch==2.3.1
-torchaudio==2.3.1
+
 transformers==4.40.1
 uvicorn==0.30.0
 wget==3.2
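The hunks in PATCH 1/2 all use the same cuda → xpu → cpu fallback for device and stream selection. Below is a minimal standalone sketch of that pattern, not part of the patch itself; it assumes a PyTorch build that exposes the `torch.xpu` backend (e.g. the nightly XPU wheels installed in the README step), and the variable names are illustrative only.

``` python
# Sketch of the cuda -> xpu -> cpu fallback used in PATCH 1/2 (illustrative, not part of the patch).
# Assumes a PyTorch build that exposes torch.xpu, e.g. the nightly XPU wheels.
from contextlib import nullcontext

import torch

device = torch.device('cuda' if torch.cuda.is_available() else
                      'xpu' if torch.xpu.is_available() else 'cpu')

# Mirrors self.llm_context: run LLM decoding in a side stream when an accelerator exists.
if torch.cuda.is_available():
    llm_context = torch.cuda.stream(torch.cuda.Stream(device))
elif torch.xpu.is_available():
    llm_context = torch.xpu.stream(torch.xpu.Stream(device))
else:
    llm_context = nullcontext()

print('device:', device, '| llm_context:', type(llm_context).__name__)
```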
From b724f196efc2fbc525ab4a15c19dcd3e2c5f9895 Mon Sep 17 00:00:00 2001
From: 98440 <984400286@qq.com>
Date: Fri, 14 Feb 2025 22:47:11 +0800
Subject: [PATCH 2/2] support IPEX

---
 cosyvoice/cli/cosyvoice.py | 5 +++++
 cosyvoice/cli/frontend.py  | 5 +++++
 cosyvoice/cli/model.py     | 5 +++++
 requirements.txt           | 2 +-
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index 3c8fde1f..c83d0c74 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -18,6 +18,11 @@
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+except Exception:
+    pass
+
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
 from cosyvoice.utils.file_utils import logging
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index c7b597f8..87e55ec2 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -16,6 +16,11 @@
 import json
 import onnxruntime
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+except Exception:
+    pass
+
 import numpy as np
 import whisper
 from typing import Callable
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 66a272e1..ccd998b5 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -14,6 +14,11 @@
 import os
 from typing import Generator
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+except Exception:
+    pass
+
 import numpy as np
 import threading
 import time
diff --git a/requirements.txt b/requirements.txt
index aeff6ecb..7cdcced5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,4 +35,4 @@ uvicorn==0.30.0
 wget==3.2
 fastapi==0.115.6
 fastapi-cli==0.0.4
-WeTextProcessing==1.0.3
+WeTextProcessing==1.0.3
\ No newline at end of file
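PATCH 2/2 only adds a guarded `import intel_extension_for_pytorch as ipex` ahead of the other imports, so IPEX is used when installed and silently skipped otherwise. Below is a small sketch of that optional-dependency pattern, not part of the patch; it assumes the packages expose a `__version__` attribute.

``` python
# Sketch of the optional-IPEX import added in PATCH 2/2 (illustrative, not part of the patch).
import torch

try:
    import intel_extension_for_pytorch as ipex  # bare import, mirroring the patch; skipped if not installed
except Exception:
    ipex = None

print('torch:', torch.__version__)
print('ipex:', ipex.__version__ if ipex is not None else 'not installed')  # assumes ipex exposes __version__
print('xpu available:', hasattr(torch, 'xpu') and torch.xpu.is_available())
```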