nextcloud · lukasdotcom · Jun 25, 2025 · Apr 29, 2025 · May 5, 2025 · May 5, 2025
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -11,8 +11,9 @@
 - John Molakvoæ <[email protected]>
 - Juergen Kellerer <[email protected]>
 - Julien Veyssier <[email protected]>
+- Lukas Schaefer <[email protected]>
 - Marcel Klehr <[email protected]>
-- Sami Finnilä <[email protected]>
 - Micke Nordin <[email protected]>
 - rakekniven <[email protected]>
 - Richard Steinmetz <[email protected]>
+- Sami Finnilä <[email protected]>
diff --git a/README.md b/README.md
@@ -63,6 +63,14 @@ Positive:
 Negative:
 * The training data is not freely available, limiting the ability of external parties to check and correct for bias or optimise the model’s performance and CO2 usage.
 
+### Rating for Text-To-Speech via the OpenAI API: 🔴
+
+Negative:
+* The software for training and inferencing of this model is proprietary, limiting running it locally or training by yourself
+* The trained model is not freely available, so the model cannot be run on-premises
+* The training data is not freely available, limiting the ability of external parties to check and correct for bias or optimise the model’s performance and CO2 usage.
+
+
 ### Rating for Text generation via LocalAI: 🟢
 
 Positive:

diff --git a/appinfo/info.xml b/appinfo/info.xml
@@ -65,6 +65,13 @@ Positive:
 Negative:
 * The training data is not freely available, limiting the ability of external parties to check and correct for bias or optimise the model’s performance and CO2 usage.
 
+### Rating for Text-To-Speech via the OpenAI API: 🔴
+
+Negative:
+* The software for training and inferencing of this model is proprietary, limiting running it locally or training by yourself
+* The trained model is not freely available, so the model cannot be run on-premises
+* The training data is not freely available, limiting the ability of external parties to check and correct for bias or optimise the model’s performance and CO2 usage.
+
 ### Rating for Text generation via LocalAI: 🟢
 
 Positive:

diff --git a/composer.lock b/composer.lock
diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
@@ -18,6 +18,7 @@
 use OCA\OpenAi\TaskProcessing\ReformulateProvider;
 use OCA\OpenAi\TaskProcessing\SummaryProvider;
 use OCA\OpenAi\TaskProcessing\TextToImageProvider;
+use OCA\OpenAi\TaskProcessing\TextToSpeechProvider;
 use OCA\OpenAi\TaskProcessing\TextToTextChatProvider;
 use OCA\OpenAi\TaskProcessing\TextToTextProvider;
 use OCA\OpenAi\TaskProcessing\TopicsProvider;
@@ -40,6 +41,12 @@ class Application extends App implements IBootstrap {
 	public const DEFAULT_COMPLETION_MODEL_ID = 'gpt-3.5-turbo';
 	public const DEFAULT_IMAGE_MODEL_ID = 'dall-e-2';
 	public const DEFAULT_TRANSCRIPTION_MODEL_ID = 'whisper-1';
+	public const DEFAULT_SPEECH_MODEL_ID = 'tts-1-hd';
+	public const DEFAULT_SPEECH_VOICE = 'alloy';
+	public const DEFAULT_SPEECH_VOICES = [
+		'alloy', 'ash', 'ballad', 'coral', 'echo', 'fable',
+		'onyx', 'nova', 'sage', 'shimmer', 'verse'
+	];
 	public const DEFAULT_DEFAULT_IMAGE_SIZE = '1024x1024';
 	public const MAX_GENERATION_IDLE_TIME = 60 * 60 * 24 * 10;
 	public const DEFAULT_CHUNK_SIZE = 10000;
@@ -56,11 +63,13 @@ class Application extends App implements IBootstrap {
 	public const QUOTA_TYPE_TEXT = 0;
 	public const QUOTA_TYPE_IMAGE = 1;
 	public const QUOTA_TYPE_TRANSCRIPTION = 2;
+	public const QUOTA_TYPE_SPEECH = 3;
 
 	public const DEFAULT_QUOTAS = [
 		self::QUOTA_TYPE_TEXT => 0, // 0 = unlimited
 		self::QUOTA_TYPE_IMAGE => 0, // 0 = unlimited
 		self::QUOTA_TYPE_TRANSCRIPTION => 0, // 0 = unlimited
+		self::QUOTA_TYPE_SPEECH => 0, // 0 = unlimited
 
 	];
 
@@ -110,6 +119,10 @@ public function register(IRegistrationContext $context): void {
 				$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\ProofreadProvider::class);
 			}
 		}
+		if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToSpeech')) {
+			$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\TextToSpeechTaskType::class);
+		}
+		$context->registerTaskProcessingProvider(TextToSpeechProvider::class);
 		if ($this->appConfig->getValueString(Application::APP_ID, 't2i_provider_enabled', '1') === '1') {
 			$context->registerTaskProcessingProvider(TextToImageProvider::class);
 		}

diff --git a/lib/Service/OpenAiAPIService.php b/lib/Service/OpenAiAPIService.php
@@ -28,6 +28,8 @@
 use OCP\TaskProcessing\ShapeEnumValue;
 use Psr\Log\LoggerInterface;
 use RuntimeException;
+use Throwable;
+use function json_encode;
 
 /**
  * Service to make requests to OpenAI/LocalAI REST API
@@ -132,7 +134,7 @@ public function getModels(string $userId): array {
 			throw $e;
 		}
 		if (isset($modelsResponse['error'])) {
-			$this->logger->warning('Error retrieving models: ' . \json_encode($modelsResponse));
+			$this->logger->warning('Error retrieving models: ' . json_encode($modelsResponse));
 			$this->areCredsValid = false;
 			throw new Exception($modelsResponse['error'], Http::STATUS_INTERNAL_SERVER_ERROR);
 		}
@@ -142,7 +144,7 @@ public function getModels(string $userId): array {
 		}
 
 		if (!$this->isModelListValid($modelsResponse['data'])) {
-			$this->logger->warning('Invalid models response: ' . \json_encode($modelsResponse));
+			$this->logger->warning('Invalid models response: ' . json_encode($modelsResponse));
 			$this->areCredsValid = false;
 			throw new Exception($this->l10n->t('Invalid models response received'), Http::STATUS_INTERNAL_SERVER_ERROR);
 		}
@@ -185,7 +187,7 @@ public function getModelEnumValues(?string $userId): array {
 				array_unshift($modelEnumValues, new ShapeEnumValue($this->l10n->t('Default'), 'Default'));
 			}
 			return $modelEnumValues;
-		} catch (\Throwable $e) {
+		} catch (Throwable $e) {
 			// avoid flooding the logs with errors from calls of task processing
 			$this->logger->info('Error getting model enum values', ['exception' => $e]);
 			return [];
@@ -248,6 +250,8 @@ public function translatedQuotaType(int $type): string {
 				return $this->l10n->t('Image generation');
 			case Application::QUOTA_TYPE_TRANSCRIPTION:
 				return $this->l10n->t('Audio transcription');
+			case Application::QUOTA_TYPE_SPEECH:
+				return $this->l10n->t('Text to speech');
 			default:
 				return $this->l10n->t('Unknown');
 		}
@@ -266,6 +270,8 @@ public function translatedQuotaUnit(int $type): string {
 				return $this->l10n->t('images');
 			case Application::QUOTA_TYPE_TRANSCRIPTION:
 				return $this->l10n->t('seconds');
+			case Application::QUOTA_TYPE_SPEECH:
+				return $this->l10n->t('characters');
 			default:
 				return $this->l10n->t('Unknown');
 		}
@@ -742,6 +748,41 @@ public function getImageRequestOptions(?string $userId): array {
 		return $requestOptions;
 	}
 
+	/**
+	 * Request MP3 speech from the `audio/speech` endpoint; usage is only recorded when audio is returned.
+	 * @param string|null $userId user whose quota is checked and charged ('' is stored when null)
+	 * @param string $prompt text to speak; its character count (mb_strlen) is the recorded usage
+	 * @param string $model model ID, or Application::DEFAULT_MODEL_ID to use DEFAULT_SPEECH_MODEL_ID
+	 * @param string $voice voice name, or Application::DEFAULT_MODEL_ID to use DEFAULT_SPEECH_VOICE
+	 * @param float $speed speed value forwarded unchanged to the API
+	 * @return array result of request() as-is: ['body' => ...] on success, otherwise e.g. ['error' => ...]
+	 * @throws Exception notably when the speech quota is exceeded
+	 */
+	public function requestSpeechCreation(
+		?string $userId, string $prompt, string $model, string $voice, float $speed = 1,
+	): array {
+		if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_SPEECH)) {
+			throw new Exception($this->l10n->t('Speech generation quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
+		}
+
+		$apiResponse = $this->request($userId, 'audio/speech', [
+			'input' => $prompt,
+			'voice' => $voice === Application::DEFAULT_MODEL_ID ? Application::DEFAULT_SPEECH_VOICE : $voice,
+			'model' => $model === Application::DEFAULT_MODEL_ID ? Application::DEFAULT_SPEECH_MODEL_ID : $model,
+			'response_format' => 'mp3',
+			'speed' => $speed,
+		], 'POST');
+		if (!isset($apiResponse['body'])) {
+			return $apiResponse; // error or non-audio payload: nothing was generated, so no usage is recorded
+		}
+		try {
+			$this->quotaUsageMapper->createQuotaUsage($userId ?? '', Application::QUOTA_TYPE_SPEECH, mb_strlen($prompt));
+		} catch (DBException $e) {
+			$this->logger->warning('Could not create quota usage for user: ' . $userId . ' and quota type: ' . Application::QUOTA_TYPE_SPEECH . '. Error: ' . $e->getMessage());
+		}
+		return $apiResponse;
+	}
+
 	/**
 	 * @return int
 	 */
@@ -893,9 +934,16 @@ public function request(?string $userId, string $endPoint, array $params = [], s
 
 			if ($respCode >= 400) {
 				return ['error' => $this->l10n->t('Bad credentials')];
-			} else {
-				return json_decode($body, true) ?: [];
 			}
+			if ($response->getHeader('Content-Type') === 'application/json') {
+				$parsedBody = json_decode($body, true);
+				if ($parsedBody === null) {
+					$this->logger->warning('Could not JSON parse the response', ['body' => $body]);
+					return ['error' => 'Could not JSON parse the response'];
+				}
+				return $parsedBody;
+			}
+			return ['body' => $body];
 		} catch (ClientException|ServerException $e) {
 			$responseBody = $e->getResponse()->getBody();
 			$parsedResponseBody = json_decode($responseBody, true);