From f178063a9d05ca02302decc57b0246594e17c72d Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Tue, 1 Jul 2025 18:26:52 +0200
Subject: [PATCH 01/11] implement audio chat provider

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 lib/AppInfo/Application.php                   |   4 +
 lib/Service/OpenAiAPIService.php              |  28 +-
 .../AudioToAudioChatProvider.php              | 256 ++++++++++++++++++
 .../AudioToAudioChatTaskType.php              |  82 ++++++
 4 files changed, 367 insertions(+), 3 deletions(-)
 create mode 100644 lib/TaskProcessing/AudioToAudioChatProvider.php
 create mode 100644 lib/TaskProcessing/AudioToAudioChatTaskType.php
diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
index 2097212e..54f23c85 100644
--- a/lib/AppInfo/Application.php
+++ b/lib/AppInfo/Application.php
@@ -9,6 +9,8 @@
 
 use OCA\OpenAi\Capabilities;
 use OCA\OpenAi\OldProcessing\Translation\TranslationProvider as OldTranslationProvider;
+use OCA\OpenAi\TaskProcessing\AudioToAudioChatProvider;
+use OCA\OpenAi\TaskProcessing\AudioToAudioChatTaskType;
 use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
 use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
 use OCA\OpenAi\TaskProcessing\ChangeToneTaskType;
@@ -130,6 +132,8 @@ public function register(IRegistrationContext $context): void {
 		if ($this->appConfig->getValueString(Application::APP_ID, 't2i_provider_enabled', '1') === '1') {
 			$context->registerTaskProcessingProvider(TextToImageProvider::class);
 		}
+		$context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class);
+		$context->registerTaskProcessingProvider(AudioToAudioChatProvider::class);
 
 		$context->registerCapability(Capabilities::class);
 	}
diff --git a/lib/Service/OpenAiAPIService.php b/lib/Service/OpenAiAPIService.php
index 0ebe187f..1c4bede7 100644
--- a/lib/Service/OpenAiAPIService.php
+++ b/lib/Service/OpenAiAPIService.php
@@ -437,7 +437,8 @@ public function createCompletion(
 	 * @param array|null $extraParams
 	 * @param string|null $toolMessage JSON string with role, content, tool_call_id
 	 * @param array|null $tools
-	 * @return array{messages: array<string>, tool_calls: array<string>}
+	 * @param string|null $userAudioPromptBase64
+	 * @return array{messages: array<string>, tool_calls: array<string>, audio_messages: list<array<string, mixed>>}
 	 * @throws Exception
 	 */
 	public function createChatCompletion(
@@ -451,6 +452,7 @@ public function createChatCompletion(
 		?array $extraParams = null,
 		?string $toolMessage = null,
 		?array $tools = null,
+		?string $userAudioPromptBase64 = null,
 	): array {
 		if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TEXT)) {
 			throw new Exception($this->l10n->t('Text generation quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
@@ -494,8 +496,24 @@ public function createChatCompletion(
 				$messages[] = $message;
 			}
 		}
-		if ($userPrompt !== null) {
-			$messages[] = ['role' => 'user', 'content' => $userPrompt];
+		if ($userPrompt !== null || $userAudioPromptBase64 !== null) {
+			$message = ['role' => 'user', 'content' => []];
+			if ($userPrompt !== null) {
+				$message['content'][] = [
+					'type' => 'text',
+					'text' => $userPrompt,
+				];
+			}
+			if ($userAudioPromptBase64 !== null) {
+				$message['content'][] = [
+					'type' => 'input_audio',
+					'input_audio' => [
+						'data' => $userAudioPromptBase64,
+						'format' => 'mp3',
+					],
+				];
+			}
+			$messages[] = $message;
 		}
 		if ($toolMessage !== null) {
 			$msgs = json_decode($toolMessage, true);
@@ -555,6 +573,7 @@ public function createChatCompletion(
 		$completions = [
 			'messages' => [],
 			'tool_calls' => [],
+			'audio_messages' => [],
 		];
 
 		foreach ($response['choices'] as $choice) {
@@ -583,6 +602,9 @@ public function createChatCompletion(
 			if (isset($choice['message']['content']) && is_string($choice['message']['content'])) {
 				$completions['messages'][] = $choice['message']['content'];
 			}
+			if (isset($choice['message']['audio'], $choice['message']['audio']['data']) && is_string($choice['message']['audio']['data'])) {
+				$completions['audio_messages'][] = $choice['message'];
+			}
 		}
 
 		return $completions;
diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
new file mode 100644
index 00000000..7a879b70
--- /dev/null
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -0,0 +1,256 @@
+<?php
+
+declare(strict_types=1);
+
+/**
+ * SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+namespace OCA\OpenAi\TaskProcessing;
+
+use Exception;
+use OCA\OpenAi\AppInfo\Application;
+use OCA\OpenAi\Service\OpenAiAPIService;
+use OCA\OpenAi\Service\OpenAiSettingsService;
+use OCP\Files\File;
+use OCP\IAppConfig;
+use OCP\IL10N;
+use OCP\TaskProcessing\EShapeType;
+use OCP\TaskProcessing\ISynchronousProvider;
+use OCP\TaskProcessing\ShapeDescriptor;
+use OCP\TaskProcessing\ShapeEnumValue;
+use Psr\Log\LoggerInterface;
+use RuntimeException;
+
+class AudioToAudioChatProvider implements ISynchronousProvider {
+
+	public function __construct(
+		private OpenAiAPIService $openAiAPIService,
+		private OpenAiSettingsService $openAiSettingsService,
+		private IL10N $l,
+		private LoggerInterface $logger,
+		private IAppConfig $appConfig,
+		private ?string $userId,
+	) {
+	}
+
+	public function getId(): string {
+		return Application::APP_ID . '-audio2audio:chat';
+	}
+
+	public function getName(): string {
+		return $this->openAiAPIService->getServiceName();
+	}
+
+	public function getTaskTypeId(): string {
+		if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) {
+			return \OCP\TaskProcessing\TaskTypes\AudioToAudioChat::ID;
+		}
+		return AudioToAudioChatTaskType::ID;
+	}
+
+	public function getExpectedRuntime(): int {
+		return $this->openAiAPIService->getExpTextProcessingTime();
+	}
+
+	public function getInputShapeEnumValues(): array {
+		return [];
+	}
+
+	public function getInputShapeDefaults(): array {
+		return [];
+	}
+
+
+	public function getOptionalInputShape(): array {
+		return [
+			'llm_model' => new ShapeDescriptor(
+				$this->l->t('Completion model'),
+				$this->l->t('The model used to generate the completion'),
+				EShapeType::Enum
+			),
+			'voice' => new ShapeDescriptor(
+				$this->l->t('Voice'),
+				$this->l->t('The voice to use'),
+				EShapeType::Enum
+			),
+			'tts_model' => new ShapeDescriptor(
+				$this->l->t('Text-to-speech model'),
+				$this->l->t('The model used to generate the speech'),
+				EShapeType::Enum
+			),
+			'speed' => new ShapeDescriptor(
+				$this->l->t('Speed'),
+				$this->openAiAPIService->isUsingOpenAi()
+					? $this->l->t('Speech speed modifier (Valid values: 0.25-4)')
+					: $this->l->t('Speech speed modifier'),
+				EShapeType::Number
+			)
+		];
+	}
+
+	public function getOptionalInputShapeEnumValues(): array {
+		$voices = json_decode($this->appConfig->getValueString(Application::APP_ID, 'tts_voices')) ?: Application::DEFAULT_SPEECH_VOICES;
+		$models = $this->openAiAPIService->getModelEnumValues($this->userId);
+		return [
+			'voice' => array_map(function ($v) { return new ShapeEnumValue($v, $v); }, $voices),
+			'llm_model' => $models,
+			'tts_model' => $models,
+		];
+	}
+
+	public function getOptionalInputShapeDefaults(): array {
+		$adminVoice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice') ?: Application::DEFAULT_SPEECH_VOICE;
+		$adminTtsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id') ?: Application::DEFAULT_SPEECH_MODEL_ID;
+		$adminLlmModel = $this->openAiAPIService->isUsingOpenAi()
+			? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID)
+			: $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id');
+		return [
+			'voice' => $adminVoice,
+			'tts_model' => $adminTtsModel,
+			'speed' => 1,
+			'llm_model' => $adminLlmModel,
+		];
+	}
+
+	public function getOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	public function getOptionalOutputShape(): array {
+		return [
+			'input_transcript' => new ShapeDescriptor(
+				$this->l->t('Input transcript'),
+				$this->l->t('Input transcription'),
+				EShapeType::Text,
+			),
+			'output_transcript' => new ShapeDescriptor(
+				$this->l->t('Output transcript'),
+				$this->l->t('Response transcription'),
+				EShapeType::Text,
+			),
+		];
+	}
+
+	public function getOptionalOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	public function process(?string $userId, array $input, callable $reportProgress): array {
+		if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) {
+			throw new RuntimeException('Invalid input file');
+		}
+		$inputFile = $input['input'];
+
+		if (!isset($input['system_prompt']) || !is_string($input['system_prompt'])) {
+			throw new RuntimeException('Invalid system_prompt');
+		}
+		$systemPrompt = $input['system_prompt'];
+
+		if (!isset($input['history']) || !is_array($input['history'])) {
+			throw new RuntimeException('Invalid history');
+		}
+		$history = $input['history'];
+
+		if (isset($input['tts_model']) && is_string($input['tts_model'])) {
+			$ttsModel = $input['tts_model'];
+		} else {
+			$ttsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id', Application::DEFAULT_SPEECH_MODEL_ID) ?: Application::DEFAULT_SPEECH_MODEL_ID;
+		}
+
+		if (isset($input['llm_model']) && is_string($input['llm_model'])) {
+			$llmModel = $input['llm_model'];
+		} else {
+			$llmModel = $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID;
+		}
+
+
+		if (isset($input['voice']) && is_string($input['voice'])) {
+			$voice = $input['voice'];
+		} else {
+			$voice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE) ?: Application::DEFAULT_SPEECH_VOICE;
+		}
+
+		$speed = 1;
+		if (isset($input['speed']) && is_numeric($input['speed'])) {
+			$speed = $input['speed'];
+			if ($this->openAiAPIService->isUsingOpenAi()) {
+				if ($speed > 4) {
+					$speed = 4;
+				} elseif ($speed < 0.25) {
+					$speed = 0.25;
+				}
+			}
+		}
+
+		$sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID;
+
+		/////////////// Using the chat API if connected to OpenAI
+		if ($this->openAiAPIService->isUsingOpenAi()) {
+			$b64Audio = base64_encode($inputFile->getContent());
+			$extraParams = [
+				'modalities' => ['text', 'audio'],
+				'audio' => ['voice' => $voice, 'format' => 'mp3'],
+			];
+			$completion = $this->openAiAPIService->createChatCompletion(
+				$userId, 'gpt-4o-audio-preview', null, $systemPrompt, $history, 1, 1000,
+				$extraParams, null, null, $b64Audio,
+			);
+			$message = array_pop($completion['audio_messages']);
+			$result = [
+				'output' => base64_decode($message['audio']['data']),
+				'output_transcript' => $message['audio']['transcript'],
+			];
+
+			// we still want the input transcription
+			try {
+				$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
+				$result['input_transcript'] = $inputTranscription;
+			} catch (Exception $e) {
+				$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+			}
+
+			return $result;
+		}
+
+		//////////////// 3 steps: STT -> LLM -> TTS
+		// speech to text
+		try {
+			$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
+		} catch (Exception $e) {
+			$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+			throw new RuntimeException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage());
+		}
+
+		// free prompt
+		try {
+			$completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000);
+			$completion = $completion['messages'];
+		} catch (Exception $e) {
+			throw new RuntimeException('OpenAI/LocalAI request failed: ' . $e->getMessage());
+		}
+		if (count($completion) === 0) {
+			throw new RuntimeException('No completion in OpenAI/LocalAI response.');
+		}
+		$llmResult = array_pop($completion);
+
+		// text to speech
+		try {
+			$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $voice, $speed);
+
+			if (!isset($apiResponse['body'])) {
+				$this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed: no speech returned');
+				throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed: no speech returned');
+			}
+			return [
+				'output' => $apiResponse['body'],
+				'output_transcript' => $llmResult,
+				'input_transcript' => $inputTranscription,
+			];
+		} catch (\Exception $e) {
+			$this->logger->warning('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage(), ['exception' => $e]);
+			throw new RuntimeException('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage());
+		}
+	}
+}
diff --git a/lib/TaskProcessing/AudioToAudioChatTaskType.php b/lib/TaskProcessing/AudioToAudioChatTaskType.php
new file mode 100644
index 00000000..ba5eaed5
--- /dev/null
+++ b/lib/TaskProcessing/AudioToAudioChatTaskType.php
@@ -0,0 +1,82 @@
+<?php
+
+declare(strict_types=1);
+
+/**
+ * SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+namespace OCA\OpenAi\TaskProcessing;
+
+use OCA\OpenAi\AppInfo\Application;
+use OCP\IL10N;
+use OCP\TaskProcessing\EShapeType;
+use OCP\TaskProcessing\ITaskType;
+use OCP\TaskProcessing\ShapeDescriptor;
+
+class AudioToAudioChatTaskType implements ITaskType {
+	public const ID = Application::APP_ID . ':audio2audio:chat';
+
+	public function __construct(
+		private IL10N $l,
+	) {
+	}
+
+	/**
+	 * @inheritDoc
+	 */
+	public function getName(): string {
+		return $this->l->t('ioa Voice chat');
+	}
+
+	/**
+	 * @inheritDoc
+	 */
+	public function getDescription(): string {
+		return $this->l->t('ioa Voice chat with the assistant');
+	}
+
+	/**
+	 * @return string
+	 */
+	public function getId(): string {
+		return self::ID;
+	}
+
+	/**
+	 * @return ShapeDescriptor[]
+	 */
+	public function getInputShape(): array {
+		return [
+			'system_prompt' => new ShapeDescriptor(
+				$this->l->t('System prompt'),
+				$this->l->t('Define rules and assumptions that the assistant should follow during the conversation.'),
+				EShapeType::Text,
+			),
+			'input' => new ShapeDescriptor(
+				$this->l->t('Chat voice message'),
+				$this->l->t('Describe a task that you want the assistant to do or ask a question'),
+				EShapeType::Audio,
+			),
+			'history' => new ShapeDescriptor(
+				$this->l->t('Chat history'),
+				$this->l->t('The history of chat messages before the current message, starting with a message by the user'),
+				EShapeType::ListOfTexts,
+			),
+		];
+	}
+
+	/**
+	 * @return ShapeDescriptor[]
+	 */
+	public function getOutputShape(): array {
+		return [
+			'output' => new ShapeDescriptor(
+				$this->l->t('Response voice message'),
+				$this->l->t('The generated response as part of the conversation'),
+				EShapeType::Audio
+			),
+		];
+	}
+}

From 6334caed279ebb3e8767c364283ee52198c4eeec Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Thu, 3 Jul 2025 17:56:13 +0200
Subject: [PATCH 02/11] adjust AudioToAudioChatTaskType

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 lib/TaskProcessing/AudioToAudioChatProvider.php | 9 +--------
 lib/TaskProcessing/AudioToAudioChatTaskType.php | 5 +++++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index 7a879b70..5f673bc2 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -12,7 +12,6 @@
 use Exception;
 use OCA\OpenAi\AppInfo\Application;
 use OCA\OpenAi\Service\OpenAiAPIService;
-use OCA\OpenAi\Service\OpenAiSettingsService;
 use OCP\Files\File;
 use OCP\IAppConfig;
 use OCP\IL10N;
@@ -27,7 +26,6 @@ class AudioToAudioChatProvider implements ISynchronousProvider {
 
 	public function __construct(
 		private OpenAiAPIService $openAiAPIService,
-		private OpenAiSettingsService $openAiSettingsService,
 		private IL10N $l,
 		private LoggerInterface $logger,
 		private IAppConfig $appConfig,
@@ -125,11 +123,6 @@ public function getOptionalOutputShape(): array {
 				$this->l->t('Input transcription'),
 				EShapeType::Text,
 			),
-			'output_transcript' => new ShapeDescriptor(
-				$this->l->t('Output transcript'),
-				$this->l->t('Response transcription'),
-				EShapeType::Text,
-			),
 		];
 	}
 
@@ -249,7 +242,7 @@ public function process(?string $userId, array $input, callable $reportProgress)
 				'input_transcript' => $inputTranscription,
 			];
 		} catch (\Exception $e) {
-			$this->logger->warning('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage(), ['exception' => $e]);
+			$this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
 			throw new RuntimeException('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage());
 		}
 	}
diff --git a/lib/TaskProcessing/AudioToAudioChatTaskType.php b/lib/TaskProcessing/AudioToAudioChatTaskType.php
index ba5eaed5..f4d43849 100644
--- a/lib/TaskProcessing/AudioToAudioChatTaskType.php
+++ b/lib/TaskProcessing/AudioToAudioChatTaskType.php
@@ -77,6 +77,11 @@ public function getOutputShape(): array {
 				$this->l->t('The generated response as part of the conversation'),
 				EShapeType::Audio
 			),
+			'output_transcript' => new ShapeDescriptor(
+				$this->l->t('Output transcript'),
+				$this->l->t('Response transcription'),
+				EShapeType::Text,
+			),
 		];
 	}
 }

From e3a6d2d26fb82a592ca7bfaac967468e5386b45a Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Thu, 3 Jul 2025 18:05:24 +0200
Subject: [PATCH 03/11] feat(audio-chat): add condition to register provider
 and task type

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 lib/AppInfo/Application.php | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
index 54f23c85..1a46168c 100644
--- a/lib/AppInfo/Application.php
+++ b/lib/AppInfo/Application.php
@@ -132,8 +132,21 @@ public function register(IRegistrationContext $context): void {
 		if ($this->appConfig->getValueString(Application::APP_ID, 't2i_provider_enabled', '1') === '1') {
 			$context->registerTaskProcessingProvider(TextToImageProvider::class);
 		}
-		$context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class);
-		$context->registerTaskProcessingProvider(AudioToAudioChatProvider::class);
+
+		// only register audio chat stuff if we're using OpenAI or stt+llm+tts are enabled
+		$serviceUrl = $this->appConfig->getValueString(Application::APP_ID, 'url');
+		$isUsingOpenAI = $serviceUrl === '' || $serviceUrl === Application::OPENAI_API_BASE_URL;
+		if (
+			$isUsingOpenAI
+			|| (
+				$this->appConfig->getValueString(Application::APP_ID, 'stt_provider_enabled', '1') === '1'
+				&& $this->appConfig->getValueString(Application::APP_ID, 'llm_provider_enabled', '1') === '1'
+				&& $this->appConfig->getValueString(Application::APP_ID, 'tts_provider_enabled', '1') === '1'
+			)
+		) {
+			$context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class);
+			$context->registerTaskProcessingProvider(AudioToAudioChatProvider::class);
+		}
 
 		$context->registerCapability(Capabilities::class);
 	}

From b87598770ee4224524420cff8c8d80bf39d3232c Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Thu, 3 Jul 2025 18:15:15 +0200
Subject: [PATCH 04/11] review adjustments: clearer labels and error messages,
 rename $voice to $outputVoice

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 .../AudioToAudioChatProvider.php              | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index 5f673bc2..db891512 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -69,8 +69,8 @@ public function getOptionalInputShape(): array {
 				EShapeType::Enum
 			),
 			'voice' => new ShapeDescriptor(
-				$this->l->t('Voice'),
-				$this->l->t('The voice to use'),
+				$this->l->t('Output voice'),
+				$this->l->t('The voice used to generate speech'),
 				EShapeType::Enum
 			),
 			'tts_model' => new ShapeDescriptor(
@@ -120,7 +120,7 @@ public function getOptionalOutputShape(): array {
 		return [
 			'input_transcript' => new ShapeDescriptor(
 				$this->l->t('Input transcript'),
-				$this->l->t('Input transcription'),
+				$this->l->t('Transcription of the input audio'),
 				EShapeType::Text,
 			),
 		];
@@ -132,7 +132,7 @@ public function getOptionalOutputShapeEnumValues(): array {
 
 	public function process(?string $userId, array $input, callable $reportProgress): array {
 		if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) {
-			throw new RuntimeException('Invalid input file');
+			throw new RuntimeException('Invalid input audio file in the "input" field. A readable file is expected.');
 		}
 		$inputFile = $input['input'];
 
@@ -142,7 +142,7 @@ public function process(?string $userId, array $input, callable $reportProgress)
 		$systemPrompt = $input['system_prompt'];
 
 		if (!isset($input['history']) || !is_array($input['history'])) {
-			throw new RuntimeException('Invalid history');
+			throw new RuntimeException('Invalid chat history, array expected');
 		}
 		$history = $input['history'];
 
@@ -160,9 +160,9 @@ public function process(?string $userId, array $input, callable $reportProgress)
 
 
 		if (isset($input['voice']) && is_string($input['voice'])) {
-			$voice = $input['voice'];
+			$outputVoice = $input['voice'];
 		} else {
-			$voice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE) ?: Application::DEFAULT_SPEECH_VOICE;
+			$outputVoice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE) ?: Application::DEFAULT_SPEECH_VOICE;
 		}
 
 		$speed = 1;
@@ -184,7 +184,7 @@ public function process(?string $userId, array $input, callable $reportProgress)
 			$b64Audio = base64_encode($inputFile->getContent());
 			$extraParams = [
 				'modalities' => ['text', 'audio'],
-				'audio' => ['voice' => $voice, 'format' => 'mp3'],
+				'audio' => ['voice' => $outputVoice, 'format' => 'mp3'],
 			];
 			$completion = $this->openAiAPIService->createChatCompletion(
 				$userId, 'gpt-4o-audio-preview', null, $systemPrompt, $history, 1, 1000,
@@ -230,7 +230,7 @@ public function process(?string $userId, array $input, callable $reportProgress)
 
 		// text to speech
 		try {
-			$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $voice, $speed);
+			$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed);
 
 			if (!isset($apiResponse['body'])) {
 				$this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed: no speech returned');
@@ -243,7 +243,7 @@ public function process(?string $userId, array $input, callable $reportProgress)
 			];
 		} catch (\Exception $e) {
 			$this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
-			throw new RuntimeException('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage());
+			throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage());
 		}
 	}
 }

From 2054ea65b75f9f65d09c9ff35ea2a39a0abe0966 Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Thu, 3 Jul 2025 18:28:18 +0200
Subject: [PATCH 05/11] fix tests

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 tests/unit/Providers/OpenAiProviderTest.php | 23 ++++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/tests/unit/Providers/OpenAiProviderTest.php b/tests/unit/Providers/OpenAiProviderTest.php
index ee9923b2..fe0d2bd5 100644
--- a/tests/unit/Providers/OpenAiProviderTest.php
+++ b/tests/unit/Providers/OpenAiProviderTest.php
@@ -140,7 +140,7 @@ public function testFreePromptProvider(): void {
 		$options = ['timeout' => Application::OPENAI_DEFAULT_REQUEST_TIMEOUT, 'headers' => ['User-Agent' => Application::USER_AGENT, 'Authorization' => self::AUTHORIZATION_HEADER, 'Content-Type' => 'application/json']];
 		$options['body'] = json_encode([
 			'model' => Application::DEFAULT_COMPLETION_MODEL_ID,
-			'messages' => [['role' => 'user', 'content' => $prompt]],
+			'messages' => [['role' => 'user', 'content' => [['type' => 'text', 'text' => $prompt]]]],
 			'n' => $n,
 			'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS,
 			'user' => self::TEST_USER1,
@@ -204,7 +204,7 @@ public function testEmojiProvider(): void {
 		$message = 'Give me an emoji for the following text. Output only the emoji without any other characters.' . "\n\n" . $prompt;
 		$options['body'] = json_encode([
 			'model' => Application::DEFAULT_COMPLETION_MODEL_ID,
-			'messages' => [['role' => 'user', 'content' => $message]],
+			'messages' => [['role' => 'user', 'content' => [['type' => 'text', 'text' => $message]]]],
 			'n' => $n,
 			'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS,
 			'user' => self::TEST_USER1,
@@ -269,7 +269,7 @@ public function testHeadlineProvider(): void {
 		$message = 'Give me the headline of the following text in its original language. Do not output the language. Output only the headline without any quotes or additional punctuation.' . "\n\n" . $prompt;
 		$options['body'] = json_encode([
 			'model' => Application::DEFAULT_COMPLETION_MODEL_ID,
-			'messages' => [['role' => 'user', 'content' => $message]],
+			'messages' => [['role' => 'user', 'content' => [['type' => 'text', 'text' => $message]]]],
 			'n' => $n,
 			'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS,
 			'user' => self::TEST_USER1,
@@ -334,7 +334,7 @@ public function testChangeToneProvider(): void {
 		$message = "Reformulate the following text in a $toneInput tone in its original language. Output only the reformulation. Here is the text:" . "\n\n" . $textInput . "\n\n" . 'Do not mention the used language in your reformulation. Here is your reformulation in the same language:';
 		$options['body'] = json_encode([
 			'model' => Application::DEFAULT_COMPLETION_MODEL_ID,
-			'messages' => [['role' => 'user', 'content' => $message]],
+			'messages' => [['role' => 'user', 'content' => [['type' => 'text', 'text' => $message]]]],
 			'n' => $n,
 			'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS,
 			'user' => self::TEST_USER1,
@@ -400,8 +400,10 @@ public function testSummaryProvider(): void {
 			. 'You should only return the summary without any additional information.';
 		$options['body'] = json_encode([
 			'model' => Application::DEFAULT_COMPLETION_MODEL_ID,
-			'messages' => [['role' => 'system', 'content' => $systemPrompt],
-				['role' => 'user', 'content' => $prompt]],
+			'messages' => [
+				['role' => 'system', 'content' => $systemPrompt],
+				['role' => 'user', 'content' => [['type' => 'text', 'text' => $prompt]]],
+			],
 			'n' => $n,
 			'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS,
 			'user' => self::TEST_USER1,
@@ -465,7 +467,10 @@ public function testProofreadProvider(): void {
 		$systemPrompt = 'Proofread the following text. List all spelling and grammar mistakes and how to correct them. Output only the list.';
 		$options['body'] = json_encode([
 			'model' => Application::DEFAULT_COMPLETION_MODEL_ID,
-			'messages' => [['role' => 'system', 'content' => $systemPrompt],['role' => 'user', 'content' => $prompt]],
+			'messages' => [
+				['role' => 'system', 'content' => $systemPrompt],
+				['role' => 'user', 'content' => [['type' => 'text', 'text' => $prompt]]],
+			],
 			'n' => $n,
 			'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS,
 			'user' => self::TEST_USER1,
@@ -533,7 +538,9 @@ public function testTranslationProvider(): void {
 		$options = ['timeout' => Application::OPENAI_DEFAULT_REQUEST_TIMEOUT, 'headers' => ['User-Agent' => Application::USER_AGENT, 'Authorization' => self::AUTHORIZATION_HEADER, 'Content-Type' => 'application/json']];
 		$options['body'] = json_encode([
 			'model' => Application::DEFAULT_COMPLETION_MODEL_ID,
-			'messages' => [['role' => 'user', 'content' => 'Translate from ' . $fromLang . ' to English (US): ' . $inputText]],
+			'messages' => [
+				['role' => 'user', 'content' => [['type' => 'text', 'text' => 'Translate from ' . $fromLang . ' to English (US): ' . $inputText]]],
+			],
 			'n' => $n,
 			'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS,
 			'user' => self::TEST_USER1,

From 456a8a17125585e057acbf4dbdf43d551b07b322 Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Fri, 4 Jul 2025 14:33:37 +0200
Subject: [PATCH 06/11] change defaults and enum values depending on whether
 OpenAI is used, use llmModel input if using chat endpoint

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 .../AudioToAudioChatProvider.php              | 47 ++++++++++++-------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index db891512..8fbdc89f 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -62,7 +62,8 @@ public function getInputShapeDefaults(): array {
 
 
 	public function getOptionalInputShape(): array {
-		return [
+		$isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi();
+		$ois = [
 			'llm_model' => new ShapeDescriptor(
 				$this->l->t('Completion model'),
 				$this->l->t('The model used to generate the completion'),
@@ -73,43 +74,54 @@ public function getOptionalInputShape(): array {
 				$this->l->t('The voice used to generate speech'),
 				EShapeType::Enum
 			),
-			'tts_model' => new ShapeDescriptor(
+		];
+		if (!$isUsingOpenAi) {
+			$ois['tts_model'] = new ShapeDescriptor(
 				$this->l->t('Text-to-speech model'),
 				$this->l->t('The model used to generate the speech'),
 				EShapeType::Enum
-			),
-			'speed' => new ShapeDescriptor(
+			);
+			$ois['speed'] = new ShapeDescriptor(
 				$this->l->t('Speed'),
 				$this->openAiAPIService->isUsingOpenAi()
 					? $this->l->t('Speech speed modifier (Valid values: 0.25-4)')
 					: $this->l->t('Speech speed modifier'),
 				EShapeType::Number
-			)
-		];
+			);
+		}
+		return $ois;
 	}
 
 	public function getOptionalInputShapeEnumValues(): array {
+		$isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi();
 		$voices = json_decode($this->appConfig->getValueString(Application::APP_ID, 'tts_voices')) ?: Application::DEFAULT_SPEECH_VOICES;
 		$models = $this->openAiAPIService->getModelEnumValues($this->userId);
-		return [
+		$enumValues = [
 			'voice' => array_map(function ($v) { return new ShapeEnumValue($v, $v); }, $voices),
 			'llm_model' => $models,
-			'tts_model' => $models,
 		];
+		if (!$isUsingOpenAi) {
+			$enumValues['tts_model'] = $models;
+		}
+		return $enumValues;
 	}
 
 	public function getOptionalInputShapeDefaults(): array {
+		$isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi();
 		$adminVoice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice') ?: Application::DEFAULT_SPEECH_VOICE;
-		$adminTtsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id') ?: Application::DEFAULT_SPEECH_MODEL_ID;
-		$adminLlmModel = $this->openAiAPIService->isUsingOpenAi()
-			? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID)
+		$adminLlmModel = $isUsingOpenAi
+			? 'gpt-4o-audio-preview'
 			: $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id');
-		return [
+		$defaults = [
 			'voice' => $adminVoice,
-			'tts_model' => $adminTtsModel,
-			'speed' => 1,
 			'llm_model' => $adminLlmModel,
 		];
+		if (!$isUsingOpenAi) {
+			$adminTtsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id') ?: Application::DEFAULT_SPEECH_MODEL_ID;
+			$defaults['tts_model'] = $adminTtsModel;
+			$defaults['speed'] = 1;
+		}
+		return $defaults;
 	}
 
 	public function getOutputShapeEnumValues(): array {
@@ -155,7 +167,10 @@ public function process(?string $userId, array $input, callable $reportProgress)
 		if (isset($input['llm_model']) && is_string($input['llm_model'])) {
 			$llmModel = $input['llm_model'];
 		} else {
-			$llmModel = $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID;
+			$isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi();
+			$llmModel = $isUsingOpenAi
+				? 'gpt-4o-audio-preview'
+				: ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID);
 		}
 
 
@@ -187,7 +202,7 @@ public function process(?string $userId, array $input, callable $reportProgress)
 				'audio' => ['voice' => $outputVoice, 'format' => 'mp3'],
 			];
 			$completion = $this->openAiAPIService->createChatCompletion(
-				$userId, 'gpt-4o-audio-preview', null, $systemPrompt, $history, 1, 1000,
+				$userId, $llmModel, null, $systemPrompt, $history, 1, 1000,
 				$extraParams, null, null, $b64Audio,
 			);
 			$message = array_pop($completion['audio_messages']);

From f00717ea9a39daef286028f802866fe0fe2164a7 Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Fri, 4 Jul 2025 14:40:04 +0200
Subject: [PATCH 07/11] use service name in logs instead of hardcoded value,
 fall back to app ID

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 .../AudioToAudioChatProvider.php              | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index 8fbdc89f..88291474 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -194,6 +194,8 @@ public function process(?string $userId, array $input, callable $reportProgress)
 
 		$sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID;
 
+		$serviceName = $this->appConfig->getValueString(Application::APP_ID, 'service_name') ?: Application::APP_ID;
+
 		/////////////// Using the chat API if connected to OpenAI
 		if ($this->openAiAPIService->isUsingOpenAi()) {
 			$b64Audio = base64_encode($inputFile->getContent());
@@ -216,7 +218,7 @@ public function process(?string $userId, array $input, callable $reportProgress)
 				$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
 				$result['input_transcript'] = $inputTranscription;
 			} catch (Exception $e) {
-				$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+				$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
 			}
 
 			return $result;
@@ -227,8 +229,8 @@ public function process(?string $userId, array $input, callable $reportProgress)
 		try {
 			$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
 		} catch (Exception $e) {
-			$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
-			throw new RuntimeException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage());
+			$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+			throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage());
 		}
 
 		// free prompt
@@ -236,10 +238,10 @@ public function process(?string $userId, array $input, callable $reportProgress)
 			$completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000);
 			$completion = $completion['messages'];
 		} catch (Exception $e) {
-			throw new RuntimeException('OpenAI/LocalAI request failed: ' . $e->getMessage());
+			throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage());
 		}
 		if (count($completion) === 0) {
-			throw new RuntimeException('No completion in OpenAI/LocalAI response.');
+			throw new RuntimeException('No completion in ' . $serviceName . ' response.');
 		}
 		$llmResult = array_pop($completion);
 
@@ -248,8 +250,8 @@ public function process(?string $userId, array $input, callable $reportProgress)
 			$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed);
 
 			if (!isset($apiResponse['body'])) {
-				$this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed: no speech returned');
-				throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed: no speech returned');
+				$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
+				throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
 			}
 			return [
 				'output' => $apiResponse['body'],
@@ -257,8 +259,8 @@ public function process(?string $userId, array $input, callable $reportProgress)
 				'input_transcript' => $inputTranscription,
 			];
 		} catch (\Exception $e) {
-			$this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
-			throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage());
+			$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
+			throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage());
 		}
 	}
 }

From 47b9c867b088ab82b7cfe49d1d45071fb0781be8 Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Mon, 7 Jul 2025 11:58:08 +0200
Subject: [PATCH 08/11] remove fallback audio chat task type, register the
 provider only if the server task type is available, handle the case when the
 chat endpoint does not return audio

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 lib/AppInfo/Application.php                   |  6 +-
 .../AudioToAudioChatProvider.php              | 42 ++++++---
 .../AudioToAudioChatTaskType.php              | 87 -------------------
 3 files changed, 31 insertions(+), 104 deletions(-)
 delete mode 100644 lib/TaskProcessing/AudioToAudioChatTaskType.php

diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
index 1a46168c..7ddd8eec 100644
--- a/lib/AppInfo/Application.php
+++ b/lib/AppInfo/Application.php
@@ -10,7 +10,6 @@
 use OCA\OpenAi\Capabilities;
 use OCA\OpenAi\OldProcessing\Translation\TranslationProvider as OldTranslationProvider;
 use OCA\OpenAi\TaskProcessing\AudioToAudioChatProvider;
-use OCA\OpenAi\TaskProcessing\AudioToAudioChatTaskType;
 use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
 use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
 use OCA\OpenAi\TaskProcessing\ChangeToneTaskType;
@@ -144,8 +143,9 @@ public function register(IRegistrationContext $context): void {
 				&& $this->appConfig->getValueString(Application::APP_ID, 'tts_provider_enabled', '1') === '1'
 			)
 		) {
-			$context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class);
-			$context->registerTaskProcessingProvider(AudioToAudioChatProvider::class);
+			if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) {
+				$context->registerTaskProcessingProvider(AudioToAudioChatProvider::class);
+			}
 		}
 
 		$context->registerCapability(Capabilities::class);
diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index 88291474..cc36b5e0 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -19,6 +19,7 @@
 use OCP\TaskProcessing\ISynchronousProvider;
 use OCP\TaskProcessing\ShapeDescriptor;
 use OCP\TaskProcessing\ShapeEnumValue;
+use OCP\TaskProcessing\TaskTypes\AudioToAudioChat;
 use Psr\Log\LoggerInterface;
 use RuntimeException;
 
@@ -42,10 +43,7 @@ public function getName(): string {
 	}
 
 	public function getTaskTypeId(): string {
-		if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) {
-			return \OCP\TaskProcessing\TaskTypes\AudioToAudioChat::ID;
-		}
-		return AudioToAudioChatTaskType::ID;
+		return AudioToAudioChat::ID;
 	}
 
 	public function getExpectedRuntime(): int {
@@ -129,13 +127,7 @@ public function getOutputShapeEnumValues(): array {
 	}
 
 	public function getOptionalOutputShape(): array {
-		return [
-			'input_transcript' => new ShapeDescriptor(
-				$this->l->t('Input transcript'),
-				$this->l->t('Transcription of the input audio'),
-				EShapeType::Text,
-			),
-		];
+		return [];
 	}
 
 	public function getOptionalOutputShapeEnumValues(): array {
@@ -203,14 +195,35 @@ public function process(?string $userId, array $input, callable $reportProgress)
 				'modalities' => ['text', 'audio'],
 				'audio' => ['voice' => $outputVoice, 'format' => 'mp3'],
 			];
+			$systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.';
 			$completion = $this->openAiAPIService->createChatCompletion(
 				$userId, $llmModel, null, $systemPrompt, $history, 1, 1000,
 				$extraParams, null, null, $b64Audio,
 			);
 			$message = array_pop($completion['audio_messages']);
+			// TODO find a way to force the model to answer with audio when there is only text in the history
+			// https://community.openai.com/t/gpt-4o-audio-preview-responds-in-text-not-audio/1006486/5
+			if ($message === null) {
+				// no audio, TTS the text message
+				try {
+					$textResponse = array_pop($completion['messages']);
+					$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $textResponse, $ttsModel, $outputVoice, $speed);
+					if (!isset($apiResponse['body'])) {
+						$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
+						throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
+					}
+					$output = $apiResponse['body'];
+				} catch (\Exception $e) {
+					$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
+					throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage());
+				}
+			} else {
+				$output = base64_decode($message['audio']['data']);
+				$textResponse = $message['audio']['transcript'];
+			}
 			$result = [
-				'output' => base64_decode($message['audio']['data']),
-				'output_transcript' => $message['audio']['transcript'],
+				'output' => $output,
+				'output_transcript' => $textResponse,
 			];
 
 			// we still want the input transcription
@@ -218,7 +231,8 @@ public function process(?string $userId, array $input, callable $reportProgress)
 				$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
 				$result['input_transcript'] = $inputTranscription;
 			} catch (Exception $e) {
-				$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+				$this->logger->warning($serviceName . ' audio input transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+				throw new RuntimeException($serviceName . ' audio input transcription failed with: ' . $e->getMessage());
 			}
 
 			return $result;
diff --git a/lib/TaskProcessing/AudioToAudioChatTaskType.php b/lib/TaskProcessing/AudioToAudioChatTaskType.php
deleted file mode 100644
index f4d43849..00000000
--- a/lib/TaskProcessing/AudioToAudioChatTaskType.php
+++ /dev/null
@@ -1,87 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-/**
- * SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-namespace OCA\OpenAi\TaskProcessing;
-
-use OCA\OpenAi\AppInfo\Application;
-use OCP\IL10N;
-use OCP\TaskProcessing\EShapeType;
-use OCP\TaskProcessing\ITaskType;
-use OCP\TaskProcessing\ShapeDescriptor;
-
-class AudioToAudioChatTaskType implements ITaskType {
-	public const ID = Application::APP_ID . ':audio2audio:chat';
-
-	public function __construct(
-		private IL10N $l,
-	) {
-	}
-
-	/**
-	 * @inheritDoc
-	 */
-	public function getName(): string {
-		return $this->l->t('ioa Voice chat');
-	}
-
-	/**
-	 * @inheritDoc
-	 */
-	public function getDescription(): string {
-		return $this->l->t('ioa Voice chat with the assistant');
-	}
-
-	/**
-	 * @return string
-	 */
-	public function getId(): string {
-		return self::ID;
-	}
-
-	/**
-	 * @return ShapeDescriptor[]
-	 */
-	public function getInputShape(): array {
-		return [
-			'system_prompt' => new ShapeDescriptor(
-				$this->l->t('System prompt'),
-				$this->l->t('Define rules and assumptions that the assistant should follow during the conversation.'),
-				EShapeType::Text,
-			),
-			'input' => new ShapeDescriptor(
-				$this->l->t('Chat voice message'),
-				$this->l->t('Describe a task that you want the assistant to do or ask a question'),
-				EShapeType::Audio,
-			),
-			'history' => new ShapeDescriptor(
-				$this->l->t('Chat history'),
-				$this->l->t('The history of chat messages before the current message, starting with a message by the user'),
-				EShapeType::ListOfTexts,
-			),
-		];
-	}
-
-	/**
-	 * @return ShapeDescriptor[]
-	 */
-	public function getOutputShape(): array {
-		return [
-			'output' => new ShapeDescriptor(
-				$this->l->t('Response voice message'),
-				$this->l->t('The generated response as part of the conversation'),
-				EShapeType::Audio
-			),
-			'output_transcript' => new ShapeDescriptor(
-				$this->l->t('Output transcript'),
-				$this->l->t('Response transcription'),
-				EShapeType::Text,
-			),
-		];
-	}
-}

From 42c644c9e0a4980523d6b1389dbd496f4920dad3 Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Wed, 9 Jul 2025 15:28:45 +0200
Subject: [PATCH 09/11] isolate oneStep audio2audio processing in a provider
 method

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 .../AudioToAudioChatProvider.php              | 103 ++++++++++--------
 1 file changed, 56 insertions(+), 47 deletions(-)

diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index cc36b5e0..ca9929ef 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -185,58 +185,15 @@ public function process(?string $userId, array $input, callable $reportProgress)
 		}
 
 		$sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID;
-
 		$serviceName = $this->appConfig->getValueString(Application::APP_ID, 'service_name') ?: Application::APP_ID;
 
 		/////////////// Using the chat API if connected to OpenAI
+		// there is an issue if the history mostly contains text, the model will answer text even if we add the audio modality
+		/*
 		if ($this->openAiAPIService->isUsingOpenAi()) {
-			$b64Audio = base64_encode($inputFile->getContent());
-			$extraParams = [
-				'modalities' => ['text', 'audio'],
-				'audio' => ['voice' => $outputVoice, 'format' => 'mp3'],
-			];
-			$systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.';
-			$completion = $this->openAiAPIService->createChatCompletion(
-				$userId, $llmModel, null, $systemPrompt, $history, 1, 1000,
-				$extraParams, null, null, $b64Audio,
-			);
-			$message = array_pop($completion['audio_messages']);
-			// TODO find a way to force the model to answer with audio when there is only text in the history
-			// https://community.openai.com/t/gpt-4o-audio-preview-responds-in-text-not-audio/1006486/5
-			if ($message === null) {
-				// no audio, TTS the text message
-				try {
-					$textResponse = array_pop($completion['messages']);
-					$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $textResponse, $ttsModel, $outputVoice, $speed);
-					if (!isset($apiResponse['body'])) {
-						$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
-						throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
-					}
-					$output = $apiResponse['body'];
-				} catch (\Exception $e) {
-					$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
-					throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage());
-				}
-			} else {
-				$output = base64_decode($message['audio']['data']);
-				$textResponse = $message['audio']['transcript'];
-			}
-			$result = [
-				'output' => $output,
-				'output_transcript' => $textResponse,
-			];
-
-			// we still want the input transcription
-			try {
-				$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
-				$result['input_transcript'] = $inputTranscription;
-			} catch (Exception $e) {
-				$this->logger->warning($serviceName . ' audio input transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
-				throw new RuntimeException($serviceName . ' audio input transcription failed with: ' . $e->getMessage());
-			}
-
-			return $result;
+			return $this->oneStep($userId, $systemPrompt, $inputFile, $history, $outputVoice, $sttModel, $llmModel, $ttsModel, $speed, $serviceName);
 		}
+		*/
 
 		//////////////// 3 steps: STT -> LLM -> TTS
 		// speech to text
@@ -277,4 +234,56 @@ public function process(?string $userId, array $input, callable $reportProgress)
 			throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage());
 		}
 	}
+
+	private function oneStep(
+		?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice,
+		string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName
+	): array {
+		$b64Audio = base64_encode($inputFile->getContent());
+		$extraParams = [
+			'modalities' => ['text', 'audio'],
+			'audio' => ['voice' => $outputVoice, 'format' => 'mp3'],
+		];
+		$systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.';
+		$completion = $this->openAiAPIService->createChatCompletion(
+			$userId, $llmModel, null, $systemPrompt, $history, 1, 1000,
+			$extraParams, null, null, $b64Audio,
+		);
+		$message = array_pop($completion['audio_messages']);
+		// TODO find a way to force the model to answer with audio when there is only text in the history
+		// https://community.openai.com/t/gpt-4o-audio-preview-responds-in-text-not-audio/1006486/5
+		if ($message === null) {
+			// no audio, TTS the text message
+			try {
+				$textResponse = array_pop($completion['messages']);
+				$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $textResponse, $ttsModel, $outputVoice, $speed);
+				if (!isset($apiResponse['body'])) {
+					$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
+					throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
+				}
+				$output = $apiResponse['body'];
+			} catch (\Exception $e) {
+				$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
+				throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage());
+			}
+		} else {
+			$output = base64_decode($message['audio']['data']);
+			$textResponse = $message['audio']['transcript'];
+		}
+		$result = [
+			'output' => $output,
+			'output_transcript' => $textResponse,
+		];
+
+		// we still want the input transcription
+		try {
+			$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
+			$result['input_transcript'] = $inputTranscription;
+		} catch (Exception $e) {
+			$this->logger->warning($serviceName . ' audio input transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+			throw new RuntimeException($serviceName . ' audio input transcription failed with: ' . $e->getMessage());
+		}
+
+		return $result;
+	}
 }

From cd67b7fdeb4a13afc57764267dd044c59f9a5e5f Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Wed, 9 Jul 2025 16:26:11 +0200
Subject: [PATCH 10/11] try to return the remote audio ID when using the chat
 endpoint with a multimodal model

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 .../AudioToAudioChatProvider.php              | 107 ++++++++++--------
 psalm.xml                                     |   1 +
 2 files changed, 61 insertions(+), 47 deletions(-)

diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index ca9929ef..afa54e1e 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -127,7 +127,13 @@ public function getOutputShapeEnumValues(): array {
 	}
 
 	public function getOptionalOutputShape(): array {
-		return [];
+		return [
+			'audio_id' => new ShapeDescriptor(
+				$this->l->t('Remote audio ID'),
+				$this->l->t('The ID of the audio response returned by the remote service'),
+				EShapeType::Text
+			),
+		];
 	}
 
 	public function getOptionalOutputShapeEnumValues(): array {
@@ -187,58 +193,21 @@ public function process(?string $userId, array $input, callable $reportProgress)
 		$sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID;
 		$serviceName = $this->appConfig->getValueString(Application::APP_ID, 'service_name') ?: Application::APP_ID;
 
-		/////////////// Using the chat API if connected to OpenAI
+		// Using the chat API if connected to OpenAI
 		// there is an issue if the history mostly contains text, the model will answer text even if we add the audio modality
-		/*
 		if ($this->openAiAPIService->isUsingOpenAi()) {
 			return $this->oneStep($userId, $systemPrompt, $inputFile, $history, $outputVoice, $sttModel, $llmModel, $ttsModel, $speed, $serviceName);
 		}
-		*/
-
-		//////////////// 3 steps: STT -> LLM -> TTS
-		// speech to text
-		try {
-			$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
-		} catch (Exception $e) {
-			$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
-			throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage());
-		}
-
-		// free prompt
-		try {
-			$completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000);
-			$completion = $completion['messages'];
-		} catch (Exception $e) {
-			throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage());
-		}
-		if (count($completion) === 0) {
-			throw new RuntimeException('No completion in ' . $serviceName . ' response.');
-		}
-		$llmResult = array_pop($completion);
-
-		// text to speech
-		try {
-			$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed);
 
-			if (!isset($apiResponse['body'])) {
-				$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
-				throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
-			}
-			return [
-				'output' => $apiResponse['body'],
-				'output_transcript' => $llmResult,
-				'input_transcript' => $inputTranscription,
-			];
-		} catch (\Exception $e) {
-			$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
-			throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage());
-		}
+		// 3 steps: STT -> LLM -> TTS
+		return $this->threeSteps($userId, $systemPrompt, $inputFile, $history, $outputVoice, $sttModel, $llmModel, $ttsModel, $speed, $serviceName);
 	}
 
 	private function oneStep(
 		?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice,
-		string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName
+		string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName,
 	): array {
+		$result = [];
 		$b64Audio = base64_encode($inputFile->getContent());
 		$extraParams = [
 			'modalities' => ['text', 'audio'],
@@ -269,11 +238,12 @@ private function oneStep(
 		} else {
 			$output = base64_decode($message['audio']['data']);
 			$textResponse = $message['audio']['transcript'];
+			if (isset($message['audio']['id'])) {
+				$result['audio_id'] = $message['audio']['id'];
+			}
 		}
-		$result = [
-			'output' => $output,
-			'output_transcript' => $textResponse,
-		];
+		$result['output'] = $output;
+		$result['output_transcript'] = $textResponse;
 
 		// we still want the input transcription
 		try {
@@ -286,4 +256,66 @@ private function oneStep(
 
 		return $result;
 	}
+
+	/**
+	 * Answer an audio message in three separate requests: speech-to-text on the input file,
+	 * chat completion on the resulting transcript, then text-to-speech on the completion.
+	 *
+	 * @param string|null $userId passed on to every API service call
+	 * @param string $systemPrompt system prompt forwarded to the chat completion request
+	 * @param File $inputFile audio file to transcribe
+	 * @param array $history history forwarded to the chat completion request
+	 * @param string $outputVoice voice passed to the speech creation request
+	 * @param string $sttModel model passed to the transcription request
+	 * @param string $llmModel model passed to the chat completion request
+	 * @param string $ttsModel model passed to the speech creation request
+	 * @param float $speed speed passed to the speech creation request
+	 * @param string $serviceName label used as a prefix in log and exception messages
+	 * @return array with the keys 'output' (speech response body), 'output_transcript' and 'input_transcript'
+	 * @throws RuntimeException if one of the three steps fails or yields no result
+	 */
+	private function threeSteps(
+		?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice,
+		string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName,
+	): array {
+		// speech to text
+		try {
+			$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
+		} catch (Exception $e) {
+			$this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
+			throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage(), 0, $e);
+		}
+
+		// free prompt
+		try {
+			$completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000);
+			$completion = $completion['messages'];
+		} catch (Exception $e) {
+			throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage(), 0, $e);
+		}
+		if (count($completion) === 0) {
+			throw new RuntimeException('No completion in ' . $serviceName . ' response.');
+		}
+		$llmResult = array_pop($completion);
+
+		// text to speech
+		try {
+			$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed);
+		} catch (\Exception $e) {
+			$this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
+			throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), 0, $e);
+		}
+		// checked outside of the try block: thrown from inside, this exception would be caught
+		// by the catch above, logged a second time and wrapped in a redundant message
+		if (!isset($apiResponse['body'])) {
+			$this->logger->warning($serviceName . ' text to speech generation failed: no speech returned');
+			throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned');
+		}
+
+		return [
+			'output' => $apiResponse['body'],
+			'output_transcript' => $llmResult,
+			'input_transcript' => $inputTranscription,
+		];
+	}
 }
diff --git a/psalm.xml b/psalm.xml
index 8ca4dcbc..128fcc86 100644
--- a/psalm.xml
+++ b/psalm.xml
@@ -39,6 +39,7 @@
 				<referencedClass name="OCP\TaskProcessing\EShapeType" />
 				<referencedClass name="OCP\TaskProcessing\TaskTypes\TextToTextProofread" />
 				<referencedClass name="OCP\TaskProcessing\TaskTypes\TextToTextChatWithTools" />
+				<referencedClass name="OCP\TaskProcessing\TaskTypes\AudioToAudioChat" />
 			</errorLevel>
 		</UndefinedClass>
 		<UndefinedDocblockClass>

From 3ea9b777ebcb63adc6ad67c9c012d2db35874f9d Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Thu, 10 Jul 2025 12:09:04 +0200
Subject: [PATCH 11/11] return remote audio expiration date

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 lib/TaskProcessing/AudioToAudioChatProvider.php | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index afa54e1e..637843f5 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -133,6 +133,11 @@ public function getOptionalOutputShape(): array {
 				$this->l->t('The ID of the audio response returned by the remote service'),
 				EShapeType::Text
 			),
+			'audio_expires_at' => new ShapeDescriptor(
+				$this->l->t('Remote audio expiration date'),
+				$this->l->t('The remote audio response stays available in the service until this date'),
+				EShapeType::Number
+			),
 		];
 	}
 
@@ -241,6 +246,9 @@ private function oneStep(
 			if (isset($message['audio']['id'])) {
 				$result['audio_id'] = $message['audio']['id'];
 			}
+			if (isset($message['audio']['expires_at'])) {
+				$result['audio_expires_at'] = $message['audio']['expires_at'];
+			}
 		}
 		$result['output'] = $output;
 		$result['output_transcript'] = $textResponse;