From f178063a9d05ca02302decc57b0246594e17c72d Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Tue, 1 Jul 2025 18:26:52 +0200 Subject: [PATCH 01/11] implement audio chat provider Signed-off-by: Julien Veyssier --- lib/AppInfo/Application.php | 4 + lib/Service/OpenAiAPIService.php | 28 +- .../AudioToAudioChatProvider.php | 256 ++++++++++++++++++ .../AudioToAudioChatTaskType.php | 82 ++++++ 4 files changed, 367 insertions(+), 3 deletions(-) create mode 100644 lib/TaskProcessing/AudioToAudioChatProvider.php create mode 100644 lib/TaskProcessing/AudioToAudioChatTaskType.php diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php index 2097212e..54f23c85 100644 --- a/lib/AppInfo/Application.php +++ b/lib/AppInfo/Application.php @@ -9,6 +9,8 @@ use OCA\OpenAi\Capabilities; use OCA\OpenAi\OldProcessing\Translation\TranslationProvider as OldTranslationProvider; +use OCA\OpenAi\TaskProcessing\AudioToAudioChatProvider; +use OCA\OpenAi\TaskProcessing\AudioToAudioChatTaskType; use OCA\OpenAi\TaskProcessing\AudioToTextProvider; use OCA\OpenAi\TaskProcessing\ChangeToneProvider; use OCA\OpenAi\TaskProcessing\ChangeToneTaskType; @@ -130,6 +132,8 @@ public function register(IRegistrationContext $context): void { if ($this->appConfig->getValueString(Application::APP_ID, 't2i_provider_enabled', '1') === '1') { $context->registerTaskProcessingProvider(TextToImageProvider::class); } + $context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class); + $context->registerTaskProcessingProvider(AudioToAudioChatProvider::class); $context->registerCapability(Capabilities::class); } diff --git a/lib/Service/OpenAiAPIService.php b/lib/Service/OpenAiAPIService.php index 0ebe187f..1c4bede7 100644 --- a/lib/Service/OpenAiAPIService.php +++ b/lib/Service/OpenAiAPIService.php @@ -437,7 +437,8 @@ public function createCompletion( * @param array|null $extraParams * @param string|null $toolMessage JSON string with role, content, tool_call_id * @param array|null $tools - * @return array{messages: array, tool_calls: array} + * @param string|null $userAudioPromptBase64 + * @return array{messages: array, tool_calls: array, audio_messages: list>} * @throws Exception */ public function createChatCompletion( @@ -451,6 +452,7 @@ public function createChatCompletion( ?array $extraParams = null, ?string $toolMessage = null, ?array $tools = null, + ?string $userAudioPromptBase64 = null, ): array { if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TEXT)) { throw new Exception($this->l10n->t('Text generation quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS); @@ -494,8 +496,24 @@ public function createChatCompletion( $messages[] = $message; } } - if ($userPrompt !== null) { - $messages[] = ['role' => 'user', 'content' => $userPrompt]; + if ($userPrompt !== null || $userAudioPromptBase64 !== null) { + $message = ['role' => 'user', 'content' => []]; + if ($userPrompt !== null) { + $message['content'][] = [ + 'type' => 'text', + 'text' => $userPrompt, + ]; + } + if ($userAudioPromptBase64 !== null) { + $message['content'][] = [ + 'type' => 'input_audio', + 'input_audio' => [ + 'data' => $userAudioPromptBase64, + 'format' => 'mp3', + ], + ]; + } + $messages[] = $message; } if ($toolMessage !== null) { $msgs = json_decode($toolMessage, true); @@ -555,6 +573,7 @@ public function createChatCompletion( $completions = [ 'messages' => [], 'tool_calls' => [], + 'audio_messages' => [], ]; foreach ($response['choices'] as $choice) { @@ -583,6 +602,9 @@ public function createChatCompletion( if (isset($choice['message']['content']) && is_string($choice['message']['content'])) { $completions['messages'][] = $choice['message']['content']; } + if (isset($choice['message']['audio'], $choice['message']['audio']['data']) && is_string($choice['message']['audio']['data'])) { + $completions['audio_messages'][] = $choice['message']; + } } return $completions; diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php new file mode 100644 index 00000000..7a879b70 --- /dev/null +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -0,0 +1,256 @@ +openAiAPIService->getServiceName(); + } + + public function getTaskTypeId(): string { + if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) { + return \OCP\TaskProcessing\TaskTypes\AudioToAudioChat::ID; + } + return AudioToAudioChatTaskType::ID; + } + + public function getExpectedRuntime(): int { + return $this->openAiAPIService->getExpTextProcessingTime(); + } + + public function getInputShapeEnumValues(): array { + return []; + } + + public function getInputShapeDefaults(): array { + return []; + } + + + public function getOptionalInputShape(): array { + return [ + 'llm_model' => new ShapeDescriptor( + $this->l->t('Completion model'), + $this->l->t('The model used to generate the completion'), + EShapeType::Enum + ), + 'voice' => new ShapeDescriptor( + $this->l->t('Voice'), + $this->l->t('The voice to use'), + EShapeType::Enum + ), + 'tts_model' => new ShapeDescriptor( + $this->l->t('Text-to-speech model'), + $this->l->t('The model used to generate the speech'), + EShapeType::Enum + ), + 'speed' => new ShapeDescriptor( + $this->l->t('Speed'), + $this->openAiAPIService->isUsingOpenAi() + ? $this->l->t('Speech speed modifier (Valid values: 0.25-4)') + : $this->l->t('Speech speed modifier'), + EShapeType::Number + ) + ]; + } + + public function getOptionalInputShapeEnumValues(): array { + $voices = json_decode($this->appConfig->getValueString(Application::APP_ID, 'tts_voices')) ?: Application::DEFAULT_SPEECH_VOICES; + $models = $this->openAiAPIService->getModelEnumValues($this->userId); + return [ + 'voice' => array_map(function ($v) { return new ShapeEnumValue($v, $v); }, $voices), + 'llm_model' => $models, + 'tts_model' => $models, + ]; + } + + public function getOptionalInputShapeDefaults(): array { + $adminVoice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice') ?: Application::DEFAULT_SPEECH_VOICE; + $adminTtsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id') ?: Application::DEFAULT_SPEECH_MODEL_ID; + $adminLlmModel = $this->openAiAPIService->isUsingOpenAi() + ? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID) + : $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id'); + return [ + 'voice' => $adminVoice, + 'tts_model' => $adminTtsModel, + 'speed' => 1, + 'llm_model' => $adminLlmModel, + ]; + } + + public function getOutputShapeEnumValues(): array { + return []; + } + + public function getOptionalOutputShape(): array { + return [ + 'input_transcript' => new ShapeDescriptor( + $this->l->t('Input transcript'), + $this->l->t('Input transcription'), + EShapeType::Text, + ), + 'output_transcript' => new ShapeDescriptor( + $this->l->t('Output transcript'), + $this->l->t('Response transcription'), + EShapeType::Text, + ), + ]; + } + + public function getOptionalOutputShapeEnumValues(): array { + return []; + } + + public function process(?string $userId, array $input, callable $reportProgress): array { + if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) { + throw new RuntimeException('Invalid input file'); + } + $inputFile = $input['input']; + + if (!isset($input['system_prompt']) || !is_string($input['system_prompt'])) { + throw new RuntimeException('Invalid system_prompt'); + } + $systemPrompt = $input['system_prompt']; + + if (!isset($input['history']) || !is_array($input['history'])) { + throw new RuntimeException('Invalid history'); + } + $history = $input['history']; + + if (isset($input['tts_model']) && is_string($input['tts_model'])) { + $ttsModel = $input['tts_model']; + } else { + $ttsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id', Application::DEFAULT_SPEECH_MODEL_ID) ?: Application::DEFAULT_SPEECH_MODEL_ID; + } + + if (isset($input['llm_model']) && is_string($input['llm_model'])) { + $llmModel = $input['llm_model']; + } else { + $llmModel = $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID; + } + + + if (isset($input['voice']) && is_string($input['voice'])) { + $voice = $input['voice']; + } else { + $voice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE) ?: Application::DEFAULT_SPEECH_VOICE; + } + + $speed = 1; + if (isset($input['speed']) && is_numeric($input['speed'])) { + $speed = $input['speed']; + if ($this->openAiAPIService->isUsingOpenAi()) { + if ($speed > 4) { + $speed = 4; + } elseif ($speed < 0.25) { + $speed = 0.25; + } + } + } + + $sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID; + + /////////////// Using the chat API if connected to OpenAI + if ($this->openAiAPIService->isUsingOpenAi()) { + $b64Audio = base64_encode($inputFile->getContent()); + $extraParams = [ + 'modalities' => ['text', 'audio'], + 'audio' => ['voice' => $voice, 'format' => 'mp3'], + ]; + $completion = $this->openAiAPIService->createChatCompletion( + $userId, 'gpt-4o-audio-preview', null, $systemPrompt, $history, 1, 1000, + $extraParams, null, null, $b64Audio, + ); + $message = array_pop($completion['audio_messages']); + $result = [ + 'output' => base64_decode($message['audio']['data']), + 'output_transcript' => $message['audio']['transcript'], + ]; + + // we still want the input transcription + try { + $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); + $result['input_transcript'] = $inputTranscription; + } catch (Exception $e) { + $this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + } + + return $result; + } + + //////////////// 3 steps: STT -> LLM -> TTS + // speech to text + try { + $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); + } catch (Exception $e) { + $this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage()); + } + + // free prompt + try { + $completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000); + $completion = $completion['messages']; + } catch (Exception $e) { + throw new RuntimeException('OpenAI/LocalAI request failed: ' . $e->getMessage()); + } + if (count($completion) === 0) { + throw new RuntimeException('No completion in OpenAI/LocalAI response.'); + } + $llmResult = array_pop($completion); + + // text to speech + try { + $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $voice, $speed); + + if (!isset($apiResponse['body'])) { + $this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed: no speech returned'); + throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed: no speech returned'); + } + return [ + 'output' => $apiResponse['body'], + 'output_transcript' => $llmResult, + 'input_transcript' => $inputTranscription, + ]; + } catch (\Exception $e) { + $this->logger->warning('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage()); + } + } +} diff --git a/lib/TaskProcessing/AudioToAudioChatTaskType.php b/lib/TaskProcessing/AudioToAudioChatTaskType.php new file mode 100644 index 00000000..ba5eaed5 --- /dev/null +++ b/lib/TaskProcessing/AudioToAudioChatTaskType.php @@ -0,0 +1,82 @@ +l->t('ioa Voice chat'); + } + + /** + * @inheritDoc + */ + public function getDescription(): string { + return $this->l->t('ioa Voice chat with the assistant'); + } + + /** + * @return string + */ + public function getId(): string { + return self::ID; + } + + /** + * @return ShapeDescriptor[] + */ + public function getInputShape(): array { + return [ + 'system_prompt' => new ShapeDescriptor( + $this->l->t('System prompt'), + $this->l->t('Define rules and assumptions that the assistant should follow during the conversation.'), + EShapeType::Text, + ), + 'input' => new ShapeDescriptor( + $this->l->t('Chat voice message'), + $this->l->t('Describe a task that you want the assistant to do or ask a question'), + EShapeType::Audio, + ), + 'history' => new ShapeDescriptor( + $this->l->t('Chat history'), + $this->l->t('The history of chat messages before the current message, starting with a message by the user'), + EShapeType::ListOfTexts, + ), + ]; + } + + /** + * @return ShapeDescriptor[] + */ + public function getOutputShape(): array { + return [ + 'output' => new ShapeDescriptor( + $this->l->t('Response voice message'), + $this->l->t('The generated response as part of the conversation'), + EShapeType::Audio + ), + ]; + } +} From 6334caed279ebb3e8767c364283ee52198c4eeec Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Thu, 3 Jul 2025 17:56:13 +0200 Subject: [PATCH 02/11] adjust AudioToAudioChatTaskType Signed-off-by: Julien Veyssier --- lib/TaskProcessing/AudioToAudioChatProvider.php | 9 +-------- lib/TaskProcessing/AudioToAudioChatTaskType.php | 5 +++++ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php index 7a879b70..5f673bc2 100644 --- a/lib/TaskProcessing/AudioToAudioChatProvider.php +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -12,7 +12,6 @@ use Exception; use OCA\OpenAi\AppInfo\Application; use OCA\OpenAi\Service\OpenAiAPIService; -use OCA\OpenAi\Service\OpenAiSettingsService; use OCP\Files\File; use OCP\IAppConfig; use OCP\IL10N; @@ -27,7 +26,6 @@ class AudioToAudioChatProvider implements ISynchronousProvider { public function __construct( private OpenAiAPIService $openAiAPIService, - private OpenAiSettingsService $openAiSettingsService, private IL10N $l, private LoggerInterface $logger, private IAppConfig $appConfig, @@ -125,11 +123,6 @@ public function getOptionalOutputShape(): array { $this->l->t('Input transcription'), EShapeType::Text, ), - 'output_transcript' => new ShapeDescriptor( - $this->l->t('Output transcript'), - $this->l->t('Response transcription'), - EShapeType::Text, - ), ]; } @@ -249,7 +242,7 @@ public function process(?string $userId, array $input, callable $reportProgress) 'input_transcript' => $inputTranscription, ]; } catch (\Exception $e) { - $this->logger->warning('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage(), ['exception' => $e]); + $this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); throw new RuntimeException('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage()); } } diff --git a/lib/TaskProcessing/AudioToAudioChatTaskType.php b/lib/TaskProcessing/AudioToAudioChatTaskType.php index ba5eaed5..f4d43849 100644 --- a/lib/TaskProcessing/AudioToAudioChatTaskType.php +++ b/lib/TaskProcessing/AudioToAudioChatTaskType.php @@ -77,6 +77,11 @@ public function getOutputShape(): array { $this->l->t('The generated response as part of the conversation'), EShapeType::Audio ), + 'output_transcript' => new ShapeDescriptor( + $this->l->t('Output transcript'), + $this->l->t('Response transcription'), + EShapeType::Text, + ), ]; } } From e3a6d2d26fb82a592ca7bfaac967468e5386b45a Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Thu, 3 Jul 2025 18:05:24 +0200 Subject: [PATCH 03/11] feat(audio-chat): add condition to register provider and task type Signed-off-by: Julien Veyssier --- lib/AppInfo/Application.php | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php index 54f23c85..1a46168c 100644 --- a/lib/AppInfo/Application.php +++ b/lib/AppInfo/Application.php @@ -132,8 +132,21 @@ public function register(IRegistrationContext $context): void { if ($this->appConfig->getValueString(Application::APP_ID, 't2i_provider_enabled', '1') === '1') { $context->registerTaskProcessingProvider(TextToImageProvider::class); } - $context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class); - $context->registerTaskProcessingProvider(AudioToAudioChatProvider::class); + + // only register audio chat stuff if we're using OpenAI or stt+llm+tts are enabled + $serviceUrl = $this->appConfig->getValueString(Application::APP_ID, 'url'); + $isUsingOpenAI = $serviceUrl === '' || $serviceUrl === Application::OPENAI_API_BASE_URL; + if ( + $isUsingOpenAI + || ( + $this->appConfig->getValueString(Application::APP_ID, 'stt_provider_enabled', '1') === '1' + && $this->appConfig->getValueString(Application::APP_ID, 'llm_provider_enabled', '1') === '1' + && $this->appConfig->getValueString(Application::APP_ID, 'tts_provider_enabled', '1') === '1' + ) + ) { + $context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class); + $context->registerTaskProcessingProvider(AudioToAudioChatProvider::class); + } $context->registerCapability(Capabilities::class); } From b87598770ee4224524420cff8c8d80bf39d3232c Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Thu, 3 Jul 2025 18:15:15 +0200 Subject: [PATCH 04/11] review adjustments Signed-off-by: Julien Veyssier --- .../AudioToAudioChatProvider.php | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php index 5f673bc2..db891512 100644 --- a/lib/TaskProcessing/AudioToAudioChatProvider.php +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -69,8 +69,8 @@ public function getOptionalInputShape(): array { EShapeType::Enum ), 'voice' => new ShapeDescriptor( - $this->l->t('Voice'), - $this->l->t('The voice to use'), + $this->l->t('Output voice'), + $this->l->t('The voice used to generate speech'), EShapeType::Enum ), 'tts_model' => new ShapeDescriptor( @@ -120,7 +120,7 @@ public function getOptionalOutputShape(): array { return [ 'input_transcript' => new ShapeDescriptor( $this->l->t('Input transcript'), - $this->l->t('Input transcription'), + $this->l->t('Transcription of the input audio'), EShapeType::Text, ), ]; @@ -132,7 +132,7 @@ public function getOptionalOutputShapeEnumValues(): array { public function process(?string $userId, array $input, callable $reportProgress): array { if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) { - throw new RuntimeException('Invalid input file'); + throw new RuntimeException('Invalid input audio file in the "input" field. A readable file is expected.'); } $inputFile = $input['input']; @@ -142,7 +142,7 @@ public function process(?string $userId, array $input, callable $reportProgress) $systemPrompt = $input['system_prompt']; if (!isset($input['history']) || !is_array($input['history'])) { - throw new RuntimeException('Invalid history'); + throw new RuntimeException('Invalid chat history, array expected'); } $history = $input['history']; @@ -160,9 +160,9 @@ public function process(?string $userId, array $input, callable $reportProgress) if (isset($input['voice']) && is_string($input['voice'])) { - $voice = $input['voice']; + $outputVoice = $input['voice']; } else { - $voice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE) ?: Application::DEFAULT_SPEECH_VOICE; + $outputVoice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE) ?: Application::DEFAULT_SPEECH_VOICE; } $speed = 1; @@ -184,7 +184,7 @@ public function process(?string $userId, array $input, callable $reportProgress) $b64Audio = base64_encode($inputFile->getContent()); $extraParams = [ 'modalities' => ['text', 'audio'], - 'audio' => ['voice' => $voice, 'format' => 'mp3'], + 'audio' => ['voice' => $outputVoice, 'format' => 'mp3'], ]; $completion = $this->openAiAPIService->createChatCompletion( $userId, 'gpt-4o-audio-preview', null, $systemPrompt, $history, 1, 1000, @@ -230,7 +230,7 @@ public function process(?string $userId, array $input, callable $reportProgress) // text to speech try { - $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $voice, $speed); + $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed); if (!isset($apiResponse['body'])) { $this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed: no speech returned'); @@ -243,7 +243,7 @@ public function process(?string $userId, array $input, callable $reportProgress) ]; } catch (\Exception $e) { $this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); - throw new RuntimeException('OpenAI/LocalAI\'s text to image generation failed with: ' . $e->getMessage()); + throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage()); } } } From 2054ea65b75f9f65d09c9ff35ea2a39a0abe0966 Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Thu, 3 Jul 2025 18:28:18 +0200 Subject: [PATCH 05/11] fix tests Signed-off-by: Julien Veyssier --- tests/unit/Providers/OpenAiProviderTest.php | 23 ++++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/unit/Providers/OpenAiProviderTest.php b/tests/unit/Providers/OpenAiProviderTest.php index ee9923b2..fe0d2bd5 100644 --- a/tests/unit/Providers/OpenAiProviderTest.php +++ b/tests/unit/Providers/OpenAiProviderTest.php @@ -140,7 +140,7 @@ public function testFreePromptProvider(): void { $options = ['timeout' => Application::OPENAI_DEFAULT_REQUEST_TIMEOUT, 'headers' => ['User-Agent' => Application::USER_AGENT, 'Authorization' => self::AUTHORIZATION_HEADER, 'Content-Type' => 'application/json']]; $options['body'] = json_encode([ 'model' => Application::DEFAULT_COMPLETION_MODEL_ID, - 'messages' => [['role' => 'user', 'content' => $prompt]], + 'messages' => [['role' => 'user', 'content' => [['type' => 'text', 'text' => $prompt]]]], 'n' => $n, 'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS, 'user' => self::TEST_USER1, @@ -204,7 +204,7 @@ public function testEmojiProvider(): void { $message = 'Give me an emoji for the following text. Output only the emoji without any other characters.' . "\n\n" . $prompt; $options['body'] = json_encode([ 'model' => Application::DEFAULT_COMPLETION_MODEL_ID, - 'messages' => [['role' => 'user', 'content' => $message]], + 'messages' => [['role' => 'user', 'content' => [['type' => 'text', 'text' => $message]]]], 'n' => $n, 'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS, 'user' => self::TEST_USER1, @@ -269,7 +269,7 @@ public function testHeadlineProvider(): void { $message = 'Give me the headline of the following text in its original language. Do not output the language. Output only the headline without any quotes or additional punctuation.' . "\n\n" . $prompt; $options['body'] = json_encode([ 'model' => Application::DEFAULT_COMPLETION_MODEL_ID, - 'messages' => [['role' => 'user', 'content' => $message]], + 'messages' => [['role' => 'user', 'content' => [['type' => 'text', 'text' => $message]]]], 'n' => $n, 'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS, 'user' => self::TEST_USER1, @@ -334,7 +334,7 @@ public function testChangeToneProvider(): void { $message = "Reformulate the following text in a $toneInput tone in its original language. Output only the reformulation. Here is the text:" . "\n\n" . $textInput . "\n\n" . 'Do not mention the used language in your reformulation. Here is your reformulation in the same language:'; $options['body'] = json_encode([ 'model' => Application::DEFAULT_COMPLETION_MODEL_ID, - 'messages' => [['role' => 'user', 'content' => $message]], + 'messages' => [['role' => 'user', 'content' => [['type' => 'text', 'text' => $message]]]], 'n' => $n, 'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS, 'user' => self::TEST_USER1, @@ -400,8 +400,10 @@ public function testSummaryProvider(): void { . 'You should only return the summary without any additional information.'; $options['body'] = json_encode([ 'model' => Application::DEFAULT_COMPLETION_MODEL_ID, - 'messages' => [['role' => 'system', 'content' => $systemPrompt], - ['role' => 'user', 'content' => $prompt]], + 'messages' => [ + ['role' => 'system', 'content' => $systemPrompt], + ['role' => 'user', 'content' => [['type' => 'text', 'text' => $prompt]]], + ], 'n' => $n, 'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS, 'user' => self::TEST_USER1, @@ -465,7 +467,10 @@ public function testProofreadProvider(): void { $systemPrompt = 'Proofread the following text. List all spelling and grammar mistakes and how to correct them. Output only the list.'; $options['body'] = json_encode([ 'model' => Application::DEFAULT_COMPLETION_MODEL_ID, - 'messages' => [['role' => 'system', 'content' => $systemPrompt],['role' => 'user', 'content' => $prompt]], + 'messages' => [ + ['role' => 'system', 'content' => $systemPrompt], + ['role' => 'user', 'content' => [['type' => 'text', 'text' => $prompt]]], + ], 'n' => $n, 'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS, 'user' => self::TEST_USER1, @@ -533,7 +538,9 @@ public function testTranslationProvider(): void { $options = ['timeout' => Application::OPENAI_DEFAULT_REQUEST_TIMEOUT, 'headers' => ['User-Agent' => Application::USER_AGENT, 'Authorization' => self::AUTHORIZATION_HEADER, 'Content-Type' => 'application/json']]; $options['body'] = json_encode([ 'model' => Application::DEFAULT_COMPLETION_MODEL_ID, - 'messages' => [['role' => 'user', 'content' => 'Translate from ' . $fromLang . ' to English (US): ' . $inputText]], + 'messages' => [ + ['role' => 'user', 'content' => [['type' => 'text', 'text' => 'Translate from ' . $fromLang . ' to English (US): ' . $inputText]]], + ], 'n' => $n, 'max_completion_tokens' => Application::DEFAULT_MAX_NUM_OF_TOKENS, 'user' => self::TEST_USER1, From 456a8a17125585e057acbf4dbdf43d551b07b322 Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Fri, 4 Jul 2025 14:33:37 +0200 Subject: [PATCH 06/11] change defaults and enum values if using openai or not, use llmModel input if using chat endpoint Signed-off-by: Julien Veyssier --- .../AudioToAudioChatProvider.php | 47 ++++++++++++------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php index db891512..8fbdc89f 100644 --- a/lib/TaskProcessing/AudioToAudioChatProvider.php +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -62,7 +62,8 @@ public function getInputShapeDefaults(): array { public function getOptionalInputShape(): array { - return [ + $isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi(); + $ois = [ 'llm_model' => new ShapeDescriptor( $this->l->t('Completion model'), $this->l->t('The model used to generate the completion'), @@ -73,43 +74,54 @@ public function getOptionalInputShape(): array { $this->l->t('The voice used to generate speech'), EShapeType::Enum ), - 'tts_model' => new ShapeDescriptor( + ]; + if (!$isUsingOpenAi) { + $ois['tts_model'] = new ShapeDescriptor( $this->l->t('Text-to-speech model'), $this->l->t('The model used to generate the speech'), EShapeType::Enum - ), - 'speed' => new ShapeDescriptor( + ); + $ois['speed'] = new ShapeDescriptor( $this->l->t('Speed'), $this->openAiAPIService->isUsingOpenAi() ? $this->l->t('Speech speed modifier (Valid values: 0.25-4)') : $this->l->t('Speech speed modifier'), EShapeType::Number - ) - ]; + ); + } + return $ois; } public function getOptionalInputShapeEnumValues(): array { + $isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi(); $voices = json_decode($this->appConfig->getValueString(Application::APP_ID, 'tts_voices')) ?: Application::DEFAULT_SPEECH_VOICES; $models = $this->openAiAPIService->getModelEnumValues($this->userId); - return [ + $enumValues = [ 'voice' => array_map(function ($v) { return new ShapeEnumValue($v, $v); }, $voices), 'llm_model' => $models, - 'tts_model' => $models, ]; + if (!$isUsingOpenAi) { + $enumValues['tts_model'] = $models; + } + return $enumValues; } public function getOptionalInputShapeDefaults(): array { + $isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi(); $adminVoice = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_voice') ?: Application::DEFAULT_SPEECH_VOICE; - $adminTtsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id') ?: Application::DEFAULT_SPEECH_MODEL_ID; - $adminLlmModel = $this->openAiAPIService->isUsingOpenAi() - ? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID) + $adminLlmModel = $isUsingOpenAi + ? 'gpt-4o-audio-preview' : $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id'); - return [ + $defaults = [ 'voice' => $adminVoice, - 'tts_model' => $adminTtsModel, - 'speed' => 1, 'llm_model' => $adminLlmModel, ]; + if (!$isUsingOpenAi) { + $adminTtsModel = $this->appConfig->getValueString(Application::APP_ID, 'default_speech_model_id') ?: Application::DEFAULT_SPEECH_MODEL_ID; + $defaults['tts_model'] = $adminTtsModel; + $defaults['speed'] = 1; + } + return $defaults; } public function getOutputShapeEnumValues(): array { @@ -155,7 +167,10 @@ public function process(?string $userId, array $input, callable $reportProgress) if (isset($input['llm_model']) && is_string($input['llm_model'])) { $llmModel = $input['llm_model']; } else { - $llmModel = $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID; + $isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi(); + $llmModel = $isUsingOpenAi + ? 'gpt-4o-audio-preview' + : ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID); } @@ -187,7 +202,7 @@ public function process(?string $userId, array $input, callable $reportProgress) 'audio' => ['voice' => $outputVoice, 'format' => 'mp3'], ]; $completion = $this->openAiAPIService->createChatCompletion( - $userId, 'gpt-4o-audio-preview', null, $systemPrompt, $history, 1, 1000, + $userId, $llmModel, null, $systemPrompt, $history, 1, 1000, $extraParams, null, null, $b64Audio, ); $message = array_pop($completion['audio_messages']); From f00717ea9a39daef286028f802866fe0fe2164a7 Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Fri, 4 Jul 2025 14:40:04 +0200 Subject: [PATCH 07/11] use service name in logs instead of hardcoded value, fall back to app ID Signed-off-by: Julien Veyssier --- .../AudioToAudioChatProvider.php | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php index 8fbdc89f..88291474 100644 --- a/lib/TaskProcessing/AudioToAudioChatProvider.php +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -194,6 +194,8 @@ public function process(?string $userId, array $input, callable $reportProgress) $sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID; + $serviceName = $this->appConfig->getValueString(Application::APP_ID, 'service_name') ?: Application::APP_ID; + /////////////// Using the chat API if connected to OpenAI if ($this->openAiAPIService->isUsingOpenAi()) { $b64Audio = base64_encode($inputFile->getContent()); @@ -216,7 +218,7 @@ public function process(?string $userId, array $input, callable $reportProgress) $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); $result['input_transcript'] = $inputTranscription; } catch (Exception $e) { - $this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + $this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]); } return $result; @@ -227,8 +229,8 @@ public function process(?string $userId, array $input, callable $reportProgress) try { $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); } catch (Exception $e) { - $this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]); - throw new RuntimeException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage()); + $this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage()); } // free prompt @@ -236,10 +238,10 @@ public function process(?string $userId, array $input, callable $reportProgress) $completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000); $completion = $completion['messages']; } catch (Exception $e) { - throw new RuntimeException('OpenAI/LocalAI request failed: ' . $e->getMessage()); + throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage()); } if (count($completion) === 0) { - throw new RuntimeException('No completion in OpenAI/LocalAI response.'); + throw new RuntimeException('No completion in ' . $serviceName . ' response.'); } $llmResult = array_pop($completion); @@ -248,8 +250,8 @@ public function process(?string $userId, array $input, callable $reportProgress) $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed); if (!isset($apiResponse['body'])) { - $this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed: no speech returned'); - throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed: no speech returned'); + $this->logger->warning($serviceName . ' text to speech generation failed: no speech returned'); + throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned'); } return [ 'output' => $apiResponse['body'], @@ -257,8 +259,8 @@ public function process(?string $userId, array $input, callable $reportProgress) 'input_transcript' => $inputTranscription, ]; } catch (\Exception $e) { - $this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); - throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage()); + $this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage()); } } } From 47b9c867b088ab82b7cfe49d1d45071fb0781be8 Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Mon, 7 Jul 2025 11:58:08 +0200 Subject: [PATCH 08/11] remove fallback audio chat task type, register the provider only if the server task type is available, handle the case when the chat endpoint does not return audio Signed-off-by: Julien Veyssier --- lib/AppInfo/Application.php | 6 +- .../AudioToAudioChatProvider.php | 42 ++++++--- .../AudioToAudioChatTaskType.php | 87 ------------------- 3 files changed, 31 insertions(+), 104 deletions(-) delete mode 100644 lib/TaskProcessing/AudioToAudioChatTaskType.php diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php index 1a46168c..7ddd8eec 100644 --- a/lib/AppInfo/Application.php +++ b/lib/AppInfo/Application.php @@ -10,7 +10,6 @@ use OCA\OpenAi\Capabilities; use OCA\OpenAi\OldProcessing\Translation\TranslationProvider as OldTranslationProvider; use OCA\OpenAi\TaskProcessing\AudioToAudioChatProvider; -use OCA\OpenAi\TaskProcessing\AudioToAudioChatTaskType; use OCA\OpenAi\TaskProcessing\AudioToTextProvider; use OCA\OpenAi\TaskProcessing\ChangeToneProvider; use OCA\OpenAi\TaskProcessing\ChangeToneTaskType; @@ -144,8 +143,9 @@ public function register(IRegistrationContext $context): void { && $this->appConfig->getValueString(Application::APP_ID, 'tts_provider_enabled', '1') === '1' ) ) { - $context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class); - $context->registerTaskProcessingProvider(AudioToAudioChatProvider::class); + if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) { + $context->registerTaskProcessingProvider(AudioToAudioChatProvider::class); + } } $context->registerCapability(Capabilities::class); diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php index 88291474..cc36b5e0 100644 --- a/lib/TaskProcessing/AudioToAudioChatProvider.php +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -19,6 +19,7 @@ use OCP\TaskProcessing\ISynchronousProvider; use OCP\TaskProcessing\ShapeDescriptor; use OCP\TaskProcessing\ShapeEnumValue; +use OCP\TaskProcessing\TaskTypes\AudioToAudioChat; use Psr\Log\LoggerInterface; use RuntimeException; @@ -42,10 +43,7 @@ public function getName(): string { } public function getTaskTypeId(): string { - if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) { - return \OCP\TaskProcessing\TaskTypes\AudioToAudioChat::ID; - } - return AudioToAudioChatTaskType::ID; + return AudioToAudioChat::ID; } public function getExpectedRuntime(): int { @@ -129,13 +127,7 @@ public function getOutputShapeEnumValues(): array { } public function getOptionalOutputShape(): array { - return [ - 'input_transcript' => new ShapeDescriptor( - $this->l->t('Input transcript'), - $this->l->t('Transcription of the input audio'), - EShapeType::Text, - ), - ]; + return []; } public function getOptionalOutputShapeEnumValues(): array { @@ -203,14 +195,35 @@ public function process(?string $userId, array $input, callable $reportProgress) 'modalities' => ['text', 'audio'], 'audio' => ['voice' => $outputVoice, 'format' => 'mp3'], ]; + $systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.'; $completion = $this->openAiAPIService->createChatCompletion( $userId, $llmModel, null, $systemPrompt, $history, 1, 1000, $extraParams, null, null, $b64Audio, ); $message = array_pop($completion['audio_messages']); + // TODO find a way to force the model to answer with audio when there is only text in the history + // https://community.openai.com/t/gpt-4o-audio-preview-responds-in-text-not-audio/1006486/5 + if ($message === null) { + // no audio, TTS the text message + try { + $textResponse = array_pop($completion['messages']); + $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $textResponse, $ttsModel, $outputVoice, $speed); + if (!isset($apiResponse['body'])) { + $this->logger->warning($serviceName . ' text to speech generation failed: no speech returned'); + throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned'); + } + $output = $apiResponse['body']; + } catch (\Exception $e) { + $this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage()); + } + } else { + $output = base64_decode($message['audio']['data']); + $textResponse = $message['audio']['transcript']; + } $result = [ - 'output' => base64_decode($message['audio']['data']), - 'output_transcript' => $message['audio']['transcript'], + 'output' => $output, + 'output_transcript' => $textResponse, ]; // we still want the input transcription @@ -218,7 +231,8 @@ public function process(?string $userId, array $input, callable $reportProgress) $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); $result['input_transcript'] = $inputTranscription; } catch (Exception $e) { - $this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + $this->logger->warning($serviceName . ' audio input transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException($serviceName . ' audio input transcription failed with: ' . $e->getMessage()); } return $result; diff --git a/lib/TaskProcessing/AudioToAudioChatTaskType.php b/lib/TaskProcessing/AudioToAudioChatTaskType.php deleted file mode 100644 index f4d43849..00000000 --- a/lib/TaskProcessing/AudioToAudioChatTaskType.php +++ /dev/null @@ -1,87 +0,0 @@ -l->t('ioa Voice chat'); - } - - /** - * @inheritDoc - */ - public function getDescription(): string { - return $this->l->t('ioa Voice chat with the assistant'); - } - - /** - * @return string - */ - public function getId(): string { - return self::ID; - } - - /** - * @return ShapeDescriptor[] - */ - public function getInputShape(): array { - return [ - 'system_prompt' => new ShapeDescriptor( - $this->l->t('System prompt'), - $this->l->t('Define rules and assumptions that the assistant should follow during the conversation.'), - EShapeType::Text, - ), - 'input' => new ShapeDescriptor( - $this->l->t('Chat voice message'), - $this->l->t('Describe a task that you want the assistant to do or ask a question'), - EShapeType::Audio, - ), - 'history' => new ShapeDescriptor( - $this->l->t('Chat history'), - $this->l->t('The history of chat messages before the current message, starting with a message by the user'), - EShapeType::ListOfTexts, - ), - ]; - } - - /** - * @return ShapeDescriptor[] - */ - public function getOutputShape(): array { - return [ - 'output' => new ShapeDescriptor( - $this->l->t('Response voice message'), - $this->l->t('The generated response as part of the conversation'), - EShapeType::Audio - ), - 'output_transcript' => new ShapeDescriptor( - $this->l->t('Output transcript'), - $this->l->t('Response transcription'), - EShapeType::Text, - ), - ]; - } -} From 42c644c9e0a4980523d6b1389dbd496f4920dad3 Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Wed, 9 Jul 2025 15:28:45 +0200 Subject: [PATCH 09/11] isolate oneStep audio2audio processing in a provider method Signed-off-by: Julien Veyssier --- .../AudioToAudioChatProvider.php | 103 ++++++++++-------- 1 file changed, 56 insertions(+), 47 deletions(-) diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php index cc36b5e0..ca9929ef 100644 --- a/lib/TaskProcessing/AudioToAudioChatProvider.php +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -185,58 +185,15 @@ public function process(?string $userId, array $input, callable $reportProgress) } $sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID; - $serviceName = $this->appConfig->getValueString(Application::APP_ID, 'service_name') ?: Application::APP_ID; /////////////// Using the chat API if connected to OpenAI + // there is an issue if the history mostly contains text, the model will answer text even if we add the audio modality + /* if ($this->openAiAPIService->isUsingOpenAi()) { - $b64Audio = base64_encode($inputFile->getContent()); - $extraParams = [ - 'modalities' => ['text', 'audio'], - 'audio' => ['voice' => $outputVoice, 'format' => 'mp3'], - ]; - $systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.'; - $completion = $this->openAiAPIService->createChatCompletion( - $userId, $llmModel, null, $systemPrompt, $history, 1, 1000, - $extraParams, null, null, $b64Audio, - ); - $message = array_pop($completion['audio_messages']); - // TODO find a way to force the model to answer with audio when there is only text in the history - // https://community.openai.com/t/gpt-4o-audio-preview-responds-in-text-not-audio/1006486/5 - if ($message === null) { - // no audio, TTS the text message - try { - $textResponse = array_pop($completion['messages']); - $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $textResponse, $ttsModel, $outputVoice, $speed); - if (!isset($apiResponse['body'])) { - $this->logger->warning($serviceName . ' text to speech generation failed: no speech returned'); - throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned'); - } - $output = $apiResponse['body']; - } catch (\Exception $e) { - $this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); - throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage()); - } - } else { - $output = base64_decode($message['audio']['data']); - $textResponse = $message['audio']['transcript']; - } - $result = [ - 'output' => $output, - 'output_transcript' => $textResponse, - ]; - - // we still want the input transcription - try { - $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); - $result['input_transcript'] = $inputTranscription; - } catch (Exception $e) { - $this->logger->warning($serviceName . ' audio input transcription failed with: ' . $e->getMessage(), ['exception' => $e]); - throw new RuntimeException($serviceName . ' audio input transcription failed with: ' . $e->getMessage()); - } - - return $result; + return $this->oneStep($userId, $systemPrompt, $inputFile, $history, $outputVoice, $sttModel, $llmModel, $ttsModel, $speed, $serviceName); } + */ //////////////// 3 steps: STT -> LLM -> TTS // speech to text @@ -277,4 +234,56 @@ public function process(?string $userId, array $input, callable $reportProgress) throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage()); } } + + private function oneStep( + ?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice, + string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName + ): array { + $b64Audio = base64_encode($inputFile->getContent()); + $extraParams = [ + 'modalities' => ['text', 'audio'], + 'audio' => ['voice' => $outputVoice, 'format' => 'mp3'], + ]; + $systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.'; + $completion = $this->openAiAPIService->createChatCompletion( + $userId, $llmModel, null, $systemPrompt, $history, 1, 1000, + $extraParams, null, null, $b64Audio, + ); + $message = array_pop($completion['audio_messages']); + // TODO find a way to force the model to answer with audio when there is only text in the history + // https://community.openai.com/t/gpt-4o-audio-preview-responds-in-text-not-audio/1006486/5 + if ($message === null) { + // no audio, TTS the text message + try { + $textResponse = array_pop($completion['messages']); + $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $textResponse, $ttsModel, $outputVoice, $speed); + if (!isset($apiResponse['body'])) { + $this->logger->warning($serviceName . ' text to speech generation failed: no speech returned'); + throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned'); + } + $output = $apiResponse['body']; + } catch (\Exception $e) { + $this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage()); + } + } else { + $output = base64_decode($message['audio']['data']); + $textResponse = $message['audio']['transcript']; + } + $result = [ + 'output' => $output, + 'output_transcript' => $textResponse, + ]; + + // we still want the input transcription + try { + $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); + $result['input_transcript'] = $inputTranscription; + } catch (Exception $e) { + $this->logger->warning($serviceName . ' audio input transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException($serviceName . ' audio input transcription failed with: ' . $e->getMessage()); + } + + return $result; + } } From cd67b7fdeb4a13afc57764267dd044c59f9a5e5f Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Wed, 9 Jul 2025 16:26:11 +0200 Subject: [PATCH 10/11] try to return the remote audio ID when using the chat endpoint with a multimodal model Signed-off-by: Julien Veyssier --- .../AudioToAudioChatProvider.php | 107 ++++++++++-------- psalm.xml | 1 + 2 files changed, 61 insertions(+), 47 deletions(-) diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php index ca9929ef..afa54e1e 100644 --- a/lib/TaskProcessing/AudioToAudioChatProvider.php +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -127,7 +127,13 @@ public function getOutputShapeEnumValues(): array { } public function getOptionalOutputShape(): array { - return []; + return [ + 'audio_id' => new ShapeDescriptor( + $this->l->t('Remote audio ID'), + $this->l->t('The ID of the audio response returned by the remote service'), + EShapeType::Text + ), + ]; } public function getOptionalOutputShapeEnumValues(): array { @@ -187,58 +193,21 @@ public function process(?string $userId, array $input, callable $reportProgress) $sttModel = $this->appConfig->getValueString(Application::APP_ID, 'default_stt_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID; $serviceName = $this->appConfig->getValueString(Application::APP_ID, 'service_name') ?: Application::APP_ID; - /////////////// Using the chat API if connected to OpenAI + // Using the chat API if connected to OpenAI // there is an issue if the history mostly contains text, the model will answer text even if we add the audio modality - /* if ($this->openAiAPIService->isUsingOpenAi()) { return $this->oneStep($userId, $systemPrompt, $inputFile, $history, $outputVoice, $sttModel, $llmModel, $ttsModel, $speed, $serviceName); } - */ - - //////////////// 3 steps: STT -> LLM -> TTS - // speech to text - try { - $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); - } catch (Exception $e) { - $this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]); - throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage()); - } - - // free prompt - try { - $completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000); - $completion = $completion['messages']; - } catch (Exception $e) { - throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage()); - } - if (count($completion) === 0) { - throw new RuntimeException('No completion in ' . $serviceName . ' response.'); - } - $llmResult = array_pop($completion); - - // text to speech - try { - $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed); - if (!isset($apiResponse['body'])) { - $this->logger->warning($serviceName . ' text to speech generation failed: no speech returned'); - throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned'); - } - return [ - 'output' => $apiResponse['body'], - 'output_transcript' => $llmResult, - 'input_transcript' => $inputTranscription, - ]; - } catch (\Exception $e) { - $this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); - throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage()); - } + // 3 steps: STT -> LLM -> TTS + return $this->threeSteps($userId, $systemPrompt, $inputFile, $history, $outputVoice, $sttModel, $llmModel, $ttsModel, $speed, $serviceName); } private function oneStep( ?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice, - string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName + string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName, ): array { + $result = []; $b64Audio = base64_encode($inputFile->getContent()); $extraParams = [ 'modalities' => ['text', 'audio'], @@ -269,11 +238,12 @@ private function oneStep( } else { $output = base64_decode($message['audio']['data']); $textResponse = $message['audio']['transcript']; + if (isset($message['audio']['id'])) { + $result['audio_id'] = $message['audio']['id']; + } } - $result = [ - 'output' => $output, - 'output_transcript' => $textResponse, - ]; + $result['output'] = $output; + $result['output_transcript'] = $textResponse; // we still want the input transcription try { @@ -286,4 +256,47 @@ private function oneStep( return $result; } + + private function threeSteps( + ?string $userId, string $systemPrompt, File $inputFile, array $history, string $outputVoice, + string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName, + ): array { + // speech to text + try { + $inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel); + } catch (Exception $e) { + $this->logger->warning($serviceName . ' transcription failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException($serviceName . ' transcription failed with: ' . $e->getMessage()); + } + + // free prompt + try { + $completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000); + $completion = $completion['messages']; + } catch (Exception $e) { + throw new RuntimeException($serviceName . ' chat completion request failed: ' . $e->getMessage()); + } + if (count($completion) === 0) { + throw new RuntimeException('No completion in ' . $serviceName . ' response.'); + } + $llmResult = array_pop($completion); + + // text to speech + try { + $apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $outputVoice, $speed); + + if (!isset($apiResponse['body'])) { + $this->logger->warning($serviceName . ' text to speech generation failed: no speech returned'); + throw new RuntimeException($serviceName . ' text to speech generation failed: no speech returned'); + } + return [ + 'output' => $apiResponse['body'], + 'output_transcript' => $llmResult, + 'input_transcript' => $inputTranscription, + ]; + } catch (\Exception $e) { + $this->logger->warning($serviceName . ' text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]); + throw new RuntimeException($serviceName . ' text to speech generation failed with: ' . $e->getMessage()); + } + } } diff --git a/psalm.xml b/psalm.xml index 8ca4dcbc..128fcc86 100644 --- a/psalm.xml +++ b/psalm.xml @@ -39,6 +39,7 @@ + From 3ea9b777ebcb63adc6ad67c9c012d2db35874f9d Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Thu, 10 Jul 2025 12:09:04 +0200 Subject: [PATCH 11/11] return remote audio expiration date Signed-off-by: Julien Veyssier --- lib/TaskProcessing/AudioToAudioChatProvider.php | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php index afa54e1e..637843f5 100644 --- a/lib/TaskProcessing/AudioToAudioChatProvider.php +++ b/lib/TaskProcessing/AudioToAudioChatProvider.php @@ -133,6 +133,11 @@ public function getOptionalOutputShape(): array { $this->l->t('The ID of the audio response returned by the remote service'), EShapeType::Text ), + 'audio_expires_at' => new ShapeDescriptor( + $this->l->t('Remote audio expiration date'), + $this->l->t('The remote audio response stays available in the service until this date'), + EShapeType::Number + ), ]; } @@ -241,6 +246,9 @@ private function oneStep( if (isset($message['audio']['id'])) { $result['audio_id'] = $message['audio']['id']; } + if (isset($message['audio']['expires_at'])) { + $result['audio_expires_at'] = $message['audio']['expires_at']; + } } $result['output'] = $output; $result['output_transcript'] = $textResponse;