Skip to content

Commit d285a78

Browse files
committed
implement audio chat provider
Signed-off-by: Julien Veyssier <[email protected]>
1 parent 4f3b11f commit d285a78

File tree

4 files changed

+367
-3
lines changed

4 files changed

+367
-3
lines changed

lib/AppInfo/Application.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
use OCA\OpenAi\Capabilities;
1111
use OCA\OpenAi\OldProcessing\Translation\TranslationProvider as OldTranslationProvider;
12+
use OCA\OpenAi\TaskProcessing\AudioToAudioChatProvider;
13+
use OCA\OpenAi\TaskProcessing\AudioToAudioChatTaskType;
1214
use OCA\OpenAi\TaskProcessing\AudioToTextProvider;
1315
use OCA\OpenAi\TaskProcessing\ChangeToneProvider;
1416
use OCA\OpenAi\TaskProcessing\ChangeToneTaskType;
@@ -126,6 +128,8 @@ public function register(IRegistrationContext $context): void {
126128
if ($this->appConfig->getValueString(Application::APP_ID, 't2i_provider_enabled', '1') === '1') {
127129
$context->registerTaskProcessingProvider(TextToImageProvider::class);
128130
}
131+
$context->registerTaskProcessingTaskType(AudioToAudioChatTaskType::class);
132+
$context->registerTaskProcessingProvider(AudioToAudioChatProvider::class);
129133

130134
$context->registerCapability(Capabilities::class);
131135
}

lib/Service/OpenAiAPIService.php

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,8 @@ public function createCompletion(
418418
* @param array|null $extraParams
419419
* @param string|null $toolMessage JSON string with role, content, tool_call_id
420420
* @param array|null $tools
421-
* @return array{messages: array<string>, tool_calls: array<string>}
421+
* @param string|null $userAudioPromptBase64
422+
* @return array{messages: array<string>, tool_calls: array<string>, audio_messages: list<array<string, mixed>>}
422423
* @throws Exception
423424
*/
424425
public function createChatCompletion(
@@ -432,6 +433,7 @@ public function createChatCompletion(
432433
?array $extraParams = null,
433434
?string $toolMessage = null,
434435
?array $tools = null,
436+
?string $userAudioPromptBase64 = null,
435437
): array {
436438
if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TEXT)) {
437439
throw new Exception($this->l10n->t('Text generation quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
@@ -475,8 +477,24 @@ public function createChatCompletion(
475477
$messages[] = $message;
476478
}
477479
}
478-
if ($userPrompt !== null) {
479-
$messages[] = ['role' => 'user', 'content' => $userPrompt];
480+
if ($userPrompt !== null || $userAudioPromptBase64 !== null) {
481+
$message = ['role' => 'user', 'content' => []];
482+
if ($userPrompt !== null) {
483+
$message['content'][] = [
484+
'type' => 'text',
485+
'text' => $userPrompt,
486+
];
487+
}
488+
if ($userAudioPromptBase64 !== null) {
489+
$message['content'][] = [
490+
'type' => 'input_audio',
491+
'input_audio' => [
492+
'data' => $userAudioPromptBase64,
493+
'format' => 'mp3',
494+
],
495+
];
496+
}
497+
$messages[] = $message;
480498
}
481499
if ($toolMessage !== null) {
482500
$msgs = json_decode($toolMessage, true);
@@ -536,6 +554,7 @@ public function createChatCompletion(
536554
$completions = [
537555
'messages' => [],
538556
'tool_calls' => [],
557+
'audio_messages' => [],
539558
];
540559

541560
foreach ($response['choices'] as $choice) {
@@ -564,6 +583,9 @@ public function createChatCompletion(
564583
if (isset($choice['message']['content']) && is_string($choice['message']['content'])) {
565584
$completions['messages'][] = $choice['message']['content'];
566585
}
586+
if (isset($choice['message']['audio'], $choice['message']['audio']['data']) && is_string($choice['message']['audio']['data'])) {
587+
$completions['audio_messages'][] = $choice['message'];
588+
}
567589
}
568590

569591
return $completions;
lib/TaskProcessing/AudioToAudioChatProvider.php

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
/**
6+
* SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
7+
* SPDX-License-Identifier: AGPL-3.0-or-later
8+
*/
9+
10+
namespace OCA\OpenAi\TaskProcessing;
11+
12+
use Exception;
13+
use OCA\OpenAi\AppInfo\Application;
14+
use OCA\OpenAi\Service\OpenAiAPIService;
15+
use OCA\OpenAi\Service\OpenAiSettingsService;
16+
use OCP\Files\File;
17+
use OCP\IAppConfig;
18+
use OCP\IL10N;
19+
use OCP\TaskProcessing\EShapeType;
20+
use OCP\TaskProcessing\ISynchronousProvider;
21+
use OCP\TaskProcessing\ShapeDescriptor;
22+
use OCP\TaskProcessing\ShapeEnumValue;
23+
use Psr\Log\LoggerInterface;
24+
use RuntimeException;
25+
26+
/**
 * Synchronous task processing provider answering a spoken user message with a spoken reply.
 *
 * Two strategies are implemented:
 * - when the configured service is OpenAI, the audio is sent directly to the chat
 *   completion endpoint, which returns the reply audio and its transcript;
 * - otherwise the request is chained in three steps: speech-to-text, chat completion,
 *   text-to-speech.
 */
class AudioToAudioChatProvider implements ISynchronousProvider {

	/**
	 * Model requested when the audio is sent directly to the chat completion endpoint.
	 */
	private const AUDIO_CHAT_MODEL_ID = 'gpt-4o-audio-preview';

	public function __construct(
		private OpenAiAPIService $openAiAPIService,
		private OpenAiSettingsService $openAiSettingsService,
		private IL10N $l,
		private LoggerInterface $logger,
		private IAppConfig $appConfig,
		private ?string $userId,
	) {
	}

	/**
	 * @return string Provider identifier, prefixed with the app ID
	 */
	public function getId(): string {
		return Application::APP_ID . '-audio2audio:chat';
	}

	/**
	 * @return string Display name, taken from the API service
	 */
	public function getName(): string {
		return $this->openAiAPIService->getServiceName();
	}

	/**
	 * Use the server-provided task type when this Nextcloud version ships it,
	 * and fall back to the task type bundled with this app otherwise.
	 *
	 * @return string
	 */
	public function getTaskTypeId(): string {
		if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')) {
			return \OCP\TaskProcessing\TaskTypes\AudioToAudioChat::ID;
		}
		return AudioToAudioChatTaskType::ID;
	}

	/**
	 * @return int Expected runtime as reported by the API service for text processing
	 */
	public function getExpectedRuntime(): int {
		return $this->openAiAPIService->getExpTextProcessingTime();
	}

	/**
	 * @return array No enum values are declared for the mandatory inputs
	 */
	public function getInputShapeEnumValues(): array {
		return [];
	}

	/**
	 * @return array No defaults are declared for the mandatory inputs
	 */
	public function getInputShapeDefaults(): array {
		return [];
	}

	/**
	 * @return array<string, ShapeDescriptor> Optional inputs: llm_model, voice, tts_model, speed
	 */
	public function getOptionalInputShape(): array {
		$speedDescription = $this->openAiAPIService->isUsingOpenAi()
			? $this->l->t('Speech speed modifier (Valid values: 0.25-4)')
			: $this->l->t('Speech speed modifier');

		return [
			'llm_model' => new ShapeDescriptor(
				$this->l->t('Completion model'),
				$this->l->t('The model used to generate the completion'),
				EShapeType::Enum
			),
			'voice' => new ShapeDescriptor(
				$this->l->t('Voice'),
				$this->l->t('The voice to use'),
				EShapeType::Enum
			),
			'tts_model' => new ShapeDescriptor(
				$this->l->t('Text-to-speech model'),
				$this->l->t('The model used to generate the speech'),
				EShapeType::Enum
			),
			'speed' => new ShapeDescriptor(
				$this->l->t('Speed'),
				$speedDescription,
				EShapeType::Number
			),
		];
	}

	/**
	 * @return array<string, list<ShapeEnumValue>> Enum values for voice, llm_model and tts_model
	 */
	public function getOptionalInputShapeEnumValues(): array {
		// Decode as an associative array so a JSON object cannot reach array_map() as a stdClass;
		// anything that is not a non-empty array falls back to the built-in voice list.
		$voices = json_decode($this->appConfig->getValueString(Application::APP_ID, 'tts_voices'), true);
		if (!is_array($voices) || $voices === []) {
			$voices = Application::DEFAULT_SPEECH_VOICES;
		}
		// ShapeEnumValue is built from the voice name twice; keep only usable (string) names.
		$voices = array_values(array_filter($voices, static fn ($voice): bool => is_string($voice)));

		// NOTE(review): the same model list is offered for both the completion and the
		// text-to-speech model — confirm this is intended.
		$models = $this->openAiAPIService->getModelEnumValues($this->userId);

		return [
			'voice' => array_map(
				static fn (string $voice): ShapeEnumValue => new ShapeEnumValue($voice, $voice),
				$voices,
			),
			'llm_model' => $models,
			'tts_model' => $models,
		];
	}

	/**
	 * @return array<string, string|int> Defaults for voice, tts_model, speed and llm_model
	 */
	public function getOptionalInputShapeDefaults(): array {
		// Outside of OpenAI the configured completion model is used as is, even when empty.
		$adminLlmModel = $this->openAiAPIService->isUsingOpenAi()
			? $this->getConfiguredString('default_completion_model_id', Application::DEFAULT_MODEL_ID)
			: $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id');

		return [
			'voice' => $this->getConfiguredString('default_speech_voice', Application::DEFAULT_SPEECH_VOICE),
			'tts_model' => $this->getConfiguredString('default_speech_model_id', Application::DEFAULT_SPEECH_MODEL_ID),
			'speed' => 1,
			'llm_model' => $adminLlmModel,
		];
	}

	/**
	 * @return array No enum values are declared for the mandatory outputs
	 */
	public function getOutputShapeEnumValues(): array {
		return [];
	}

	/**
	 * @return array<string, ShapeDescriptor> Optional outputs: input_transcript, output_transcript
	 */
	public function getOptionalOutputShape(): array {
		return [
			'input_transcript' => new ShapeDescriptor(
				$this->l->t('Input transcript'),
				$this->l->t('Input transcription'),
				EShapeType::Text,
			),
			'output_transcript' => new ShapeDescriptor(
				$this->l->t('Output transcript'),
				$this->l->t('Response transcription'),
				EShapeType::Text,
			),
		];
	}

	/**
	 * @return array No enum values are declared for the optional outputs
	 */
	public function getOptionalOutputShapeEnumValues(): array {
		return [];
	}

	/**
	 * Validate the task input and produce the spoken answer.
	 *
	 * @param string|null $userId User on whose behalf the API requests are made
	 * @param array $input Must contain 'input' (readable File), 'system_prompt' (string) and
	 *                     'history' (array); may contain 'llm_model', 'tts_model', 'voice', 'speed'
	 * @param callable $reportProgress Progress callback (not used by this provider)
	 * @return array 'output' (audio content) plus, when available, 'output_transcript' and 'input_transcript'
	 * @throws RuntimeException If the input is invalid or any of the API requests fails
	 */
	public function process(?string $userId, array $input, callable $reportProgress): array {
		if (!isset($input['input']) || !$input['input'] instanceof File || !$input['input']->isReadable()) {
			throw new RuntimeException('Invalid input file');
		}
		$inputFile = $input['input'];

		if (!isset($input['system_prompt']) || !is_string($input['system_prompt'])) {
			throw new RuntimeException('Invalid system_prompt');
		}
		$systemPrompt = $input['system_prompt'];

		if (!isset($input['history']) || !is_array($input['history'])) {
			throw new RuntimeException('Invalid history');
		}
		$history = $input['history'];

		$voice = $this->getStringOption($input, 'voice', 'default_speech_voice', Application::DEFAULT_SPEECH_VOICE);
		// NOTE(review): the speech-to-text fallback is the generic DEFAULT_MODEL_ID — confirm
		// that no dedicated transcription default should be used here.
		$sttModel = $this->getConfiguredString('default_stt_model_id', Application::DEFAULT_MODEL_ID);

		if ($this->openAiAPIService->isUsingOpenAi()) {
			return $this->processWithAudioChat($userId, $inputFile, $systemPrompt, $history, $voice, $sttModel);
		}

		// The models and the speed are only needed by the three-step strategy.
		$ttsModel = $this->getStringOption($input, 'tts_model', 'default_speech_model_id', Application::DEFAULT_SPEECH_MODEL_ID);
		$llmModel = $this->getStringOption($input, 'llm_model', 'default_completion_model_id', Application::DEFAULT_MODEL_ID);
		$speed = $this->getSpeed($input);

		return $this->processInThreeSteps(
			$userId, $inputFile, $systemPrompt, $history, $sttModel, $llmModel, $ttsModel, $voice, $speed,
		);
	}

	/**
	 * Send the audio straight to the chat completion endpoint and return its spoken reply.
	 *
	 * @throws RuntimeException If the request fails or the response contains no audio
	 */
	private function processWithAudioChat(
		?string $userId,
		File $inputFile,
		string $systemPrompt,
		array $history,
		string $voice,
		string $sttModel,
	): array {
		try {
			$b64Audio = base64_encode($inputFile->getContent());
			$extraParams = [
				'modalities' => ['text', 'audio'],
				'audio' => ['voice' => $voice, 'format' => 'mp3'],
			];
			// No text prompt is passed: the user message is the audio itself (last argument).
			$completion = $this->openAiAPIService->createChatCompletion(
				$userId, self::AUDIO_CHAT_MODEL_ID, null, $systemPrompt, $history, 1, 1000,
				$extraParams, null, null, $b64Audio,
			);
		} catch (Exception $e) {
			$this->logger->warning('OpenAI/LocalAI request failed: ' . $e->getMessage(), ['exception' => $e]);
			throw new RuntimeException('OpenAI/LocalAI request failed: ' . $e->getMessage(), (int)$e->getCode(), $e);
		}

		// array_pop() yields null on an empty list, so the reply must be checked before use.
		$audioMessages = $completion['audio_messages'] ?? [];
		$message = is_array($audioMessages) ? array_pop($audioMessages) : null;
		if (!isset($message['audio']['data']) || !is_string($message['audio']['data'])) {
			throw new RuntimeException('No audio in OpenAI/LocalAI response.');
		}

		$result = [
			'output' => base64_decode($message['audio']['data']),
		];
		if (isset($message['audio']['transcript']) && is_string($message['audio']['transcript'])) {
			$result['output_transcript'] = $message['audio']['transcript'];
		}

		// The input transcription is still wanted, but it is optional: a failure here
		// must not discard the answer that was already generated.
		try {
			$result['input_transcript'] = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
		} catch (Exception $e) {
			$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
		}

		return $result;
	}

	/**
	 * Chain speech-to-text, chat completion and text-to-speech.
	 *
	 * @throws RuntimeException If any of the three steps fails or returns nothing
	 */
	private function processInThreeSteps(
		?string $userId,
		File $inputFile,
		string $systemPrompt,
		array $history,
		string $sttModel,
		string $llmModel,
		string $ttsModel,
		string $voice,
		float $speed,
	): array {
		// speech to text
		try {
			$inputTranscription = $this->openAiAPIService->transcribeFile($userId, $inputFile, false, $sttModel);
		} catch (Exception $e) {
			$this->logger->warning('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), ['exception' => $e]);
			throw new RuntimeException('OpenAI\'s Whisper transcription failed with: ' . $e->getMessage(), (int)$e->getCode(), $e);
		}

		// free prompt
		try {
			$completion = $this->openAiAPIService->createChatCompletion($userId, $llmModel, $inputTranscription, $systemPrompt, $history, 1, 1000);
		} catch (Exception $e) {
			throw new RuntimeException('OpenAI/LocalAI request failed: ' . $e->getMessage(), (int)$e->getCode(), $e);
		}
		$messages = $completion['messages'] ?? [];
		if (!is_array($messages) || count($messages) === 0) {
			throw new RuntimeException('No completion in OpenAI/LocalAI response.');
		}
		$llmResult = array_pop($messages);

		// text to speech
		try {
			$apiResponse = $this->openAiAPIService->requestSpeechCreation($userId, $llmResult, $ttsModel, $voice, $speed);
		} catch (Exception $e) {
			$this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage(), ['exception' => $e]);
			throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed with: ' . $e->getMessage(), (int)$e->getCode(), $e);
		}

		// Checked outside of the try block so this error is not caught and wrapped a second time.
		if (!isset($apiResponse['body'])) {
			$this->logger->warning('OpenAI/LocalAI\'s text to speech generation failed: no speech returned');
			throw new RuntimeException('OpenAI/LocalAI\'s text to speech generation failed: no speech returned');
		}

		return [
			'output' => $apiResponse['body'],
			'output_transcript' => $llmResult,
			'input_transcript' => $inputTranscription,
		];
	}

	/**
	 * Read an app config string, using $fallback when the stored value is missing or empty.
	 */
	private function getConfiguredString(string $configKey, string $fallback): string {
		return $this->appConfig->getValueString(Application::APP_ID, $configKey, $fallback) ?: $fallback;
	}

	/**
	 * Return the string given for $inputKey in the task input, or the configured default otherwise.
	 */
	private function getStringOption(array $input, string $inputKey, string $configKey, string $fallback): string {
		if (isset($input[$inputKey]) && is_string($input[$inputKey])) {
			return $input[$inputKey];
		}
		return $this->getConfiguredString($configKey, $fallback);
	}

	/**
	 * Return the requested speech speed (1 when absent or not numeric).
	 *
	 * The value is limited to the 0.25-4 range when the service is OpenAI, matching the
	 * range advertised in the input shape description.
	 */
	private function getSpeed(array $input): float {
		if (!isset($input['speed']) || !is_numeric($input['speed'])) {
			return 1.0;
		}
		// Cast so that numeric strings are not forwarded as strings.
		$speed = (float)$input['speed'];
		if ($this->openAiAPIService->isUsingOpenAi()) {
			return max(0.25, min(4.0, $speed));
		}
		return $speed;
	}
}

0 commit comments

Comments
 (0)