Skip to content

feat(Text2Speech): Add support for text to speech #210

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Jun 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
- John Molakvoæ <[email protected]>
- Juergen Kellerer <[email protected]>
- Julien Veyssier <[email protected]>
- Lukas Schaefer <[email protected]>
- Marcel Klehr <[email protected]>
- Sami Finnilä <[email protected]>
- Micke Nordin <[email protected]>
- rakekniven <[email protected]>
- Richard Steinmetz <[email protected]>
- Sami Finnilä <[email protected]>
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ Positive:
Negative:
* The training data is not freely available, limiting the ability of external parties to check and correct for bias or optimise the model’s performance and CO2 usage.

### Rating for Text-To-Speech via the OpenAI API: 🔴

Negative:
* The software for training and inference of this model is proprietary, which prevents running it locally or training it yourself
* The trained model is not freely available, so the model cannot be run on-premises
* The training data is not freely available, limiting the ability of external parties to check and correct for bias or optimise the model’s performance and CO2 usage.


### Rating for Text generation via LocalAI: 🟢

Positive:
Expand Down
7 changes: 7 additions & 0 deletions appinfo/info.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ Positive:
Negative:
* The training data is not freely available, limiting the ability of external parties to check and correct for bias or optimise the model’s performance and CO2 usage.

### Rating for Text-To-Speech via the OpenAI API: 🔴

Negative:
* The software for training and inference of this model is proprietary, which prevents running it locally or training it yourself
* The trained model is not freely available, so the model cannot be run on-premises
* The training data is not freely available, limiting the ability of external parties to check and correct for bias or optimise the model’s performance and CO2 usage.

### Rating for Text generation via LocalAI: 🟢

Positive:
Expand Down
8 changes: 4 additions & 4 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use OCA\OpenAi\TaskProcessing\ReformulateProvider;
use OCA\OpenAi\TaskProcessing\SummaryProvider;
use OCA\OpenAi\TaskProcessing\TextToImageProvider;
use OCA\OpenAi\TaskProcessing\TextToSpeechProvider;
use OCA\OpenAi\TaskProcessing\TextToTextChatProvider;
use OCA\OpenAi\TaskProcessing\TextToTextProvider;
use OCA\OpenAi\TaskProcessing\TopicsProvider;
Expand All @@ -40,6 +41,12 @@ class Application extends App implements IBootstrap {
public const DEFAULT_COMPLETION_MODEL_ID = 'gpt-3.5-turbo';
public const DEFAULT_IMAGE_MODEL_ID = 'dall-e-2';
public const DEFAULT_TRANSCRIPTION_MODEL_ID = 'whisper-1';
public const DEFAULT_SPEECH_MODEL_ID = 'tts-1-hd';
public const DEFAULT_SPEECH_VOICE = 'alloy';
public const DEFAULT_SPEECH_VOICES = [
'alloy', 'ash', 'ballad', 'coral', 'echo', 'fable',
'onyx', 'nova', 'sage', 'shimmer', 'verse'
];
public const DEFAULT_DEFAULT_IMAGE_SIZE = '1024x1024';
public const MAX_GENERATION_IDLE_TIME = 60 * 60 * 24 * 10;
public const DEFAULT_CHUNK_SIZE = 10000;
Expand All @@ -56,11 +63,13 @@ class Application extends App implements IBootstrap {
public const QUOTA_TYPE_TEXT = 0;
public const QUOTA_TYPE_IMAGE = 1;
public const QUOTA_TYPE_TRANSCRIPTION = 2;
public const QUOTA_TYPE_SPEECH = 3;

public const DEFAULT_QUOTAS = [
self::QUOTA_TYPE_TEXT => 0, // 0 = unlimited
self::QUOTA_TYPE_IMAGE => 0, // 0 = unlimited
self::QUOTA_TYPE_TRANSCRIPTION => 0, // 0 = unlimited
self::QUOTA_TYPE_SPEECH => 0, // 0 = unlimited

];

Expand Down Expand Up @@ -110,6 +119,10 @@ public function register(IRegistrationContext $context): void {
$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\ProofreadProvider::class);
}
}
if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToSpeech')) {
$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\TextToSpeechTaskType::class);
}
$context->registerTaskProcessingProvider(TextToSpeechProvider::class);
if ($this->appConfig->getValueString(Application::APP_ID, 't2i_provider_enabled', '1') === '1') {
$context->registerTaskProcessingProvider(TextToImageProvider::class);
}
Expand Down
58 changes: 53 additions & 5 deletions lib/Service/OpenAiAPIService.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
use OCP\TaskProcessing\ShapeEnumValue;
use Psr\Log\LoggerInterface;
use RuntimeException;
use Throwable;
use function json_encode;

/**
* Service to make requests to OpenAI/LocalAI REST API
Expand Down Expand Up @@ -132,7 +134,7 @@ public function getModels(string $userId): array {
throw $e;
}
if (isset($modelsResponse['error'])) {
$this->logger->warning('Error retrieving models: ' . \json_encode($modelsResponse));
$this->logger->warning('Error retrieving models: ' . json_encode($modelsResponse));
$this->areCredsValid = false;
throw new Exception($modelsResponse['error'], Http::STATUS_INTERNAL_SERVER_ERROR);
}
Expand All @@ -142,7 +144,7 @@ public function getModels(string $userId): array {
}

if (!$this->isModelListValid($modelsResponse['data'])) {
$this->logger->warning('Invalid models response: ' . \json_encode($modelsResponse));
$this->logger->warning('Invalid models response: ' . json_encode($modelsResponse));
$this->areCredsValid = false;
throw new Exception($this->l10n->t('Invalid models response received'), Http::STATUS_INTERNAL_SERVER_ERROR);
}
Expand Down Expand Up @@ -185,7 +187,7 @@ public function getModelEnumValues(?string $userId): array {
array_unshift($modelEnumValues, new ShapeEnumValue($this->l10n->t('Default'), 'Default'));
}
return $modelEnumValues;
} catch (\Throwable $e) {
} catch (Throwable $e) {
// avoid flooding the logs with errors from calls of task processing
$this->logger->info('Error getting model enum values', ['exception' => $e]);
return [];
Expand Down Expand Up @@ -248,6 +250,8 @@ public function translatedQuotaType(int $type): string {
return $this->l10n->t('Image generation');
case Application::QUOTA_TYPE_TRANSCRIPTION:
return $this->l10n->t('Audio transcription');
case Application::QUOTA_TYPE_SPEECH:
return $this->l10n->t('Text to speech');
default:
return $this->l10n->t('Unknown');
}
Expand All @@ -266,6 +270,8 @@ public function translatedQuotaUnit(int $type): string {
return $this->l10n->t('images');
case Application::QUOTA_TYPE_TRANSCRIPTION:
return $this->l10n->t('seconds');
case Application::QUOTA_TYPE_SPEECH:
return $this->l10n->t('characters');
default:
return $this->l10n->t('Unknown');
}
Expand Down Expand Up @@ -742,6 +748,41 @@ public function getImageRequestOptions(?string $userId): array {
return $requestOptions;
}

/**
 * Request speech synthesis for a text prompt from the `audio/speech` endpoint.
 *
 * The value Application::DEFAULT_MODEL_ID acts as a placeholder for both $model and
 * $voice: it is replaced by Application::DEFAULT_SPEECH_MODEL_ID and
 * Application::DEFAULT_SPEECH_VOICE respectively. Audio is always requested as MP3.
 *
 * Quota usage is counted in characters of the prompt and is only recorded when the
 * response does not carry an 'error' entry, so failed requests are not charged
 * against the user's speech quota.
 *
 * @param string|null $userId User the request is made for; null is recorded as '' in the quota usage
 * @param string $prompt Text to convert to speech
 * @param string $model Model ID, or Application::DEFAULT_MODEL_ID to use the default speech model
 * @param string $voice Voice name, or Application::DEFAULT_MODEL_ID to use the default voice
 * @param float $speed Speed value forwarded unchanged to the API
 * @return array The array returned by request(), unchanged; it may contain an 'error' entry
 * @throws Exception If the speech quota is exceeded (anything thrown by request() propagates as well)
 */
public function requestSpeechCreation(
	?string $userId, string $prompt, string $model, string $voice, float $speed = 1,
): array {
	if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_SPEECH)) {
		throw new Exception($this->l10n->t('Speech generation quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
	}

	$params = [
		'input' => $prompt,
		'voice' => $voice === Application::DEFAULT_MODEL_ID ? Application::DEFAULT_SPEECH_VOICE : $voice,
		'model' => $model === Application::DEFAULT_MODEL_ID ? Application::DEFAULT_SPEECH_MODEL_ID : $model,
		'response_format' => 'mp3',
		'speed' => $speed,
	];

	$apiResponse = $this->request($userId, 'audio/speech', $params, 'POST');

	// request() reports failures as an array with an 'error' key; no audio was
	// produced in that case, so do not charge the prompt against the quota.
	if (isset($apiResponse['error'])) {
		return $apiResponse;
	}

	try {
		// Usage is measured in characters (multibyte-aware), matching the quota unit.
		$charCount = mb_strlen($prompt);
		$this->quotaUsageMapper->createQuotaUsage($userId ?? '', Application::QUOTA_TYPE_SPEECH, $charCount);
	} catch (DBException $e) {
		// Best effort: failing to record usage must not discard the generated audio.
		$this->logger->warning('Could not create quota usage for user: ' . $userId . ' and quota type: ' . Application::QUOTA_TYPE_SPEECH . '. Error: ' . $e->getMessage());
	}
	return $apiResponse;
}

/**
* @return int
*/
Expand Down Expand Up @@ -893,9 +934,16 @@ public function request(?string $userId, string $endPoint, array $params = [], s

if ($respCode >= 400) {
return ['error' => $this->l10n->t('Bad credentials')];
} else {
return json_decode($body, true) ?: [];
}
if ($response->getHeader('Content-Type') === 'application/json') {
$parsedBody = json_decode($body, true);
if ($parsedBody === null) {
$this->logger->warning('Could not JSON parse the response', ['body' => $body]);
return ['error' => 'Could not JSON parse the response'];
}
return $parsedBody;
}
return ['body' => $body];
} catch (ClientException|ServerException $e) {
$responseBody = $e->getResponse()->getBody();
$parsedResponseBody = json_decode($responseBody, true);
Expand Down
Loading
Loading