Skip to content

Feat: Add Analyze Image Task Type #226

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jul 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ public function register(IRegistrationContext $context): void {
if (class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToTextProofread')) {
$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\ProofreadProvider::class);
}
if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\AnalyzeImages')) {
$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\AnalyzeImagesTaskType::class);
}
$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\AnalyzeImagesProvider::class);
}
if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToSpeech')) {
$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\TextToSpeechTaskType::class);
Expand Down
193 changes: 193 additions & 0 deletions lib/TaskProcessing/AnalyzeImagesProvider.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
<?php

declare(strict_types=1);

/**
* SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

namespace OCA\OpenAi\TaskProcessing;

use OCA\OpenAi\AppInfo\Application;
use OCA\OpenAi\Service\OpenAiAPIService;
use OCA\OpenAi\Service\OpenAiSettingsService;
use OCP\Files\File;
use OCP\IAppConfig;
use OCP\IL10N;
use OCP\TaskProcessing\EShapeType;
use OCP\TaskProcessing\ISynchronousProvider;
use OCP\TaskProcessing\ShapeDescriptor;
use Psr\Log\LoggerInterface;
use RuntimeException;

/**
 * Synchronous task processing provider that answers a text question about a
 * list of images by sending them to the configured chat completion endpoint.
 *
 * Each image is embedded as a base64 `data:` URL inside its own user message
 * and passed as chat history alongside the user's question.
 */
class AnalyzeImagesProvider implements ISynchronousProvider {

	/**
	 * Maximum number of images per task. This is OpenAI's documented limit and is enforced for all backends.
	 * @see https://platform.openai.com/docs/guides/images-vision?api-mode=responses&format=url#image-input-requirements
	 */
	private const MAX_IMAGE_COUNT = 500;

	/**
	 * Maximum cumulative size (in bytes) of all images of a task. This is OpenAI's documented limit (50MB)
	 * and is enforced for all backends.
	 * @see https://platform.openai.com/docs/guides/images-vision?api-mode=responses&format=url#image-input-requirements
	 */
	private const MAX_TOTAL_IMAGE_BYTES = 50 * 1000 * 1000;

	/** MIME types accepted when the OpenAI service itself is used. */
	private const OPENAI_IMAGE_MIME_TYPES = [
		'image/jpeg',
		'image/png',
		'image/gif',
		'image/webp',
	];

	public function __construct(
		private OpenAiAPIService $openAiAPIService,
		private OpenAiSettingsService $openAiSettingsService,
		private IL10N $l,
		private LoggerInterface $logger,
		private IAppConfig $appConfig,
		private ?string $userId,
	) {
	}

	/**
	 * @return string The provider id: the app id suffixed with "-analyze-images"
	 */
	public function getId(): string {
		return Application::APP_ID . '-analyze-images';
	}

	/**
	 * @return string The service name reported by the API service
	 */
	public function getName(): string {
		return $this->openAiAPIService->getServiceName();
	}

	/**
	 * Use the task type shipped by the server when it exists, otherwise fall
	 * back to the task type bundled with this app.
	 *
	 * @return string The id of the task type this provider implements
	 */
	public function getTaskTypeId(): string {
		if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AnalyzeImages')) {
			return \OCP\TaskProcessing\TaskTypes\AnalyzeImages::ID;
		}
		return AnalyzeImagesTaskType::ID;
	}

	/**
	 * @return int The expected runtime; the text processing estimate of the API service is reused
	 */
	public function getExpectedRuntime(): int {
		return $this->openAiAPIService->getExpTextProcessingTime();
	}

	/**
	 * @return array No enum values for the mandatory input shape
	 */
	public function getInputShapeEnumValues(): array {
		return [];
	}

	/**
	 * @return array No defaults for the mandatory input shape
	 */
	public function getInputShapeDefaults(): array {
		return [];
	}

	/**
	 * @return ShapeDescriptor[] The optional inputs: "max_tokens" (number) and "model" (enum)
	 */
	public function getOptionalInputShape(): array {
		return [
			'max_tokens' => new ShapeDescriptor(
				$this->l->t('Maximum output words'),
				$this->l->t('The maximum number of words/tokens that can be generated in the output.'),
				EShapeType::Number
			),
			'model' => new ShapeDescriptor(
				$this->l->t('Model'),
				$this->l->t('The model used to generate the output'),
				EShapeType::Enum
			),
		];
	}

	/**
	 * @return array The selectable values for the "model" optional input, for the current user
	 */
	public function getOptionalInputShapeEnumValues(): array {
		return [
			'model' => $this->openAiAPIService->getModelEnumValues($this->userId),
		];
	}

	/**
	 * @return array Default values for the optional inputs
	 */
	public function getOptionalInputShapeDefaults(): array {
		// With OpenAI an empty configured value falls back to the app's default model id;
		// with other backends the configured value is used as is (possibly empty).
		$adminModel = $this->openAiAPIService->isUsingOpenAi()
			? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID)
			: $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id');
		return [
			'max_tokens' => 1000,
			'model' => $adminModel,
		];
	}

	/**
	 * @return array No enum values for the output shape
	 */
	public function getOutputShapeEnumValues(): array {
		return [];
	}

	/**
	 * @return array No optional outputs
	 */
	public function getOptionalOutputShape(): array {
		return [];
	}

	/**
	 * @return array No enum values for the optional output shape
	 */
	public function getOptionalOutputShapeEnumValues(): array {
		return [];
	}

	/**
	 * Answer the question in $input['input'] based on the images in $input['images'].
	 *
	 * @param string|null $userId The user the chat completion is requested for
	 * @param array $input Expects "images" (list of readable File objects) and "input" (string);
	 *                     optionally "model" (string) and "max_tokens" (int)
	 * @param callable $reportProgress Not used by this provider
	 * @return array An array with the generated answer under the "output" key
	 * @throws RuntimeException On invalid input, on an unreadable image or when the completion request fails
	 */
	public function process(?string $userId, array $input, callable $reportProgress): array {
		$isUsingOpenAi = $this->openAiAPIService->isUsingOpenAi();

		if (!$isUsingOpenAi && !$this->openAiSettingsService->getChatEndpointEnabled()) {
			throw new RuntimeException('Must support chat completion endpoint');
		}

		if (!isset($input['images']) || !is_array($input['images'])) {
			throw new RuntimeException('Invalid file list');
		}
		if (count($input['images']) > self::MAX_IMAGE_COUNT) {
			throw new RuntimeException('Too many files given. Max is 500');
		}

		$history = [];
		$fileSize = 0;
		foreach ($input['images'] as $image) {
			if (!$image instanceof File || !$image->isReadable()) {
				throw new RuntimeException('Invalid input file');
			}
			// The limit applies to the cumulative size of all images seen so far
			$fileSize += intval($image->getSize());
			if ($fileSize > self::MAX_TOTAL_IMAGE_BYTES) {
				throw new RuntimeException('Filesize of input files too large. Max is 50MB');
			}

			// Validate the type before reading so rejected files are never loaded into memory
			$fileType = $image->getMimeType();
			if (!str_starts_with($fileType, 'image/')) {
				throw new RuntimeException('Invalid input file type ' . $fileType);
			}
			if ($isUsingOpenAi && !in_array($fileType, self::OPENAI_IMAGE_MIME_TYPES, true)) {
				throw new RuntimeException('Invalid input file type for OpenAI ' . $fileType);
			}

			// Each image becomes its own JSON-encoded user message carrying a data URL
			$history[] = json_encode([
				'role' => 'user',
				'content' => [
					[
						'type' => 'image_url',
						'image_url' => [
							'url' => 'data:' . $fileType . ';base64,' . $this->readImageAsBase64($image),
						],
					],
				],
			]);
		}

		if (!isset($input['input']) || !is_string($input['input'])) {
			throw new RuntimeException('Invalid prompt');
		}
		$prompt = $input['input'];

		if (isset($input['model']) && is_string($input['model'])) {
			$model = $input['model'];
		} else {
			$model = $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_COMPLETION_MODEL_ID) ?: Application::DEFAULT_COMPLETION_MODEL_ID;
		}

		// Only integer values are forwarded; anything else leaves the limit unset
		$maxTokens = null;
		if (isset($input['max_tokens']) && is_int($input['max_tokens'])) {
			$maxTokens = $input['max_tokens'];
		}

		try {
			$systemPrompt = 'Take the user\'s question and answer it based on the provided images. Ensure that the answer matches the language of the user\'s question.';
			$completion = $this->openAiAPIService->createChatCompletion($userId, $model, $prompt, $systemPrompt, $history, 1, $maxTokens);
			$completion = $completion['messages'];

			if (count($completion) > 0) {
				return ['output' => array_pop($completion)];
			}

			throw new RuntimeException('No result in OpenAI/LocalAI response.');
		} catch (\Exception $e) {
			$this->logger->warning('OpenAI/LocalAI\'s image question generation failed with: ' . $e->getMessage(), ['exception' => $e]);
			// Chain the original exception so its type and stack trace stay available to callers
			throw new RuntimeException('OpenAI/LocalAI\'s image question generation failed with: ' . $e->getMessage(), 0, $e);
		}
	}

	/**
	 * Read the whole content of an image and return it base64-encoded.
	 * The stream is always closed, even when reading fails.
	 *
	 * @param File $image The image to read
	 * @return string The base64-encoded file content
	 * @throws RuntimeException If the file cannot be opened or read
	 */
	private function readImageAsBase64(File $image): string {
		$handle = $image->fopen('rb');
		if (!is_resource($handle)) {
			throw new RuntimeException('Could not open input file');
		}
		try {
			$content = stream_get_contents($handle);
		} finally {
			fclose($handle);
		}
		if ($content === false) {
			throw new RuntimeException('Could not read input file');
		}
		return base64_encode($content);
	}
}
77 changes: 77 additions & 0 deletions lib/TaskProcessing/AnalyzeImagesTaskType.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
<?php

declare(strict_types=1);

/**
* SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

namespace OCA\OpenAi\TaskProcessing;

use OCA\OpenAi\AppInfo\Application;
use OCP\IL10N;
use OCP\TaskProcessing\EShapeType;
use OCP\TaskProcessing\ITaskType;
use OCP\TaskProcessing\ShapeDescriptor;

/**
 * Task type for asking a question about a list of images.
 *
 * Inputs are the images and the question text; the output is a single text answer.
 */
class AnalyzeImagesTaskType implements ITaskType {
	public const ID = Application::APP_ID . ':analyze-images';

	public function __construct(
		private IL10N $l,
	) {
	}

	/**
	 * @return string The translated, human-readable name of this task type
	 */
	public function getName(): string {
		$name = $this->l->t('Analyze images');
		return $name;
	}

	/**
	 * @return string The translated description of what this task type does
	 */
	public function getDescription(): string {
		$description = $this->l->t('Ask a question about the given images.');
		return $description;
	}

	/**
	 * @return string The unique id of this task type
	 */
	public function getId(): string {
		return self::ID;
	}

	/**
	 * Describe the mandatory inputs: the images and the question about them.
	 *
	 * @return ShapeDescriptor[] Descriptors keyed by input name ("images", "input")
	 */
	public function getInputShape(): array {
		$imagesDescriptor = new ShapeDescriptor(
			$this->l->t('Images'),
			$this->l->t('Images to ask a question about'),
			EShapeType::ListOfImages,
		);
		$questionDescriptor = new ShapeDescriptor(
			$this->l->t('Question'),
			$this->l->t('What to ask about the image.'),
			EShapeType::Text,
		);

		$shape = [];
		$shape['images'] = $imagesDescriptor;
		$shape['input'] = $questionDescriptor;
		return $shape;
	}

	/**
	 * Describe the single output: the generated answer text.
	 *
	 * @return ShapeDescriptor[] Descriptors keyed by output name ("output")
	 */
	public function getOutputShape(): array {
		$answerDescriptor = new ShapeDescriptor(
			$this->l->t('Generated response'),
			$this->l->t('The answer to the question'),
			EShapeType::Text,
		);

		return ['output' => $answerDescriptor];
	}
}
Loading