Merge pull request #226 from nextcloud/picture

lukasdotcom · web-flow · commit b717552adf42 · 2025-07-08T11:11:26.000-04:00
Feat: Add Analyze Image Task Type
diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
@@ -118,6 +118,10 @@ public function register(IRegistrationContext $context): void {
 			if (class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToTextProofread')) {
 				$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\ProofreadProvider::class);
 			}
+			if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\AnalyzeImages')) {
+				$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\AnalyzeImagesTaskType::class);
+			}
+			$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\AnalyzeImagesProvider::class);
 		}
 		if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToSpeech')) {
 			$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\TextToSpeechTaskType::class);
diff --git a/lib/TaskProcessing/AnalyzeImagesProvider.php b/lib/TaskProcessing/AnalyzeImagesProvider.php
@@ -0,0 +1,193 @@
+<?php
+
+declare(strict_types=1);
+
+/**
+ * SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+namespace OCA\OpenAi\TaskProcessing;
+
+use OCA\OpenAi\AppInfo\Application;
+use OCA\OpenAi\Service\OpenAiAPIService;
+use OCA\OpenAi\Service\OpenAiSettingsService;
+use OCP\Files\File;
+use OCP\IAppConfig;
+use OCP\IL10N;
+use OCP\TaskProcessing\EShapeType;
+use OCP\TaskProcessing\ISynchronousProvider;
+use OCP\TaskProcessing\ShapeDescriptor;
+use Psr\Log\LoggerInterface;
+use RuntimeException;
+
+/**
+ * Synchronous task processing provider that answers a text question about a
+ * list of images by forwarding both to the chat completion API.
+ */
+class AnalyzeImagesProvider implements ISynchronousProvider {
+
+	/**
+	 * Maximum number of images accepted per task. This is OpenAI's documented
+	 * limit; it is enforced for every backend.
+	 * @see https://platform.openai.com/docs/guides/images-vision?api-mode=responses&format=url#image-input-requirements
+	 */
+	private const MAX_IMAGE_COUNT = 500;
+
+	/**
+	 * Maximum combined size (in bytes) of all images of one task. This is
+	 * OpenAI's documented 50MB limit; it is enforced for every backend.
+	 * @see https://platform.openai.com/docs/guides/images-vision?api-mode=responses&format=url#image-input-requirements
+	 */
+	private const MAX_TOTAL_IMAGE_BYTES = 50 * 1000 * 1000;
+
+	/** MIME types accepted when the OpenAI service itself is the backend. */
+	private const OPENAI_IMAGE_MIME_TYPES = [
+		'image/jpeg',
+		'image/png',
+		'image/gif',
+		'image/webp',
+	];
+
+	public function __construct(
+		private OpenAiAPIService $openAiAPIService,
+		private OpenAiSettingsService $openAiSettingsService,
+		private IL10N $l,
+		private LoggerInterface $logger,
+		private IAppConfig $appConfig,
+		private ?string $userId,
+	) {
+	}
+
+	/**
+	 * @return string The provider ID: the app ID suffixed with "-analyze-images"
+	 */
+	public function getId(): string {
+		return Application::APP_ID . '-analyze-images';
+	}
+
+	/**
+	 * @return string The service name reported by the API service
+	 */
+	public function getName(): string {
+		return $this->openAiAPIService->getServiceName();
+	}
+
+	/**
+	 * Prefer the task type shipped by the server when that class exists,
+	 * otherwise fall back to the task type bundled with this app.
+	 *
+	 * @return string
+	 */
+	public function getTaskTypeId(): string {
+		if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AnalyzeImages')) {
+			return \OCP\TaskProcessing\TaskTypes\AnalyzeImages::ID;
+		}
+		return AnalyzeImagesTaskType::ID;
+	}
+
+	/**
+	 * @return int Delegates to OpenAiAPIService::getExpTextProcessingTime()
+	 */
+	public function getExpectedRuntime(): int {
+		return $this->openAiAPIService->getExpTextProcessingTime();
+	}
+
+	/**
+	 * @return array Always empty: no enum values are declared for the mandatory inputs
+	 */
+	public function getInputShapeEnumValues(): array {
+		return [];
+	}
+
+	/**
+	 * @return array Always empty: no defaults are declared for the mandatory inputs
+	 */
+	public function getInputShapeDefaults(): array {
+		return [];
+	}
+
+	/**
+	 * Optional inputs: an output size limit ("max_tokens") and the model to use ("model").
+	 *
+	 * @return ShapeDescriptor[]
+	 */
+	public function getOptionalInputShape(): array {
+		return [
+			'max_tokens' => new ShapeDescriptor(
+				$this->l->t('Maximum output words'),
+				$this->l->t('The maximum number of words/tokens that can be generated in the output.'),
+				EShapeType::Number
+			),
+			'model' => new ShapeDescriptor(
+				$this->l->t('Model'),
+				$this->l->t('The model used to generate the output'),
+				EShapeType::Enum
+			),
+		];
+	}
+
+	/**
+	 * @return array Enum values for "model", as returned by the API service for the injected user
+	 */
+	public function getOptionalInputShapeEnumValues(): array {
+		return [
+			'model' => $this->openAiAPIService->getModelEnumValues($this->userId),
+		];
+	}
+
+	/**
+	 * Defaults for the optional inputs.
+	 *
+	 * When OpenAI is the backend, the configured completion model is used and an
+	 * unset or empty value falls back to Application::DEFAULT_MODEL_ID. For other
+	 * backends the configured value is returned without a fallback.
+	 *
+	 * @return array
+	 */
+	public function getOptionalInputShapeDefaults(): array {
+		$adminModel = $this->openAiAPIService->isUsingOpenAi()
+			? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID)
+			: $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id');
+		return [
+			'max_tokens' => 1000,
+			'model' => $adminModel,
+		];
+	}
+
+	/**
+	 * @return array Always empty
+	 */
+	public function getOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	/**
+	 * @return array Always empty: this provider declares no optional outputs
+	 */
+	public function getOptionalOutputShape(): array {
+		return [];
+	}
+
+	/**
+	 * @return array Always empty
+	 */
+	public function getOptionalOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	/**
+	 * Run the task: validate the images and the question, then request a chat completion.
+	 *
+	 * @param string|null $userId User on whose behalf the completion is requested
+	 * @param array $input Reads "images" (list of File), "input" (string) and,
+	 *                     optionally, "model" (string) and "max_tokens" (int)
+	 * @param callable $reportProgress Not used by this provider
+	 * @return array An array with a single "output" key holding the last returned message
+	 * @throws RuntimeException If the input is invalid or the completion request fails
+	 */
+	public function process(?string $userId, array $input, callable $reportProgress): array {
+		$usingOpenAi = $this->openAiAPIService->isUsingOpenAi();
+		if (!$usingOpenAi && !$this->openAiSettingsService->getChatEndpointEnabled()) {
+			throw new RuntimeException('Must support chat completion endpoint');
+		}
+
+		if (!isset($input['images']) || !is_array($input['images'])) {
+			throw new RuntimeException('Invalid file list');
+		}
+		if (count($input['images']) > self::MAX_IMAGE_COUNT) {
+			throw new RuntimeException('Too many files given. Max is 500');
+		}
+		$history = $this->buildImageHistory($input['images'], $usingOpenAi);
+
+		if (!isset($input['input']) || !is_string($input['input'])) {
+			throw new RuntimeException('Invalid prompt');
+		}
+		$prompt = $input['input'];
+
+		if (isset($input['model']) && is_string($input['model'])) {
+			$model = $input['model'];
+		} else {
+			$model = $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_COMPLETION_MODEL_ID) ?: Application::DEFAULT_COMPLETION_MODEL_ID;
+		}
+
+		$maxTokens = null;
+		if (isset($input['max_tokens']) && is_int($input['max_tokens'])) {
+			$maxTokens = $input['max_tokens'];
+		}
+
+		try {
+			$systemPrompt = 'Take the user\'s question and answer it based on the provided images. Ensure that the answer matches the language of the user\'s question.';
+			$completion = $this->openAiAPIService->createChatCompletion($userId, $model, $prompt, $systemPrompt, $history, 1, $maxTokens);
+			$completion = $completion['messages'];
+
+			if (count($completion) > 0) {
+				return ['output' => array_pop($completion)];
+			}
+
+			throw new RuntimeException('No result in OpenAI/LocalAI response.');
+		} catch (\Exception $e) {
+			$this->logger->warning('OpenAI/LocalAI\'s image question generation failed with: ' . $e->getMessage(), ['exception' => $e]);
+			// Chain the original exception so its type and stack trace are not lost
+			throw new RuntimeException('OpenAI/LocalAI\'s image question generation failed with: ' . $e->getMessage(), 0, $e);
+		}
+	}
+
+	/**
+	 * Validate every image and turn it into a JSON-encoded user message
+	 * carrying the image as a base64 data URL.
+	 *
+	 * @param array $images Candidate images; each entry must be a readable File
+	 * @param bool $usingOpenAi Whether the OpenAI MIME type allow-list applies
+	 * @return string[] One JSON-encoded chat message per image
+	 * @throws RuntimeException If an entry is not a readable image file, has an
+	 *                          unsupported type, or the combined size is too large
+	 */
+	private function buildImageHistory(array $images, bool $usingOpenAi): array {
+		$history = [];
+		$totalSize = 0;
+		foreach ($images as $image) {
+			if (!$image instanceof File || !$image->isReadable()) {
+				throw new RuntimeException('Invalid input file');
+			}
+			$totalSize += intval($image->getSize());
+			if ($totalSize > self::MAX_TOTAL_IMAGE_BYTES) {
+				throw new RuntimeException('Filesize of input files too large. Max is 50MB');
+			}
+			// Check the type before reading so rejected files are never loaded into memory
+			$fileType = $image->getMimeType();
+			if (!str_starts_with($fileType, 'image/')) {
+				throw new RuntimeException('Invalid input file type ' . $fileType);
+			}
+			if ($usingOpenAi && !in_array($fileType, self::OPENAI_IMAGE_MIME_TYPES, true)) {
+				throw new RuntimeException('Invalid input file type for OpenAI ' . $fileType);
+			}
+			$history[] = json_encode([
+				'role' => 'user',
+				'content' => [
+					[
+						'type' => 'image_url',
+						'image_url' => [
+							'url' => 'data:' . $fileType . ';base64,' . base64_encode($this->readImage($image)),
+						],
+					],
+				],
+			]);
+		}
+		return $history;
+	}
+
+	/**
+	 * Read the full content of a file and always release the stream handle.
+	 *
+	 * @param File $image File to read
+	 * @return string Raw file content
+	 * @throws RuntimeException If the file cannot be opened or read
+	 */
+	private function readImage(File $image): string {
+		$handle = $image->fopen('rb');
+		if ($handle === false) {
+			throw new RuntimeException('Could not open input file');
+		}
+		try {
+			$content = stream_get_contents($handle);
+		} finally {
+			fclose($handle);
+		}
+		if ($content === false) {
+			throw new RuntimeException('Could not read input file');
+		}
+		return $content;
+	}
+}
diff --git a/lib/TaskProcessing/AnalyzeImagesTaskType.php b/lib/TaskProcessing/AnalyzeImagesTaskType.php
@@ -0,0 +1,77 @@
+<?php
+
+declare(strict_types=1);
+
+/**
+ * SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+namespace OCA\OpenAi\TaskProcessing;
+
+use OCA\OpenAi\AppInfo\Application;
+use OCP\IL10N;
+use OCP\TaskProcessing\EShapeType;
+use OCP\TaskProcessing\ITaskType;
+use OCP\TaskProcessing\ShapeDescriptor;
+
+/**
+ * Task type describing "ask a question about a list of images".
+ */
+class AnalyzeImagesTaskType implements ITaskType {
+	public const ID = Application::APP_ID . ':analyze-images';
+
+	public function __construct(
+		private IL10N $l,
+	) {
+	}
+
+	/**
+	 * @return string Translated display name of the task type
+	 */
+	public function getName(): string {
+		$label = $this->l->t('Analyze images');
+		return $label;
+	}
+
+	/**
+	 * @return string Translated one-line description of the task type
+	 */
+	public function getDescription(): string {
+		$summary = $this->l->t('Ask a question about the given images.');
+		return $summary;
+	}
+
+	/**
+	 * @return string The task type ID (see the ID constant)
+	 */
+	public function getId(): string {
+		return AnalyzeImagesTaskType::ID;
+	}
+
+	/**
+	 * Mandatory inputs: the images ("images") and the question about them ("input").
+	 *
+	 * @return ShapeDescriptor[]
+	 */
+	public function getInputShape(): array {
+		$shape = [];
+		$shape['images'] = new ShapeDescriptor(
+			$this->l->t('Images'),
+			$this->l->t('Images to ask a question about'),
+			EShapeType::ListOfImages
+		);
+		$shape['input'] = new ShapeDescriptor(
+			$this->l->t('Question'),
+			$this->l->t('What to ask about the image.'),
+			EShapeType::Text
+		);
+		return $shape;
+	}
+
+	/**
+	 * Single output: the generated answer ("output").
+	 *
+	 * @return ShapeDescriptor[]
+	 */
+	public function getOutputShape(): array {
+		$answer = new ShapeDescriptor(
+			$this->l->t('Generated response'),
+			$this->l->t('The answer to the question'),
+			EShapeType::Text
+		);
+		return ['output' => $answer];
+	}
+}

Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,10 @@ public function register(IRegistrationContext $context): void {`
`118`	`118`	`if (class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToTextProofread')) {`
`119`	`119`	`$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\ProofreadProvider::class);`
`120`	`120`	`}`
	`121`	`+ if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\AnalyzeImages')) {`
	`122`	`+ $context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\AnalyzeImagesTaskType::class);`
	`123`	`+ }`
	`124`	`+ $context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\AnalyzeImagesProvider::class);`
`121`	`125`	`}`
`122`	`126`	`if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToSpeech')) {`
`123`	`127`	`$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\TextToSpeechTaskType::class);`