Skip to content

Commit b717552

Browse files
authored
Merge pull request #226 from nextcloud/picture
Feat: Add Analyze Image Task Type
2 parents c714945 + 8f6c8cb commit b717552

File tree

3 files changed

+274
-0
lines changed

3 files changed

+274
-0
lines changed

lib/AppInfo/Application.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,10 @@ public function register(IRegistrationContext $context): void {
118118
if (class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToTextProofread')) {
119119
$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\ProofreadProvider::class);
120120
}
121+
if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\AnalyzeImages')) {
122+
$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\AnalyzeImagesTaskType::class);
123+
}
124+
$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\AnalyzeImagesProvider::class);
121125
}
122126
if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\TextToSpeech')) {
123127
$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\TextToSpeechTaskType::class);
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
/**
6+
* SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
7+
* SPDX-License-Identifier: AGPL-3.0-or-later
8+
*/
9+
10+
namespace OCA\OpenAi\TaskProcessing;
11+
12+
use OCA\OpenAi\AppInfo\Application;
13+
use OCA\OpenAi\Service\OpenAiAPIService;
14+
use OCA\OpenAi\Service\OpenAiSettingsService;
15+
use OCP\Files\File;
16+
use OCP\IAppConfig;
17+
use OCP\IL10N;
18+
use OCP\TaskProcessing\EShapeType;
19+
use OCP\TaskProcessing\ISynchronousProvider;
20+
use OCP\TaskProcessing\ShapeDescriptor;
21+
use Psr\Log\LoggerInterface;
22+
use RuntimeException;
23+
24+
/**
 * Synchronous task processing provider that answers a question about a list
 * of images by sending them, as base64 data URLs, to the chat completion
 * endpoint of the configured OpenAI-compatible service.
 */
class AnalyzeImagesProvider implements ISynchronousProvider {

	/**
	 * Maximum number of images accepted per task.
	 * OpenAI's limit is 500; it is enforced for every backend.
	 * (https://platform.openai.com/docs/guides/images-vision?api-mode=responses&format=url#image-input-requirements)
	 */
	private const MAX_IMAGE_COUNT = 500;

	/**
	 * Maximum cumulative size in bytes of all images of one task.
	 * OpenAI's limit is 50MB; it is enforced for every backend.
	 * (https://platform.openai.com/docs/guides/images-vision?api-mode=responses&format=url#image-input-requirements)
	 */
	private const MAX_TOTAL_IMAGE_BYTES = 50 * 1000 * 1000;

	/**
	 * Image MIME types accepted when the OpenAI service itself is used.
	 */
	private const OPENAI_IMAGE_MIME_TYPES = [
		'image/jpeg',
		'image/png',
		'image/gif',
		'image/webp',
	];

	public function __construct(
		private OpenAiAPIService $openAiAPIService,
		private OpenAiSettingsService $openAiSettingsService,
		private IL10N $l,
		private LoggerInterface $logger,
		private IAppConfig $appConfig,
		private ?string $userId,
	) {
	}

	/**
	 * @return string The provider ID: the app ID suffixed with "-analyze-images"
	 */
	public function getId(): string {
		return Application::APP_ID . '-analyze-images';
	}

	/**
	 * @return string The service name reported by the API service
	 */
	public function getName(): string {
		return $this->openAiAPIService->getServiceName();
	}

	/**
	 * Prefer the task type shipped by the server; fall back to the one
	 * bundled with this app when the server class does not exist.
	 *
	 * @return string The ID of the task type this provider implements
	 */
	public function getTaskTypeId(): string {
		if (class_exists('OCP\\TaskProcessing\\TaskTypes\\AnalyzeImages')) {
			return \OCP\TaskProcessing\TaskTypes\AnalyzeImages::ID;
		}
		return AnalyzeImagesTaskType::ID;
	}

	/**
	 * @return int The value of OpenAiAPIService::getExpTextProcessingTime()
	 */
	public function getExpectedRuntime(): int {
		return $this->openAiAPIService->getExpTextProcessingTime();
	}

	/**
	 * @return array Empty: no mandatory input is an enum
	 */
	public function getInputShapeEnumValues(): array {
		return [];
	}

	/**
	 * @return array Empty: no mandatory input has a default
	 */
	public function getInputShapeDefaults(): array {
		return [];
	}

	/**
	 * @return ShapeDescriptor[] Optional inputs: "max_tokens" (number) and "model" (enum)
	 */
	public function getOptionalInputShape(): array {
		return [
			'max_tokens' => new ShapeDescriptor(
				$this->l->t('Maximum output words'),
				$this->l->t('The maximum number of words/tokens that can be generated in the output.'),
				EShapeType::Number
			),
			'model' => new ShapeDescriptor(
				$this->l->t('Model'),
				$this->l->t('The model used to generate the output'),
				EShapeType::Enum
			),
		];
	}

	/**
	 * @return array Enum values for the optional "model" input, as returned by
	 *               OpenAiAPIService::getModelEnumValues() for the injected user
	 */
	public function getOptionalInputShapeEnumValues(): array {
		return [
			'model' => $this->openAiAPIService->getModelEnumValues($this->userId),
		];
	}

	/**
	 * @return array Defaults for the optional inputs "max_tokens" and "model"
	 */
	public function getOptionalInputShapeDefaults(): array {
		// With OpenAI an empty configured value falls back to the app default;
		// for other services the configured value is used as is.
		$adminModel = $this->openAiAPIService->isUsingOpenAi()
			? ($this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_MODEL_ID) ?: Application::DEFAULT_MODEL_ID)
			: $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id');
		return [
			'max_tokens' => 1000,
			'model' => $adminModel,
		];
	}

	/**
	 * @return array Empty: no output is an enum
	 */
	public function getOutputShapeEnumValues(): array {
		return [];
	}

	/**
	 * @return array Empty: there are no optional outputs
	 */
	public function getOptionalOutputShape(): array {
		return [];
	}

	/**
	 * @return array Empty: there are no optional outputs
	 */
	public function getOptionalOutputShapeEnumValues(): array {
		return [];
	}

	/**
	 * Validate the task input, turn every image into a chat history entry and
	 * ask the chat completion endpoint the user's question.
	 *
	 * @param string|null $userId User the completion is requested for
	 * @param array $input Expects "images" (array of File) and "input" (string);
	 *                     optionally "model" (string) and "max_tokens" (int)
	 * @param callable $reportProgress Not invoked by this provider
	 * @return array ['output' => last entry of the "messages" returned by createChatCompletion()]
	 * @throws RuntimeException If the input is invalid, a file cannot be read,
	 *                          or the completion request fails
	 */
	public function process(?string $userId, array $input, callable $reportProgress): array {
		// Queried once; the answer is also needed for every image below.
		$usingOpenAi = $this->openAiAPIService->isUsingOpenAi();
		if (!$usingOpenAi && !$this->openAiSettingsService->getChatEndpointEnabled()) {
			throw new RuntimeException('Must support chat completion endpoint');
		}

		if (!isset($input['images']) || !is_array($input['images'])) {
			throw new RuntimeException('Invalid file list');
		}
		if (count($input['images']) > self::MAX_IMAGE_COUNT) {
			throw new RuntimeException('Too many files given. Max is 500');
		}

		$history = [];
		$totalSize = 0;
		foreach ($input['images'] as $image) {
			if (!$image instanceof File || !$image->isReadable()) {
				throw new RuntimeException('Invalid input file');
			}
			// The size limit applies to the sum of all files, not to each file.
			$totalSize += intval($image->getSize());
			if ($totalSize > self::MAX_TOTAL_IMAGE_BYTES) {
				throw new RuntimeException('Filesize of input files too large. Max is 50MB');
			}
			$history[] = $this->buildImageMessage($image, $usingOpenAi);
		}

		if (!isset($input['input']) || !is_string($input['input'])) {
			throw new RuntimeException('Invalid prompt');
		}
		$prompt = $input['input'];

		if (isset($input['model']) && is_string($input['model'])) {
			$model = $input['model'];
		} else {
			$model = $this->appConfig->getValueString(Application::APP_ID, 'default_completion_model_id', Application::DEFAULT_COMPLETION_MODEL_ID) ?: Application::DEFAULT_COMPLETION_MODEL_ID;
		}

		$maxTokens = null;
		if (isset($input['max_tokens']) && is_int($input['max_tokens'])) {
			$maxTokens = $input['max_tokens'];
		}

		try {
			$systemPrompt = 'Take the user\'s question and answer it based on the provided images. Ensure that the answer matches the language of the user\'s question.';
			$completion = $this->openAiAPIService->createChatCompletion($userId, $model, $prompt, $systemPrompt, $history, 1, $maxTokens);
			$completion = $completion['messages'];

			if (count($completion) > 0) {
				return ['output' => array_pop($completion)];
			}

			throw new RuntimeException('No result in OpenAI/LocalAI response.');
		} catch (\Exception $e) {
			$this->logger->warning('OpenAI/LocalAI\'s image question generation failed with: ' . $e->getMessage(), ['exception' => $e]);
			// Keep the original exception attached so the root cause is not lost.
			throw new RuntimeException('OpenAI/LocalAI\'s image question generation failed with: ' . $e->getMessage(), 0, $e);
		}
	}

	/**
	 * Build the JSON-encoded "user" chat message carrying one image as a
	 * base64 data URL. The MIME type is validated before the file is read so
	 * that rejected files are never loaded into memory.
	 *
	 * @param File $image Readable file to embed
	 * @param bool $usingOpenAi Whether the stricter OpenAI MIME type list applies
	 * @return string JSON-encoded chat message
	 * @throws RuntimeException If the MIME type is not accepted or the file cannot be read or encoded
	 */
	private function buildImageMessage(File $image, bool $usingOpenAi): string {
		$mimeType = $image->getMimeType();
		if (!str_starts_with($mimeType, 'image/')) {
			throw new RuntimeException('Invalid input file type ' . $mimeType);
		}
		if ($usingOpenAi && !in_array($mimeType, self::OPENAI_IMAGE_MIME_TYPES, true)) {
			throw new RuntimeException('Invalid input file type for OpenAI ' . $mimeType);
		}

		$message = json_encode([
			'role' => 'user',
			'content' => [
				[
					'type' => 'image_url',
					'image_url' => [
						'url' => 'data:' . $mimeType . ';base64,' . base64_encode($this->readFileContents($image)),
					],
				],
			],
		]);
		if ($message === false) {
			throw new RuntimeException('Could not encode input file');
		}
		return $message;
	}

	/**
	 * Read the whole content of a file and always release the stream handle.
	 *
	 * @param File $file File to read
	 * @return string Raw file content
	 * @throws RuntimeException If the file cannot be opened or read
	 */
	private function readFileContents(File $file): string {
		$handle = $file->fopen('rb');
		if (!is_resource($handle)) {
			throw new RuntimeException('Could not open input file');
		}
		try {
			$contents = stream_get_contents($handle);
		} finally {
			fclose($handle);
		}
		if ($contents === false) {
			throw new RuntimeException('Could not read input file');
		}
		return $contents;
	}
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
/**
6+
* SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
7+
* SPDX-License-Identifier: AGPL-3.0-or-later
8+
*/
9+
10+
namespace OCA\OpenAi\TaskProcessing;
11+
12+
use OCA\OpenAi\AppInfo\Application;
13+
use OCP\IL10N;
14+
use OCP\TaskProcessing\EShapeType;
15+
use OCP\TaskProcessing\ITaskType;
16+
use OCP\TaskProcessing\ShapeDescriptor;
17+
18+
/**
 * Task type "analyze images": a list of images plus a text question as input,
 * a text answer as output.
 */
class AnalyzeImagesTaskType implements ITaskType {
	/** Task type ID: the app ID suffixed with ":analyze-images". */
	public const ID = Application::APP_ID . ':analyze-images';

	public function __construct(
		private IL10N $l,
	) {
	}

	/**
	 * @return string Translated, human-readable name of the task type
	 */
	public function getName(): string {
		$name = $this->l->t('Analyze images');
		return $name;
	}

	/**
	 * @return string Translated, human-readable description of the task type
	 */
	public function getDescription(): string {
		$description = $this->l->t('Ask a question about the given images.');
		return $description;
	}

	/**
	 * @return string The value of self::ID
	 */
	public function getId(): string {
		return self::ID;
	}

	/**
	 * Describe the mandatory inputs: "images" (list of images) followed by
	 * "input" (the question, as text).
	 *
	 * @return ShapeDescriptor[]
	 */
	public function getInputShape(): array {
		$imagesDescriptor = new ShapeDescriptor(
			$this->l->t('Images'),
			$this->l->t('Images to ask a question about'),
			EShapeType::ListOfImages,
		);
		$questionDescriptor = new ShapeDescriptor(
			$this->l->t('Question'),
			$this->l->t('What to ask about the image.'),
			EShapeType::Text,
		);

		$shape = [];
		$shape['images'] = $imagesDescriptor;
		$shape['input'] = $questionDescriptor;
		return $shape;
	}

	/**
	 * Describe the single output: "output" (the answer, as text).
	 *
	 * @return ShapeDescriptor[]
	 */
	public function getOutputShape(): array {
		$answerDescriptor = new ShapeDescriptor(
			$this->l->t('Generated response'),
			$this->l->t('The answer to the question'),
			EShapeType::Text
		);

		return ['output' => $answerDescriptor];
	}
}

0 commit comments

Comments
 (0)