Skip to content

Commit 9835734

Browse files
committed
add support for taking screenshots from pdf and txt documents using pymupdf
1 parent 0b7c7dd commit 9835734

File tree

2 files changed

+34
-5
lines changed

2 files changed

+34
-5
lines changed

pyproject.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ dependencies = [
1717
"opencv-python==4.10.0.84",
1818
"pillow==11.0.0",
1919
"python-magic==0.4.27",
20+
"PyMuPDF==1.25.1",
2021
]
2122
name = "bma-client-lib"
2223
description = "BornHack Media Archive Client Library"
@@ -79,3 +80,12 @@ convention = "google"
7980
[tool.mypy]
8081
mypy_path = "src"
8182
strict = true
83+
84+
[tool.pytest.ini_options]
85+
filterwarnings = [
86+
"error",
87+
# https://github.com/swig/swig/issues/2881
88+
'ignore:builtin type SwigPyPacked has no __module__ attribute:DeprecationWarning',
89+
'ignore:builtin type SwigPyObject has no __module__ attribute:DeprecationWarning',
90+
'ignore:builtin type swigvarlink has no __module__ attribute:DeprecationWarning',
91+
]

src/bma_client_lib/bma_client.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import exifread
1616
import httpx
1717
import magic
18+
import pymupdf
1819
from PIL import Image, ImageOps
1920

2021
from .datastructures import (
@@ -159,13 +160,11 @@ def _handle_image_conversion_job(
159160
logger.debug(f"Converting image size and AR took {time.time() - start} seconds")
160161
logger.debug("Done.")
161162

162-
def _handle_thumbnail_source_job(
163-
self, job: ThumbnailSourceJob, fileinfo: dict[str, str], screenshot_time_seconds: int = 60
164-
) -> None:
163+
def _handle_thumbnail_source_job(self, job: ThumbnailSourceJob, fileinfo: dict[str, str]) -> None:
165164
"""Create a thumbnail source for this file."""
166165
if fileinfo["filetype"] == "video":
167166
# use opencv to get video screenshot
168-
cv2_ss = self._get_video_screenshot(job=job, seconds=screenshot_time_seconds)
167+
cv2_ss = self._get_video_screenshot(job=job)
169168
cc = cv2.cvtColor(cv2_ss, cv2.COLOR_BGR2RGB)
170169
job.images = [Image.fromarray(cc)]
171170
# create an exif object with basic info
@@ -176,10 +175,22 @@ def _handle_thumbnail_source_job(
176175
exif[0x131] = self.clientinfo["client_version"]
177176
job.exif = exif
178177
return
178+
if fileinfo["filetype"] == "document":
179+
# use pymupdf to take a screenshot of page 1 of pdf/txt
180+
ss = self._get_document_screenshot(job=job)
181+
job.images = [ss]
182+
exif = Image.Exif()
183+
exif[0x100] = job.images[0].width
184+
exif[0x101] = job.images[0].height
185+
exif[0x10E] = f"ThumbnailSource for BMA document file {job.basefile_uuid}"
186+
exif[0x131] = self.clientinfo["client_version"]
187+
job.exif = exif
188+
return
189+
179190
# unsupported filetype
180191
raise JobNotSupportedError(job=job)
181192

182-
def _get_video_screenshot(self, job: ThumbnailSourceJob, seconds: int) -> Image.Image:
193+
def _get_video_screenshot(self, job: ThumbnailSourceJob, seconds: int = 60) -> Image.Image:
183194
"""Get a screenshot a certain number of seconds into the video."""
184195
path = self.path / job.source_url[1:]
185196
cam = cv2.VideoCapture(path)
@@ -198,6 +209,14 @@ def _get_video_screenshot(self, job: ThumbnailSourceJob, seconds: int) -> Image.
198209
cv2.destroyAllWindows()
199210
return frame # type: ignore[no-any-return]
200211

212+
def _get_document_screenshot(self, job: ThumbnailSourceJob, page: int = 0) -> Image.Image:
213+
"""Get a screenshot of the given page (zero-based index) of the pdf/txt file."""
214+
path = self.path / job.source_url[1:]
215+
doc = pymupdf.open(path)
216+
pdfpage = doc[page]
217+
pix = pdfpage.get_pixmap()
218+
return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
219+
201220
###############################################################################
202221

203222
def _write_and_upload_result(self, job: Job, filename: str) -> None:

0 commit comments

Comments
 (0)