1515import exifread
1616import httpx
1717import magic
18+ import pymupdf
1819from PIL import Image , ImageOps
1920
2021from .datastructures import (
@@ -159,13 +160,11 @@ def _handle_image_conversion_job(
159160 logger .debug (f"Converting image size and AR took { time .time () - start } seconds" )
160161 logger .debug ("Done." )
161162
162- def _handle_thumbnail_source_job (
163- self , job : ThumbnailSourceJob , fileinfo : dict [str , str ], screenshot_time_seconds : int = 60
164- ) -> None :
163+ def _handle_thumbnail_source_job (self , job : ThumbnailSourceJob , fileinfo : dict [str , str ]) -> None :
165164 """Create a thumbnail source for this file."""
166165 if fileinfo ["filetype" ] == "video" :
167166 # use opencv to get video screenshot
168- cv2_ss = self ._get_video_screenshot (job = job , seconds = screenshot_time_seconds )
167+ cv2_ss = self ._get_video_screenshot (job = job )
169168 cc = cv2 .cvtColor (cv2_ss , cv2 .COLOR_BGR2RGB )
170169 job .images = [Image .fromarray (cc )]
171170 # create an exif object with basic info
@@ -176,10 +175,22 @@ def _handle_thumbnail_source_job(
176175 exif [0x131 ] = self .clientinfo ["client_version" ]
177176 job .exif = exif
178177 return
178+ if fileinfo ["filetype" ] == "document" :
179+ # use pymupdf to take a screenshot of page 1 of pdf/txt
180+ ss = self ._get_document_screenshot (job = job )
181+ job .images = [ss ]
182+ exif = Image .Exif ()
183+ exif [0x100 ] = job .images [0 ].width
184+ exif [0x101 ] = job .images [0 ].height
185+ exif [0x10E ] = f"ThumbnailSource for BMA document file { job .basefile_uuid } "
186+ exif [0x131 ] = self .clientinfo ["client_version" ]
187+ job .exif = exif
188+ return
189+
179190 # unsupported filetype
180191 raise JobNotSupportedError (job = job )
181192
182- def _get_video_screenshot (self , job : ThumbnailSourceJob , seconds : int ) -> Image .Image :
193+ def _get_video_screenshot (self , job : ThumbnailSourceJob , seconds : int = 60 ) -> Image .Image :
183194 """Get a screenshot a certain number of seconds into the video."""
184195 path = self .path / job .source_url [1 :]
185196 cam = cv2 .VideoCapture (path )
@@ -198,6 +209,14 @@ def _get_video_screenshot(self, job: ThumbnailSourceJob, seconds: int) -> Image.
198209 cv2 .destroyAllWindows ()
199210 return frame # type: ignore[no-any-return]
200211
def _get_document_screenshot(self, job: ThumbnailSourceJob, page: int = 0) -> Image.Image:
    """Render one page of the job's document file (pdf/txt) to a PIL image.

    Args:
        job: The thumbnail source job; ``job.source_url`` minus its first
            character (presumably a leading slash -- confirm against callers)
            is resolved relative to ``self.path``.
        page: Index of the page to render, passed straight to the pymupdf
            document; the default ``0`` selects the first page.

    Returns:
        An RGB ``PIL.Image.Image`` with the pixel dimensions of the rendered page.

    Raises:
        IndexError: Presumably raised by pymupdf when ``page`` is outside the
            document's page range -- confirm against the pymupdf version in use.
    """
    path = self.path / job.source_url[1:]
    # Open the document as a context manager so the underlying file handle is
    # released even when page lookup or rendering raises.
    with pymupdf.open(path) as doc:
        # Request the pixmap without an alpha channel so the sample buffer is
        # 3 bytes per pixel, which is what Image.frombytes("RGB", ...) expects.
        pix = doc[page].get_pixmap(alpha=False)
        # Build the image before the document is closed; frombytes copies the
        # sample buffer, so the result does not depend on the open document.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
219+
201220 ###############################################################################
202221
203222 def _write_and_upload_result (self , job : Job , filename : str ) -> None :
0 commit comments