Skip to content

Commit 777da09

Browse files
committed
Merge remote-tracking branch 'origin/main' into mark/enable-streaming
2 parents 66c546b + fab0c9f commit 777da09

File tree

6 files changed

+25
-6
lines changed

6 files changed

+25
-6
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ Request #3 => page_2_markdown + page_3_image
154154

155155
### Installation
156156

157-
- Install **poppler-utils** on the system, it should be available in path variable
157+
- Install **poppler** on the system and make sure it is available on your `PATH`. See the [pdf2image documentation](https://pdf2image.readthedocs.io/en/latest/installation.html) for platform-specific instructions.
158158
- Install py-zerox:
159159

160160
```sh

node-zerox/src/handleWarnings.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Tesseract relies on node-fetch v2, which has a deprecated version of punycode
2+
// Suppress the warning for now. Check in when Tesseract updates to node-fetch v3
3+
// https://github.com/naptha/tesseract.js/issues/876
4+
if (process.stderr.write === process.stderr.constructor.prototype.write) {
5+
const stdErrWrite = process.stderr.write;
6+
process.stderr.write = function (chunk: any, ...args: any[]) {
7+
const str = Buffer.isBuffer(chunk) ? chunk.toString() : chunk;
8+
9+
// Filter out the punycode deprecation warning
10+
if (str.includes("punycode")) {
11+
return true;
12+
}
13+
return stdErrWrite.apply(process.stderr, [chunk]);
14+
};
15+
}

node-zerox/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import path from "path";
44
import pLimit, { Limit } from "p-limit";
55
import Tesseract from "tesseract.js";
66

7+
import "./handleWarnings";
78
import {
89
addWorkersToTesseractScheduler,
910
cleanupImage,

node-zerox/src/utils/model.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { LLMParams } from "../types";
22

33
const defaultLLMParams: LLMParams = {
44
frequencyPenalty: 0, // OpenAI defaults to 0
5-
maxTokens: 2000,
5+
maxTokens: 4000,
66
presencePenalty: 0, // OpenAI defaults to 0
77
temperature: 0,
88
topP: 1, // OpenAI defaults to 1

py_zerox/pyzerox/core/zerox.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import aiofiles
88
import aiofiles.os as async_os
99
import asyncio
10+
from ..constants import PDFConversionDefaultOptions
1011

1112
# Package Imports
1213
from ..processor import (
@@ -26,6 +27,8 @@ async def zerox(
2627
cleanup: bool = True,
2728
concurrency: int = 10,
2829
file_path: Optional[str] = "",
30+
image_density: int = PDFConversionDefaultOptions.DPI,
31+
image_height: tuple[Optional[int], int] = PDFConversionDefaultOptions.SIZE,
2932
maintain_format: bool = False,
3033
model: str = "gpt-4o-mini",
3134
output_dir: Optional[str] = None,
@@ -130,7 +133,7 @@ async def zerox(
130133
**subset_pdf_create_kwargs)
131134

132135
# Convert the file to a series of images, below function returns a list of image paths in page order
133-
images = await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory)
136+
images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory)
134137

135138
if maintain_format:
136139
for image in images:

py_zerox/pyzerox/processor/pdf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@
1111
from ..models import litellmmodel
1212

1313

14-
async def convert_pdf_to_images(local_path: str, temp_dir: str) -> List[str]:
14+
async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str) -> List[str]:
1515
"""Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order."""
1616
options = {
1717
"pdf_path": local_path,
1818
"output_folder": temp_dir,
19-
"dpi": PDFConversionDefaultOptions.DPI,
19+
"dpi": image_density,
2020
"fmt": PDFConversionDefaultOptions.FORMAT,
21-
"size": PDFConversionDefaultOptions.SIZE,
21+
"size": image_height,
2222
"thread_count": PDFConversionDefaultOptions.THREAD_COUNT,
2323
"use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO,
2424
"paths_only": True,

0 commit comments

Comments
 (0)