Skip to content

Commit f3d984c

Browse files
authored
✨ add getting page count of local source (#394)
1 parent 027a7df commit f3d984c

File tree

4 files changed

+98
-69
lines changed

4 files changed

+98
-69
lines changed

src/input/sources/localInputSource.ts

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { errorHandler } from "../../errors/handler";
22
import { logger } from "../../logger";
33
import { compressImage } from "../../imageOperations";
4-
import { compressPdf } from "../../pdf";
4+
import { compressPdf, countPages } from "../../pdf";
55
import path from "path";
66
import * as fileType from "file-type";
77
import { PageOptions } from "../pageOptions";
@@ -83,6 +83,22 @@ export abstract class LocalInputSource extends InputSource {
8383
return mimeType;
8484
}
8585

86+
/**
87+
* Returns the file object as a Buffer.
88+
* @returns Buffer representation of the file object
89+
* @protected
90+
*/
91+
protected getBuffer(): Buffer {
92+
if (typeof this.fileObject === "string") {
93+
return Buffer.from(this.fileObject);
94+
}
95+
return this.fileObject;
96+
}
97+
98+
/**
99+
* Determines whether the current file is a PDF.
100+
* @returns {boolean} Returns true if the file is a PDF; otherwise, returns false.
101+
*/
86102
isPdf(): boolean {
87103
if (!this.initialized) {
88104
throw new Error(
@@ -97,15 +113,9 @@ export abstract class LocalInputSource extends InputSource {
97113
* @param pageOptions
98114
*/
99115
public async applyPageOptions(pageOptions: PageOptions) {
100-
if (!this.initialized) {
101-
await this.init();
102-
}
103-
if (!(this.fileObject instanceof Buffer)) {
104-
throw new Error(
105-
`Cannot modify an input source of type ${this.inputType}.`
106-
);
107-
}
108-
const processedPdf = await extractPages(this.fileObject, pageOptions);
116+
await this.init();
117+
const buffer = this.getBuffer();
118+
const processedPdf = await extractPages(buffer, pageOptions);
109119
this.fileObject = processedPdf.file;
110120
}
111121

@@ -137,15 +147,8 @@ export abstract class LocalInputSource extends InputSource {
137147
forceSourceText: boolean = false,
138148
disableSourceText: boolean = true
139149
) {
140-
if (!this.initialized) {
141-
await this.init();
142-
}
143-
let buffer: Buffer;
144-
if (typeof this.fileObject === "string") {
145-
buffer = Buffer.from(this.fileObject);
146-
} else {
147-
buffer = this.fileObject;
148-
}
150+
await this.init();
151+
const buffer = this.getBuffer();
149152
if (this.isPdf()){
150153
this.fileObject = await compressPdf(buffer, quality, forceSourceText, disableSourceText);
151154
} else {
@@ -158,13 +161,25 @@ export abstract class LocalInputSource extends InputSource {
158161
* @return boolean
159162
*/
160163
public async hasSourceText() {
161-
if (!this.initialized) {
162-
await this.init();
163-
}
164+
await this.init();
164165
if (!this.isPdf()){
165166
return false;
166167
}
167-
const buffer = typeof this.fileObject === "string" ? Buffer.from(this.fileObject) : this.fileObject;
168+
const buffer = this.getBuffer();
168169
return hasSourceText(buffer);
169170
}
171+
172+
/**
173+
* Returns the number of pages in the input source.
174+
* For PDFs, returns the actual page count. For images, returns 1.
175+
* @return Promise<number> The number of pages
176+
*/
177+
public async getPageCount(): Promise<number> {
178+
await this.init();
179+
if (!this.isPdf()) {
180+
return 1;
181+
}
182+
const buffer = this.getBuffer();
183+
return countPages(buffer);
184+
}
170185
}

src/pdf/pdfOperation.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ export async function extractPages(
8484
return { file: fileBuffer, totalPagesRemoved: sumRemovedPages };
8585
}
8686

87+
/**
88+
* Count the number of pages in a PDF file.
89+
* @param file
90+
* @returns the number of pages in the file.
91+
*/
8792
export async function countPages(file: Buffer): Promise<number> {
8893
const currentPdf = await PDFDocument.load(file, {
8994
ignoreEncryption: true,

src/pdf/pdfUtils.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,6 @@ export async function extractTextFromPdf(pdfBuffer: Buffer): Promise<ExtractedPd
6464
};
6565
}
6666

67-
68-
69-
70-
7167
/**
7268
* Checks if a PDF contains source text.
7369
*

tests/inputs/sources.spec.ts

Lines changed: 55 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -35,79 +35,89 @@ describe("Test different types of input", () => {
3535
// don't provide an extension to see if we can detect MIME
3636
// type based on contents
3737
const filename = "receipt";
38-
const input = new Base64Input({
38+
const inputSource = new Base64Input({
3939
inputString: b64String,
4040
filename: filename,
4141
});
42-
await input.init();
43-
expect(input.inputType).to.equals(INPUT_TYPE_BASE64);
44-
expect(input.filename).to.equals(filename);
45-
expect(input.mimeType).to.equals("image/jpeg");
42+
await inputSource.init();
43+
expect(inputSource.inputType).to.equals(INPUT_TYPE_BASE64);
44+
expect(inputSource.filename).to.equals(filename);
45+
expect(inputSource.mimeType).to.equals("image/jpeg");
46+
expect(inputSource.isPdf()).to.false;
47+
expect(await inputSource.getPageCount()).to.equals(1);
4648
// we need to insert a newline every 76 chars to match the format
4749
// of the input file.
48-
const expectedString = input.fileObject
50+
const expectedString = inputSource.fileObject
4951
.toString("base64")
5052
.replace(/(.{76})/gm, "$1\n");
5153
expect(expectedString).to.eqls(b64String);
5254
});
5355

5456
it("should accept JPEG files from a path", async () => {
55-
const input = new PathInput({
57+
const inputSource = new PathInput({
5658
inputPath: path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg"),
5759
});
58-
await input.init();
60+
await inputSource.init();
5961

6062
const expectedResult = await fs.promises.readFile(
6163
path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg")
6264
);
63-
expect(input.inputType).to.equals(INPUT_TYPE_PATH);
64-
expect(input.filename).to.equals("default_sample.jpg");
65-
expect(input.mimeType).to.equals("image/jpeg");
66-
expect(input.fileObject).to.eqls(expectedResult);
65+
expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH);
66+
expect(inputSource.filename).to.equals("default_sample.jpg");
67+
expect(inputSource.mimeType).to.equals("image/jpeg");
68+
expect(inputSource.isPdf()).to.false;
69+
expect(await inputSource.getPageCount()).to.equals(1);
70+
expect(inputSource.fileObject).to.eqls(expectedResult);
6771
});
6872

6973
it("should accept TIFF from a path", async () => {
70-
const input = new PathInput({
74+
const inputSource = new PathInput({
7175
inputPath: path.join(__dirname, "../data/file_types/receipt.tif"),
7276
});
73-
await input.init();
77+
await inputSource.init();
7478
const expectedResult = await fs.promises.readFile(
7579
path.join(__dirname, "../data/file_types/receipt.tif")
7680
);
77-
expect(input.inputType).to.equals(INPUT_TYPE_PATH);
78-
expect(input.filename).to.equals("receipt.tif");
79-
expect(input.mimeType).to.equals("image/tiff");
80-
expect(input.fileObject).to.eqls(expectedResult);
81+
expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH);
82+
expect(inputSource.filename).to.equals("receipt.tif");
83+
expect(inputSource.mimeType).to.equals("image/tiff");
84+
expect(inputSource.isPdf()).to.false;
85+
expect(await inputSource.getPageCount()).to.equals(1);
86+
expect(inputSource.fileObject).to.eqls(expectedResult);
8187
});
8288

8389
it("should accept HEIC from a path", async () => {
84-
const input = new PathInput({
90+
const inputSource = new PathInput({
8591
inputPath: path.join(__dirname, "../data/file_types/receipt.heic"),
8692
});
87-
await input.init();
93+
await inputSource.init();
8894
const expectedResult = await fs.promises.readFile(
8995
path.join(__dirname, "../data/file_types/receipt.heic")
9096
);
91-
expect(input.inputType).to.equals(INPUT_TYPE_PATH);
92-
expect(input.filename).to.equals("receipt.heic");
93-
expect(input.mimeType).to.equals("image/heic");
94-
expect(input.fileObject).to.eqls(expectedResult);
97+
expect(inputSource.inputType).to.equals(INPUT_TYPE_PATH);
98+
expect(inputSource.filename).to.equals("receipt.heic");
99+
expect(inputSource.mimeType).to.equals("image/heic");
100+
expect(inputSource.isPdf()).to.false;
101+
expect(await inputSource.getPageCount()).to.equals(1);
102+
expect(inputSource.fileObject).to.eqls(expectedResult);
95103
});
96104

97105
it("should accept read streams", async () => {
98106
const filePath = path.join(__dirname, "../data/products/expense_receipts/default_sample.jpg");
99107
const stream = fs.createReadStream(filePath);
100108
const filename = "default_sample.jpg";
101-
const input = new StreamInput({
109+
const inputSource = new StreamInput({
102110
inputStream: stream,
103111
filename: filename,
104112
});
105-
await input.init();
106-
expect(input.inputType).to.equals(INPUT_TYPE_STREAM);
107-
expect(input.filename).to.equals(filename);
108-
expect(input.mimeType).to.equals("image/jpeg");
113+
await inputSource.init();
114+
expect(inputSource.inputType).to.equals(INPUT_TYPE_STREAM);
115+
expect(inputSource.filename).to.equals(filename);
116+
expect(inputSource.mimeType).to.equals("image/jpeg");
117+
expect(inputSource.isPdf()).to.false;
118+
expect(await inputSource.getPageCount()).to.equals(1);
109119
const expectedResult = await fs.promises.readFile(filePath);
110-
expect(input.fileObject.toString()).to.eqls(expectedResult.toString());
120+
expect(inputSource.fileObject.toString()).to.eqls(expectedResult.toString());
111121
});
112122

113123
it("should accept raw bytes", async () => {
@@ -116,16 +126,18 @@ describe("Test different types of input", () => {
116126
// don't provide an extension to see if we can detect MIME
117127
// type based on contents
118128
const filename = "receipt";
119-
const input = new BytesInput({
129+
const inputSource = new BytesInput({
120130
inputBytes: inputBytes,
121131
filename: filename,
122132
});
123-
await input.init();
124-
expect(input.inputType).to.equal(INPUT_TYPE_BYTES);
125-
expect(input.filename).to.equal(filename);
126-
expect(input.mimeType).to.equal("image/jpeg");
133+
await inputSource.init();
134+
expect(inputSource.inputType).to.equal(INPUT_TYPE_BYTES);
135+
expect(inputSource.filename).to.equal(filename);
136+
expect(inputSource.mimeType).to.equal("image/jpeg");
137+
expect(inputSource.isPdf()).to.false;
138+
expect(await inputSource.getPageCount()).to.equals(1);
127139
const expectedResult = await fs.promises.readFile(filePath);
128-
expect(Buffer.compare(input.fileObject, expectedResult)).to.equal(0);
140+
expect(Buffer.compare(inputSource.fileObject, expectedResult)).to.equal(0);
129141
});
130142

131143
it("should accept a Buffer", async () => {
@@ -135,15 +147,16 @@ describe("Test different types of input", () => {
135147
path.join(__dirname, "../data/products/invoices/invoice_10p.pdf")
136148
)
137149
);
138-
const input = new BufferInput({
150+
const inputSource = new BufferInput({
139151
buffer: buffer,
140152
filename: filename,
141153
});
142-
await input.init();
143-
expect(input.inputType).to.equals(INPUT_TYPE_BUFFER);
144-
expect(input.filename).to.equals(filename);
145-
expect(input.isPdf()).to.be.true;
146-
expect(input.fileObject).to.be.instanceOf(Buffer);
154+
await inputSource.init();
155+
expect(inputSource.inputType).to.equals(INPUT_TYPE_BUFFER);
156+
expect(inputSource.filename).to.equals(filename);
157+
expect(inputSource.isPdf()).to.be.true;
158+
expect(await inputSource.getPageCount()).to.equals(10);
159+
expect(inputSource.fileObject).to.be.instanceOf(Buffer);
147160
});
148161

149162

0 commit comments

Comments
 (0)