Skip to content

Commit dd5eb36

Browse files
committed
⚡️ Improve performance by aggregating pipeline transformations
1 parent b01417b commit dd5eb36

File tree

8 files changed

+303
-91
lines changed

8 files changed

+303
-91
lines changed

package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@
6363
"ts"
6464
],
6565
"testTimeout": 15000,
66-
"rootDir": "test",
6766
"testRegex": ".test.ts$",
6867
"transform": {
6968
"^.+\\.(t|j)s$": "ts-jest"

src/helpers.ts

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
import type { ExtraCol, ExtraColsMapper } from './types';
2+
import type { FullParserSettings } from './types';
3+
import { ElementHandle } from 'puppeteer';
4+
import { InvalidColumnError, MissingRequiredColumnsError } from './errors';
25

36
export const identity = <T>(value: T): T => value;
47

@@ -13,10 +16,10 @@ export const extraColsMapperFactory = (extraCols: ExtraCol[]): ExtraColsMapper =
1316
return a.position! - b.position!;
1417
});
1518

16-
// Append cols without position
19+
// Append columns without a concrete position
1720
const withoutPos = extraCols.filter((extraCol) => extraCol.position === undefined);
1821

19-
return (row: string[], key: keyof ExtraCol) => {
22+
return (row: string[], key: keyof ExtraCol = 'data') => {
2023
const newRow = row.slice();
2124

2225
withPos.forEach((extraCol) => {
@@ -27,6 +30,80 @@ export const extraColsMapperFactory = (extraCols: ExtraCol[]): ExtraColsMapper =
2730
};
2831
};
2932

/**
 * Reads the header row of a table and derives the column bookkeeping used
 * while parsing the body rows.
 *
 * @param settings - Parser settings. This function reads `allowedColNames`,
 *   `optionalColNames`, `temporaryColNames`, `newLine` and `colFilter`.
 * @param headerRow - Handle of the row whose `td`/`th` cells hold the column names.
 * @param extraColsMapper - Mapper produced by `extraColsMapperFactory`; it is called
 *   here with the `'colName'` key (presumably to merge the extra columns' names
 *   into the list of column keys - confirm against the factory).
 * @returns An object with:
 *   - `indexes.allowed`: header cell index -> position of the matched column
 *     among the keys of `allowedColNames`;
 *   - `indexes.excluded`: positions (in the mapped key list) of columns listed in
 *     `temporaryColNames`;
 *   - `missingColNames`: values of `allowedColNames` whose column was not found
 *     in the header;
 *   - `getColumnIndex`: lookup of a column position by key or by name.
 * @throws MissingRequiredColumnsError when a column that is not listed in
 *   `optionalColNames` was not found in the header row.
 */
export async function getColumnsInfo(
  settings: FullParserSettings,
  headerRow: ElementHandle,
  extraColsMapper: ReturnType<typeof extraColsMapperFactory>,
) {
  const allowedColNamesKeys = Object.keys(settings.allowedColNames);

  // Starts as a copy of every allowed column. Each column found in the header is
  // deleted below, so whatever remains afterwards is missing from the table.
  const missingColNames = { ...settings.allowedColNames };

  // Text of every header cell, split into lines on `settings.newLine`.
  const headerTexts = await headerRow.$$eval(
    'td,th',
    (cells: Element[], newLine: string) => {
      return cells.map((cell) => (cell as HTMLTableCellElement).innerText.split(newLine));
    },
    settings.newLine,
  );

  // Key: index of the cell in the header row, in traversal order.
  // Value: position of the matched column among the keys of `allowedColNames`.
  const allowedIndexes: Record<string, number> = {};
  headerTexts.forEach((text: string[], realIndex: number) => {
    const colName = String(settings.colFilter(text, realIndex));

    // Called through the prototype so that a null-prototype object or a column
    // that happens to be named "hasOwnProperty" cannot break the lookup.
    if (Object.prototype.hasOwnProperty.call(settings.allowedColNames, colName)) {
      delete missingColNames[colName];
      allowedIndexes[realIndex] = allowedColNamesKeys.indexOf(colName);
    }
  });

  const missingRequiredColumns = diffFromSource(
    Object.values(missingColNames),
    settings.optionalColNames,
  );
  if (missingRequiredColumns.length > 0) {
    console.warn(`Not matched columns are following entries: `, missingRequiredColumns);
    throw new MissingRequiredColumnsError(
      'Number of filtered columns does not match to required columns count!',
    );
  }

  const excludedKeyIndexes: number[] = [];
  const colKeyToIndexWithExcluded: Map<string, number> = new Map<string, number>();
  extraColsMapper(allowedColNamesKeys, 'colName').forEach((key, index) => {
    // Register the position under the key itself and under the column name it maps
    // to. Entries without a (truthy) name in `allowedColNames` fall back to the key.
    colKeyToIndexWithExcluded.set(key, index);
    colKeyToIndexWithExcluded.set(settings.allowedColNames[key] || key, index);

    if (settings.temporaryColNames.includes(key)) {
      excludedKeyIndexes.push(index);
    }
  });

  // Resolves a column key or column name to its position; unknown names are an error.
  const getColumnIndex = (colName: string): number => {
    const index = colKeyToIndexWithExcluded.get(colName);
    if (index === undefined) {
      throw new InvalidColumnError(`Invalid column name! '${colName}'`);
    }

    return index;
  };

  return {
    indexes: {
      allowed: allowedIndexes,
      excluded: excludedKeyIndexes,
    },
    missingColNames: Object.values(missingColNames),
    getColumnIndex,
  };
}
106+
/**
 * Returns the items of `source` that do not occur in `target`.
 *
 * The order and duplicates of `source` are preserved. Membership is tested with
 * a `Set`, which uses the same SameValueZero equality as `Array.prototype.includes`
 * (so `NaN` matches `NaN`) while keeping the whole operation linear.
 *
 * @param source - Items to keep unless they also appear in `target`.
 * @param target - Items to exclude from the result.
 * @returns A new array; neither input is modified.
 */
export const diffFromSource = <T>(source: T[], target: T[]): T[] => {
  const excluded = new Set<T>(target);
  return source.filter((item) => !excluded.has(item));
};

src/parseTable.ts

Lines changed: 29 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
import { FullParserSettings, RowValidationPolicy } from './types';
2-
import { diffFromSource, extraColsMapperFactory } from './helpers';
2+
import { extraColsMapperFactory, getColumnsInfo } from './helpers';
33
import { ElementHandle } from 'puppeteer';
4-
import { InvalidColumnError, InvalidSettingsError, MissingRequiredColumnsError } from './errors';
5-
4+
import { InvalidSettingsError } from './errors';
5+
import PipelineExecutor from './pipelineExecutor';
66
export function parseTableFactory(settings: FullParserSettings) {
77
const extraColsMapper = extraColsMapperFactory(settings.extraCols);
8-
const allowedColNamesKeys = Object.keys(settings.allowedColNames);
98

109
const getHeaderRows = (table: ElementHandle) => {
1110
return settings.headerRowsSelector
@@ -17,16 +16,13 @@ export function parseTableFactory(settings: FullParserSettings) {
1716
return table.$$(settings.bodyRowsSelector);
1817
};
1918

20-
const getOutputHeaderRow = (
21-
excludedKeyIndexes: number[],
22-
nonFoundedColNames: FullParserSettings['allowedColNames'],
23-
) => {
19+
const getOutputHeaderRow = (excludedKeyIndexes: number[], missingColNames: string[]) => {
2420
const headerRowRaw = Object.values(settings.allowedColNames);
2521
const sortedHeader = extraColsMapper(headerRowRaw, 'colName');
2622

2723
const headerRow = sortedHeader
2824
.filter((_, index) => !excludedKeyIndexes.includes(index))
29-
.filter((key) => !Object.values(nonFoundedColNames).includes(key));
25+
.filter((key) => !missingColNames.includes(key));
3026

3127
return settings.rowValuesAsArray ? headerRow : headerRow.join(settings.csvSeparator);
3228
};
@@ -39,13 +35,12 @@ export function parseTableFactory(settings: FullParserSettings) {
3935
return (rows: string[]) => rows.length > 0;
4036
}
4137
if (settings.rowValidationPolicy === RowValidationPolicy.EXACT_MATCH) {
42-
const indexesCount = Object.keys(allowedIndexes).length;
43-
return (rows: string[]) => rows.length === indexesCount;
38+
return (rows: string[]) => rows.length === Object.keys(allowedIndexes).length;
4439
}
4540
throw new InvalidSettingsError('Unknown mode for the "rowValidationPolicy"');
4641
};
4742

48-
const filterSortCols =
43+
const getRowsData =
4944
(allowedIndexes: Record<string, number>) =>
5045
(row: ElementHandle): Promise<string[]> =>
5146
row.$$eval(
@@ -75,87 +70,36 @@ export function parseTableFactory(settings: FullParserSettings) {
7570
return [];
7671
}
7772

78-
const headerRow: ElementHandle =
79-
headerRows.length > 0 ? headerRows.shift()! : bodyRows.shift()!;
73+
const headerRow: ElementHandle = headerRows.length > 0 ? headerRows.shift() : bodyRows.shift();
74+
75+
const { indexes, getColumnIndex, missingColNames } = await getColumnsInfo(
76+
settings,
77+
headerRow,
78+
extraColsMapper,
79+
);
8080

8181
if (settings.reverseTraversal) {
8282
bodyRows.reverse();
8383
}
8484

85-
// Will be updated during parsing and not found columns will be deleted
86-
const nonFoundedColNames = { ...settings.allowedColNames };
87-
88-
// Sorted by finding which was first visited
89-
// is index in which we traverse the table, second is final position
90-
const allowedIndexes: Record<string, number> = (
91-
await headerRow.$$eval(
92-
'td,th',
93-
(cells: Element[], newLine: string) => {
94-
return cells.map((cell) => (cell as HTMLTableCellElement).innerText.split(newLine));
95-
},
96-
settings.newLine,
97-
)
98-
).reduce((acc, text: string[], realIndex: number) => {
99-
const colName = String(settings.colFilter(text, realIndex));
100-
101-
if (settings.allowedColNames.hasOwnProperty(colName)) {
102-
delete nonFoundedColNames[colName];
103-
104-
const desiredIndex = allowedColNamesKeys.findIndex((key) => key === colName);
105-
Object.assign(acc, { [realIndex]: desiredIndex });
106-
}
107-
108-
return acc;
109-
}, {});
110-
111-
const missingRequiredColumns = diffFromSource(
112-
Object.values(nonFoundedColNames),
113-
settings.optionalColNames,
114-
);
115-
if (missingRequiredColumns.length > 0) {
116-
console.warn(`Not matched columns are following entries: `, missingRequiredColumns);
117-
throw new MissingRequiredColumnsError(
118-
'Number of filtered columns does not match to required columns count!',
119-
);
120-
}
121-
122-
const excludedKeyIndexes: number[] = [];
123-
const colKeyToIndexWithExcluded: Map<string, number> = new Map<string, number>();
124-
extraColsMapper(allowedColNamesKeys, 'colName').forEach((key, index) => {
125-
colKeyToIndexWithExcluded.set(key, index);
126-
colKeyToIndexWithExcluded.set(settings.allowedColNames[key] || key, index);
127-
128-
if (settings.temporaryColNames.includes(key)) {
129-
excludedKeyIndexes.push(index);
130-
}
131-
});
132-
133-
const getColumnIndex = (colName: string): number => {
134-
const index = colKeyToIndexWithExcluded.get(colName);
135-
if (index === undefined) {
136-
throw new InvalidColumnError(`Invalid column name! '${colName}'`);
137-
}
138-
139-
return index;
140-
};
141-
142-
const finalRows = (await Promise.all(bodyRows.map(filterSortCols(allowedIndexes))))
143-
.filter(getRowStructureValidator(allowedIndexes))
144-
.map((row) => extraColsMapper(row, 'data'))
145-
.filter((row, index, rows) => settings.rowValidator(row, getColumnIndex, index, rows))
146-
.map((row) => row.map((cell, index) => settings.colParser(cell, index, getColumnIndex)))
147-
.map((row) => {
148-
settings.rowTransform(row, getColumnIndex);
149-
150-
const filteredRow = row.filter((_, index) => !excludedKeyIndexes.includes(index));
151-
return settings.rowValuesAsArray ? filteredRow : filteredRow.join(settings.csvSeparator);
152-
});
85+
const finalRows = new PipelineExecutor<
86+
string[][],
87+
typeof settings.rowValuesAsArray extends true ? string[][] : string[]
88+
>()
89+
.addFilter(getRowStructureValidator(indexes.allowed))
90+
.addMap((row) => extraColsMapper(row, 'data'))
91+
.addFilter((row, index, rows) => settings.rowValidator(row, getColumnIndex, index, rows))
92+
.addMap((row) => row.map((cell, index) => settings.colParser(cell, index, getColumnIndex)))
93+
.addTransform((row) => settings.rowTransform(row, getColumnIndex))
94+
.addMap((row) => row.filter((_, index: number) => !indexes.excluded.includes(index)))
95+
.addMap((row) => (settings.rowValuesAsArray ? row : row.join(settings.csvSeparator)))
96+
.execute(await Promise.all(bodyRows.map(getRowsData(indexes.allowed))));
15397

15498
if (addHeader) {
155-
const headerRow = getOutputHeaderRow(excludedKeyIndexes, nonFoundedColNames);
156-
finalRows.unshift(headerRow);
99+
const headerRow = getOutputHeaderRow(indexes.excluded, missingColNames);
100+
finalRows.unshift(headerRow as string);
157101
}
158102

159-
return finalRows as typeof settings.rowValuesAsArray extends true ? string[][] : string[];
103+
return finalRows;
160104
};
161105
}

src/pipelineExecutor.ts

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
/** Kinds of steps that can be queued in a `PipelineExecutor`. */
enum OperationType {
  FILTER,
  MAP,
  TRANSFORM,
}

/** Step whose callback decides whether the current item is kept. */
type IFilterOperation<T> = {
  type: OperationType.FILTER;
  callback: (item: T, index: number, arr: T[]) => boolean;
};

/** Step whose callback result replaces the current item. */
type IMapOperation<T> = {
  type: OperationType.MAP;
  callback: (item: T, index: number, arr: T[]) => any;
};

/** Step whose callback is run for its side effects; its result is ignored. */
type ITransformOperation<T> = {
  type: OperationType.TRANSFORM;
  callback: (item: T, index: number, arr: T[]) => void;
};

/** Element type of an array type, or the type itself when it is not an array. */
type Unwrapped<T> = T extends Array<infer R> ? R : T;

export type IOperation<T> = IFilterOperation<T> | IMapOperation<T> | ITransformOperation<T>;

/**
 * Compile-time exhaustiveness guard: it only type-checks when every member of
 * `OperationType` has been handled before the call. At runtime it throws.
 */
function failOnUnknownOperation(_type: never): never {
  throw new Error('Unknown executor operation!');
}

/**
 * Runs a queue of filter / map / transform steps over an array in a single pass.
 *
 * Every input item is pushed through all queued steps, in the order they were
 * added, before the next item is looked at. Unlike a chain of
 * `Array.prototype.filter` / `map` calls, each callback therefore receives:
 *   - `index`: the position of the item in the ORIGINAL input, even when earlier
 *     items were filtered out;
 *   - `arr`: the output accumulated so far (items that already passed every
 *     step), not the array the item came from.
 *
 * @typeParam T - Type of the input array.
 * @typeParam R - Type of the resulting array.
 */
class PipelineExecutor<T extends unknown[], R extends unknown[]> {
  // Steps change the item type along the way, so the queue cannot carry one
  // precise item type.
  private readonly operations: IOperation<any>[] = [];

  /**
   * Applies all queued steps to every item of `input`.
   *
   * @param input - Items to process; the array itself is not modified.
   * @returns The items that passed every filter, after all map steps, in input order.
   * @throws Error when a queued step has an unknown type.
   */
  public execute(input: T): R {
    const acc: unknown[] = [];

    for (let index = 0; index < input.length; index++) {
      let value = input[index];
      let isValid = true;

      for (const { type, callback } of this.operations) {
        if (type === OperationType.FILTER) {
          if (!callback(value, index, acc)) {
            // A rejected item skips all remaining steps.
            isValid = false;
            break;
          }
        } else if (type === OperationType.MAP) {
          value = callback(value, index, acc);
        } else if (type === OperationType.TRANSFORM) {
          callback(value, index, acc);
        } else {
          failOnUnknownOperation(type);
        }
      }

      if (isValid) {
        acc.push(value);
      }
    }

    return acc as R;
  }

  /** Appends a step to the queue and returns the executor for chaining. */
  private addOperation(operation: IOperation<any>) {
    this.operations.push(operation);
    return this;
  }

  /** Removes every queued step and returns the executor for chaining. */
  public clear() {
    this.operations.length = 0;
    return this;
  }

  /**
   * Queues a step that drops the current item when `callback` returns a falsy value.
   *
   * @typeParam L - Item type at this point of the pipeline (defaults to the input item type).
   */
  public addFilter<L = Unwrapped<T>>(callback: IFilterOperation<L>['callback']) {
    return this.addOperation({
      type: OperationType.FILTER,
      callback,
    });
  }

  /**
   * Queues a step that replaces the current item with the value returned by `callback`.
   *
   * @typeParam L - Item type at this point of the pipeline (defaults to the input item type).
   */
  public addMap<L = Unwrapped<T>>(callback: IMapOperation<L>['callback']) {
    return this.addOperation({
      type: OperationType.MAP,
      callback,
    });
  }

  /**
   * Queues a step that calls `callback` for its side effects (e.g. in-place
   * mutation of the item); the returned value is ignored.
   *
   * @typeParam L - Item type at this point of the pipeline (defaults to the input item type).
   */
  public addTransform<L = Unwrapped<T>>(callback: ITransformOperation<L>['callback']) {
    return this.addOperation({
      type: OperationType.TRANSFORM,
      callback,
    });
  }
}

export default PipelineExecutor;

src/settings.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export const defaultSettings: ParserSettingsOptional = {
2626
bodyRowsSelector: 'tbody tr',
2727
};
2828

29-
export function preprocessSettings(options: ParserSettings): Required<ParserSettings> {
29+
export function preprocessSettings(options: ParserSettings): FullParserSettings {
3030
const settings: FullParserSettings = {
3131
...defaultSettings,
3232
...omitUndefined(options),
@@ -37,8 +37,8 @@ export function preprocessSettings(options: ParserSettings): Required<ParserSett
3737
}
3838

3939
export function validateSettings(
40-
settings: Required<ParserSettings>,
41-
): asserts settings is Required<ParserSettings> {
40+
settings: FullParserSettings,
41+
): asserts settings is FullParserSettings {
4242
const { extraCols, temporaryColNames, allowedColNames } = settings;
4343

4444
const hasConflict = extraCols

0 commit comments

Comments
 (0)