Skip to content

Commit 5ff578c

Browse files
committed
✨ Add custom aggregations support
1 parent dd5eb36 commit 5ff578c

File tree

8 files changed

+181
-34
lines changed

8 files changed

+181
-34
lines changed

.prettierrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
{
22
"singleQuote": true,
33
"trailingComma": "all",
4-
"printWidth": 100
4+
"printWidth": 100,
5+
"quoteProps": "consistent"
56
}

README.md

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ This library brings you abstraction between `puppeteer` and `page context`.
1010
- ✨ Respect the defined order of columns.
1111
- ✨ Appending custom columns with custom data.
1212
- ✨ Custom sanitization of data in cells.
13+
- ✨ Group and Aggregate data by your own function.
1314
- ✨ Merge data from two independent tables into one structure.
1415
- ✨ Handles invalid HTML structure
1516
- ✨ And much more!
@@ -39,6 +40,10 @@ interface ParserSettings {
3940
csvSeparator?: string; // (default: ';')
4041
newLine?: string; // (default: '\n')
4142
rowValidationPolicy?: RowValidationPolicy; // (default: 'NON_EMPTY')
43+
groupBy?: {
44+
cols: string[];
45+
handler?: (rows: string[][], getColumnIndex: GetColumnIndexType) => string[];
46+
}
4247
rowValidator: (
4348
row: string[],
4449
getColumnIndex: GetColumnIndexType,
@@ -63,9 +68,10 @@ interface ParserSettings {
6368
5. Run `rowValidator` function for every table row.
6469
6. Run `colParser` for every cell in a row.
6570
7. Run `rowTransform` function for each row.
66-
8. Add processed row to a temp array result.
67-
9. Add `header` column if `withHeader` property is `true`.
68-
10. Merge partial results and return them.
71+
8. Group the results into buckets by the `groupBy.cols` columns and pick one aggregated row per bucket (via `groupBy.handler`).
72+
9. Add processed row to a temp array result.
73+
10. Add `header` column if `withHeader` property is `true`.
74+
11. Merge partial results and return them.
6975

7076
## Examples
7177

@@ -207,12 +213,27 @@ await tableParser(page, {
207213
});
208214
```
209215

210-
For more, look at `test` folder! 🙈
211-
212-
## TODO
216+
**Grouping and Aggregating**
217+
```typescript
218+
await tableParser(page, {
219+
selector: '#my-table',
220+
allowedColNames: {
221+
'Employee Name': 'name',
222+
'Age': 'age',
223+
},
224+
groupBy: {
225+
cols: ['name'],
226+
handler: (rows: string[][], getColumnIndex) => {
227+
const ageIndex = getColumnIndex('age');
228+
229+
// select one with the minimal age
230+
return rows.reduce((previous, current) =>
231+
previous[ageIndex] < current[ageIndex] ? previous : current,
232+
);
233+
},
234+
}
235+
});
236+
```
213237

214-
- [X] Add more examples
215-
- [X] Add tests
216-
- [X] Describe interfaces
217-
- [ ] Show merging table structures
238+
For more, look at the `test` folder! 🙈
218239

src/aggregations.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import { GetColumnIndexType, GroupByOptions } from './types';
2+
3+
const takeFirstHandler = (rows: string[][]) => rows[0];
4+
5+
export function groupBy(
6+
rows: string[][],
7+
{ cols, handler = takeFirstHandler }: GroupByOptions,
8+
getColumnIndex: GetColumnIndexType,
9+
): string[][] {
10+
const rowsByKey = new Map<string, string[][]>();
11+
12+
rows.forEach((row) => {
13+
const key = cols.map((col) => row[getColumnIndex(col)]).join('-');
14+
if (!rowsByKey.has(key)) {
15+
rowsByKey.set(key, []);
16+
}
17+
rowsByKey.get(key)!.push(row);
18+
});
19+
20+
return Array.from(rowsByKey.values()).map((rows) => {
21+
if (rows.length > 1) {
22+
return handler(rows, getColumnIndex);
23+
}
24+
return rows[0];
25+
});
26+
}

src/parseTable.ts

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { extraColsMapperFactory, getColumnsInfo } from './helpers';
33
import { ElementHandle } from 'puppeteer';
44
import { InvalidSettingsError } from './errors';
55
import PipelineExecutor from './pipelineExecutor';
6+
import { groupBy } from './aggregations';
67
export function parseTableFactory(settings: FullParserSettings) {
78
const extraColsMapper = extraColsMapperFactory(settings.extraCols);
89

@@ -16,15 +17,11 @@ export function parseTableFactory(settings: FullParserSettings) {
1617
return table.$$(settings.bodyRowsSelector);
1718
};
1819

19-
const getOutputHeaderRow = (excludedKeyIndexes: number[], missingColNames: string[]) => {
20+
const getOutputHeaderRow = (missingColNames: string[]) => {
2021
const headerRowRaw = Object.values(settings.allowedColNames);
2122
const sortedHeader = extraColsMapper(headerRowRaw, 'colName');
2223

23-
const headerRow = sortedHeader
24-
.filter((_, index) => !excludedKeyIndexes.includes(index))
25-
.filter((key) => !missingColNames.includes(key));
26-
27-
return settings.rowValuesAsArray ? headerRow : headerRow.join(settings.csvSeparator);
24+
return sortedHeader.filter((key) => !missingColNames.includes(key));
2825
};
2926

3027
const getRowStructureValidator = (allowedIndexes: Record<string, number>) => {
@@ -82,24 +79,29 @@ export function parseTableFactory(settings: FullParserSettings) {
8279
bodyRows.reverse();
8380
}
8481

85-
const finalRows = new PipelineExecutor<
86-
string[][],
87-
typeof settings.rowValuesAsArray extends true ? string[][] : string[]
88-
>()
82+
let parsedRows = new PipelineExecutor<string[][], string[][]>()
8983
.addFilter(getRowStructureValidator(indexes.allowed))
9084
.addMap((row) => extraColsMapper(row, 'data'))
9185
.addFilter((row, index, rows) => settings.rowValidator(row, getColumnIndex, index, rows))
9286
.addMap((row) => row.map((cell, index) => settings.colParser(cell, index, getColumnIndex)))
9387
.addTransform((row) => settings.rowTransform(row, getColumnIndex))
94-
.addMap((row) => row.filter((_, index: number) => !indexes.excluded.includes(index)))
95-
.addMap((row) => (settings.rowValuesAsArray ? row : row.join(settings.csvSeparator)))
9688
.execute(await Promise.all(bodyRows.map(getRowsData(indexes.allowed))));
9789

90+
if (settings.groupBy) {
91+
parsedRows = groupBy(parsedRows, settings.groupBy, getColumnIndex);
92+
}
93+
9894
if (addHeader) {
99-
const headerRow = getOutputHeaderRow(indexes.excluded, missingColNames);
100-
finalRows.unshift(headerRow as string);
95+
const headerRow = getOutputHeaderRow(missingColNames);
96+
parsedRows.unshift(headerRow);
10197
}
10298

103-
return finalRows;
99+
return new PipelineExecutor<
100+
string[][],
101+
typeof settings.rowValuesAsArray extends true ? string[][] : string[]
102+
>()
103+
.addMap((row) => row.filter((_, index: number) => !indexes.excluded.includes(index)))
104+
.addMap((row) => (settings.rowValuesAsArray ? row : row.join(settings.csvSeparator)))
105+
.execute(parsedRows);
104106
};
105107
}

src/settings.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export const defaultSettings: ParserSettingsOptional = {
1313
csvSeparator: ';',
1414
newLine: '\n',
1515
rowValidationPolicy: RowValidationPolicy.NON_EMPTY,
16+
groupBy: undefined,
1617
rowValidator: () => true,
1718
rowTransform: () => {},
1819
asArray: false,
@@ -82,4 +83,13 @@ export function validateSettings(
8283
);
8384
}
8485
}
86+
87+
if (settings.groupBy) {
88+
if (!Array.isArray(settings.groupBy.cols)) {
89+
throw new InvalidSettingsError(`Columns in "groupBy" field must be typeof array`);
90+
}
91+
if (settings.groupBy.handler && typeof settings.groupBy.handler !== 'function') {
92+
throw new InvalidSettingsError(`Passed handler to the "groupBy" is not a function`);
93+
}
94+
}
8595
}

src/types.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,19 @@ export enum RowValidationPolicy {
1212
EXACT_MATCH = 'EXACT_MATCH',
1313
}
1414

15+
export type GroupByOptions = {
16+
cols: string[];
17+
handler?: (rows: string[][], getColumnIndex: GetColumnIndexType) => string[];
18+
};
19+
1520
export type ParserSettingsOptional = {
1621
temporaryColNames: string[];
1722
extraCols: ExtraCol[];
1823
withHeader: boolean;
1924
csvSeparator: string;
2025
newLine: string;
2126
rowValidationPolicy: RowValidationPolicy;
27+
groupBy: GroupByOptions;
2228
rowValidator: (
2329
row: string[],
2430
getColumnIndex: GetColumnIndexType,

test/assets/2.html

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<title>Example #2</title>
5+
</head>
6+
<body>
7+
8+
<table id="employee-overview">
9+
<thead>
10+
<tr>
11+
<th>Employee Name</th>
12+
<th>Age</th>
13+
</tr>
14+
</thead>
15+
<tbody>
16+
<tr>
17+
<td>John M. Bolduc</td>
18+
<td>32</td>
19+
</tr>
20+
<tr>
21+
<td>Allan Meron</td>
22+
<td>40</td>
23+
</tr>
24+
<tr>
25+
<td>John M. Bolduc</td>
26+
<td>29</td>
27+
</tr>
28+
<tr>
29+
<td>Milan Lukeš</td>
30+
<td>33</td>
31+
</tr>
32+
<tr>
33+
<td>John M. Bolduc</td>
34+
<td>34</td>
35+
</tr>
36+
<tr>
37+
<td>Allan Meron</td>
38+
<td>90</td>
39+
</tr>
40+
</tbody>
41+
</table>
42+
43+
</body>
44+
</html>

test/index.test.ts

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -420,20 +420,57 @@ describe('Basic parsing', () => {
420420
`);
421421
});
422422

423-
it('Parses large HTML table', async () => {
424-
await page.goto(`${getBaseUrl()}/large-table.html`);
423+
it.only('Handles aggregation', async () => {
424+
await page.goto(`${getBaseUrl()}/2.html`);
425425

426426
const data = await tableParser(page, {
427-
selector: 'table',
427+
selector: '#employee-overview',
428428
asArray: false,
429-
rowValidationPolicy: RowValidationPolicy.NON_EMPTY,
430429
allowedColNames: {
431-
H: 'first',
432-
V: 'cond',
433-
N: 'last',
430+
'Employee Name': 'name',
431+
'Age': 'age',
432+
},
433+
groupBy: {
434+
cols: ['name'],
434435
},
435436
});
436437

437-
expect(data).toBeTruthy();
438+
expect(data).toMatchInlineSnapshot(`
439+
"name;age
440+
John M. Bolduc;32
441+
Allan Meron;40
442+
Milan Lukeš;33"
443+
`);
444+
});
445+
446+
it('Handles aggregation with custom handler', async () => {
447+
await page.goto(`${getBaseUrl()}/2.html`);
448+
449+
const data = await tableParser(page, {
450+
selector: '#employee-overview',
451+
asArray: false,
452+
allowedColNames: {
453+
'Employee Name': 'name',
454+
'Age': 'age',
455+
},
456+
groupBy: {
457+
cols: ['name'],
458+
handler: (rows: string[][], getColumnIndex) => {
459+
const ageIndex = getColumnIndex('age');
460+
461+
// select one with the minimal age
462+
return rows.reduce((previous, current) =>
463+
previous[ageIndex] < current[ageIndex] ? previous : current,
464+
);
465+
},
466+
},
467+
});
468+
469+
expect(data).toMatchInlineSnapshot(`
470+
"name;age
471+
John M. Bolduc;29
472+
Allan Meron;40
473+
Milan Lukeš;33"
474+
`);
438475
});
439476
});

0 commit comments

Comments
 (0)