11import { FullParserSettings , RowValidationPolicy } from './types' ;
2- import { diffFromSource , extraColsMapperFactory } from './helpers' ;
2+ import { extraColsMapperFactory , getColumnsInfo } from './helpers' ;
33import { ElementHandle } from 'puppeteer' ;
4- import { InvalidColumnError , InvalidSettingsError , MissingRequiredColumnsError } from './errors' ;
5-
4+ import { InvalidSettingsError } from './errors' ;
5+ import PipelineExecutor from './pipelineExecutor' ;
66export function parseTableFactory ( settings : FullParserSettings ) {
77 const extraColsMapper = extraColsMapperFactory ( settings . extraCols ) ;
8- const allowedColNamesKeys = Object . keys ( settings . allowedColNames ) ;
98
109 const getHeaderRows = ( table : ElementHandle ) => {
1110 return settings . headerRowsSelector
@@ -17,16 +16,13 @@ export function parseTableFactory(settings: FullParserSettings) {
1716 return table . $$ ( settings . bodyRowsSelector ) ;
1817 } ;
1918
20- const getOutputHeaderRow = (
21- excludedKeyIndexes : number [ ] ,
22- nonFoundedColNames : FullParserSettings [ 'allowedColNames' ] ,
23- ) => {
19+ const getOutputHeaderRow = ( excludedKeyIndexes : number [ ] , missingColNames : string [ ] ) => {
2420 const headerRowRaw = Object . values ( settings . allowedColNames ) ;
2521 const sortedHeader = extraColsMapper ( headerRowRaw , 'colName' ) ;
2622
2723 const headerRow = sortedHeader
2824 . filter ( ( _ , index ) => ! excludedKeyIndexes . includes ( index ) )
29- . filter ( ( key ) => ! Object . values ( nonFoundedColNames ) . includes ( key ) ) ;
25+ . filter ( ( key ) => ! missingColNames . includes ( key ) ) ;
3026
3127 return settings . rowValuesAsArray ? headerRow : headerRow . join ( settings . csvSeparator ) ;
3228 } ;
@@ -39,13 +35,12 @@ export function parseTableFactory(settings: FullParserSettings) {
3935 return ( rows : string [ ] ) => rows . length > 0 ;
4036 }
4137 if ( settings . rowValidationPolicy === RowValidationPolicy . EXACT_MATCH ) {
42- const indexesCount = Object . keys ( allowedIndexes ) . length ;
43- return ( rows : string [ ] ) => rows . length === indexesCount ;
38+ return ( rows : string [ ] ) => rows . length === Object . keys ( allowedIndexes ) . length ;
4439 }
4540 throw new InvalidSettingsError ( 'Unknown mode for the "rowValidationPolicy"' ) ;
4641 } ;
4742
48- const filterSortCols =
43+ const getRowsData =
4944 ( allowedIndexes : Record < string , number > ) =>
5045 ( row : ElementHandle ) : Promise < string [ ] > =>
5146 row . $$eval (
@@ -75,87 +70,36 @@ export function parseTableFactory(settings: FullParserSettings) {
7570 return [ ] ;
7671 }
7772
78- const headerRow : ElementHandle =
79- headerRows . length > 0 ? headerRows . shift ( ) ! : bodyRows . shift ( ) ! ;
73+ const headerRow : ElementHandle = headerRows . length > 0 ? headerRows . shift ( ) : bodyRows . shift ( ) ;
74+
75+ const { indexes, getColumnIndex, missingColNames } = await getColumnsInfo (
76+ settings ,
77+ headerRow ,
78+ extraColsMapper ,
79+ ) ;
8080
8181 if ( settings . reverseTraversal ) {
8282 bodyRows . reverse ( ) ;
8383 }
8484
85- // Will be updated during parsing and not found columns will be deleted
86- const nonFoundedColNames = { ...settings . allowedColNames } ;
87-
88- // Sorted by finding which was first visited
89- // is index in which we traverse the table, second is final position
90- const allowedIndexes : Record < string , number > = (
91- await headerRow . $$eval (
92- 'td,th' ,
93- ( cells : Element [ ] , newLine : string ) => {
94- return cells . map ( ( cell ) => ( cell as HTMLTableCellElement ) . innerText . split ( newLine ) ) ;
95- } ,
96- settings . newLine ,
97- )
98- ) . reduce ( ( acc , text : string [ ] , realIndex : number ) => {
99- const colName = String ( settings . colFilter ( text , realIndex ) ) ;
100-
101- if ( settings . allowedColNames . hasOwnProperty ( colName ) ) {
102- delete nonFoundedColNames [ colName ] ;
103-
104- const desiredIndex = allowedColNamesKeys . findIndex ( ( key ) => key === colName ) ;
105- Object . assign ( acc , { [ realIndex ] : desiredIndex } ) ;
106- }
107-
108- return acc ;
109- } , { } ) ;
110-
111- const missingRequiredColumns = diffFromSource (
112- Object . values ( nonFoundedColNames ) ,
113- settings . optionalColNames ,
114- ) ;
115- if ( missingRequiredColumns . length > 0 ) {
116- console . warn ( `Not matched columns are following entries: ` , missingRequiredColumns ) ;
117- throw new MissingRequiredColumnsError (
118- 'Number of filtered columns does not match to required columns count!' ,
119- ) ;
120- }
121-
122- const excludedKeyIndexes : number [ ] = [ ] ;
123- const colKeyToIndexWithExcluded : Map < string , number > = new Map < string , number > ( ) ;
124- extraColsMapper ( allowedColNamesKeys , 'colName' ) . forEach ( ( key , index ) => {
125- colKeyToIndexWithExcluded . set ( key , index ) ;
126- colKeyToIndexWithExcluded . set ( settings . allowedColNames [ key ] || key , index ) ;
127-
128- if ( settings . temporaryColNames . includes ( key ) ) {
129- excludedKeyIndexes . push ( index ) ;
130- }
131- } ) ;
132-
133- const getColumnIndex = ( colName : string ) : number => {
134- const index = colKeyToIndexWithExcluded . get ( colName ) ;
135- if ( index === undefined ) {
136- throw new InvalidColumnError ( `Invalid column name! '${ colName } '` ) ;
137- }
138-
139- return index ;
140- } ;
141-
142- const finalRows = ( await Promise . all ( bodyRows . map ( filterSortCols ( allowedIndexes ) ) ) )
143- . filter ( getRowStructureValidator ( allowedIndexes ) )
144- . map ( ( row ) => extraColsMapper ( row , 'data' ) )
145- . filter ( ( row , index , rows ) => settings . rowValidator ( row , getColumnIndex , index , rows ) )
146- . map ( ( row ) => row . map ( ( cell , index ) => settings . colParser ( cell , index , getColumnIndex ) ) )
147- . map ( ( row ) => {
148- settings . rowTransform ( row , getColumnIndex ) ;
149-
150- const filteredRow = row . filter ( ( _ , index ) => ! excludedKeyIndexes . includes ( index ) ) ;
151- return settings . rowValuesAsArray ? filteredRow : filteredRow . join ( settings . csvSeparator ) ;
152- } ) ;
85+ const finalRows = new PipelineExecutor <
86+ string [ ] [ ] ,
87+ typeof settings . rowValuesAsArray extends true ? string [ ] [ ] : string [ ]
88+ > ( )
89+ . addFilter ( getRowStructureValidator ( indexes . allowed ) )
90+ . addMap ( ( row ) => extraColsMapper ( row , 'data' ) )
91+ . addFilter ( ( row , index , rows ) => settings . rowValidator ( row , getColumnIndex , index , rows ) )
92+ . addMap ( ( row ) => row . map ( ( cell , index ) => settings . colParser ( cell , index , getColumnIndex ) ) )
93+ . addTransform ( ( row ) => settings . rowTransform ( row , getColumnIndex ) )
94+ . addMap ( ( row ) => row . filter ( ( _ , index : number ) => ! indexes . excluded . includes ( index ) ) )
95+ . addMap ( ( row ) => ( settings . rowValuesAsArray ? row : row . join ( settings . csvSeparator ) ) )
96+ . execute ( await Promise . all ( bodyRows . map ( getRowsData ( indexes . allowed ) ) ) ) ;
15397
15498 if ( addHeader ) {
155- const headerRow = getOutputHeaderRow ( excludedKeyIndexes , nonFoundedColNames ) ;
156- finalRows . unshift ( headerRow ) ;
99+ const headerRow = getOutputHeaderRow ( indexes . excluded , missingColNames ) ;
100+ finalRows . unshift ( headerRow as string ) ;
157101 }
158102
159- return finalRows as typeof settings . rowValuesAsArray extends true ? string [ ] [ ] : string [ ] ;
103+ return finalRows ;
160104 } ;
161105}
0 commit comments