1
1
import collections
2
2
import string
3
- from typing import Dict , List
3
+ from typing import Dict , List , Optional
4
4
from selenium .webdriver .remote .webelement import WebElement
5
5
6
6
7
def data_from_row(row: WebElement, cell_tag: str = "td",
                  cell_xpath: Optional[str] = None) -> List[str]:
    """Extract data from a row and return it as a list.

    Args:
        row (WebElement): The row element.
        cell_tag (str, optional): The HTML tag associated with the row cells. Defaults to "td".
        cell_xpath (str, optional): The XPath expression associated with the row cells.
            Defaults to None. If provided (non-empty), it takes precedence over `cell_tag`.

    Returns:
        list: List of strings with the text of each matched cell.
    """
    # The `find_elements_by_*` helpers were deprecated in Selenium 4 and later
    # removed; `find_elements(by, value)` is available in both Selenium 3 and 4.
    # The locator strategies are given by their string values ("xpath" and
    # "tag name", i.e. the values of By.XPATH and By.TAG_NAME) so this function
    # does not need an additional import.
    if cell_xpath:
        strategy, selector = "xpath", cell_xpath
    else:
        # A missing or empty XPath falls back to the tag-based lookup.
        strategy, selector = "tag name", cell_tag

    return [cell.text for cell in row.find_elements(strategy, selector)]
@@ -48,14 +55,17 @@ def sanitize_header(labels: List[str]):
48
55
49
56
50
57
def table_to_dict (table : WebElement , has_header : bool = True ,
51
- skip_rows : int = 0 , header_tag : str = "th" ) -> List [Dict ]:
58
+ skip_rows : int = 0 , header_tag : str = "th" ,
59
+ cell_xpath : Optional [str ] = None ) -> List [Dict ]:
52
60
"""Convert a table WebElement to a list of dicts.
53
61
54
62
Args:
55
63
table (WebElement): The table element.
56
64
has_header (bool, optional): Whether or not to parse a header. Defaults to True.
57
65
skip_rows (int, optional): Number of rows to skip from the top. Defaults to 0.
58
66
header_tag (str, optional): The HTML tag associated with the header cell. Defaults to "th".
67
+ cell_xpath (str, optional): Optional cell XPath selector for complex row constructions.
68
+ If `cell_xpath` is not provided, the row data will come from `<td>` elements.
59
69
60
70
Returns:
61
71
list: List with dict for each row.
@@ -68,6 +78,10 @@ def table_to_dict(table: WebElement, has_header: bool = True,
68
78
if skip_rows :
69
79
rows = rows [skip_rows :]
70
80
81
+ if cell_xpath and not cell_xpath .startswith ('.' ):
82
+ # Convert into relative xpath
83
+ cell_xpath = f'.{ cell_xpath } '
84
+
71
85
# Parse header labels
72
86
if has_header :
73
87
# Read header labels
@@ -78,13 +92,18 @@ def table_to_dict(table: WebElement, has_header: bool = True,
78
92
rows = rows [1 :]
79
93
else :
80
94
# Make up header labels
81
- num_cols = len (rows [0 ].find_elements_by_tag_name ("td" ))
95
+ if cell_xpath :
96
+ cols = rows [0 ].find_elements_by_xpath (cell_xpath )
97
+ else :
98
+ cols = rows [0 ].find_elements_by_tag_name ("td" )
99
+
100
+ num_cols = len (cols )
82
101
labels = [f"col_{ i } " for i in range (num_cols )]
83
102
84
103
# Assemble output dictionary
85
104
out_list = []
86
105
for row in rows :
87
- row_data = data_from_row (row )
106
+ row_data = data_from_row (row , cell_xpath = cell_xpath )
88
107
out_list .append (dict (zip (labels , row_data )))
89
108
90
109
return out_list
0 commit comments