Skip to content

Commit 90cad69

Browse files
authored
Merge pull request #65 from hhslepicka/improve-parser
ENH: Improve table to dict parser to accept custom XPath for table cell.
2 parents 6ddb8bb + aaa9d03 commit 90cad69

File tree

1 file changed

+24
-5
lines changed

1 file changed

+24
-5
lines changed

botcity/web/parsers.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,26 @@
11
import collections
22
import string
3-
from typing import Dict, List
3+
from typing import Dict, List, Optional
44
from selenium.webdriver.remote.webelement import WebElement
55

66

7-
def data_from_row(row: WebElement, cell_tag="td") -> List[str]:
7+
def data_from_row(row: WebElement, cell_tag="td", cell_xpath=None) -> List[str]:
88
"""Extract data from a row and return it as a list.
99
1010
Args:
1111
row (WebElement): The row element.
1212
cell_tag (str, optional): The HTML tag associated with the row cells. Defaults to "td".
13+
cell_xpath (str, optional): The XPath expression associated with the row cells. Defaults to None.
14+
If provided, overrides the `cell_tag` definition.
1315
1416
Returns:
1517
list: List of strings with the contents.
1618
"""
19+
if cell_xpath:
20+
return [
21+
col.text for col in row.find_elements_by_xpath(cell_xpath)
22+
]
23+
1724
return [
1825
col.text for col in row.find_elements_by_tag_name(cell_tag)
1926
]
@@ -48,14 +55,17 @@ def sanitize_header(labels: List[str]):
4855

4956

5057
def table_to_dict(table: WebElement, has_header: bool = True,
51-
skip_rows: int = 0, header_tag: str = "th") -> List[Dict]:
58+
skip_rows: int = 0, header_tag: str = "th",
59+
cell_xpath: Optional[str] = None) -> List[Dict]:
5260
"""Convert a table WebElement to a list of dicts (one dict per row).
5361
5462
Args:
5563
table (WebElement): The table element.
5664
has_header (bool, optional): Whether or not to parse a header. Defaults to True.
5765
skip_rows (int, optional): Number of rows to skip from the top. Defaults to 0.
5866
header_tag (str, optional): The HTML tag associated with the header cell. Defaults to "th".
67+
cell_xpath (str, optional): Optional cell XPath selector for complex row constructions.
68+
If `cell_xpath` is not provided, the row data will come from `<td>` elements.
5969
6070
Returns:
6171
list: List with dict for each row.
@@ -68,6 +78,10 @@ def table_to_dict(table: WebElement, has_header: bool = True,
6878
if skip_rows:
6979
rows = rows[skip_rows:]
7080

81+
if cell_xpath and not cell_xpath.startswith('.'):
82+
# Convert into relative xpath
83+
cell_xpath = f'.{cell_xpath}'
84+
7185
# Parse header labels
7286
if has_header:
7387
# Read header labels
@@ -78,13 +92,18 @@ def table_to_dict(table: WebElement, has_header: bool = True,
7892
rows = rows[1:]
7993
else:
8094
# Make up header labels
81-
num_cols = len(rows[0].find_elements_by_tag_name("td"))
95+
if cell_xpath:
96+
cols = rows[0].find_elements_by_xpath(cell_xpath)
97+
else:
98+
cols = rows[0].find_elements_by_tag_name("td")
99+
100+
num_cols = len(cols)
82101
labels = [f"col_{i}" for i in range(num_cols)]
83102

84103
# Assemble output dictionary
85104
out_list = []
86105
for row in rows:
87-
row_data = data_from_row(row)
106+
row_data = data_from_row(row, cell_xpath=cell_xpath)
88107
out_list.append(dict(zip(labels, row_data)))
89108

90109
return out_list

0 commit comments

Comments
 (0)