1+ import markdown
2+ from bs4 import BeautifulSoup , NavigableString , Tag
3+ import string
4+
5+
class Helper:
    """Static utilities for parsing GitHub-Flavored Markdown sections that
    contain a table, and converting them into JSON-friendly structures."""

    @staticmethod
    def parse_gfm_section(html_content):
        """
        Parse a GFM section containing a table and surrounding content.

        Args:
            html_content: Markdown source text. NOTE: despite the parameter
                name (kept for backward compatibility), this is Markdown,
                not HTML — it is rendered to HTML internally via the
                'extra' extension, which enables table support.

        Returns:
            dict with keys:
                "before_html": HTML preceding the first table (str)
                "columns":     header-cell texts (list[str])
                "rows_html":   per-row lists of raw ``<td>`` HTML (list[list[str]])
                "after_html":  HTML following the table (str)
        """
        html = markdown.markdown(html_content, extensions=['extra'])
        soup = BeautifulSoup(html, "html.parser")

        table = soup.find('table')
        if not table:
            # No table: the entire rendered document counts as "before".
            return {"before_html": html, "columns": [], "rows_html": [], "after_html": ''}

        # find_previous_siblings() yields nearest-first; reverse to restore
        # document order before joining.
        before_html = ''.join(reversed([str(elem) for elem in table.find_previous_siblings()]))

        # Siblings after the table are already in document order.
        after_html = ''.join(str(elem) for elem in table.find_next_siblings())

        rows = table.find_all('tr')

        # Fix: take header cells from the first row only, so a stray <th>
        # inside the body cannot corrupt the column list. Also guards the
        # (degenerate) case of a table with no rows at all.
        headers = [th.get_text(strip=True) for th in rows[0].find_all('th')] if rows else []

        # Every row after the header; keep raw cell HTML for parse_cell().
        rows_html = [[str(td) for td in tr.find_all('td')] for tr in rows[1:]]

        return {
            "before_html": before_html,
            "columns": headers,
            "rows_html": rows_html,
            "after_html": after_html,
        }

    @staticmethod
    def parse_cell(html_td):
        """
        Convert a table-cell HTML fragment into plain text or a link dict.

        Args:
            html_td: raw HTML of a single ``<td>`` element.

        Returns:
            str: the stripped cell text, when the cell has no anchor; or
            dict: {"url", "link_text"} for a plain link, plus
                  {"img_src", "title"} when the anchor wraps an ``<img>``.
        """
        soup = BeautifulSoup(html_td, "html.parser")
        anchor = soup.find('a')
        if anchor is None:
            return soup.get_text(strip=True)

        cell = {
            "url": anchor.get('href', ''),
            "link_text": anchor.get_text(strip=True),
        }
        img = anchor.find('img')
        if img is not None:
            cell["img_src"] = img.get('src', '')
            cell["title"] = img.get('title', '')
        return cell

    @staticmethod
    def parse_html_parts(html_fragment):
        """
        Flatten an HTML fragment into a list of parts, each one of:
            - {"text": "..."}
            - {"link": "url", "text": "..."}
            - {"img_src": "url", "alt": "...", "title": "..."}

        Empty and punctuation-only text nodes (e.g. "|" separators left
        over from table markup) are dropped.
        """
        soup = BeautifulSoup(html_fragment, 'html.parser')
        parts = []

        def walk(node):
            if isinstance(node, NavigableString):
                text = str(node).strip()
                # Skip blanks and pure-punctuation fragments (separators).
                if text and not all(ch in string.punctuation for ch in text):
                    parts.append({"text": text})
            elif isinstance(node, Tag):
                if node.name == 'a':
                    parts.append({
                        "link": node.get('href', ''),
                        "text": node.get_text(strip=True),
                    })
                elif node.name == 'img':
                    parts.append({
                        "img_src": node.get('src', ''),
                        "alt": node.get('alt', ''),
                        "title": node.get('title', ''),
                    })
                else:
                    # Recurse into children of any other (container) tag.
                    for child in node.children:
                        walk(child)

        for element in soup.contents:
            walk(element)

        return parts

    @staticmethod
    def section_to_json(section_result):
        """
        Convert the output of :meth:`parse_gfm_section` into structured JSON.

        Args:
            section_result: dict as returned by ``parse_gfm_section``.

        Returns:
            {"before": [...parts...], "table": [row dicts], "after": [...parts...]}
            where each table row maps column name -> parsed cell. ``zip``
            silently truncates rows whose cell count differs from the
            header count.
        """
        cols = section_result.get('columns', [])
        table_rows = [
            dict(zip(cols, (Helper.parse_cell(cell_html) for cell_html in row_html)))
            for row_html in section_result.get('rows_html', [])
        ]

        return {
            "before": Helper.parse_html_parts(section_result.get('before_html', '')),
            "table": table_rows,
            "after": Helper.parse_html_parts(section_result.get('after_html', '')),
        }