Skip to content

gh-135661: Fix CDATA section parsing in HTMLParser #135665

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
28 changes: 26 additions & 2 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._support_cdata = True
self._escapable = True
super().reset()

Expand Down Expand Up @@ -183,6 +184,19 @@ def clear_cdata_mode(self):
self.cdata_elem = None
self._escapable = True

def _set_support_cdata(self, flag=True):
"""Enable or disable support of the CDATA sections.
If enabled, "<[CDATA[" starts a CDATA section which ends with "]]>".
If disabled, "<[CDATA[" starts a bogus comments which ends with ">".

This method is not called by default. Its purpose is to be called
in custom handle_starttag() and handle_endtag() methods, with
value that depends on the adjusted current node.
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
for details.
"""
self._support_cdata = flag

# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
Expand Down Expand Up @@ -258,7 +272,10 @@ def goahead(self, end):
break
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i):
self.unknown_decl(rawdata[i+3:])
if self._support_cdata:
self.unknown_decl(rawdata[i+3:])
else:
self.handle_comment(rawdata[i+1:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
elif startswith("<!", i):
Expand Down Expand Up @@ -334,7 +351,14 @@ def parse_html_declaration(self, i):
# this case is actually already handled in goahead()
return self.parse_comment(i)
elif rawdata[i:i+9] == '<![CDATA[':
return self.parse_marked_section(i)
if self._support_cdata:
j = rawdata.find(']]>', i+9)
if j < 0:
return -1
self.unknown_decl(rawdata[i+3: j])
return j + 3
else:
return self.parse_bogus_comment(i)
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)
Expand Down
86 changes: 63 additions & 23 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@

class EventCollector(html.parser.HTMLParser):

def __init__(self, *args, **kw):
def __init__(self, *args, autocdata=False, **kw):
self.autocdata = autocdata
self.events = []
self.append = self.events.append
html.parser.HTMLParser.__init__(self, *args, **kw)
if autocdata:
self._set_support_cdata(False)

def get_events(self):
# Normalize the list of events so that buffer artefacts don't
Expand All @@ -34,12 +37,16 @@ def get_events(self):

def handle_starttag(self, tag, attrs):
self.append(("starttag", tag, attrs))
if self.autocdata and tag == 'svg':
self._set_support_cdata(True)

def handle_startendtag(self, tag, attrs):
self.append(("startendtag", tag, attrs))

def handle_endtag(self, tag):
self.append(("endtag", tag))
if self.autocdata and tag == 'svg':
self._set_support_cdata(False)

# all other markup

Expand Down Expand Up @@ -767,10 +774,6 @@ def test_eof_in_declarations(self):
('<!', [('comment', '')]),
('<!-', [('comment', '-')]),
('<![', [('comment', '[')]),
('<![CDATA[', [('unknown decl', 'CDATA[')]),
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
Expand All @@ -783,6 +786,18 @@ def test_eof_in_declarations(self):
for html, expected in data:
self._run_check(html, expected)

@support.subTests('content', ['', 'x', 'x]', 'x]]'])
def test_eof_in_cdata(self, content):
self._run_check('<![CDATA[' + content,
[('unknown decl', 'CDATA[' + content)])
self._run_check('<![CDATA[' + content,
[('comment', '![CDATA[' + content)],
collector=EventCollector(autocdata=True))
self._run_check('<svg><text y="100"><![CDATA[' + content,
[('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[' + content)])

def test_bogus_comments(self):
html = ('<!ELEMENT br EMPTY>'
'<! not really a comment >'
Expand Down Expand Up @@ -845,28 +860,53 @@ def test_broken_condcoms(self):
]
self._run_check(html, expected)

def test_cdata_declarations(self):
# More tests should be added. See also "8.2.4.42. Markup
# declaration open state", "8.2.4.69. CDATA section state",
# and issue 32876
html = ('<![CDATA[just some plain text]]>')
expected = [('unknown decl', 'CDATA[just some plain text')]
@support.subTests('content', [
'just some plain text',
'<!-- not a comment -->',
'&not-an-entity-ref;',
"<not a='start tag'>",
'',
'[[I have many brackets]]',
'I have a > in the middle',
'I have a ]] in the middle',
'] ]>',
']] >',
('\n'
' if (a < b && a > b) {\n'
' printf("[<marquee>How?</marquee>]");\n'
' }\n'),
])
def test_cdata_section_content(self, content):
# See "13.2.5.42 Markup declaration open state",
# "13.2.5.69 CDATA section state", and issue bpo-32876.
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
expected = [
('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[' + content),
('endtag', 'text'),
('endtag', 'svg'),
]
self._run_check(html, expected)
self._run_check(html, expected, collector=EventCollector(autocdata=True))

def test_cdata_declarations_multiline(self):
html = ('<code><![CDATA['
' if (a < b && a > b) {'
' printf("[<marquee>How?</marquee>]");'
' }'
']]></code>')
def test_cdata_section(self):
# See "13.2.5.42 Markup declaration open state".
html = ('<![CDATA[foo<br>bar]]>'
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
'<![CDATA[foo<br>bar]]>')
expected = [
('starttag', 'code', []),
('unknown decl',
'CDATA[ if (a < b && a > b) { '
'printf("[<marquee>How?</marquee>]"); }'),
('endtag', 'code')
('comment', '[CDATA[foo<br'),
('data', 'bar]]>'),
('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[foo<br>bar'),
('endtag', 'text'),
('endtag', 'svg'),
('comment', '[CDATA[foo<br'),
('data', 'bar]]>'),
]
self._run_check(html, expected)
self._run_check(html, expected, collector=EventCollector(autocdata=True))

def test_convert_charrefs_dropped_text(self):
# #23144: make sure that all the events are triggered when
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
Comment on lines +1 to +2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
By default, :meth:`~HTMLParser.handle_comment` is called for CDATA.
The old behavior (calling :meth:`~HTMLParser.unknown_decl`) can be restored
using a new method, :meth:`~HTMLParser.support_cdata`.

Add private method ``_set_support_cdata()`` which can be used to specify
how to parse ``<[CDATA[`` -- as a CDATA section in foreign content
(SVG or MathML) or as a bogus comment in the HTML namespace.
Loading