python
diff --git a/‎Doc/library/html.parser.rst‎
Lines changed: 20 additions & 13 deletions b/‎Doc/library/html.parser.rst‎
Lines changed: 20 additions & 13 deletions
diff --git a/‎Lib/html/parser.py‎
Lines changed: 18 additions & 6 deletions b/‎Lib/html/parser.py‎
Lines changed: 18 additions & 6 deletions
@@ -15,14 +15,18 @@
 This module defines a class :class:`HTMLParser` which serves as the basis for
 parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
 
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
 
    Create a parser instance able to parse invalid markup.
 
-   If *convert_charrefs* is ``True`` (the default), all character
-   references (except the ones in ``script``/``style`` elements) are
+   If *convert_charrefs* is true (the default), all character
+   references (except the ones in elements like ``script`` and ``style``) are
    automatically converted to the corresponding Unicode characters.
 
+   If *scripting* is false (the default), the content of the ``noscript``
+   element is parsed normally; if it's true, it's returned as is without
+   being parsed.
+
    An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
    when start tags, end tags, text, comments, and other markup elements are
    encountered.  The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +41,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    .. versionchanged:: 3.5
       The default value for argument *convert_charrefs* is now ``True``.
 
+   .. versionchanged:: 3.11.15
+      Added the *scripting* parameter.
+
 
 Example HTML Parser Application
 -------------------------------
@@ -159,24 +166,24 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
 .. method:: HTMLParser.handle_data(data)
 
    This method is called to process arbitrary data (e.g. text nodes and the
-   content of ``<script>...</script>`` and ``<style>...</style>``).
+   content of elements like ``script`` and ``style``).
 
 
 .. method:: HTMLParser.handle_entityref(name)
 
    This method is called to process a named character reference of the form
    ``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
-   (e.g. ``'gt'``).  This method is never called if *convert_charrefs* is
-   ``True``.
+   (e.g. ``'gt'``).
+   This method is only called if *convert_charrefs* is false.
 
 
 .. method:: HTMLParser.handle_charref(name)
 
    This method is called to process decimal and hexadecimal numeric character
    references of the form :samp:`&#{NNN};` and :samp:`&#x{NNN};`.  For example, the decimal
    equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
-   in this case the method will receive ``'62'`` or ``'x3E'``.  This method
-   is never called if *convert_charrefs* is ``True``.
+   in this case the method will receive ``'62'`` or ``'x3E'``.
+   This method is only called if *convert_charrefs* is false.
 
 
 .. method:: HTMLParser.handle_comment(data)
@@ -284,8 +291,8 @@ Parsing an element with a few attributes and a title::
    Data     : Python
    End tag  : h1
 
-The content of ``script`` and ``style`` elements is returned as is, without
-further parsing::
+The content of elements like ``script`` and ``style`` is returned as is,
+without further parsing::
 
    >>> parser.feed('<style type="text/css">#python { color: green }</style>')
    Start tag: style
@@ -294,10 +301,10 @@ further parsing::
    End tag  : style
 
    >>> parser.feed('<script type="text/javascript">'
-   ...             'alert("<strong>hello!</strong>");</script>')
+   ...             'alert("<strong>hello! &#9786;</strong>");</script>')
    Start tag: script
         attr: ('type', 'text/javascript')
-   Data     : alert("<strong>hello!</strong>");
+   Data     : alert("<strong>hello! &#9786;</strong>");
    End tag  : script
 
 Parsing comments::
@@ -317,7 +324,7 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
 
 Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
 :meth:`~HTMLParser.handle_data` might be called more than once
-(unless *convert_charrefs* is set to ``True``)::
+if *convert_charrefs* is false::
 
    >>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
    ...     parser.feed(chunk)
 
@@ -109,16 +109,24 @@ class HTMLParser(_markupbase.ParserBase):
     argument.
     """
 
-    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    # See the HTML5 specs section "13.4 Parsing HTML fragments".
+    # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+    # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
+    CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
     RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
-    def __init__(self, *, convert_charrefs=True):
+    def __init__(self, *, convert_charrefs=True, scripting=False):
         """Initialize and reset this instance.
 
-        If convert_charrefs is True (the default), all character references
+        If convert_charrefs is true (the default), all character references
         are automatically converted to the corresponding Unicode characters.
+
+        If *scripting* is false (the default), the content of the
+        ``noscript`` element is parsed normally; if it's true,
+        it's returned as is without being parsed.
         """
         self.convert_charrefs = convert_charrefs
+        self.scripting = scripting
         self.reset()
 
     def reset(self):
@@ -153,7 +161,9 @@ def get_starttag_text(self):
     def set_cdata_mode(self, elem, *, escapable=False):
         self.cdata_elem = elem.lower()
         self._escapable = escapable
-        if escapable and not self.convert_charrefs:
+        if self.cdata_elem == 'plaintext':
+            self.interesting = re.compile(r'\Z')
+        elif escapable and not self.convert_charrefs:
             self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                           re.IGNORECASE|re.ASCII)
         else:
@@ -434,8 +444,10 @@ def parse_starttag(self, i):
             self.handle_startendtag(tag, attrs)
         else:
             self.handle_starttag(tag, attrs)
-            if tag in self.CDATA_CONTENT_ELEMENTS:
-                self.set_cdata_mode(tag)
+            if (tag in self.CDATA_CONTENT_ELEMENTS or
+                (self.scripting and tag == "noscript") or
+                tag == "plaintext"):
+                self.set_cdata_mode(tag, escapable=False)
             elif tag in self.RCDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag, escapable=True)
         return endpos