22
33namespace WordPress \DataLiberation \URL ;
44
5- use function WordPress \Encoding \_wp_scan_utf8 ;
6- use function WordPress \Encoding \_wp_scrub_utf8_fallback ;
75use function WordPress \Encoding \utf8_codepoint_at ;
86use function WordPress \Encoding \codepoint_to_utf8_bytes ;
7+ use function WordPress \Encoding \compat \_wp_scan_utf8 ;
8+ use function WordPress \Encoding \wp_scrub_utf8 ;
99
1010/**
1111 * Tokenizes CSS according to the CSS Syntax Level 3 specification.
@@ -742,6 +742,32 @@ public function get_token_value() {
742742 return $ this ->token_value ;
743743 }
744744
/**
 * Reports whether the current token's value begins with the "data:" scheme.
 *
 * The check is performed on the raw bytes of the CSS source at the token's
 * value offset and ignores ASCII letter case, so "data:", "DATA:" and
 * "DaTa:" all match.
 *
 * Intended for URL and STRING tokens. A token without recorded value
 * boundaries always yields false (presumably every token type other than
 * URL and STRING - confirm against the tokenizer).
 *
 * @return bool True when the raw token value starts with "data:" (case-insensitive).
 */
public function is_data_uri(): bool {
	$scheme        = 'data:';
	$scheme_length = strlen( $scheme );

	// No value boundaries, or a value too short to hold the scheme prefix.
	if (
		null === $this->token_value_starts_at ||
		null === $this->token_value_length ||
		$this->token_value_length < $scheme_length
	) {
		return false;
	}

	// Compare the prefix in place; no substring of the source is allocated.
	return 0 === substr_compare(
		$this->css,
		$scheme,
		$this->token_value_starts_at,
		$scheme_length,
		true
	);
}
770+
745771 /**
746772 * Gets the token start at.
747773 *
@@ -788,40 +814,13 @@ public function get_token_value_length(): ?int {
788814 }
789815
790816 /**
791- * Determines whether the current token is a data URI.
792- *
793- * Only meaningful for URL and STRING tokens. Returns false for all other token types.
794- *
795- * @return bool Whether the current token value starts with "data:" (case-insensitive).
796- */
797- public function is_data_uri (): bool {
798- if ( null === $ this ->token_value_starts_at || null === $ this ->token_value_length ) {
799- return false ;
800- }
801-
802- if ( $ this ->token_value_length < 5 ) {
803- return false ;
804- }
805-
806- $ offset = $ this ->token_value_starts_at ;
807- return (
808- ( 'd ' === $ this ->css [ $ offset ] || 'D ' === $ this ->css [ $ offset ] ) &&
809- ( 'a ' === $ this ->css [ $ offset + 1 ] || 'A ' === $ this ->css [ $ offset + 1 ] ) &&
810- ( 't ' === $ this ->css [ $ offset + 2 ] || 'T ' === $ this ->css [ $ offset + 2 ] ) &&
811- ( 'a ' === $ this ->css [ $ offset + 3 ] || 'A ' === $ this ->css [ $ offset + 3 ] ) &&
812- ': ' === $ this ->css [ $ offset + 4 ]
813- );
814- }
815-
816- /**
817- * Sets the value of the current token.
817+ * Sets the value of the current URL token.
818818 *
819- * This method allows modifying URL or STRING token values . The new value
820- * will be properly escaped according to CSS syntax rules.
819+ * This method allows modifying the URL value in url() tokens . The new value
820+ * will be properly escaped according to CSS URL syntax rules.
821821 *
822- * Supported token types:
823- * - TOKEN_URL: URL value in url() tokens
824- * - TOKEN_STRING: String value (properly quoted and escaped)
822+ * Currently only URL tokens are supported. Attempting to set the value on
823+ * other token types will return false.
825824 *
826825 * Example:
827826 *
@@ -835,32 +834,22 @@ public function is_data_uri(): bool {
835834 * echo $processor->get_updated_css();
836835 * // Outputs: background: url(new.jpg);
837836 *
838- * @param string $new_value The new value (should not include url() wrapper or quotes ).
837+ * @param string $new_value The new URL value (should not include url() wrapper).
839838 * @return bool Whether the value was successfully updated.
840839 */
841840 public function set_token_value ( string $ new_value ): bool {
841+ // Only URL tokens are currently supported.
842+ if ( self ::TOKEN_URL !== $ this ->token_type ) {
843+ return false ;
844+ }
845+
842846 // Ensure we have valid token value boundaries.
843847 if ( null === $ this ->token_value_starts_at || null === $ this ->token_value_length ) {
844848 return false ;
845849 }
846850
847- $ escaped_value = null ;
848-
849- switch ( $ this ->token_type ) {
850- case self ::TOKEN_URL :
851- // Escape the URL value for quoted URL syntax.
852- $ escaped_value = $ this ->escape_url_value ( $ new_value );
853- break ;
854-
855- case self ::TOKEN_STRING :
856- // Escape the string value for quoted string syntax.
857- $ escaped_value = $ this ->escape_string_value ( $ new_value );
858- break ;
859-
860- default :
861- // Unsupported token type.
862- return false ;
863- }
851+ // Escape the URL value for quoted URL syntax.
852+ $ escaped_value = $ this ->escape_url_value ( $ new_value );
864853
865854 // Queue the lexical update.
866855 $ this ->lexical_updates [] = array (
@@ -935,56 +924,6 @@ private function escape_url_value( string $unescaped ): string {
935924 return '" ' . $ escaped . '" ' ;
936925 }
937926
938- /**
939- * Escapes a string value for use in string token replacement.
940- *
941- * For STRING tokens, the value boundaries point to the content between quotes,
942- * so we must NOT add quotes ourselves - they're already in the source.
943- *
944- * @param string $unescaped Unescaped string value.
945- * @return string Escaped string value without surrounding quotes.
946- */
947- private function escape_string_value ( string $ unescaped ): string {
948- $ escaped = '' ;
949- $ at = 0 ;
950- while ( $ at < strlen ( $ unescaped ) ) {
951- $ safe_len = strcspn ( $ unescaped , "\n\r\f\\\"" , $ at );
952- if ( $ safe_len > 0 ) {
953- $ escaped .= substr ( $ unescaped , $ at , $ safe_len );
954- $ at += $ safe_len ;
955- continue ;
956- }
957-
958- $ unsafe_char = $ unescaped [ $ at ];
959- switch ( $ unsafe_char ) {
960- case "\r" :
961- ++$ at ;
962- $ escaped .= '\\a ' ;
963- if ( strlen ( $ unescaped ) > $ at + 1 && "\n" === $ unescaped [ $ at + 1 ] ) {
964- ++$ at ;
965- }
966- break ;
967- case "\f" :
968- case "\n" :
969- ++$ at ;
970- $ escaped .= '\\a ' ;
971- break ;
972- case '\\' :
973- ++$ at ;
974- $ escaped .= '\\5C ' ;
975- break ;
976- case '" ' :
977- ++$ at ;
978- $ escaped .= '\\22 ' ;
979- break ;
980- default :
981- _doing_it_wrong ( __METHOD__ , 'Unexpected character in string value: ' . $ unsafe_char , '1.0.0 ' );
982- break ;
983- }
984- }
985- return $ escaped ;
986- }
987-
988927 /**
989928 * Returns the CSS with all modifications applied.
990929 *
@@ -1615,7 +1554,7 @@ private function consume_ident_start_codepoint( $at ): int {
16151554 */
16161555 private function decode_string_or_url ( int $ start , int $ length ): string {
16171556 // Fast path: check if any processing is needed.
1618- $ slice = _wp_scrub_utf8_fallback ( substr ( $ this ->css , $ start , $ length ) );
1557+ $ slice = wp_scrub_utf8 ( substr ( $ this ->css , $ start , $ length ) );
16191558 $ special_chars = "\\\r\f\x00" ;
16201559 if ( false === strpbrk ( $ slice , $ special_chars ) ) {
16211560 // No special chars - return raw substring (almost zero allocations).
@@ -1885,86 +1824,4 @@ private function check_if_3_code_points_start_an_ident_sequence( int $offset ):
18851824
18861825 return $ this ->consume_ident_start_codepoint ( $ offset ) > 0 || $ this ->is_valid_escape ( $ offset );
18871826 }
1888-
1889- /**
1890- * Decodes CSS escape sequences in a string.
1891- *
1892- * This is a utility method that can be used by other classes to decode
1893- * CSS escapes in extracted values. It implements the same logic as the
1894- * incremental escape parsing done during tokenization.
1895- *
1896- * Handles:
1897- * - Hex escapes: \20 (space), \1F600 (emoji), up to 6 hex digits
1898- * - Character escapes: \(, \), \", \', \\
1899- * - Whitespace after hex escapes (single whitespace consumed)
1900- * - Escaped newlines (consumed, not included in output)
1901- *
1902- * @see https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
1903- *
1904- * @param string $value Encoded string with CSS escapes.
1905- * @return string Decoded string with escapes resolved to their actual characters.
1906- */
1907- public static function decode_css_escapes ( string $ value ): string {
1908- $ length = strlen ( $ value );
1909- $ result = '' ;
1910- $ at = 0 ;
1911-
1912- while ( $ at < $ length ) {
1913- $ span = strcspn ( $ value , '\\' , $ at );
1914- if ( $ span > 0 ) {
1915- $ result .= substr ( $ value , $ at , $ span );
1916- $ at += $ span ;
1917- }
1918-
1919- if ( $ at >= $ length ) {
1920- break ;
1921- }
1922-
1923- ++$ at ;
1924- if ( $ at >= $ length ) {
1925- break ;
1926- }
1927-
1928- $ hex_len = strspn ( $ value , '0123456789abcdefABCDEF ' , $ at );
1929- if ( $ hex_len > 6 ) {
1930- $ hex_len = 6 ;
1931- }
1932-
1933- if ( $ hex_len > 0 ) {
1934- $ hex = substr ( $ value , $ at , $ hex_len );
1935- $ result .= codepoint_to_utf8_bytes ( hexdec ( $ hex ) );
1936- $ at += $ hex_len ;
1937-
1938- $ ws_len = strspn ( $ value , " \n\r\t\f" , $ at );
1939- if ( $ ws_len > 0 ) {
1940- if ( $ at + 1 < $ length && "\r" === $ value [ $ at ] && "\n" === $ value [ $ at + 1 ] ) {
1941- $ at += 2 ;
1942- } else {
1943- $ at += 1 ;
1944- }
1945- }
1946- continue ;
1947- }
1948-
1949- $ next = $ value [ $ at ];
1950-
1951- if ( "\n" === $ next || "\f" === $ next ) {
1952- ++$ at ;
1953- continue ;
1954- }
1955-
1956- if ( "\r" === $ next ) {
1957- ++$ at ;
1958- if ( $ at < $ length && "\n" === $ value [ $ at ] ) {
1959- ++$ at ;
1960- }
1961- continue ;
1962- }
1963-
1964- $ result .= $ next ;
1965- ++$ at ;
1966- }
1967-
1968- return $ result ;
1969- }
19701827}
0 commit comments