Skip to content

8364007: Add no-argument codePointCount method to CharSequence and String #26461

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
18 changes: 18 additions & 0 deletions src/java.base/share/classes/java/lang/AbstractStringBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,24 @@ public int codePointCount(int beginIndex, int endIndex) {
return StringUTF16.codePointCountSB(value, beginIndex, endIndex);
}

/**
 * Returns the number of Unicode code points in
 * this sequence. Unpaired surrogates count
 * as one code point each.
 *
 * @return the number of Unicode code points in this sequence
 * @since 26
 */
public int codePointCount() {
    // Read each field once so the coder test and the scan below
    // operate on the same snapshot of coder, count and value.
    byte coder = this.coder;
    int count = this.count;
    byte[] value = this.value;
    if (isLatin1(coder)) {
        // No scan is needed for Latin-1 content: the char count is the answer.
        return count;
    }
    // Delegate to the same StringUTF16 helper that codePointCount(int, int)
    // uses in this class, so both overloads scan the backing array identically.
    // NOTE(review): the "SB" variant is presumed to validate the range against
    // value before reading it -- confirm in StringUTF16.
    return StringUTF16.codePointCountSB(value, 0, count);
}

/**
* Returns the index within this sequence that is offset from the
* given {@code index} by {@code codePointOffset} code
Expand Down
25 changes: 25 additions & 0 deletions src/java.base/share/classes/java/lang/CharSequence.java
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,31 @@ public int nextInt() {
false);
}

/**
 * Counts the Unicode code points contained in this sequence.
 * A surrogate that is not part of a well-formed high/low pair
 * is counted as one code point on its own.
 *
 * @return the number of Unicode code points in this sequence
 * @since 26
 */
public default int codePointCount() {
    final int len = length();
    int surrogatePairs = 0;
    int idx = 0;

    // Stop one char short of the end: the final char can never start a pair.
    // For an empty sequence the bound is -1, so the loop body is never entered.
    while (idx < len - 1) {
        final char current = charAt(idx);
        idx++;
        if (Character.isHighSurrogate(current) && Character.isLowSurrogate(charAt(idx))) {
            // A complete pair occupies two chars but is a single code point.
            surrogatePairs++;
            idx++;
        }
    }

    return len - surrogatePairs;
}

/**
* Compares two {@code CharSequence} instances lexicographically. Returns a
* negative value, zero, or a positive value if the first sequence is lexicographically
Expand Down
37 changes: 37 additions & 0 deletions src/java.base/share/classes/java/lang/Character.java
Original file line number Diff line number Diff line change
Expand Up @@ -9952,6 +9952,29 @@ public static int codePointCount(CharSequence seq, int beginIndex, int endIndex)
return n;
}

/**
* Returns the number of Unicode code points in the specified
* char sequence. Unpaired surrogates count as one
* code point each.
*
* @param seq the char sequence
* @return the number of Unicode code points in the char sequence
* @throws NullPointerException if {@code seq} is null.
* @since 26
*/
public static int codePointCount(CharSequence seq) {
final int length = seq.length();
int n = length;
for (int i = 0; i < length; ) {
if (isHighSurrogate(seq.charAt(i++)) && i < length &&
Copy link
Member

@myankelev myankelev Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imo this is quite hard to read, especially with i++ inside of the if statement. What do you think about changing it to this?

for (int i = 1; i < length-1; i++) {
    if (isHighSurrogate(seq.charAt(i)) &&
        isLowSurrogate(seq.charAt(i + 1))) {
        n--;
        i++;
    }
}

edit: fixed a typo in my example

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the first place it yields an incorrect result for sequences whose first character is a supplementary character.

jshell> int len(CharSequence seq) {
   ...>     final int length = seq.length();
   ...>     int n = length;
   ...>     for (int i = 1; i < length-1; i++) {
   ...>             if (isHighSurrogate(seq.charAt(i)) &&
   ...>                 isLowSurrogate(seq.charAt(i + 1))) {
   ...>                     n--;
   ...>                     i++;
   ...>             }
   ...>     }
   ...>     return n;
   ...> }
|  次を作成しました: メソッド len(CharSequence)。しかし、 method isHighSurrogate(char), and method isLowSurrogate(char)が宣言されるまで、起動できません

jshell> boolean isHighSurrogate(char ch) {
   ...>     return 0xd800 <= ch && ch <= 0xdbff;
   ...> }
|  次を作成しました: メソッド isHighSurrogate(char)

jshell> boolean isLowSurrogate(char ch) {
   ...>     return 0xdc00 <= ch && ch <= 0xdfff;
   ...> }
|  次を作成しました: メソッド isLowSurrogate(char)

jshell> len("𠮷");
$5 ==> 2

jshell> len("OK👍");
$6 ==> 3

jshell> len("👍👍");
$7 ==> 3

I will not change it alone unless the existing overload int codePointCount(CharSequence seq, int beginIndex, int endIndex) is also planned to be changed.

isLowSurrogate(seq.charAt(i))) {
n--;
i++;
}
}
return n;
}

/**
* Returns the number of Unicode code points in a subarray of the
* {@code char} array argument. The {@code offset}
Expand All @@ -9976,6 +9999,20 @@ public static int codePointCount(char[] a, int offset, int count) {
return codePointCountImpl(a, offset, count);
}

/**
 * Counts the Unicode code points held in the whole of the given
 * {@code char} array. A surrogate that is not part of a pair
 * is counted as one code point.
 *
 * @param a the {@code char} array to examine
 * @return the number of Unicode code points in the char array
 * @throws NullPointerException if {@code a} is null.
 * @since 26
 */
public static int codePointCount(char[] a) {
    // Cover the entire array: start at offset 0 and count every element.
    final int wholeLength = a.length;
    return codePointCountImpl(a, 0, wholeLength);
}

static int codePointCountImpl(char[] a, int offset, int count) {
int endIndex = offset + count;
int n = count;
Expand Down
15 changes: 15 additions & 0 deletions src/java.base/share/classes/java/lang/String.java
Original file line number Diff line number Diff line change
Expand Up @@ -1714,6 +1714,21 @@ public int codePointCount(int beginIndex, int endIndex) {
return StringUTF16.codePointCount(value, beginIndex, endIndex);
}

/**
 * Counts the Unicode code points that make up this {@code String}.
 * A surrogate without a matching partner is counted as one code point.
 *
 * @return the number of Unicode code points in this {@code String}
 * @since 26
 */
public int codePointCount() {
    // Latin-1: the array length is returned as-is.
    // Otherwise: scan the first (value.length >> 1) chars for surrogate pairs.
    return isLatin1()
            ? value.length
            : StringUTF16.codePointCount(value, 0, value.length >> 1);
}

/**
* Returns the index within this {@code String} that is
* offset from the given {@code index} by
Expand Down
8 changes: 8 additions & 0 deletions src/java.base/share/classes/java/lang/StringBuffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,14 @@ public synchronized int codePointCount(int beginIndex, int endIndex) {
return super.codePointCount(beginIndex, endIndex);
}

/**
 * {@inheritDoc}
 *
 * @since 26
 */
@Override
public synchronized int codePointCount() {
    // Hold this buffer's monitor while the superclass performs the count,
    // in the same way as the synchronized ranged overload.
    final int codePoints = super.codePointCount();
    return codePoints;
}

/**
* @throws IndexOutOfBoundsException {@inheritDoc}
* @since 1.5
Expand Down
4 changes: 2 additions & 2 deletions src/java.base/share/classes/java/util/regex/Pattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -1658,7 +1658,7 @@ private static void normalizeSlice(String src, int off, int limit,
String seq = src.substring(off, j);
String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
off = j;
if (nfd.codePointCount(0, nfd.length()) > 1) {
if (nfd.codePointCount() > 1) {
ch0 = nfd.codePointAt(0);
ch1 = nfd.codePointAt(Character.charCount(ch0));
if (Character.getType(ch1) == Character.NON_SPACING_MARK) {
Expand Down Expand Up @@ -4157,7 +4157,7 @@ boolean match(Matcher matcher, int i, CharSequence seq) {
while (i + n < j) {
String nfc = Normalizer.normalize(
seq.toString().substring(i, j), Normalizer.Form.NFC);
if (nfc.codePointCount(0, nfc.length()) == 1) {
if (nfc.codePointCount() == 1) {
if (predicate.is(nfc.codePointAt(0)) &&
next.match(matcher, j, seq)) {
return true;
Expand Down
14 changes: 11 additions & 3 deletions test/jdk/java/lang/Character/Supplementary.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand All @@ -23,7 +23,7 @@

/*
* @test
* @bug 4533872 4985214 4985217 4993841 5017268 5017280 8298033
* @bug 4533872 4985214 4985217 4993841 5017268 5017280 8298033 8364007
* @summary Unit tests for supplementary character support (JSR-204)
* @compile Supplementary.java
* @run main/timeout=600 Supplementary
Expand Down Expand Up @@ -334,6 +334,8 @@ static void test03(char[] a) {
/**
* Test codePointCount(CharSequence, int, int)
* codePointCount(char[], int, int)
* codePointCount(CharSequence)
* codePointCount(char[])
*/
static void test04(String str) {
int length = str.length();
Expand All @@ -347,9 +349,15 @@ static void test04(String str) {
checkCodePointCount(a, n, m);
}

int n = Character.codePointCount(str);
int m = codePointCount(str);
checkCodePointCount(str, n, m);
n = Character.codePointCount(a);
checkCodePointCount(a, n, m);

// test special cases
length = str.length();
int n = Character.codePointCount(str, 0, 0);
n = Character.codePointCount(str, 0, 0);
checkCodePointCount(str, n, 0);
n = Character.codePointCount(str, length, length);
checkCodePointCount(str, n, 0);
Expand Down
13 changes: 9 additions & 4 deletions test/jdk/java/lang/String/Supplementary.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand All @@ -24,7 +24,7 @@
/*
*
* @test
* @bug 4533872 4915683 4922962 4985217 5017280 6242664 6588260
* @bug 4533872 4915683 4922962 4985217 5017280 6242664 6588260 8364007
* @summary Unit tests for supplementary character support (JSR-204)
*/

Expand Down Expand Up @@ -386,9 +386,10 @@ static void test7() {
}

/**
* Test codePointCount(int, int)
* Test codePointCount(int, int) &
* codePointCount()
*
* This test case assumes that
* This test case assumes that Character.codePointCount(CharSequence) &
* Character.codePointCount(CharSequence, int, int) works
* correctly.
*/
Expand Down Expand Up @@ -419,6 +420,10 @@ static void test8() {
result, expected);
}

int result = str.codePointCount();
int expected = Character.codePointCount(str);
check(result != expected, "substring:codePointCount()", result, expected);

// test exceptions
testCodePointCount(null, 0, 0, NullPointerException.class);
testCodePointCount(str, -1, length, IndexOutOfBoundsException.class);
Expand Down
15 changes: 10 additions & 5 deletions test/jdk/java/lang/StringBuilder/Supplementary.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand All @@ -24,7 +24,7 @@
/*
*
* @test
* @bug 4533872 4915683 4985217 5017280
* @bug 4533872 4915683 4985217 5017280 8364007
* @summary Unit tests for supplementary character support (JSR-204)
*/

Expand Down Expand Up @@ -215,11 +215,11 @@ static void test4() {
}

/**
* Test codePointCount(int, int)
* Test codePointCount(int, int) & codePointCount()
*
* This test case assumes that
* Character.codePointCount(CharSequence, int, int) works
* correctly.
* Character.codePointCount(CharSequence, int, int) &
* Character.codePointCount(CharSequence) work correctly.
*/
static void test5() {
for (int i = 0; i < input.length; i++) {
Expand All @@ -239,6 +239,11 @@ static void test5() {
result, expected);
}


int result = sb.codePointCount();
int expected = Character.codePointCount(sb);
check(result != expected, "codePointCount()", result, expected);

// test exceptions
testCodePointCount(null, 0, 0, NullPointerException.class);
testCodePointCount(sb, -1, length, IndexOutOfBoundsException.class);
Expand Down