Skip to content

Commit c6d8b3a

Browse files
committed
Fixed word boundary patterns: they were ASCII-only, whereas java.util.regex is Unicode-compliant
1 parent 2953c0d commit c6d8b3a

File tree

3 files changed

+60
-2
lines changed

3 files changed

+60
-2
lines changed

java/com/google/re2j/Unicode.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ private static boolean is(int[][] ranges, int r) {
6666
return ranges.length > 0 && r >= ranges[0][0] && is32(ranges, r);
6767
}
6868

69+
// isLetter reports whether the rune is a letter.
70+
static boolean isLetter(int r) {
71+
return is(UnicodeTables.Letter, r);
72+
}
73+
6974
// isUpper reports whether the rune is an upper case letter.
7075
static boolean isUpper(int r) {
7176
// See comment in isGraphic.

java/com/google/re2j/Utils.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,9 @@ static int indexOf(byte[] source, byte[] target, int fromIndex) {
156156

157157
// isWordRune reports whether r is considered a ``word character''
158158
// during the evaluation of the \b and \B zero-width assertions.
159-
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
159+
// These assertions are Unicode-compliant: the word characters are [\p{L}0-9_].
160160
static boolean isWordRune(int r) {
161-
return (('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z') || ('0' <= r && r <= '9') || r == '_');
161+
return (Unicode.isLetter(r) || ('0' <= r && r <= '9') || r == '_');
162162
}
163163

164164
//// EMPTY_* flags

javatests/com/google/re2j/PatternTest.java

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,4 +193,57 @@ public void testEquals() {
193193
Truth.assertThat(pattern1.hashCode()).isEqualTo(pattern2.hashCode());
194194
Truth.assertThat(pattern1).isNotEqualTo(pattern4);
195195
}
196+
197+
@Test
198+
public void testUnicodeWordBoundary() {
199+
final String pattern = "l\\p{L}*\\b";
200+
final String text = "l\u00E0";
201+
{
202+
final java.util.regex.Matcher matcher =
203+
java.util.regex.Pattern.compile(pattern).matcher(text);
204+
assertEquals(true, matcher.find());
205+
assertEquals("l\u00E0", text.substring(matcher.start(), matcher.end()));
206+
}
207+
{
208+
final com.google.re2j.Matcher matcher =
209+
com.google.re2j.Pattern.compile(pattern).matcher(text);
210+
assertEquals(true, matcher.find());
211+
assertEquals("l\u00E0", text.substring(matcher.start(), matcher.end()));
212+
}
213+
}
214+
215+
@Test
216+
public void testUnicodeWordBoundary2() {
217+
final String pattern = "d\u00E9\\p{L}*\\b";
218+
{
219+
final String text = "d\u00E9s";
220+
{
221+
final java.util.regex.Matcher matcher =
222+
java.util.regex.Pattern.compile(pattern).matcher(text);
223+
assertEquals(true, matcher.find());
224+
assertEquals("d\u00E9s", text.substring(matcher.start(), matcher.end()));
225+
}
226+
{
227+
final com.google.re2j.Matcher matcher =
228+
com.google.re2j.Pattern.compile(pattern).matcher(text);
229+
assertEquals(true, matcher.find());
230+
assertEquals("d\u00E9s", text.substring(matcher.start(), matcher.end()));
231+
}
232+
}
233+
{
234+
final String text = "d\u00E9";
235+
{
236+
final java.util.regex.Matcher matcher =
237+
java.util.regex.Pattern.compile(pattern).matcher(text);
238+
assertEquals(true, matcher.find());
239+
assertEquals("d\u00E9", text.substring(matcher.start(), matcher.end()));
240+
}
241+
{
242+
final com.google.re2j.Matcher matcher =
243+
com.google.re2j.Pattern.compile(pattern).matcher(text);
244+
assertEquals(true, matcher.find());
245+
assertEquals("d\u00E9", text.substring(matcher.start(), matcher.end()));
246+
}
247+
}
248+
}
196249
}

0 commit comments

Comments
 (0)