Skip to content

Commit cf7871d

Browse files
committed
Fixed word boundary patterns: they were ASCII-only, whereas java.util.regex is Unicode-compliant
1 parent 86028a5 commit cf7871d

File tree

3 files changed

+54
-2
lines changed

3 files changed

+54
-2
lines changed

java/com/google/re2j/Unicode.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ private static boolean is(int[][] ranges, int r) {
6666
return ranges.length > 0 && r >= ranges[0][0] && is32(ranges, r);
6767
}
6868

69+
// isLetter reports whether the rune is a letter.
70+
static boolean isLetter(int r) {
71+
return is(UnicodeTables.Letter, r);
72+
}
73+
6974
// isUpper reports whether the rune is an upper case letter.
7075
static boolean isUpper(int r) {
7176
// See comment in isGraphic.

java/com/google/re2j/Utils.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,9 @@ static int indexOf(byte[] source, byte[] target, int fromIndex) {
156156

157157
// isWordRune reports whether r is considered a ``word character''
158158
// during the evaluation of the \b and \B zero-width assertions.
159-
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
159+
// These assertions are Unicode-compliant: the word characters are [\p{L}0-9_].
160160
static boolean isWordRune(int r) {
161-
return (('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z') || ('0' <= r && r <= '9') || r == '_');
161+
return (Unicode.isLetter(r) || ('0' <= r && r <= '9') || r == '_');
162162
}
163163

164164
//// EMPTY_* flags

javatests/com/google/re2j/PatternTest.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,4 +193,51 @@ public void testEquals() {
193193
Truth.assertThat(pattern1.hashCode()).isEqualTo(pattern2.hashCode());
194194
Truth.assertThat(pattern1).isNotEqualTo(pattern4);
195195
}
196+
197+
@Test
198+
public void testUnicodeWordBoundary() {
199+
final String pattern = "l\\p{L}*\\b";
200+
final String text = "l\u00E0";
201+
{
202+
final java.util.regex.Matcher matcher = java.util.regex.Pattern.compile(pattern).matcher(text);
203+
assertEquals(true, matcher.find());
204+
assertEquals("l\u00E0", text.substring(matcher.start(), matcher.end()));
205+
}
206+
{
207+
final com.google.re2j.Matcher matcher = com.google.re2j.Pattern.compile(pattern).matcher(text);
208+
assertEquals(true, matcher.find());
209+
assertEquals("l\u00E0", text.substring(matcher.start(), matcher.end()));
210+
}
211+
}
212+
213+
@Test
214+
public void testUnicodeWordBoundary2() {
215+
final String pattern = "d\u00E9\\p{L}*\\b";
216+
{
217+
final String text = "d\u00E9s";
218+
{
219+
final java.util.regex.Matcher matcher = java.util.regex.Pattern.compile(pattern).matcher(text);
220+
assertEquals(true, matcher.find());
221+
assertEquals("d\u00E9s", text.substring(matcher.start(), matcher.end()));
222+
}
223+
{
224+
final com.google.re2j.Matcher matcher = com.google.re2j.Pattern.compile(pattern).matcher(text);
225+
assertEquals(true, matcher.find());
226+
assertEquals("d\u00E9s", text.substring(matcher.start(), matcher.end()));
227+
}
228+
}
229+
{
230+
final String text = "d\u00E9";
231+
{
232+
final java.util.regex.Matcher matcher = java.util.regex.Pattern.compile(pattern).matcher(text);
233+
assertEquals(true, matcher.find());
234+
assertEquals("d\u00E9", text.substring(matcher.start(), matcher.end()));
235+
}
236+
{
237+
final com.google.re2j.Matcher matcher = com.google.re2j.Pattern.compile(pattern).matcher(text);
238+
assertEquals(true, matcher.find());
239+
assertEquals("d\u00E9", text.substring(matcher.start(), matcher.end()));
240+
}
241+
}
242+
}
196243
}

0 commit comments

Comments
 (0)