Skip to content

Commit caafc84

Browse files
gaobinlong and andrross
authored and committed
Grok processor supports capturing multiple values for same field name (opensearch-project#18799)
* Grok processor supports capturing multiple values for same field name Signed-off-by: Binlong Gao <[email protected]> * Modify change log Signed-off-by: Binlong Gao <[email protected]> --------- Signed-off-by: Binlong Gao <[email protected]> Signed-off-by: Andrew Ross <[email protected]> Co-authored-by: Andrew Ross <[email protected]> Signed-off-by: Ankit Jain <[email protected]>
1 parent 4366b03 commit caafc84

File tree

9 files changed

+444
-12
lines changed

9 files changed

+444
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1919
- Publish transport-grpc-spi exposing QueryBuilderProtoConverter and QueryBuilderProtoConverterRegistry ([#18949](https://github.com/opensearch-project/OpenSearch/pull/18949))
2020
- Support system generated search pipeline. ([#19128](https://github.com/opensearch-project/OpenSearch/pull/19128))
2121
- Add `epoch_micros` date format ([#14669](https://github.com/opensearch-project/OpenSearch/issues/14669))
22+
- Grok processor supports capturing multiple values for same field name ([#18799](https://github.com/opensearch-project/OpenSearch/pull/18799))
2223

2324
### Changed
2425
- Refactor `if-else` chains to use `Java 17 pattern matching switch expressions` ([#18965](https://github.com/opensearch-project/OpenSearch/pull/18965))

libs/grok/src/main/java/org/opensearch/grok/Grok.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,29 +99,42 @@ public final class Grok {
9999
private final Regex compiledExpression;
100100
private final MatcherWatchdog matcherWatchdog;
101101
private final List<GrokCaptureConfig> captureConfig;
102+
private final boolean captureAllMatches;
102103

103104
public Grok(Map<String, String> patternBank, String grokPattern, Consumer<String> logCallBack) {
104-
this(patternBank, grokPattern, true, MatcherWatchdog.noop(), logCallBack);
105+
this(patternBank, grokPattern, true, MatcherWatchdog.noop(), logCallBack, false);
105106
}
106107

107108
public Grok(Map<String, String> patternBank, String grokPattern, MatcherWatchdog matcherWatchdog, Consumer<String> logCallBack) {
108-
this(patternBank, grokPattern, true, matcherWatchdog, logCallBack);
109+
this(patternBank, grokPattern, true, matcherWatchdog, logCallBack, false);
110+
}
111+
112+
public Grok(
113+
Map<String, String> patternBank,
114+
String grokPattern,
115+
MatcherWatchdog matcherWatchdog,
116+
Consumer<String> logCallBack,
117+
boolean captureAllMatches
118+
) {
119+
this(patternBank, grokPattern, true, matcherWatchdog, logCallBack, captureAllMatches);
109120
}
110121

111122
Grok(Map<String, String> patternBank, String grokPattern, boolean namedCaptures, Consumer<String> logCallBack) {
112-
this(patternBank, grokPattern, namedCaptures, MatcherWatchdog.noop(), logCallBack);
123+
this(patternBank, grokPattern, namedCaptures, MatcherWatchdog.noop(), logCallBack, false);
113124
}
114125

115126
private Grok(
116127
Map<String, String> patternBank,
117128
String grokPattern,
118129
boolean namedCaptures,
119130
MatcherWatchdog matcherWatchdog,
120-
Consumer<String> logCallBack
131+
Consumer<String> logCallBack,
132+
boolean captureAllMatches
121133
) {
122134
this.patternBank = patternBank;
123135
this.namedCaptures = namedCaptures;
124136
this.matcherWatchdog = matcherWatchdog;
137+
this.captureAllMatches = captureAllMatches;
125138

126139
validatePatternBank();
127140

@@ -395,7 +408,7 @@ public boolean match(byte[] utf8Bytes, int offset, int length, GrokCaptureExtrac
395408
if (result == Matcher.FAILED) {
396409
return false;
397410
}
398-
extracter.extract(utf8Bytes, offset, matcher.getEagerRegion());
411+
extracter.extract(utf8Bytes, offset, matcher.getEagerRegion(), captureAllMatches);
399412
return true;
400413
}
401414

libs/grok/src/main/java/org/opensearch/grok/GrokCaptureExtracter.java

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,29 @@ static class MapExtracter extends GrokCaptureExtracter {
5757
result = captureConfig.isEmpty() ? emptyMap() : new HashMap<>();
5858
fieldExtracters = new ArrayList<>(captureConfig.size());
5959
for (GrokCaptureConfig config : captureConfig) {
60-
fieldExtracters.add(config.objectExtracter(v -> result.put(config.name(), v)));
60+
fieldExtracters.add(config.objectExtracter(v -> {
61+
String fieldName = config.name();
62+
Object existing = result.get(fieldName);
63+
if (existing == null) {
64+
result.put(fieldName, v);
65+
} else if (existing instanceof List) {
66+
@SuppressWarnings("unchecked")
67+
List<Object> list = (List<Object>) existing;
68+
list.add(v);
69+
} else {
70+
List<Object> list = new ArrayList<>();
71+
list.add(existing);
72+
list.add(v);
73+
result.put(fieldName, list);
74+
}
75+
}));
6176
}
6277
}
6378

6479
@Override
65-
void extract(byte[] utf8Bytes, int offset, Region region) {
80+
void extract(byte[] utf8Bytes, int offset, Region region, boolean captureAllMatches) {
6681
for (GrokCaptureExtracter extracter : fieldExtracters) {
67-
extracter.extract(utf8Bytes, offset, region);
82+
extracter.extract(utf8Bytes, offset, region, captureAllMatches);
6883
}
6984
}
7085

@@ -73,5 +88,5 @@ Map<String, Object> result() {
7388
}
7489
}
7590

76-
abstract void extract(byte[] utf8Bytes, int offset, Region region);
91+
abstract void extract(byte[] utf8Bytes, int offset, Region region, boolean captureAllMatches);
7792
}

libs/grok/src/main/java/org/opensearch/grok/GrokCaptureType.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,16 @@ static GrokCaptureType fromString(String str) {
104104
protected final GrokCaptureExtracter rawExtracter(int[] backRefs, Consumer<? super String> emit) {
105105
return new GrokCaptureExtracter() {
106106
@Override
107-
void extract(byte[] utf8Bytes, int offset, Region region) {
107+
void extract(byte[] utf8Bytes, int offset, Region region, boolean captureAllMatches) {
108108
for (int number : backRefs) {
109109
if (region.getBeg(number) >= 0) {
110110
int matchOffset = offset + region.getBeg(number);
111111
int matchLength = region.getEnd(number) - region.getBeg(number);
112112
emit.accept(new String(utf8Bytes, matchOffset, matchLength, StandardCharsets.UTF_8));
113-
return; // Capture only the first value.
113+
// return the first match value if captureAllMatches is false, else continue to capture all values
114+
if (!captureAllMatches) {
115+
return;
116+
}
114117
}
115118
}
116119
}

libs/grok/src/test/java/org/opensearch/grok/GrokTests.java

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,87 @@ public void testJavaFilePatternWithSpaces() {
739739
assertThat(grok.match("Test Class.java"), is(true));
740740
}
741741

742+
public void testMultipleCapturesWithSameFieldName() {
743+
// Test that multiple captures with the same field name are collected into a list
744+
BiConsumer<Long, Runnable> scheduler = getLongRunnableBiConsumer();
745+
// Pattern with repeated capture groups for the same field
746+
Grok grok = new Grok(
747+
Grok.BUILTIN_PATTERNS,
748+
"%{IP:ipaddress} %{IP:ipaddress}",
749+
MatcherWatchdog.newInstance(10, 200, System::currentTimeMillis, scheduler),
750+
logger::warn,
751+
true
752+
);
753+
754+
// Input with two different IP addresses
755+
Map<String, Object> matches = grok.captures("192.168.1.1 192.168.1.2");
756+
757+
assertNotNull("Should have matches", matches);
758+
Object ipaddress = matches.get("ipaddress");
759+
assertTrue("Should be a List", ipaddress instanceof List);
760+
761+
@SuppressWarnings("unchecked")
762+
List<String> ipList = (List<String>) ipaddress;
763+
assertEquals("Should have 2 elements", 2, ipList.size());
764+
assertEquals("First IP should match", "192.168.1.1", ipList.get(0));
765+
assertEquals("Second IP should match", "192.168.1.2", ipList.get(1));
766+
}
767+
768+
public void testMultipleCapturesWithSameFieldNameDifferentTypes() {
769+
BiConsumer<Long, Runnable> scheduler = getLongRunnableBiConsumer();
770+
// Pattern with repeated capture groups for the same field with different types
771+
Grok grok = new Grok(
772+
Grok.BUILTIN_PATTERNS,
773+
"%{NUMBER:value:int} %{NUMBER:value:long}",
774+
MatcherWatchdog.newInstance(10, 200, System::currentTimeMillis, scheduler),
775+
logger::warn,
776+
true
777+
);
778+
779+
// Input with two different numbers
780+
Map<String, Object> matches = grok.captures("123 456");
781+
782+
assertNotNull("Should have matches", matches);
783+
Object value = matches.get("value");
784+
assertTrue("Should be a List", value instanceof List);
785+
786+
@SuppressWarnings("unchecked")
787+
List<Object> valueList = (List<Object>) value;
788+
assertEquals("Should have 2 elements", 2, valueList.size());
789+
assertEquals("First value should be an Integer", Integer.valueOf(123), valueList.get(0));
790+
assertEquals("Second value should be a Long", Long.valueOf(456), valueList.get(1));
791+
}
792+
793+
public void testMultipleCapturesWithSameFieldNameInComplexPattern() {
794+
// Test a more complex pattern with multiple captures of the same field
795+
BiConsumer<Long, Runnable> scheduler = getLongRunnableBiConsumer();
796+
797+
// Pattern with multiple fields, one of which appears multiple times
798+
Grok grok = new Grok(
799+
Grok.BUILTIN_PATTERNS,
800+
"%{WORD:name} has IPs: %{IP:ip}, %{IP:ip} and %{IP:ip}",
801+
MatcherWatchdog.newInstance(10, 200, System::currentTimeMillis, scheduler),
802+
logger::warn,
803+
true
804+
);
805+
806+
// Input with a name and three IPs
807+
Map<String, Object> matches = grok.captures("Server has IPs: 192.168.1.1, 192.168.1.2 and 192.168.1.3");
808+
809+
assertNotNull("Should have matches", matches);
810+
assertEquals("Name should match", "Server", matches.get("name"));
811+
812+
Object ip = matches.get("ip");
813+
assertTrue("IP should be a List", ip instanceof List);
814+
815+
@SuppressWarnings("unchecked")
816+
List<String> ipList = (List<String>) ip;
817+
assertEquals("Should have 3 IPs", 3, ipList.size());
818+
assertEquals("First IP should match", "192.168.1.1", ipList.get(0));
819+
assertEquals("Second IP should match", "192.168.1.2", ipList.get(1));
820+
assertEquals("Third IP should match", "192.168.1.3", ipList.get(2));
821+
}
822+
742823
public void testLogCallBack() {
743824
AtomicReference<String> message = new AtomicReference<>();
744825
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, ".*\\[.*%{SPACE}*\\].*", message::set);
@@ -747,6 +828,23 @@ public void testLogCallBack() {
747828
assertThat(message.get(), containsString("regular expression has redundant nested repeat operator"));
748829
}
749830

831+
private static BiConsumer<Long, Runnable> getLongRunnableBiConsumer() {
832+
AtomicBoolean run = new AtomicBoolean(true);
833+
return (delay, command) -> {
834+
try {
835+
Thread.sleep(delay);
836+
} catch (InterruptedException e) {
837+
throw new AssertionError(e);
838+
}
839+
Thread t = new Thread(() -> {
840+
if (run.get()) {
841+
command.run();
842+
}
843+
});
844+
t.start();
845+
};
846+
}
847+
750848
private void assertGrokedField(String fieldName) {
751849
String line = "foo";
752850
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{WORD:" + fieldName + "}", logger::warn);

modules/ingest-common/src/main/java/org/opensearch/ingest/common/GrokProcessor.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ public final class GrokProcessor extends AbstractProcessor {
5858
private final Grok grok;
5959
private final boolean traceMatch;
6060
private final boolean ignoreMissing;
61+
private final boolean captureAllMatches;
6162

6263
GrokProcessor(
6364
String tag,
@@ -67,14 +68,16 @@ public final class GrokProcessor extends AbstractProcessor {
6768
String matchField,
6869
boolean traceMatch,
6970
boolean ignoreMissing,
71+
boolean captureAllMatches,
7072
MatcherWatchdog matcherWatchdog
7173
) {
7274
super(tag, description);
7375
this.matchField = matchField;
7476
this.matchPatterns = matchPatterns;
75-
this.grok = new Grok(patternBank, combinePatterns(matchPatterns, traceMatch), matcherWatchdog, logger::debug);
77+
this.grok = new Grok(patternBank, combinePatterns(matchPatterns, traceMatch), matcherWatchdog, logger::debug, captureAllMatches);
7678
this.traceMatch = traceMatch;
7779
this.ignoreMissing = ignoreMissing;
80+
this.captureAllMatches = captureAllMatches;
7881
// Joni warnings are only emitted on an attempt to match, and the warning emitted for every call to match which is too verbose
7982
// so here we emit a warning (if there is one) to the logfile at warn level on construction / processor creation.
8083
new Grok(patternBank, combinePatterns(matchPatterns, traceMatch), matcherWatchdog, logger::warn).match("___nomatch___");
@@ -130,6 +133,10 @@ List<String> getMatchPatterns() {
130133
return matchPatterns;
131134
}
132135

136+
boolean isCaptureAllMatches() {
137+
return captureAllMatches;
138+
}
139+
133140
static String combinePatterns(List<String> patterns, boolean traceMatch) {
134141
String combinedPattern;
135142
if (patterns.size() > 1) {
@@ -176,6 +183,7 @@ public GrokProcessor create(
176183
List<String> matchPatterns = ConfigurationUtils.readList(TYPE, processorTag, config, "patterns");
177184
boolean traceMatch = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trace_match", false);
178185
boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false);
186+
boolean captureAllMatches = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "capture_all_matches", false);
179187

180188
if (matchPatterns.isEmpty()) {
181189
throw newConfigurationException(TYPE, processorTag, "patterns", "List of patterns must not be empty");
@@ -195,6 +203,7 @@ public GrokProcessor create(
195203
matchField,
196204
traceMatch,
197205
ignoreMissing,
206+
captureAllMatches,
198207
matcherWatchdog
199208
);
200209
} catch (Exception e) {

modules/ingest-common/src/test/java/org/opensearch/ingest/common/GrokProcessorFactoryTests.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,4 +134,19 @@ public void testCreateWithInvalidPatternDefinition() throws Exception {
134134
equalTo("[patterns] Invalid regex pattern found in: [%{MY_PATTERN:name}!]. premature end of char-class")
135135
);
136136
}
137+
138+
public void testBuildWithCaptureAllMatches() throws Exception {
139+
GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap(), MatcherWatchdog.noop());
140+
141+
Map<String, Object> config = new HashMap<>();
142+
config.put("field", "_field");
143+
config.put("patterns", Collections.singletonList("(?<foo>\\w+)"));
144+
config.put("capture_all_matches", true);
145+
String processorTag = randomAlphaOfLength(10);
146+
GrokProcessor processor = factory.create(null, processorTag, null, config);
147+
assertThat(processor.getTag(), equalTo(processorTag));
148+
assertThat(processor.getMatchField(), equalTo("_field"));
149+
assertThat(processor.getGrok(), notNullValue());
150+
assertThat(processor.isCaptureAllMatches(), is(true));
151+
}
137152
}

0 commit comments

Comments
 (0)