Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ public enum BuiltinFunctionName {
MVAPPEND(FunctionName.of("mvappend")),
MVJOIN(FunctionName.of("mvjoin")),
MVINDEX(FunctionName.of("mvindex")),
SPLIT(FunctionName.of("split")),
MVDEDUP(FunctionName.of("mvdedup")),
FORALL(FunctionName.of("forall")),
EXISTS(FunctionName.of("exists")),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP;
Expand Down Expand Up @@ -976,6 +977,34 @@ void populate() {
builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter),
PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER));

// SPLIT is registered with dedicated handling because an empty delimiter needs its own path:
//   - delimiter is not the empty string: delegate to the SPLIT library operator
//   - delimiter is the empty string: REGEXP_EXTRACT_ALL with the '.' pattern, so every
//     match of the pattern becomes one array element
// The choice is expressed as a CASE expression inside the generated call tree.
// NOTE(review): if REGEXP_EXTRACT_ALL follows java.util.regex semantics, '.' does not match
// line terminators, so newline characters in str would be dropped on the empty-delimiter
// path -- confirm whether that is acceptable.
register(
    SPLIT,
    (FunctionImp2)
        (builder, str, delimiter) -> {
          // Guard: delimiter = ''
          RexNode delimiterIsEmpty =
              builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, builder.makeLiteral(""));

          // Empty-delimiter branch: collect every match of '.' from the input string.
          RexNode perCharacter =
              builder.makeCall(
                  SqlLibraryOperators.REGEXP_EXTRACT_ALL, str, builder.makeLiteral("."));

          // Regular branch: split on the supplied delimiter.
          RexNode byDelimiter = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter);

          // CASE WHEN delimiter = '' THEN perCharacter ELSE byDelimiter END
          return builder.makeCall(
              SqlStdOperatorTable.CASE, delimiterIsEmpty, perCharacter, byDelimiter);
        },
    PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER));

// Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization
register(
MVINDEX,
Expand Down
54 changes: 54 additions & 0 deletions docs/user/ppl/functions/collection.rst
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,60 @@ Example::
| 120 |
+--------+

SPLIT
-----

Description
>>>>>>>>>>>

Usage: split(str, delimiter) splits the string on the delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") as the delimiter to split the original string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array.

Argument type: str: STRING, delimiter: STRING

Return type: ARRAY of STRING

Example::

os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1
fetched rows / total rows = 1/1
+------------------------------------+
| result |
|------------------------------------|
| [buttercup,rarity,tenderhoof,dash] |
+------------------------------------+

os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1
fetched rows / total rows = 1/1
+------------------+
| result |
|------------------|
| [1a2b3c4,567890] |
+------------------+

os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1
fetched rows / total rows = 1/1
+-----------+
| result |
|-----------|
| [a,b,c,d] |
+-----------+

os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1
fetched rows / total rows = 1/1
+--------------+
| result |
|--------------|
| [name,value] |
+--------------+

os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1
fetched rows / total rows = 1/1
+---------+
| result |
|---------|
| [hello] |
+---------+

MVJOIN
------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -567,4 +567,43 @@ public void testMvdedupPreservesOrder() throws IOException {
// Should preserve first occurrence order: z, a, b, c
verifyDataRows(actual, rows(List.of("z", "a", "b", "c")));
}

@Test
public void testSplitWithSemicolonDelimiter() throws IOException {
  // Split a literal on a single-character delimiter and read back one row.
  String query =
      String.format(
          "source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result ="
              + " split(test, ';') | head 1 | fields result",
          TEST_INDEX_BANK);
  JSONObject response = executeQuery(query);

  // The result column is reported as an array holding one element per ';'-separated token.
  verifySchema(response, schema("result", "array"));
  verifyDataRows(
      response, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh")));
}

@Test
public void testSplitWithMultiCharDelimiter() throws IOException {
  // The delimiter 'def' is more than one character long and occurs once in the input.
  String query =
      String.format(
          "source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
              + " fields result",
          TEST_INDEX_BANK);
  JSONObject response = executeQuery(query);

  // Expect the text before and after the delimiter, with the delimiter itself removed.
  verifySchema(response, schema("result", "array"));
  verifyDataRows(response, rows(List.of("1a2b3c4", "567890")));
}

@Test
public void testSplitWithEmptyDelimiter() throws IOException {
  // An empty delimiter is expected to break the input into its individual characters.
  String query =
      String.format(
          "source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result",
          TEST_INDEX_BANK);
  JSONObject response = executeQuery(query);

  verifySchema(response, schema("result", "array"));
  verifyDataRows(response, rows(List.of("a", "b", "c", "d")));
}
}
1 change: 1 addition & 0 deletions ppl/src/main/antlr/OpenSearchPPLLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@ MVAPPEND: 'MVAPPEND';
MVJOIN: 'MVJOIN';
MVINDEX: 'MVINDEX';
MVDEDUP: 'MVDEDUP';
SPLIT: 'SPLIT';
FORALL: 'FORALL';
FILTER: 'FILTER';
TRANSFORM: 'TRANSFORM';
Expand Down
1 change: 1 addition & 0 deletions ppl/src/main/antlr/OpenSearchPPLParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,7 @@ collectionFunctionName
| MVJOIN
| MVINDEX
| MVDEDUP
| SPLIT
| FORALL
| EXISTS
| FILTER
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,4 +290,81 @@ public void testMvdedupPreservesOrder() {
+ "LIMIT 1";
verifyPPLToSparkSQL(root, expectedSparkSql);
}

@Test
public void testSplitWithSemicolonDelimiter() {
  // The expected plan keeps the empty-delimiter CASE guard even though the delimiter here is
  // the non-empty literal ';'.
  String logicalPlan =
      "LogicalProject(result=[$9])\n"
          + " LogicalSort(fetch=[1])\n"
          + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
          + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR],"
          + " result=[CASE(=(';', ''),"
          + " REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof':VARCHAR, '.'),"
          + " SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
          + " LogicalTableScan(table=[[scott, EMP]])\n";
  String sparkSql =
      "SELECT CASE WHEN ';' = '' THEN REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof', "
          + "'.') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END "
          + "`result`\n"
          + "FROM `scott`.`EMP`\n"
          + "LIMIT 1";

  RelNode plan =
      getRelNode(
          "source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head"
              + " 1 | fields result");

  verifyLogical(plan, logicalPlan);
  verifyPPLToSparkSQL(plan, sparkSql);
}

@Test
public void testSplitWithMultiCharDelimiter() {
  // Multi-character delimiter 'def': the plan still carries both CASE branches.
  String logicalPlan =
      "LogicalProject(result=[$9])\n"
          + " LogicalSort(fetch=[1])\n"
          + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
          + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR],"
          + " result=[CASE(=('def':VARCHAR, ''), REGEXP_EXTRACT_ALL('1a2b3c4def567890':VARCHAR,"
          + " '.'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
          + " LogicalTableScan(table=[[scott, EMP]])\n";
  String sparkSql =
      "SELECT CASE WHEN 'def' = '' THEN REGEXP_EXTRACT_ALL('1a2b3c4def567890', "
          + "'.') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
          + "FROM `scott`.`EMP`\n"
          + "LIMIT 1";

  RelNode plan =
      getRelNode(
          "source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
              + " fields result");

  verifyLogical(plan, logicalPlan);
  verifyPPLToSparkSQL(plan, sparkSql);
}

@Test
public void testSplitWithEmptyDelimiter() {
  // Empty delimiter: the CASE guard compares '' with '', which is the branch meant to split
  // the input into individual characters.
  String logicalPlan =
      "LogicalProject(result=[$9])\n"
          + " LogicalSort(fetch=[1])\n"
          + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
          + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR],"
          + " result=[CASE(=('':VARCHAR, ''), REGEXP_EXTRACT_ALL('abcd':VARCHAR,"
          + " '.'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
          + " LogicalTableScan(table=[[scott, EMP]])\n";
  String sparkSql =
      "SELECT CASE WHEN '' = '' THEN REGEXP_EXTRACT_ALL('abcd', '.') "
          + "ELSE SPLIT('abcd', '') END `result`\n"
          + "FROM `scott`.`EMP`\n"
          + "LIMIT 1";

  RelNode plan =
      getRelNode(
          "source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result");

  verifyLogical(plan, logicalPlan);
  verifyPPLToSparkSQL(plan, sparkSql);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,22 @@ public void testMvindex() {
anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | fields result"));
}

@Test
public void testSplit() {
  // Expected output when both split() arguments are string literals: each is masked as ***.
  String bothLiteralsMasked =
      "source=table | eval identifier=split(***,***) | fields + identifier";

  // Literal input with a non-empty delimiter.
  assertEquals(
      bothLiteralsMasked,
      anonymize("source=t | eval result=split('a;b;c', ';') | fields result"));

  // A field reference as the first argument is rendered as `identifier` rather than ***.
  assertEquals(
      "source=table | eval identifier=split(identifier,***) | fields + identifier",
      anonymize("source=t | eval result=split(text, ',') | fields result"));

  // Literal input with an empty delimiter (splits into characters).
  assertEquals(
      bothLiteralsMasked,
      anonymize("source=t | eval result=split('abcd', '') | fields result"));
}

@Test
public void testMvdedup() {
// Test mvdedup with array containing duplicates
Expand Down
Loading