diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index d30af69d32..49c500caee 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -75,6 +75,7 @@ public enum BuiltinFunctionName { MVAPPEND(FunctionName.of("mvappend")), MVJOIN(FunctionName.of("mvjoin")), MVINDEX(FunctionName.of("mvindex")), + SPLIT(FunctionName.of("split")), MVDEDUP(FunctionName.of("mvdedup")), FORALL(FunctionName.of("forall")), EXISTS(FunctionName.of("exists")), diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index a2ea4cfcb3..2a54efaecb 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -194,6 +194,7 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP; import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP; @@ -976,6 +977,34 @@ void populate() { builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter), PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER)); + // Register SPLIT with custom logic for empty delimiter + // Case 1: Delimiter is not empty string, use SPLIT + // Case 2: Delimiter is empty 
string, use REGEXP_EXTRACT_ALL with '.' pattern + register( + SPLIT, + (FunctionImp2) + (builder, str, delimiter) -> { + // Create condition: delimiter = '' + RexNode emptyString = builder.makeLiteral(""); + RexNode isEmptyDelimiter = + builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString); + + // For empty delimiter: split into characters using REGEXP_EXTRACT_ALL with '.' + // pattern. This matches each individual character. + RexNode dotPattern = builder.makeLiteral("."); + RexNode splitChars = + builder.makeCall(SqlLibraryOperators.REGEXP_EXTRACT_ALL, str, dotPattern); + + // For non-empty delimiter: use standard SPLIT + RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter); + + // Use CASE to choose between the two approaches + // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END + return builder.makeCall( + SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit); + }, + PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER)); + // Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization register( MVINDEX, diff --git a/docs/user/ppl/functions/collection.rst b/docs/user/ppl/functions/collection.rst index 34c0207464..fdea75d3e8 100644 --- a/docs/user/ppl/functions/collection.rst +++ b/docs/user/ppl/functions/collection.rst @@ -186,6 +186,60 @@ Example:: | 120 | +--------+ +SPLIT +----- + +Description +>>>>>>>>>>> + +Usage: split(str, delimiter) splits the string on the delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") to split the original string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array.
+ +Argument type: str: STRING, delimiter: STRING + +Return type: ARRAY of STRING + +Example:: + + os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1 + fetched rows / total rows = 1/1 + +------------------------------------+ + | result | + |------------------------------------| + | [buttercup,rarity,tenderhoof,dash] | + +------------------------------------+ + + os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1 + fetched rows / total rows = 1/1 + +------------------+ + | result | + |------------------| + | [1a2b3c4,567890] | + +------------------+ + + os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1 + fetched rows / total rows = 1/1 + +-----------+ + | result | + |-----------| + | [a,b,c,d] | + +-----------+ + + os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1 + fetched rows / total rows = 1/1 + +--------------+ + | result | + |--------------| + | [name,value] | + +--------------+ + + os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1 + fetched rows / total rows = 1/1 + +---------+ + | result | + |---------| + | [hello] | + +---------+ + MVJOIN ------ diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java index 52a6e181e2..31556e518b 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java @@ -567,4 +567,43 @@ public void testMvdedupPreservesOrder() throws IOException { // Should preserve first occurrence order: z, a, b, c verifyDataRows(actual, rows(List.of("z", "a", "b", "c"))); } + + @Test + public void testSplitWithSemicolonDelimiter() throws 
IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result =" + + " split(test, ';') | head 1 | fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + verifyDataRows(actual, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh"))); + } + + @Test + public void testSplitWithMultiCharDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |" + + " fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + verifyDataRows(actual, rows(List.of("1a2b3c4", "567890"))); + } + + @Test + public void testSplitWithEmptyDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + // Empty delimiter splits into individual characters + verifyDataRows(actual, rows(List.of("a", "b", "c", "d"))); + } } diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index ebe0fcb4f2..22f1b96bbb 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -443,6 +443,7 @@ MVAPPEND: 'MVAPPEND'; MVJOIN: 'MVJOIN'; MVINDEX: 'MVINDEX'; MVDEDUP: 'MVDEDUP'; +SPLIT: 'SPLIT'; FORALL: 'FORALL'; FILTER: 'FILTER'; TRANSFORM: 'TRANSFORM'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 22121a1b1a..a36647d4dd 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -1097,6 +1097,7 @@ collectionFunctionName | MVJOIN | MVINDEX | MVDEDUP + | SPLIT | FORALL | EXISTS | FILTER diff --git 
a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java index 176fb534f3..96529adea2 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java @@ -290,4 +290,81 @@ public void testMvdedupPreservesOrder() { + "LIMIT 1"; verifyPPLToSparkSQL(root, expectedSparkSql); } + + @Test + public void testSplitWithSemicolonDelimiter() { + String ppl = + "source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head" + + " 1 | fields result"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR]," + + " result=[CASE(=(';', '')," + + " REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof':VARCHAR, '.')," + + " SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN ';' = '' THEN REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof', " + + "'.') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END " + + "`result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testSplitWithMultiCharDelimiter() { + String ppl = + "source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |" + + " fields result"; + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], 
test=['1a2b3c4def567890':VARCHAR]," + + " result=[CASE(=('def':VARCHAR, ''), REGEXP_EXTRACT_ALL('1a2b3c4def567890':VARCHAR," + + " '.'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN 'def' = '' THEN REGEXP_EXTRACT_ALL('1a2b3c4def567890', " + + "'.') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testSplitWithEmptyDelimiter() { + String ppl = + "source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result"; + RelNode root = getRelNode(ppl); + + // With empty delimiter, should split into individual characters + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR]," + + " result=[CASE(=('':VARCHAR, ''), REGEXP_EXTRACT_ALL('abcd':VARCHAR," + + " '.'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN '' = '' THEN REGEXP_EXTRACT_ALL('abcd', '.') " + + "ELSE SPLIT('abcd', '') END `result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index 0f59e98e74..796156aae8 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -829,6 +829,22 @@ public void testMvindex() { anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | 
fields result")); } + @Test + public void testSplit() { + // Test split with delimiter + assertEquals( + "source=table | eval identifier=split(***,***) | fields + identifier", + anonymize("source=t | eval result=split('a;b;c', ';') | fields result")); + // Test split with field reference + assertEquals( + "source=table | eval identifier=split(identifier,***) | fields + identifier", + anonymize("source=t | eval result=split(text, ',') | fields result")); + // Test split with empty delimiter (splits into characters) + assertEquals( + "source=table | eval identifier=split(***,***) | fields + identifier", + anonymize("source=t | eval result=split('abcd', '') | fields result")); + } + @Test public void testMvdedup() { // Test mvdedup with array containing duplicates