
Commit 2e6541e

Make truncate filter a normalizer token filter
This filter may be useful with keyword fields when truncating the data is preferable to ignoring it (`ignore_above`). Also move and re-purpose TruncateTokenFilterTests from the server module to analysis-common (the truncate filter itself moved to Lucene a long time ago).

Signed-off-by: David Causse <[email protected]>
1 parent f0a1dba commit 2e6541e
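The use case in the commit message can be pictured as index configuration: declare a truncate filter, reference it from a custom normalizer, and attach that normalizer to a keyword field, so over-long values are shortened at index time rather than dropped the way ignore_above drops them. The sketch below is not part of the commit: it assumes the low-level opensearch-rest-client (the HttpHost import and constructor argument order follow the Apache HttpClient 5 based versions of the client), the index name my-index and the field name code are illustrative, and only the filter/normalizer names and length mirror the tests in this commit.

import org.apache.hc.core5.http.HttpHost;               // assumption: HttpClient 5 based client; older versions use org.apache.http.HttpHost
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.opensearch.client.Request;
import org.opensearch.client.Response;
import org.opensearch.client.RestClient;

public class TruncateNormalizerSetupSketch {
    public static void main(String[] args) throws Exception {
        // Assumes a local single-node cluster on the default port.
        try (RestClient client = RestClient.builder(new HttpHost("http", "localhost", 9200)).build()) {
            Request createIndex = new Request("PUT", "/my-index");
            // "my_truncate" and "my_truncate_norm" mirror the YAML test in this commit;
            // the keyword field "code" is illustrative. Values longer than 3 characters
            // are truncated at normalization time instead of being ignored.
            createIndex.setJsonEntity("""
                {
                  "settings": {
                    "analysis": {
                      "filter": {
                        "my_truncate": { "type": "truncate", "length": 3 }
                      },
                      "normalizer": {
                        "my_truncate_norm": { "type": "custom", "filter": ["my_truncate"] }
                      }
                    }
                  },
                  "mappings": {
                    "properties": {
                      "code": { "type": "keyword", "normalizer": "my_truncate_norm" }
                    }
                  }
                }
                """);
            Response response = client.performRequest(createIndex);
            // On success the body is {"acknowledged":true, ...}
            System.out.println(EntityUtils.toString(response.getEntity()));
        }
    }
}

Before this change, index creation with such a normalizer would be rejected because truncate was not an allowed normalizer filter; with it, the filter is accepted and applied whenever keyword values are normalized.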

5 files changed: +104, -78 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Add async periodic flush task support for pull-based ingestion ([#19878](https://github.com/opensearch-project/OpenSearch/pull/19878))
 - Add support for context aware segments ([#19098](https://github.com/opensearch-project/OpenSearch/pull/19098))
 - Implement GRPC FunctionScoreQuery ([#19888](https://github.com/opensearch-project/OpenSearch/pull/19888))
+- Allow the truncate filter in normalizers ([#19778](https://github.com/opensearch-project/OpenSearch/issues/19778))

 ### Changed
 - Faster `terms` query creation for `keyword` field with index and docValues enabled ([#19350](https://github.com/opensearch-project/OpenSearch/pull/19350))

modules/analysis-common/src/main/java/org/opensearch/analysis/common/TruncateTokenFilterFactory.java

Lines changed: 2 additions & 1 deletion

@@ -38,8 +38,9 @@
 import org.opensearch.env.Environment;
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractTokenFilterFactory;
+import org.opensearch.index.analysis.NormalizingTokenFilterFactory;

-public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory {
+public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

     private final int length;
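The diff shows only the new import and the class declaration. For orientation, here is a sketch of how the full factory plausibly reads, reconstructed from the usual shape of this class rather than from the commit (the constructor body and the length check are assumptions). In this code lineage, NormalizingTokenFilterFactory is a mix-in whose default normalize(TokenStream) delegates to create(TokenStream), so implementing the interface is the entire change needed to let the filter run inside a normalizer chain.

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.NormalizingTokenFilterFactory;

// Sketch only: the constructor and create(...) below are not part of this commit's diff
// and are reconstructed from the usual shape of this factory.
public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

    private final int length;

    TruncateTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.length = settings.getAsInt("length", -1);
        if (length <= 0) {
            throw new IllegalArgumentException("length parameter must be provided");
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Lucene's TruncateTokenFilter keeps at most `length` characters of each token.
        return new TruncateTokenFilter(tokenStream, length);
    }

    // No normalize(...) override is needed: NormalizingTokenFilterFactory's default
    // implementation (in this lineage) delegates to create(...), which is what allows
    // the filter to participate in a custom normalizer chain.
}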

modules/analysis-common/src/test/java/org/opensearch/analysis/common/TruncateTokenFilterTests.java

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.analysis.common;

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.analysis.AnalysisTestsHelper;
import org.opensearch.index.analysis.NamedAnalyzer;
import org.opensearch.test.OpenSearchTestCase;
import org.opensearch.test.OpenSearchTokenStreamTestCase;

import java.io.IOException;

public class TruncateTokenFilterTests extends OpenSearchTokenStreamTestCase {

    public void testFilter() throws IOException {
        Settings settings = Settings.builder()
            .put("index.analysis.filter.truncate.type", "truncate")
            .put("index.analysis.filter.truncate.length", 3)
            .put("index.analysis.analyzer.my_analyzer.type", "custom")
            .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
            .putList("index.analysis.analyzer.my_analyzer.filter", "truncate")
            .putList("index.analysis.normalizer.my_normalizer.filter", "truncate")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
            settings,
            new CommonAnalysisModulePlugin()
        );
        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
        assertNotNull(analyzer);

        assertTokenStreamContents(analyzer.tokenStream("foo", "a bb ccc dddd"), new String[] { "a", "bb", "ccc", "ddd" });
    }

    public void testNormalizer() throws IOException {
        Settings settings = Settings.builder()
            .put("index.analysis.filter.truncate.type", "truncate")
            .put("index.analysis.filter.truncate.length", 3)
            .put("index.analysis.normalizer.my_normalizer.type", "custom")
            .putList("index.analysis.normalizer.my_normalizer.filter", "truncate")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
            settings,
            new CommonAnalysisModulePlugin()
        );
        assertNull(analysis.indexAnalyzers.get("my_normalizer"));
        NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
        assertNotNull(normalizer);
        assertEquals("my_normalizer", normalizer.name());
        assertEquals(new BytesRef("a"), normalizer.normalize("foo", "a"));
        assertEquals(new BytesRef("bb"), normalizer.normalize("foo", "bb"));
        assertEquals(new BytesRef("ccc"), normalizer.normalize("foo", "ccc"));
        assertEquals(new BytesRef("ddd"), normalizer.normalize("foo", "dddd"));
    }
}

modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml

Lines changed: 13 additions & 0 deletions

@@ -824,6 +824,11 @@
                my_truncate:
                  type: truncate
                  length: 3
+              normalizer:
+                my_truncate_norm:
+                  type: custom
+                  filter:
+                    - my_truncate
   - do:
       indices.analyze:
         index: test
@@ -833,6 +838,14 @@
           filter: [my_truncate]
   - length: { tokens: 1 }
   - match: { tokens.0.token: foo }
+  - do:
+      indices.analyze:
+        index: test
+        body:
+          text: foobar
+          normalizer: my_truncate_norm
+  - length: { tokens: 1 }
+  - match: { tokens.0.token: foo }

 ---
 "pattern_capture":

server/src/test/java/org/opensearch/lucene/analysis/miscellaneous/TruncateTokenFilterTests.java

Lines changed: 0 additions & 77 deletions
This file was deleted.
