
Commit 2e6541e

Make truncate filter a normalizer token filter
This filter may be useful with keyword fields when truncating the data is preferable to ignoring it (`ignore_above`). Also move and re-purpose TruncateTokenFilterTests from the server module to analysis-common (the truncate filter itself moved to Lucene a long time ago).

Signed-off-by: David Causse <[email protected]>
1 parent f0a1dba commit 2e6541e
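The use case in the commit message can be pictured as index configuration: declare a truncate filter, reference it from a custom normalizer, and attach that normalizer to a keyword field, so over-long values are shortened at index time rather than dropped the way ignore_above drops them. The sketch below is not part of the commit: it assumes the low-level opensearch-rest-client (the HttpHost import and constructor argument order follow the Apache HttpClient 5 based versions of the client), the index name my-index and the field name code are illustrative, and only the filter/normalizer names and length mirror the tests in this commit.

import org.apache.hc.core5.http.HttpHost;               // assumption: HttpClient 5 based client; older versions use org.apache.http.HttpHost
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.opensearch.client.Request;
import org.opensearch.client.Response;
import org.opensearch.client.RestClient;

public class TruncateNormalizerSetupSketch {
    public static void main(String[] args) throws Exception {
        // Assumes a local single-node cluster on the default port.
        try (RestClient client = RestClient.builder(new HttpHost("http", "localhost", 9200)).build()) {
            Request createIndex = new Request("PUT", "/my-index");
            // "my_truncate" and "my_truncate_norm" mirror the YAML test in this commit;
            // the keyword field "code" is illustrative. Values longer than 3 characters
            // are truncated at normalization time instead of being ignored.
            createIndex.setJsonEntity("""
                {
                  "settings": {
                    "analysis": {
                      "filter": {
                        "my_truncate": { "type": "truncate", "length": 3 }
                      },
                      "normalizer": {
                        "my_truncate_norm": { "type": "custom", "filter": ["my_truncate"] }
                      }
                    }
                  },
                  "mappings": {
                    "properties": {
                      "code": { "type": "keyword", "normalizer": "my_truncate_norm" }
                    }
                  }
                }
                """);
            Response response = client.performRequest(createIndex);
            // On success the body is {"acknowledged":true, ...}
            System.out.println(EntityUtils.toString(response.getEntity()));
        }
    }
}

Before this change, index creation with such a normalizer would be rejected because truncate was not an allowed normalizer filter; with it, the filter is accepted and applied whenever keyword values are normalized.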

5 files changed: +104, -78 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Add async periodic flush task support for pull-based ingestion ([#19878](https://github.com/opensearch-project/OpenSearch/pull/19878))
 - Add support for context aware segments ([#19098](https://github.com/opensearch-project/OpenSearch/pull/19098))
 - Implement GRPC FunctionScoreQuery ([#19888](https://github.com/opensearch-project/OpenSearch/pull/19888))
+- Allow the truncate filter in normalizers ([#19778](https://github.com/opensearch-project/OpenSearch/issues/19778))

 ### Changed
 - Faster `terms` query creation for `keyword` field with index and docValues enabled ([#19350](https://github.com/opensearch-project/OpenSearch/pull/19350))

modules/analysis-common/src/main/java/org/opensearch/analysis/common/TruncateTokenFilterFactory.java

Lines changed: 2 additions & 1 deletion

@@ -38,8 +38,9 @@
 import org.opensearch.env.Environment;
 import org.opensearch.index.IndexSettings;
 import org.opensearch.index.analysis.AbstractTokenFilterFactory;
+import org.opensearch.index.analysis.NormalizingTokenFilterFactory;

-public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory {
+public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

     private final int length;
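The diff shows only the new import and the class declaration. For orientation, here is a sketch of how the full factory plausibly reads, reconstructed from the usual shape of this class rather than from the commit (the constructor body and the length check are assumptions). In this code lineage, NormalizingTokenFilterFactory is a mix-in whose default normalize(TokenStream) delegates to create(TokenStream), so implementing the interface is the entire change needed to let the filter run inside a normalizer chain.

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.NormalizingTokenFilterFactory;

// Sketch only: the constructor and create(...) below are not part of this commit's diff
// and are reconstructed from the usual shape of this factory.
public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

    private final int length;

    TruncateTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.length = settings.getAsInt("length", -1);
        if (length <= 0) {
            throw new IllegalArgumentException("length parameter must be provided");
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Lucene's TruncateTokenFilter keeps at most `length` characters of each token.
        return new TruncateTokenFilter(tokenStream, length);
    }

    // No normalize(...) override is needed: NormalizingTokenFilterFactory's default
    // implementation (in this lineage) delegates to create(...), which is what allows
    // the filter to participate in a custom normalizer chain.
}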

modules/analysis-common/src/test/java/org/opensearch/analysis/common/TruncateTokenFilterTests.java

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.analysis.common;

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.analysis.AnalysisTestsHelper;
import org.opensearch.index.analysis.NamedAnalyzer;
import org.opensearch.test.OpenSearchTestCase;
import org.opensearch.test.OpenSearchTokenStreamTestCase;

import java.io.IOException;

public class TruncateTokenFilterTests extends OpenSearchTokenStreamTestCase {

    public void testFilter() throws IOException {
        Settings settings = Settings.builder()
            .put("index.analysis.filter.truncate.type", "truncate")
            .put("index.analysis.filter.truncate.length", 3)
            .put("index.analysis.analyzer.my_analyzer.type", "custom")
            .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
            .putList("index.analysis.analyzer.my_analyzer.filter", "truncate")
            .putList("index.analysis.normalizer.my_normalizer.filter", "truncate")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
            settings,
            new CommonAnalysisModulePlugin()
        );
        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
        assertNotNull(analyzer);

        assertTokenStreamContents(analyzer.tokenStream("foo", "a bb ccc dddd"), new String[] { "a", "bb", "ccc", "ddd" });
    }

    public void testNormalizer() throws IOException {
        Settings settings = Settings.builder()
            .put("index.analysis.filter.truncate.type", "truncate")
            .put("index.analysis.filter.truncate.length", 3)
            .put("index.analysis.normalizer.my_normalizer.type", "custom")
            .putList("index.analysis.normalizer.my_normalizer.filter", "truncate")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
            settings,
            new CommonAnalysisModulePlugin()
        );
        assertNull(analysis.indexAnalyzers.get("my_normalizer"));
        NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
        assertNotNull(normalizer);
        assertEquals("my_normalizer", normalizer.name());
        assertEquals(new BytesRef("a"), normalizer.normalize("foo", "a"));
        assertEquals(new BytesRef("bb"), normalizer.normalize("foo", "bb"));
        assertEquals(new BytesRef("ccc"), normalizer.normalize("foo", "ccc"));
        assertEquals(new BytesRef("ddd"), normalizer.normalize("foo", "dddd"));
    }
}

modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml

Lines changed: 13 additions & 0 deletions

@@ -824,6 +824,11 @@
                my_truncate:
                  type: truncate
                  length: 3
+              normalizer:
+                my_truncate_norm:
+                  type: custom
+                  filter:
+                    - my_truncate
   - do:
       indices.analyze:
         index: test
@@ -833,6 +838,14 @@
           filter: [my_truncate]
   - length: { tokens: 1 }
   - match: { tokens.0.token: foo }
+  - do:
+      indices.analyze:
+        index: test
+        body:
+          text: foobar
+          normalizer: my_truncate_norm
+  - length: { tokens: 1 }
+  - match: { tokens.0.token: foo }

 ---
 "pattern_capture":

server/src/test/java/org/opensearch/lucene/analysis/miscellaneous/TruncateTokenFilterTests.java

Lines changed: 0 additions & 77 deletions
This file was deleted.
