Skip to content

Commit 744aeed

Browse files
jan-elastic authored and chrisparrinello committed
Fix ML tests failing with "no shards available" (elastic#136800)
* fix debug output in TransportGetDataFrameAnalyticsStatsAction
* clear TrainedModelStatsService's queue upon MachineLearning reset
* unmute tests
* rename ResetAuditorAction -> ResetMlComponentsAction
* Move clearing stats queue to reset action
1 parent 47b8495 commit 744aeed

File tree

7 files changed

+48
-72
lines changed

7 files changed

+48
-72
lines changed

muted-tests.yml

Lines changed: 0 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -137,18 +137,12 @@ tests:
137137
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
138138
method: test {p0=snapshot/10_basic/Create a source only snapshot and then restore it}
139139
issue: https://github.com/elastic/elasticsearch/issues/122755
140-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
141-
method: test {yaml=ml/data_frame_analytics_crud/Test get stats given multiple analytics}
142-
issue: https://github.com/elastic/elasticsearch/issues/123034
143140
- class: org.elasticsearch.indices.recovery.IndexRecoveryIT
144141
method: testSourceThrottling
145142
issue: https://github.com/elastic/elasticsearch/issues/123680
146143
- class: org.elasticsearch.smoketest.MlWithSecurityIT
147144
method: test {yaml=ml/3rd_party_deployment/Test start deployment fails while model download in progress}
148145
issue: https://github.com/elastic/elasticsearch/issues/120814
149-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
150-
method: test {yaml=ml/start_data_frame_analytics/Test start classification analysis when the dependent variable is missing}
151-
issue: https://github.com/elastic/elasticsearch/issues/124168
152146
- class: org.elasticsearch.smoketest.MlWithSecurityIT
153147
method: test {yaml=ml/3rd_party_deployment/Test start and stop multiple deployments}
154148
issue: https://github.com/elastic/elasticsearch/issues/124315
@@ -161,15 +155,6 @@ tests:
161155
- class: org.elasticsearch.packaging.test.BootstrapCheckTests
162156
method: test10Install
163157
issue: https://github.com/elastic/elasticsearch/issues/124957
164-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
165-
method: test {yaml=ml/data_frame_analytics_crud/Test get stats on newly created config}
166-
issue: https://github.com/elastic/elasticsearch/issues/121726
167-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
168-
method: test {yaml=ml/data_frame_analytics_cat_apis/Test cat data frame analytics all jobs with header and column selection}
169-
issue: https://github.com/elastic/elasticsearch/issues/125641
170-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
171-
method: test {yaml=ml/data_frame_analytics_cat_apis/Test cat data frame analytics single job with header}
172-
issue: https://github.com/elastic/elasticsearch/issues/125642
173158
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
174159
method: test {p0=transform/transforms_start_stop/Test schedule_now on an already started transform}
175160
issue: https://github.com/elastic/elasticsearch/issues/120720
@@ -179,9 +164,6 @@ tests:
179164
- class: org.elasticsearch.xpack.core.common.notifications.AbstractAuditorTests
180165
method: testRecreateTemplateWhenDeleted
181166
issue: https://github.com/elastic/elasticsearch/issues/123232
182-
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
183-
method: test {p0=ml/start_data_frame_analytics/Test start given dest index is not empty}
184-
issue: https://github.com/elastic/elasticsearch/issues/125909
185167
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
186168
method: test {p0=transform/transforms_stats/Test get transform stats with timeout}
187169
issue: https://github.com/elastic/elasticsearch/issues/125975
@@ -197,15 +179,6 @@ tests:
197179
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
198180
method: test {p0=transform/transforms_stats/Test get transform stats}
199181
issue: https://github.com/elastic/elasticsearch/issues/126270
200-
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
201-
method: test {p0=ml/start_data_frame_analytics/Test start classification analysis when the dependent variable cardinality is too low}
202-
issue: https://github.com/elastic/elasticsearch/issues/126299
203-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
204-
method: test {yaml=ml/start_data_frame_analytics/Test start classification analysis when the dependent variable cardinality is too low}
205-
issue: https://github.com/elastic/elasticsearch/issues/123200
206-
- class: org.elasticsearch.smoketest.MlWithSecurityIT
207-
method: test {yaml=ml/trained_model_cat_apis/Test cat trained models}
208-
issue: https://github.com/elastic/elasticsearch/issues/125750
209182
- class: org.elasticsearch.ingest.geoip.EnterpriseGeoIpDownloaderIT
210183
method: testEnterpriseDownloaderTask
211184
issue: https://github.com/elastic/elasticsearch/issues/126124
@@ -245,9 +218,6 @@ tests:
245218
- class: org.elasticsearch.cli.keystore.AddStringKeyStoreCommandTests
246219
method: testStdinWithMultipleValues
247220
issue: https://github.com/elastic/elasticsearch/issues/126882
248-
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
249-
method: test {p0=ml/data_frame_analytics_cat_apis/Test cat data frame analytics all jobs with header}
250-
issue: https://github.com/elastic/elasticsearch/issues/127625
251221
- class: org.elasticsearch.xpack.ccr.action.ShardFollowTaskReplicationTests
252222
method: testChangeFollowerHistoryUUID
253223
issue: https://github.com/elastic/elasticsearch/issues/127680
@@ -333,9 +303,6 @@ tests:
333303
- class: org.elasticsearch.packaging.test.DockerTests
334304
method: test171AdditionalCliOptionsAreForwarded
335305
issue: https://github.com/elastic/elasticsearch/issues/120925
336-
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
337-
method: test {p0=ml/delete_expired_data/Test delete expired data with body parameters}
338-
issue: https://github.com/elastic/elasticsearch/issues/131364
339306
- class: org.elasticsearch.packaging.test.DockerTests
340307
method: test070BindMountCustomPathConfAndJvmOptions
341308
issue: https://github.com/elastic/elasticsearch/issues/131366
Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,12 +22,12 @@
2222
import java.util.List;
2323
import java.util.Objects;
2424

25-
public class ResetAuditorAction extends ActionType<ResetAuditorAction.Response> {
25+
public class ResetMlComponentsAction extends ActionType<ResetMlComponentsAction.Response> {
2626

27-
public static final ResetAuditorAction INSTANCE = new ResetAuditorAction();
27+
public static final ResetMlComponentsAction INSTANCE = new ResetMlComponentsAction();
2828
public static final String NAME = "cluster:internal/xpack/ml/auditor/reset";
2929

30-
private ResetAuditorAction() {
30+
private ResetMlComponentsAction() {
3131
super(NAME);
3232
}
3333

x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DeleteExpiredDataIT.java

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,6 @@ public void testDeleteExpiredData_GivenNothingToDelete() throws Exception {
105105
client().execute(DeleteExpiredDataAction.INSTANCE, new DeleteExpiredDataAction.Request()).get();
106106
}
107107

108-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62699")
109108
public void testDeleteExpiredDataNoThrottle() throws Exception {
110109
testExpiredDeletion(null, 10010);
111110
}
@@ -152,7 +151,6 @@ public void testDeleteExpiredDataActionDeletesEmptyStateIndices() throws Excepti
152151
);
153152
}
154153

155-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62699")
156154
public void testDeleteExpiredDataWithStandardThrottle() throws Exception {
157155
testExpiredDeletion(-1.0f, 100);
158156
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -160,8 +160,8 @@
160160
import org.elasticsearch.xpack.core.ml.action.PutTrainedModelAliasAction;
161161
import org.elasticsearch.xpack.core.ml.action.PutTrainedModelDefinitionPartAction;
162162
import org.elasticsearch.xpack.core.ml.action.PutTrainedModelVocabularyAction;
163-
import org.elasticsearch.xpack.core.ml.action.ResetAuditorAction;
164163
import org.elasticsearch.xpack.core.ml.action.ResetJobAction;
164+
import org.elasticsearch.xpack.core.ml.action.ResetMlComponentsAction;
165165
import org.elasticsearch.xpack.core.ml.action.RevertModelSnapshotAction;
166166
import org.elasticsearch.xpack.core.ml.action.SetResetModeAction;
167167
import org.elasticsearch.xpack.core.ml.action.SetUpgradeModeAction;
@@ -271,8 +271,8 @@
271271
import org.elasticsearch.xpack.ml.action.TransportPutTrainedModelAliasAction;
272272
import org.elasticsearch.xpack.ml.action.TransportPutTrainedModelDefinitionPartAction;
273273
import org.elasticsearch.xpack.ml.action.TransportPutTrainedModelVocabularyAction;
274-
import org.elasticsearch.xpack.ml.action.TransportResetAuditorAction;
275274
import org.elasticsearch.xpack.ml.action.TransportResetJobAction;
275+
import org.elasticsearch.xpack.ml.action.TransportResetMlComponentsAction;
276276
import org.elasticsearch.xpack.ml.action.TransportRevertModelSnapshotAction;
277277
import org.elasticsearch.xpack.ml.action.TransportSetResetModeAction;
278278
import org.elasticsearch.xpack.ml.action.TransportSetUpgradeModeAction;
@@ -805,7 +805,7 @@ public void loadExtensions(ExtensionLoader loader) {
805805
private final SetOnce<LearningToRankService> learningToRankService = new SetOnce<>();
806806
private final SetOnce<MlAutoscalingDeciderService> mlAutoscalingDeciderService = new SetOnce<>();
807807
private final SetOnce<DeploymentManager> deploymentManager = new SetOnce<>();
808-
private final SetOnce<TrainedModelAssignmentClusterService> trainedModelAllocationClusterServiceSetOnce = new SetOnce<>();
808+
private final SetOnce<TrainedModelAssignmentClusterService> trainedModelAllocationClusterService = new SetOnce<>();
809809

810810
private final SetOnce<MachineLearningExtension> machineLearningExtension = new SetOnce<>();
811811

@@ -1315,7 +1315,7 @@ public Collection<?> createComponents(PluginServices services) {
13151315
clusterService,
13161316
threadPool
13171317
);
1318-
trainedModelAllocationClusterServiceSetOnce.set(
1318+
trainedModelAllocationClusterService.set(
13191319
new TrainedModelAssignmentClusterService(
13201320
settings,
13211321
clusterService,
@@ -1391,7 +1391,8 @@ public Collection<?> createComponents(PluginServices services) {
13911391
trainedModelCacheMetadataService,
13921392
trainedModelProvider,
13931393
trainedModelAssignmentService,
1394-
trainedModelAllocationClusterServiceSetOnce.get(),
1394+
trainedModelAllocationClusterService.get(),
1395+
trainedModelStatsService,
13951396
deploymentManager.get(),
13961397
nodeAvailabilityZoneMapper,
13971398
new MachineLearningExtensionHolder(machineLearningExtension.get()),
@@ -1564,7 +1565,7 @@ public List<ActionHandler> getActions() {
15641565
actionHandlers.add(new ActionHandler(MlMemoryAction.INSTANCE, TransportMlMemoryAction.class));
15651566
actionHandlers.add(new ActionHandler(SetUpgradeModeAction.INSTANCE, TransportSetUpgradeModeAction.class));
15661567
actionHandlers.add(new ActionHandler(SetResetModeAction.INSTANCE, TransportSetResetModeAction.class));
1567-
actionHandlers.add(new ActionHandler(ResetAuditorAction.INSTANCE, TransportResetAuditorAction.class));
1568+
actionHandlers.add(new ActionHandler(ResetMlComponentsAction.INSTANCE, TransportResetMlComponentsAction.class));
15681569
// Included in this section as it's used by MlMemoryAction
15691570
actionHandlers.add(new ActionHandler(TrainedModelCacheInfoAction.INSTANCE, TransportTrainedModelCacheInfoAction.class));
15701571
actionHandlers.add(new ActionHandler(GetMlAutoscalingStats.INSTANCE, TransportGetMlAutoscalingStats.class));
@@ -2177,17 +2178,17 @@ public void cleanUpFeature(
21772178
});
21782179

21792180
ActionListener<ResetFeatureStateResponse.ResetFeatureStateStatus> resetAuditors = ActionListener.wrap(success -> {
2180-
// reset the auditors as aliases used may be removed
2181+
// reset components, such as the auditors and the trained model stats queue
21812182
client.execute(
2182-
ResetAuditorAction.INSTANCE,
2183-
ResetAuditorAction.Request.RESET_AUDITOR_REQUEST,
2183+
ResetMlComponentsAction.INSTANCE,
2184+
ResetMlComponentsAction.Request.RESET_AUDITOR_REQUEST,
21842185
ActionListener.wrap(ignored -> unsetResetModeListener.onResponse(success), unsetResetModeListener::onFailure)
21852186
);
21862187
}, failure -> {
21872188
logger.error("failed to reset machine learning", failure);
21882189
client.execute(
2189-
ResetAuditorAction.INSTANCE,
2190-
ResetAuditorAction.Request.RESET_AUDITOR_REQUEST,
2190+
ResetMlComponentsAction.INSTANCE,
2191+
ResetMlComponentsAction.Request.RESET_AUDITOR_REQUEST,
21912192
ActionListener.wrap(ignored -> unsetResetModeListener.onFailure(failure), unsetResetModeListener::onFailure)
21922193
);
21932194
});
@@ -2329,11 +2330,11 @@ public void cleanUpFeature(
23292330
);
23302331
client.execute(CancelJobModelSnapshotUpgradeAction.INSTANCE, cancelSnapshotUpgradesReq, delegate);
23312332
}).delegateFailureAndWrap((delegate, acknowledgedResponse) -> {
2332-
if (trainedModelAllocationClusterServiceSetOnce.get() == null || machineLearningExtension.get().isNlpEnabled() == false) {
2333+
if (trainedModelAllocationClusterService.get() == null || machineLearningExtension.get().isNlpEnabled() == false) {
23332334
delegate.onResponse(AcknowledgedResponse.TRUE);
23342335
return;
23352336
}
2336-
trainedModelAllocationClusterServiceSetOnce.get().removeAllModelAssignments(delegate);
2337+
trainedModelAllocationClusterService.get().removeAllModelAssignments(delegate);
23372338
});
23382339

23392340
// validate no pipelines are using machine learning models

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetDataFrameAnalyticsStatsAction.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@
6161
import org.elasticsearch.xpack.ml.utils.persistence.MlParserUtils;
6262

6363
import java.util.ArrayList;
64+
import java.util.Arrays;
6465
import java.util.Collections;
6566
import java.util.Comparator;
6667
import java.util.List;
@@ -278,7 +279,7 @@ private void searchStats(DataFrameAnalyticsConfig config, TaskId parentTaskId, A
278279
() -> format(
279280
"[%s] Item failure encountered during multi search for request [indices=%s, source=%s]: %s",
280281
config.getId(),
281-
itemRequest.indices(),
282+
Arrays.toString(itemRequest.indices()),
282283
itemRequest.source(),
283284
itemResponse.getFailureMessage()
284285
),
Lines changed: 25 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -17,72 +17,77 @@
1717
import org.elasticsearch.tasks.Task;
1818
import org.elasticsearch.threadpool.ThreadPool;
1919
import org.elasticsearch.transport.TransportService;
20-
import org.elasticsearch.xpack.core.ml.action.ResetAuditorAction;
20+
import org.elasticsearch.xpack.core.ml.action.ResetMlComponentsAction;
21+
import org.elasticsearch.xpack.ml.inference.TrainedModelStatsService;
2122
import org.elasticsearch.xpack.ml.notifications.AnomalyDetectionAuditor;
2223
import org.elasticsearch.xpack.ml.notifications.DataFrameAnalyticsAuditor;
2324
import org.elasticsearch.xpack.ml.notifications.InferenceAuditor;
2425

2526
import java.io.IOException;
2627
import java.util.List;
2728

28-
public class TransportResetAuditorAction extends TransportNodesAction<
29-
ResetAuditorAction.Request,
30-
ResetAuditorAction.Response,
31-
ResetAuditorAction.NodeRequest,
32-
ResetAuditorAction.Response.ResetResponse,
29+
public class TransportResetMlComponentsAction extends TransportNodesAction<
30+
ResetMlComponentsAction.Request,
31+
ResetMlComponentsAction.Response,
32+
ResetMlComponentsAction.NodeRequest,
33+
ResetMlComponentsAction.Response.ResetResponse,
3334
Void> {
3435

3536
private final AnomalyDetectionAuditor anomalyDetectionAuditor;
3637
private final DataFrameAnalyticsAuditor dfaAuditor;
3738
private final InferenceAuditor inferenceAuditor;
39+
private final TrainedModelStatsService trainedModelStatsService;
3840

3941
@Inject
40-
public TransportResetAuditorAction(
42+
public TransportResetMlComponentsAction(
4143
ThreadPool threadPool,
4244
ClusterService clusterService,
4345
TransportService transportService,
4446
ActionFilters actionFilters,
4547
AnomalyDetectionAuditor anomalyDetectionAuditor,
4648
DataFrameAnalyticsAuditor dfaAuditor,
47-
InferenceAuditor inferenceAuditor
49+
InferenceAuditor inferenceAuditor,
50+
TrainedModelStatsService trainedModelStatsService
4851
) {
4952
super(
50-
ResetAuditorAction.NAME,
53+
ResetMlComponentsAction.NAME,
5154
clusterService,
5255
transportService,
5356
actionFilters,
54-
ResetAuditorAction.NodeRequest::new,
57+
ResetMlComponentsAction.NodeRequest::new,
5558
threadPool.executor(ThreadPool.Names.MANAGEMENT)
5659
);
5760
this.anomalyDetectionAuditor = anomalyDetectionAuditor;
5861
this.dfaAuditor = dfaAuditor;
5962
this.inferenceAuditor = inferenceAuditor;
63+
this.trainedModelStatsService = trainedModelStatsService;
6064
}
6165

6266
@Override
63-
protected ResetAuditorAction.Response newResponse(
64-
ResetAuditorAction.Request request,
65-
List<ResetAuditorAction.Response.ResetResponse> resetResponses,
67+
protected ResetMlComponentsAction.Response newResponse(
68+
ResetMlComponentsAction.Request request,
69+
List<ResetMlComponentsAction.Response.ResetResponse> resetResponses,
6670
List<FailedNodeException> failures
6771
) {
68-
return new ResetAuditorAction.Response(clusterService.getClusterName(), resetResponses, failures);
72+
return new ResetMlComponentsAction.Response(clusterService.getClusterName(), resetResponses, failures);
6973
}
7074

7175
@Override
72-
protected ResetAuditorAction.NodeRequest newNodeRequest(ResetAuditorAction.Request request) {
73-
return new ResetAuditorAction.NodeRequest();
76+
protected ResetMlComponentsAction.NodeRequest newNodeRequest(ResetMlComponentsAction.Request request) {
77+
return new ResetMlComponentsAction.NodeRequest();
7478
}
7579

7680
@Override
77-
protected ResetAuditorAction.Response.ResetResponse newNodeResponse(StreamInput in, DiscoveryNode node) throws IOException {
78-
return new ResetAuditorAction.Response.ResetResponse(in);
81+
protected ResetMlComponentsAction.Response.ResetResponse newNodeResponse(StreamInput in, DiscoveryNode node) throws IOException {
82+
return new ResetMlComponentsAction.Response.ResetResponse(in);
7983
}
8084

8185
@Override
82-
protected ResetAuditorAction.Response.ResetResponse nodeOperation(ResetAuditorAction.NodeRequest request, Task task) {
86+
protected ResetMlComponentsAction.Response.ResetResponse nodeOperation(ResetMlComponentsAction.NodeRequest request, Task task) {
8387
anomalyDetectionAuditor.reset();
8488
dfaAuditor.reset();
8589
inferenceAuditor.reset();
86-
return new ResetAuditorAction.Response.ResetResponse(clusterService.localNode(), true);
90+
trainedModelStatsService.clearQueue();
91+
return new ResetMlComponentsAction.Response.ResetResponse(clusterService.localNode(), true);
8792
}
8893
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/TrainedModelStatsService.java

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -295,4 +295,8 @@ static UpdateRequest buildUpdateRequest(InferenceStats stats) {
295295
}
296296
return null;
297297
}
298+
299+
public void clearQueue() {
300+
statsQueue.clear();
301+
}
298302
}

0 commit comments

Comments
 (0)