Skip to content

Commit 8938499

Browse files
committed
Keep TestException recoverable to ensure the second job eventually finishes
Add comments to help understand the UnalignedCheckpointRescaleITCase
1 parent 9828258 commit 8938499

File tree

2 files changed

+51
-21
lines changed

2 files changed

+51
-21
lines changed

flink-tests/src/test/java/org/apache/flink/test/checkpointing/UnalignedCheckpointRescaleITCase.java

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.apache.flink.test.checkpointing;
2121

2222
import org.apache.flink.api.common.JobExecutionResult;
23+
import org.apache.flink.api.common.JobStatus;
2324
import org.apache.flink.api.common.accumulators.LongCounter;
2425
import org.apache.flink.api.common.functions.FilterFunction;
2526
import org.apache.flink.api.common.functions.MapFunction;
@@ -366,14 +367,7 @@ public String map(Long value) throws Exception {
366367
})
367368
.name("long-to-string-map")
368369
.uid("long-to-string-map")
369-
.map(
370-
new FailingMapper<>(
371-
state -> false,
372-
state ->
373-
state.completedCheckpoints >= minCheckpoints / 2
374-
&& state.runNumber == 0,
375-
state -> false,
376-
state -> false))
370+
.map(getFailingMapper(minCheckpoints))
377371
.name("failing-map")
378372
.uid("failing-map")
379373
.setParallelism(parallelism)
@@ -394,14 +388,7 @@ void addFailingSink(
394388
DataStream<Long> combinedSource, long minCheckpoints, boolean slotSharing) {
395389
combinedSource
396390
.shuffle()
397-
.map(
398-
new FailingMapper<>(
399-
state -> false,
400-
state ->
401-
state.completedCheckpoints >= minCheckpoints / 2
402-
&& state.runNumber == 0,
403-
state -> false,
404-
state -> false))
391+
.map(getFailingMapper(minCheckpoints))
405392
.name("failing-map")
406393
.uid("failing-map")
407394
.slotSharingGroup(slotSharing ? "default" : "failing-map")
@@ -418,6 +405,25 @@ void addFailingSink(
418405
.slotSharingGroup(slotSharing ? "default" : "sink");
419406
}
420407

408+
/**
409+
* Creates a {@link FailingMapper} that only fails during snapshot operations.
410+
*
411+
* <p>It only fails during snapshotState() when completedCheckpoints >= minCheckpoints/2 AND
412+
* runNumber == 0. After the job fails over internally, runNumber becomes attemptNumber > 0,
413+
* so the failure condition is no longer satisfied. This ensures the mapper fails exactly once
414+
* during the initial run to trigger a job failover, but never fails again after failing over
415+
* and recovering from the checkpoint.
*
* @param minCheckpoints checkpoint count whose half is the threshold of the failure condition
* @param <T> type parameter of the returned mapper
* @return a new mapper whose only non-constant failure condition is the one described above
416+
*/
417+
private static <T> FailingMapper<T> getFailingMapper(long minCheckpoints) {
418+
return new FailingMapper<>(
419+
state -> false,
// NOTE(review): presumably the snapshot-time failure condition; confirm against the
// FailingMapper constructor's parameter order. The other three conditions never fire.
420+
state ->
421+
state.completedCheckpoints >= minCheckpoints / 2
422+
&& state.runNumber == 0,
423+
state -> false,
424+
state -> false);
425+
}
426+
421427
DataStream<Long> createSourcePipeline(
422428
StreamExecutionEnvironment env,
423429
int minCheckpoints,
@@ -611,13 +617,23 @@ public UnalignedCheckpointRescaleITCase(
611617
this.sourceSleepMs = sourceSleepMs;
612618
}
613619

620+
/**
621+
* Tests unaligned checkpoint rescaling behavior.
622+
*
623+
* <p>Prescale phase: the job fails via FailingMapper once completedCheckpoints >=
624+
* minCheckpoints/2. This generates the checkpoint used by the rescale test.
625+
*
626+
* <p>Postscale phase: the job restores from that checkpoint with a different parallelism, fails
627+
* over once, and finishes after the source has generated all records.
628+
*/
614629
@Test
615630
public void shouldRescaleUnalignedCheckpoint() throws Exception {
616631
final UnalignedSettings prescaleSettings =
617632
new UnalignedSettings(topology)
618633
.setParallelism(oldParallelism)
619634
.setExpectedFailures(1)
620-
.setSourceSleepMs(sourceSleepMs);
635+
.setSourceSleepMs(sourceSleepMs)
636+
.setExpectedFinalJobStatus(JobStatus.FAILED);
621637
prescaleSettings.setGenerateCheckpoint(true);
622638
final String checkpointDir = super.execute(prescaleSettings);
623639
assertThat(checkpointDir)
@@ -627,7 +643,8 @@ public void shouldRescaleUnalignedCheckpoint() throws Exception {
627643
final UnalignedSettings postscaleSettings =
628644
new UnalignedSettings(topology)
629645
.setParallelism(newParallelism)
630-
.setExpectedFailures(1);
646+
.setExpectedFailures(1)
647+
.setExpectedFinalJobStatus(JobStatus.FINISHED);
631648
postscaleSettings.setRestoreCheckpoint(checkpointDir);
632649
super.execute(postscaleSettings);
633650
}

flink-tests/src/test/java/org/apache/flink/test/checkpointing/UnalignedCheckpointTestBase.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import org.apache.flink.api.common.JobExecutionResult;
2121
import org.apache.flink.api.common.JobID;
22+
import org.apache.flink.api.common.JobStatus;
2223
import org.apache.flink.api.common.JobSubmissionResult;
2324
import org.apache.flink.api.common.accumulators.IntCounter;
2425
import org.apache.flink.api.common.accumulators.LongCounter;
@@ -57,13 +58,12 @@
5758
import org.apache.flink.runtime.state.FunctionSnapshotContext;
5859
import org.apache.flink.runtime.testutils.CommonTestUtils;
5960
import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration;
60-
import org.apache.flink.runtime.throwable.ThrowableAnnotation;
61-
import org.apache.flink.runtime.throwable.ThrowableType;
6261
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
6362
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
6463
import org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction;
6564
import org.apache.flink.streaming.api.functions.sink.legacy.RichSinkFunction;
6665
import org.apache.flink.streaming.api.graph.StreamGraph;
66+
import org.apache.flink.streaming.util.RestartStrategyUtils;
6767
import org.apache.flink.test.util.MiniClusterWithClientResource;
6868
import org.apache.flink.testutils.junit.FailsWithAdaptiveScheduler;
6969
import org.apache.flink.util.Collector;
@@ -104,6 +104,7 @@
104104

105105
import static org.apache.flink.shaded.guava33.com.google.common.collect.Iterables.getOnlyElement;
106106
import static org.apache.flink.util.Preconditions.checkState;
107+
import static org.assertj.core.api.Assertions.assertThat;
107108

108109
/**
109110
* Base class for tests related to unaligned checkpoints.
@@ -195,6 +196,11 @@ protected String execute(UnalignedSettings settings) throws Exception {
195196
.requestJobResult(jobID)
196197
.get()
197198
.toJobExecutionResult(getClass().getClassLoader()));
199+
if (settings.expectedFinalJobStatus != null) {
200+
assertThat(miniCluster.getMiniCluster().getJobStatus(jobID))
201+
.succeedsWithin(Duration.ofMinutes(1))
202+
.isEqualTo(settings.expectedFinalJobStatus);
203+
}
198204
System.out.println(
199205
"Finished " + getClass().getCanonicalName() + "#" + name.getMethodName() + ".");
200206
if (settings.generateCheckpoint) {
@@ -697,6 +703,7 @@ protected static class UnalignedSettings {
697703
private int failuresAfterSourceFinishes = 0;
698704
private ChannelType channelType = ChannelType.MIXED;
699705
private long sourceSleepMs = 0;
706+
@Nullable private JobStatus expectedFinalJobStatus = null;
700707

701708
public UnalignedSettings(DagCreator dagCreator) {
702709
this.dagCreator = dagCreator;
@@ -752,6 +759,11 @@ public UnalignedSettings setSourceSleepMs(long sourceSleepMs) {
752759
return this;
753760
}
754761

762+
/**
* Sets the job status that execute(UnalignedSettings) asserts after the job result has been
* retrieved.
*
* <p>The field defaults to {@code null}, in which case the final job status is not checked.
*
* @param expectedFinalJobStatus the expected final job status, or {@code null} to skip the check
* @return this settings object, to allow chaining of setter calls
*/
public UnalignedSettings setExpectedFinalJobStatus(JobStatus expectedFinalJobStatus) {
763+
this.expectedFinalJobStatus = expectedFinalJobStatus;
764+
return this;
765+
}
766+
755767
public void configure(StreamExecutionEnvironment env) {
756768
env.enableCheckpointing(Math.max(100L, parallelism * 50L));
757769
env.getCheckpointConfig()
@@ -760,6 +772,8 @@ public void configure(StreamExecutionEnvironment env) {
760772
env.getCheckpointConfig()
761773
.setTolerableCheckpointFailureNumber(tolerableCheckpointFailures);
762774
env.setParallelism(parallelism);
775+
RestartStrategyUtils.configureFixedDelayRestartStrategy(
776+
env, generateCheckpoint ? expectedFailures / 2 : expectedFailures, 100L);
763777
env.getCheckpointConfig().enableUnalignedCheckpoints(true);
764778
// for custom partitioner
765779
env.getCheckpointConfig().setForceUnalignedCheckpoints(true);
@@ -1138,7 +1152,6 @@ protected static long checkHeader(long value) {
11381152
return value;
11391153
}
11401154

1141-
@ThrowableAnnotation(ThrowableType.NonRecoverableError)
11421155
static class TestException extends Exception {
11431156
public TestException(String s) {
11441157
super(s);

0 commit comments

Comments
 (0)