Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0b42939
init
aglinxinyuan Jan 3, 2026
e1323d3
Merge remote-tracking branch 'origin/main' into xinyuan-stage-by-stage
aglinxinyuan Jan 5, 2026
151aa93
update
aglinxinyuan Jan 7, 2026
9b3b253
update
aglinxinyuan Jan 7, 2026
75dc0dd
update
aglinxinyuan Jan 7, 2026
6a24f86
update
aglinxinyuan Jan 7, 2026
548b2f8
update
aglinxinyuan Jan 7, 2026
3b5f52f
update
aglinxinyuan Jan 9, 2026
a1119bc
update
aglinxinyuan Jan 10, 2026
7a7c82b
update
aglinxinyuan Jan 10, 2026
fcd9ceb
update
aglinxinyuan Jan 12, 2026
fe2b239
Merge branch 'main' into xinyuan-stage-by-stage
aglinxinyuan Jan 14, 2026
4b9877f
update
aglinxinyuan Jan 14, 2026
ef32551
fix fmt
aglinxinyuan Jan 14, 2026
b062d4b
fix fmt
aglinxinyuan Jan 14, 2026
3530e71
fix test
aglinxinyuan Jan 14, 2026
2e17500
fix test
aglinxinyuan Jan 14, 2026
96b6b36
Merge branch 'main' into xinyuan-stage-by-stage
aglinxinyuan Jan 15, 2026
a67c326
add defaultExecutionMode
aglinxinyuan Jan 15, 2026
afeb08e
Merge branch 'main' into xinyuan-stage-by-stage
aglinxinyuan Jan 15, 2026
916afde
Merge remote-tracking branch 'origin/xinyuan-stage-by-stage' into xin…
aglinxinyuan Jan 15, 2026
dc87329
add defaultExecutionMode
aglinxinyuan Jan 15, 2026
95a14c8
add defaultExecutionMode
aglinxinyuan Jan 16, 2026
db9df6c
rename
aglinxinyuan Jan 18, 2026
f67eb2e
rename
aglinxinyuan Jan 18, 2026
722fab1
rename
aglinxinyuan Jan 18, 2026
6195115
rename
aglinxinyuan Jan 18, 2026
fe8d99d
fix
aglinxinyuan Jan 20, 2026
f7819ad
Merge branch 'main' into xinyuan-stage-by-stage
aglinxinyuan Jan 20, 2026
5aa31b1
fix fmt
aglinxinyuan Jan 20, 2026
2f66ee3
fix fmt
aglinxinyuan Jan 20, 2026
189df54
test case
aglinxinyuan Jan 22, 2026
591b023
Merge branch 'main' into xinyuan-stage-by-stage
aglinxinyuan Jan 22, 2026
94181d9
fix fmt
aglinxinyuan Jan 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -304,10 +304,15 @@ class CostBasedScheduleGenerator(
*/
private def createRegionDAG(): DirectedAcyclicGraph[Region, RegionLink] = {
val searchResultFuture: Future[SearchResult] = Future {
if (ApplicationConfig.useTopDownSearch)
topDownSearch(globalSearch = ApplicationConfig.useGlobalSearch)
else
bottomUpSearch(globalSearch = ApplicationConfig.useGlobalSearch)
workflowContext.workflowSettings.executionMode match {
case ExecutionMode.MATERIALIZED =>
materializedSearch()
case ExecutionMode.PIPELINED =>
if (ApplicationConfig.useTopDownSearch)
topDownSearch(globalSearch = ApplicationConfig.useGlobalSearch)
else
bottomUpSearch(globalSearch = ApplicationConfig.useGlobalSearch)
}
}
val searchResult = Try(
Await.result(searchResultFuture, ApplicationConfig.searchTimeoutMilliseconds.milliseconds)
Expand Down Expand Up @@ -477,6 +482,29 @@ class CostBasedScheduleGenerator(
)
}

/** Constructs a baseline fully materialized region plan (one operator per region) and evaluates its cost. */
def materializedSearch(): SearchResult = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not actually doing a search, so the name is misleading. You can call it something like getFullyMaterializedSearchState

val startTime = System.nanoTime()

val (regionDAG, cost) =
tryConnectRegionDAG(physicalPlan.links) match {
case Left(dag) => (dag, allocateResourcesAndEvaluateCost(dag))
case Right(_) =>
(
new DirectedAcyclicGraph[Region, RegionLink](classOf[RegionLink]),
Double.PositiveInfinity
)
}

SearchResult(
state = Set.empty,
regionDAG = regionDAG,
cost = cost,
searchTimeNanoSeconds = System.nanoTime() - startTime,
numStatesExplored = 1
)
}

/**
* Another direction to perform the search. Depending on the configuration, either a global search or a greedy search
* will be performed to find an optimal plan. The search starts from a plan where all edges are materialized, and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,11 @@ class WorkflowCompiler(
val (physicalPlan, outputPortsNeedingStorage) =
expandLogicalPlan(logicalPlan, logicalPlanPojo.opsToViewResult, None)

context.workflowSettings =
WorkflowSettings(context.workflowSettings.dataTransferBatchSize, outputPortsNeedingStorage)
context.workflowSettings = WorkflowSettings(
context.workflowSettings.dataTransferBatchSize,
outputPortsNeedingStorage,
context.workflowSettings.executionMode
)

Workflow(context, logicalPlan, physicalPlan)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,21 @@

package org.apache.texera.amber.engine.architecture.scheduling

import org.apache.texera.amber.core.workflow.{PortIdentity, WorkflowContext}
import org.apache.texera.amber.core.workflow.{
ExecutionMode,
PortIdentity,
WorkflowContext,
WorkflowSettings
}
import org.apache.texera.amber.engine.common.virtualidentity.util.CONTROLLER
import org.apache.texera.amber.engine.e2e.TestUtils.buildWorkflow
import org.apache.texera.amber.operator.TestOperators
import org.apache.texera.workflow.LogicalLink
import org.scalamock.scalatest.MockFactory
import org.scalatest.flatspec.AnyFlatSpec

import scala.jdk.CollectionConverters._

class CostBasedScheduleGeneratorSpec extends AnyFlatSpec with MockFactory {

"CostBasedRegionPlanGenerator" should "finish bottom-up search using different pruning techniques with correct number of states explored in csv->->filter->join->filter2 workflow" in {
Expand Down Expand Up @@ -206,4 +213,306 @@ class CostBasedScheduleGeneratorSpec extends AnyFlatSpec with MockFactory {

}

// MATERIALIZED ExecutionMode tests - each operator should be a separate region
"CostBasedRegionPlanGenerator" should "create separate region for each operator in MATERIALIZED mode for simple csv workflow" in {
val csvOpDesc = TestOperators.smallCsvScanOpDesc()
val materializedContext = new WorkflowContext(
workflowSettings = WorkflowSettings(
dataTransferBatchSize = 400,
executionMode = ExecutionMode.MATERIALIZED
)
)
val workflow = buildWorkflow(
List(csvOpDesc),
List(),
materializedContext
)

val scheduleGenerator = new CostBasedScheduleGenerator(
workflow.context,
workflow.physicalPlan,
CONTROLLER
)
val result = scheduleGenerator.materializedSearch()

// Should only explore 1 state (fully materialized)
assert(result.numStatesExplored == 1)

// Each physical operator should be in its own region
val regions = result.regionDAG.vertexSet().asScala
val numPhysicalOps = workflow.physicalPlan.operators.size
assert(regions.size == numPhysicalOps, s"Expected $numPhysicalOps regions, got ${regions.size}")

// Each region should contain exactly 1 operator
regions.foreach { region =>
assert(
region.getOperators.size == 1,
s"Expected region to have 1 operator, got ${region.getOperators.size}"
)
}
}

"CostBasedRegionPlanGenerator" should "create separate region for each operator in MATERIALIZED mode for csv->keyword workflow" in {
val csvOpDesc = TestOperators.smallCsvScanOpDesc()
val keywordOpDesc = TestOperators.keywordSearchOpDesc("Region", "Asia")
val materializedContext = new WorkflowContext(
workflowSettings = WorkflowSettings(
dataTransferBatchSize = 400,
executionMode = ExecutionMode.MATERIALIZED
)
)
val workflow = buildWorkflow(
List(csvOpDesc, keywordOpDesc),
List(
LogicalLink(
csvOpDesc.operatorIdentifier,
PortIdentity(),
keywordOpDesc.operatorIdentifier,
PortIdentity()
)
),
materializedContext
)

val scheduleGenerator = new CostBasedScheduleGenerator(
workflow.context,
workflow.physicalPlan,
CONTROLLER
)
val result = scheduleGenerator.materializedSearch()

// Should only explore 1 state (fully materialized)
assert(result.numStatesExplored == 1)

// Each physical operator should be in its own region
val regions = result.regionDAG.vertexSet().asScala
val numPhysicalOps = workflow.physicalPlan.operators.size
assert(regions.size == numPhysicalOps, s"Expected $numPhysicalOps regions, got ${regions.size}")

// Each region should contain exactly 1 operator
regions.foreach { region =>
assert(
region.getOperators.size == 1,
s"Expected region to have 1 operator, got ${region.getOperators.size}"
)
}

// All links should be materialized (represented as region links)
val numRegionLinks = result.regionDAG.edgeSet().asScala.size
val numPhysicalLinks = workflow.physicalPlan.links.size
assert(
numRegionLinks == numPhysicalLinks,
s"Expected $numPhysicalLinks region links, got $numRegionLinks"
)
}

"CostBasedRegionPlanGenerator" should "create separate region for each operator in MATERIALIZED mode for csv->keyword->count workflow" in {
val csvOpDesc = TestOperators.smallCsvScanOpDesc()
val keywordOpDesc = TestOperators.keywordSearchOpDesc("Region", "Asia")
val countOpDesc = TestOperators.aggregateAndGroupByDesc(
"Region",
org.apache.texera.amber.operator.aggregate.AggregationFunction.COUNT,
List[String]()
)
val materializedContext = new WorkflowContext(
workflowSettings = WorkflowSettings(
dataTransferBatchSize = 400,
executionMode = ExecutionMode.MATERIALIZED
)
)
val workflow = buildWorkflow(
List(csvOpDesc, keywordOpDesc, countOpDesc),
List(
LogicalLink(
csvOpDesc.operatorIdentifier,
PortIdentity(),
keywordOpDesc.operatorIdentifier,
PortIdentity()
),
LogicalLink(
keywordOpDesc.operatorIdentifier,
PortIdentity(),
countOpDesc.operatorIdentifier,
PortIdentity()
)
),
materializedContext
)

val scheduleGenerator = new CostBasedScheduleGenerator(
workflow.context,
workflow.physicalPlan,
CONTROLLER
)
val result = scheduleGenerator.materializedSearch()

// Should only explore 1 state (fully materialized)
assert(result.numStatesExplored == 1)

// Each physical operator should be in its own region
val regions = result.regionDAG.vertexSet().asScala
val numPhysicalOps = workflow.physicalPlan.operators.size
assert(regions.size == numPhysicalOps, s"Expected $numPhysicalOps regions, got ${regions.size}")

// Each region should contain exactly 1 operator
regions.foreach { region =>
assert(
region.getOperators.size == 1,
s"Expected region to have 1 operator, got ${region.getOperators.size}"
)
}

// All links should be materialized (represented as region links)
val numRegionLinks = result.regionDAG.edgeSet().asScala.size
val numPhysicalLinks = workflow.physicalPlan.links.size
assert(
numRegionLinks == numPhysicalLinks,
s"Expected $numPhysicalLinks region links, got $numRegionLinks"
)
}

"CostBasedRegionPlanGenerator" should "create separate region for each operator in MATERIALIZED mode for join workflow" in {
val headerlessCsvOpDesc1 = TestOperators.headerlessSmallCsvScanOpDesc()
val headerlessCsvOpDesc2 = TestOperators.headerlessSmallCsvScanOpDesc()
val joinOpDesc = TestOperators.joinOpDesc("column-1", "column-1")
val materializedContext = new WorkflowContext(
workflowSettings = WorkflowSettings(
dataTransferBatchSize = 400,
executionMode = ExecutionMode.MATERIALIZED
)
)
val workflow = buildWorkflow(
List(
headerlessCsvOpDesc1,
headerlessCsvOpDesc2,
joinOpDesc
),
List(
LogicalLink(
headerlessCsvOpDesc1.operatorIdentifier,
PortIdentity(),
joinOpDesc.operatorIdentifier,
PortIdentity()
),
LogicalLink(
headerlessCsvOpDesc2.operatorIdentifier,
PortIdentity(),
joinOpDesc.operatorIdentifier,
PortIdentity(1)
)
),
materializedContext
)

val scheduleGenerator = new CostBasedScheduleGenerator(
workflow.context,
workflow.physicalPlan,
CONTROLLER
)
val result = scheduleGenerator.materializedSearch()

// Should only explore 1 state (fully materialized)
assert(result.numStatesExplored == 1)

// Each physical operator should be in its own region
val regions = result.regionDAG.vertexSet().asScala
val numPhysicalOps = workflow.physicalPlan.operators.size
assert(regions.size == numPhysicalOps, s"Expected $numPhysicalOps regions, got ${regions.size}")

// Each region should contain exactly 1 operator
regions.foreach { region =>
assert(
region.getOperators.size == 1,
s"Expected region to have 1 operator, got ${region.getOperators.size}"
)
}

// All links should be materialized (represented as region links)
val numRegionLinks = result.regionDAG.edgeSet().asScala.size
val numPhysicalLinks = workflow.physicalPlan.links.size
assert(
numRegionLinks == numPhysicalLinks,
s"Expected $numPhysicalLinks region links, got $numRegionLinks"
)
}

"CostBasedRegionPlanGenerator" should "create separate region for each operator in MATERIALIZED mode for complex csv->->filter->join->filter2 workflow" in {
val headerlessCsvOpDesc1 = TestOperators.headerlessSmallCsvScanOpDesc()
val keywordOpDesc = TestOperators.keywordSearchOpDesc("column-1", "Asia")
val joinOpDesc = TestOperators.joinOpDesc("column-1", "column-1")
val keywordOpDesc2 = TestOperators.keywordSearchOpDesc("column-1", "Asia")
val materializedContext = new WorkflowContext(
workflowSettings = WorkflowSettings(
dataTransferBatchSize = 400,
executionMode = ExecutionMode.MATERIALIZED
)
)
val workflow = buildWorkflow(
List(
headerlessCsvOpDesc1,
keywordOpDesc,
joinOpDesc,
keywordOpDesc2
),
List(
LogicalLink(
headerlessCsvOpDesc1.operatorIdentifier,
PortIdentity(),
joinOpDesc.operatorIdentifier,
PortIdentity()
),
LogicalLink(
headerlessCsvOpDesc1.operatorIdentifier,
PortIdentity(),
keywordOpDesc.operatorIdentifier,
PortIdentity()
),
LogicalLink(
keywordOpDesc.operatorIdentifier,
PortIdentity(),
joinOpDesc.operatorIdentifier,
PortIdentity(1)
),
LogicalLink(
joinOpDesc.operatorIdentifier,
PortIdentity(),
keywordOpDesc2.operatorIdentifier,
PortIdentity()
)
),
materializedContext
)

val scheduleGenerator = new CostBasedScheduleGenerator(
workflow.context,
workflow.physicalPlan,
CONTROLLER
)
val result = scheduleGenerator.materializedSearch()

// Should only explore 1 state (fully materialized)
assert(result.numStatesExplored == 1)

// Each physical operator should be in its own region
val regions = result.regionDAG.vertexSet().asScala
val numPhysicalOps = workflow.physicalPlan.operators.size
assert(regions.size == numPhysicalOps, s"Expected $numPhysicalOps regions, got ${regions.size}")

// Each region should contain exactly 1 operator
regions.foreach { region =>
assert(
region.getOperators.size == 1,
s"Expected region to have 1 operator, got ${region.getOperators.size}"
)
}

// All links should be materialized (represented as region links)
val numRegionLinks = result.regionDAG.edgeSet().asScala.size
val numPhysicalLinks = workflow.physicalPlan.links.size
assert(
numRegionLinks == numPhysicalLinks,
s"Expected $numPhysicalLinks region links, got $numRegionLinks"
)
}

}
Loading
Loading