Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion R/version.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by rconfigure.py, do not edit by hand
# DuckDB version information

duckdb_version <- "1.5.0-dev2455"
duckdb_version <- "1.5.0-dev2471"

# Function to get DuckDB version without establishing a connection
get_duckdb_version <- function() {
Expand Down
6 changes: 3 additions & 3 deletions src/duckdb/src/function/table/version/pragma_version.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef DUCKDB_PATCH_VERSION
#define DUCKDB_PATCH_VERSION "0-dev2455"
#define DUCKDB_PATCH_VERSION "0-dev2471"
#endif
#ifndef DUCKDB_MINOR_VERSION
#define DUCKDB_MINOR_VERSION 5
Expand All @@ -8,10 +8,10 @@
#define DUCKDB_MAJOR_VERSION 1
#endif
#ifndef DUCKDB_VERSION
#define DUCKDB_VERSION "v1.5.0-dev2455"
#define DUCKDB_VERSION "v1.5.0-dev2471"
#endif
#ifndef DUCKDB_SOURCE_ID
#define DUCKDB_SOURCE_ID "287ac426ad"
#define DUCKDB_SOURCE_ID "6c5e16c2fb"
#endif
#include "duckdb/function/table/system_functions.hpp"
#include "duckdb/main/database.hpp"
Expand Down
6 changes: 4 additions & 2 deletions src/duckdb/src/include/duckdb/function/table_function.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,16 @@ struct TableFunctionBindInput {

struct RowGroupOrderOptions {
RowGroupOrderOptions(column_t column_idx_p, OrderByStatistics order_by_p, RowGroupOrderType order_type_p,
OrderByColumnType column_type_p)
: column_idx(column_idx_p), order_by(order_by_p), order_type(order_type_p), column_type(column_type_p) {
OrderByColumnType column_type_p, optional_idx row_limit_p = optional_idx())
: column_idx(column_idx_p), order_by(order_by_p), order_type(order_type_p), column_type(column_type_p),
row_limit(row_limit_p) {
}

const column_t column_idx;
const OrderByStatistics order_by;
const RowGroupOrderType order_type;
const OrderByColumnType column_type;
const optional_idx row_limit;
};

struct TableFunctionInitInput {
Expand Down
6 changes: 3 additions & 3 deletions src/duckdb/src/include/duckdb/storage/table/scan_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,18 +194,18 @@ class RowGroupReorderer {
optional_ptr<SegmentNode<RowGroup>> GetRootSegment(RowGroupSegmentTree &row_groups);
optional_ptr<SegmentNode<RowGroup>> GetNextRowGroup(SegmentNode<RowGroup> &row_group);

static Value RetrieveStat(const BaseStatistics &stats, OrderByStatistics order_by, OrderByColumnType column_type);

private:
const column_t column_idx;
const OrderByStatistics order_by;
const RowGroupOrderType order_type;
const OrderByColumnType column_type;
const optional_idx row_limit;

idx_t offset;
bool initialized;
vector<reference<SegmentNode<RowGroup>>> ordered_row_groups;

private:
static Value RetrieveStat(const BaseStatistics &stats, OrderByStatistics order_by, OrderByColumnType column_type);
};

class CollectionScanState {
Expand Down
33 changes: 28 additions & 5 deletions src/duckdb/src/optimizer/topn_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,39 @@ namespace duckdb {

namespace {

bool CanReorderRowGroups(LogicalTopN &op) {
bool CanReorderRowGroups(LogicalTopN &op, bool &use_limit) {
use_limit = true;
for (const auto &order : op.orders) {
// We do not support any null-first orders as this requires unimplemented logic in the row group reorderer
if (order.null_order == OrderByNullType::NULLS_FIRST) {
use_limit = false;
break;
}
}

// Only reorder row groups if there are no additional limit operators since they could modify the order
reference<LogicalOperator> current_op = op;

while (!current_op.get().children.empty()) {
if (current_op.get().children.size() > 1) {
return false;
}
if (current_op.get().type == LogicalOperatorType::LOGICAL_LIMIT) {
const auto op_type = current_op.get().type;
if (op_type == LogicalOperatorType::LOGICAL_LIMIT) {
return false;
}
if (op_type == LogicalOperatorType::LOGICAL_FILTER ||
op_type == LogicalOperatorType::LOGICAL_AGGREGATE_AND_GROUP_BY) {
use_limit = false;
}
current_op = *current_op.get().children[0];
}
D_ASSERT(current_op.get().type == LogicalOperatorType::LOGICAL_GET);
auto &logical_get = current_op.get().Cast<LogicalGet>();
if (!logical_get.table_filters.filters.empty()) {
use_limit = false;
}

return true;
}

Expand Down Expand Up @@ -133,8 +154,9 @@ void TopN::PushdownDynamicFilters(LogicalTopN &op) {
// put the filter into the Top-N clause
op.dynamic_filter = filter_data;

bool use_limit = false;
bool use_custom_rowgroup_order =
CanReorderRowGroups(op) && (colref.return_type.IsNumeric() || colref.return_type.IsTemporal());
CanReorderRowGroups(op, use_limit) && (colref.return_type.IsNumeric() || colref.return_type.IsTemporal());

for (auto &target : pushdown_targets) {
auto &get = target.get;
Expand All @@ -156,8 +178,9 @@ void TopN::PushdownDynamicFilters(LogicalTopN &op) {
auto order_type =
op.orders[0].type == OrderType::ASCENDING ? RowGroupOrderType::ASC : RowGroupOrderType::DESC;
auto order_by = order_type == RowGroupOrderType::ASC ? OrderByStatistics::MIN : OrderByStatistics::MAX;
auto order_options =
make_uniq<RowGroupOrderOptions>(column_index.GetPrimaryIndex(), order_by, order_type, column_type);
auto row_limit = use_limit ? op.limit + op.offset : optional_idx();
auto order_options = make_uniq<RowGroupOrderOptions>(column_index.GetPrimaryIndex(), order_by, order_type,
column_type, row_limit);
get.function.set_scan_order(std::move(order_options), get.bind_data.get());
}
}
Expand Down
132 changes: 118 additions & 14 deletions src/duckdb/src/storage/table/scan_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,87 @@

namespace duckdb {

namespace {

//! Mapped value of the multimap used to order row groups: a row-group segment node together with a statistics object
//! that is kept alongside it.
struct RowGroupMapEntry {
//! Non-owning reference to the segment node of the row group
reference<SegmentNode<RowGroup>> row_group;
//! Statistics owned by this entry (presumably those of the ordering column for this row group - confirm at the
//! site that fills the map)
unique_ptr<BaseStatistics> stats;
};

//! Directional strict comparison of two statistic values.
//! Returns true if `v1 < v2` when ordering by MAX, or if `v1 > v2` when ordering by MIN.
//! Any other `order` value yields false.
bool CompareValues(const Value &v1, const Value &v2, const OrderByStatistics order) {
	if (order == OrderByStatistics::MAX) {
		return v1 < v2;
	}
	if (order == OrderByStatistics::MIN) {
		return v1 > v2;
	}
	return false;
}

//! Returns how many tuples of `row_group` are counted as qualifying, based on the supplied statistics.
//! - stats cannot contain NULL: the full row group count
//! - stats may contain NULL, numeric column: 0 without min/max, 1 if the stats are constant, otherwise 2
//! - stats may contain NULL, any other column type: 0
idx_t GetQualifyingTupleCount(RowGroup &row_group, BaseStatistics &stats, const OrderByColumnType type) {
	const bool may_contain_null = stats.CanHaveNull();
	if (!may_contain_null) {
		// No NULLs possible: every tuple of the row group is counted
		return row_group.count;
	}

	if (type != OrderByColumnType::NUMERIC) {
		// We cannot check if the min/max for StringStats have actually been set. As the strings may be truncated,
		// we also cannot assume that min and max are the same
		return 0;
	}

	if (!NumericStats::HasMinMax(stats)) {
		return 0;
	}
	// With min/max present: one counted tuple when the stats are constant, two otherwise
	return NumericStats::IsConstant(stats) ? 1 : 2;
}

//! Appends the row groups of the range [it, end) to `ordered_row_groups` in iteration order, and stops as soon as the
//! number of tuples that are guaranteed to qualify reaches `row_limit`.
//!
//! Each element of the range must expose `first` (the ordering key, a Value) and `second` (an entry with `row_group`
//! and `stats` members). `std::prev` is applied to `it`, so the iterators must be bidirectional.
//! `stat_type` is the statistic used for the key comparison (presumably the same statistic the keys were built from -
//! confirm at the caller); the opposite statistic of each row group is used as its far boundary.
//!
//! Precondition: the range must be non-empty. `it` is dereferenced before it is ever compared against `end`.
template <typename It, typename End>
void AddRowGroups(It it, End end, vector<reference<SegmentNode<RowGroup>>> &ordered_row_groups, const idx_t row_limit,
const OrderByColumnType column_type, const OrderByStatistics stat_type) {
const auto opposite_stat_type =
stat_type == OrderByStatistics::MAX ? OrderByStatistics::MIN : OrderByStatistics::MAX;

// Tuples counted as guaranteed so far, and tuples whose guarantee is deferred because of equal keys
idx_t qualifying_tuples = 0;
idx_t qualify_later = 0;

// Trailing cursor: the earliest entry whose tuples have not been confirmed yet. The running sum holds the
// GetQualifyingTupleCount total of all entries up to and including this cursor, the boundary holds the cursor's
// opposite statistic.
auto last_unresolved_entry = it;
auto &last_stats = it->second.stats;
idx_t last_unresolved_row_group_sum =
GetQualifyingTupleCount(*it->second.row_group.get().node, *last_stats, column_type);
auto last_unresolved_boundary = RowGroupReorderer::RetrieveStat(*last_stats, opposite_stat_type, column_type);

for (; it != end; ++it) {
auto &current_key = it->first;
auto &row_group = it->second.row_group;

// Advance the trailing cursor for as long as the current key lies strictly past the cursor's boundary
while (last_unresolved_entry != it) {
if (!CompareValues(current_key, last_unresolved_boundary, stat_type)) {
// std::prev(it) is valid here: the cursor differs from `it`, so `it` is not the first element
if (current_key != std::prev(it)->first) {
// Row groups overlap: we can only guarantee one additional qualifying tuple
qualifying_tuples += qualify_later;
qualify_later = 0;
qualifying_tuples++;
} else {
// Row groups have the same order value, we can only guarantee a qualifying tuple later
qualify_later++;
}

break;
}
// Row groups do not overlap: we can guarantee that the tuples qualify
// NOTE(review): this assignment overwrites (rather than adds to) any increments made in the overlap branch
// above, and qualify_later is left untouched - confirm this is intended
qualifying_tuples = last_unresolved_row_group_sum;
++last_unresolved_entry;
auto &upcoming_row_group = *last_unresolved_entry->second.row_group.get().node;
auto &upcoming_stats = *last_unresolved_entry->second.stats;

last_unresolved_row_group_sum += GetQualifyingTupleCount(upcoming_row_group, upcoming_stats, column_type);
last_unresolved_boundary = RowGroupReorderer::RetrieveStat(upcoming_stats, opposite_stat_type, column_type);
}
// Limit reached: the current row group and everything after it are not appended
if (qualifying_tuples >= row_limit) {
return;
}
ordered_row_groups.emplace_back(row_group);
}
}

} // namespace

TableScanState::TableScanState() : table_state(*this), local_state(*this) {
}

Expand Down Expand Up @@ -112,7 +193,7 @@ void ScanFilterInfo::SetFilterAlwaysTrue(idx_t filter_idx) {

RowGroupReorderer::RowGroupReorderer(const RowGroupOrderOptions &options)
: column_idx(options.column_idx), order_by(options.order_by), order_type(options.order_type),
column_type(options.column_type), offset(0), initialized(false) {
column_type(options.column_type), row_limit(options.row_limit), offset(0), initialized(false) {
}

optional_ptr<SegmentNode<RowGroup>> RowGroupReorderer::GetNextRowGroup(SegmentNode<RowGroup> &row_group) {
Expand All @@ -134,6 +215,26 @@ Value RowGroupReorderer::RetrieveStat(const BaseStatistics &stats, OrderByStatis
return Value();
}

void SetRowGroupVectorWithLimit(const multimap<Value, RowGroupMapEntry> &row_group_map, const optional_idx row_limit,
const RowGroupOrderType order_type, const OrderByColumnType column_type,
vector<reference<SegmentNode<RowGroup>>> &ordered_row_groups) {
D_ASSERT(row_limit.IsValid());

const auto stat_type = order_type == RowGroupOrderType::ASC ? OrderByStatistics::MIN : OrderByStatistics::MAX;
ordered_row_groups.reserve(row_group_map.size());

Value previous_key;
if (order_type == RowGroupOrderType::ASC) {
auto it = row_group_map.begin();
auto end = row_group_map.end();
AddRowGroups(it, end, ordered_row_groups, row_limit.GetIndex(), column_type, stat_type);
} else {
auto it = row_group_map.rbegin();
auto end = row_group_map.rend();
AddRowGroups(it, end, ordered_row_groups, row_limit.GetIndex(), column_type, stat_type);
}
}

optional_ptr<SegmentNode<RowGroup>> RowGroupReorderer::GetRootSegment(RowGroupSegmentTree &row_groups) {
if (initialized) {
if (ordered_row_groups.empty()) {
Expand All @@ -144,27 +245,30 @@ optional_ptr<SegmentNode<RowGroup>> RowGroupReorderer::GetRootSegment(RowGroupSe

initialized = true;

multimap<Value, reference<SegmentNode<RowGroup>>> row_group_map;
for (auto &node : row_groups.SegmentNodes()) {
auto &row_group = *node.node;
auto stats = row_group.GetStatistics(column_idx);
multimap<Value, RowGroupMapEntry> row_group_map;
for (auto &row_group : row_groups.SegmentNodes()) {
auto stats = row_group.node->GetStatistics(column_idx);
Value comparison_value = RetrieveStat(*stats, order_by, column_type);

row_group_map.emplace(comparison_value, reference<SegmentNode<RowGroup>>(node));
auto entry = RowGroupMapEntry {row_group, std::move(stats)};
row_group_map.emplace(comparison_value, std::move(entry));
}

if (row_group_map.empty()) {
return nullptr;
}

ordered_row_groups.reserve(row_group_map.size());
if (order_type == RowGroupOrderType::ASC) {
for (auto &row_group : row_group_map) {
ordered_row_groups.emplace_back(row_group.second);
}
if (row_limit.IsValid()) {
SetRowGroupVectorWithLimit(row_group_map, row_limit, order_type, column_type, ordered_row_groups);
} else {
for (auto it = row_group_map.rbegin(); it != row_group_map.rend(); ++it) {
ordered_row_groups.emplace_back(it->second);
ordered_row_groups.reserve(row_group_map.size());
if (order_type == RowGroupOrderType::ASC) {
for (auto &row_group : row_group_map) {
ordered_row_groups.emplace_back(row_group.second.row_group);
}
} else {
for (auto it = row_group_map.rbegin(); it != row_group_map.rend(); ++it) {
ordered_row_groups.emplace_back(it->second.row_group);
}
}
}

Expand Down