From 761c03c0c0ef4b1304788834ea0b4973d1830c17 Mon Sep 17 00:00:00 2001 From: Innocent Date: Sun, 1 Feb 2026 17:22:16 -0700 Subject: [PATCH 1/2] feat: literal to type conversion --- src/iceberg/expression/json_serde.cc | 138 ++++++++++++++- src/iceberg/expression/literal.cc | 47 ++++- src/iceberg/test/expression_json_test.cc | 97 +++++++++++ src/iceberg/test/literal_test.cc | 31 ++++ src/iceberg/test/transform_util_test.cc | 210 +++++++++++++++++++++++ src/iceberg/util/string_util.h | 24 +++ src/iceberg/util/transform_util.cc | 142 +++++++++++++++ src/iceberg/util/transform_util.h | 34 ++++ 8 files changed, 713 insertions(+), 10 deletions(-) diff --git a/src/iceberg/expression/json_serde.cc b/src/iceberg/expression/json_serde.cc index 9aea284d3..af2ff5908 100644 --- a/src/iceberg/expression/json_serde.cc +++ b/src/iceberg/expression/json_serde.cc @@ -298,10 +298,140 @@ Result ToJson(const Literal& literal) { } } -Result LiteralFromJson(const nlohmann::json& json, const Type* /*type*/) { - // TODO(gangwu): implement type-aware literal parsing equivalent to Java's - // SingleValueParser.fromJson(type, node). - return LiteralFromJson(json); +Result LiteralFromJson(const nlohmann::json& json, const Type* type) { + // If {"type": "literal", "value": } wrapper is present, unwrap it first. + if (json.is_object() && json.contains(kType) && + json[kType].get() == kLiteral && json.contains(kValue)) { + return LiteralFromJson(json[kValue], type); + } + // If no type context is provided, fall back to untyped parsing. + if (type == nullptr) return LiteralFromJson(json); + + // Type-aware parsing equivalent to Java's SingleValueParser.fromJson(type, node).
+ switch (type->type_id()) { + case TypeId::kBoolean: + if (!json.is_boolean()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a boolean value", SafeDumpJson(json)); + } + return Literal::Boolean(json.get()); + + case TypeId::kInt: + if (!json.is_number_integer()) [[unlikely]] { + return JsonParseError("Cannot parse {} as an int value", SafeDumpJson(json)); + } + return Literal::Int(json.get()); + + case TypeId::kLong: + if (!json.is_number_integer()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a long value", SafeDumpJson(json)); + } + return Literal::Long(json.get()); + + case TypeId::kFloat: + if (!json.is_number()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a float value", SafeDumpJson(json)); + } + return Literal::Float(json.get()); + + case TypeId::kDouble: + if (!json.is_number()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a double value", SafeDumpJson(json)); + } + return Literal::Double(json.get()); + + case TypeId::kString: + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a string value", SafeDumpJson(json)); + } + return Literal::String(json.get()); + + case TypeId::kDate: { + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a date value", SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto days, + TransformUtil::ParseDay(json.get())); + return Literal::Date(days); + } + + case TypeId::kTime: { + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a time value", SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto micros, + TransformUtil::ParseTime(json.get())); + return Literal::Time(micros); + } + + case TypeId::kTimestamp: { + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a timestamp value", SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto micros, + TransformUtil::ParseTimestamp(json.get())); + return Literal::Timestamp(micros); + } + + case 
TypeId::kTimestampTz: { + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a timestamptz value", + SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE( + auto micros, TransformUtil::ParseTimestampWithZone(json.get())); + return Literal::TimestampTz(micros); + } + + case TypeId::kUuid: { + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a uuid value", SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto uuid, Uuid::FromString(json.get())); + return Literal::UUID(uuid); + } + + case TypeId::kBinary: { + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a binary value", SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto bytes, + StringUtils::HexStringToBytes(json.get())); + return Literal::Binary(std::move(bytes)); + } + + case TypeId::kFixed: { + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a fixed value", SafeDumpJson(json)); + } + const auto& fixed_type = internal::checked_cast(*type); + const std::string& hex = json.get(); + if (hex.size() != static_cast(fixed_type.length()) * 2) [[unlikely]] { + return JsonParseError("Cannot parse fixed[{}]: expected {} hex chars, got {}", + fixed_type.length(), fixed_type.length() * 2, hex.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto bytes, StringUtils::HexStringToBytes(hex)); + return Literal::Fixed(std::move(bytes)); + } + + case TypeId::kDecimal: { + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a decimal value", SafeDumpJson(json)); + } + const auto& dec_type = internal::checked_cast(*type); + int32_t parsed_scale = 0; + ICEBERG_ASSIGN_OR_RAISE( + auto dec, Decimal::FromString(json.get(), nullptr, &parsed_scale)); + if (parsed_scale != dec_type.scale()) [[unlikely]] { + return JsonParseError("Cannot parse {} as a {} value: the scale doesn't match", + SafeDumpJson(json), type->ToString()); + } + return Literal::Decimal(dec.value(), 
dec_type.precision(), dec_type.scale()); + } + + default: + return NotSupported("Unsupported type for literal JSON parsing: {}", + type->ToString()); + } } Result LiteralFromJson(const nlohmann::json& json) { diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index 88bafd78d..6d6f01448 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -23,11 +23,16 @@ #include #include #include +#include +#include "iceberg/type.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/conversions.h" +#include "iceberg/util/decimal.h" #include "iceberg/util/macros.h" +#include "iceberg/util/string_util.h" #include "iceberg/util/temporal_util.h" +#include "iceberg/util/transform_util.h" namespace iceberg { @@ -193,12 +198,42 @@ Result LiteralCaster::CastFromString( ICEBERG_ASSIGN_OR_RAISE(auto uuid, Uuid::FromString(str_val)); return Literal::UUID(uuid); } - case TypeId::kDate: - case TypeId::kTime: - case TypeId::kTimestamp: - case TypeId::kTimestampTz: - return NotImplemented("Cast from String to {} is not implemented yet", - target_type->ToString()); + case TypeId::kDate: { + ICEBERG_ASSIGN_OR_RAISE(auto days, TransformUtil::ParseDay(str_val)); + return Literal::Date(days); + } + case TypeId::kTime: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTime(str_val)); + return Literal::Time(micros); + } + case TypeId::kTimestamp: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTimestamp(str_val)); + return Literal::Timestamp(micros); + } + case TypeId::kTimestampTz: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, + TransformUtil::ParseTimestampWithZone(str_val)); + return Literal::TimestampTz(micros); + } + case TypeId::kBinary: { + ICEBERG_ASSIGN_OR_RAISE(auto bytes, StringUtils::HexStringToBytes(str_val)); + return Literal::Binary(std::move(bytes)); + } + case TypeId::kFixed: { + ICEBERG_ASSIGN_OR_RAISE(auto bytes, StringUtils::HexStringToBytes(str_val)); + return 
Literal::Fixed(std::move(bytes)); + } + case TypeId::kDecimal: { + const auto& dec_type = internal::checked_cast(*target_type); + int32_t parsed_scale = 0; + ICEBERG_ASSIGN_OR_RAISE(auto dec, + Decimal::FromString(str_val, nullptr, &parsed_scale)); + if (parsed_scale != dec_type.scale()) { + return InvalidArgument("Cannot cast {} as a {} value: the scale doesn't match", + str_val, target_type->ToString()); + } + return Literal::Decimal(dec.value(), dec_type.precision(), dec_type.scale()); + } default: return NotSupported("Cast from String to {} is not supported", target_type->ToString()); diff --git a/src/iceberg/test/expression_json_test.cc b/src/iceberg/test/expression_json_test.cc index 8a146b128..60aa59491 100644 --- a/src/iceberg/test/expression_json_test.cc +++ b/src/iceberg/test/expression_json_test.cc @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -31,6 +32,7 @@ #include "iceberg/expression/literal.h" #include "iceberg/expression/predicate.h" #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/test/matchers.h" #include "iceberg/type.h" @@ -405,4 +407,99 @@ INSTANTIATE_TEST_SUITE_P( return info.param.name; }); +// --- LiteralFromJson(json, type) type-aware tests --- + +struct LiteralFromJsonTypedParam { + std::string name; + nlohmann::json json; + std::shared_ptr type; + TypeId expected_type_id; + std::optional expected_str; +}; + +class LiteralFromJsonTypedTest + : public ::testing::TestWithParam {}; + +TEST_P(LiteralFromJsonTypedTest, Parses) { + const auto& p = GetParam(); + ICEBERG_UNWRAP_OR_FAIL(auto lit, LiteralFromJson(p.json, p.type.get())); + EXPECT_EQ(lit.type()->type_id(), p.expected_type_id); + if (p.expected_str) EXPECT_EQ(lit.ToString(), *p.expected_str); +} + +INSTANTIATE_TEST_SUITE_P( + LiteralFromJsonTyped, LiteralFromJsonTypedTest, + ::testing::Values( + LiteralFromJsonTypedParam{"Boolean", nlohmann::json(true), boolean(), + TypeId::kBoolean, "true"}, + LiteralFromJsonTypedParam{"Int", 
nlohmann::json(123), int32(), TypeId::kInt, + "123"}, + LiteralFromJsonTypedParam{"Long", nlohmann::json(9876543210LL), int64(), + TypeId::kLong, "9876543210"}, + LiteralFromJsonTypedParam{"Float", nlohmann::json(1.5), float32(), TypeId::kFloat, + std::nullopt}, + LiteralFromJsonTypedParam{"Double", nlohmann::json(3.14), float64(), + TypeId::kDouble, std::nullopt}, + LiteralFromJsonTypedParam{"String", nlohmann::json("hello"), string(), + TypeId::kString, std::nullopt}, + LiteralFromJsonTypedParam{"DateString", nlohmann::json("2024-01-15"), date(), + TypeId::kDate, std::nullopt}, + LiteralFromJsonTypedParam{"Uuid", + nlohmann::json("f79c3e09-677c-4bbd-a479-3f349cb785e7"), + uuid(), TypeId::kUuid, std::nullopt}, + LiteralFromJsonTypedParam{"Binary", nlohmann::json("deadbeef"), binary(), + TypeId::kBinary, std::nullopt}, + LiteralFromJsonTypedParam{"Fixed", nlohmann::json("cafebabe"), fixed(4), + TypeId::kFixed, std::nullopt}, + LiteralFromJsonTypedParam{"DecimalMatchingScale", nlohmann::json("123.4500"), + decimal(9, 4), TypeId::kDecimal, "123.4500"}, + LiteralFromJsonTypedParam{"DecimalScaleZero", nlohmann::json("2"), decimal(9, 0), + TypeId::kDecimal, "2"}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +struct InvalidLiteralFromJsonTypedParam { + std::string name; + nlohmann::json json; + std::shared_ptr type; +}; + +class InvalidLiteralFromJsonTypedTest + : public ::testing::TestWithParam {}; + +TEST_P(InvalidLiteralFromJsonTypedTest, ReturnsError) { + const auto& p = GetParam(); + EXPECT_FALSE(LiteralFromJson(p.json, p.type.get()).has_value()); +} + +INSTANTIATE_TEST_SUITE_P( + LiteralFromJsonTyped, InvalidLiteralFromJsonTypedTest, + ::testing::Values( + InvalidLiteralFromJsonTypedParam{"BooleanTypeMismatch", nlohmann::json(42), + boolean()}, + InvalidLiteralFromJsonTypedParam{"DateTypeMismatch", nlohmann::json(true), + date()}, + InvalidLiteralFromJsonTypedParam{"UuidTypeMismatch", nlohmann::json(42), uuid()}, + 
InvalidLiteralFromJsonTypedParam{"BinaryInvalidHex", nlohmann::json("xyz"), + binary()}, + InvalidLiteralFromJsonTypedParam{"FixedLengthMismatch", nlohmann::json("cafe12"), + fixed(4)}, + InvalidLiteralFromJsonTypedParam{"DecimalScaleMismatch", nlohmann::json("123.45"), + decimal(9, 4)}, + InvalidLiteralFromJsonTypedParam{"DecimalNotString", nlohmann::json(123.45), + decimal(9, 2)}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +TEST(LiteralFromJsonTyped, SchemaAwareDatePredicateRoundTrip) { + auto schema = std::make_shared( + std::vector{SchemaField::MakeOptional(1, "event_date", date())}); + nlohmann::json pred_json = { + {"type", "eq"}, {"term", "event_date"}, {"value", "2024-01-15"}}; + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(pred_json, schema.get())); + ASSERT_NE(expr, nullptr); +} + } // namespace iceberg diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc index 01a7a7ce6..97724aad9 100644 --- a/src/iceberg/test/literal_test.cc +++ b/src/iceberg/test/literal_test.cc @@ -787,6 +787,37 @@ INSTANTIATE_TEST_SUITE_P( .target_type = uuid(), .expected_literal = Literal::UUID( Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value())}, + CastLiteralTestParam{.test_name = "StringToDate", + .source_literal = Literal::String("2024-01-16"), + .target_type = date(), + .expected_literal = Literal::Date(19738)}, + CastLiteralTestParam{.test_name = "StringToTime", + .source_literal = Literal::String("14:30"), + .target_type = time(), + .expected_literal = Literal::Time(52200000000LL)}, + CastLiteralTestParam{.test_name = "StringToTimestamp", + .source_literal = Literal::String("2026-01-01T00:00:01.500"), + .target_type = timestamp(), + .expected_literal = Literal::Timestamp(1767225601500000L)}, + CastLiteralTestParam{ + .test_name = "StringToTimestampTz", + .source_literal = Literal::String("2026-01-01T00:00:01.500+00:00"), + .target_type = timestamp_tz(), + .expected_literal = 
Literal::TimestampTz(1767225601500000L)}, + CastLiteralTestParam{.test_name = "StringToBinary", + .source_literal = Literal::String("010203FF"), + .target_type = binary(), + .expected_literal = Literal::Binary(std::vector{ + 0x01, 0x02, 0x03, 0xFF})}, + CastLiteralTestParam{.test_name = "StringToFixed", + .source_literal = Literal::String("01020304"), + .target_type = fixed(4), + .expected_literal = Literal::Fixed(std::vector{ + 0x01, 0x02, 0x03, 0x04})}, + CastLiteralTestParam{.test_name = "StringToDecimal", + .source_literal = Literal::String("1234.56"), + .target_type = decimal(6, 2), + .expected_literal = Literal::Decimal(123456, 6, 2)}, // Same type cast test CastLiteralTestParam{.test_name = "IntToInt", .source_literal = Literal::Int(42), diff --git a/src/iceberg/test/transform_util_test.cc b/src/iceberg/test/transform_util_test.cc index 76f6824b3..54f36cd07 100644 --- a/src/iceberg/test/transform_util_test.cc +++ b/src/iceberg/test/transform_util_test.cc @@ -21,6 +21,8 @@ #include +#include "iceberg/test/matchers.h" + namespace iceberg { TEST(TransformUtilTest, HumanYear) { @@ -157,4 +159,212 @@ TEST(TransformUtilTest, Base64Encode) { EXPECT_EQ("AA==", TransformUtil::Base64Encode({"\x00", 1})); } +struct ParseRoundTripParam { + std::string name; + std::string str; + int64_t value; + enum Kind { kDay, kTime, kTimestamp, kTimestampTz } kind; +}; + +class ParseRoundTripTest : public ::testing::TestWithParam {}; + +TEST_P(ParseRoundTripTest, RoundTrip) { + const auto& param = GetParam(); + switch (param.kind) { + case ParseRoundTripParam::kDay: { + EXPECT_EQ(TransformUtil::HumanDay(static_cast(param.value)), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseDay(param.str)); + EXPECT_EQ(parsed, static_cast(param.value)); + break; + } + case ParseRoundTripParam::kTime: { + EXPECT_EQ(TransformUtil::HumanTime(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTime(param.str)); + EXPECT_EQ(parsed, param.value); + 
break; + } + case ParseRoundTripParam::kTimestamp: { + EXPECT_EQ(TransformUtil::HumanTimestamp(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTimestamp(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseRoundTripParam::kTimestampTz: { + EXPECT_EQ(TransformUtil::HumanTimestampWithZone(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, + TransformUtil::ParseTimestampWithZone(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + } +} + +struct ParseOnlyParam { + std::string name; + std::string str; + int64_t value; + enum Kind { kDay, kTime, kTimestamp, kTimestampTz } kind; +}; + +class ParseOnlyTest : public ::testing::TestWithParam {}; + +TEST_P(ParseOnlyTest, ParsesCorrectly) { + const auto& param = GetParam(); + switch (param.kind) { + case ParseOnlyParam::kDay: { + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseDay(param.str)); + EXPECT_EQ(parsed, static_cast(param.value)); + break; + } + case ParseOnlyParam::kTime: { + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTime(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseOnlyParam::kTimestamp: { + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTimestamp(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseOnlyParam::kTimestampTz: { + ICEBERG_UNWRAP_OR_FAIL(auto parsed, + TransformUtil::ParseTimestampWithZone(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + } +} + +struct ParseTimeErrorParam { + std::string name; + std::string str; +}; + +class ParseTimeErrorTest : public ::testing::TestWithParam {}; + +TEST_P(ParseTimeErrorTest, ReturnsError) { + EXPECT_THAT(TransformUtil::ParseTime(GetParam().str), + IsError(ErrorKind::kInvalidArgument)); +} + +INSTANTIATE_TEST_SUITE_P( + TransformUtilTest, ParseRoundTripTest, + ::testing::Values( + // Day round-trips + ParseRoundTripParam{"DayEpoch", "1970-01-01", 0, ParseRoundTripParam::kDay}, + 
ParseRoundTripParam{"DayNext", "1970-01-02", 1, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayBeforeEpoch", "1969-12-31", -1, + ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayYear999", "0999-12-31", -354286, + ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayNonLeap", "1971-01-01", 365, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayY2K", "2000-01-01", 10957, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"Day2026", "2026-01-01", 20454, ParseRoundTripParam::kDay}, + // Time round-trips + ParseRoundTripParam{"TimeMidnight", "00:00", 0, ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeOneSec", "00:00:01", 1000000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeMillis", "00:00:01.500", 1500000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeOneMillis", "00:00:01.001", 1001000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeMicros", "00:00:01.000001", 1000001, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeHourMinSec", "01:02:03", 3723000000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeEndOfDay", "23:59:59", 86399000000, + ParseRoundTripParam::kTime}, + // Timestamp round-trips + ParseRoundTripParam{"TimestampEpoch", "1970-01-01T00:00:00", 0, + ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampOneSec", "1970-01-01T00:00:01", 1000000, + ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampMillis", "2026-01-01T00:00:01.500", + 1767225601500000L, ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampOneMillis", "2026-01-01T00:00:01.001", + 1767225601001000L, ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampMicros", "2026-01-01T00:00:01.000001", + 1767225601000001L, ParseRoundTripParam::kTimestamp}, + // TimestampTz round-trips + ParseRoundTripParam{"TimestampTzEpoch", "1970-01-01T00:00:00+00:00", 0, + ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzOneSec", "1970-01-01T00:00:01+00:00", 1000000, + 
ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzMillis", "2026-01-01T00:00:01.500+00:00", + 1767225601500000L, ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzOneMillis", "2026-01-01T00:00:01.001+00:00", + 1767225601001000L, ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzMicros", "2026-01-01T00:00:01.000001+00:00", + 1767225601000001L, ParseRoundTripParam::kTimestampTz}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +INSTANTIATE_TEST_SUITE_P( + TransformUtilTest, ParseOnlyTest, + ::testing::Values( + // TimestampTz with "Z" suffix + ParseOnlyParam{"TimestampTzSuffixZ_Epoch", "1970-01-01T00:00:00Z", 0, + ParseOnlyParam::kTimestampTz}, + ParseOnlyParam{"TimestampTzSuffixZ_Millis", "2026-01-01T00:00:01.500Z", + 1767225601500000L, ParseOnlyParam::kTimestampTz}, + // TimestampTz with "-00:00" suffix + ParseOnlyParam{"TimestampTzNegZero_Epoch", "1970-01-01T00:00:00-00:00", 0, + ParseOnlyParam::kTimestampTz}, + ParseOnlyParam{"TimestampTzNegZero_Millis", "2026-01-01T00:00:01.500-00:00", + 1767225601500000L, ParseOnlyParam::kTimestampTz}, + // Fractional micros truncates nanos + ParseOnlyParam{"TimeTruncatesNanos", "00:00:01.123456789", 1123456, + ParseOnlyParam::kTime}, + // Fractional seconds (trimmed trailing zeros) + ParseOnlyParam{"1Digit", "00:00:01.5", 1500000, ParseOnlyParam::kTime}, + ParseOnlyParam{"2Digits", "00:00:01.50", 1500000, ParseOnlyParam::kTime}, + ParseOnlyParam{"2DigitsNonZero", "00:00:01.12", 1120000, ParseOnlyParam::kTime}, + ParseOnlyParam{"4Digits", "00:00:01.0001", 1000100, ParseOnlyParam::kTime}, + // Timestamp without seconds + ParseOnlyParam{"TimestampNoSec_Zero", "1970-01-01T00:00", 0, + ParseOnlyParam::kTimestamp}, + ParseOnlyParam{"TimestampNoSec_OneMin", "1970-01-01T00:01", 60000000, + ParseOnlyParam::kTimestamp}, + // TimestampTz without seconds + ParseOnlyParam{"TimestampTzNoSec_Offset", "1970-01-01T00:00+00:00", 0, + 
ParseOnlyParam::kTimestampTz}, + ParseOnlyParam{"TimestampTzNoSec_OneMin", "1970-01-01T00:01+00:00", 60000000, + ParseOnlyParam::kTimestampTz}, + ParseOnlyParam{"TimestampTzNoSec_Z", "1970-01-01T00:00Z", 0, + ParseOnlyParam::kTimestampTz}, + // Extended year with '+' prefix + ParseOnlyParam{"ExtendedYearPlusEpoch", "+1970-01-01", 0, ParseOnlyParam::kDay}, + ParseOnlyParam{"ExtendedYearPlus2026", "+2026-01-01", 20454, + ParseOnlyParam::kDay}, + ParseOnlyParam{"ExtendedYearMinus2026", "-2026-01-01", -1459509, + ParseOnlyParam::kDay}, + // Non-UTC timezone offsets + ParseOnlyParam{"TimestampTzPositiveOffset", "1970-01-01T05:00:00+05:00", 0, + ParseOnlyParam::kTimestampTz}, + ParseOnlyParam{"TimestampTzNegativeOffset", "1970-01-01T00:00:00-05:00", + 18000000000, ParseOnlyParam::kTimestampTz}, + ParseOnlyParam{"TimestampTzOffsetWithMillis", "2026-01-01T05:30:01.500+05:30", + 1767225601500000L, ParseOnlyParam::kTimestampTz}, + ParseOnlyParam{"TimestampTzNegOffsetToEpoch", "1969-12-31T19:00:00-05:00", 0, + ParseOnlyParam::kTimestampTz}, + ParseOnlyParam{"TimestampTzNoSecWithOffset", "1970-01-01T05:30+05:30", 0, + ParseOnlyParam::kTimestampTz}), + [](const ::testing::TestParamInfo& info) { return info.param.name; }); + +INSTANTIATE_TEST_SUITE_P( + TransformUtilTest, ParseTimeErrorTest, + ::testing::Values(ParseTimeErrorParam{"EmptyString", ""}, + ParseTimeErrorParam{"TooShort1Char", "1"}, + ParseTimeErrorParam{"TooShort2Chars", "12"}, + ParseTimeErrorParam{"TooShort4Chars", "12:3"}, + ParseTimeErrorParam{"MissingColon", "1200:00"}, + ParseTimeErrorParam{"OutofRangeHours", "24:00:00"}, + ParseTimeErrorParam{"OutofRangeMinutes", "12:60:00"}, + ParseTimeErrorParam{"OutofRangeSeconds", "12:30:61"}, + ParseTimeErrorParam{"SpaceInsteadOfColon", "12 30"}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + } // namespace iceberg diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index fc202f0e6..da925a64c 100644 --- 
a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -28,9 +28,11 @@ #include #include #include +#include #include "iceberg/iceberg_export.h" #include "iceberg/result.h" +#include "iceberg/util/macros.h" namespace iceberg { @@ -91,6 +93,28 @@ class ICEBERG_EXPORT StringUtils { std::unreachable(); } + /// \brief Decode a hex string (upper or lower case) into bytes. + /// Returns an error if the string has odd length or contains invalid hex characters. + static Result> HexStringToBytes(std::string_view hex) { + if (hex.size() % 2 != 0) [[unlikely]] { + return InvalidArgument("Hex string must have even length, got: {}", hex.size()); + } + std::vector bytes; + bytes.reserve(hex.size() / 2); + auto nibble = [](char c) -> Result { + if (c >= '0' && c <= '9') return static_cast(c - '0'); + if (c >= 'a' && c <= 'f') return static_cast(c - 'a' + 10); + if (c >= 'A' && c <= 'F') return static_cast(c - 'A' + 10); + return InvalidArgument("Invalid hex character: '{}'", c); + }; + for (size_t i = 0; i < hex.size(); i += 2) { + ICEBERG_ASSIGN_OR_RAISE(auto hi, nibble(hex[i])); + ICEBERG_ASSIGN_OR_RAISE(auto lo, nibble(hex[i + 1])); + bytes.push_back(static_cast((hi << 4) | lo)); + } + return bytes; + } + template requires std::is_floating_point_v && (!FromChars) static Result ParseNumber(std::string_view str) { diff --git a/src/iceberg/util/transform_util.cc b/src/iceberg/util/transform_util.cc index fe1523437..35aaceddb 100644 --- a/src/iceberg/util/transform_util.cc +++ b/src/iceberg/util/transform_util.cc @@ -22,12 +22,50 @@ #include #include +#include "iceberg/util/macros.h" +#include "iceberg/util/string_util.h" + namespace iceberg { namespace { constexpr auto kEpochDate = std::chrono::year{1970} / std::chrono::January / 1; constexpr int64_t kMicrosPerMillis = 1'000; constexpr int64_t kMicrosPerSecond = 1'000'000; +constexpr int64_t kMicrosPerDay = 86'400'000'000LL; + +/// Parse a timezone offset of the form "+HH:mm" or "-HH:mm" and return the +/// 
offset in microseconds (positive for east of UTC, negative for west). +Result ParseTimezoneOffset(std::string_view offset) { + if (offset.size() != 6 || (offset[0] != '+' && offset[0] != '-') || offset[3] != ':') { + return InvalidArgument("Invalid timezone offset: '{}'", offset); + } + bool negative = offset[0] == '-'; + ICEBERG_ASSIGN_OR_RAISE(auto hours, + StringUtils::ParseNumber(offset.substr(1, 2))); + ICEBERG_ASSIGN_OR_RAISE(auto minutes, + StringUtils::ParseNumber(offset.substr(4, 2))); + if (hours > 18 || minutes > 59) { + return InvalidArgument("Invalid timezone offset: '{}'", offset); + } + auto micros = hours * 3'600 * kMicrosPerSecond + minutes * 60 * kMicrosPerSecond; + return negative ? -micros : micros; +} + +/// Parse fractional seconds (after '.') and return micros. +/// Digits beyond 6 are truncated (nanosecond precision). +Result ParseFractionalMicros(std::string_view frac) { + if (frac.empty()) { + return InvalidArgument("Invalid fractional seconds: '{}'", frac); + } + // Truncate to microsecond precision (6 digits), matching Java ISO_LOCAL_TIME behavior + if (frac.size() > 6) frac = frac.substr(0, 6); + ICEBERG_ASSIGN_OR_RAISE(auto val, StringUtils::ParseNumber(frac)); + // Right-pad to 6 digits: "500" → 500000, "001" → 1000, "000001" → 1 + for (size_t i = frac.size(); i < 6; ++i) { + val *= 10; + } + return static_cast(val); +} } // namespace std::string TransformUtil::HumanYear(int32_t year_ordinal) { @@ -92,6 +130,110 @@ std::string TransformUtil::HumanTimestampWithZone(int64_t timestamp_micros) { } } +Result TransformUtil::ParseDay(std::string_view str) { + // Expected format: "[+-]yyyy-MM-dd" + // Parse year, month, day manually, skipping leading '+' or '-' to find first date dash + auto dash1 = str.find('-', (!str.empty() && (str[0] == '-' || str[0] == '+')) ? 
1 : 0); + auto dash2 = str.find('-', dash1 + 1); + if (str.size() < 10 || dash1 == std::string_view::npos || + dash2 == std::string_view::npos) [[unlikely]] { + return InvalidArgument("Invalid date string: '{}'", str); + } + auto year_str = str.substr(0, dash1); + // std::from_chars does not accept '+' prefix, strip it for positive extended years + if (!year_str.empty() && year_str[0] == '+') { + year_str = year_str.substr(1); + } + ICEBERG_ASSIGN_OR_RAISE(auto year, StringUtils::ParseNumber(year_str)); + ICEBERG_ASSIGN_OR_RAISE(auto month, StringUtils::ParseNumber( + str.substr(dash1 + 1, dash2 - dash1 - 1))); + ICEBERG_ASSIGN_OR_RAISE(auto day, + StringUtils::ParseNumber(str.substr(dash2 + 1))); + + auto ymd = std::chrono::year{year} / std::chrono::month{static_cast(month)} / + std::chrono::day{static_cast(day)}; + if (!ymd.ok()) [[unlikely]] { + return InvalidArgument("Invalid date: '{}'", str); + } + + auto days = std::chrono::sys_days(ymd) - std::chrono::sys_days(kEpochDate); + return static_cast(days.count()); +} + +Result TransformUtil::ParseTime(std::string_view str) { + if (str.size() < 5 || str[2] != ':') [[unlikely]] { + return InvalidArgument("Invalid time string: '{}'", str); + } + + ICEBERG_ASSIGN_OR_RAISE(auto hours, + StringUtils::ParseNumber(str.substr(0, 2))); + ICEBERG_ASSIGN_OR_RAISE(auto minutes, + StringUtils::ParseNumber(str.substr(3, 2))); + int64_t seconds = 0; + + int64_t frac_micros = 0; + if (str.size() > 5) { + if (str[5] != ':' || str.size() < 8) [[unlikely]] { + return InvalidArgument("Invalid time string: '{}'", str); + } + ICEBERG_ASSIGN_OR_RAISE(seconds, StringUtils::ParseNumber(str.substr(6, 2))); + if (str.size() > 8) { + if (str[8] != '.') [[unlikely]] { + return InvalidArgument("Invalid time string: '{}'", str); + } + ICEBERG_ASSIGN_OR_RAISE(frac_micros, ParseFractionalMicros(str.substr(9))); + } + } + + // check that hours, minutes, seconds are in valid ranges + if (hours < 0 || hours > 23 || minutes < 0 || minutes > 59 || 
seconds < 0 || + seconds > 60) [[unlikely]] { + return InvalidArgument("Invalid time string: '{}'", str); + } + + return hours * 3'600 * kMicrosPerSecond + minutes * 60 * kMicrosPerSecond + + seconds * kMicrosPerSecond + frac_micros; +} + +Result TransformUtil::ParseTimestamp(std::string_view str) { + // Format: "yyyy-MM-ddTHH:mm:ss[.SSS[SSS]]" + auto t_pos = str.find('T'); + if (t_pos == std::string_view::npos) [[unlikely]] { + return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", str); + } + + ICEBERG_ASSIGN_OR_RAISE(auto days, ParseDay(str.substr(0, t_pos))); + ICEBERG_ASSIGN_OR_RAISE(auto time_micros, ParseTime(str.substr(t_pos + 1))); + + return static_cast(days) * kMicrosPerDay + time_micros; +} + +Result TransformUtil::ParseTimestampWithZone(std::string_view str) { + if (str.empty()) [[unlikely]] { + return InvalidArgument("Invalid timestamptz string: '{}'", str); + } + + int64_t offset_micros = 0; + std::string_view timestamp_part; + + if (str.back() == 'Z') { + // "Z" suffix means UTC (offset = 0) + timestamp_part = str.substr(0, str.size() - 1); + } else if (str.size() >= 6 && + (str[str.size() - 6] == '+' || str[str.size() - 6] == '-')) { + // Parse "+HH:mm" or "-HH:mm" offset suffix + ICEBERG_ASSIGN_OR_RAISE(offset_micros, + ParseTimezoneOffset(str.substr(str.size() - 6))); + timestamp_part = str.substr(0, str.size() - 6); + } else { + return InvalidArgument("Invalid timestamptz string (missing timezone suffix): '{}'", + str); + } + + ICEBERG_ASSIGN_OR_RAISE(auto local_micros, ParseTimestamp(timestamp_part)); + return local_micros - offset_micros; +} + std::string TransformUtil::Base64Encode(std::string_view str_to_encode) { static constexpr std::string_view kBase64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; diff --git a/src/iceberg/util/transform_util.h b/src/iceberg/util/transform_util.h index 7482b0dba..2be8c69b6 100644 --- a/src/iceberg/util/transform_util.h +++ b/src/iceberg/util/transform_util.h @@ 
-22,6 +22,7 @@ #include #include "iceberg/iceberg_export.h" +#include "iceberg/result.h" namespace iceberg { @@ -98,6 +99,39 @@ class ICEBERG_EXPORT TransformUtil { /// \return a string representation of this timestamp. static std::string HumanTimestampWithZone(int64_t timestamp_micros); + /// \brief Parses a date string in "[+-]yyyy-MM-dd" format into days since epoch. + /// + /// Supports an optional '+' or '-' prefix for extended years beyond 9999. + /// + /// \param str The date string to parse. + /// \return The number of days since 1970-01-01, or an error. + static Result ParseDay(std::string_view str); + + /// \brief Parses a time string into microseconds from midnight. + /// + /// Accepts: "HH:mm", "HH:mm:ss", "HH:mm:ss.SSS", "HH:mm:ss.SSSSSS". + /// + /// \param str The time string to parse. + /// \return The number of microseconds from midnight, or an error. + static Result ParseTime(std::string_view str); + + /// \brief Parses a timestamp string into microseconds since epoch. + /// + /// Accepts: "yyyy-MM-ddTHH:mm:ss", with optional fractional seconds (.SSS or .SSSSSS). + /// + /// \param str The timestamp string to parse. + /// \return The number of microseconds since epoch, or an error. + static Result ParseTimestamp(std::string_view str); + + /// \brief Parses a timestamp-with-zone string into microseconds since epoch (UTC). + /// + /// Accepts the same formats as ParseTimestamp, with a timezone suffix: + /// "Z", "+HH:mm", or "-HH:mm". Non-UTC offsets are converted to UTC. + /// + /// \param str The timestamp string to parse. + /// \return The number of microseconds since epoch (UTC), or an error. 
+ static Result ParseTimestampWithZone(std::string_view str); + /// \brief Base64 encode a string static std::string Base64Encode(std::string_view str_to_encode); }; From 4d02577753d0607ac4877df45bde45a264131421 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 18 Mar 2026 21:57:48 +0800 Subject: [PATCH 2/2] minor fixes --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/expression/json_serde.cc | 27 +++++++++----- src/iceberg/expression/literal.cc | 17 ++++++--- src/iceberg/meson.build | 1 + src/iceberg/test/expression_json_test.cc | 40 ++++++++++++++++++--- src/iceberg/util/string_util.cc | 46 ++++++++++++++++++++++++ src/iceberg/util/string_util.h | 25 +++---------- src/iceberg/util/transform_util.cc | 2 +- src/iceberg/util/transform_util.h | 9 +++-- 9 files changed, 127 insertions(+), 41 deletions(-) create mode 100644 src/iceberg/util/string_util.cc diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 21e87bee4..ada9b473a 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -108,6 +108,7 @@ set(ICEBERG_SOURCES util/murmurhash3_internal.cc util/property_util.cc util/snapshot_util.cc + util/string_util.cc util/temporal_util.cc util/timepoint.cc util/transform_util.cc diff --git a/src/iceberg/expression/json_serde.cc b/src/iceberg/expression/json_serde.cc index af2ff5908..38e7a8e2f 100644 --- a/src/iceberg/expression/json_serde.cc +++ b/src/iceberg/expression/json_serde.cc @@ -17,6 +17,7 @@ * under the License. 
*/ +#include #include #include @@ -315,11 +316,18 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { } return Literal::Boolean(json.get()); - case TypeId::kInt: + case TypeId::kInt: { if (!json.is_number_integer()) [[unlikely]] { return JsonParseError("Cannot parse {} as an int value", SafeDumpJson(json)); } - return Literal::Int(json.get()); + auto val = json.get(); + if (val < std::numeric_limits::min() || + val > std::numeric_limits::max()) [[unlikely]] { + return JsonParseError("Cannot parse {} as an int value: out of range", + SafeDumpJson(json)); + } + return Literal::Int(static_cast(val)); + } case TypeId::kLong: if (!json.is_number_integer()) [[unlikely]] { @@ -328,13 +336,13 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { return Literal::Long(json.get()); case TypeId::kFloat: - if (!json.is_number()) [[unlikely]] { + if (!json.is_number_float()) [[unlikely]] { return JsonParseError("Cannot parse {} as a float value", SafeDumpJson(json)); } return Literal::Float(json.get()); case TypeId::kDouble: - if (!json.is_number()) [[unlikely]] { + if (!json.is_number_float()) [[unlikely]] { return JsonParseError("Cannot parse {} as a double value", SafeDumpJson(json)); } return Literal::Double(json.get()); @@ -418,12 +426,15 @@ Result LiteralFromJson(const nlohmann::json& json, const Type* type) { return JsonParseError("Cannot parse {} as a decimal value", SafeDumpJson(json)); } const auto& dec_type = internal::checked_cast(*type); + int32_t parsed_precision = 0; int32_t parsed_scale = 0; ICEBERG_ASSIGN_OR_RAISE( - auto dec, Decimal::FromString(json.get(), nullptr, &parsed_scale)); - if (parsed_scale != dec_type.scale()) [[unlikely]] { - return JsonParseError("Cannot parse {} as a {} value: the scale doesn't match", - SafeDumpJson(json), type->ToString()); + auto dec, + Decimal::FromString(json.get(), &parsed_precision, &parsed_scale)); + if (parsed_precision > dec_type.precision() || parsed_scale != dec_type.scale()) 
+ [[unlikely]] { + return JsonParseError("Cannot parse {} as a {} value", SafeDumpJson(json), + type->ToString()); } return Literal::Decimal(dec.value(), dec_type.precision(), dec_type.scale()); } diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index 6d6f01448..9b8060a19 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -220,17 +220,24 @@ Result LiteralCaster::CastFromString( return Literal::Binary(std::move(bytes)); } case TypeId::kFixed: { + const auto& fixed_type = internal::checked_cast(*target_type); + if (str_val.size() != static_cast(fixed_type.length()) * 2) { + return InvalidArgument("Cannot cast string to {}: expected {} hex chars, got {}", + target_type->ToString(), fixed_type.length() * 2, + str_val.size()); + } ICEBERG_ASSIGN_OR_RAISE(auto bytes, StringUtils::HexStringToBytes(str_val)); return Literal::Fixed(std::move(bytes)); } case TypeId::kDecimal: { const auto& dec_type = internal::checked_cast(*target_type); + int32_t parsed_precision = 0; int32_t parsed_scale = 0; - ICEBERG_ASSIGN_OR_RAISE(auto dec, - Decimal::FromString(str_val, nullptr, &parsed_scale)); - if (parsed_scale != dec_type.scale()) { - return InvalidArgument("Cannot cast {} as a {} value: the scale doesn't match", - str_val, target_type->ToString()); + ICEBERG_ASSIGN_OR_RAISE( + auto dec, Decimal::FromString(str_val, &parsed_precision, &parsed_scale)); + if (parsed_precision > dec_type.precision() || parsed_scale != dec_type.scale()) { + return InvalidArgument("Cannot cast {} as a {} value", str_val, + target_type->ToString()); } return Literal::Decimal(dec.value(), dec_type.precision(), dec_type.scale()); } diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index bfc502fd8..81af8dc30 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -126,6 +126,7 @@ iceberg_sources = files( 'util/murmurhash3_internal.cc', 'util/property_util.cc', 'util/snapshot_util.cc', + 
'util/string_util.cc', 'util/temporal_util.cc', 'util/timepoint.cc', 'util/transform_util.cc', diff --git a/src/iceberg/test/expression_json_test.cc b/src/iceberg/test/expression_json_test.cc index 60aa59491..7b978ef70 100644 --- a/src/iceberg/test/expression_json_test.cc +++ b/src/iceberg/test/expression_json_test.cc @@ -424,7 +424,9 @@ TEST_P(LiteralFromJsonTypedTest, Parses) { const auto& p = GetParam(); ICEBERG_UNWRAP_OR_FAIL(auto lit, LiteralFromJson(p.json, p.type.get())); EXPECT_EQ(lit.type()->type_id(), p.expected_type_id); - if (p.expected_str) EXPECT_EQ(lit.ToString(), *p.expected_str); + if (p.expected_str) { + EXPECT_EQ(lit.ToString(), *p.expected_str); + } } INSTANTIATE_TEST_SUITE_P( @@ -493,13 +495,41 @@ INSTANTIATE_TEST_SUITE_P( return info.param.name; }); -TEST(LiteralFromJsonTyped, SchemaAwareDatePredicateRoundTrip) { +struct SchemaAwarePredicateParam { + std::string name; + std::string field_name; + std::shared_ptr field_type; + nlohmann::json value; +}; + +class SchemaAwarePredicateRoundTripTest + : public ::testing::TestWithParam {}; + +TEST_P(SchemaAwarePredicateRoundTripTest, RoundTrip) { + const auto& p = GetParam(); auto schema = std::make_shared( - std::vector{SchemaField::MakeOptional(1, "event_date", date())}); - nlohmann::json pred_json = { - {"type", "eq"}, {"term", "event_date"}, {"value", "2024-01-15"}}; + std::vector{SchemaField::MakeOptional(1, p.field_name, p.field_type)}); + nlohmann::json pred_json = {{"type", "eq"}, {"term", p.field_name}, {"value", p.value}}; ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(pred_json, schema.get())); ASSERT_NE(expr, nullptr); } +INSTANTIATE_TEST_SUITE_P( + LiteralFromJsonTyped, SchemaAwarePredicateRoundTripTest, + ::testing::Values( + SchemaAwarePredicateParam{"Date", "event_date", date(), "2024-01-15"}, + SchemaAwarePredicateParam{"Time", "event_time", time(), "14:30:00"}, + SchemaAwarePredicateParam{"Timestamp", "created_at", timestamp(), + "2026-01-01T00:00:01.500"}, + 
SchemaAwarePredicateParam{"TimestampTz", "updated_at", timestamp_tz(), + "2026-01-01T00:00:01.500+00:00"}, + SchemaAwarePredicateParam{"Uuid", "trace_id", uuid(), + "f79c3e09-677c-4bbd-a479-3f349cb785e7"}, + SchemaAwarePredicateParam{"Binary", "payload", binary(), "deadbeef"}, + SchemaAwarePredicateParam{"Fixed", "hash", fixed(4), "cafebabe"}, + SchemaAwarePredicateParam{"Decimal", "amount", decimal(9, 2), "123.45"}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + } // namespace iceberg diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc new file mode 100644 index 000000000..0454a62b5 --- /dev/null +++ b/src/iceberg/util/string_util.cc @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/string_util.h" + +#include "iceberg/util/macros.h" + +namespace iceberg { + +Result> StringUtils::HexStringToBytes(std::string_view hex) { + if (hex.size() % 2 != 0) [[unlikely]] { + return InvalidArgument("Hex string must have even length, got: {}", hex.size()); + } + std::vector bytes; + bytes.reserve(hex.size() / 2); + auto nibble = [](char c) -> Result { + if (c >= '0' && c <= '9') return static_cast(c - '0'); + if (c >= 'a' && c <= 'f') return static_cast(c - 'a' + 10); + if (c >= 'A' && c <= 'F') return static_cast(c - 'A' + 10); + return InvalidArgument("Invalid hex character: '{}'", c); + }; + for (size_t i = 0; i < hex.size(); i += 2) { + ICEBERG_ASSIGN_OR_RAISE(auto hi, nibble(hex[i])); + ICEBERG_ASSIGN_OR_RAISE(auto lo, nibble(hex[i + 1])); + bytes.push_back(static_cast((hi << 4) | lo)); + } + return bytes; +} + +} // namespace iceberg diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index da925a64c..36dfba30f 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -32,7 +32,6 @@ #include "iceberg/iceberg_export.h" #include "iceberg/result.h" -#include "iceberg/util/macros.h" namespace iceberg { @@ -80,6 +79,10 @@ class ICEBERG_EXPORT StringUtils { T value = 0; auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.size(), value); if (ec == std::errc()) [[likely]] { + if (ptr != str.data() + str.size()) { + return InvalidArgument("Failed to parse {} from string '{}': trailing characters", + typeid(T).name(), str); + } return value; } if (ec == std::errc::invalid_argument) { @@ -95,25 +98,7 @@ class ICEBERG_EXPORT StringUtils { /// \brief Decode a hex string (upper or lower case) into bytes. /// Returns an error if the string has odd length or contains invalid hex characters. 
- static Result> HexStringToBytes(std::string_view hex) { - if (hex.size() % 2 != 0) [[unlikely]] { - return InvalidArgument("Hex string must have even length, got: {}", hex.size()); - } - std::vector bytes; - bytes.reserve(hex.size() / 2); - auto nibble = [](char c) -> Result { - if (c >= '0' && c <= '9') return static_cast(c - '0'); - if (c >= 'a' && c <= 'f') return static_cast(c - 'a' + 10); - if (c >= 'A' && c <= 'F') return static_cast(c - 'A' + 10); - return InvalidArgument("Invalid hex character: '{}'", c); - }; - for (size_t i = 0; i < hex.size(); i += 2) { - ICEBERG_ASSIGN_OR_RAISE(auto hi, nibble(hex[i])); - ICEBERG_ASSIGN_OR_RAISE(auto lo, nibble(hex[i + 1])); - bytes.push_back(static_cast((hi << 4) | lo)); - } - return bytes; - } + static Result> HexStringToBytes(std::string_view hex); template requires std::is_floating_point_v && (!FromChars) diff --git a/src/iceberg/util/transform_util.cc b/src/iceberg/util/transform_util.cc index 35aaceddb..a9221310e 100644 --- a/src/iceberg/util/transform_util.cc +++ b/src/iceberg/util/transform_util.cc @@ -187,7 +187,7 @@ Result TransformUtil::ParseTime(std::string_view str) { // check that hours, minutes, seconds are in valid ranges if (hours < 0 || hours > 23 || minutes < 0 || minutes > 59 || seconds < 0 || - seconds > 60) [[unlikely]] { + seconds > 59) [[unlikely]] { return InvalidArgument("Invalid time string: '{}'", str); } diff --git a/src/iceberg/util/transform_util.h b/src/iceberg/util/transform_util.h index 2be8c69b6..c23d08c8c 100644 --- a/src/iceberg/util/transform_util.h +++ b/src/iceberg/util/transform_util.h @@ -109,7 +109,9 @@ class ICEBERG_EXPORT TransformUtil { /// \brief Parses a time string into microseconds from midnight. /// - /// Accepts: "HH:mm", "HH:mm:ss", "HH:mm:ss.SSS", "HH:mm:ss.SSSSSS". + /// Accepts ISO-8601 local time formats: "HH:mm", "HH:mm:ss", or + /// "HH:mm:ss.f" where the fractional part can be 1-9 digits. + /// Digits beyond 6 (microsecond precision) are truncated. 
/// /// \param str The time string to parse. /// \return The number of microseconds from midnight, or an error. @@ -117,7 +119,9 @@ class ICEBERG_EXPORT TransformUtil { /// \brief Parses a timestamp string into microseconds since epoch. /// - /// Accepts: "yyyy-MM-ddTHH:mm:ss", with optional fractional seconds (.SSS or .SSSSSS). + /// Accepts ISO-8601 local date-time formats: "yyyy-MM-ddTHH:mm", + /// "yyyy-MM-ddTHH:mm:ss", or "yyyy-MM-ddTHH:mm:ss.f" where the + /// fractional part can be 1-9 digits (truncated to microseconds). /// /// \param str The timestamp string to parse. /// \return The number of microseconds since epoch, or an error. @@ -127,6 +131,7 @@ class ICEBERG_EXPORT TransformUtil { /// /// Accepts the same formats as ParseTimestamp, with a timezone suffix: /// "Z", "+HH:mm", or "-HH:mm". Non-UTC offsets are converted to UTC. + /// The seconds and fractional parts are optional (e.g. "yyyy-MM-ddTHH:mm+00:00"). /// /// \param str The timestamp string to parse. /// \return The number of microseconds since epoch (UTC), or an error.