From d28bcd0e1d726d7c7f4787582da86366f3a34dc6 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Mon, 8 Sep 2025 11:20:32 -0700 Subject: [PATCH 1/8] Added a whole bunch of models --- pyiceberg/catalog/rest/expression.py | 113 ++++++++ pyiceberg/catalog/rest/planning_models.py | 309 ++++++++++++++++++++++ 2 files changed, 422 insertions(+) create mode 100644 pyiceberg/catalog/rest/expression.py create mode 100644 pyiceberg/catalog/rest/planning_models.py diff --git a/pyiceberg/catalog/rest/expression.py b/pyiceberg/catalog/rest/expression.py new file mode 100644 index 0000000000..60792241ed --- /dev/null +++ b/pyiceberg/catalog/rest/expression.py @@ -0,0 +1,113 @@ +from typing import Any, Dict, List, Union + +from pydantic import BaseModel, Field + +from pyiceberg.typedef import IcebergBaseModel + + +class ExpressionType(BaseModel): + __root__: str = Field( + ..., + example=[ + "true", + "false", + "eq", + "and", + "or", + "not", + "in", + "not-in", + "lt", + "lt-eq", + "gt", + "gt-eq", + "not-eq", + "starts-with", + "not-starts-with", + "is-null", + "not-null", + "is-nan", + "not-nan", + ], + ) + + +class TrueExpression(IcebergBaseModel): + type: ExpressionType = Field(default_factory=lambda: ExpressionType.parse_obj("true"), const=True) + + +class FalseExpression(IcebergBaseModel): + type: ExpressionType = Field(default_factory=lambda: ExpressionType.parse_obj("false"), const=True) + + +class Transform(IcebergBaseModel): + __root__: str = Field( + ..., + example=[ + "identity", + "year", + "month", + "day", + "hour", + "bucket[256]", + "truncate[16]", + ], + ) + + +class Reference(IcebergBaseModel): + __root__: str = Field(..., example=["column-name"]) + + +class TransformTerm(IcebergBaseModel): + type: str = Field("transform", const=True) + transform: Transform + term: Reference + + +class Term(BaseModel): + __root__: Union[Reference, TransformTerm] + + +class AndOrExpression(IcebergBaseModel): + type: ExpressionType + left: "Expression" + right: "Expression" + + 
+class NotExpression(IcebergBaseModel): + type: ExpressionType = Field(default_factory=lambda: ExpressionType.parse_obj("not"), const=True) + child: "Expression" + + +class SetExpression(IcebergBaseModel): + type: ExpressionType + term: Term + values: List[Dict[str, Any]] + + +class LiteralExpression(IcebergBaseModel): + type: ExpressionType + term: Term + value: Dict[str, Any] + + +class UnaryExpression(IcebergBaseModel): + type: ExpressionType + term: Term + value: Dict[str, Any] + + +class Expression(IcebergBaseModel): + __root__: Union[ + TrueExpression, + FalseExpression, + AndOrExpression, + NotExpression, + SetExpression, + LiteralExpression, + UnaryExpression, + ] + + +Expression.model_rebuild() diff --git a/pyiceberg/catalog/rest/planning_models.py b/pyiceberg/catalog/rest/planning_models.py new file mode 100644 index 0000000000..02d038c511 --- /dev/null +++ b/pyiceberg/catalog/rest/planning_models.py @@ -0,0 +1,309 @@ +from datetime import date +from typing import List, Literal, Optional, Union +from uuid import UUID + +from pydantic import Field + +from pyiceberg.catalog.rest.expression import Expression +from pyiceberg.typedef import IcebergBaseModel + + +class FieldName(IcebergBaseModel): + __root__: str = Field( + ..., + description="A full field name (including parent field names), such as those passed in APIs like Java `Schema#findField(String name)`.\nThe nested field name follows these rules - Nested struct fields are named by concatenating field names at each struct level using dot (`.`) delimiter, e.g. employer.contact_info.address.zip_code - Nested fields in a map key are named using the keyword `key`, e.g. employee_address_map.key.first_name - Nested fields in a map value are named using the keyword `value`, e.g. employee_address_map.value.zip_code - Nested fields in a list are named using the keyword `element`, e.g. 
employees.element.first_name", + ) + + +class BooleanTypeValue(IcebergBaseModel): + __root__: bool = Field(..., example=True) + + +class IntegerTypeValue(IcebergBaseModel): + __root__: int = Field(..., example=42) + + +class LongTypeValue(IcebergBaseModel): + __root__: int = Field(..., example=9223372036854775807) + + +class FloatTypeValue(IcebergBaseModel): + __root__: float = Field(..., example=3.14) + + +class DoubleTypeValue(IcebergBaseModel): + __root__: float = Field(..., example=123.456) + + +class DecimalTypeValue(IcebergBaseModel): + __root__: str = Field( + ..., + description="Decimal type values are serialized as strings. Decimals with a positive scale serialize as numeric plain text, while decimals with a negative scale use scientific notation and the exponent will be equal to the negated scale. For instance, a decimal with a positive scale is '123.4500', with zero scale is '2', and with a negative scale is '2E+20'", + example="123.4500", + ) + + +class StringTypeValue(IcebergBaseModel): + __root__: str = Field(..., example="hello") + + +class UUIDTypeValue(IcebergBaseModel): + __root__: UUID = Field( + ..., + description="UUID type values are serialized as a 36-character lowercase string in standard UUID format as specified by RFC-4122", + example="eb26bdb1-a1d8-4aa6-990e-da940875492c", + max_length=36, + min_length=36, + regex="^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", + ) + + +class DateTypeValue(IcebergBaseModel): + __root__: date = Field( + ..., + description="Date type values follow the 'YYYY-MM-DD' ISO-8601 standard date format", + example="2007-12-03", + ) + + +class TimeTypeValue(IcebergBaseModel): + __root__: str = Field( + ..., + description="Time type values follow the 'HH:MM:SS.ssssss' ISO-8601 format with microsecond precision", + example="22:31:08.123456", + ) + + +class TimestampTypeValue(IcebergBaseModel): + __root__: str = Field( + ..., + description="Timestamp type values follow the 'YYYY-MM-DDTHH:MM:SS.ssssss' 
ISO-8601 format with microsecond precision", + example="2007-12-03T10:15:30.123456", + ) + + +class TimestampTzTypeValue(IcebergBaseModel): + __root__: str = Field( + ..., + description="TimestampTz type values follow the 'YYYY-MM-DDTHH:MM:SS.ssssss+00:00' ISO-8601 format with microsecond precision, and a timezone offset (+00:00 for UTC)", + example="2007-12-03T10:15:30.123456+00:00", + ) + + +class TimestampNanoTypeValue(IcebergBaseModel): + __root__: str = Field( + ..., + description="Timestamp_ns type values follow the 'YYYY-MM-DDTHH:MM:SS.sssssssss' ISO-8601 format with nanosecond precision", + example="2007-12-03T10:15:30.123456789", + ) + + +class TimestampTzNanoTypeValue(IcebergBaseModel): + __root__: str = Field( + ..., + description="Timestamp_ns type values follow the 'YYYY-MM-DDTHH:MM:SS.sssssssss+00:00' ISO-8601 format with nanosecond precision, and a timezone offset (+00:00 for UTC)", + example="2007-12-03T10:15:30.123456789+00:00", + ) + + +class FixedTypeValue(IcebergBaseModel): + __root__: str = Field( + ..., + description="Fixed length type values are stored and serialized as an uppercase hexadecimal string preserving the fixed length", + example="78797A", + ) + + +class BinaryTypeValue(IcebergBaseModel): + __root__: str = Field( + ..., + description="Binary type values are stored and serialized as an uppercase hexadecimal string", + example="78797A", + ) + + +class CountMap(IcebergBaseModel): + keys: Optional[List[IntegerTypeValue]] = Field(None, description="List of integer column ids for each corresponding value") + values: Optional[List[LongTypeValue]] = Field(None, description="List of Long values, matched to 'keys' by index") + + +class PrimitiveTypeValue(IcebergBaseModel): + __root__: Union[ + BooleanTypeValue, + IntegerTypeValue, + LongTypeValue, + FloatTypeValue, + DoubleTypeValue, + DecimalTypeValue, + StringTypeValue, + UUIDTypeValue, + DateTypeValue, + TimeTypeValue, + TimestampTypeValue, + TimestampTzTypeValue, + 
TimestampNanoTypeValue, + TimestampTzNanoTypeValue, + FixedTypeValue, + BinaryTypeValue, + ] + + +class ValueMap(IcebergBaseModel): + keys: Optional[List[IntegerTypeValue]] = Field(None, description="List of integer column ids for each corresponding value") + values: Optional[List[PrimitiveTypeValue]] = Field( + None, description="List of primitive type values, matched to 'keys' by index" + ) + + +class PlanTableScanRequest(IcebergBaseModel): + snapshot_id: Optional[int] = Field( + None, + alias="snapshot-id", + description="Identifier for the snapshot to scan in a point-in-time scan", + ) + select: Optional[List[FieldName]] = Field(None, description="List of selected schema fields") + filter: Optional[Expression] = Field(None, description="Expression used to filter the table data") + case_sensitive: Optional[bool] = Field( + True, + alias="case-sensitive", + description="Enables case sensitive field matching for filter and select", + ) + use_snapshot_schema: Optional[bool] = Field( + False, + alias="use-snapshot-schema", + description="Whether to use the schema at the time the snapshot was written.\nWhen time travelling, the snapshot schema should be used (true). 
When scanning a branch, the table schema should be used (false).", + ) + start_snapshot_id: Optional[int] = Field( + None, + alias="start-snapshot-id", + description="Starting snapshot ID for an incremental scan (exclusive)", + ) + end_snapshot_id: Optional[int] = Field( + None, + alias="end-snapshot-id", + description="Ending snapshot ID for an incremental scan (inclusive).\nRequired when start-snapshot-id is specified.", + ) + stats_fields: Optional[List[FieldName]] = Field( + None, + alias="stats-fields", + description="List of fields for which the service should send column stats.", + ) + + +class ContentFile(IcebergBaseModel): + content: str + file_path: str = Field(..., alias="file-path") + file_format: Literal["avro", "orc", "parquet", "puffin"] = Field(..., alias="file-format") + spec_id: int = Field(..., alias="spec-id") + partition: List[PrimitiveTypeValue] = Field( + ..., + description="A list of partition field values ordered based on the fields of the partition spec specified by the `spec-id`", + example=[1, "bar"], + ) + file_size_in_bytes: int = Field(..., alias="file-size-in-bytes", description="Total file size in bytes") + record_count: int = Field(..., alias="record-count", description="Number of records in the file") + key_metadata: Optional[BinaryTypeValue] = Field(None, alias="key-metadata", description="Encryption key metadata blob") + split_offsets: Optional[List[int]] = Field(None, alias="split-offsets", description="List of splittable offsets") + sort_order_id: Optional[int] = Field(None, alias="sort-order-id") + + +class PositionDeleteFile(ContentFile): + content: Literal["position-deletes"] = Field(..., const=True) + content_offset: Optional[int] = Field( + None, + alias="content-offset", + description="Offset within the delete file of delete content", + ) + content_size_in_bytes: Optional[int] = Field( + None, + alias="content-size-in-bytes", + description="Length, in bytes, of the delete content; required if content-offset is present", 
+ ) + + +class EqualityDeleteFile(ContentFile): + content: Literal["equality-deletes"] = Field(..., const=True) + equality_ids: Optional[List[int]] = Field(None, alias="equality-ids", description="List of equality field IDs") + + +class DeleteFile(IcebergBaseModel): + __root__: Union[PositionDeleteFile, EqualityDeleteFile] = Field(..., discriminator="content") + + +class DataFile(ContentFile): + content: str = Field(..., const=True) + first_row_id: Optional[int] = Field( + None, + alias="first-row-id", + description="The first row ID assigned to the first row in the data file", + ) + column_sizes: Optional[CountMap] = Field( + None, + alias="column-sizes", + description="Map of column id to total count, including null and NaN", + ) + value_counts: Optional[CountMap] = Field(None, alias="value-counts", description="Map of column id to null value count") + null_value_counts: Optional[CountMap] = Field( + None, + alias="null-value-counts", + description="Map of column id to null value count", + ) + nan_value_counts: Optional[CountMap] = Field( + None, + alias="nan-value-counts", + description="Map of column id to number of NaN values in the column", + ) + lower_bounds: Optional[ValueMap] = Field( + None, + alias="lower-bounds", + description="Map of column id to lower bound primitive type values", + ) + upper_bounds: Optional[ValueMap] = Field( + None, + alias="upper-bounds", + description="Map of column id to upper bound primitive type values", + ) + + +class FileScanTask(IcebergBaseModel): + data_file: DataFile = Field(..., alias="data-file") + delete_file_references: Optional[List[int]] = Field( + None, + alias="delete-file-references", + description="A list of indices in the delete files array (0-based)", + ) + residual_filter: Optional[Expression] = Field( + None, + alias="residual-filter", + description="An optional filter to be applied to rows in this file scan task.\nIf the residual is not present, the client must produce the residual or use the original 
filter.", + ) + + +class PlanTask(IcebergBaseModel): + __root__: str = Field( + ..., + description="An opaque string provided by the REST server that represents a unit of work to produce file scan tasks for scan planning. This allows clients to fetch tasks across multiple requests to accommodate large result sets.", + ) + + +class ScanTasks(IcebergBaseModel): + """ + Scan and planning tasks for server-side scan planning. + + - `plan-tasks` contains opaque units of planning work + - `file-scan-tasks` contains a partial or complete list of table scan tasks + - `delete-files` contains delete files referenced by file scan tasks + + Each plan task must be passed to the fetchScanTasks endpoint to fetch the file scan tasks for the plan task. + + The list of delete files must contain all delete files referenced by the file scan tasks. + + """ + + delete_files: Optional[List[DeleteFile]] = Field( + None, + alias="delete-files", + description="Delete files referenced by file scan tasks", + ) + file_scan_tasks: Optional[List[FileScanTask]] = Field(None, alias="file-scan-tasks") + plan_tasks: Optional[List[PlanTask]] = Field(None, alias="plan-tasks") From 87cb3c4ab7c571cc9ba37e573c71170eb10e8715 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Tue, 9 Sep 2025 12:37:03 -0700 Subject: [PATCH 2/8] add license headers --- pyiceberg/catalog/rest/expression.py | 17 +++++++++++++++++ pyiceberg/catalog/rest/planning_models.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/pyiceberg/catalog/rest/expression.py b/pyiceberg/catalog/rest/expression.py index 60792241ed..6ff63627b8 100644 --- a/pyiceberg/catalog/rest/expression.py +++ b/pyiceberg/catalog/rest/expression.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from typing import Any, Dict, List, Union from pydantic import BaseModel, Field diff --git a/pyiceberg/catalog/rest/planning_models.py b/pyiceberg/catalog/rest/planning_models.py index 02d038c511..1826dc1c33 100644 --- a/pyiceberg/catalog/rest/planning_models.py +++ b/pyiceberg/catalog/rest/planning_models.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ from datetime import date from typing import List, Literal, Optional, Union from uuid import UUID From f780d01f6346596f3d082317727760cb29625550 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Thu, 11 Sep 2025 12:12:32 -0700 Subject: [PATCH 3/8] Consolidate some models --- pyiceberg/catalog/rest/expression.py | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/pyiceberg/catalog/rest/expression.py b/pyiceberg/catalog/rest/expression.py index 6ff63627b8..6112ed6b8c 100644 --- a/pyiceberg/catalog/rest/expression.py +++ b/pyiceberg/catalog/rest/expression.py @@ -17,12 +17,14 @@ from typing import Any, Dict, List, Union -from pydantic import BaseModel, Field +from pydantic import Field +from pyiceberg.expressions import Reference +from pyiceberg.transforms import Transform from pyiceberg.typedef import IcebergBaseModel -class ExpressionType(BaseModel): +class ExpressionType(IcebergBaseModel): __root__: str = Field( ..., example=[ @@ -57,32 +59,13 @@ class FalseExpression(IcebergBaseModel): type: ExpressionType = Field(default_factory=lambda: ExpressionType.parse_obj("false"), const=True) -class Transform(IcebergBaseModel): - __root__: str = Field( - ..., - example=[ - "identity", - "year", - "month", - "day", - "hour", - "bucket[256]", - "truncate[16]", - ], - ) - - -class Reference(IcebergBaseModel): - __root__: str = Field(..., example=["column-name"]) - - class TransformTerm(IcebergBaseModel): type: str = Field("transform", const=True) transform: Transform term: Reference -class Term(BaseModel): +class Term(IcebergBaseModel): __root__: Union[Reference, TransformTerm] From 33560dd7bd9b828a9dff4d93c2eac1fe16621348 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Thu, 11 Sep 2025 12:33:00 -0700 Subject: [PATCH 4/8] Add tests and Pydantic changes --- pyiceberg/catalog/rest/expression.py | 83 +++++++----- pyiceberg/catalog/rest/planning_models.py | 134 +++++++++++-------- tests/catalog/test_rest_serializers.py | 151 
++++++++++++++++++++++ 3 files changed, 278 insertions(+), 90 deletions(-) create mode 100644 tests/catalog/test_rest_serializers.py diff --git a/pyiceberg/catalog/rest/expression.py b/pyiceberg/catalog/rest/expression.py index 6112ed6b8c..4bb75f90b2 100644 --- a/pyiceberg/catalog/rest/expression.py +++ b/pyiceberg/catalog/rest/expression.py @@ -15,58 +15,63 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Literal, Union from pydantic import Field -from pyiceberg.expressions import Reference from pyiceberg.transforms import Transform -from pyiceberg.typedef import IcebergBaseModel +from pyiceberg.typedef import IcebergBaseModel, IcebergRootModel -class ExpressionType(IcebergBaseModel): - __root__: str = Field( +class Reference(IcebergRootModel[str]): + root: str = Field(..., json_schema_extra={"example": "column-name"}) + + +class ExpressionType(IcebergRootModel[str]): + root: str = Field( ..., - example=[ - "true", - "false", - "eq", - "and", - "or", - "not", - "in", - "not-in", - "lt", - "lt-eq", - "gt", - "gt-eq", - "not-eq", - "starts-with", - "not-starts-with", - "is-null", - "not-null", - "is-nan", - "not-nan", - ], + json_schema_extra={ + "example": [ + "true", + "false", + "eq", + "and", + "or", + "not", + "in", + "not-in", + "lt", + "lt-eq", + "gt", + "gt-eq", + "not-eq", + "starts-with", + "not-starts-with", + "is-null", + "not-null", + "is-nan", + "not-nan", + ] + }, ) class TrueExpression(IcebergBaseModel): - type: ExpressionType = Field(default_factory=lambda: ExpressionType.parse_obj("true"), const=True) + type: Literal["true"] = "true" class FalseExpression(IcebergBaseModel): - type: ExpressionType = Field(default_factory=lambda: ExpressionType.parse_obj("false"), const=True) + type: Literal["false"] = "false" class TransformTerm(IcebergBaseModel): - type: str = Field("transform", const=True) + type: Literal["transform"] = "transform" 
transform: Transform term: Reference -class Term(IcebergBaseModel): - __root__: Union[Reference, TransformTerm] +class Term(IcebergRootModel[Union[Reference, TransformTerm]]): + root: Union[Reference, TransformTerm] class AndOrExpression(IcebergBaseModel): @@ -76,7 +81,7 @@ class AndOrExpression(IcebergBaseModel): class NotExpression(IcebergBaseModel): - type: ExpressionType = Field(default_factory=lambda: ExpressionType.parse_obj("not"), const=True) + type: Literal["not"] = "not" child: "Expression" @@ -98,8 +103,18 @@ class UnaryExpression(IcebergBaseModel): value: Dict[str, Any] -class Expression(IcebergBaseModel): - __root__: Union[ +class Expression(IcebergRootModel[ + Union[ + TrueExpression, + FalseExpression, + AndOrExpression, + NotExpression, + SetExpression, + LiteralExpression, + UnaryExpression, + ] +]): + root: Union[ TrueExpression, FalseExpression, AndOrExpression, diff --git a/pyiceberg/catalog/rest/planning_models.py b/pyiceberg/catalog/rest/planning_models.py index 1826dc1c33..529ef0f5e5 100644 --- a/pyiceberg/catalog/rest/planning_models.py +++ b/pyiceberg/catalog/rest/planning_models.py @@ -22,120 +22,121 @@ from pydantic import Field from pyiceberg.catalog.rest.expression import Expression -from pyiceberg.typedef import IcebergBaseModel +from pyiceberg.typedef import IcebergBaseModel, IcebergRootModel -class FieldName(IcebergBaseModel): - __root__: str = Field( +class FieldName(IcebergRootModel[str]): + root: str = Field( ..., description="A full field name (including parent field names), such as those passed in APIs like Java `Schema#findField(String name)`.\nThe nested field name follows these rules - Nested struct fields are named by concatenating field names at each struct level using dot (`.`) delimiter, e.g. employer.contact_info.address.zip_code - Nested fields in a map key are named using the keyword `key`, e.g. employee_address_map.key.first_name - Nested fields in a map value are named using the keyword `value`, e.g. 
employee_address_map.value.zip_code - Nested fields in a list are named using the keyword `element`, e.g. employees.element.first_name", ) -class BooleanTypeValue(IcebergBaseModel): - __root__: bool = Field(..., example=True) +class BooleanTypeValue(IcebergRootModel[bool]): + root: bool = Field(..., json_schema_extra={"example": True}) -class IntegerTypeValue(IcebergBaseModel): - __root__: int = Field(..., example=42) +class IntegerTypeValue(IcebergRootModel[int]): + root: int = Field(..., json_schema_extra={"example": 42}) -class LongTypeValue(IcebergBaseModel): - __root__: int = Field(..., example=9223372036854775807) +class LongTypeValue(IcebergRootModel[int]): + root: int = Field(..., json_schema_extra={"example": 9223372036854775807}) -class FloatTypeValue(IcebergBaseModel): - __root__: float = Field(..., example=3.14) +class FloatTypeValue(IcebergRootModel[float]): + root: float = Field(..., json_schema_extra={"example": 3.14}) -class DoubleTypeValue(IcebergBaseModel): - __root__: float = Field(..., example=123.456) +class DoubleTypeValue(IcebergRootModel[float]): + root: float = Field(..., json_schema_extra={"example": 123.456}) -class DecimalTypeValue(IcebergBaseModel): - __root__: str = Field( + +class DecimalTypeValue(IcebergRootModel[str]): + root: str = Field( ..., description="Decimal type values are serialized as strings. Decimals with a positive scale serialize as numeric plain text, while decimals with a negative scale use scientific notation and the exponent will be equal to the negated scale. 
For instance, a decimal with a positive scale is '123.4500', with zero scale is '2', and with a negative scale is '2E+20'", - example="123.4500", + json_schema_extra={"example": "123.4500"}, ) -class StringTypeValue(IcebergBaseModel): - __root__: str = Field(..., example="hello") +class StringTypeValue(IcebergRootModel[str]): + root: str = Field(..., json_schema_extra={"example": "hello"}) -class UUIDTypeValue(IcebergBaseModel): - __root__: UUID = Field( +class UUIDTypeValue(IcebergRootModel[UUID]): + root: UUID = Field( ..., description="UUID type values are serialized as a 36-character lowercase string in standard UUID format as specified by RFC-4122", - example="eb26bdb1-a1d8-4aa6-990e-da940875492c", + json_schema_extra={"example": "eb26bdb1-a1d8-4aa6-990e-da940875492c"}, max_length=36, min_length=36, - regex="^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", + pattern="^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", ) -class DateTypeValue(IcebergBaseModel): - __root__: date = Field( +class DateTypeValue(IcebergRootModel[date]): + root: date = Field( ..., description="Date type values follow the 'YYYY-MM-DD' ISO-8601 standard date format", - example="2007-12-03", + json_schema_extra={"example": "2007-12-03"}, ) -class TimeTypeValue(IcebergBaseModel): - __root__: str = Field( +class TimeTypeValue(IcebergRootModel[str]): + root: str = Field( ..., description="Time type values follow the 'HH:MM:SS.ssssss' ISO-8601 format with microsecond precision", - example="22:31:08.123456", + json_schema_extra={"example": "22:31:08.123456"}, ) -class TimestampTypeValue(IcebergBaseModel): - __root__: str = Field( +class TimestampTypeValue(IcebergRootModel[str]): + root: str = Field( ..., description="Timestamp type values follow the 'YYYY-MM-DDTHH:MM:SS.ssssss' ISO-8601 format with microsecond precision", - example="2007-12-03T10:15:30.123456", + json_schema_extra={"example": "2007-12-03T10:15:30.123456"}, ) -class 
TimestampTzTypeValue(IcebergBaseModel): - __root__: str = Field( +class TimestampTzTypeValue(IcebergRootModel[str]): + root: str = Field( ..., description="TimestampTz type values follow the 'YYYY-MM-DDTHH:MM:SS.ssssss+00:00' ISO-8601 format with microsecond precision, and a timezone offset (+00:00 for UTC)", - example="2007-12-03T10:15:30.123456+00:00", + json_schema_extra={"example": "2007-12-03T10:15:30.123456+00:00"}, ) -class TimestampNanoTypeValue(IcebergBaseModel): - __root__: str = Field( +class TimestampNanoTypeValue(IcebergRootModel[str]): + root: str = Field( ..., description="Timestamp_ns type values follow the 'YYYY-MM-DDTHH:MM:SS.sssssssss' ISO-8601 format with nanosecond precision", - example="2007-12-03T10:15:30.123456789", + json_schema_extra={"example": "2007-12-03T10:15:30.123456789"}, ) -class TimestampTzNanoTypeValue(IcebergBaseModel): - __root__: str = Field( +class TimestampTzNanoTypeValue(IcebergRootModel[str]): + root: str = Field( ..., description="Timestamp_ns type values follow the 'YYYY-MM-DDTHH:MM:SS.sssssssss+00:00' ISO-8601 format with nanosecond precision, and a timezone offset (+00:00 for UTC)", - example="2007-12-03T10:15:30.123456789+00:00", + json_schema_extra={"example": "2007-12-03T10:15:30.123456789+00:00"}, ) -class FixedTypeValue(IcebergBaseModel): - __root__: str = Field( +class FixedTypeValue(IcebergRootModel[str]): + root: str = Field( ..., description="Fixed length type values are stored and serialized as an uppercase hexadecimal string preserving the fixed length", - example="78797A", + json_schema_extra={"example": "78797A"}, ) -class BinaryTypeValue(IcebergBaseModel): - __root__: str = Field( +class BinaryTypeValue(IcebergRootModel[str]): + root: str = Field( ..., description="Binary type values are stored and serialized as an uppercase hexadecimal string", - example="78797A", + json_schema_extra={"example": "78797A"}, ) @@ -144,8 +145,29 @@ class CountMap(IcebergBaseModel): values: Optional[List[LongTypeValue]] = 
Field(None, description="List of Long values, matched to 'keys' by index") -class PrimitiveTypeValue(IcebergBaseModel): - __root__: Union[ +class PrimitiveTypeValue( + IcebergRootModel[ + Union[ + BooleanTypeValue, + IntegerTypeValue, + LongTypeValue, + FloatTypeValue, + DoubleTypeValue, + DecimalTypeValue, + StringTypeValue, + UUIDTypeValue, + DateTypeValue, + TimeTypeValue, + TimestampTypeValue, + TimestampTzTypeValue, + TimestampNanoTypeValue, + TimestampTzNanoTypeValue, + FixedTypeValue, + BinaryTypeValue, + ] + ] +): + root: Union[ BooleanTypeValue, IntegerTypeValue, LongTypeValue, @@ -215,7 +237,7 @@ class ContentFile(IcebergBaseModel): partition: List[PrimitiveTypeValue] = Field( ..., description="A list of partition field values ordered based on the fields of the partition spec specified by the `spec-id`", - example=[1, "bar"], + json_schema_extra={"example": [1, "bar"]}, ) file_size_in_bytes: int = Field(..., alias="file-size-in-bytes", description="Total file size in bytes") record_count: int = Field(..., alias="record-count", description="Number of records in the file") @@ -225,7 +247,7 @@ class ContentFile(IcebergBaseModel): class PositionDeleteFile(ContentFile): - content: Literal["position-deletes"] = Field(..., const=True) + content: Literal["position-deletes"] = "position-deletes" content_offset: Optional[int] = Field( None, alias="content-offset", @@ -239,16 +261,16 @@ class PositionDeleteFile(ContentFile): class EqualityDeleteFile(ContentFile): - content: Literal["equality-deletes"] = Field(..., const=True) + content: Literal["equality-deletes"] = "equality-deletes" equality_ids: Optional[List[int]] = Field(None, alias="equality-ids", description="List of equality field IDs") -class DeleteFile(IcebergBaseModel): - __root__: Union[PositionDeleteFile, EqualityDeleteFile] = Field(..., discriminator="content") +class DeleteFile(IcebergRootModel[Union[PositionDeleteFile, EqualityDeleteFile]]): + root: Union[PositionDeleteFile, EqualityDeleteFile] = 
Field(..., discriminator="content") class DataFile(ContentFile): - content: str = Field(..., const=True) + content: Literal["data"] = "data" first_row_id: Optional[int] = Field( None, alias="first-row-id", @@ -296,8 +318,8 @@ class FileScanTask(IcebergBaseModel): ) -class PlanTask(IcebergBaseModel): - __root__: str = Field( +class PlanTask(IcebergRootModel[str]): + root: str = Field( ..., description="An opaque string provided by the REST server that represents a unit of work to produce file scan tasks for scan planning. This allows clients to fetch tasks across multiple requests to accommodate large result sets.", ) diff --git a/tests/catalog/test_rest_serializers.py b/tests/catalog/test_rest_serializers.py new file mode 100644 index 0000000000..abae980ed7 --- /dev/null +++ b/tests/catalog/test_rest_serializers.py @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
# NOTE(review): this region is a whitespace-mangled `git format-patch` chunk for
# tests/catalog/test_rest_serializers.py.  Reconstructed below in its FINAL form,
# i.e. with the later lint (line-wrapping), typing (`-> None`) and comment
# patches from the same series already applied.
from pyiceberg.catalog.rest.expression import (
    AndOrExpression,
    Expression,
    LiteralExpression,
    Term,
)
from pyiceberg.catalog.rest.planning_models import (
    DataFile,
    DeleteFile,
    EqualityDeleteFile,
    FileScanTask,
    PlanTableScanRequest,
    PositionDeleteFile,
    ScanTasks,
)


def _sample_filter() -> Expression:
    """Return the filter ``((a < 1 AND b <= 2) OR c = 3) AND d > 4`` shared by the request tests."""
    return Expression(
        root=AndOrExpression(
            type="and",
            left=Expression(
                root=AndOrExpression(
                    type="or",
                    left=Expression(
                        root=AndOrExpression(
                            type="and",
                            left=Expression(
                                root=LiteralExpression(type="lt", term=Term(root="a"), value={"type": "integer", "value": 1})
                            ),
                            right=Expression(
                                root=LiteralExpression(type="lt-eq", term=Term(root="b"), value={"type": "integer", "value": 2})
                            ),
                        )
                    ),
                    right=Expression(
                        root=LiteralExpression(type="eq", term=Term(root="c"), value={"type": "integer", "value": 3})
                    ),
                )
            ),
            right=Expression(root=LiteralExpression(type="gt", term=Term(root="d"), value={"type": "integer", "value": 4})),
        )
    )


def test_serialize_plan_table_scan_request() -> None:
    """Test serializing a PlanTableScanRequest to JSON."""
    request = PlanTableScanRequest(
        snapshot_id=1,
        select=["a", "b", "c"],
        filter=_sample_filter(),
        case_sensitive=True,
    )
    # Assert that JSON matches.
    assert request.model_dump_json(exclude_none=True) == snapshot_json_for_plan_table_scan_request()


def test_deserialize_plan_table_scan_request() -> None:
    """Test deserializing JSON to a PlanTableScanRequest."""
    model = PlanTableScanRequest.model_validate_json(snapshot_json_for_plan_table_scan_request())
    expected = PlanTableScanRequest(
        snapshot_id=1,
        select=["a", "b", "c"],
        filter=_sample_filter(),
        case_sensitive=True,
    )

    # Assert that deserialized JSON == Python object
    assert model == expected


def test_deserialize_scan_tasks() -> None:
    """Test deserializing JSON to ScanTasks."""
    scan_tasks = ScanTasks.model_validate_json(snapshot_json_for_scan_tasks())

    # Assert JSON fields match expected.
    assert len(scan_tasks.file_scan_tasks) == 1
    assert len(scan_tasks.delete_files) == 2
    assert scan_tasks.file_scan_tasks[0].data_file.file_path == "/path/to/data-a.parquet"
    assert scan_tasks.delete_files[0].root.file_path == "/path/to/delete-a.parquet"
    assert scan_tasks.delete_files[1].root.file_path == "/path/to/delete-b.parquet"


def test_serialize_scan_tasks() -> None:
    """Test serializing a ScanTasks to JSON."""
    scan_tasks = ScanTasks(
        file_scan_tasks=[
            FileScanTask(
                data_file=DataFile(
                    content="data",
                    file_path="/path/to/data-a.parquet",
                    file_format="parquet",
                    partition=[],
                    record_count=56,
                    file_size_in_bytes=1024,
                    spec_id=0,
                ),
                delete_file_references=[0, 1],
            )
        ],
        delete_files=[
            DeleteFile(
                root=PositionDeleteFile(
                    content="position-deletes",
                    file_path="/path/to/delete-a.parquet",
                    file_format="parquet",
                    partition=[],
                    record_count=10,
                    file_size_in_bytes=256,
                    spec_id=0,
                )
            ),
            DeleteFile(
                root=EqualityDeleteFile(
                    content="equality-deletes",
                    file_path="/path/to/delete-b.parquet",
                    file_format="parquet",
                    partition=[],
                    record_count=10,
                    file_size_in_bytes=256,
                    spec_id=0,
                    equality_ids=[1, 2],
                )
            ),
        ],
    )

    # Assert that JSON matches.
    assert scan_tasks.model_dump_json(exclude_none=True) == snapshot_json_for_scan_tasks()


def snapshot_json_for_plan_table_scan_request() -> str:
    """Golden JSON for the PlanTableScanRequest round-trip tests."""
    return """{"snapshot-id":1,"select":["a","b","c"],"filter":{"type":"and","left":{"type":"or","left":{"type":"and","left":{"type":"lt","term":"a","value":{"type":"integer","value":1}},"right":{"type":"lt-eq","term":"b","value":{"type":"integer","value":2}}},"right":{"type":"eq","term":"c","value":{"type":"integer","value":3}}},"right":{"type":"gt","term":"d","value":{"type":"integer","value":4}}},"case-sensitive":true,"use-snapshot-schema":false}"""


def snapshot_json_for_scan_tasks() -> str:
    """Golden JSON for the ScanTasks round-trip tests."""
    return """{"delete-files":[{"content":"position-deletes","file-path":"/path/to/delete-a.parquet","file-format":"parquet","spec-id":0,"partition":[],"file-size-in-bytes":256,"record-count":10},{"content":"equality-deletes","file-path":"/path/to/delete-b.parquet","file-format":"parquet","spec-id":0,"partition":[],"file-size-in-bytes":256,"record-count":10,"equality-ids":[1,2]}],"file-scan-tasks":[{"data-file":{"content":"data","file-path":"/path/to/data-a.parquet","file-format":"parquet","spec-id":0,"partition":[],"file-size-in-bytes":1024,"record-count":56},"delete-file-references":[0,1]}]}"""
# NOTE(review): whitespace-mangled patch chunk for pyiceberg/catalog/rest/expression.py
# (plus superseded interim diffs of the test module, whose final form lives above).
# Reconstructed final form of the expression models after the last patch in the
# series, in which the string-typed ExpressionType was replaced by Literal unions.
from typing import Any, Dict, List, Literal, Union

from pydantic import Field

# NOTE(review): the Transform import is not visible in this chunk; `Transform[Any, Any]`
# matches pyiceberg.transforms.Transform — confirm against the real module header.
from pyiceberg.transforms import Transform
from pyiceberg.typedef import IcebergBaseModel, IcebergRootModel


class Reference(IcebergRootModel[str]):
    """A plain column reference, e.g. ``"column-name"``."""

    root: str = Field(..., json_schema_extra={"example": "column-name"})


class TrueExpression(IcebergBaseModel):
    """The constant ``true`` predicate."""

    type: Literal["true"] = "true"


class FalseExpression(IcebergBaseModel):
    """The constant ``false`` predicate."""

    # NOTE(review): body not visible in this chunk; mirrored from TrueExpression
    # after ExpressionType was removed — confirm against the module.
    type: Literal["false"] = "false"


class TransformTerm(IcebergBaseModel):
    """A term that applies a partition transform to a referenced column."""

    type: Literal["transform"] = "transform"
    transform: Transform[Any, Any]
    term: Reference


class Term(IcebergRootModel[Union[Reference, TransformTerm]]):
    """Either a plain column reference or a transform applied to one."""

    root: Union[Reference, TransformTerm]


class AndOrExpression(IcebergBaseModel):
    """Binary boolean combination (``and`` / ``or``) of two sub-expressions."""

    type: Literal["and", "or"]
    left: "Expression"
    right: "Expression"


class NotExpression(IcebergBaseModel):
    """Negation of a sub-expression."""

    # NOTE(review): the field list is not visible in this chunk; `child` follows
    # the Iceberg REST OpenAPI NotExpression schema — confirm against the module.
    type: Literal["not"]
    child: "Expression"


class SetExpression(IcebergBaseModel):
    """Membership predicate (``in`` / ``not-in``) of a term against literal values."""

    type: Literal["in", "not-in"]
    term: Term
    values: List[Dict[str, Any]]


class LiteralExpression(IcebergBaseModel):
    """Comparison of a term against a single literal value."""

    type: Literal["lt", "lt-eq", "gt", "gt-eq", "eq", "not-eq", "starts-with", "not-starts-with"]
    term: Term
    value: Dict[str, Any]


class UnaryExpression(IcebergBaseModel):
    """Null / NaN check on a term."""

    type: Literal["is-null", "not-null", "is-nan", "not-nan"]
    term: Term
    value: Dict[str, Any]


class Expression(
    IcebergRootModel[
        Union[
            TrueExpression,
            FalseExpression,
            AndOrExpression,
            NotExpression,
            SetExpression,
            LiteralExpression,
            UnaryExpression,
        ]
    ]
):
    """Root model wrapping any one of the concrete expression variants."""

    root: Union[
        TrueExpression,
        FalseExpression,
        AndOrExpression,
        NotExpression,
        SetExpression,
        LiteralExpression,
        UnaryExpression,
    ]
# NOTE(review): whitespace-mangled patch chunk for the final patch in the series:
# planning-result models for pyiceberg/catalog/rest/planning_models.py plus their
# round-trip tests for tests/catalog/test_rest_serializers.py.  Reconstructed below.

# --- pyiceberg/catalog/rest/planning_models.py (additions) -------------------
from typing import Literal, Optional, Union

from pydantic import Field

from pyiceberg.catalog.rest.response import ErrorResponse as IcebergErrorResponse
from pyiceberg.typedef import IcebergBaseModel, IcebergRootModel


class FailedPlanningResult(IcebergErrorResponse):
    """Failed server-side planning result."""

    status: Literal["failed"]


class AsyncPlanningResult(IcebergBaseModel):
    """Asynchronous (submitted) planning result, tracked via its plan ID."""

    status: Literal["submitted"]
    plan_id: str = Field(..., alias="plan-id", description="ID used to track a planning request")


class CancelledPlanningResult(IcebergBaseModel):
    """A cancelled planning result."""

    status: Literal["cancelled"]


class CompletedPlanningWithIDResult(ScanTasks):
    """Completed server-side planning result."""

    status: Literal["completed"]
    plan_id: Optional[str] = Field(None, alias="plan-id", description="ID used to track a planning request")


class PlanTableScanResult(
    IcebergRootModel[Union[CompletedPlanningWithIDResult, FailedPlanningResult, AsyncPlanningResult, CancelledPlanningResult]]
):
    """Result of server-side scan planning for planTableScan."""

    # `status` discriminates which variant the payload deserializes into.
    root: Union[CompletedPlanningWithIDResult, FailedPlanningResult, AsyncPlanningResult, CancelledPlanningResult] = Field(
        ..., discriminator="status"
    )


# --- tests/catalog/test_rest_serializers.py (additions) ----------------------
import json

from pyiceberg.catalog.rest.response import ErrorResponseMessage


def test_deserialize_async_planning_result() -> None:
    """Test deserializing JSON to an AsyncPlanningResult."""
    result = PlanTableScanResult.model_validate_json(snapshot_json_for_async_planning_result())
    expected = AsyncPlanningResult(status="submitted", plan_id="plan-123")
    # Assert that deserialized JSON == Python object
    assert result.root == expected


def test_serialize_async_planning_result() -> None:
    """Test serializing an AsyncPlanningResult to JSON."""
    result = PlanTableScanResult(root=AsyncPlanningResult(status="submitted", plan_id="plan-123"))
    # Assert that JSON matches
    assert json.loads(result.model_dump_json(by_alias=True)) == json.loads(snapshot_json_for_async_planning_result())


def snapshot_json_for_async_planning_result() -> str:
    """Golden JSON for the AsyncPlanningResult round-trip tests."""
    return """{"status":"submitted","plan-id":"plan-123"}"""


def test_deserialize_failed_planning_result() -> None:
    """Test deserializing JSON to a FailedPlanningResult."""
    result = PlanTableScanResult.model_validate_json(snapshot_json_for_failed_planning_result())
    expected = FailedPlanningResult(
        status="failed",
        error=ErrorResponseMessage(
            message="The plan is invalid",
            type="NoSuchPlanException",
            code=404,
        ),
    )
    # Assert that deserialized JSON == Python object
    assert result.root == expected


def test_serialize_failed_planning_result() -> None:
    """Test serializing a FailedPlanningResult to JSON."""
    result = PlanTableScanResult(
        root=FailedPlanningResult(
            status="failed",
            error=ErrorResponseMessage(
                message="The plan is invalid",
                type="NoSuchPlanException",
                code=404,
            ),
        )
    )
    # Assert that JSON matches
    assert json.loads(result.model_dump_json(by_alias=True, exclude_none=True)) == json.loads(
        snapshot_json_for_failed_planning_result()
    )


def snapshot_json_for_failed_planning_result() -> str:
    """Golden JSON for the FailedPlanningResult round-trip tests."""
    return """{"status":"failed","error":{"message":"The plan is invalid","type":"NoSuchPlanException","code":404}}"""


def test_deserialize_cancelled_planning_result() -> None:
    """Test deserializing JSON to a CancelledPlanningResult."""
    result = PlanTableScanResult.model_validate_json(snapshot_json_for_cancelled_planning_result())
    expected = CancelledPlanningResult(status="cancelled")
    # Assert that deserialized JSON == Python object
    assert result.root == expected


def test_serialize_cancelled_planning_result() -> None:
    """Test serializing a CancelledPlanningResult to JSON."""
    result = PlanTableScanResult(root=CancelledPlanningResult(status="cancelled"))
    # Assert that JSON matches
    assert json.loads(result.model_dump_json(by_alias=True)) == json.loads(snapshot_json_for_cancelled_planning_result())


def snapshot_json_for_cancelled_planning_result() -> str:
    """Golden JSON for the CancelledPlanningResult round-trip tests."""
    return """{"status":"cancelled"}"""


def test_deserialize_completed_planning_with_id_result() -> None:
    """Test deserializing JSON to a CompletedPlanningWithIDResult."""
    scan_tasks_dict = json.loads(snapshot_json_for_scan_tasks())
    scan_tasks_dict["status"] = "completed"
    scan_tasks_dict["plan-id"] = "plan-456"
    json_str = json.dumps(scan_tasks_dict)

    result = PlanTableScanResult.model_validate_json(json_str)
    expected_scan_tasks = ScanTasks.model_validate_json(snapshot_json_for_scan_tasks())

    expected = CompletedPlanningWithIDResult(
        status="completed",
        plan_id="plan-456",
        file_scan_tasks=expected_scan_tasks.file_scan_tasks,
        delete_files=expected_scan_tasks.delete_files,
    )
    # Assert that deserialized JSON == Python object
    assert result.root == expected


def test_serialize_completed_planning_with_id_result() -> None:
    """Test serializing a CompletedPlanningWithIDResult to JSON."""
    expected_scan_tasks = ScanTasks.model_validate_json(snapshot_json_for_scan_tasks())
    result = PlanTableScanResult(
        root=CompletedPlanningWithIDResult(
            status="completed",
            plan_id="plan-456",
            file_scan_tasks=expected_scan_tasks.file_scan_tasks,
            delete_files=expected_scan_tasks.delete_files,
        )
    )

    scan_tasks_dict = json.loads(snapshot_json_for_scan_tasks())
    scan_tasks_dict["status"] = "completed"
    scan_tasks_dict["plan-id"] = "plan-456"

    # Assert that JSON matches
    assert json.loads(result.model_dump_json(exclude_none=True, by_alias=True)) == scan_tasks_dict