From bd728a62149c59d58cd20dc4f80dcf368aacd494 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Wed, 9 Jul 2025 15:09:15 -0700
Subject: [PATCH 1/6] feat: support INFORMATION_SCHEMA tables in read_gbq

---
 .../session/_io/bigquery/read_gbq_table.py    | 57 +++++++++++++++++--
 bigframes/session/loader.py                   | 11 +---
 .../test_read_gbq_information_schema.py       | 18 ++++++
 3 files changed, 72 insertions(+), 14 deletions(-)
 create mode 100644 tests/system/small/pandas/test_read_gbq_information_schema.py

diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py
index 6322040428..341e39061b 100644
--- a/bigframes/session/_io/bigquery/read_gbq_table.py
+++ b/bigframes/session/_io/bigquery/read_gbq_table.py
@@ -42,17 +42,53 @@ import bigframes.session
 
 
+def get_information_schema_metadata(
+    bqclient: bigquery.Client,
+    table_id: str,
+    default_project: Optional[str],
+) -> bigquery.Table:
+    job_config = bigquery.QueryJobConfig(dry_run=True)
+    job = bqclient.query(
+        # TODO: better escaping?
+        f"SELECT * FROM `{table_id}`",
+        job_config=job_config,
+    )
+    parts = table_id.split(".")
+    if len(parts) < 3:
+        project = default_project
+        dataset = parts[0]
+        table_id_short = ".".join(parts[1:])
+    else:
+        project = parts[0]
+        dataset = parts[1]
+        table_id_short = ".".join(parts[2:])
+
+    table = bigquery.Table.from_api_repr(
+        {
+            "tableReference": {
+                "projectId": project,
+                "datasetId": dataset,
+                "tableId": table_id_short,
+            },
+            "location": job.location,
+        }
+    )
+    table.schema = job.schema
+    return table
+
+
 def get_table_metadata(
     bqclient: bigquery.Client,
-    table_ref: google.cloud.bigquery.table.TableReference,
-    bq_time: datetime.datetime,
     *,
-    cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]],
+    table_id: str,
+    default_project: Optional[str],
+    bq_time: datetime.datetime,
+    cache: Dict[str, Tuple[datetime.datetime, bigquery.Table]],
     use_cache: bool = True,
 ) -> Tuple[datetime.datetime, google.cloud.bigquery.table.Table]:
     """Get the table metadata, either from cache or via REST API."""
 
-    cached_table = cache.get(table_ref)
+    cached_table = cache.get(table_id)
     if use_cache and cached_table is not None:
         snapshot_timestamp, _ = cached_table
@@ -76,7 +112,16 @@ def get_table_metadata(
             warnings.warn(msg, stacklevel=7)
         return cached_table
 
-    table = bqclient.get_table(table_ref)
+    if "INFORMATION_SCHEMA".casefold() in table_id.casefold():
+        table = get_information_schema_metadata(
+            bqclient=bqclient, table_id=table_id, default_project=default_project
+        )
+    else:
+        table_ref = google.cloud.bigquery.table.TableReference.from_string(
+            table_id, default_project=default_project
+        )
+        table = bqclient.get_table(table_ref)
+
     # local time will lag a little bit due to network latency
     # make sure it is at least table creation time.
     # This is relevant if the table was created immediately before loading it here.
@@ -84,7 +129,7 @@ def get_table_metadata(
         bq_time = table.created
 
     cached_table = (bq_time, table)
-    cache[table_ref] = cached_table
+    cache[table_id] = cached_table
     return cached_table
 
 
diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
index add4efb6ab..76e07af29b 100644
--- a/bigframes/session/loader.py
+++ b/bigframes/session/loader.py
@@ -268,9 +268,7 @@ def __init__(
         self._default_index_type = default_index_type
         self._scan_index_uniqueness = scan_index_uniqueness
         self._force_total_order = force_total_order
-        self._df_snapshot: Dict[
-            bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]
-        ] = {}
+        self._df_snapshot: Dict[str, Tuple[datetime.datetime, bigquery.Table]] = {}
         self._metrics = metrics
         # Unfortunate circular reference, but need to pass reference when constructing objects
         self._session = session
@@ -617,10 +615,6 @@ def read_gbq_table(
 
         _check_duplicates("columns", columns)
 
-        table_ref = google.cloud.bigquery.table.TableReference.from_string(
-            table_id, default_project=self._bqclient.project
-        )
-
         columns = list(columns)
         include_all_columns = columns is None or len(columns) == 0
         filters = typing.cast(list, list(filters))
@@ -631,7 +625,8 @@ def read_gbq_table(
 
         time_travel_timestamp, table = bf_read_gbq_table.get_table_metadata(
             self._bqclient,
-            table_ref=table_ref,
+            table_id=table_id,
+            default_project=self._bqclient.project,
             bq_time=self._clock.get_time(),
             cache=self._df_snapshot,
             use_cache=use_cache,
diff --git a/tests/system/small/pandas/test_read_gbq_information_schema.py b/tests/system/small/pandas/test_read_gbq_information_schema.py
new file mode 100644
index 0000000000..86d7ac0c24
--- /dev/null
+++ b/tests/system/small/pandas/test_read_gbq_information_schema.py
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_read_gbq_jobs_by_user_returns_schema(session):
+    df = session.read_gbq("region-US.INFORMATION_SCHEMA.JOBS_BY_USER")
+    assert df.dtypes is not None

From 3ac25fb9460bece221322d83cd4811b761161f8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Wed, 9 Jul 2025 15:54:41 -0700
Subject: [PATCH 2/6] avoid storage semi executor

---
 .../session/_io/bigquery/read_gbq_table.py    | 57 +++++++++++++------
 bigframes/session/read_api_execution.py       |  3 +
 .../test_read_gbq_information_schema.py       | 14 +++++
 3 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py
index 341e39061b..9620169070 100644
--- a/bigframes/session/_io/bigquery/read_gbq_table.py
+++ b/bigframes/session/_io/bigquery/read_gbq_table.py
@@ -42,6 +42,32 @@ import bigframes.session
 
 
+def _convert_information_schema_table_id_to_table_reference(
+    table_id: str,
+    default_project: Optional[str],
+) -> bigquery.TableReference:
+    """Squeeze an INFORMATION_SCHEMA reference into a TableReference.
+
+    This is kind of a hack. INFORMATION_SCHEMA is a view that isn't available
+    via the tables.get REST API.
+    """
+    parts = table_id.split(".")
+    parts_casefold = [part.casefold() for part in parts]
+    dataset_index = parts_casefold.index("INFORMATION_SCHEMA".casefold())
+
+    if dataset_index == 0:
+        project = default_project
+    else:
+        project = ".".join(parts[:dataset_index])
+
+    dataset = "INFORMATION_SCHEMA"
+    table_id_short = ".".join(parts[dataset_index + 1 :])
+    return bigquery.TableReference(
+        bigquery.DatasetReference(project, dataset),
+        table_id_short,
+    )
+
+
 def get_information_schema_metadata(
     bqclient: bigquery.Client,
     table_id: str,
@@ -53,24 +79,17 @@ def get_information_schema_metadata(
         f"SELECT * FROM `{table_id}`",
         job_config=job_config,
     )
-    parts = table_id.split(".")
-    if len(parts) < 3:
-        project = default_project
-        dataset = parts[0]
-        table_id_short = ".".join(parts[1:])
-    else:
-        project = parts[0]
-        dataset = parts[1]
-        table_id_short = ".".join(parts[2:])
-
+    table_ref = _convert_information_schema_table_id_to_table_reference(
+        table_id=table_id,
+        default_project=default_project,
+    )
     table = bigquery.Table.from_api_repr(
         {
-            "tableReference": {
-                "projectId": project,
-                "datasetId": dataset,
-                "tableId": table_id_short,
-            },
+            "tableReference": table_ref.to_api_repr(),
             "location": job.location,
+            # Prevent ourselves from trying to read the table with the BQ
+            # Storage API.
+            "type": "VIEW",
         }
     )
     table.schema = job.schema
@@ -112,7 +131,13 @@ def get_table_metadata(
             warnings.warn(msg, stacklevel=7)
         return cached_table
 
-    if "INFORMATION_SCHEMA".casefold() in table_id.casefold():
+    table_id_casefold = table_id.casefold()
+    if (
+        # Ensure we don't have false positives for some user-defined dataset
+        # like MY_INFORMATION_SCHEMA or tables called INFORMATION_SCHEMA.
+        ".INFORMATION_SCHEMA.".casefold() in table_id_casefold
+        or table_id_casefold.startswith("INFORMATION_SCHEMA.".casefold())
+    ):
         table = get_information_schema_metadata(
             bqclient=bqclient, table_id=table_id, default_project=default_project
         )
diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py
index d5bcf1dbc7..c5b472760b 100644
--- a/bigframes/session/read_api_execution.py
+++ b/bigframes/session/read_api_execution.py
@@ -46,6 +46,9 @@ def execute(
         if node.explicitly_ordered and ordered:
             return None
 
+        if not node.source.table.is_physically_stored:
+            return None
+
         if limit is not None:
             if peek is None or limit < peek:
                 peek = limit
diff --git a/tests/system/small/pandas/test_read_gbq_information_schema.py b/tests/system/small/pandas/test_read_gbq_information_schema.py
index 86d7ac0c24..a0cf70b859 100644
--- a/tests/system/small/pandas/test_read_gbq_information_schema.py
+++ b/tests/system/small/pandas/test_read_gbq_information_schema.py
@@ -16,3 +16,17 @@
 def test_read_gbq_jobs_by_user_returns_schema(session):
     df = session.read_gbq("region-US.INFORMATION_SCHEMA.JOBS_BY_USER")
     assert df.dtypes is not None
+
+
+def test_read_gbq_jobs_by_user_can_be_peeked(unordered_session):
+    df = unordered_session.read_gbq("region-US.INFORMATION_SCHEMA.JOBS_BY_USER")
+    result = df.peek()
+    assert result is not None
+
+
+def test_read_gbq_jobs_by_user_four_parts_can_be_peeked(unordered_session):
+    df = unordered_session.read_gbq(
+        f"{unordered_session.bqclient.project}.region-US.INFORMATION_SCHEMA.JOBS_BY_USER"
+    )
+    result = df.peek()
+    assert result is not None

From 885d786170a470a36b1326d31fcea4907820cd1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Wed, 9 Jul 2025 15:58:12 -0700
Subject: [PATCH 3/6] use faster tables for peek tests

---
 .../small/pandas/test_read_gbq_information_schema.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/system/small/pandas/test_read_gbq_information_schema.py b/tests/system/small/pandas/test_read_gbq_information_schema.py
index a0cf70b859..af4bc00b7d 100644
--- a/tests/system/small/pandas/test_read_gbq_information_schema.py
+++ b/tests/system/small/pandas/test_read_gbq_information_schema.py
@@ -18,15 +18,15 @@ def test_read_gbq_jobs_by_user_returns_schema(session):
     assert df.dtypes is not None
 
 
-def test_read_gbq_jobs_by_user_can_be_peeked(unordered_session):
-    df = unordered_session.read_gbq("region-US.INFORMATION_SCHEMA.JOBS_BY_USER")
+def test_read_gbq_schemata_can_be_peeked(unordered_session):
+    df = unordered_session.read_gbq("region-US.INFORMATION_SCHEMA.SCHEMATA")
     result = df.peek()
     assert result is not None
 
 
-def test_read_gbq_jobs_by_user_four_parts_can_be_peeked(unordered_session):
+def test_read_gbq_schemata_four_parts_can_be_peeked(unordered_session):
     df = unordered_session.read_gbq(
-        f"{unordered_session.bqclient.project}.region-US.INFORMATION_SCHEMA.JOBS_BY_USER"
+        f"{unordered_session.bqclient.project}.region-US.INFORMATION_SCHEMA.SCHEMATA"
     )
     result = df.peek()
     assert result is not None

From 34bd68564c7a3ef16765b1819ae087ff8203f73d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Wed, 9 Jul 2025 16:05:03 -0700
Subject: [PATCH 4/6] more tests

---
 .../test_read_gbq_information_schema.py      | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/tests/system/small/pandas/test_read_gbq_information_schema.py b/tests/system/small/pandas/test_read_gbq_information_schema.py
index af4bc00b7d..efe73f044d 100644
--- a/tests/system/small/pandas/test_read_gbq_information_schema.py
+++ b/tests/system/small/pandas/test_read_gbq_information_schema.py
@@ -12,9 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
 
-def test_read_gbq_jobs_by_user_returns_schema(session):
-    df = session.read_gbq("region-US.INFORMATION_SCHEMA.JOBS_BY_USER")
+
+@pytest.mark.parametrize("include_project", [True, False])
+@pytest.mark.parametrize(
+    "view_id",
+    [
+        # https://cloud.google.com/bigquery/docs/information-schema-intro
+        "region-US.INFORMATION_SCHEMA.JOBS_BY_USER",
+        "region-US.INFORMATION_SCHEMA.SCHEMATA",
+    ],
+)
+def test_read_gbq_jobs_by_user_returns_schema(
+    unordered_session, view_id: str, include_project: bool
+):
+    if include_project:
+        table_id = unordered_session.bqclient.project + "." + view_id
+    else:
+        table_id = view_id
+
+    df = unordered_session.read_gbq(table_id)
     assert df.dtypes is not None

From c0175efb8dc1b04d668c8ee9a5a9f176239861b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Thu, 10 Jul 2025 11:39:19 -0700
Subject: [PATCH 5/6] fix mypy

---
 bigframes/session/_io/bigquery/read_gbq_table.py | 8 ++++++++
 tests/unit/session/test_session.py               | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py
index 9620169070..7896dbba39 100644
--- a/bigframes/session/_io/bigquery/read_gbq_table.py
+++ b/bigframes/session/_io/bigquery/read_gbq_table.py
@@ -60,6 +60,14 @@ def _convert_information_schema_table_id_to_table_reference(
     else:
         project = ".".join(parts[:dataset_index])
 
+    if project is None:
+        message = (
+            "Could not determine project ID. "
+            "Please provide a project or region in your INFORMATION_SCHEMA table ID. "
+            "For example, 'region-REGION_NAME.INFORMATION_SCHEMA.JOBS'."
+        )
+        raise ValueError(message)
+
     dataset = "INFORMATION_SCHEMA"
     table_id_short = ".".join(parts[dataset_index + 1 :])
     return bigquery.TableReference(
diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py
index 26b74a3f8a..e63e53d560 100644
--- a/tests/unit/session/test_session.py
+++ b/tests/unit/session/test_session.py
@@ -242,7 +242,7 @@ def test_read_gbq_cached_table():
     table._properties["numRows"] = "1000000000"
     table._properties["location"] = session._location
     table._properties["type"] = "TABLE"
-    session._loader._df_snapshot[table_ref] = (
+    session._loader._df_snapshot[str(table_ref)] = (
         datetime.datetime(1999, 1, 2, 3, 4, 5, 678901, tzinfo=datetime.timezone.utc),
         table,
     )

From 7028108c8522c717050ce0d739cfb36ef057b8e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Thu, 10 Jul 2025 13:42:01 -0500
Subject: [PATCH 6/6] Update bigframes/session/_io/bigquery/read_gbq_table.py

---
 bigframes/session/_io/bigquery/read_gbq_table.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py
index 7896dbba39..292eab09b3 100644
--- a/bigframes/session/_io/bigquery/read_gbq_table.py
+++ b/bigframes/session/_io/bigquery/read_gbq_table.py
@@ -83,7 +83,6 @@ def get_information_schema_metadata(
 ) -> bigquery.Table:
     job_config = bigquery.QueryJobConfig(dry_run=True)
     job = bqclient.query(
-        # TODO: better escaping?
         f"SELECT * FROM `{table_id}`",
         job_config=job_config,
     )