44
55import collections .abc
66import datetime
7- from typing import Optional , Tuple
7+ from typing import Any , Optional , Tuple
88import warnings
99
1010import db_dtypes
2828# `docs/source/writing.rst`.
2929_PANDAS_DTYPE_TO_BQ = {
3030 "bool" : "BOOLEAN" ,
31+ "boolean" : "BOOLEAN" ,
3132 "datetime64[ns, UTC]" : "TIMESTAMP" ,
33+ "datetime64[us, UTC]" : "TIMESTAMP" ,
3234 "datetime64[ns]" : "DATETIME" ,
35+ "datetime64[us]" : "DATETIME" ,
3336 "float32" : "FLOAT" ,
3437 "float64" : "FLOAT" ,
3538 "int8" : "INTEGER" ,
3639 "int16" : "INTEGER" ,
3740 "int32" : "INTEGER" ,
3841 "int64" : "INTEGER" ,
42+ "Int8" : "INTEGER" ,
43+ "Int16" : "INTEGER" ,
44+ "Int32" : "INTEGER" ,
45+ "Int64" : "INTEGER" ,
3946 "uint8" : "INTEGER" ,
4047 "uint16" : "INTEGER" ,
4148 "uint32" : "INTEGER" ,
@@ -103,7 +110,7 @@ def dataframe_to_bigquery_fields(
103110
104111 # Try to automatically determine the type based on a few rows of the data.
105112 values = dataframe .reset_index ()[column ]
106- bq_field = values_to_bigquery_field (column , values )
113+ bq_field = values_to_bigquery_field (column , values , default_type = default_type )
107114
108115 if bq_field :
109116 bq_schema_out .append (bq_field )
@@ -114,7 +121,9 @@ def dataframe_to_bigquery_fields(
114121 arrow_value = pyarrow .array (values )
115122 bq_field = (
116123 pandas_gbq .schema .pyarrow_to_bigquery .arrow_type_to_bigquery_field (
117- column , arrow_value .type
124+ column ,
125+ arrow_value .type ,
126+ default_type = default_type ,
118127 )
119128 )
120129
@@ -151,6 +160,19 @@ def dataframe_to_bigquery_fields(
151160
152161
153162def dtype_to_bigquery_field (name , dtype ) -> Optional [schema .SchemaField ]:
163+ """Infers the BigQuery schema field type from a pandas dtype.
164+
165+ Args:
166+ name (str):
167+ Name of the column/field.
168+ dtype:
169+ A pandas / numpy dtype object.
170+
171+ Returns:
172+ Optional[schema.SchemaField]:
173+ The schema field, or None if a type cannot be inferred, such as if
174+ it is ambiguous like the object dtype.
175+ """
154176 bq_type = _PANDAS_DTYPE_TO_BQ .get (dtype .name )
155177
156178 if bq_type is not None :
@@ -164,9 +186,44 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
164186 return None
165187
166188
167- def value_to_bigquery_field (name , value ) -> Optional [schema .SchemaField ]:
168- if isinstance (value , str ):
169- return schema .SchemaField (name , "STRING" )
189+ def value_to_bigquery_field (
190+ name : str , value : Any , default_type : Optional [str ] = None
191+ ) -> Optional [schema .SchemaField ]:
192+ """Infers the BigQuery schema field type from a single value.
193+
194+ Args:
195+ name:
196+ The name of the field.
197+ value:
198+ The value to infer the type from. If None, the default type is used
199+ if available.
200+ default_type:
201+ The default field type. Defaults to None.
202+
203+ Returns:
204+ The schema field, or None if a type cannot be inferred.
205+ """
206+
207+ # Set the SchemaField datatype to the given default_type if the value
208+ # being assessed is None.
209+ if value is None :
210+ return schema .SchemaField (name , default_type )
211+
212+ # Map from Python types to BigQuery types. This isn't super exhaustive
213+ # because we rely more on pyarrow, which can check more than one value to
214+ # determine the type.
215+ type_mapping = {
216+ str : "STRING" ,
217+ }
218+
219+ # geopandas and shapely are optional dependencies, so only check if those
220+ # are installed.
221+ if _BaseGeometry is not None :
222+ type_mapping [_BaseGeometry ] = "GEOGRAPHY"
223+
224+ for type_ , bq_type in type_mapping .items ():
225+ if isinstance (value , type_ ):
226+ return schema .SchemaField (name , bq_type )
170227
 171228 # For timezone-naive datetimes, the later pyarrow conversion (used to
 172229 # try to learn the type) adds a timezone to such datetimes, causing them to be
@@ -182,35 +239,51 @@ def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
182239 else :
183240 return schema .SchemaField (name , "DATETIME" )
184241
185- if _BaseGeometry is not None and isinstance (value , _BaseGeometry ):
186- return schema .SchemaField (name , "GEOGRAPHY" )
187-
188242 return None
189243
190244
191- def values_to_bigquery_field (name , values ) -> Optional [schema .SchemaField ]:
245+ def values_to_bigquery_field (
246+ name : str , values : Any , default_type : str = "STRING"
247+ ) -> Optional [schema .SchemaField ]:
248+ """Infers the BigQuery schema field type from a list of values.
249+
250+ This function iterates through the given values to determine the
251+ corresponding schema field type.
252+
253+ Args:
254+ name:
255+ The name of the field.
256+ values:
257+ An iterable of values to infer the type from. If all the values
258+ are None or the iterable is empty, the function returns None.
259+ default_type:
260+ The default field type to use if a specific type cannot be
261+ determined from the values. Defaults to "STRING".
262+
263+ Returns:
264+ The schema field, or None if a type cannot be inferred.
265+ """
192266 value = pandas_gbq .core .pandas .first_valid (values )
193267
194- # All NULL, type not determinable.
268+ # All values came back as NULL, thus type not determinable by this method.
269+ # Return None so we can try other methods.
195270 if value is None :
196271 return None
197272
198- field = value_to_bigquery_field (name , value )
199- if field is not None :
273+ field = value_to_bigquery_field (name , value , default_type = default_type )
274+ if field :
200275 return field
201276
202- if isinstance (value , str ):
203- return schema .SchemaField (name , "STRING" )
204-
205- # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
206- # which can examine more values to determine all keys.
277+ # Check plain ARRAY values here. Exclude mapping types to let STRUCT get
278+ # determined by pyarrow, which can examine more values to determine all
279+ # keys.
207280 if isinstance (value , collections .abc .Iterable ) and not isinstance (
208281 value , collections .abc .Mapping
209282 ):
210283 # It could be that this value contains all None or is empty, so get the
211284 # first non-None value we can find.
212285 valid_item = pandas_gbq .core .pandas .first_array_valid (values )
213- field = value_to_bigquery_field (name , valid_item )
286+ field = value_to_bigquery_field (name , valid_item , default_type = default_type )
214287
215288 if field is not None :
216289 return schema .SchemaField (name , field .field_type , mode = "REPEATED" )
0 commit comments