diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs index a8f5c5816d4d..c0803641a6ea 100644 --- a/datafusion/functions/benches/to_timestamp.rs +++ b/datafusion/functions/benches/to_timestamp.rs @@ -115,15 +115,17 @@ fn criterion_benchmark(c: &mut Criterion) { let arg_field = Field::new("a", DataType::Utf8, false).into(); let arg_fields = vec![arg_field]; let config_options = Arc::new(ConfigOptions::default()); + let to_timestamp_udf = to_timestamp(config_options.as_ref()); c.bench_function("to_timestamp_no_formats_utf8", |b| { + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); let arr_data = data(); let batch_len = arr_data.len(); let string_array = ColumnarValue::Array(Arc::new(arr_data) as ArrayRef); b.iter(|| { black_box( - to_timestamp() + to_timestamp_udf .invoke_with_args(ScalarFunctionArgs { args: vec![string_array.clone()], arg_fields: arg_fields.clone(), @@ -137,13 +139,14 @@ fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("to_timestamp_no_formats_largeutf8", |b| { + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); let data = cast(&data(), &DataType::LargeUtf8).unwrap(); let batch_len = data.len(); let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); b.iter(|| { black_box( - to_timestamp() + to_timestamp_udf .invoke_with_args(ScalarFunctionArgs { args: vec![string_array.clone()], arg_fields: arg_fields.clone(), @@ -157,13 +160,14 @@ fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("to_timestamp_no_formats_utf8view", |b| { + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); let data = cast(&data(), &DataType::Utf8View).unwrap(); let batch_len = data.len(); let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); b.iter(|| { black_box( - to_timestamp() + to_timestamp_udf .invoke_with_args(ScalarFunctionArgs { args: vec![string_array.clone()], arg_fields: arg_fields.clone(), @@ -177,6 +181,7 @@ fn criterion_benchmark(c: &mut 
Criterion) { }); c.bench_function("to_timestamp_with_formats_utf8", |b| { + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); let (inputs, format1, format2, format3) = data_with_formats(); let batch_len = inputs.len(); @@ -196,7 +201,7 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - to_timestamp() + to_timestamp_udf .invoke_with_args(ScalarFunctionArgs { args: args.clone(), arg_fields: arg_fields.clone(), @@ -210,6 +215,7 @@ fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("to_timestamp_with_formats_largeutf8", |b| { + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); let (inputs, format1, format2, format3) = data_with_formats(); let batch_len = inputs.len(); @@ -237,7 +243,7 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - to_timestamp() + to_timestamp_udf .invoke_with_args(ScalarFunctionArgs { args: args.clone(), arg_fields: arg_fields.clone(), @@ -251,6 +257,7 @@ fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("to_timestamp_with_formats_utf8view", |b| { + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); let (inputs, format1, format2, format3) = data_with_formats(); let batch_len = inputs.len(); @@ -279,7 +286,7 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - to_timestamp() + to_timestamp_udf .invoke_with_args(ScalarFunctionArgs { args: args.clone(), arg_fields: arg_fields.clone(), diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs index 90b92a7f88f9..35013d93b1fb 100644 --- a/datafusion/functions/src/datetime/common.rs +++ b/datafusion/functions/src/datetime/common.rs @@ -15,8 +15,12 @@ // specific language governing permissions and limitations // under the License. 
+use std::fmt;
+use std::hash::{Hash, Hasher};
+use std::str::FromStr;
 use std::sync::Arc;

+use arrow::array::timezone::Tz;
 use arrow::array::{
     Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray,
     StringArrayType, StringViewArray,
@@ -25,9 +29,10 @@ use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
 use arrow::datatypes::DataType;
 use chrono::format::{parse, Parsed, StrftimeItems};
 use chrono::LocalResult::Single;
-use chrono::{DateTime, TimeZone, Utc};
+use chrono::{DateTime, FixedOffset, LocalResult, NaiveDateTime, TimeZone, Utc};

 use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::config::ConfigOptions;
 use datafusion_common::{
     exec_datafusion_err, exec_err, unwrap_or_internal_err, DataFusionError, Result,
     ScalarType, ScalarValue,
@@ -42,6 +47,321 @@ pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> {
     string_to_timestamp_nanos(s).map_err(|e| e.into())
 }

+/// Resolved form of a configured zone: an arrow `Tz` or a fixed UTC offset.
+#[derive(Clone, Copy, Debug)]
+enum ConfiguredZone {
+    Named(Tz),
+    Offset(FixedOffset),
+}
+
+/// Time zone used to interpret timestamp strings that carry no zone of their own.
+///
+/// `repr` is the textual spelling of the zone. `Debug`, equality and hashing
+/// look at `repr` only, so two spellings of the same zone compare as different.
+#[derive(Clone)]
+pub(crate) struct ConfiguredTimeZone {
+    repr: Arc<str>,
+    zone: ConfiguredZone,
+}
+
+impl ConfiguredTimeZone {
+    /// Returns the zero-offset zone, spelled `+00:00`.
+    pub(crate) fn utc() -> Self {
+        Self {
+            repr: Arc::from("+00:00"),
+            zone: ConfiguredZone::Offset(FixedOffset::east_opt(0).unwrap()),
+        }
+    }
+
+    /// Parses `tz`, trying arrow's `Tz` parser first and `parse_fixed_offset` second.
+    ///
+    /// Returns `Ok(None)` for an empty or all-whitespace string and an execution
+    /// error when neither parser accepts the text.
+    pub(crate) fn parse(tz: &str) -> Result<Option<Self>> {
+        let tz = tz.trim();
+        if tz.is_empty() {
+            return Ok(None);
+        }
+
+        if let Ok(named) = Tz::from_str(tz) {
+            return Ok(Some(Self {
+                repr: Arc::from(tz),
+                zone: ConfiguredZone::Named(named),
+            }));
+        }
+
+        if let Some(offset) = parse_fixed_offset(tz) {
+            return Ok(Some(Self {
+                repr: Arc::from(tz),
+                zone: ConfiguredZone::Offset(offset),
+            }));
+        }
+
+        Err(exec_datafusion_err!(
+            "Invalid execution timezone '{tz}'. Please provide an IANA timezone name (e.g. 'America/New_York') or an offset in the form '+HH:MM'."
+        ))
+    }
+
+    /// Builds the zone from `config.execution.time_zone`.
+    ///
+    /// An unset, blank or unparseable setting falls back to [`Self::utc`]; the
+    /// parse error is discarded here rather than reported to the caller.
+    pub(crate) fn from_config(config: &ConfigOptions) -> Self {
+        match Self::parse(config.execution.time_zone.as_deref().unwrap_or("")) {
+            Ok(Some(tz)) => tz,
+            _ => Self::utc(),
+        }
+    }
+
+    /// Interprets `naive` as local time in this zone and returns UTC nanoseconds.
+    fn timestamp_from_naive(&self, naive: &NaiveDateTime) -> Result<i64> {
+        match self.zone {
+            ConfiguredZone::Named(tz) => {
+                local_datetime_to_timestamp(tz.from_local_datetime(naive), &self.repr)
+            }
+            ConfiguredZone::Offset(offset) => {
+                local_datetime_to_timestamp(offset.from_local_datetime(naive), &self.repr)
+            }
+        }
+    }
+
+    /// Parses `s` with the chrono `format` in this zone and converts the result to UTC.
+    fn datetime_from_formatted(&self, s: &str, format: &str) -> Result<DateTime<Utc>> {
+        let datetime = match self.zone {
+            ConfiguredZone::Named(tz) => {
+                string_to_datetime_formatted(&tz, s, format)?.with_timezone(&Utc)
+            }
+            ConfiguredZone::Offset(offset) => {
+                string_to_datetime_formatted(&offset, s, format)?.with_timezone(&Utc)
+            }
+        };
+        Ok(datetime)
+    }
+}
+
+impl fmt::Debug for ConfiguredTimeZone {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("ConfiguredTimeZone")
+            .field("repr", &self.repr)
+            .finish()
+    }
+}
+
+impl PartialEq for ConfiguredTimeZone {
+    fn eq(&self, other: &Self) -> bool {
+        self.repr == other.repr
+    }
+}
+
+impl Eq for ConfiguredTimeZone {}
+
+impl Hash for ConfiguredTimeZone {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.repr.hash(state);
+    }
+}
+
+/// Parses a fixed UTC offset: `utc`/`z` in any ASCII case, or a signed offset.
+///
+/// Returns `None` when the text is not recognised.
+fn parse_fixed_offset(tz: &str) -> Option<FixedOffset> {
+    let tz = tz.trim();
+    if tz.eq_ignore_ascii_case("utc") || tz.eq_ignore_ascii_case("z") {
+        return FixedOffset::east_opt(0);
+    }
+
+    // Strict chrono-only path: normalize compact digit-only offsets
+    // (e.g. +05, +0500, +053045) into colon-separated forms and
+    // attempt parsing with chrono's Parsed API. Return None if chrono
+    // can't parse the provided string.
+    if let Some(first) = tz.chars().next() {
+        if first == '+' || first == '-' {
+            let sign = first;
+            let rest = &tz[1..];
+
+            let normalized = if rest.chars().all(|c| c.is_ascii_digit()) {
+                match rest.len() {
+                    2 => format!("{}{}:00", sign, &rest[0..2]),
+                    4 => format!("{}{}:{}", sign, &rest[0..2], &rest[2..4]),
+                    6 => {
+                        format!("{}{}:{}:{}", sign, &rest[0..2], &rest[2..4], &rest[4..6])
+                    }
+                    _ => tz.to_string(),
+                }
+            } else {
+                tz.to_string()
+            };
+
+            let try_formats = ["%::z", "%:z", "%z"];
+            for fmt in try_formats.iter() {
+                let mut parsed = Parsed::new();
+                if parse(&mut parsed, &normalized, StrftimeItems::new(fmt)).is_ok() {
+                    if let Ok(off) = parsed.to_fixed_offset() {
+                        return Some(off);
+                    }
+                }
+            }
+        }
+    }
+
+    None
+}
+
+/// Converts a local datetime result to a UTC timestamp in nanoseconds.
+///
+/// # DST Transition Behavior
+///
+/// This function handles daylight saving time (DST) transitions by returning an error
+/// when the local time is ambiguous or invalid:
+///
+/// ## Ambiguous Times (Fall Back)
+/// When clocks "fall back" (e.g., 2:00 AM becomes 1:00 AM), times in the repeated hour
+/// exist twice. For example, in America/New_York on 2024-11-03:
+/// - `2024-11-03 01:30:00` occurs both at UTC 05:30 (EDT) and UTC 06:30 (EST)
+///
+/// DataFusion returns an error rather than silently choosing one interpretation,
+/// ensuring users are aware of the ambiguity.
+///
+/// ## Invalid Times (Spring Forward)
+/// When clocks "spring forward" (e.g., 2:00 AM becomes 3:00 AM), times in the skipped hour
+/// don't exist. For example, in America/New_York on 2024-03-10:
+/// - `2024-03-10 02:30:00` never occurred (clocks jumped from 02:00 to 03:00)
+///
+/// DataFusion returns an error for these non-existent times.
+///
+/// ## Workarounds
+/// To avoid ambiguity errors:
+/// 1. Use timestamps with explicit timezone offsets (e.g., `2024-11-03 01:30:00-05:00`)
+/// 2. Convert to UTC before processing
+/// 3.
Use a timezone without DST (e.g., UTC, `America/Phoenix`)
+fn local_datetime_to_timestamp<T: TimeZone>(
+    result: LocalResult<DateTime<T>>,
+    tz_repr: &str,
+) -> Result<i64> {
+    match result {
+        Single(dt) => datetime_to_timestamp(dt.with_timezone(&Utc)),
+        LocalResult::Ambiguous(dt1, dt2) => Err(exec_datafusion_err!(
+            "The local time '{:?}' is ambiguous in timezone '{tz_repr}' (also corresponds to '{:?}').",
+            dt1.naive_local(),
+            dt2.naive_local()
+        )),
+        LocalResult::None => Err(exec_datafusion_err!(
+            "The local time is invalid in timezone '{tz_repr}'."
+        )),
+    }
+}
+
+/// Converts `datetime` to nanoseconds via `timestamp_nanos_opt`, mapping an
+/// unrepresentable value to `ERR_NANOSECONDS_NOT_SUPPORTED`.
+fn datetime_to_timestamp(datetime: DateTime<Utc>) -> Result<i64> {
+    datetime
+        .timestamp_nanos_opt()
+        .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
+}
+
+/// Splits a nanosecond count into whole seconds and a non-negative nanosecond
+/// remainder (Euclidean division) and builds the naive UTC datetime from them.
+fn timestamp_to_naive(value: i64) -> Result<NaiveDateTime> {
+    let secs = value.div_euclid(1_000_000_000);
+    let nanos = value.rem_euclid(1_000_000_000) as u32;
+    DateTime::<Utc>::from_timestamp(secs, nanos)
+        .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
+        .map(|dt| dt.naive_utc())
+}
+
+/// Returns `true` when `value` appears to carry its own time zone.
+///
+/// This is a heuristic, not a parser. The checks run in this order:
+/// 1. a trailing `Z` or `z`;
+/// 2. an IANA-style name (a `/` with an ASCII letter on each side) or one of
+///    the three-letter tokens in `ZONE_TOKENS`, ignoring ASCII case;
+/// 3. a final `+`/`-` that follows a time component (a `:` since the last
+///    space or `T`) and is followed by a run of 2, 4 or 6 digits;
+/// 4. as a last resort, a successful chrono RFC 3339 parse.
+///
+/// A `/` between digits (as in `05/17/2023`) is treated as a date separator,
+/// so such values still get the configured time zone applied.
+///
+/// False positives are possible: any text containing one of the tokens (for
+/// example `EST`) is reported as zoned.
+///
+/// # Examples
+/// ```ignore
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29Z"));
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29+05:00"));
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29 UTC"));
+/// assert!(!has_explicit_timezone("2020-09-08T13:42:29"));
+/// assert!(!has_explicit_timezone("05/17/2023 13:42:29"));
+/// ```
+fn has_explicit_timezone(value: &str) -> bool {
+    const ZONE_TOKENS: [&str; 10] = [
+        "UTC", "GMT", "PST", "PDT", "EST", "EDT", "CST", "CDT", "MST", "MDT",
+    ];
+
+    let v = value.trim();
+    let bytes = v.as_bytes();
+
+    if v.ends_with('Z') || v.ends_with('z') {
+        return true;
+    }
+
+    // IANA names look like `Area/Location`; requiring letters around the slash
+    // keeps `%m/%d/%Y`-style dates from being mistaken for a zone. Every token
+    // is three bytes long, so the same window width serves both checks.
+    let has_zone_text = bytes.windows(3).any(|w| {
+        (w[1] == b'/' && w[0].is_ascii_alphabetic() && w[2].is_ascii_alphabetic())
+            || ZONE_TOKENS
+                .iter()
+                .any(|token| w.eq_ignore_ascii_case(token.as_bytes()))
+    });
+    if has_zone_text {
+        return true;
+    }
+
+    if let Some(pos) = v.rfind(|c: char| c == '+' || c == '-') {
+        // `pos` is a byte offset, so inspect the preceding byte directly; an
+        // exponent marker (`1.5e+10`) is not a UTC offset.
+        let after_exponent = pos > 0 && matches!(bytes[pos - 1], b'e' | b'E');
+        if !after_exponent {
+            // The sign must follow a time component: require a `:` between the
+            // last space/`T` separator and the sign so date dashes don't count.
+            let time_start = v[..pos]
+                .rfind(|c: char| matches!(c, ' ' | 'T' | 't'))
+                .map_or(0, |sep| sep + 1);
+            if v[time_start..pos].contains(':') {
+                let digits = v[pos + 1..]
+                    .bytes()
+                    .take_while(u8::is_ascii_digit)
+                    .count();
+                if matches!(digits, 2 | 4 | 6) {
+                    return true;
+                }
+            }
+        }
+    }
+
+    // Last resort: full RFC 3339 parsing covers any remaining valid form.
+    DateTime::parse_from_rfc3339(v).is_ok()
+}
+
+/// Parses `s` to nanoseconds, applying `timezone` to zone-less input.
+///
+/// The string is parsed by `string_to_timestamp_nanos_shim` first. When
+/// `has_explicit_timezone(s)` is false, the parsed wall-clock value is then
+/// re-interpreted as local time in `timezone`.
+pub(crate) fn string_to_timestamp_nanos_with_timezone(
+    timezone: &ConfiguredTimeZone,
+    s: &str,
+) -> Result<i64> {
+    let ts = string_to_timestamp_nanos_shim(s)?;
+    if has_explicit_timezone(s) {
+        Ok(ts)
+    } else {
+        let naive = timestamp_to_naive(ts)?;
+        timezone.timestamp_from_naive(&naive)
+    }
+}
+
 /// Checks that all the arguments from the second are of type [Utf8], [LargeUtf8] or [Utf8View]
 ///
 /// [Utf8]: DataType::Utf8
@@ -152,6 +453,23 @@ pub(crate) fn string_to_timestamp_nanos_formatted(
         .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
 }

+/// Like [`string_to_timestamp_nanos_formatted`], but applies `timezone` when
+/// `s` shows no explicit zone (per `has_explicit_timezone`).
+///
+/// Returns the value produced by `timestamp_nanos_opt` on the UTC datetime.
+pub(crate) fn string_to_timestamp_nanos_formatted_with_timezone(
+    timezone: &ConfiguredTimeZone,
+    s: &str,
+    format: &str,
+) -> Result<i64> {
+    if has_explicit_timezone(s) {
+        return string_to_timestamp_nanos_formatted(s, format);
+    }
+
+    let datetime = timezone.datetime_from_formatted(s, format)?;
+    datetime_to_timestamp(datetime)
+}
+
 /// Accepts a string with a `chrono` format and converts it to a
 /// millisecond precision timestamp.
/// @@ -442,3 +756,149 @@ where // first map is the iterator, second is for the `Option<_>` array.iter().map(|x| x.map(&op).transpose()).collect() } + +#[cfg(test)] +mod tests { + use super::{has_explicit_timezone, ConfiguredTimeZone}; + use datafusion_common::config::ConfigOptions; + + #[test] + fn parse_empty_timezone_returns_none() { + assert!(ConfiguredTimeZone::parse(" ").unwrap().is_none()); + } + + #[test] + fn from_config_blank_timezone_defaults_to_utc() { + let mut config = ConfigOptions::default(); + config.execution.time_zone = None; + + let timezone = ConfiguredTimeZone::from_config(&config); + assert_eq!(timezone, ConfiguredTimeZone::utc()); + } + + #[test] + fn detects_timezone_token_outside_tail() { + assert!(has_explicit_timezone("UTC 2024-01-01 12:00:00")); + assert!(has_explicit_timezone("2020-09-08T13:42:29UTC")); + assert!(has_explicit_timezone("America/New_York 2020-09-08")); + } + + #[test] + fn detects_offsets_without_colons() { + // ISO-8601 formats with offsets (no colons) + assert!(has_explicit_timezone("2020-09-08T13:42:29+0500")); + assert!(has_explicit_timezone("2020-09-08T13:42:29-0330")); + assert!(has_explicit_timezone("2020-09-08T13:42:29+05")); + assert!(has_explicit_timezone("2020-09-08T13:42:29-08")); + + // 4-digit offsets + assert!(has_explicit_timezone("2024-01-01T12:00:00+0000")); + assert!(has_explicit_timezone("2024-01-01T12:00:00-1200")); + + // 6-digit offsets (with seconds) + assert!(has_explicit_timezone("2024-01-01T12:00:00+053045")); + assert!(has_explicit_timezone("2024-01-01T12:00:00-123045")); + + // Lowercase 't' separator + assert!(has_explicit_timezone("2020-09-08t13:42:29+0500")); + assert!(has_explicit_timezone("2020-09-08t13:42:29-0330")); + } + + #[test] + fn detects_offsets_with_colons() { + assert!(has_explicit_timezone("2020-09-08T13:42:29+05:00")); + assert!(has_explicit_timezone("2020-09-08T13:42:29-03:30")); + assert!(has_explicit_timezone("2020-09-08T13:42:29+05:00:45")); + } + + #[test] + fn 
detects_z_suffix() { + assert!(has_explicit_timezone("2020-09-08T13:42:29Z")); + assert!(has_explicit_timezone("2020-09-08T13:42:29z")); + } + + #[test] + fn rejects_naive_timestamps() { + assert!(!has_explicit_timezone("2020-09-08T13:42:29")); + assert!(!has_explicit_timezone("2020-09-08 13:42:29")); + assert!(!has_explicit_timezone("2024-01-01 12:00:00")); + + // Date formats with dashes that could be confused with offsets + assert!(!has_explicit_timezone("03:59:00.123456789 05-17-2023")); + assert!(!has_explicit_timezone("12:00:00 01-02-2024")); + } + + #[test] + fn rejects_scientific_notation() { + // Should not treat scientific notation as timezone offset + assert!(!has_explicit_timezone("1.5e+10")); + assert!(!has_explicit_timezone("2.3E-05")); + } + + #[test] + fn test_offset_without_colon_parsing() { + use super::{ + string_to_timestamp_nanos_shim, string_to_timestamp_nanos_with_timezone, + }; + + // Test the exact case from the issue: 2020-09-08T13:42:29+0500 + // This should parse correctly as having an explicit offset + let utc_tz = ConfiguredTimeZone::parse("UTC") + .unwrap() + .expect("UTC should parse"); + let result_utc = + string_to_timestamp_nanos_with_timezone(&utc_tz, "2020-09-08T13:42:29+0500") + .unwrap(); + + // Parse the equivalent RFC3339 format with colon to get the expected value + let expected = + string_to_timestamp_nanos_shim("2020-09-08T13:42:29+05:00").unwrap(); + assert_eq!(result_utc, expected); + + // Test with America/New_York timezone - should NOT double-adjust + // Because the timestamp has an explicit timezone, the session timezone should be ignored + let ny_tz = ConfiguredTimeZone::parse("America/New_York") + .unwrap() + .expect("America/New_York should parse"); + let result_ny = + string_to_timestamp_nanos_with_timezone(&ny_tz, "2020-09-08T13:42:29+0500") + .unwrap(); + + // The result should be the same as UTC because the timestamp has an explicit timezone + assert_eq!(result_ny, expected); + + // Test other offset formats 
without colons + let result2 = + string_to_timestamp_nanos_with_timezone(&utc_tz, "2020-09-08T13:42:29-0330") + .unwrap(); + let expected2 = + string_to_timestamp_nanos_shim("2020-09-08T13:42:29-03:30").unwrap(); + assert_eq!(result2, expected2); + + // Test 2-digit offsets + let result3 = + string_to_timestamp_nanos_with_timezone(&utc_tz, "2020-09-08T13:42:29+05") + .unwrap(); + let expected3 = + string_to_timestamp_nanos_shim("2020-09-08T13:42:29+05:00").unwrap(); + assert_eq!(result3, expected3); + } + + #[test] + fn detects_named_timezones_with_trailing_offsets() { + use super::has_explicit_timezone; + + // Named timezone tokens and trailing offsets should be detected by the + // simplified explicit-timezone heuristic. + assert!(has_explicit_timezone("America/Los_Angeles+8")); + assert!(has_explicit_timezone("PST+8")); + assert!(has_explicit_timezone("Europe/London-1")); + assert!(has_explicit_timezone("UTC+0")); + + assert!(has_explicit_timezone( + "2024-01-01T12:00:00 Europe/London+05" + )); + assert!(has_explicit_timezone("Meeting at 12:00 PST+8")); + assert!(has_explicit_timezone("Event at 12:00 Europe/London-1")); + } +} diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index d80f14facf82..43189092e8f5 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -49,11 +49,14 @@ make_udf_function!(to_char::ToCharFunc, to_char); make_udf_function!(to_date::ToDateFunc, to_date); make_udf_function!(to_local_time::ToLocalTimeFunc, to_local_time); make_udf_function!(to_unixtime::ToUnixtimeFunc, to_unixtime); -make_udf_function!(to_timestamp::ToTimestampFunc, to_timestamp); -make_udf_function!(to_timestamp::ToTimestampSecondsFunc, to_timestamp_seconds); -make_udf_function!(to_timestamp::ToTimestampMillisFunc, to_timestamp_millis); -make_udf_function!(to_timestamp::ToTimestampMicrosFunc, to_timestamp_micros); -make_udf_function!(to_timestamp::ToTimestampNanosFunc, 
to_timestamp_nanos); +make_udf_function_with_config!(to_timestamp::ToTimestampFunc, to_timestamp); +make_udf_function_with_config!( + to_timestamp::ToTimestampSecondsFunc, + to_timestamp_seconds +); +make_udf_function_with_config!(to_timestamp::ToTimestampMillisFunc, to_timestamp_millis); +make_udf_function_with_config!(to_timestamp::ToTimestampMicrosFunc, to_timestamp_micros); +make_udf_function_with_config!(to_timestamp::ToTimestampNanosFunc, to_timestamp_nanos); // create UDF with config make_udf_function_with_config!(now::NowFunc, now); @@ -107,23 +110,23 @@ pub mod expr_fn { ),( to_timestamp, "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`", - args, + @config args, ),( to_timestamp_seconds, "converts a string and optional formats to a `Timestamp(Seconds, None)`", - args, + @config args, ),( to_timestamp_millis, "converts a string and optional formats to a `Timestamp(Milliseconds, None)`", - args, + @config args, ),( to_timestamp_micros, "converts a string and optional formats to a `Timestamp(Microseconds, None)`", - args, + @config args, ),( to_timestamp_nanos, "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`", - args, + @config args, )); /// Returns a string representation of a date, time, timestamp or duration based @@ -259,6 +262,7 @@ pub mod expr_fn { /// Returns all DataFusion functions defined in this package pub fn functions() -> Vec> { use datafusion_common::config::ConfigOptions; + let config = ConfigOptions::default(); vec![ current_date(), current_time(), @@ -267,15 +271,15 @@ pub fn functions() -> Vec> { date_trunc(), from_unixtime(), make_date(), - now(&ConfigOptions::default()), + now(&config), to_char(), to_date(), to_local_time(), to_unixtime(), - to_timestamp(), - to_timestamp_seconds(), - to_timestamp_millis(), - to_timestamp_micros(), - to_timestamp_nanos(), + to_timestamp(&config), + to_timestamp_seconds(&config), + to_timestamp_millis(&config), + to_timestamp_micros(&config), + 
to_timestamp_nanos(&config), ] } diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index 0a0700097770..5ac19345bb1b 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -26,10 +26,11 @@ use arrow::datatypes::{ ArrowTimestampType, DataType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; +use datafusion_common::config::ConfigOptions; use datafusion_common::format::DEFAULT_CAST_OPTIONS; use datafusion_common::{exec_err, Result, ScalarType, ScalarValue}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarUDF, ScalarUDFImpl, Signature, Volatility, }; use datafusion_macros::user_doc; @@ -69,6 +70,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo #[derive(Debug, PartialEq, Eq, Hash)] pub struct ToTimestampFunc { signature: Signature, + timezone: ConfiguredTimeZone, } #[user_doc( @@ -103,6 +105,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo #[derive(Debug, PartialEq, Eq, Hash)] pub struct ToTimestampSecondsFunc { signature: Signature, + timezone: ConfiguredTimeZone, } #[user_doc( @@ -137,6 +140,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo #[derive(Debug, PartialEq, Eq, Hash)] pub struct ToTimestampMillisFunc { signature: Signature, + timezone: ConfiguredTimeZone, } #[user_doc( @@ -171,6 +175,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo #[derive(Debug, PartialEq, Eq, Hash)] pub struct ToTimestampMicrosFunc { signature: Signature, + timezone: ConfiguredTimeZone, } #[user_doc( @@ -205,74 +210,105 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo #[derive(Debug, PartialEq, Eq, Hash)] pub struct ToTimestampNanosFunc { 
signature: Signature, + timezone: ConfiguredTimeZone, } impl Default for ToTimestampFunc { fn default() -> Self { - Self::new() + Self::new_with_config(&ConfigOptions::default()) } } impl ToTimestampFunc { + #[deprecated(since = "50.3.0", note = "use new_with_config instead")] pub fn new() -> Self { + Self::new_with_config(&ConfigOptions::default()) + } + + pub fn new_with_config(config: &ConfigOptions) -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), + timezone: ConfiguredTimeZone::from_config(config), } } } impl Default for ToTimestampSecondsFunc { fn default() -> Self { - Self::new() + Self::new_with_config(&ConfigOptions::default()) } } impl ToTimestampSecondsFunc { + #[deprecated(since = "50.3.0", note = "use new_with_config instead")] pub fn new() -> Self { + Self::new_with_config(&ConfigOptions::default()) + } + + pub fn new_with_config(config: &ConfigOptions) -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), + timezone: ConfiguredTimeZone::from_config(config), } } } impl Default for ToTimestampMillisFunc { fn default() -> Self { - Self::new() + Self::new_with_config(&ConfigOptions::default()) } } impl ToTimestampMillisFunc { + #[deprecated(since = "50.3.0", note = "use new_with_config instead")] pub fn new() -> Self { + Self::new_with_config(&ConfigOptions::default()) + } + + pub fn new_with_config(config: &ConfigOptions) -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), + timezone: ConfiguredTimeZone::from_config(config), } } } impl Default for ToTimestampMicrosFunc { fn default() -> Self { - Self::new() + Self::new_with_config(&ConfigOptions::default()) } } impl ToTimestampMicrosFunc { + #[deprecated(since = "50.3.0", note = "use new_with_config instead")] pub fn new() -> Self { + Self::new_with_config(&ConfigOptions::default()) + } + + pub fn new_with_config(config: &ConfigOptions) -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), + timezone: 
ConfiguredTimeZone::from_config(config), } } } impl Default for ToTimestampNanosFunc { fn default() -> Self { - Self::new() + Self::new_with_config(&ConfigOptions::default()) } } impl ToTimestampNanosFunc { + #[deprecated(since = "50.3.0", note = "use new_with_config instead")] pub fn new() -> Self { + Self::new_with_config(&ConfigOptions::default()) + } + + pub fn new_with_config(config: &ConfigOptions) -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), + timezone: ConfiguredTimeZone::from_config(config), } } } @@ -296,6 +332,10 @@ impl ScalarUDFImpl for ToTimestampFunc { &self.signature } + fn with_updated_config(&self, config: &ConfigOptions) -> Option { + Some(Self::new_with_config(config).into()) + } + fn return_type(&self, arg_types: &[DataType]) -> Result { Ok(return_type_for(&arg_types[0], Nanosecond)) } @@ -304,7 +344,7 @@ impl ScalarUDFImpl for ToTimestampFunc { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - let args = args.args; + let datafusion_expr::ScalarFunctionArgs { args, .. 
} = args; if args.is_empty() { return exec_err!( "to_timestamp function requires 1 or more arguments, got {}", @@ -340,9 +380,11 @@ impl ScalarUDFImpl for ToTimestampFunc { Timestamp(_, Some(tz)) => { args[0].cast_to(&Timestamp(Nanosecond, Some(tz)), None) } - Utf8View | LargeUtf8 | Utf8 => { - to_timestamp_impl::(&args, "to_timestamp") - } + Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::( + &args, + "to_timestamp", + &self.timezone, + ), Decimal128(_, _) => { match &args[0] { ColumnarValue::Scalar(ScalarValue::Decimal128( @@ -390,6 +432,10 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc { &self.signature } + fn with_updated_config(&self, config: &ConfigOptions) -> Option { + Some(Self::new_with_config(config).into()) + } + fn return_type(&self, arg_types: &[DataType]) -> Result { Ok(return_type_for(&arg_types[0], Second)) } @@ -398,7 +444,7 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - let args = args.args; + let datafusion_expr::ScalarFunctionArgs { args, .. 
} = args; if args.is_empty() { return exec_err!( "to_timestamp_seconds function requires 1 or more arguments, got {}", @@ -416,9 +462,11 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc { args[0].cast_to(&Timestamp(Second, None), None) } Timestamp(_, Some(tz)) => args[0].cast_to(&Timestamp(Second, Some(tz)), None), - Utf8View | LargeUtf8 | Utf8 => { - to_timestamp_impl::(&args, "to_timestamp_seconds") - } + Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::( + &args, + "to_timestamp_seconds", + &self.timezone, + ), other => { exec_err!( "Unsupported data type {} for function to_timestamp_seconds", @@ -445,6 +493,10 @@ impl ScalarUDFImpl for ToTimestampMillisFunc { &self.signature } + fn with_updated_config(&self, config: &ConfigOptions) -> Option { + Some(Self::new_with_config(config).into()) + } + fn return_type(&self, arg_types: &[DataType]) -> Result { Ok(return_type_for(&arg_types[0], Millisecond)) } @@ -453,7 +505,7 @@ impl ScalarUDFImpl for ToTimestampMillisFunc { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - let args = args.args; + let datafusion_expr::ScalarFunctionArgs { args, .. } = args; if args.is_empty() { return exec_err!( "to_timestamp_millis function requires 1 or more arguments, got {}", @@ -476,6 +528,7 @@ impl ScalarUDFImpl for ToTimestampMillisFunc { Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::( &args, "to_timestamp_millis", + &self.timezone, ), other => { exec_err!( @@ -503,6 +556,10 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc { &self.signature } + fn with_updated_config(&self, config: &ConfigOptions) -> Option { + Some(Self::new_with_config(config).into()) + } + fn return_type(&self, arg_types: &[DataType]) -> Result { Ok(return_type_for(&arg_types[0], Microsecond)) } @@ -511,7 +568,7 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - let args = args.args; + let datafusion_expr::ScalarFunctionArgs { args, .. 
} = args; if args.is_empty() { return exec_err!( "to_timestamp_micros function requires 1 or more arguments, got {}", @@ -534,6 +591,7 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc { Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::( &args, "to_timestamp_micros", + &self.timezone, ), other => { exec_err!( @@ -561,6 +619,10 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { &self.signature } + fn with_updated_config(&self, config: &ConfigOptions) -> Option { + Some(Self::new_with_config(config).into()) + } + fn return_type(&self, arg_types: &[DataType]) -> Result { Ok(return_type_for(&arg_types[0], Nanosecond)) } @@ -569,7 +631,7 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { - let args = args.args; + let datafusion_expr::ScalarFunctionArgs { args, .. } = args; if args.is_empty() { return exec_err!( "to_timestamp_nanos function requires 1 or more arguments, got {}", @@ -589,9 +651,11 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { Timestamp(_, Some(tz)) => { args[0].cast_to(&Timestamp(Nanosecond, Some(tz)), None) } - Utf8View | LargeUtf8 | Utf8 => { - to_timestamp_impl::(&args, "to_timestamp_nanos") - } + Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::( + &args, + "to_timestamp_nanos", + &self.timezone, + ), other => { exec_err!( "Unsupported data type {} for function to_timestamp_nanos", @@ -617,6 +681,7 @@ fn return_type_for(arg: &DataType, unit: TimeUnit) -> DataType { fn to_timestamp_impl>( args: &[ColumnarValue], name: &str, + timezone: &ConfiguredTimeZone, ) -> Result { let factor = match T::UNIT { Second => 1_000_000_000, @@ -626,17 +691,30 @@ fn to_timestamp_impl>( }; match args.len() { - 1 => handle::( - args, - |s| string_to_timestamp_nanos_shim(s).map(|n| n / factor), - name, - ), - n if n >= 2 => handle_multiple::( - args, - string_to_timestamp_nanos_formatted, - |n| n / factor, - name, - ), + 1 => { + let timezone = timezone.clone(); + handle::( + args, + move |s| { + 
string_to_timestamp_nanos_with_timezone(&timezone, s) + .map(|n| n / factor) + }, + name, + ) + } + n if n >= 2 => { + let timezone = timezone.clone(); + handle_multiple::( + args, + move |s, format| { + string_to_timestamp_nanos_formatted_with_timezone( + &timezone, s, format, + ) + }, + |n| n / factor, + name, + ) + } _ => exec_err!("Unsupported 0 argument count for function {name}"), } } @@ -645,6 +723,9 @@ fn to_timestamp_impl>( mod tests { use std::sync::Arc; + use crate::datetime::common::{ + string_to_timestamp_nanos_formatted, ConfiguredTimeZone, + }; use arrow::array::types::Int64Type; use arrow::array::{ Array, PrimitiveArray, TimestampMicrosecondArray, TimestampMillisecondArray, @@ -660,27 +741,44 @@ mod tests { use super::*; fn to_timestamp(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp") + let timezone = ConfiguredTimeZone::utc(); + to_timestamp_impl::(args, "to_timestamp", &timezone) } /// to_timestamp_millis SQL function fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp_millis") + let timezone = ConfiguredTimeZone::utc(); + to_timestamp_impl::( + args, + "to_timestamp_millis", + &timezone, + ) } /// to_timestamp_micros SQL function fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp_micros") + let timezone = ConfiguredTimeZone::utc(); + to_timestamp_impl::( + args, + "to_timestamp_micros", + &timezone, + ) } /// to_timestamp_nanos SQL function fn to_timestamp_nanos(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp_nanos") + let timezone = ConfiguredTimeZone::utc(); + to_timestamp_impl::( + args, + "to_timestamp_nanos", + &timezone, + ) } /// to_timestamp_seconds SQL function fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp_seconds") + let timezone = ConfiguredTimeZone::utc(); + to_timestamp_impl::(args, "to_timestamp_seconds", &timezone) } 
#[test] @@ -751,6 +849,246 @@ mod tests { Ok(()) } + #[test] + fn to_timestamp_respects_execution_timezone() -> Result<()> { + let mut options = ConfigOptions::default(); + options.execution.time_zone = Some("-05:00".to_string()); + let udf = ToTimestampFunc::new_with_config(&options); + let field = Field::new("arg", Utf8, true).into(); + + let args = datafusion_expr::ScalarFunctionArgs { + args: vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "2020-09-08T13:42:29".to_string(), + )))], + arg_fields: vec![field], + number_rows: 1, + return_field: Field::new("f", Timestamp(Nanosecond, None), true).into(), + config_options: Arc::new(ConfigOptions::default()), + }; + + let result = udf.invoke_with_args(args)?; + let ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(value), None)) = + result + else { + panic!("expected scalar timestamp"); + }; + + let expected = string_to_timestamp_nanos_shim("2020-09-08T18:42:29Z")?; + assert_eq!(value, expected); + Ok(()) + } + + #[test] + fn to_timestamp_invalid_execution_timezone_behavior() -> Result<()> { + let field: Arc = Field::new("arg", Utf8, true).into(); + let return_field: Arc = + Field::new("f", Timestamp(Nanosecond, None), true).into(); + + let mut options = ConfigOptions::default(); + options.execution.time_zone = Some("Invalid/Timezone".to_string()); + let udf = ToTimestampFunc::new_with_config(&options); + + let explicit_args = datafusion_expr::ScalarFunctionArgs { + args: vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "2020-09-08T13:42:29Z".to_string(), + )))], + arg_fields: vec![Arc::clone(&field)], + number_rows: 1, + return_field: Arc::clone(&return_field), + config_options: Arc::new(ConfigOptions::default()), + }; + + let explicit_result = udf.invoke_with_args(explicit_args)?; + let ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(value), None)) = + explicit_result + else { + panic!("expected scalar timestamp"); + }; + + let expected = 
string_to_timestamp_nanos_shim("2020-09-08T13:42:29Z")?; + assert_eq!(value, expected); + + let naive_args = datafusion_expr::ScalarFunctionArgs { + args: vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "2020-09-08T13:42:29".to_string(), + )))], + arg_fields: vec![field], + number_rows: 1, + return_field, + config_options: Arc::new(ConfigOptions::default()), + }; + + let naive_result = udf.invoke_with_args(naive_args)?; + let ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(value), None)) = + naive_result + else { + panic!("expected scalar timestamp"); + }; + + let expected = string_to_timestamp_nanos_shim("2020-09-08T13:42:29Z")?; + assert_eq!(value, expected); + Ok(()) + } + + #[test] + fn to_timestamp_formats_invalid_execution_timezone_behavior() -> Result<()> { + let expr_field: Arc = Field::new("arg", Utf8, true).into(); + let format_field: Arc = Field::new("fmt", Utf8, true).into(); + let return_field: Arc = + Field::new("f", Timestamp(Nanosecond, None), true).into(); + + let mut options = ConfigOptions::default(); + options.execution.time_zone = Some("Invalid/Timezone".to_string()); + let udf = ToTimestampFunc::new_with_config(&options); + + let make_args = |value: &str, format: &str| datafusion_expr::ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(value.to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(format.to_string()))), + ], + arg_fields: vec![Arc::clone(&expr_field), Arc::clone(&format_field)], + number_rows: 1, + return_field: Arc::clone(&return_field), + config_options: Arc::new(ConfigOptions::default()), + }; + + for (value, format, expected_str) in [ + ("2020-09-08T13:42:29Z", "%+", Some("2020-09-08T13:42:29Z")), + ( + "2020-09-08 13:42:29 +0000", + "%Y-%m-%d %H:%M:%S %z", + Some("2020-09-08T13:42:29+00:00"), + ), + ( + "2020-09-08T13:42:29UTC", + "%Y-%m-%dT%H:%M:%S%Z", + Some("2020-09-08T13:42:29Z"), + ), + ( + "UTC 2024-01-01 12:00:00", + "%Z %Y-%m-%d %H:%M:%S", + 
Some("2024-01-01T12:00:00Z"), + ), + ( + "2020-09-08 09:42:29 America/New_York", + "%Y-%m-%d %H:%M:%S %Z", + None, + ), + ("20200908134229+0100", "%Y%m%d%H%M%S%z", None), + ("2020-09-08+0230 13:42", "%Y-%m-%d%z %H:%M", None), + ] { + let result = udf.invoke_with_args(make_args(value, format))?; + let ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( + Some(actual), + None, + )) = result + else { + panic!("expected scalar timestamp"); + }; + let expected = match expected_str { + Some(expected_str) => string_to_timestamp_nanos_shim(expected_str)?, + None => string_to_timestamp_nanos_formatted(value, format)?, + }; + assert_eq!(actual, expected); + } + + let naive_args = datafusion_expr::ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "2020-09-08T13:42:29".to_string(), + ))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "%Y-%m-%dT%H:%M:%S".to_string(), + ))), + ], + arg_fields: vec![Arc::clone(&expr_field), Arc::clone(&format_field)], + number_rows: 1, + return_field: Arc::clone(&return_field), + config_options: Arc::new(ConfigOptions::default()), + }; + + let naive_result = udf.invoke_with_args(naive_args)?; + let ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(actual), None)) = + naive_result + else { + panic!("expected scalar timestamp"); + }; + + let expected = string_to_timestamp_nanos_formatted( + "2020-09-08T13:42:29", + "%Y-%m-%dT%H:%M:%S", + )?; + assert_eq!(actual, expected); + Ok(()) + } + + #[test] + fn to_timestamp_formats_respect_timezone() -> Result<()> { + let timezone = + ConfiguredTimeZone::parse("Asia/Tokyo")?.expect("Asia/Tokyo should parse"); + let args = vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "03:59:00.123456789 05-17-2023".to_string(), + ))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "%H:%M:%S%.f %m-%d-%Y".to_string(), + ))), + ]; + + let result = to_timestamp_impl::( + &args, + "to_timestamp", + &timezone, + )?; + + let 
ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(value), None)) = + result + else { + panic!("expected scalar timestamp"); + }; + + let expected = string_to_timestamp_nanos_shim("2023-05-16T18:59:00.123456789Z")?; + assert_eq!(value, expected); + Ok(()) + } + + #[test] + fn to_timestamp_session_timezone_applied() -> Result<()> { + let timezone = ConfiguredTimeZone::parse("America/New_York")? + .expect("America/New_York should parse"); + let args = vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "2020-09-08 13-42-29".to_string(), + ))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some( + "%Y-%m-%d %H-%M-%S".to_string(), + ))), + ]; + + let result = to_timestamp_impl::( + &args, + "to_timestamp", + &timezone, + )?; + + let ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(value), None)) = + result + else { + panic!("expected scalar timestamp"); + }; + + let expected = string_to_timestamp_nanos_shim("2020-09-08T17:42:29Z")?; + assert_eq!(value, expected); + Ok(()) + } + + #[test] + fn to_timestamp_invalid_session_timezone_errors() { + let err = + ConfiguredTimeZone::parse("Not/A_Zone").expect_err("expected parse error"); + assert_contains!(err.to_string(), "Invalid execution timezone"); + } + #[test] fn to_timestamp_invalid_input_type() -> Result<()> { // pass the wrong type of input array to to_timestamp and test @@ -989,11 +1327,19 @@ mod tests { #[test] fn test_tz() { let udfs: Vec> = vec![ - Box::new(ToTimestampFunc::new()), - Box::new(ToTimestampSecondsFunc::new()), - Box::new(ToTimestampMillisFunc::new()), - Box::new(ToTimestampNanosFunc::new()), - Box::new(ToTimestampSecondsFunc::new()), + Box::new(ToTimestampFunc::new_with_config(&ConfigOptions::default())), + Box::new(ToTimestampSecondsFunc::new_with_config( + &ConfigOptions::default(), + )), + Box::new(ToTimestampMillisFunc::new_with_config( + &ConfigOptions::default(), + )), + Box::new(ToTimestampNanosFunc::new_with_config( + &ConfigOptions::default(), + )), + 
Box::new(ToTimestampSecondsFunc::new_with_config( + &ConfigOptions::default(), + )), ]; let mut nanos_builder = TimestampNanosecondArray::builder(2); diff --git a/datafusion/functions/src/datetime/to_unixtime.rs b/datafusion/functions/src/datetime/to_unixtime.rs index 42651cd53716..64f201b85ec3 100644 --- a/datafusion/functions/src/datetime/to_unixtime.rs +++ b/datafusion/functions/src/datetime/to_unixtime.rs @@ -114,9 +114,11 @@ impl ScalarUDFImpl for ToUnixtimeFunc { DataType::Timestamp(_, tz) => arg_args[0] .cast_to(&DataType::Timestamp(TimeUnit::Second, tz), None)? .cast_to(&DataType::Int64, None), - DataType::Utf8 => ToTimestampSecondsFunc::new() - .invoke_with_args(args)? - .cast_to(&DataType::Int64, None), + DataType::Utf8 => { + ToTimestampSecondsFunc::new_with_config(args.config_options.as_ref()) + .invoke_with_args(args)? + .cast_to(&DataType::Int64, None) + } other => { exec_err!("Unsupported data type {} for function to_unixtime", other) } diff --git a/datafusion/functions/src/macros.rs b/datafusion/functions/src/macros.rs index 9e195f2d5291..9820f53992ca 100644 --- a/datafusion/functions/src/macros.rs +++ b/datafusion/functions/src/macros.rs @@ -41,6 +41,17 @@ /// - `Vec` argument (single argument followed by a comma) /// - Variable number of `Expr` arguments (zero or more arguments, must be without commas) /// - Functions that require config (marked with `@config` prefix) +/// +/// Note on configuration construction paths: +/// - The convenience wrappers generated for `@config` functions call the inner +/// constructor with `ConfigOptions::default()`. These wrappers are intended +/// primarily for programmatic `Expr` construction and convenience usage. +/// - When functions are registered in a session, DataFusion will call +/// `with_updated_config()` to create a `ScalarUDF` instance using the session's +/// actual `ConfigOptions`. This also happens when configuration changes at runtime +/// (e.g., via `SET` statements). 
In short: the macro uses the default config for +/// convenience constructors; the session config is applied when functions are +/// registered or when configuration is updated. #[macro_export] macro_rules! export_functions { ($(($FUNC:ident, $DOC:expr, $($arg:tt)*)),*) => { @@ -59,6 +70,24 @@ macro_rules! export_functions { } }; + // function that requires config and takes a vector argument + (single $FUNC:ident, $DOC:expr, @config $arg:ident,) => { + #[doc = $DOC] + pub fn $FUNC($arg: Vec) -> datafusion_expr::Expr { + use datafusion_common::config::ConfigOptions; + super::$FUNC(&ConfigOptions::default()).call($arg) + } + }; + + // function that requires config and variadic arguments + (single $FUNC:ident, $DOC:expr, @config $($arg:ident)*) => { + #[doc = $DOC] + pub fn $FUNC($($arg: datafusion_expr::Expr),*) -> datafusion_expr::Expr { + use datafusion_common::config::ConfigOptions; + super::$FUNC(&ConfigOptions::default()).call(vec![$($arg),*]) + } + }; + // single vector argument (a single argument followed by a comma) (single $FUNC:ident, $DOC:expr, $arg:ident,) => { #[doc = $DOC] diff --git a/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt b/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt new file mode 100644 index 000000000000..c03cb90ae278 --- /dev/null +++ b/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt @@ -0,0 +1,157 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## to_timestamp timezone tests +########## + +## Test 1: Default timezone (UTC) with naive timestamp +## Naive timestamps (without explicit timezone) should be interpreted as UTC by default +query P +SELECT to_timestamp('2020-09-08T13:42:29'); +---- +2020-09-08T13:42:29 + +## Test 2: Explicit UTC timezone ('Z' suffix) +## Explicit timezone should be respected regardless of session timezone +query P +SELECT to_timestamp('2020-09-08T13:42:29Z'); +---- +2020-09-08T13:42:29 + +## Test 3: Explicit timezone offset (+05:00) +## Explicit offset should be respected - this is 13:42:29+05:00 which is 08:42:29 UTC +query P +SELECT to_timestamp('2020-09-08T13:42:29+05:00'); +---- +2020-09-08T08:42:29 + +## Test 4: Explicit timezone offset without colon (+0500) +## Should handle offset formats without colons +query P +SELECT to_timestamp('2020-09-08T13:42:29+0500'); +---- +2020-09-08T08:42:29 + +## Test 5: Negative timezone offset +query P +SELECT to_timestamp('2020-09-08T13:42:29-03:30'); +---- +2020-09-08T17:12:29 + +## Test 6: Configure session timezone to America/New_York +statement ok +SET datafusion.execution.time_zone = 'America/New_York'; + +## Test 7: Naive timestamp with configured timezone +## '2020-09-08T13:42:29' in America/New_York is EDT (UTC-4) +## So this should become '2020-09-08T17:42:29Z' in UTC +query P +SELECT to_timestamp('2020-09-08T13:42:29'); +---- +2020-09-08T17:42:29 + +## Test 8: Explicit UTC should override session timezone +## Even with America/New_York configured, explicit 'Z' should be respected +query P +SELECT 
to_timestamp('2020-09-08T13:42:29Z'); +---- +2020-09-08T13:42:29 + +## Test 9: Explicit offset should override session timezone +query P +SELECT to_timestamp('2020-09-08T13:42:29+05:00'); +---- +2020-09-08T08:42:29 + +## Test 10: Check arrow_typeof returns no timezone in result +## Result should be Timestamp(Nanosecond, None) regardless of input timezone +query T +SELECT arrow_typeof(to_timestamp('2020-09-08T13:42:29')); +---- +Timestamp(ns) + +## Test 11: Configure to offset-based timezone +statement ok +SET datafusion.execution.time_zone = '+05:30'; + +## Test 12: Naive timestamp with offset-based timezone +## '2020-09-08T13:42:29' in +05:30 should become '2020-09-08T08:12:29Z' +query P +SELECT to_timestamp('2020-09-08T13:42:29'); +---- +2020-09-08T08:12:29 + +## Test 13: Reset to UTC +statement ok +SET datafusion.execution.time_zone = 'UTC'; + +## Test 14: Naive timestamp back to UTC +query P +SELECT to_timestamp('2020-09-08T13:42:29'); +---- +2020-09-08T13:42:29 + +## Test 15: to_timestamp with format string - naive timestamp with session timezone +statement ok +SET datafusion.execution.time_zone = 'America/New_York'; + +query P +SELECT to_timestamp('2020-09-08 13:42:29', '%Y-%m-%d %H:%M:%S'); +---- +2020-09-08T17:42:29 + +## Test 16: to_timestamp with format string - explicit timezone should be respected +statement ok +SET datafusion.execution.time_zone = 'UTC'; + +query P +SELECT to_timestamp('2020-09-08 13:42:29 +0000', '%Y-%m-%d %H:%M:%S %z'); +---- +2020-09-08T13:42:29 + +## Test 17: Test all precision variants respect timezone +statement ok +SET datafusion.execution.time_zone = 'America/New_York'; + +## to_timestamp_seconds +query P +SELECT to_timestamp_seconds('2020-09-08T13:42:29'); +---- +2020-09-08T17:42:29 + +## to_timestamp_millis +query P +SELECT to_timestamp_millis('2020-09-08T13:42:29.123'); +---- +2020-09-08T17:42:29.123 + +## to_timestamp_micros +query P +SELECT to_timestamp_micros('2020-09-08T13:42:29.123456'); +---- 
+2020-09-08T17:42:29.123456 + +## to_timestamp_nanos +query P +SELECT to_timestamp_nanos('2020-09-08T13:42:29.123456789'); +---- +2020-09-08T17:42:29.123456789 + +## Test 18: Reset timezone for other tests +statement ok +SET datafusion.execution.time_zone = 'UTC';