diff --git a/tests/test_base.py b/tests/test_base.py
index eefdc71..56ad940 100644
--- a/tests/test_base.py
+++ b/tests/test_base.py
@@ -29,7 +29,7 @@ def test_base_processing(self):
                 "text__°C__meteo": np.random.randn(24),
                 "hr__%hr__meteo": np.random.randn(24),
             },
-            index=pd.date_range("2024-12-05 00:00:00", freq="h", periods=24),
+            index=pd.date_range("2024-12-05 00:00:00", freq="h", periods=24, tz="UTC"),
         )

         dp = DumbProcessor(required_columns=["text__°C__meteo"], keep_required=False)
diff --git a/tests/test_math.py b/tests/test_math.py
index 94f3813..f98045e 100644
--- a/tests/test_math.py
+++ b/tests/test_math.py
@@ -8,7 +8,9 @@ def test_time_gradient(self):
         test = (
             pd.Series(
                 [0, 1, 2, 2, 2, 3],
-                index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6),
+                index=pd.date_range(
+                    "2009-01-01 00:00:00", freq="10s", periods=6, tz="UTC"
+                ),
                 name="cpt1",
             )
             * 3600
@@ -16,7 +18,7 @@ def test_time_gradient(self):

         ref = pd.DataFrame(
             {"cpt1": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0]},
-            index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6),
+            index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6, tz="UTC"),
         )

         to_test = time_gradient(test)
@@ -26,7 +28,9 @@ def test_time_gradient(self):
         test = (
             pd.DataFrame(
                 {"cpt1": [0, 1, 2, 2, 2, 3], "cpt2": [0, 1, 2, 2, 2, 3]},
-                index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6),
+                index=pd.date_range(
+                    "2009-01-01 00:00:00", freq="10s", periods=6, tz="UTC"
+                ),
             )
             * 3600
         )
@@ -36,7 +40,7 @@ def test_time_gradient(self):
                 "cpt1": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0],
                 "cpt2": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0],
             },
-            index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6),
+            index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6, tz="UTC"),
         )

         to_test = time_gradient(test)
@@ -46,7 +50,7 @@ def test_time_gradient(self):
     def test_time_integrate(self):
         test = pd.Series(
             [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0],
-            index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6),
+            index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6, tz="UTC"),
             name="cpt",
         )

@@ -59,7 +63,7 @@ def test_time_integrate(self):
                 "cpt1": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0],
                 "cpt2": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0],
             },
-            index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6),
+            index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6, tz="UTC"),
         )

         ref = pd.Series({"cpt1": 3.0, "cpt2": 3.0})
@@ -69,11 +73,11 @@ def test_time_integrate(self):
     def test_aggregate_time_series(self):
         sim_res = pd.DataFrame(
             {"a": [1, 2], "b": [3, 4]},
-            index=pd.date_range("2009-01-01", freq="h", periods=2),
+            index=pd.date_range("2009-01-01", freq="h", periods=2, tz="UTC"),
         )
         ref_df = pd.DataFrame(
             {"a": [1, 1], "b": [3, 4]},
-            index=pd.date_range("2009-01-01", freq="h", periods=2),
+            index=pd.date_range("2009-01-01", freq="h", periods=2, tz="UTC"),
         )

         expected_default = pd.Series([1.5, 3.5], index=["a", "b"])
diff --git a/tests/test_plot.py b/tests/test_plot.py
index 70cff7c..29ac4ef 100644
--- a/tests/test_plot.py
+++ b/tests/test_plot.py
@@ -52,7 +52,7 @@ def test_plot_gaps_heatmap(self):
                 "a": np.random.randn(24),
                 "b": np.random.randn(24),
             },
-            index=pd.date_range("2009", freq="h", periods=24),
+            index=pd.date_range("2009", freq="h", periods=24, tz="UTC"),
         )

         df.loc["2009-01-01 05:00:00":"2009-01-01 09:00:00", :] = np.nan
@@ -71,7 +71,7 @@ def test_add_multi_axis_scatter(self):
                 "b__W": np.random.randn(24) * 100,
                 "e__Wh": np.random.randn(24) * 100,
             },
- index=pd.date_range("2009", freq="h", periods=24), + index=pd.date_range("2009", freq="h", periods=24, tz="UTC"), ) df["e__Wh"] = abs(df).cumsum()["e__Wh"] @@ -94,14 +94,12 @@ def test_add_multi_axis_scatter(self): y_title_standoff=1, ) - assert True - def test_get_gaps_scatter_dict(self): np.random.seed(42) measure = pd.Series( np.random.randn(24), name="name", - index=pd.date_range("2009", freq="h", periods=24), + index=pd.date_range("2009", freq="h", periods=24, tz="UTC"), ) measure.loc["2009-01-01 02:00:00":"2009-01-01 05:00:00"] = np.nan @@ -114,10 +112,10 @@ def test_get_gaps_scatter_dict(self): assert gap_dict == [ { "x": [ - pd.Timestamp("2009-01-01 01:00:00"), - pd.Timestamp("2009-01-01 01:00:00"), - pd.Timestamp("2009-01-01 06:00:00"), - pd.Timestamp("2009-01-01 06:00:00"), + pd.Timestamp("2009-01-01 01:00:00", tz="UTC"), + pd.Timestamp("2009-01-01 01:00:00", tz="UTC"), + pd.Timestamp("2009-01-01 06:00:00", tz="UTC"), + pd.Timestamp("2009-01-01 06:00:00", tz="UTC"), ], "y": [ -1.913280244657798, @@ -133,10 +131,10 @@ def test_get_gaps_scatter_dict(self): }, { "x": [ - pd.Timestamp("2009-01-01 11:00:00"), - pd.Timestamp("2009-01-01 11:00:00"), - pd.Timestamp("2009-01-01 13:00:00"), - pd.Timestamp("2009-01-01 13:00:00"), + pd.Timestamp("2009-01-01 11:00:00", tz="UTC"), + pd.Timestamp("2009-01-01 11:00:00", tz="UTC"), + pd.Timestamp("2009-01-01 13:00:00", tz="UTC"), + pd.Timestamp("2009-01-01 13:00:00", tz="UTC"), ], "y": [ -1.913280244657798, diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py index 5612390..a924b90 100644 --- a/tests/test_plumbing.py +++ b/tests/test_plumbing.py @@ -24,7 +24,7 @@ "light__DIMENSIONLESS__building": [100, 200, 300], "mass_flwr__m3/h__hvac": [300, 500, 600], }, - index=pd.date_range("2009", freq="h", periods=3), + index=pd.date_range("2009", freq="h", periods=3, tz="UTC"), ) TEST_DF_2 = pd.DataFrame( @@ -33,7 +33,7 @@ "b__°C__zone_1": np.random.randn(24), "c__Wh__zone_2": np.random.randn(24) * 100, }, - index=pd.date_range("2009", freq="h", periods=24), + index=pd.date_range("2009", freq="h", periods=24, tz="UTC"), ) TEST_DF_2["c__Wh__zone_2"] = abs(TEST_DF_2).cumsum()["c__Wh__zone_2"] @@ -73,7 +73,7 @@ def test__get_all_data_step(self): test_df = TEST_DF.copy() test_df.iloc[1, 0] = np.nan test_df.iloc[0, 1] = np.nan - pipe = _get_pipe_from_proc_list(test_df.columns, PIPE_DICT["common"]) + pipe = _get_pipe_from_proc_list(test_df.columns, PIPE_DICT["common"], tz="UTC") res = pipe.fit_transform(test_df) @@ -86,6 +86,7 @@ def test__get_column_wise_transformer(self): col_trans = _get_column_wise_transformer( proc_dict=PIPE_DICT["pre_processing"], data_columns=TEST_DF.columns, + tz="UTC", process_name="test", ) @@ -99,6 +100,7 @@ def test__get_column_wise_transformer(self): data_columns=TEST_DF[ [col for col in TEST_DF.columns if col != "radiation__W/m2__outdoor"] ].columns, + tz="UTC", process_name="test", ) @@ -122,6 +124,7 @@ def test__get_column_wise_transformer(self): col_trans = _get_column_wise_transformer( proc_dict=PIPE_DICT["pre_processing"], data_columns=cols_none, + tz="UTC", process_name="test", ) diff --git a/tests/test_processing.py b/tests/test_processing.py index 88d0c64..bba5711 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -35,6 +35,7 @@ ProjectSolarRadOnSurfaces, FillOtherColumns, DropColumns, + ReplaceTag, ) RESOURCES_PATH = Path(__file__).parent / "resources" @@ -81,7 +82,7 @@ def mock_get_oikolab_df(**kwargs): class TestCustomTransformers: def test_pd_identity(self): df = pd.DataFrame( - {"a": 
[1.0]}, index=pd.date_range("2009", freq="h", periods=1) + {"a": [1.0]}, index=pd.date_range("2009", freq="h", periods=1, tz="UTC") ) identity = Identity() @@ -93,12 +94,12 @@ def test_pd_identity(self): def test_pd_replace_duplicated(self): df = pd.DataFrame( {"a": [1.0, 1.0, 2.0], "b": [3.0, np.nan, 3.0]}, - pd.date_range("2009-01-01", freq="h", periods=3), + pd.date_range("2009-01-01", freq="h", periods=3, tz="UTC"), ) res = pd.DataFrame( {"a": [1.0, np.nan, 2.0], "b": [3.0, np.nan, np.nan]}, - pd.date_range("2009-01-01", freq="h", periods=3), + pd.date_range("2009-01-01", freq="h", periods=3, tz="UTC"), ) rep_dup = ReplaceDuplicated(keep="first", value=np.nan) @@ -109,12 +110,12 @@ def test_pd_replace_duplicated(self): def test_pd_dropna(self): df = pd.DataFrame( {"a": [1.0, 2.0, np.nan], "b": [3.0, 4.0, 5.0]}, - index=pd.date_range("2009", freq="h", periods=3), + index=pd.date_range("2009", freq="h", periods=3, tz="UTC"), ) ref = pd.DataFrame( {"a": [1.0, 2.0], "b": [3.0, 4.0]}, - index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ) dropper = Dropna(how="any") @@ -126,7 +127,7 @@ def test_pd_dropna(self): def test_pd_rename_columns(self): df = pd.DataFrame( {"a": [1.0, 2.0, np.nan], "b": [3.0, 4.0, 5.0]}, - index=pd.date_range("2009", freq="h", periods=3), + index=pd.date_range("2009", freq="h", periods=3, tz="UTC"), ) new_cols = ["c", "d"] @@ -144,14 +145,16 @@ def test_pd_rename_columns(self): assert list(renamer.fit_transform(df).columns) == ["c", "a"] inversed = renamer.inverse_transform( - pd.DataFrame(np.zeros((2, 2)), pd.date_range("2009", freq="h", periods=2)) + pd.DataFrame( + np.zeros((2, 2)), pd.date_range("2009", freq="h", periods=2, tz="UTC") + ) ) assert list(inversed.columns) == ["c", "a"] def test_pd_sk_transformer(self): df = pd.DataFrame( {"a": [1.0, 2.0], "b": [3.0, 4.0]}, - index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ) scaler = SkTransform(StandardScaler()) @@ -159,7 +162,7 @@ def test_pd_sk_transformer(self): ref = pd.DataFrame( {"a": [-1.0, 1.0], "b": [-1.0, 1.0]}, - index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ) pd.testing.assert_frame_equal(to_test, ref) @@ -170,12 +173,12 @@ def test_pd_sk_transformer(self): def test_pd_replace_threshold(self): df = pd.DataFrame( {"col1": [1, 2, 3, np.nan, 4], "col2": [1, np.nan, np.nan, 4, 5]}, - index=pd.date_range("2009", freq="h", periods=5), + index=pd.date_range("2009", freq="h", periods=5, tz="UTC"), ) ref = pd.DataFrame( {"col1": [0.0, 2, 3, np.nan, 4], "col2": [0.0, np.nan, np.nan, 4, 5]}, - index=pd.date_range("2009", freq="h", periods=5), + index=pd.date_range("2009", freq="h", periods=5, tz="UTC"), ) dropper = ReplaceThreshold(lower=1.1, upper=5, value=0.0) @@ -190,7 +193,7 @@ def test_pd_replace_threshold(self): pd.testing.assert_frame_equal(dropper.transform(df), df) def test_pd_drop_time_gradient(self): - time_index = pd.date_range("2021-01-01 00:00:00", freq="h", periods=8) + time_index = pd.date_range("2021-01-01 00:00:00", freq="h", periods=8, tz="UTC") df = pd.DataFrame( { @@ -219,36 +222,52 @@ def test_pd_drop_time_gradient(self): def test_pd_apply_expression(self): df = pd.DataFrame( {"a": [1.0, 2.0], "b": [3.0, 4.0]}, - index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ) ref = pd.DataFrame( {"a": [2.0, 4.0], "b": [6.0, 8.0]}, - 
index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ) transformer = ApplyExpression("X * 2") pd.testing.assert_frame_equal(ref, transformer.fit_transform(df)) + df = pd.DataFrame( + {"a__W": [1.0, 2.0], "b__W": [3.0, 4.0]}, + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), + ) + + ref = pd.DataFrame( + {"a__kW": [0.001, 0.002], "b__kW": [0.003, 0.004]}, + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), + ) + + transformer = ApplyExpression("X / 1000", "kW") + + pd.testing.assert_frame_equal(ref, transformer.fit_transform(df)) + def test_pd_time_gradient(self): test = ( pd.DataFrame( - {"cpt1": [0, 1, 2, 2, 2, 3], "cpt2": [0, 1, 2, 2, 2, 3]}, - index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6), + {"cpt1__J": [0, 1, 2, 2, 2, 3], "cpt2__J": [0, 1, 2, 2, 2, 3]}, + index=pd.date_range( + "2009-01-01 00:00:00", freq="10s", periods=6, tz="UTC" + ), ) * 3600 ) ref = pd.DataFrame( { - "cpt1": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0], - "cpt2": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0], + "cpt1__W": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0], + "cpt2__W": [360.0, 360.0, 180.0, -5.68e-14, 180.0, 360.0], }, - index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6), + index=pd.date_range("2009-01-01 00:00:00", freq="10s", periods=6, tz="UTC"), ) - derivator = TimeGradient() + derivator = TimeGradient(new_unit="W") pd.testing.assert_frame_equal(ref, derivator.fit_transform(test), rtol=0.01) @@ -258,7 +277,7 @@ def test_pd_ffill(self): "cpt1": [0.0, np.nan, 2.0, 2.0, np.nan, np.nan], "cpt2": [0.0, 1.0, 2.0, 2.0, np.nan, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) ref = pd.DataFrame( @@ -266,7 +285,7 @@ def test_pd_ffill(self): "cpt1": [0.0, 0.0, 2.0, 2.0, 2.0, 2.0], "cpt2": [0.0, 1.0, 2.0, 2.0, 2.0, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) filler = Ffill() @@ -277,7 +296,7 @@ def test_pd_ffill(self): "cpt1": [0.0, 0.0, 2.0, 2.0, np.nan, np.nan], "cpt2": [0.0, 1.0, 2.0, 2.0, 2.0, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) filler = Ffill(gaps_lte="1h") @@ -289,7 +308,7 @@ def test_pd_bfill(self): "cpt1": [np.nan, np.nan, 2.0, 2.0, np.nan, 3.0], "cpt2": [0.0, 1.0, 2.0, 2.0, np.nan, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) ref = pd.DataFrame( @@ -297,7 +316,7 @@ def test_pd_bfill(self): "cpt1": [2.0, 2.0, 2.0, 2.0, 3.0, 3.0], "cpt2": [0.0, 1.0, 2.0, 2.0, 3.0, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) filler = Bfill() @@ -309,7 +328,7 @@ def test_pd_bfill(self): "cpt1": [np.nan, np.nan, 2.0, 2.0, 3.0, 3.0], "cpt2": [0.0, 1.0, 2.0, 2.0, 3.0, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) pd.testing.assert_frame_equal(ref, filler.fit_transform(test.copy())) @@ -319,7 +338,7 @@ def test_pd_fill_na(self): "cpt1": [0.0, np.nan, 2.0, 2.0, np.nan, np.nan], "cpt2": [0.0, 1.0, 2.0, 2.0, np.nan, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) ref = pd.DataFrame( @@ -327,7 +346,7 @@ def test_pd_fill_na(self): "cpt1": [0.0, 0.0, 2.0, 2.0, 
0.0, 0.0], "cpt2": [0.0, 1.0, 2.0, 2.0, 0.0, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) filler = FillNa(value=0.0) @@ -339,7 +358,7 @@ def test_pd_fill_na(self): "cpt1": [0.0, 0.0, 2.0, 2.0, np.nan, np.nan], "cpt2": [0.0, 1.0, 2.0, 2.0, 0.0, 3.0], }, - index=pd.date_range("2009", freq="h", periods=6), + index=pd.date_range("2009", freq="h", periods=6, tz="UTC"), ) pd.testing.assert_frame_equal(ref, filler.fit_transform(test.copy())) @@ -354,7 +373,7 @@ def test_resampler(self): "col2__°C": np.random.random(10), "col3": np.random.random(10) * 10, }, - index=pd.date_range("2009-01-01", freq="h", periods=10), + index=pd.date_range("2009-01-01", freq="h", periods=10, tz="UTC"), ).astype("float") ref = pd.DataFrame( @@ -364,7 +383,7 @@ def test_resampler(self): "col2__°C": [0.56239, 0.47789], "col3": [9.69910, 5.24756], }, - index=pd.date_range("2009-01-01 00:00:00", freq="5h", periods=2), + index=pd.date_range("2009-01-01 00:00:00", freq="5h", periods=2, tz="UTC"), ).astype("float") column_resampler = Resample( @@ -408,7 +427,7 @@ def test_pd_add_time_lag(self): "col0": np.arange(2), "col1": np.arange(2) * 10, }, - index=pd.date_range("2009-01-01", freq="h", periods=2), + index=pd.date_range("2009-01-01", freq="h", periods=2, tz="UTC"), ) ref = pd.DataFrame( @@ -419,7 +438,7 @@ def test_pd_add_time_lag(self): "1:00:00_col1": [0.0], }, index=pd.DatetimeIndex( - ["2009-01-01 01:00:00"], dtype="datetime64[ns]", freq="h" + ["2009-01-01 01:00:00"], dtype="datetime64[ns, UTC]", freq="h", tz="UTC" ), ) @@ -436,7 +455,7 @@ def test_pd_add_time_lag(self): def test_pd_gaussian_filter(self): df = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6]}, - index=pd.date_range("2009", freq="h", periods=3), + index=pd.date_range("2009", freq="h", periods=3, tz="UTC"), ) gfilter = GaussianFilter1D() @@ -456,7 +475,7 @@ def test_pd_gaussian_filter(self): def test_pd_combine_columns(self): x_in = pd.DataFrame( {"a__°C": [1, 2], "b__°C": [1, 2], "c": [1, 2]}, - index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ) trans = CombineColumns( @@ -470,7 +489,7 @@ def test_pd_combine_columns(self): trans.fit_transform(x_in.copy()), pd.DataFrame( {"c": [1, 2], "combined": [2, 4]}, - index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ), ) @@ -525,7 +544,7 @@ def test_pd_pd_interpolate(self): "data_1": np.arange(24).astype(float), "data_2": 2 * np.arange(24).astype(float), }, - index=pd.date_range("2009", freq="h", periods=24), + index=pd.date_range("2009", freq="h", periods=24, tz="UTC"), ) toy_holes = toy_df.copy() @@ -608,7 +627,7 @@ def test_pd_pd_interpolate(self): ) def test_pd_fill_gap(self): - index = pd.date_range("2009-01-01", "2009-12-31 23:00:00", freq="h") + index = pd.date_range("2009-01-01", "2009-12-31 23:00:00", freq="h", tz="UTC") cumsum_second = np.arange( start=0, stop=(index[-1] - index[0]).total_seconds() + 1, step=3600 ) @@ -626,15 +645,19 @@ def test_pd_fill_gap(self): holes_pairs = [ ("2009-06-14 12:00:00", "Temp_1"), ("2009-05-24", "Temp_1"), - (pd.date_range("2009-07-05", "2009-07-06", freq="h"), "Temp_1"), + (pd.date_range("2009-07-05", "2009-07-06", freq="h", tz="UTC"), "Temp_1"), ( - pd.date_range("2009-12-24 14:00:00", "2009-12-24 16:00:00", freq="h"), + pd.date_range( + "2009-12-24 14:00:00", "2009-12-24 16:00:00", freq="h", tz="UTC" + ), "Temp_1", ), ("2009-04-24", "Temp_2"), - 
(pd.date_range("2009-06-05", "2009-06-06", freq="h"), "Temp_2"), + (pd.date_range("2009-06-05", "2009-06-06", freq="h", tz="UTC"), "Temp_2"), ( - pd.date_range("2009-11-24 14:00:00", "2009-11-24 16:00:00", freq="h"), + pd.date_range( + "2009-11-24 14:00:00", "2009-11-24 16:00:00", freq="h", tz="UTC" + ), "Temp_2", ), ] @@ -651,8 +674,12 @@ def test_pd_fill_gap(self): assert r2_score(toy_df.loc[gap[0], gap[1]], res.loc[gap[0], gap[1]]) > 0.99 toy_df_15min = toy_df.resample("15min").mean().interpolate() - hole_backast = pd.date_range("2009-06-05", "2009-06-06 01:15:00", freq="15min") - hole_forecast = pd.date_range("2009-08-05", "2009-08-06 01:45:00", freq="15min") + hole_backast = pd.date_range( + "2009-06-05", "2009-06-06 01:15:00", freq="15min", tz="UTC" + ) + hole_forecast = pd.date_range( + "2009-08-05", "2009-08-06 01:45:00", freq="15min", tz="UTC" + ) toy_df_15min_hole = toy_df_15min.copy() toy_df_15min_hole.loc[hole_backast, "Temp_1"] = np.nan toy_df_15min_hole.loc[hole_forecast, "Temp_1"] = np.nan @@ -689,7 +716,7 @@ def test_combiner(self): "light__DIMENSIONLESS__building": [100, 200, 300], "mass_flwr__m3/h__hvac": [300, 500, 600], }, - index=pd.date_range("2009", freq="h", periods=3), + index=pd.date_range("2009", freq="h", periods=3, tz="UTC"), ) combiner = ExpressionCombine( @@ -779,7 +806,7 @@ def test_add_oiko_data(self, mock_get_oikolab): def test_add_solar_angles(self): df = pd.DataFrame( {"a": np.random.randn(24)}, - index=pd.date_range("2024-12-19", freq="h", periods=24), + index=pd.date_range("2024-12-19", freq="h", periods=24, tz="UTC"), ) sun_angle = AddSolarAngles() @@ -826,7 +853,7 @@ def test_fill_other_columns(self): "col_2": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], "col_1_fill": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], }, - index=pd.date_range("2009", freq="h", periods=10), + index=pd.date_range("2009", freq="h", periods=10, tz="UTC"), ) col_filler = FillOtherColumns(columns_map={"col_1": "col_1_fill"}) @@ -852,7 +879,7 @@ def test_fill_other_columns(self): def test_drop_columns(self): df = pd.DataFrame( {"a": [1, 2], "b": [1, 2], "c": [1, 2]}, - index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ) col_dropper = DropColumns() @@ -872,3 +899,18 @@ def test_drop_columns(self): res = col_dropper.transform(df.copy()) assert res.shape == (2, 0) + + def test_replace_tag(self): + df = pd.DataFrame( + {"energy_1__Wh": [1.0, 2.0], "energy_2__Whr__bloc": [3.0, 4.0]}, + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), + ) + + rep = ReplaceTag({"Whr": "Wh"}) + rep.fit_transform(df) + + assert list(rep.get_feature_names_out()) == [ + "energy_1__Wh", + "energy_2__Wh__bloc", + ] + assert list(df.columns) == ["energy_1__Wh", "energy_2__Wh__bloc"] diff --git a/tests/test_regressors.py b/tests/test_regressors.py index 58eac45..f0fc13e 100644 --- a/tests/test_regressors.py +++ b/tests/test_regressors.py @@ -8,7 +8,7 @@ class TestRegressors: def test_stl_forecaster(self): - index = pd.date_range("2009-01-01", "2009-12-31 23:00:00", freq="h") + index = pd.date_range("2009-01-01", "2009-12-31 23:00:00", freq="h", tz="UTC") cumsum_second = np.arange( start=0, stop=(index[-1] - index[0]).total_seconds() + 1, step=3600 ) diff --git a/tests/test_utils.py b/tests/test_utils.py index ce8e36d..c3ab1e0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -102,7 +102,7 @@ def test_get_data_level_names(self): def test_get_series_bloc(self): toy_sr = pd.Series( data=np.arange(24).astype(float), 
- index=pd.date_range("2009", freq="h", periods=24), + index=pd.date_range("2009", freq="h", periods=24, tz="UTC"), name="data_1", ) @@ -159,7 +159,7 @@ def test_get_series_bloc(self): # Get isolated gaps ser = pd.Series( [np.nan, 1, 2, np.nan, 3, 4, np.nan], - index=pd.date_range("2009", freq="h", periods=7), + index=pd.date_range("2009", freq="h", periods=7, tz="UTC"), ) res = get_series_bloc(ser, is_null=True) assert len(res) == 3 @@ -167,7 +167,7 @@ def test_get_series_bloc(self): # No gaps case ser = pd.Series( [0.0, 1.0, 2.0, 2.5, 3, 4, 5.0], - index=pd.date_range("2009", freq="h", periods=7), + index=pd.date_range("2009", freq="h", periods=7, tz="UTC"), ) res = get_series_bloc(ser, is_null=True) @@ -176,7 +176,7 @@ def test_get_series_bloc(self): # No gaps case ser = pd.Series( [0.0, 1.0, 2.0, np.nan, 3, 4, 5.0], - index=pd.date_range("2009", freq="h", periods=7), + index=pd.date_range("2009", freq="h", periods=7, tz="UTC"), ) res = get_series_bloc(ser, is_null=True) @@ -185,7 +185,7 @@ def test_get_series_bloc(self): def test_get_data_blocks(self): toy_df = pd.DataFrame( {"data_1": np.random.randn(24), "data_2": np.random.randn(24)}, - index=pd.date_range("2009-01-01", freq="h", periods=24), + index=pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC"), ) toy_df.loc["2009-01-01 01:00:00", "data_1"] = np.nan @@ -203,10 +203,11 @@ def test_get_data_blocks(self): res = get_data_blocks(toy_df, is_null=True) assert len(res["combination"]) == 3 pd.testing.assert_index_equal( - res["data_1"][0], pd.DatetimeIndex(["2009-01-01 01:00:00"]) + res["data_1"][0], pd.DatetimeIndex(["2009-01-01 01:00:00"], tz="UTC") ) pd.testing.assert_index_equal( - res["data_2"][0], pd.date_range("2009-01-01 15:00:00", freq="h", periods=9) + res["data_2"][0], + pd.date_range("2009-01-01 15:00:00", freq="h", periods=9, tz="UTC"), ) res = get_data_blocks(toy_df, is_null=True, lower_td_threshold="1h30min") @@ -217,7 +218,9 @@ def test_get_data_blocks(self): # CAREFUL !!! 
Remove timestamps to get indexes without frequency toy_df.drop( - pd.date_range("2009-01-01 02:00:00", "2009-01-01 04:00:00", freq="h"), + pd.date_range( + "2009-01-01 02:00:00", "2009-01-01 04:00:00", freq="h", tz="UTC" + ), axis=0, inplace=True, ) @@ -243,12 +246,12 @@ def test_get_data_blocks(self): assert res["data_1"] == [] def test_outer_timestamps(self): - ref_index = pd.date_range("2009-01-01", freq="d", periods=5) - idx = pd.date_range("2009-01-02", freq="d", periods=2) + ref_index = pd.date_range("2009-01-01", freq="d", periods=5, tz="UTC") + idx = pd.date_range("2009-01-02", freq="d", periods=2, tz="UTC") start, end = get_outer_timestamps(idx, ref_index) - assert start == pd.to_datetime("2009-01-01") - assert end == pd.to_datetime("2009-01-04") + assert start == pd.to_datetime("2009-01-01", utc=True) + assert end == pd.to_datetime("2009-01-04", utc=True) start, end = get_outer_timestamps(ref_index, ref_index) assert start == ref_index[0] @@ -257,7 +260,9 @@ def test_outer_timestamps(self): def test_timedelta_to_int(self): X = pd.DataFrame( {"a": np.arange(10 * 6 * 24)}, - index=pd.date_range(dt.datetime.now(), freq="10min", periods=10 * 6 * 24), + index=pd.date_range( + dt.datetime.now(), freq="10min", periods=10 * 6 * 24, tz="UTC" + ), ) assert timedelta_to_int("24h", X) == 144 diff --git a/tide/base.py b/tide/base.py index 9982a6b..1650ad5 100644 --- a/tide/base.py +++ b/tide/base.py @@ -19,6 +19,7 @@ get_data_blocks, get_idx_freq_delta_or_min_time_interval, ensure_list, + get_tag_levels, ) from tide.meteo import get_oikolab_df @@ -99,6 +100,25 @@ def __init__( self.removed_columns = removed_columns self.added_columns = added_columns + def get_set_tags_values_columns(self, X, tag_level: int, value: str): + nb_tags = get_tag_levels(X.columns) + if tag_level > nb_tags - 1: + raise ValueError( + f"Asking for level {tag_level} tag (indexing from 0). 
" + f"Only {nb_tags} tags found in columns" + ) + + new_columns = [] + for col in X.columns: + parts = col.split("__") + parts[tag_level] = value + new_columns.append("__".join(parts)) + + return new_columns + + def set_tags_values(self, X, tag_level: int, value: str): + X.columns = self.get_set_tags_values_columns(X, tag_level, value) + def check_features(self, X): if self.required_columns is not None: if not set(self.required_columns).issubset(X.columns): diff --git a/tide/plumbing.py b/tide/plumbing.py index 6064c36..3d5b2a9 100644 --- a/tide/plumbing.py +++ b/tide/plumbing.py @@ -23,16 +23,19 @@ import tide.processing as pc -def _dummy_df(columns): +def _dummy_df(columns, tz): return pd.DataFrame( data=np.ones((2, len(columns))), columns=columns, - index=pd.date_range("2009", freq="h", periods=2), + index=pd.date_range("2009", freq="h", periods=2, tz=tz), ) def _get_pipe_from_proc_list( - data_columns: pd.Index | list[str], proc_list: list, verbose: bool = False + data_columns: pd.Index | list[str], + proc_list: list, + tz: str | dt.timezone, + verbose: bool = False, ) -> Pipeline: proc_units = [ getattr(pc, proc[0])( @@ -43,13 +46,14 @@ def _get_pipe_from_proc_list( ] pipe = make_pipeline(*proc_units, verbose=verbose) pipe.set_output(transform="pandas") - pipe.fit(_dummy_df(data_columns)) + pipe.fit(_dummy_df(data_columns, tz)) return pipe def _get_column_wise_transformer( proc_dict, data_columns: pd.Index | list[str], + tz: str | dt.timezone, process_name: str = None, verbose: bool = False, ) -> ColumnTransformer | None: @@ -63,7 +67,7 @@ def _get_column_wise_transformer( col_trans_list.append( ( f"{process_name}->{name}" if process_name is not None else name, - _get_pipe_from_proc_list(requested_col, proc_list, verbose), + _get_pipe_from_proc_list(requested_col, proc_list, tz, verbose), requested_col, ) ) @@ -77,12 +81,15 @@ def _get_column_wise_transformer( verbose_feature_names_out=False, verbose=verbose, ).set_output(transform="pandas") - transformer.fit(_dummy_df(data_columns)) + transformer.fit(_dummy_df(data_columns, tz)) return transformer def get_pipeline_from_dict( - data_columns: pd.Index | list[str], pipe_dict: dict = None, verbose: bool = False + data_columns: pd.Index | list[str], + pipe_dict: dict = None, + tz: str | dt.timezone = "UTC", + verbose: bool = False, ): if pipe_dict is None: return Pipeline([("Identity", pc.Identity())], verbose=verbose) @@ -91,11 +98,11 @@ def get_pipeline_from_dict( step_columns = data_columns.copy() for step, op_conf in pipe_dict.items(): if isinstance(op_conf, list): - operation = _get_pipe_from_proc_list(step_columns, op_conf, verbose) + operation = _get_pipe_from_proc_list(step_columns, op_conf, tz, verbose) elif isinstance(op_conf, dict): operation = _get_column_wise_transformer( - op_conf, step_columns, step, verbose + op_conf, step_columns, tz, step, verbose ) else: @@ -103,7 +110,7 @@ def get_pipeline_from_dict( if operation is not None: steps_list.append((step, operation)) - step_columns = operation.get_feature_names_out() + step_columns = [str(feat) for feat in operation.get_feature_names_out()] return Pipeline(steps_list, verbose=verbose) @@ -134,7 +141,7 @@ def show(self, steps: None | str | list[str] | slice = slice(None)): if self.root is not None: self.root.show() elif self.data is not None: - pipe = self.get_pipeline(steps=steps).fit(self.data) + pipe = self.get_pipeline(steps=steps) data_columns_to_tree(pipe.get_feature_names_out()).show() def set_data(self, data: pd.Series | pd.DataFrame): @@ -163,7 +170,9 @@ def 
get_pipeline( selected_steps = pipe_named_keys[steps] dict_to_pipe = {key: self.pipe_dict[key] for key in selected_steps} - return get_pipeline_from_dict(selection, dict_to_pipe, verbose) + return get_pipeline_from_dict( + selection, dict_to_pipe, self.data.index.tz, verbose + ) def get_corrected_data( self, diff --git a/tide/processing.py b/tide/processing.py index 7498fd4..7f06930 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -15,6 +15,7 @@ check_and_return_dt_index_df, parse_request_to_col_names, ensure_list, + get_added_removed_col, ) from tide.regressors import SkSTLForecast from tide.classifiers import STLEDetector @@ -446,15 +447,26 @@ class ApplyExpression(BaseProcessing): """ - def __init__(self, expression): + def __init__(self, expression: str, new_unit: str = None): super().__init__() self.expression = expression + self.new_unit = new_unit def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): + if self.new_unit is not None: + self.new_cols_ = self.get_set_tags_values_columns( + X.copy(), 1, self.new_unit + ) + self.added_columns, self.removed_columns = get_added_removed_col( + X.columns, self.new_cols_ + ) return self def _transform_implementation(self, X: pd.Series | pd.DataFrame): - return eval(self.expression) + X = eval(self.expression) + if self.new_unit is not None: + X.columns = self.new_cols_ + return X class TimeGradient(BaseProcessing): @@ -483,16 +495,27 @@ class TimeGradient(BaseProcessing): """ - def __init__(self): + def __init__(self, new_unit: str = None): super().__init__() + self.new_unit = new_unit def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): + if self.new_unit is not None: + self.new_cols_ = self.get_set_tags_values_columns( + X.copy(), 1, self.new_unit + ) + self.added_columns, self.removed_columns = get_added_removed_col( + X.columns, self.new_cols_ + ) return self def _transform_implementation(self, X: pd.Series | pd.DataFrame): original_index = X.index.copy() derivative = time_gradient(X) - return derivative.reindex(original_index) + derivative.reindex(original_index) + if self.new_unit is not None: + derivative.columns = self.new_cols_ + return derivative class Ffill(BaseFiller, BaseProcessing): @@ -1666,3 +1689,36 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): if self.columns is not None else X ) + + +class ReplaceTag(BaseProcessing): + """ + Replaces Tide tag components with new values based on a specified mapping. + Tags are structured as strings separated by "__", typically following the format + "Name__unit__bloc__sub_bloc". + + Attributes: + tag_map (dict[str, str]): A dictionary mapping old tag substrings to new + tag substrings. 
+ + """ + + def __init__(self, tag_map: dict[str, str] = None): + self.tag_map = tag_map + BaseProcessing.__init__(self) + + def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): + self.new_columns_ = [] + for col in X.columns: + parts = col.split("__") + updated_parts = [self.tag_map.get(part, part) for part in parts] + self.new_columns_.append("__".join(updated_parts)) + pass + self.added_columns, self.removed_columns = get_added_removed_col( + X.columns, self.new_columns_ + ) + + def _transform_implementation(self, X: pd.Series | pd.DataFrame): + check_is_fitted(self, attributes=["new_columns_"]) + X.columns = self.new_columns_ + return X diff --git a/tide/utils.py b/tide/utils.py index 40da293..4f038bd 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -43,6 +43,12 @@ def __getitem__(self, key: str | list[str] | slice): raise TypeError("Invalid key type") +def get_added_removed_col(original_idx: list | pd.Index, new_idx: list | pd.Index): + added_columns = list(set(new_idx) - set(original_idx)) + removed_columns = list(set(original_idx) - set(new_idx)) + return added_columns, removed_columns + + def get_tag_levels(data_columns: pd.Index | list[str]) -> int: """ Returns max number of used tags from data columns names @@ -186,6 +192,9 @@ def check_and_return_dt_index_df(X: pd.Series | pd.DataFrame) -> pd.DataFrame: if not isinstance(X.index, pd.DatetimeIndex): raise ValueError("X index is not a pandas DateTime index") + if X.index.tz is None: + raise ValueError("X index must be tz_localized") + return X.to_frame() if isinstance(X, pd.Series) else X