diff --git a/docs/sphinx/source/api.rst b/docs/sphinx/source/api.rst
index 8abb4a8a..9ec44c4f 100644
--- a/docs/sphinx/source/api.rst
+++ b/docs/sphinx/source/api.rst
@@ -133,8 +133,6 @@ Normalization
     normalize_with_expected_power
     normalize_with_pvwatts
     pvwatts_dc_power
-    delta_index
-    check_series_frequency
 
 
 Aggregation
diff --git a/docs/sphinx/source/changelog/pending.rst b/docs/sphinx/source/changelog/pending.rst
index 3341e4d6..bda6468f 100644
--- a/docs/sphinx/source/changelog/pending.rst
+++ b/docs/sphinx/source/changelog/pending.rst
@@ -2,8 +2,8 @@ Requirements
 ------------
 * Removed pvlib version restrictions in setup.py. Previously "pvlib >= 0.11.0, <0.12.0", now "pvlib".
 * Updated pvlib version in requirements.txt from 0.11.0 to 0.14.0
-* Added pandas upper version restriction in setup.py. Now "pandas >= 1.4.4, <3.0.0".
-* Added numpy upper version restriction in setup.py. Now "numpy >= 1.22.4, <2.3.0".
+* Removed pandas upper version restriction in setup.py. Now "pandas >= 1.4.4" to support pandas 3.0.
+* Removed numpy upper version restriction in setup.py. Now "numpy >= 1.22.4" to support numpy 2.x.
 * Updated pandas version in requirements.txt from 2.2.2 to 2.2.3 for python 3.13 compatibility.
 * Updated scipy version in requirements.txt from 1.13.1 to 1.14.1 for python 3.13 compatibility.
 * Updated h5py version in requirements.txt from 3.11.0 to 3.12.0 for python 3.13 compatibility.
@@ -29,6 +29,33 @@
 * Updated pytz version in requirements.txt from 2024.1 to 2025.2 for python 3.13 compatibility.
 
 
+Deprecations
+------------
+* Removed deprecated ``normalization.delta_index`` function (deprecated in v2.0.0).
+  The private ``_delta_index`` helper remains available for internal use.
+* Removed deprecated ``normalization.check_series_frequency`` function (deprecated in v2.0.0).
+  The private ``_check_series_frequency`` helper remains available for internal use.
+
+
+Bug Fixes
+---------
+* Fixed pandas 3.0 compatibility in ``normalization.py`` by using ``.total_seconds()``
+  instead of ``.view('int64')`` with hardcoded nanosecond divisors. Pandas 3.0 changed
+  the default datetime resolution from nanoseconds (``datetime64[ns]``) to microseconds
+  (``datetime64[us]``). Affected functions: ``_delta_index``, ``_t_step_nanoseconds``,
+  ``_aggregate``, ``_interpolate_series``.
+* Fixed datetime resolution preservation in ``normalization.interpolate()`` to ensure
+  output maintains the same resolution as input (e.g., ``datetime64[us]``).
+* Fixed numpy 2.x compatibility in ``soiling.py`` by using ``.item()`` and explicit
+  indexing to extract scalar values from numpy arrays, as implicit array-to-scalar
+  conversion is deprecated.
+* Fixed xgboost 3.x compatibility in ``filtering.xgboost_clip_filter()`` by using
+  ``xgb.DMatrix`` with explicit feature names for model prediction.
+* Fixed pandas 4.0 deprecation warnings by changing lowercase ``'d'`` to uppercase
+  ``'D'`` in Timedelta strings and using ``axis=`` keyword argument for DataFrame
+  aggregation methods.
+
+
 Enhancements
 ------------
 * Modified ``TrendAnalysis._filter()`` to allow ``clip_filter`` to use ``pv_energy``
diff --git a/rdtools/degradation.py b/rdtools/degradation.py
index 1698b368..28318a2b 100644
--- a/rdtools/degradation.py
+++ b/rdtools/degradation.py
@@ -261,7 +261,7 @@ def degradation_year_on_year(energy_normalized, recenter=True,
     # Auto center
     if recenter:
         start = energy_normalized.index[0]
-        oneyear = start + pd.Timedelta('364d')
+        oneyear = start + pd.Timedelta('364D')
         renorm = utilities.robust_median(energy_normalized[start:oneyear])
     else:
         renorm = 1.0
@@ -280,7 +280,7 @@ def degradation_year_on_year(energy_normalized, recenter=True,
         tolerance=pd.Timedelta('8D')
     )
 
-    df['time_diff_years'] = (df.dt - df.dt_right) / pd.Timedelta('365d')
+    df['time_diff_years'] = (df.dt - df.dt_right) / pd.Timedelta('365D')
     df['yoy'] = 100.0 * (df.energy - df.energy_right) / (df.time_diff_years)
 
     df.index = df.dt
diff --git a/rdtools/filtering.py b/rdtools/filtering.py
index 7c3759d4..ecc5162d 100644
--- a/rdtools/filtering.py
+++ b/rdtools/filtering.py
@@ -846,30 +846,31 @@ def xgboost_clip_filter(power_ac, mounting_type="fixed"):
         power_ac_df["mounting_config"] == "fixed", "mounting_config_bool"
     ] = 0
     # Subset the dataframe to only include model inputs
-    power_ac_df = power_ac_df[
-        [
-            "first_order_derivative_backward",
-            "first_order_derivative_forward",
-            "first_order_derivative_backward_rolling_avg",
-            "first_order_derivative_forward_rolling_avg",
-            "sampling_frequency",
-            "mounting_config_bool",
-            "scaled_value",
-            "rolling_average",
-            "daily_max",
-            "percent_daily_max",
-            "deriv_max",
-            "deriv_backward_rolling_stdev",
-            "deriv_backward_rolling_mean",
-            "deriv_backward_rolling_median",
-            "deriv_backward_rolling_min",
-            "deriv_backward_rolling_max",
-        ]
-    ].dropna()
+    feature_cols = [
+        "first_order_derivative_backward",
+        "first_order_derivative_forward",
+        "first_order_derivative_backward_rolling_avg",
+        "first_order_derivative_forward_rolling_avg",
+        "sampling_frequency",
+        "mounting_config_bool",
+        "scaled_value",
+        "rolling_average",
+        "daily_max",
+        "percent_daily_max",
+        "deriv_max",
+        "deriv_backward_rolling_stdev",
+        "deriv_backward_rolling_mean",
+        "deriv_backward_rolling_median",
+        "deriv_backward_rolling_min",
+        "deriv_backward_rolling_max",
+    ]
+    power_ac_df = power_ac_df[feature_cols].dropna()
     # Run the power_ac_df dataframe through the XGBoost ML model,
-    # and return boolean outputs
+    # and return boolean outputs. Use DMatrix with explicit feature names
+    # for xgboost 3.x compatibility.
+    dmatrix = xgb.DMatrix(power_ac_df, feature_names=feature_cols)
     xgb_predictions = pd.Series(
-        xgboost_clipping_model.predict(power_ac_df).astype(bool)
+        (xgboost_clipping_model.get_booster().predict(dmatrix) > 0.5).astype(bool)
     )
     # Add datetime as an index
     xgb_predictions.index = power_ac_df.index
diff --git a/rdtools/normalization.py b/rdtools/normalization.py
index 23784a04..8a2e02b1 100644
--- a/rdtools/normalization.py
+++ b/rdtools/normalization.py
@@ -4,7 +4,6 @@
 import numpy as np
 from scipy.optimize import minimize
 import warnings
-from rdtools._deprecation import deprecated
 
 
 class ConvergenceError(Exception):
@@ -175,44 +174,6 @@ def normalize_with_pvwatts(energy, pvwatts_kws):
     return energy_normalized, insolation
 
 
-def _delta_index(series):
-    '''
-    Takes a pandas series with a DatetimeIndex as input and
-    returns (time step sizes, average time step size) in hours
-
-    Parameters
-    ----------
-    series : pandas.Series
-        A pandas timeseries
-
-    Returns
-    -------
-    deltas : pandas.Series
-        A timeseries representing the timestep sizes of ``series``
-    mean : float
-        The average timestep
-    '''
-
-    if series.index.freq is None:
-        # If there is no frequency information, explicitly calculate interval
-        # sizes. Length of each interval calculated by using 'int64' to convert
-        # to nanoseconds.
-        hours = pd.Series(series.index.view('int64') / (10.0**9 * 3600.0))
-        hours.index = series.index
-        deltas = hours.diff()
-    else:
-        # If there is frequency information, pandas shift can be used to gain
-        # a meaningful interval for the first element of the timeseries
-        # Length of each interval calculated by using 'int64' to convert to
-        # nanoseconds.
-        deltas = (series.index - series.index.shift(-1)).view('int64') / \
-            (10.0**9 * 3600.0)
-    return deltas, np.mean(deltas[~np.isnan(deltas)])
-
-
-delta_index = deprecated('2.0.0', removal='3.0.0')(_delta_index)
-
-
 def irradiance_rescale(irrad, irrad_sim, max_iterations=100,
                        method='iterative', convergence_threshold=1e-6):
     '''
@@ -335,7 +296,36 @@ def _check_series_frequency(series, series_description):
     return freq
 
 
-check_series_frequency = deprecated('2.0.0', removal='3.0.0')(_check_series_frequency)
+def _delta_index(series):
+    '''
+    Takes a pandas series with a DatetimeIndex as input and
+    returns (time step sizes, average time step size) in hours.
+
+    Parameters
+    ----------
+    series : pandas.Series
+        A pandas timeseries
+
+    Returns
+    -------
+    deltas : pandas.Series
+        A timeseries representing the timestep sizes of ``series``
+    mean : float
+        The average timestep
+    '''
+    # Use total_seconds() for resolution-agnostic calculation (pandas 3.0+)
+    if series.index.freq is None:
+        # If there is no frequency information, explicitly calculate interval sizes
+        deltas = pd.Series(series.index).diff().dt.total_seconds() / 3600.0
+        deltas.index = series.index
+    else:
+        # If there is frequency information, pandas shift can be used to gain
+        # a meaningful interval for the first element of the timeseries
+        deltas = pd.Series(
+            (series.index - series.index.shift(-1)).total_seconds() / 3600.0,
+            index=series.index
+        )
+    return deltas, deltas.mean()
 
 
 def _t_step_nanoseconds(time_series):
@@ -343,9 +333,9 @@
     return a series of right labeled differences in the index of time_series
     in nanoseconds
     '''
-    t_steps = np.diff(time_series.index.view('int64')).astype('float')
-    t_steps = np.insert(t_steps, 0, np.nan)
-    t_steps = pd.Series(index=time_series.index, data=t_steps)
+    # Use total_seconds() for resolution-agnostic calculation (pandas 3.0+)
+    t_steps = pd.Series(time_series.index).diff().dt.total_seconds() * 1e9
+    t_steps.index = time_series.index
 
     return t_steps
 
@@ -485,17 +475,21 @@ def _aggregate(time_series, target_frequency, max_timedelta, series_type):
     union_index = time_series.index.union(output_dummy.index)
     time_series = time_series.dropna()
+    # Return NaN series if no valid data remains after dropna
+    if len(time_series) == 0:
+        return pd.Series(np.nan, index=output_dummy.index)
+
     values = time_series.values
 
     # Identify gaps (including from nans) bigger than max_time_delta
-    timestamps = time_series.index.view('int64')
-    timestamps = pd.Series(timestamps, index=time_series.index)
-    t_diffs = timestamps.diff()
+    # Use total_seconds() for resolution-agnostic calculation (pandas 3.0+)
+    t_diffs = pd.Series(time_series.index).diff().dt.total_seconds() * 1e9
+    t_diffs.index = time_series.index
 
     # Keep track of the gap size but with refilled NaNs and new
     # timestamps from target freq
     t_diffs = t_diffs.reindex(union_index, method='bfill')
 
-    max_interval_nanoseconds = max_timedelta.total_seconds() * 10.0**9
+    max_interval_nanoseconds = max_timedelta.total_seconds() * 1e9
     gap_mask = t_diffs > max_interval_nanoseconds
 
     if time_series.index[0] != union_index[0]:
@@ -503,8 +497,8 @@
         gap_mask[:time_series.index[0]] = True
 
     time_series = time_series.reindex(union_index)
-    t_diffs = np.diff(time_series.index.view('int64'))
-    t_diffs_hours = t_diffs / 10**9 / 3600.0
+    # Use total_seconds() for resolution-agnostic calculation
+    t_diffs_hours = pd.Series(time_series.index).diff().dt.total_seconds().values[1:] / 3600.0
 
     if series_type == 'instantaneous':  # interpolate with trapz sum
         time_series = time_series.interpolate(method='time')
@@ -574,39 +568,41 @@ def _interpolate_series(time_series, target_index, max_timedelta=None,
     df = pd.DataFrame(time_series)
     df = df.dropna()
 
-    # convert to integer index and calculate the size of gaps in input
-    timestamps = df.index.view("int64").copy()
+    # convert to numeric index (seconds since epoch) for interpolation
+    # Use total_seconds() for resolution-agnostic calculation (pandas 3.0+)
+    epoch = pd.Timestamp('1970-01-01', tz=df.index.tz)
+    timestamps = (df.index - epoch).total_seconds().values
     df["timestamp"] = timestamps
-    df["gapsize_ns"] = df["timestamp"].diff()
+    df["gapsize_s"] = df["timestamp"].diff()
     df.index = timestamps
-    valid_indput_index = df.index.copy()
+    valid_input_index = df.index.copy()
 
     if max_timedelta is None:
-        max_interval_nanoseconds = 2 * df['gapsize_ns'].median()
+        max_interval_seconds = 2 * df['gapsize_s'].median()
     else:
-        max_interval_nanoseconds = max_timedelta.total_seconds() * 10.0**9
+        max_interval_seconds = max_timedelta.total_seconds()
 
-    fraction_excluded = (df['gapsize_ns'] > max_interval_nanoseconds).mean()
+    fraction_excluded = (df['gapsize_s'] > max_interval_seconds).mean()
     if fraction_excluded > warning_threshold:
         warnings.warn("Fraction of excluded data "
                       f"({100*fraction_excluded:0.02f}%) "
                       "exceeded threshold", UserWarning)
 
-    # put data on index that includes both original and target indicies
-    target_timestamps = pd.Index(target_index.view('int64'))
+    # put data on index that includes both original and target indices
+    target_timestamps = pd.Index((target_index - epoch).total_seconds())
     union_index = df.index.append(target_timestamps)
     union_index = union_index.drop_duplicates(keep='first')
     df = df.reindex(union_index)
     df = df.sort_index()
 
     # calculate the gap size in the original data (timestamps)
-    df['gapsize_ns'] = df['gapsize_ns'].bfill()
-    df.loc[valid_indput_index, 'gapsize_ns'] = 0
+    df['gapsize_s'] = df['gapsize_s'].bfill()
+    df.loc[valid_input_index, 'gapsize_s'] = 0
 
     # perform the interpolation when the max gap size criterion is satisfied
-    df_valid = df[df['gapsize_ns'] <= max_interval_nanoseconds].copy()
+    df_valid = df[df['gapsize_s'] <= max_interval_seconds].copy()
 
     df_valid['interpolated_data'] = \
         df_valid['data'].interpolate(method='index')
@@ -615,8 +611,8 @@
     out = pd.Series(df['interpolated_data'])
     out = out.loc[target_timestamps]
     out.name = original_name
-    out.index = pd.to_datetime(out.index, utc=True).tz_convert(target_index.tz)
-    out = out.reindex(target_index)
+    # Convert seconds back to datetime, matching target_index
+    out.index = target_index
 
     return out
 
@@ -665,6 +661,11 @@ def interpolate(time_series, target, max_timedelta=None, warning_threshold=0.1):
         target_index = pd.date_range(time_series.index.min(),
                                      time_series.index.max(),
                                      freq=target)
+        # Preserve the input series' datetime resolution (e.g., 'us' vs 'ns')
+        if hasattr(time_series.index, 'unit'):
+            input_unit = time_series.index.unit
+            if hasattr(target_index, 'unit') and target_index.unit != input_unit:
+                target_index = target_index.as_unit(input_unit)
 
     if (time_series.index.tz is None) ^ (target_index.tz is None):
         raise ValueError('Either time_series or target is time-zone aware but '
diff --git a/rdtools/soiling.py b/rdtools/soiling.py
index 92a3bfb8..e8524ff4 100644
--- a/rdtools/soiling.py
+++ b/rdtools/soiling.py
@@ -1510,8 +1510,8 @@ def iterative_signal_decomposition(
                   '{:.3e}'.format(adf_res[1]))
 
     # Check size of soiling signal vs residuals
-    SR_amp = float(np.diff(df_out.soiling_ratio.quantile([.1, .9])))
-    residuals_amp = float(np.diff(df_out.residuals.quantile([.1, .9])))
+    SR_amp = float(np.diff(df_out.soiling_ratio.quantile([.1, .9]))[0])
+    residuals_amp = float(np.diff(df_out.residuals.quantile([.1, .9]))[0])
     soiling_signal_strength = SR_amp / residuals_amp
     if soiling_signal_strength < soiling_significance:
         if verbose:
@@ -1889,11 +1889,11 @@ def run_bootstrap(self,
         # Save best estimate and bootstrapped estimates of SR and soiling rates
         df_out.soiling_ratio = df_out.soiling_ratio.clip(lower=0, upper=1)
         df_out.loc[df_out.soiling_ratio.diff() == 0, 'soiling_rates'] = 0
-        df_out['bt_soiling_ratio'] = (concat_SR * weights).sum(1)
-        df_out['bt_soiling_rates'] = (concat_r_s * weights).sum(1)
+        df_out['bt_soiling_ratio'] = (concat_SR * weights).sum(axis=1)
+        df_out['bt_soiling_rates'] = (concat_r_s * weights).sum(axis=1)
 
         # Set probability of cleaning events
-        df_out.cleaning_events = (concat_ce * weights).sum(1)
+        df_out.cleaning_events = (concat_ce * weights).sum(axis=1)
 
         # Find degradation rates
         self.degradation = [np.dot(bt_deg, weights),
@@ -1908,7 +1908,7 @@
                             np.quantile(bt_SL, ci_high_edge)]
 
         # Save "confidence intervals" for seasonal component
-        df_out.seasonal_component = (seasonal_samples * weights).sum(1)
+        df_out.seasonal_component = (seasonal_samples * weights).sum(axis=1)
         df_out['seasonal_low'] = seasonal_samples.quantile(ci_low_edge, 1)
         df_out['seasonal_high'] = seasonal_samples.quantile(ci_high_edge, 1)
 
@@ -2216,7 +2216,7 @@ def _set_control_input(self, f, rolling_median_local, index,
             # The median zs of the week after the cleaning event
             z_med = rolling_median_local[HW+3]
             # Set control input this future median
-            u[0] = z_med - np.dot(f.H, np.dot(f.F, f.x))
+            u[0] = z_med - np.dot(f.H, np.dot(f.F, f.x)).item()
             # If the change is bigger than the measurement noise:
             if np.abs(u[0]) > np.sqrt(f.R)/2:
                 index_dummy = [n+3 for n in range(window_size-HW-1)
@@ -2534,7 +2534,7 @@ def _make_seasonal_samples(list_of_SCs, sample_nr=10, min_multiplier=0.5,
            year=signal.index.year
        ).pivot(index='doy', columns='year', values='values')
        # We will use the median signal through all the years...
-       median_signal = year_matrix.median(1)
+       median_signal = year_matrix.median(axis=1)
        for j in range(sample_nr):
            # Generate random multiplier and phase shift
            multiplier = np.random.uniform(min_multiplier, max_multiplier)
@@ -2578,7 +2578,7 @@ def _force_periodicity(in_signal, signal_index, out_index):
         year_matrix[year] = \
             signal.loc[str(year)].reindex(dates_in_year).values[:365]
     # We will use the median signal through all the years...
-    median_signal = year_matrix.median(1)
+    median_signal = year_matrix.median(axis=1)
     # The output is the median signal broadcasted to the whole time series
     output = pd.Series(
         index=out_index,
diff --git a/rdtools/test/degradation_test.py b/rdtools/test/degradation_test.py
index 4e92a1f1..06626267 100644
--- a/rdtools/test/degradation_test.py
+++ b/rdtools/test/degradation_test.py
@@ -34,7 +34,7 @@ def get_corr_energy(cls, rd, input_freq):
         freq = input_freq
 
         x = pd.date_range(start=start, end=end, freq=freq)
-        day_deltas = (x - x[0]) / pd.Timedelta('1d')
+        day_deltas = (x - x[0]) / pd.Timedelta('1D')
         noise = (np.random.rand(len(day_deltas)) - 0.5) / 1e3
         y = 1 + daily_rd * day_deltas + noise
 
diff --git a/rdtools/test/normalization_pvwatts_test.py b/rdtools/test/normalization_pvwatts_test.py
index 893d29a3..4018d98a 100644
--- a/rdtools/test/normalization_pvwatts_test.py
+++ b/rdtools/test/normalization_pvwatts_test.py
@@ -41,7 +41,7 @@ def setUp(self):
                                      periods=12, freq='MS')
 
         power_meas = 19.75  # power in dummy conditions
-        hours = (energy_index - energy_index.shift(-1)).view('int64') / (10.0**9 * 3600.0)
+        hours = (energy_index - energy_index.shift(-1)).total_seconds() / 3600.0
         dummy_energy = hours * power_meas
         self.energy = pd.Series(dummy_energy, index=energy_index)
 
diff --git a/setup.py b/setup.py
index 8ee58b75..bc46f979 100755
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@
 ]
 
 TESTS_REQUIRE = [
-    "pytest >= 3.6.3",
+    "pytest >= 3.10.1",
    "pytest-cov",
    "coverage",
    "flake8",
@@ -51,8 +51,8 @@
     "h5py >= 3.7.0",
     "plotly>=4.0.0",
     "xgboost >= 1.6.0",
-    "pvlib",
-    "scikit-learn",
+    "pvlib >= 0.12.0",
+    "scikit-learn >= 1.1.3, != 1.6.0",
     "arch >= 5.0",
     "filterpy >= 1.4.2",
 ]
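
Reviewer note (illustration only, not part of the patch): the recurring change in this diff swaps nanosecond arithmetic via .view('int64') for .total_seconds(), which does not depend on the index's datetime64 resolution. A minimal sketch of the pattern, using a hypothetical 30-minute series invented purely for illustration:

    import pandas as pd

    # Hypothetical series used only to show the old vs. new timestep calculation
    idx = pd.date_range("2024-01-01", periods=4, freq="30min")
    series = pd.Series(1.0, index=idx)

    # Old pattern (assumes datetime64[ns]; breaks for other resolutions):
    #     hours = pd.Series(series.index.view('int64') / (10.0**9 * 3600.0))

    # Resolution-agnostic pattern used throughout the patch:
    deltas_hours = pd.Series(series.index).diff().dt.total_seconds() / 3600.0
    deltas_hours.index = series.index
    print(deltas_hours)  # NaN for the first stamp, then 0.5 (hours) per step

The same idea underlies the changes in _delta_index, _t_step_nanoseconds, _aggregate, and _interpolate_series above; only the unit conversion factor (hours, seconds, or nanoseconds) differs per call site.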