Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions tests/test_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,44 +483,57 @@ def test_pd_gaussian_filter(self):

def test_pd_combine_columns(self):
x_in = pd.DataFrame(
{"a__°C": [1, 2], "b__°C": [1, 2], "c": [1, 2]},
{"a__°C": [1, 2], "b__°C": [2, 4]},
index=pd.date_range("2009", freq="h", periods=2, tz="UTC"),
)

trans = CombineColumns(
function=np.sum,
columns=["a__°C", "b__°C"],
function_kwargs={"axis": 1},
function="sum",
drop_columns=True,
)

res = trans.fit_transform(x_in.copy())
pd.testing.assert_frame_equal(
res,
pd.DataFrame(
{"c": [1, 2], "combined": [2, 4]},
{"combined": [3, 6]},
index=pd.date_range("2009", freq="h", periods=2, tz="UTC"),
),
)

trans = CombineColumns(
function="mean",
drop_columns=True,
)

res = trans.fit_transform(x_in.copy())
pd.testing.assert_frame_equal(
res,
pd.DataFrame(
{"combined": [1.5, 3]},
index=pd.date_range("2009", freq="h", periods=2, tz="UTC"),
),
)

check_feature_names_out(trans, res)

ref = x_in.copy()
ref["combined"] = [2, 4]
trans.set_params(drop_columns=False)
res = trans.fit_transform(x_in)
ref["combined"] = [1.8, 3.6]
trans.set_params(function="average", weights=[1, 4], drop_columns=False)
res = trans.fit_transform(x_in.copy())
pd.testing.assert_frame_equal(res, ref)
check_feature_names_out(trans, res)

ref["combined_2"] = [2, 4]
ref = x_in.copy()
ref["combined_2"] = [5, 10]
trans = CombineColumns(
function=np.sum,
tide_format_columns="°C",
function_kwargs={"axis": 1},
function="dot",
weights=[1, 2],
drop_columns=False,
result_column_name="combined_2",
)

res = trans.fit_transform(x_in)
res = trans.fit_transform(x_in.copy())
pd.testing.assert_frame_equal(res, ref)
check_feature_names_out(trans, res)

Expand Down
2 changes: 1 addition & 1 deletion tide/meteo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"surface_thermal_radiation": "surface_thermal_radiation (W/m^2)",
"total_cloud_cover": "total_cloud_cover (0-1)",
"total_precipitation": "total_precipitation (mm of water equivalent)",
"wind_direction": "wind_direction (deg)"
"wind_direction": "wind_direction (deg)",
}


Expand Down
88 changes: 34 additions & 54 deletions tide/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from tide.classifiers import STLEDetector
from tide.meteo import sun_position, beam_component, sky_diffuse, ground_diffuse

FUNCTION_MAP = {"mean": np.mean, "average": np.average, "sum": np.sum, "dot": np.dot}

MODEL_MAP = {"STL": SkSTLForecast, "Prophet": SkProphet}

OIKOLAB_DEFAULT_MAP = {
Expand Down Expand Up @@ -916,83 +918,61 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class CombineColumns(BaseProcessing):
"""
A class that combines multiple columns in a pandas DataFrame using a specified
function.
A class that combines multiple columns in a pandas DataFrame using mean, sum,
average, or dot. Original columns can be dropped.

Parameters
----------
function (callable or None): A function or method to apply for combining
columns.
tide_format_columns str: Tide request format. Columns are determined using
tide columns format name__unit__bloc. It override the columns attribute
columns (list or None): A list of column names to combine.
If None, all columns will be combined.

function_kwargs (dict or None): Additional keyword arguments to pass to the
combining function.
drop_columns (bool): If True, the original columns to combine will be dropped
from the DataFrame. If False, the original columns will be retained.
label_name (str): The name of the new column that will store the combined
values.

Attributes
----------
columns : list
The column names of the input DataFrame.
index : pandas.Index
The index of the input DataFrame.

Methods
-------
get_feature_names_out(input_features=None)
Get output feature names for the transformed data.
fit(X, y=None)
Fit the transformer to the input data.
transform(X, y=None)
Transform the input data by applying the function
function (str): The name of the function to apply for combining columns.
Valide names are "mean", "sum", "average", "dot".
weights (list[float | int] or np.ndarray, optional): Weights to apply when
using 'average' or 'dot'. Ignored for functions like 'mean' or 'sum'.
drop_columns (bool): If True, the original columns used for combining will
be dropped from the DataFrame. If False, they will be retained.
result_column_name (str): The name of the new column that will store the
combined values.
"""

def __init__(
self,
function: Callable,
tide_format_columns: str = None,
columns=None,
function_kwargs: dict = {},
function: str,
weights: list[float | int] | np.ndarray = None,
drop_columns: bool = False,
result_column_name: str = "combined",
):
BaseProcessing.__init__(self)
self.function = function
self.tide_format_columns = tide_format_columns
self.columns = columns
self.function_kwargs = function_kwargs
self.weights = weights
self.drop_columns = drop_columns
self.result_column_name = result_column_name

def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None):
if self.columns is None and self.tide_format_columns is None:
raise ValueError("Provide at least one of columns or tide_format_columns")

self.required_columns = (
parse_request_to_col_names(X.columns, self.tide_format_columns)
if self.tide_format_columns
else self.columns
)
self.fit_check_features(X)
if self.drop_columns:
self.feature_names_out_ = list(X.columns.drop(self.required_columns))
if self.result_column_name in self.feature_names_out_:
self.method_ = FUNCTION_MAP[self.function]
if self.function in ["mean", "sum"] and self.weights is not None:
raise ValueError(
f"label_name {self.result_column_name} already in X columns. "
f"It cannot be overwritten"
f"Weights have been provided, but {self.function} "
f"cannot use it. Use one of 'average' or 'dot'"
)

if self.drop_columns:
self.feature_names_out_ = []

self.feature_names_out_.append(self.result_column_name)

def _transform_implementation(self, X: pd.Series | pd.DataFrame):
check_is_fitted(self, attributes=["feature_names_in_", "feature_names_out_"])
X[self.result_column_name] = self.function(
X[self.required_columns], **self.function_kwargs
check_is_fitted(
self, attributes=["feature_names_in_", "feature_names_out_", "method_"]
)

if self.function == "average":
X[self.result_column_name] = self.method_(X, axis=1, weights=self.weights)
elif self.function == "dot":
X[self.result_column_name] = self.method_(X, self.weights)

else:
X[self.result_column_name] = self.method_(X, axis=1)

return X[self.feature_names_out_]


Expand Down