diff --git a/demos/demo_cleaning.ipynb b/demos/demo_cleaning.ipynb
new file mode 100644
index 0000000..341156e
--- /dev/null
+++ b/demos/demo_cleaning.ipynb
@@ -0,0 +1,617 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ec779476-1923-4840-9fbb-a33b62a325ba",
+ "metadata": {},
+ "source": [
+ "### Step 1: Load the Titanic Dataset\n",
+ "We start by loading the Titanic dataset using Seaborn. \n",
+ "This dataset contains information about passengers such as age, class, fare, and survival status. \n",
+ "It’s a great example for demonstrating data cleaning because it includes missing values, categorical variables, and numerical features.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "cd31f646-5065-47d2-a0e8-65b8f3a9523a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Original shape: (891, 15)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " survived | \n",
+ " pclass | \n",
+ " sex | \n",
+ " age | \n",
+ " sibsp | \n",
+ " parch | \n",
+ " fare | \n",
+ " embarked | \n",
+ " class | \n",
+ " who | \n",
+ " adult_male | \n",
+ " deck | \n",
+ " embark_town | \n",
+ " alive | \n",
+ " alone | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7.2500 | \n",
+ " S | \n",
+ " Third | \n",
+ " man | \n",
+ " True | \n",
+ " NaN | \n",
+ " Southampton | \n",
+ " no | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 71.2833 | \n",
+ " C | \n",
+ " First | \n",
+ " woman | \n",
+ " False | \n",
+ " C | \n",
+ " Cherbourg | \n",
+ " yes | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.9250 | \n",
+ " S | \n",
+ " Third | \n",
+ " woman | \n",
+ " False | \n",
+ " NaN | \n",
+ " Southampton | \n",
+ " yes | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 53.1000 | \n",
+ " S | \n",
+ " First | \n",
+ " woman | \n",
+ " False | \n",
+ " C | \n",
+ " Southampton | \n",
+ " yes | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.0500 | \n",
+ " S | \n",
+ " Third | \n",
+ " man | \n",
+ " True | \n",
+ " NaN | \n",
+ " Southampton | \n",
+ " no | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " survived pclass sex age sibsp parch fare embarked class \\\n",
+ "0 0 3 male 22.0 1 0 7.2500 S Third \n",
+ "1 1 1 female 38.0 1 0 71.2833 C First \n",
+ "2 1 3 female 26.0 0 0 7.9250 S Third \n",
+ "3 1 1 female 35.0 1 0 53.1000 S First \n",
+ "4 0 3 male 35.0 0 0 8.0500 S Third \n",
+ "\n",
+ " who adult_male deck embark_town alive alone \n",
+ "0 man True NaN Southampton no False \n",
+ "1 woman False C Cherbourg yes False \n",
+ "2 woman False NaN Southampton yes True \n",
+ "3 woman False C Southampton yes False \n",
+ "4 man True NaN Southampton no True "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "from dskit import cleaning\n",
+ "\n",
+ "# Load Titanic dataset\n",
+ "df = sns.load_dataset(\"titanic\")\n",
+ "print(\"Original shape:\", df.shape)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "295b8b6a-adc0-41df-8959-24cbf6bffa2f",
+ "metadata": {},
+ "source": [
+ "### Step 2: Clean Column Names\n",
+ "Column names often contain spaces, uppercase letters, or special characters that make them hard to work with. \n",
+ "Using `rename_columns_auto`, we standardize them by:\n",
+ "- Converting to lowercase\n",
+ "- Replacing spaces with underscores\n",
+ "- Removing special characters \n",
+ "\n",
+ "This ensures consistency and makes coding easier."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "53765dd8-4e15-4178-956b-51873f72bcf1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Renamed Columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_cleaned = cleaning.rename_columns_auto(df)\n",
+ "print(\"Renamed Columns:\", df_cleaned.columns.tolist())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5f1bac70-4586-452d-bc5d-9178a58b1d9b",
+ "metadata": {},
+ "source": [
+ "### Step 3: Summarize Missing Values\n",
+ "Real-world datasets almost always have missing values. \n",
+ "The `missing_summary` function helps us quickly identify:\n",
+ "- How many missing values each column has\n",
+ "- The percentage of missing values relative to the dataset size \n",
+ "\n",
+ "This summary guides us in deciding how to handle missing data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "726620da-f368-4294-a848-c91b554c7879",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Missing Count | \n",
+ " Missing % | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | deck | \n",
+ " 688 | \n",
+ " 77.216611 | \n",
+ "
\n",
+ " \n",
+ " | age | \n",
+ " 177 | \n",
+ " 19.865320 | \n",
+ "
\n",
+ " \n",
+ " | embarked | \n",
+ " 2 | \n",
+ " 0.224467 | \n",
+ "
\n",
+ " \n",
+ " | embark_town | \n",
+ " 2 | \n",
+ " 0.224467 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Missing Count Missing %\n",
+ "deck 688 77.216611\n",
+ "age 177 19.865320\n",
+ "embarked 2 0.224467\n",
+ "embark_town 2 0.224467"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "missing_summary = cleaning.missing_summary(df_cleaned)\n",
+ "missing_summary"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f2cfd23a-a28c-4f77-9833-a0c43c9d62d4",
+ "metadata": {},
+ "source": [
+ "### Step 4: Fill Missing Values\n",
+ "Once we know where the missing values are, we need to handle them. \n",
+ "The `fill_missing` function provides multiple strategies:\n",
+ "- Mean or median for numeric columns\n",
+ "- Mode for categorical columns\n",
+ "- Forward/backward fill for sequential data\n",
+ "- Constant values if specified \n",
+ "\n",
+ "Here, we use the `auto` strategy, which intelligently chooses the best method for each column.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "58e8a23c-9bcb-4f79-b970-90b18f94bd14",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "survived 0\n",
+ "pclass 0\n",
+ "sex 0\n",
+ "age 0\n",
+ "sibsp 0\n",
+ "parch 0\n",
+ "fare 0\n",
+ "embarked 0\n",
+ "class 0\n",
+ "who 0\n",
+ "adult_male 0\n",
+ "deck 0\n",
+ "embark_town 0\n",
+ "alive 0\n",
+ "alone 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_filled = cleaning.fill_missing(df_cleaned, strategy=\"auto\")\n",
+ "df_filled.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "653b6962-5d99-4513-9dee-3d41e2a34345",
+ "metadata": {},
+ "source": [
+ "### Step 5: Detect Outliers\n",
+ "Outliers are extreme values that can distort analysis and models. \n",
+ "The `outlier_summary` function detects them using:\n",
+ "- IQR method: values below Q1 - 1.5×IQR or above Q3 + 1.5×IQR, where IQR is the interquartile range (Q3 - Q1)\n",
+ "- Z-score method: values more than 3 standard deviations from the mean \n",
+ "\n",
+ "This helps us understand which columns contain unusual values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "e9ed791a-460d-4f0b-a024-4c96f6ae247d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "parch 213\n",
+ "fare 116\n",
+ "age 66\n",
+ "sibsp 46\n",
+ "Name: Outlier Count, dtype: int64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "outliers = cleaning.outlier_summary(df_filled, method=\"iqr\")\n",
+ "outliers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "64cd5f52-b387-47b5-bd3e-065e93d941c2",
+ "metadata": {},
+ "source": [
+ "### Step 6: Remove Outliers\n",
+ "After detecting outliers, we can remove them to make the dataset more robust. \n",
+ "The `remove_outliers` function filters rows that fall outside the acceptable range. \n",
+ "This step reduces noise and improves the reliability of statistical analysis and machine learning models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "9231793a-18a2-494a-8250-2021623c3105",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Shape after removing outliers: (561, 15)\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_no_outliers = cleaning.remove_outliers(df_filled, method=\"iqr\")\n",
+ "print(\"Shape after removing outliers:\", df_no_outliers.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "22ac079f-d6e9-4b03-8fc1-f51880d8707f",
+ "metadata": {},
+ "source": [
+ "### Step 6a: Visualize Outlier Removal\n",
+ "To better understand the impact of outlier removal, we can compare the distribution of the **age** column before and after cleaning. \n",
+ "Histograms are a simple way to see how extreme values affect the overall shape of the data. \n",
+ "\n",
+ "- Before removal: The distribution may show long tails or unusual spikes caused by outliers. \n",
+ "- After removal: The distribution becomes smoother and more representative of the majority of passengers. \n",
+ "\n",
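+ "This visualization helps us check how outlier removal changes the distribution. Note that the step is aggressive on this dataset: it shrank from 891 rows to 561, so verify that the removed rows are not important before relying on the cleaned data."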
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "18856d9c-563d-4e4b-926d-b3524b2c1032",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "plt.figure(figsize=(12,5))\n",
+ "\n",
+ "#Before Outlier Removal\n",
+ "plt.subplot(1,2,1)\n",
+ "df_filled['age'].hist(bins=30, color='skyblue')\n",
+ "plt.title(\"Age Distribution (Before Outlier Removal)\")\n",
+ "plt.xlabel(\"Age\")\n",
+ "plt.ylabel(\"Count\")\n",
+ "\n",
+ "plt.subplot(1,2,2)\n",
+ "df_no_outliers['age'].hist(bins=30, color='salmon')\n",
+ "plt.title(\"Age Distribution (After Outlier Removal)\")\n",
+ "plt.xlabel(\"Age\")\n",
+ "plt.ylabel(\"Count\")\n",
+ "\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7c4d4d15-815d-4ff3-89f3-48a75d34d476",
+ "metadata": {},
+ "source": [
+ "### Step 7: Clean Text Columns\n",
+ "Text data often contains inconsistencies like uppercase letters, punctuation, or extra spaces. \n",
+ "The `simple_nlp_clean` function standardizes text by:\n",
+ "- Converting to lowercase\n",
+ "- Removing punctuation\n",
+ "- Stripping extra spaces \n",
+ "\n",
+ "This makes text columns easier to analyze and prepares them for NLP tasks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "ed5dfee1-acc2-4c71-9383-900350ac4fa3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " embark_town | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " southampton | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " southampton | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " southampton | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " southampton | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " queenstown | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " embark_town\n",
+ "0 southampton\n",
+ "2 southampton\n",
+ "3 southampton\n",
+ "4 southampton\n",
+ "5 queenstown"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_text_cleaned = cleaning.simple_nlp_clean(df_no_outliers, text_cols=[\"embark_town\"])\n",
+ "df_text_cleaned[[\"embark_town\"]].head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4b45d75a-1fe5-4c77-8aac-ae07c28db56a",
+ "metadata": {},
+ "source": [
+ "### Conclusion\n",
+ "In this demo, we applied the `cleaning.py` functions to the Titanic dataset. \n",
+ "We saw how to:\n",
+ "- Standardize column names\n",
+ "- Summarize and fill missing values\n",
+ "- Detect and remove outliers\n",
+ "- Clean text columns\n",
+ "\n",
+ "Together, these steps show how `dskit.cleaning` simplifies common data preprocessing tasks, making datasets ready for analysis or modeling.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/dskit/cleaning.py b/dskit/cleaning.py
index bb28ee5..c89c786 100644
--- a/dskit/cleaning.py
+++ b/dskit/cleaning.py
@@ -4,8 +4,38 @@
def fix_dtypes(df):
"""
- Auto-detects and converts column types.
+ Auto-detects and converts column types in a DataFrame.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ Input DataFrame whose column types need to be inferred and converted.
+
+ Returns
+ -------
+ pandas.DataFrame
+ A copy of the DataFrame with columns converted to appropriate types:
+ - Numeric if possible
+ - Datetime if possible
+ - Category if object dtype with low cardinality
+
+ Notes
+ -----
+ - Numeric conversion is attempted first, followed by datetime conversion.
+ - Object columns with unique values less than 50% of total rows are converted to category.
+ - Category conversion is heuristic-based; users should validate conversions for critical datasets.
+
+ Example
+ -------
+ >>> import pandas as pd
+ >>> from dskit.cleaning import fix_dtypes
+ >>> df = pd.DataFrame({"A": ["1", "2", "3"], "B": ["2020-01-01", "2020-01-02", "2020-01-03"]})
+ >>> fix_dtypes(df).dtypes
+ A int64
+ B datetime64[ns]
+ dtype: object
"""
+
df = df.copy()
for col in df.columns:
# Try converting to numeric
@@ -31,8 +61,39 @@ def fix_dtypes(df):
def rename_columns_auto(df):
"""
- Cleans column names: lowercase, replace spaces with underscores, remove special chars.
+ Automatically cleans DataFrame column names.
+
+ This function:
+ - Converts column names to lowercase
+ - Replaces spaces with underscores
+ - Removes special characters
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ Input DataFrame whose column names need cleaning.
+
+ Returns
+ -------
+ pandas.DataFrame
+ DataFrame with cleaned column names.
+
+ Raises
+ ------
+ TypeError
+ If input is not a pandas DataFrame.
+
+ Example
+ -------
+ >>> import pandas as pd
+ >>> from dskit.cleaning import rename_columns_auto
+ >>> df = pd.DataFrame(columns=["User Name", "Total$Amount"])
+ >>> rename_columns_auto(df).columns
+ Index(['user_name', 'totalamount'], dtype='object')
"""
+ if not isinstance(df, pd.DataFrame):
+ raise TypeError("Input must be a pandas DataFrame")
+
df = df.copy()
new_cols = []
for col in df.columns:
@@ -43,19 +104,82 @@ def rename_columns_auto(df):
df.columns = new_cols
return df
+
def replace_specials(df, chars_to_remove=r'[@#%$]', replacement=''):
"""
- Removes or replaces special characters from text columns.
+ Removes or replaces special characters from text columns in a DataFrame.
+
+ This function applies a regular expression replacement to all
+ object/string columns in the DataFrame.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ Input DataFrame containing text columns.
+ chars_to_remove : str, default=r'[@#%$]'
+ Regular expression pattern of characters to remove or replace.
+ replacement : str, default=''
+ String to replace the matched characters with.
+
+ Returns
+ -------
+ pandas.DataFrame
+ DataFrame with special characters removed or replaced in text columns.
+
+ Raises
+ ------
+ TypeError
+ If input is not a pandas DataFrame.
+
+ Example
+ -------
+ >>> import pandas as pd
+ >>> from dskit.cleaning import replace_specials
+ >>> df = pd.DataFrame({'text': ['Hello@World!', 'Price#$100']})
+ >>> replace_specials(df)
+ text
+ 0 HelloWorld!
+ 1 Price100
"""
+ if not isinstance(df, pd.DataFrame):
+ raise TypeError("Input must be a pandas DataFrame")
+
df = df.copy()
for col in df.select_dtypes(include=['object', 'string']).columns:
- df[col] = df[col].astype(str).str.replace(chars_to_remove, replacement, regex=True)
+ df[col] = df[col].astype(str).str.replace(
+ chars_to_remove, replacement, regex=True
+ )
return df
+
def missing_summary(df):
"""
- Returns a summary of missing values.
+ Generates a summary of missing values in a DataFrame.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ Input DataFrame.
+
+ Returns
+ -------
+ pandas.DataFrame
+ DataFrame with two columns:
+ - 'Missing Count': number of missing values per column
+ - 'Missing %': percentage of missing values per column
+ Only columns with missing values are included.
+
+ Example
+ -------
+ >>> import pandas as pd
+ >>> from dskit.cleaning import missing_summary
+ >>> df = pd.DataFrame({"A": [1, None, 3], "B": [None, None, 2]})
+ >>> missing_summary(df)
+ Missing Count Missing %
+ B 2 66.666667
+ A 1 33.333333
"""
+
missing = df.isnull().sum()
missing_percent = 100 * df.isnull().sum() / len(df)
summary = pd.concat([missing, missing_percent], axis=1, keys=['Missing Count', 'Missing %'])
@@ -63,9 +187,47 @@ def missing_summary(df):
def fill_missing(df, strategy='auto', fill_value=None):
"""
- Fills missing values.
- strategy: 'auto', 'mean', 'median', 'mode', 'ffill', 'bfill', 'constant'
+ Fills missing values in a DataFrame using various strategies.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ Input DataFrame.
+ strategy : str, default='auto'
+ Strategy for filling missing values:
+ - 'auto': mean for numeric, mode for non-numeric
+ - 'mean': fill with column mean
+ - 'median': fill with column median
+ - 'mode': fill with column mode
+ - 'ffill': forward fill
+ - 'bfill': backward fill
+ - 'constant': fill with a specified constant value
+ fill_value : any, optional
+ Value to use when strategy='constant'.
+
+ Returns
+ -------
+ pandas.DataFrame
+ DataFrame with missing values filled.
+
+ Example
+ -------
+ >>> import pandas as pd
+ >>> from dskit.cleaning import fill_missing
+ >>> df = pd.DataFrame({"A": [1, None, 3], "B": ["x", None, "y"]})
+ >>> fill_missing(df, strategy="auto")
+ A B
+ 0 1.0 x
+ 1 2.0 x
+ 2 3.0 y
+
+ Notes
+ -----
+ - If strategy='constant' and fill_value=None, missing values will remain unchanged.
+ - For 'auto', numeric columns use mean, non-numeric columns use mode.
+
"""
+
df = df.copy()
for col in df.columns:
@@ -97,8 +259,45 @@ def fill_missing(df, strategy='auto', fill_value=None):
def outlier_summary(df, method='iqr', threshold=1.5):
"""
- Returns a summary of outliers.
+ Summarizes the number of outliers in numeric columns of a DataFrame.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ Input DataFrame.
+ method : str, default='iqr'
+ Method for detecting outliers:
+ - 'iqr': Interquartile Range method
+ - 'zscore': Z-score method
+ threshold : float, default=1.5
+ Threshold multiplier for IQR method. For z-score, cutoff is fixed at 3.
+
+ Returns
+ -------
+ pandas.Series
+ Series with:
+ - Index: column names
+ - Values: outlier counts per column
+
+ Example
+ -------
+ Standard case:
+ >>> import pandas as pd
+ >>> from dskit.cleaning import outlier_summary
+ >>> df = pd.DataFrame({"A": [1, 2, 100, 3, 4]})
+ >>> outlier_summary(df)
+ A 1
+ Name: Outlier Count, dtype: int64
+
+ Edge case (no outliers):
+ >>> import pandas as pd
+ >>> from dskit.cleaning import outlier_summary
+ >>> df = pd.DataFrame({"A": [1, 2, 3, 4]}) # no outliers
+ >>> outlier_summary(df)
+ Series([], Name: Outlier Count, dtype: int64)
+
"""
+
summary = {}
numeric_cols = df.select_dtypes(include=[np.number]).columns
@@ -125,8 +324,37 @@ def outlier_summary(df, method='iqr', threshold=1.5):
def remove_outliers(df, method='iqr', threshold=1.5):
"""
- Removes rows with outliers.
+ Removes rows containing outliers from numeric columns.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ Input DataFrame.
+ method : str, default='iqr'
+ Method for detecting outliers:
+ - 'iqr': Interquartile Range method
+ - 'zscore': Z-score method
+ threshold : float, default=1.5
+ Threshold multiplier for IQR method. For z-score, cutoff is fixed at 3.
+
+ Returns
+ -------
+ pandas.DataFrame
+ DataFrame with outlier rows removed.
+
+ Example
+ -------
+ >>> import pandas as pd
+ >>> from dskit.cleaning import remove_outliers
+ >>> df = pd.DataFrame({"A": [1, 2, 100, 3, 4]})
+ >>> remove_outliers(df)
+ A
+ 0 1
+ 1 2
+ 3 3
+ 4 4
"""
+
df = df.copy()
numeric_cols = df.select_dtypes(include=[np.number]).columns
@@ -150,8 +378,46 @@ def remove_outliers(df, method='iqr', threshold=1.5):
def simple_nlp_clean(df, text_cols=None):
"""
- Basic text cleaning: lowercase, remove punctuation, remove extra spaces.
+ Performs basic text cleaning on specified columns.
+
+ Operations
+ ----------
+ - Converts text to lowercase
+ - Removes punctuation
+ - Removes extra spaces
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ Input DataFrame.
+ text_cols : list of str, optional
+ List of column names to clean. If None, all object/string columns are cleaned.
+
+ Returns
+ -------
+ pandas.DataFrame
+ DataFrame with cleaned text columns.
+
+ Example
+ -------
+ Standard case:
+ >>> import pandas as pd
+ >>> from dskit.cleaning import simple_nlp_clean
+ >>> df = pd.DataFrame({"text": ["Hello, World!!", "Python is GREAT"]})
+ >>> simple_nlp_clean(df)
+ text
+ 0 hello world
+ 1 python is great
+
+ Edge case (whitespace-only and missing values):
+ >>> df = pd.DataFrame({"text": [" ", None]})
+ >>> simple_nlp_clean(df)
+ text
+ 0
+ 1 none
+
"""
+
df = df.copy()
if text_cols is None:
text_cols = df.select_dtypes(include=['object', 'string']).columns
diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py
new file mode 100644
index 0000000..c16c6ae
--- /dev/null
+++ b/tests/test_cleaning.py
@@ -0,0 +1,22 @@
+import pandas as pd
+import pytest
+from dskit import cleaning
+
+def test_missing_summary_returns_dataframe():
+ df = pd.DataFrame({"A": [1, None, 3], "B": [None, None, 2]})
+ result = cleaning.missing_summary(df)
+ assert isinstance(result, pd.DataFrame)
+ assert list(result.columns) == ["Missing Count", "Missing %"]
+
+def test_outlier_summary_returns_series():
+ df = pd.DataFrame({"A": [1, 2, 100, 3, 4]})
+ result = cleaning.outlier_summary(df)
+ assert isinstance(result, pd.Series)
+ assert result.name == "Outlier Count"
+
+def test_fill_missing_auto_strategy():
+ df = pd.DataFrame({"A": [1, None, 3], "B": ["x", None, "y"]})
+ result = cleaning.fill_missing(df, strategy="auto")
+ # Check that missing values are filled
+ assert result["A"].isnull().sum() == 0
+ assert result["B"].isnull().sum() == 0