diff --git a/.gitignore b/.gitignore index b65434b..a1203fd 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,9 @@ venv/ .DS_Store Thumbs.db -/reports/*[!.gitkeep] \ No newline at end of file +/reports/*[!.gitkeep]ignore_subscriptions.txt + +# ignore customized ignores file +!ignore_subscriptions.example.txt +ignore_subscriptions.txt + diff --git a/README.md b/README.md index 0d87b17..b98937a 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,172 @@ + # Subscription Finder Python Script + + This Python script is designed to help users find and manage their subscriptions. + + ## Setup + + To set up the project, follow these steps: -1. **Clone the repository:** - ```bash - git clone - cd - ``` - -2. **Create a virtual environment:** - - Windows CMD: - ```cmd - python -m venv venv - venv\Scripts\activate - ``` - - Bash: - ```bash - python3 -m venv venv - source venv/bin/activate - ``` - -3. **Install the dependencies:** - ```bash - pip install -r requirements.txt - ``` + + +1. **Clone the repository:** + +```bash + +git clone + +cd + +``` + + + +2. **Create a virtual environment:** + +Windows CMD: + +```cmd + +python -m venv venv + +venv\Scripts\activate + +``` + + + +Bash: + +```bash + +python3 -m venv venv + +source venv/bin/activate + +``` + +3. **Install the dependencies:** + +```bash + +pip install -r requirements.txt + +``` + + + +## CSV File Format + + + +The script expects a CSV file containing transaction data. It automatically identifies and maps column headers to standard names, supporting various linguistic and formatting differences. + + + +The essential columns and their recognized variations are: + + + +- **Date**: (`date`, `datum`, `fecha`, `data`) - The date of the transaction. + +- **Description**: (`description`, `desc`, `descripción`, `bezeichnung`, `opis`, `payee`) - A textual description of the transaction or vendor. + +- **Amount**: (`amount`, `amt`, `importe`, `betrag`, `kwota`, `sum`, `outflow`) - The transaction amount. 
Note: the script handles currency symbols and different decimal/thousands separators. + + + +The script also supports automatic language detection for column headers and will translate them to English before processing. + + ## Usage + + To use the script, run the following command: + + ```bash -python interpret.py reports/financial_reports.csv + +python interpret.py [options] + +``` + + + +**Example:** + +```bash + +python interpret.py reports/financial_reports.csv --recency-days 120 --threshold 0.2 + +``` + + + +### Command-line Arguments + + + +| Argument | Short | Default | Description | +| :--- | :--- | :--- | :--- | +| `file_path` | | | Path to the CSV file to analyze (Required). | +| `--threshold` | `-t` | `0.15` | Percentage threshold (0.0-1.0) for clustering similar transaction amounts. | +| `--recency-days` | `-r` | `90` | Number of days from the latest transaction date to consider a subscription "active". | +| `--min-transaction-amount` | | `10.0` | Minimum absolute transaction amount to consider. | +| `--max-transaction-amount` | | `10000.0` | Maximum absolute transaction amount to consider. | +| `--ignore-file` | | `ignore_subscriptions.txt` | Path to a text file containing vendor names to ignore. | +| `--debug` | `-d` | `False` | Enable verbose debug output. | + + + +### Ignoring Vendors + + + +You can exclude specific vendors or transactions by adding their names to a text file (default: `ignore_subscriptions.txt`). + +- One vendor per line. + +- Supports partial matching (e.g., "Grocery" will ignore "Joe's Grocery Store"). + +- Case-insensitive. + + + +Example `ignore_subscriptions.txt`: + +```text + +Whole Foods + +Starbucks + +One-time transfer + ``` -Replace `reports/financial_reports.csv` with the path to your CSV file. + + +## How It Works + + + +1. **Parses & Normalizes:** Reads the CSV, detects column names automatically (multilingual support), and normalizes vendor descriptions (removes location data, special characters, etc.). + +2. 
**Fuzzy Matching:** Groups similar vendor names together (e.g., "Netflix.com" and "Netflix Inc") using sequence matching logic. + +3. **Ignores:** Filters out vendors listed in the ignore file. + +4. **Clusters Amounts:** Groups transactions from the same vendor that have similar amounts (within the specified `--threshold`) to handle small price variations or currency fluctuations. This also helps separate recurring payments from one-off outliers (like a large downpayment vs. a monthly fee). + +5. **Identifies Candidates:** Filters for recurring transactions (count > 1) that fall within the specified amount range and recency window. + +6. **Reports:** specific details about the potential subscriptions found, sorted by estimated yearly cost. \ No newline at end of file diff --git a/ignore_subscriptions.example.txt b/ignore_subscriptions.example.txt new file mode 100644 index 0000000..b9f6c5e --- /dev/null +++ b/ignore_subscriptions.example.txt @@ -0,0 +1,8 @@ +# Add exact or partial vendor names to ignore (case-insensitive) +# One entry per line +Whole Foods +Trader Joe's +Safeway +Publix +Walmart +Target \ No newline at end of file diff --git a/ignore_subscriptions.txt b/ignore_subscriptions.txt new file mode 100644 index 0000000..2cf3995 --- /dev/null +++ b/ignore_subscriptions.txt @@ -0,0 +1,2 @@ +Kroger +SPOTTY DOG ICE CREAM diff --git a/interpret.py b/interpret.py index 0e12b22..e3f2db1 100644 --- a/interpret.py +++ b/interpret.py @@ -1,99 +1,218 @@ -from utils import clean_amount, translate_column_names, unify_column_names, standard_columns +from utils import clean_amount, translate_column_names, unify_column_names, standard_columns, normalize_description import sys import pandas as pd +import numpy as np +import argparse +import difflib -# Load the CSV file from first argument -file_path = sys.argv[1] +# Parse command-line arguments +parser = argparse.ArgumentParser(description='Analyze CSV for subscription candidates.') +parser.add_argument('file_path', 
def load_ignore_patterns(ignore_file_path):
    """Read vendor ignore patterns from a text file.

    Each non-blank line that does not start with ``#`` is passed through
    ``normalize_description`` so that patterns and transaction descriptions
    are compared in the same normalized form.

    Args:
        ignore_file_path: Path to the ignore file (one vendor per line).

    Returns:
        A list of normalized, non-empty patterns. The list is empty when the
        file does not exist.
    """
    patterns = []
    try:
        # 'utf-8-sig' transparently drops a leading BOM, which would otherwise
        # hide a '#' at the start of the first line from the comment check.
        with open(ignore_file_path, 'r', encoding='utf-8-sig') as f:
            for raw_line in f:
                entry = raw_line.strip()
                if not entry or entry.startswith('#'):
                    continue
                pattern = normalize_description(entry)
                # An entry made only of punctuation normalizes to ''. Keeping
                # it would add an empty alternative to the combined regex,
                # which matches (and therefore ignores) every transaction.
                if pattern:
                    patterns.append(pattern)
    except FileNotFoundError:
        pass  # A missing ignore file simply means "ignore nothing".
    return patterns
def merge_similar_descriptions(df, threshold=0.7):
    """Collapse near-duplicate vendor descriptions onto one representative.

    Unique descriptions are visited shortest-first, so the shortest spelling
    of a vendor becomes the representative (e.g. "TRUIST" rather than
    "TRUIST LN ..."). A description is merged into an existing representative
    when it starts with that representative followed by a space, or when the
    ``difflib`` similarity ratio between the two exceeds ``threshold``.

    Args:
        df: DataFrame with a 'Description' column. The column is rewritten
            in place.
        threshold: Similarity ratio (0.0-1.0) that must be exceeded for a
            fuzzy match.

    Returns:
        The same DataFrame object, with 'Description' replaced by the
        representative of each description. Values that take no part in the
        matching (NaN, empty strings, non-string values) are left unchanged.
    """
    if df.empty:
        return df

    # Only non-empty strings can be compared; anything else is passed through
    # untouched by the final map() below.
    candidates = sorted(
        (d for d in df['Description'].dropna().unique()
         if isinstance(d, str) and d),
        key=len,
    )

    mapping = {}
    reps = []
    # SequenceMatcher caches its analysis of the second sequence, so a single
    # matcher is reused: seq2 is set once per description, seq1 per candidate.
    matcher = difflib.SequenceMatcher(None)

    for desc in candidates:
        matcher.set_seq2(desc)
        match = None

        for rep in reps:
            # Check 1: prefix match (strong signal), e.g. "TRUIST" vs "TRUIST LN".
            if desc.startswith(rep + " "):
                match = rep
                break

            # Check 2: fuzzy match. ratio = 2*M / (len(a) + len(b)) and M can
            # be at most len(rep) because rep is never longer than desc, so a
            # pure length test rules out hopeless pairs cheaply.
            if 2 * len(rep) / (len(rep) + len(desc)) <= threshold:
                continue

            matcher.set_seq1(rep)
            # quick_ratio() is an upper bound on ratio(); skip the expensive
            # computation when even the bound cannot exceed the threshold.
            if matcher.quick_ratio() <= threshold:
                continue
            if matcher.ratio() > threshold:
                match = rep
                break

        if match is None:
            reps.append(desc)
            match = desc
        mapping[desc] = match

    # Fall back to the original value for anything without a mapping entry so
    # that empty strings are not silently turned into NaN.
    df['Description'] = df['Description'].map(lambda d: mapping.get(d, d))
    return df
def get_subscription_candidates(df, groupby=None):
    """Aggregate transactions into recurring-payment candidates.

    Args:
        df: DataFrame with 'Description', 'Amount' and 'Date' columns.
        groupby: Column name or list of column names to group by. Defaults to
            ``['Description']``.

    Returns:
        A DataFrame with one row per group that has more than one
        transaction. Columns are the grouping key(s) plus
        'Transaction_Count', 'Total_Spent', 'First_Transaction' and
        'Last_Transaction'. When 'Amount' is not one of the grouping keys,
        an 'Amount' column holding the group's mean amount is included
        (placed after 'Total_Spent').
    """
    if groupby is None:
        keys = ['Description']
    elif isinstance(groupby, str):
        keys = [groupby]
    else:
        keys = list(groupby)

    # Named aggregation yields flat, explicitly named columns, so the result
    # does not have to be relabelled by guessing from the column count.
    candidates = df.groupby(keys).agg(
        Transaction_Count=('Amount', 'count'),
        Total_Spent=('Amount', 'sum'),
        Avg_Amount=('Amount', 'mean'),
        First_Transaction=('Date', 'min'),
        Last_Transaction=('Date', 'max'),
    ).reset_index()

    if 'Amount' in keys:
        # 'Amount' is already present as a grouping key; within a group every
        # amount equals that key, so the mean carries no extra information.
        candidates = candidates.drop(columns=['Avg_Amount'])
    else:
        candidates = candidates.rename(columns={'Avg_Amount': 'Amount'})

    # A single transaction cannot be a recurring payment.
    return candidates[candidates['Transaction_Count'] > 1]
case) + # Actually YNAB: Outflow is positive number. Inflow is positive number. + # If we mapped Outflow to Amount, we have positive numbers. + # We need negative numbers for the filter logic below. + df['Amount'] = df['Amount'].apply(lambda x: -abs(x) if x > 0 else x) + # Example: Handle missing values df.dropna(subset=['Description', 'Amount'], inplace=True) + + # Normalize descriptions + df['Description'] = df['Description'].apply(normalize_description) + + # Merge similar descriptions (fuzzy matching) + df = merge_similar_descriptions(df) + # Load ignore patterns and filter + ignore_patterns = load_ignore_patterns(args.ignore_file) + df = filter_ignored_vendors(df, ignore_patterns) + + # Cluster amounts within each Description group to isolate outliers + if not df.empty: + try: + df['Amount'] = df.groupby('Description')['Amount'].transform(_cluster_amounts_series, threshold=args.threshold) + except Exception as e: + if args.debug: + print(f"Error during amount clustering: {e}") subscription_candidates = get_subscription_candidates(df, groupby=['Description', 'Amount']) subscription_candidates['First_Transaction'] = pd.to_datetime(subscription_candidates['First_Transaction']) @@ -113,12 +257,27 @@ def get_subscription_candidates(df, groupby=['Description']): subscription_candidates = subscription_candidates[(subscription_candidates['Avg_Days_Between_Transactions'] > 25) & (subscription_candidates['Avg_Days_Between_Transactions'] < 35)] - subscription_candidates = subscription_candidates[(subscription_candidates['Amount'] < -10) & (subscription_candidates['Amount'] > -1000)] + subscription_candidates = subscription_candidates[ + (subscription_candidates['Amount'] < -args.min_transaction_amount) & + (subscription_candidates['Amount'] > -args.max_transaction_amount) + ] + # Calculate yearly cost + subscription_candidates['Yearly_Cost'] = subscription_candidates['Amount'] * 12 + + # Filter by recency + if not df['Date'].empty: + max_date = df['Date'].max() + 
def clean_amount(amount):
    """Normalize a textual money amount to a plain decimal string.

    Currency symbols, spaces and other non-numeric characters are removed,
    and thousands/decimal separators are resolved so that the result can be
    handed to a numeric parser.

    Separator rules:
      * Both ',' and '.' present: the one occurring last is the decimal
        separator, the other is a thousands separator.
      * Several ',' (or several '.') and no other separator: they are
        thousands separators, since a number has at most one decimal mark.
      * A single ',': treated as the decimal separator (European style).
      * A single '.': left as the decimal separator.

    Args:
        amount: The raw cell value.

    Returns:
        The cleaned string, or ``amount`` unchanged when it is not a string.
    """
    if not isinstance(amount, str):
        return amount

    # Some exports use the typographic minus sign (U+2212); map it to '-'
    # before stripping, otherwise the sign would be silently lost.
    amount = amount.replace('\u2212', '-')
    # Keep only digits, the minus sign, comma and dot.
    amount = re.sub(r'[^\d,.-]', '', amount)

    comma_count = amount.count(',')
    dot_count = amount.count('.')

    if comma_count and dot_count:
        if amount.rfind(',') > amount.rfind('.'):
            # European format: 1.234,56 -> 1234.56
            amount = amount.replace('.', '').replace(',', '.')
        else:
            # US format: 1,234.56 -> 1234.56
            amount = amount.replace(',', '')
    elif comma_count > 1:
        # 1,234,567 -> 1234567
        amount = amount.replace(',', '')
    elif comma_count == 1:
        # 12,50 -> 12.50
        amount = amount.replace(',', '.')
    elif dot_count > 1:
        # 1.234.567 -> 1234567
        amount = amount.replace('.', '')

    return amount


def normalize_description(desc):
    """Normalize a transaction description for vendor comparison.

    The text is upper-cased, a leading "TRANSFER :" prefix is removed, a
    trailing location suffix such as ", FL, USA" is cut off, every character
    that is neither a word character nor whitespace is replaced by a space,
    and runs of whitespace are collapsed.

    Args:
        desc: The raw description.

    Returns:
        The normalized string, or ``desc`` unchanged when it is not a string.
    """
    if not isinstance(desc, str):
        return desc

    desc = desc.upper()

    # Remove "Transfer : " prefix
    desc = re.sub(r'^TRANSFER\s*:\s*', '', desc)

    # Remove location info like ", FL, USA". The \b after the two letters
    # requires a complete two-letter token, so ", INC" or ", PRIME VIDEO" is
    # not mistaken for a state code and the rest of the name is kept.
    desc = re.sub(r',\s*[A-Z]{2}\b(?:,\s*USA\b)?.*$', '', desc)

    desc = re.sub(r'[^\w\s]', ' ', desc)  # Replace special chars with space
    desc = re.sub(r'\s+', ' ', desc).strip()

    return desc