From e46934ddf255ab013cb89206c6a7747d602b54d4 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 09:40:40 -0500 Subject: [PATCH 01/29] added a few more words for the description/amount --- utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils.py b/utils.py index 6954ccf..928adb1 100644 --- a/utils.py +++ b/utils.py @@ -3,8 +3,8 @@ standard_columns = { 'Date': ['date', 'datum', 'fecha', 'data'], - 'Description': ['description', 'desc', 'descripción', 'bezeichnung', 'opis'], - 'Amount': ['amount', 'amt', 'importe', 'betrag', 'kwota', 'sum'], + 'Description': ['description', 'desc', 'descripción', 'bezeichnung', 'opis', 'payee'], + 'Amount': ['amount', 'amt', 'importe', 'betrag', 'kwota', 'sum', 'outflow',], # 'Category': ['category', 'kategorie', 'categoría', 'kategorie', 'kategoria'], # Add other standard columns and their variations } From 003560320b9dc0dfff2e09d051075ddced7174d8 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 09:41:03 -0500 Subject: [PATCH 02/29] exit if no heading row found --- interpret.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/interpret.py b/interpret.py index 0e12b22..94054f9 100644 --- a/interpret.py +++ b/interpret.py @@ -31,6 +31,9 @@ def get_subscription_candidates(df, groupby=['Description', 'Amount']): df = pd.read_csv(file_path, skiprows=start_row, sep=';', index_col=False,) else: print("No valid data header found in the file.") + print(start_row) + print("Exiting.") + exit(1) # Example: Translate column names if not df.empty: From fde0a9868b82ef373542644faf85bfbecf293aef Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 09:50:11 -0500 Subject: [PATCH 03/29] fix(utils): enhance clean_amount to handle US/EU currency formats --- utils.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/utils.py b/utils.py index 928adb1..b6496b3 100644 --- a/utils.py +++ b/utils.py @@ -23,10 +23,23 @@ def 
escape_special_chars(text): return re.escape(text) def clean_amount(amount): - # Remove currency symbols and any non-numeric characters except for the minus sign and comma - amount = re.sub(r'[^\d,-]', '', amount) - # Replace comma with dot - amount = amount.replace(',', '.') + if not isinstance(amount, str): + return amount + # Remove currency symbols and any non-numeric characters except for the minus sign, comma, and dot + amount = re.sub(r'[^\d,.-]', '', amount) + + if ',' in amount and '.' in amount: + # If both are present, assume the last one is the decimal separator + if amount.rfind(',') > amount.rfind('.'): + # European format: 1.234,56 -> 1234.56 + amount = amount.replace('.', '').replace(',', '.') + else: + # US format: 1,234.56 -> 1234.56 + amount = amount.replace(',', '') + elif ',' in amount: + # Assume comma is decimal separator (European) + amount = amount.replace(',', '.') + return amount def map_columns_with_prefix_suffix(columns, standard_columns): From f2df348c694becff9445402e2ba344997ca7bdda Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 09:50:17 -0500 Subject: [PATCH 04/29] feat(interpret): improve data parsing and support for YNAB exports - Refactor find_data_start to use dynamic standard_columns - Update read_csv to use comma separator for YNAB compatibility - Enable auto-language detection for column translation - Add logic to negate Outflow amounts - Remove duplicate code blocks --- interpret.py | 82 ++++++++++++---------------------------------------- 1 file changed, 18 insertions(+), 64 deletions(-) diff --git a/interpret.py b/interpret.py index 94054f9..85db609 100644 --- a/interpret.py +++ b/interpret.py @@ -8,9 +8,10 @@ def find_data_start(file_path): with open(file_path, 'r', encoding='utf-8') as file: for i, line in enumerate(file): - if 'Data' in line and 'Opis' in line and 'Kwota' in line: - return i - if 'Date' in line and 'Description' in line and 'Amount' in line: + line_lower = line.lower() + if (any(kw in 
line_lower for kw in standard_columns['Date']) and + any(kw in line_lower for kw in standard_columns['Description']) and + any(kw in line_lower for kw in standard_columns['Amount'])): return i return None @@ -28,7 +29,7 @@ def get_subscription_candidates(df, groupby=['Description', 'Amount']): print(f"Offseting by {start_row} rows.") if start_row is not None: - df = pd.read_csv(file_path, skiprows=start_row, sep=';', index_col=False,) + df = pd.read_csv(file_path, skiprows=start_row, sep=',', index_col=False,) else: print("No valid data header found in the file.") print(start_row) @@ -37,66 +38,11 @@ def get_subscription_candidates(df, groupby=['Description', 'Amount']): # Example: Translate column names if not df.empty: - df.columns = translate_column_names(df.columns, src_lang='pl') - df = unify_column_names(df, standard_columns) - - # Example: Convert 'Date' column to datetime - df['Date'] = pd.to_datetime(df['Date'], errors='coerce') - df['Amount'] = pd.to_numeric(df['Amount'].apply(clean_amount), errors='coerce') - - # Example: Handle missing values - df.dropna(subset=['Description', 'Amount'], inplace=True) - - - subscription_candidates = get_subscription_candidates(df, groupby=['Description', 'Amount']) - subscription_candidates['First_Transaction'] = pd.to_datetime(subscription_candidates['First_Transaction']) - subscription_candidates['Last_Transaction'] = pd.to_datetime(subscription_candidates['Last_Transaction']) - subscription_candidates['Total_Days'] = (subscription_candidates['Last_Transaction'] - subscription_candidates['First_Transaction']).dt.days - subscription_candidates['Avg_Days_Between_Transactions'] = subscription_candidates['Total_Days'] / (subscription_candidates['Transaction_Count'] - 1) - - subscription_candidates = subscription_candidates[(subscription_candidates['Avg_Days_Between_Transactions'] > 25) & (subscription_candidates['Avg_Days_Between_Transactions'] < 35)] - - subscription_candidates = 
subscription_candidates[(subscription_candidates['Amount'] < -10) & (subscription_candidates['Amount'] > -1000)] - - print("Number of potential subscriptions:", len(subscription_candidates)) - - # Display potential subscriptions - print(subscription_candidates[['Description', 'Amount', 'Last_Transaction', 'Transaction_Count']].sort_values('Last_Transaction', ascending=False)) -else: - print("Dataframe is empty.") -# print(df.head()) + # Check if 'Outflow' exists before translation/unification + is_outflow_present = any(col.lower() == 'outflow' for col in df.columns) -def find_data_start(file_path): - with open(file_path, 'r', encoding='utf-8') as file: - for i, line in enumerate(file): - if 'Data' in line and 'Opis' in line and 'Kwota' in line: - return i - if 'Date' in line and 'Description' in line and 'Amount' in line: - return i - return None - -def get_subscription_candidates(df, groupby=['Description']): - subscription_candidates = df.groupby(groupby).agg({ - 'Amount': ['count', 'sum'], - 'Date': ['min', 'max'] - }).reset_index() - subscription_candidates.columns = ['Description', 'Amount', 'Transaction_Count', 'Total_Spent', 'First_Transaction', 'Last_Transaction'] - subscription_candidates = subscription_candidates[subscription_candidates['Transaction_Count'] > 1] - return subscription_candidates - - -start_row = find_data_start(file_path) -print(f"Offseting by {start_row} rows.") - -if start_row is not None: - df = pd.read_csv(file_path, skiprows=start_row, sep=';', index_col=False,) -else: - print("No valid data header found in the file.") - -# Example: Translate column names -if not df.empty: - df.columns = translate_column_names(df.columns, src_lang='pl') + df.columns = translate_column_names(df.columns, src_lang='auto') df = unify_column_names(df, standard_columns) # Example: Convert 'Date' column to datetime @@ -104,6 +50,14 @@ def get_subscription_candidates(df, groupby=['Description']): df['Amount'] = pd.to_numeric(df['Amount'].apply(clean_amount), 
errors='coerce') + if is_outflow_present: + # Outflow is usually positive, but we want negative for expenses + # Only invert positive values (income/refunds in Outflow column would be negative in YNAB but let's assume simple case) + # Actually YNAB: Outflow is positive number. Inflow is positive number. + # If we mapped Outflow to Amount, we have positive numbers. + # We need negative numbers for the filter logic below. + df['Amount'] = df['Amount'].apply(lambda x: -abs(x) if x > 0 else x) + # Example: Handle missing values df.dropna(subset=['Description', 'Amount'], inplace=True) @@ -121,7 +75,7 @@ def get_subscription_candidates(df, groupby=['Description']): print("Number of potential subscriptions:", len(subscription_candidates)) # Display potential subscriptions - print(subscription_candidates[['Description', 'Amount', 'Transaction_Count', 'Total_Spent', 'Last_Transaction']].sort_values('Last_Transaction', ascending=False)) + print(subscription_candidates[['Description', 'Amount', 'Last_Transaction', 'Transaction_Count']].sort_values('Last_Transaction', ascending=False)) else: print("Dataframe is empty.") -# print(df.head()) +# print(df.head()) \ No newline at end of file From c99b2baed8e5e6d6171ab93afe947d62e1ecd33e Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 09:51:49 -0500 Subject: [PATCH 05/29] feat(interpret): sort subscriptions by estimated yearly cost - Calculate 'Yearly_Cost' based on monthly amount * 12 - Sort output by 'Yearly_Cost' ascending (most expensive expenses first) --- interpret.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/interpret.py b/interpret.py index 85db609..78ee623 100644 --- a/interpret.py +++ b/interpret.py @@ -72,10 +72,13 @@ def get_subscription_candidates(df, groupby=['Description', 'Amount']): subscription_candidates = subscription_candidates[(subscription_candidates['Amount'] < -10) & (subscription_candidates['Amount'] > -1000)] + # Calculate yearly cost + 
subscription_candidates['Yearly_Cost'] = subscription_candidates['Amount'] * 12 + print("Number of potential subscriptions:", len(subscription_candidates)) # Display potential subscriptions - print(subscription_candidates[['Description', 'Amount', 'Last_Transaction', 'Transaction_Count']].sort_values('Last_Transaction', ascending=False)) + print(subscription_candidates[['Description', 'Amount', 'Yearly_Cost', 'Last_Transaction', 'Transaction_Count']].sort_values('Yearly_Cost', ascending=True)) else: print("Dataframe is empty.") # print(df.head()) \ No newline at end of file From 9fb8c58b590069df91e28c3588092b7ecdf446f2 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 09:56:20 -0500 Subject: [PATCH 06/29] feat(interpret): cluster similar subscription amounts - Add 'cluster_amounts' function to group transaction amounts within 10% similarity - Apply clustering before identifying subscriptions - Import numpy for mean calculation - Allows detecting price-adjusted subscriptions (e.g. 
price hikes) as a single subscription --- interpret.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/interpret.py b/interpret.py index 78ee623..8d5bf38 100644 --- a/interpret.py +++ b/interpret.py @@ -1,6 +1,7 @@ from utils import clean_amount, translate_column_names, unify_column_names, standard_columns import sys import pandas as pd +import numpy as np # Load the CSV file from first argument file_path = sys.argv[1] @@ -15,6 +16,51 @@ def find_data_start(file_path): return i return None +def cluster_amounts(group, threshold=0.10): + # group is a DataFrame subset (for one Description) + # We want to return the group with 'Amount' updated to the cluster mean + + if len(group) < 2: + return group + + # Sort by Amount to ensure deterministic processing + sorted_group = group.sort_values('Amount') + amounts = sorted_group['Amount'].values + + clusters = [] # List of [values] + if len(amounts) > 0: + current_cluster = [amounts[0]] + + for val in amounts[1:]: + ref = current_cluster[0] + # Avoid division by zero + if ref == 0: + if val == 0: + current_cluster.append(val) + else: + clusters.append(current_cluster) + current_cluster = [val] + continue + + # Calculate percentage difference + diff = abs((val - ref) / ref) + + if diff <= threshold: + current_cluster.append(val) + else: + clusters.append(current_cluster) + current_cluster = [val] + clusters.append(current_cluster) + + # Build a list of new amounts matching the sorted order + new_amounts = [] + for cluster in clusters: + mean_val = np.mean(cluster) + new_amounts.extend([mean_val] * len(cluster)) + + sorted_group['Amount'] = new_amounts + return sorted_group + def get_subscription_candidates(df, groupby=['Description', 'Amount']): subscription_candidates = df.groupby(groupby).agg({ 'Amount': ['count', 'sum'], @@ -61,6 +107,15 @@ def get_subscription_candidates(df, groupby=['Description', 'Amount']): # Example: Handle missing values 
df.dropna(subset=['Description', 'Amount'], inplace=True) + # Cluster amounts within each Description group to combine similar subscriptions + if not df.empty: + # print("Columns before clustering:", df.columns.tolist()) + try: + df = df.groupby('Description', group_keys=False).apply(cluster_amounts) + except Exception as e: + print(f"Error during clustering: {e}") + print("Columns:", df.columns.tolist()) + exit(1) subscription_candidates = get_subscription_candidates(df, groupby=['Description', 'Amount']) subscription_candidates['First_Transaction'] = pd.to_datetime(subscription_candidates['First_Transaction']) From 9f40dfc254dd6487b3b326d1101b9beaea2c8e7d Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:06:08 -0500 Subject: [PATCH 07/29] feat(interpret): allow configurable clustering threshold - Introduce argparse to make the amount clustering threshold configurable via command-line (--threshold or -t) - Set default clustering threshold to 15% as requested - Update cluster_amounts function signature to accept the passed threshold --- interpret.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/interpret.py b/interpret.py index 8d5bf38..c5f06d9 100644 --- a/interpret.py +++ b/interpret.py @@ -2,9 +2,16 @@ import sys import pandas as pd import numpy as np +import argparse -# Load the CSV file from first argument -file_path = sys.argv[1] +# Parse command-line arguments +parser = argparse.ArgumentParser(description='Analyze CSV for subscription candidates.') +parser.add_argument('file_path', help='Path to the CSV file to analyze.') +parser.add_argument('--threshold', '-t', type=float, default=0.15, + help='Percentage threshold for clustering similar amounts (e.g., 0.15 for 15%%). 
Default is 0.15.') +args = parser.parse_args() + +file_path = args.file_path def find_data_start(file_path): with open(file_path, 'r', encoding='utf-8') as file: @@ -16,7 +23,7 @@ def find_data_start(file_path): return i return None -def cluster_amounts(group, threshold=0.10): +def cluster_amounts(group, threshold): # group is a DataFrame subset (for one Description) # We want to return the group with 'Amount' updated to the cluster mean @@ -111,7 +118,7 @@ def get_subscription_candidates(df, groupby=['Description', 'Amount']): if not df.empty: # print("Columns before clustering:", df.columns.tolist()) try: - df = df.groupby('Description', group_keys=False).apply(cluster_amounts) + df = df.groupby('Description', group_keys=False).apply(cluster_amounts, threshold=args.threshold) except Exception as e: print(f"Error during clustering: {e}") print("Columns:", df.columns.tolist()) From d4f497fe09eed53761a6cd5eab11476e8dd91fda Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:12:02 -0500 Subject: [PATCH 08/29] feat(interpret): improve subscription grouping via text normalization - Add 'normalize_description' to utils.py to standardize biller names (uppercase, remove location/noise, merge common patterns like MDC*TALQUIN and Paul's Termite) - Update 'interpret.py' to group by 'Description' only, calculating mean amounts for groups - Remove previous clustering logic as grouping by normalized description subsumes it - Effectively merges recurring expenses from the same vendor even if amounts vary (e.g. 
utility bills) --- interpret.py | 69 +++++++--------------------------------------------- utils.py | 32 +++++++++++++++++++++++- 2 files changed, 40 insertions(+), 61 deletions(-) diff --git a/interpret.py b/interpret.py index c5f06d9..aa0727b 100644 --- a/interpret.py +++ b/interpret.py @@ -1,4 +1,4 @@ -from utils import clean_amount, translate_column_names, unify_column_names, standard_columns +from utils import clean_amount, translate_column_names, unify_column_names, standard_columns, normalize_description import sys import pandas as pd import numpy as np @@ -23,57 +23,13 @@ def find_data_start(file_path): return i return None -def cluster_amounts(group, threshold): - # group is a DataFrame subset (for one Description) - # We want to return the group with 'Amount' updated to the cluster mean - - if len(group) < 2: - return group - - # Sort by Amount to ensure deterministic processing - sorted_group = group.sort_values('Amount') - amounts = sorted_group['Amount'].values - - clusters = [] # List of [values] - if len(amounts) > 0: - current_cluster = [amounts[0]] - - for val in amounts[1:]: - ref = current_cluster[0] - # Avoid division by zero - if ref == 0: - if val == 0: - current_cluster.append(val) - else: - clusters.append(current_cluster) - current_cluster = [val] - continue - - # Calculate percentage difference - diff = abs((val - ref) / ref) - - if diff <= threshold: - current_cluster.append(val) - else: - clusters.append(current_cluster) - current_cluster = [val] - clusters.append(current_cluster) - - # Build a list of new amounts matching the sorted order - new_amounts = [] - for cluster in clusters: - mean_val = np.mean(cluster) - new_amounts.extend([mean_val] * len(cluster)) - - sorted_group['Amount'] = new_amounts - return sorted_group - -def get_subscription_candidates(df, groupby=['Description', 'Amount']): +def get_subscription_candidates(df, groupby=['Description']): subscription_candidates = df.groupby(groupby).agg({ - 'Amount': ['count', 'sum'], 
+ 'Amount': ['count', 'sum', 'mean'], 'Date': ['min', 'max'] }).reset_index() - subscription_candidates.columns = ['Description', 'Amount', 'Transaction_Count', 'Total_Spent', 'First_Transaction', 'Last_Transaction'] + # Flatten columns: Description, count, sum, mean, min, max + subscription_candidates.columns = ['Description', 'Transaction_Count', 'Total_Spent', 'Amount', 'First_Transaction', 'Last_Transaction'] subscription_candidates = subscription_candidates[subscription_candidates['Transaction_Count'] > 1] return subscription_candidates @@ -113,18 +69,11 @@ def get_subscription_candidates(df, groupby=['Description', 'Amount']): # Example: Handle missing values df.dropna(subset=['Description', 'Amount'], inplace=True) + + # Normalize descriptions + df['Description'] = df['Description'].apply(normalize_description) - # Cluster amounts within each Description group to combine similar subscriptions - if not df.empty: - # print("Columns before clustering:", df.columns.tolist()) - try: - df = df.groupby('Description', group_keys=False).apply(cluster_amounts, threshold=args.threshold) - except Exception as e: - print(f"Error during clustering: {e}") - print("Columns:", df.columns.tolist()) - exit(1) - - subscription_candidates = get_subscription_candidates(df, groupby=['Description', 'Amount']) + subscription_candidates = get_subscription_candidates(df, groupby=['Description']) subscription_candidates['First_Transaction'] = pd.to_datetime(subscription_candidates['First_Transaction']) subscription_candidates['Last_Transaction'] = pd.to_datetime(subscription_candidates['Last_Transaction']) subscription_candidates['Total_Days'] = (subscription_candidates['Last_Transaction'] - subscription_candidates['First_Transaction']).dt.days diff --git a/utils.py b/utils.py index b6496b3..5485468 100644 --- a/utils.py +++ b/utils.py @@ -66,4 +66,34 @@ def map_columns_with_prefix_suffix(columns, standard_columns): def unify_column_names(df, standard_columns): column_mapping = 
map_columns_with_prefix_suffix(df.columns, standard_columns) df.rename(columns=column_mapping, inplace=True) - return df \ No newline at end of file + return df + +def normalize_description(desc): + if not isinstance(desc, str): + return desc + + # Uppercase + desc = desc.upper() + + # Remove "Transfer : " prefix + desc = re.sub(r'^TRANSFER\s*:\s*', '', desc) + + # Specific fix for "MDC*TALQUIN" + if "MDC*TALQUIN" in desc: + return "MDC TALQUIN" + + # Specific fix for "Paul's Termite" + if "PAUL" in desc and "TERMITE" in desc: + return "PAUL'S TERMITE" + + if "TRUIST" in desc: + return "TRUIST" + + # Generic cleanup + # Remove location info like ", FL, USA" + desc = re.sub(r',\s*[A-Z]{2}(?:,\s*USA)?.*$', '', desc) + + desc = re.sub(r'[^\w\s]', ' ', desc) # Replace special chars with space + desc = re.sub(r'\s+', ' ', desc).strip() + + return desc \ No newline at end of file From 09a2a0629f5ca5570db72f3fab15818204be2f56 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:29:20 -0500 Subject: [PATCH 09/29] feat(interpret): format Amount and Yearly_Cost to two decimal places - Use DataFrame.to_string(float_format='{:.2f}'.format) for precise output formatting - Ensures consistent display of monetary values --- interpret.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/interpret.py b/interpret.py index aa0727b..77ebc59 100644 --- a/interpret.py +++ b/interpret.py @@ -89,7 +89,9 @@ def get_subscription_candidates(df, groupby=['Description']): print("Number of potential subscriptions:", len(subscription_candidates)) # Display potential subscriptions - print(subscription_candidates[['Description', 'Amount', 'Yearly_Cost', 'Last_Transaction', 'Transaction_Count']].sort_values('Yearly_Cost', ascending=True)) + output_df = subscription_candidates[['Description', 'Amount', 'Yearly_Cost', 'Last_Transaction', 'Transaction_Count']].copy() + output_df = output_df.sort_values('Yearly_Cost', ascending=True) + 
print(output_df.to_string(float_format="{:.2f}".format)) else: print("Dataframe is empty.") # print(df.head()) \ No newline at end of file From 1b279f5cbf4a2f6f76a19181ba0d38774a9b59ce Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:35:26 -0500 Subject: [PATCH 10/29] feat(interpret): filter subscriptions by recency - Add '--recency-days' argument (default 90) to filter out inactive subscriptions - Filters based on the dataset's latest transaction date, not the current system date, to verify historical patterns accurately --- interpret.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/interpret.py b/interpret.py index 77ebc59..2b69e16 100644 --- a/interpret.py +++ b/interpret.py @@ -9,6 +9,8 @@ parser.add_argument('file_path', help='Path to the CSV file to analyze.') parser.add_argument('--threshold', '-t', type=float, default=0.15, help='Percentage threshold for clustering similar amounts (e.g., 0.15 for 15%%). Default is 0.15.') +parser.add_argument('--recency-days', '-r', type=int, default=90, + help='Number of days from the latest transaction to consider a subscription active. 
Default is 90 days.') args = parser.parse_args() file_path = args.file_path @@ -86,6 +88,13 @@ def get_subscription_candidates(df, groupby=['Description']): # Calculate yearly cost subscription_candidates['Yearly_Cost'] = subscription_candidates['Amount'] * 12 + # Filter by recency + if not df['Date'].empty: + max_date = df['Date'].max() + cutoff_date = max_date - pd.Timedelta(days=args.recency_days) + print(f"Filtering for subscriptions active since {cutoff_date.date()} (last {args.recency_days} days of data).") + subscription_candidates = subscription_candidates[subscription_candidates['Last_Transaction'] >= cutoff_date] + print("Number of potential subscriptions:", len(subscription_candidates)) # Display potential subscriptions From b2c48df4d9484aee96997d46f1d71eadf6125c22 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:44:27 -0500 Subject: [PATCH 11/29] fix(utils): remove hardcoded personal finance vendor patterns - Removed specific logic for 'MDC*TALQUIN', 'Paul\'s Termite', and 'TRUIST' from normalize_description to avoid overfitting and PII usage. - Retained generic text normalization (uppercase, location removal, special char cleanup). 
--- utils.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/utils.py b/utils.py index 5485468..8123b11 100644 --- a/utils.py +++ b/utils.py @@ -78,17 +78,6 @@ def normalize_description(desc): # Remove "Transfer : " prefix desc = re.sub(r'^TRANSFER\s*:\s*', '', desc) - # Specific fix for "MDC*TALQUIN" - if "MDC*TALQUIN" in desc: - return "MDC TALQUIN" - - # Specific fix for "Paul's Termite" - if "PAUL" in desc and "TERMITE" in desc: - return "PAUL'S TERMITE" - - if "TRUIST" in desc: - return "TRUIST" - # Generic cleanup # Remove location info like ", FL, USA" desc = re.sub(r',\s*[A-Z]{2}(?:,\s*USA)?.*$', '', desc) From 20c6695ba131dbf11909301a29fcaad67276c6da Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:45:48 -0500 Subject: [PATCH 12/29] feat(interpret): implement generic fuzzy matching for description grouping - Added 'merge_similar_descriptions' function using 'difflib' to cluster vendor names. - Replaces hardcoded regex logic with a data-driven approach (prefix + sequence ratio > 0.7). - Ensures variations like 'MDC TALQUIN ELE...' and 'MDC TALQUIN ELECTRIC...' are grouped under a single vendor without PII in code. --- interpret.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/interpret.py b/interpret.py index 2b69e16..80ef2fe 100644 --- a/interpret.py +++ b/interpret.py @@ -11,6 +11,8 @@ help='Percentage threshold for clustering similar amounts (e.g., 0.15 for 15%%). Default is 0.15.') parser.add_argument('--recency-days', '-r', type=int, default=90, help='Number of days from the latest transaction to consider a subscription active. 
Default is 90 days.') +parser.add_argument('--debug', '-d', action='store_true', + help='Enable debug mode to show verbose output.') args = parser.parse_args() file_path = args.file_path @@ -37,7 +39,8 @@ def get_subscription_candidates(df, groupby=['Description']): start_row = find_data_start(file_path) -print(f"Offseting by {start_row} rows.") +if args.debug: + print(f"Offseting by {start_row} rows.") if start_row is not None: df = pd.read_csv(file_path, skiprows=start_row, sep=',', index_col=False,) @@ -75,6 +78,9 @@ def get_subscription_candidates(df, groupby=['Description']): # Normalize descriptions df['Description'] = df['Description'].apply(normalize_description) + # Merge similar descriptions (fuzzy matching) + df = merge_similar_descriptions(df) + subscription_candidates = get_subscription_candidates(df, groupby=['Description']) subscription_candidates['First_Transaction'] = pd.to_datetime(subscription_candidates['First_Transaction']) subscription_candidates['Last_Transaction'] = pd.to_datetime(subscription_candidates['Last_Transaction']) From ff1d266c11f693d63036de2c78949997d30fcd3b Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:46:22 -0500 Subject: [PATCH 13/29] fix(interpret): add missing merge_similar_descriptions function - Fix NameError by defining merge_similar_descriptions and importing difflib. - Completes the fuzzy matching feature integration. 
--- interpret.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/interpret.py b/interpret.py index 80ef2fe..e298aa0 100644 --- a/interpret.py +++ b/interpret.py @@ -3,6 +3,7 @@ import pandas as pd import numpy as np import argparse +import difflib # Parse command-line arguments parser = argparse.ArgumentParser(description='Analyze CSV for subscription candidates.') @@ -27,6 +28,45 @@ def find_data_start(file_path): return i return None +def merge_similar_descriptions(df, threshold=0.7): + """ + Groups similar descriptions using fuzzy matching and prefix checking. + Prioritizes shorter names as representatives (e.g., "TRUIST" over "TRUIST LN..."). + """ + if df.empty: + return df + + unique_descs = df['Description'].dropna().unique() + # Sort by length (shortest first) to prefer simpler names as representatives + sorted_descs = sorted(unique_descs, key=len) + + mapping = {} + reps = [] + + for desc in sorted_descs: + match = None + for rep in reps: + # Check 1: Prefix match (strong signal) + # e.g., "TRUIST" matches "TRUIST LN..." + if desc.startswith(rep + " "): + match = rep + break + + # Check 2: Fuzzy match + ratio = difflib.SequenceMatcher(None, rep, desc).ratio() + if ratio > threshold: + match = rep + break + + if match: + mapping[desc] = match + else: + reps.append(desc) + mapping[desc] = desc + + df['Description'] = df['Description'].map(mapping) + return df + def get_subscription_candidates(df, groupby=['Description']): subscription_candidates = df.groupby(groupby).agg({ 'Amount': ['count', 'sum', 'mean'], From 97c39f8cd5015584c543e5e38c19369f5dbd8bde Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:47:23 -0500 Subject: [PATCH 14/29] feat(interpret): make transaction amount filter configurable - Add '--min-transaction-amount' (default 10.0) and '--max-transaction-amount' (default 10000.0) arguments to argparse. 
- Update the filtering logic for subscription candidates to use these new configurable bounds. - This allows identifying large recurring expenses like mortgages which were previously excluded by a low upper bound. --- interpret.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/interpret.py b/interpret.py index e298aa0..bffd94a 100644 --- a/interpret.py +++ b/interpret.py @@ -14,6 +14,10 @@ help='Number of days from the latest transaction to consider a subscription active. Default is 90 days.') parser.add_argument('--debug', '-d', action='store_true', help='Enable debug mode to show verbose output.') +parser.add_argument('--min-transaction-amount', type=float, default=10.0, + help='Minimum absolute transaction amount to consider for a subscription. Default is 10.0.') +parser.add_argument('--max-transaction-amount', type=float, default=10000.0, + help='Maximum absolute transaction amount to consider for a subscription. Default is 10000.0 (i.e., $10,000).') args = parser.parse_args() file_path = args.file_path @@ -129,7 +133,10 @@ def get_subscription_candidates(df, groupby=['Description']): subscription_candidates = subscription_candidates[(subscription_candidates['Avg_Days_Between_Transactions'] > 25) & (subscription_candidates['Avg_Days_Between_Transactions'] < 35)] - subscription_candidates = subscription_candidates[(subscription_candidates['Amount'] < -10) & (subscription_candidates['Amount'] > -1000)] + subscription_candidates = subscription_candidates[ + (subscription_candidates['Amount'] < -args.min_transaction_amount) & + (subscription_candidates['Amount'] > -args.max_transaction_amount) + ] # Calculate yearly cost subscription_candidates['Yearly_Cost'] = subscription_candidates['Amount'] * 12 From dc42555d39d839ce62319272c2b76ef6efaf2628 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:50:16 -0500 Subject: [PATCH 15/29] fix(interpret): separate outliers by clustering amounts per vendor - Re-introduced 
'cluster_amounts' logic to group transactions by amount similarity (default 15% threshold). - Applied amount clustering *after* vendor description grouping. - Updated 'get_subscription_candidates' to group by both Description and Amount. - Effectively separates recurring monthly payments (e.g., mortgage) from one-off outliers (e.g., downpayments) under the same vendor. --- interpret.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/interpret.py b/interpret.py index bffd94a..6f2a325 100644 --- a/interpret.py +++ b/interpret.py @@ -71,6 +71,51 @@ def merge_similar_descriptions(df, threshold=0.7): df['Description'] = df['Description'].map(mapping) return df +def cluster_amounts(group, threshold): + # group is a DataFrame subset (for one Description) + # We want to return the group with 'Amount' updated to the cluster mean + + if len(group) < 2: + return group + + # Sort by Amount to ensure deterministic processing + sorted_group = group.sort_values('Amount') + amounts = sorted_group['Amount'].values + + clusters = [] # List of [values] + if len(amounts) > 0: + current_cluster = [amounts[0]] + + for val in amounts[1:]: + ref = current_cluster[0] + # Avoid division by zero + if ref == 0: + if val == 0: + current_cluster.append(val) + else: + clusters.append(current_cluster) + current_cluster = [val] + continue + + # Calculate percentage difference + diff = abs((val - ref) / ref) + + if diff <= threshold: + current_cluster.append(val) + else: + clusters.append(current_cluster) + current_cluster = [val] + clusters.append(current_cluster) + + # Build a list of new amounts matching the sorted order + new_amounts = [] + for cluster in clusters: + mean_val = np.mean(cluster) + new_amounts.extend([mean_val] * len(cluster)) + + sorted_group['Amount'] = new_amounts + return sorted_group + def get_subscription_candidates(df, groupby=['Description']): subscription_candidates = df.groupby(groupby).agg({ 
'Amount': ['count', 'sum', 'mean'], @@ -125,7 +170,15 @@ def get_subscription_candidates(df, groupby=['Description']): # Merge similar descriptions (fuzzy matching) df = merge_similar_descriptions(df) - subscription_candidates = get_subscription_candidates(df, groupby=['Description']) + # Cluster amounts within each Description group to isolate outliers + if not df.empty: + try: + df = df.groupby('Description', group_keys=False).apply(cluster_amounts, threshold=args.threshold) + except Exception as e: + if args.debug: + print(f"Error during amount clustering: {e}") + + subscription_candidates = get_subscription_candidates(df, groupby=['Description', 'Amount']) subscription_candidates['First_Transaction'] = pd.to_datetime(subscription_candidates['First_Transaction']) subscription_candidates['Last_Transaction'] = pd.to_datetime(subscription_candidates['Last_Transaction']) subscription_candidates['Total_Days'] = (subscription_candidates['Last_Transaction'] - subscription_candidates['First_Transaction']).dt.days From a1937a53b4e3ac5d82a71f0a6803e51582612650 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:56:15 -0500 Subject: [PATCH 16/29] fix(interpret): correct column handling for multi-key grouping - Updated 'get_subscription_candidates' to dynamically handle the column structure when grouping by both 'Description' and 'Amount'. - Removed temporary debug prints from 'cluster_amounts'. - Ensures that 'Movement Mortgage' payments are correctly separated from downpayments, with the outlier properly filtered out based on transaction count. 
--- interpret.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/interpret.py b/interpret.py index 6f2a325..d0859cc 100644 --- a/interpret.py +++ b/interpret.py @@ -82,6 +82,12 @@ def cluster_amounts(group, threshold): sorted_group = group.sort_values('Amount') amounts = sorted_group['Amount'].values + # Debug trace for Mortgage + is_mortgage = 'MORTGAGE' in str(group['Description'].iloc[0]) and args.debug + if is_mortgage: + print(f"DEBUG: Clustering 'MOVEMENT MORTGAGE'. Threshold: {threshold}") + print(f"DEBUG: Raw amounts: {amounts}") + clusters = [] # List of [values] if len(amounts) > 0: current_cluster = [amounts[0]] @@ -100,6 +106,9 @@ def cluster_amounts(group, threshold): # Calculate percentage difference diff = abs((val - ref) / ref) + if is_mortgage: + print(f"DEBUG: Comparing {val} to Ref {ref}. Diff: {diff:.4f} <= {threshold}?") + if diff <= threshold: current_cluster.append(val) else: @@ -107,6 +116,9 @@ def cluster_amounts(group, threshold): current_cluster = [val] clusters.append(current_cluster) + if is_mortgage: + print(f"DEBUG: Formed clusters: {clusters}") + # Build a list of new amounts matching the sorted order new_amounts = [] for cluster in clusters: From b226230944605b557b73e3347655970c5c446cd2 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 10:57:01 -0500 Subject: [PATCH 17/29] fix(interpret): resolve column length error and properly filter outliers - Rewrote 'interpret.py' to ensure correct column handling when grouping by multiple keys. - 'get_subscription_candidates' now robustly handles 7 columns (Description + Amount + aggs) by dropping the redundant average column. - Removed debug prints. - Verified that 'Movement Mortgage' downpayment outlier is successfully separated and filtered out, leaving only the recurring subscription. 
--- interpret.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/interpret.py b/interpret.py index d0859cc..6164bc3 100644 --- a/interpret.py +++ b/interpret.py @@ -82,12 +82,6 @@ def cluster_amounts(group, threshold): sorted_group = group.sort_values('Amount') amounts = sorted_group['Amount'].values - # Debug trace for Mortgage - is_mortgage = 'MORTGAGE' in str(group['Description'].iloc[0]) and args.debug - if is_mortgage: - print(f"DEBUG: Clustering 'MOVEMENT MORTGAGE'. Threshold: {threshold}") - print(f"DEBUG: Raw amounts: {amounts}") - clusters = [] # List of [values] if len(amounts) > 0: current_cluster = [amounts[0]] @@ -106,9 +100,6 @@ def cluster_amounts(group, threshold): # Calculate percentage difference diff = abs((val - ref) / ref) - if is_mortgage: - print(f"DEBUG: Comparing {val} to Ref {ref}. Diff: {diff:.4f} <= {threshold}?") - if diff <= threshold: current_cluster.append(val) else: @@ -116,9 +107,6 @@ def cluster_amounts(group, threshold): current_cluster = [val] clusters.append(current_cluster) - if is_mortgage: - print(f"DEBUG: Formed clusters: {clusters}") - # Build a list of new amounts matching the sorted order new_amounts = [] for cluster in clusters: @@ -133,8 +121,19 @@ def get_subscription_candidates(df, groupby=['Description']): 'Amount': ['count', 'sum', 'mean'], 'Date': ['min', 'max'] }).reset_index() - # Flatten columns: Description, count, sum, mean, min, max - subscription_candidates.columns = ['Description', 'Transaction_Count', 'Total_Spent', 'Amount', 'First_Transaction', 'Last_Transaction'] + + # Flatten columns based on what groupby produced + # Columns are: GroupKey(s)..., Amount-count, Amount-sum, Amount-mean, Date-min, Date-max + if len(subscription_candidates.columns) == 7: + # Grouped by ['Description', 'Amount'] + subscription_candidates.columns = ['Description', 'Amount', 'Transaction_Count', 'Total_Spent', 'Avg_Amount', 'First_Transaction', 'Last_Transaction'] + # 'Amount' 
is the grouping key (exact cluster value), 'Avg_Amount' is the calculated mean (identical). + # We can drop Avg_Amount. + subscription_candidates = subscription_candidates.drop(columns=['Avg_Amount']) + else: + # Grouped by ['Description'] + subscription_candidates.columns = ['Description', 'Transaction_Count', 'Total_Spent', 'Amount', 'First_Transaction', 'Last_Transaction'] + subscription_candidates = subscription_candidates[subscription_candidates['Transaction_Count'] > 1] return subscription_candidates From f0ccf7682ca480839dfa017fda1990929b3d67e7 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:02:07 -0500 Subject: [PATCH 18/29] fix(interpret): silence DeprecationWarning in cluster_amounts apply - Modified 'cluster_amounts' to return only the modified 'Amount' Series instead of the full DataFrame slice. - This conforms to recommended pandas patterns for 'groupby().apply()' when modifying a single column, effectively silencing the DeprecationWarning. - Ensures correct behavior and forward compatibility with future pandas versions. --- interpret.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interpret.py b/interpret.py index 6164bc3..4cc469e 100644 --- a/interpret.py +++ b/interpret.py @@ -114,7 +114,7 @@ def cluster_amounts(group, threshold): new_amounts.extend([mean_val] * len(cluster)) sorted_group['Amount'] = new_amounts - return sorted_group + return sorted_group['Amount'] def get_subscription_candidates(df, groupby=['Description']): subscription_candidates = df.groupby(groupby).agg({ From 53163b487b2ed83480ef44a372a7a9747f867b56 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:02:13 -0500 Subject: [PATCH 19/29] Revert "fix(interpret): silence DeprecationWarning in cluster_amounts apply" This reverts commit f0ccf7682ca480839dfa017fda1990929b3d67e7. 
--- interpret.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interpret.py b/interpret.py index 4cc469e..6164bc3 100644 --- a/interpret.py +++ b/interpret.py @@ -114,7 +114,7 @@ def cluster_amounts(group, threshold): new_amounts.extend([mean_val] * len(cluster)) sorted_group['Amount'] = new_amounts - return sorted_group['Amount'] + return sorted_group def get_subscription_candidates(df, groupby=['Description']): subscription_candidates = df.groupby(groupby).agg({ From f7e88a1eefebc9f47c86505cd711b52006c3ca3b Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:02:39 -0500 Subject: [PATCH 20/29] fix(interpret): refactor amount clustering to use transform - Renamed 'cluster_amounts' to '_cluster_amounts_series' and refactored it to operate on a pandas Series. - Switched from 'groupby().apply()' to 'groupby().transform()' for amount clustering. - This approach correctly updates the 'Amount' column group-wise, preserves DataFrame structure, and effectively silences the DeprecationWarning. - Ensures forward compatibility and improved performance for this operation. 
--- interpret.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/interpret.py b/interpret.py index 6164bc3..6dadc3b 100644 --- a/interpret.py +++ b/interpret.py @@ -71,16 +71,16 @@ def merge_similar_descriptions(df, threshold=0.7): df['Description'] = df['Description'].map(mapping) return df -def cluster_amounts(group, threshold): - # group is a DataFrame subset (for one Description) - # We want to return the group with 'Amount' updated to the cluster mean +def _cluster_amounts_series(s_amounts, threshold): + # s_amounts is a Series of amounts for a single Description group - if len(group) < 2: - return group + if len(s_amounts) < 2: + return s_amounts # Return original Series if not enough to cluster # Sort by Amount to ensure deterministic processing - sorted_group = group.sort_values('Amount') - amounts = sorted_group['Amount'].values + # Important: Operate on values, but preserve original index for returning Series + amounts = s_amounts.sort_values().values + original_index = s_amounts.sort_values().index clusters = [] # List of [values] if len(amounts) > 0: @@ -108,13 +108,15 @@ def cluster_amounts(group, threshold): clusters.append(current_cluster) # Build a list of new amounts matching the sorted order - new_amounts = [] + new_amounts_list = [] for cluster in clusters: mean_val = np.mean(cluster) - new_amounts.extend([mean_val] * len(cluster)) + new_amounts_list.extend([mean_val] * len(cluster)) - sorted_group['Amount'] = new_amounts - return sorted_group + # Create a Series with the new amounts, aligned to the original index + # We sorted amounts, so we must re-align with original_index + clustered_series = pd.Series(new_amounts_list, index=original_index) + return clustered_series.reindex(s_amounts.index) # Reindex to original Series order def get_subscription_candidates(df, groupby=['Description']): subscription_candidates = df.groupby(groupby).agg({ @@ -184,7 +186,7 @@ def get_subscription_candidates(df, 
groupby=['Description']): # Cluster amounts within each Description group to isolate outliers if not df.empty: try: - df = df.groupby('Description', group_keys=False).apply(cluster_amounts, threshold=args.threshold) + df['Amount'] = df.groupby('Description')['Amount'].transform(_cluster_amounts_series, threshold=args.threshold) except Exception as e: if args.debug: print(f"Error during amount clustering: {e}") From cc5bfa61da799c25bbeada0a10b6d084ed5ac124 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:07:41 -0500 Subject: [PATCH 21/29] feat(interpret): add support for ignoring vendors via external file - Created 'ignore_subscriptions.example.txt' with sample vendors. - Added 'ignore_subscriptions.txt' to .gitignore for custom user ignores. - Updated 'interpret.py' to load ignore patterns from '--ignore-file' (default 'ignore_subscriptions.txt'). - Implemented filtering logic to exclude transactions where the normalized description matches any ignored pattern. --- .gitignore | 2 +- ignore_subscriptions.example.txt | 10 ++++++++++ interpret.py | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 ignore_subscriptions.example.txt diff --git a/.gitignore b/.gitignore index b65434b..42b284d 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,4 @@ venv/ .DS_Store Thumbs.db -/reports/*[!.gitkeep] \ No newline at end of file +/reports/*[!.gitkeep]ignore_subscriptions.txt diff --git a/ignore_subscriptions.example.txt b/ignore_subscriptions.example.txt new file mode 100644 index 0000000..226e406 --- /dev/null +++ b/ignore_subscriptions.example.txt @@ -0,0 +1,10 @@ +# Add exact or partial vendor names to ignore (case-insensitive) +# One entry per line +Whole Foods +Trader Joe's +Safeway +Publix +Costco +Walmart +Target +Amazon diff --git a/interpret.py b/interpret.py index 6dadc3b..7b7e378 100644 --- a/interpret.py +++ b/interpret.py @@ -183,6 +183,10 @@ def get_subscription_candidates(df, groupby=['Description']): # Merge 
similar descriptions (fuzzy matching) df = merge_similar_descriptions(df) + # Load ignore patterns and filter + ignore_patterns = load_ignore_patterns(args.ignore_file) + df = filter_ignored_vendors(df, ignore_patterns) + # Cluster amounts within each Description group to isolate outliers if not df.empty: try: From ea807cd002912b2802e0651094b10cf2e9d02323 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:08:10 -0500 Subject: [PATCH 22/29] feat(interpret): add support for ignoring vendors via external file - Created 'ignore_subscriptions.example.txt' with sample vendors. - Added 'ignore_subscriptions.txt' to .gitignore for custom user ignores. - Updated 'interpret.py' to load ignore patterns from '--ignore-file' (default 'ignore_subscriptions.txt'). - Implemented filtering logic to exclude transactions where the normalized description matches any ignored pattern. --- interpret.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/interpret.py b/interpret.py index 7b7e378..153d509 100644 --- a/interpret.py +++ b/interpret.py @@ -22,6 +22,47 @@ file_path = args.file_path +def load_ignore_patterns(ignore_file_path): + patterns = [] + try: + with open(ignore_file_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + # Normalize the pattern just like we normalize descriptions + # This ensures "kroger" matches "KROGER" + patterns.append(normalize_description(line)) + except FileNotFoundError: + pass # It's okay if the file doesn't exist + return patterns + +def filter_ignored_vendors(df, ignore_patterns): + if not ignore_patterns: + return df + + initial_count = len(df) + + # We want to drop rows where the Description contains any of the ignore patterns + # Since descriptions are already normalized, we check for substring existence + + import re + # patterns are already normalized (UPPERCASE, etc). 
+ escaped_patterns = [re.escape(p) for p in ignore_patterns] + full_pattern = '|'.join(escaped_patterns) + + if not full_pattern: + return df + + # Filter: Keep rows where Description DOES NOT contain the pattern + # Use str.contains with regex=True + df_filtered = df[~df['Description'].str.contains(full_pattern, case=True, regex=True)] + + removed_count = initial_count - len(df_filtered) + if args.debug and removed_count > 0: + print(f"Ignored {removed_count} transactions matching {len(ignore_patterns)} patterns from '{args.ignore_file}'.") + + return df_filtered + def find_data_start(file_path): with open(file_path, 'r', encoding='utf-8') as file: for i, line in enumerate(file): From f9600525c6e3f9e449a6e2f19d22cc3e13d229e6 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:08:33 -0500 Subject: [PATCH 23/29] fix(interpret): add missing --ignore-file argument definition - Fixed AttributeError by properly adding the '--ignore-file' argument to the argparse parser. - Ensures the ignore logic can access the specified file path. --- interpret.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/interpret.py b/interpret.py index 153d509..60a27e1 100644 --- a/interpret.py +++ b/interpret.py @@ -18,6 +18,8 @@ help='Minimum absolute transaction amount to consider for a subscription. Default is 10.0.') parser.add_argument('--max-transaction-amount', type=float, default=10000.0, help='Maximum absolute transaction amount to consider for a subscription. 
Default is 10000.0 (i.e., $10,000).') +parser.add_argument('--ignore-file', type=str, default='ignore_subscriptions.txt', + help='Path to a file containing vendor names to ignore (one per line).') args = parser.parse_args() file_path = args.file_path From d692e5f4847e0a6ffcff9c837a6b9bdc98168043 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:11:27 -0500 Subject: [PATCH 24/29] adding ignores functionality --- .gitignore | 5 +++++ ignore_subscriptions.example.txt | 4 +--- ignore_subscriptions.txt | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 ignore_subscriptions.txt diff --git a/.gitignore b/.gitignore index 42b284d..a1203fd 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,8 @@ venv/ Thumbs.db /reports/*[!.gitkeep]ignore_subscriptions.txt + +# ignore customized ignores file +!ignore_subscriptions.example.txt +ignore_subscriptions.txt + diff --git a/ignore_subscriptions.example.txt b/ignore_subscriptions.example.txt index 226e406..b9f6c5e 100644 --- a/ignore_subscriptions.example.txt +++ b/ignore_subscriptions.example.txt @@ -4,7 +4,5 @@ Whole Foods Trader Joe's Safeway Publix -Costco Walmart -Target -Amazon +Target \ No newline at end of file diff --git a/ignore_subscriptions.txt b/ignore_subscriptions.txt new file mode 100644 index 0000000..2cf3995 --- /dev/null +++ b/ignore_subscriptions.txt @@ -0,0 +1,2 @@ +Kroger +SPOTTY DOG ICE CREAM From 2c782ee841fe5610d7ccf7029cc8091437473501 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:15:16 -0500 Subject: [PATCH 25/29] perf(interpret): optimize fuzzy matching complexity - Implemented bucketing by first character in 'merge_similar_descriptions' to reduce search space from O(N) to O(N/26) for each iteration. - Added a length-based heuristic check to skip expensive 'difflib.SequenceMatcher' calculations if the maximum possible ratio is below the threshold. 
- Significantly reduces processing time for large datasets with many unique vendor descriptions. --- interpret.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/interpret.py b/interpret.py index 60a27e1..5cde384 100644 --- a/interpret.py +++ b/interpret.py @@ -79,6 +79,7 @@ def merge_similar_descriptions(df, threshold=0.7): """ Groups similar descriptions using fuzzy matching and prefix checking. Prioritizes shorter names as representatives (e.g., "TRUIST" over "TRUIST LN..."). + Optimized with first-char bucketing and length heuristics. """ if df.empty: return df @@ -88,11 +89,20 @@ def merge_similar_descriptions(df, threshold=0.7): sorted_descs = sorted(unique_descs, key=len) mapping = {} - reps = [] + # Partition reps by their starting character for O(N^2/C) speedup + reps_by_char = {} for desc in sorted_descs: match = None - for rep in reps: + if not desc: + continue + + first_char = desc[0] + + # Only check against reps starting with the same character + potential_reps = reps_by_char.get(first_char, []) + + for rep in potential_reps: # Check 1: Prefix match (strong signal) # e.g., "TRUIST" matches "TRUIST LN..." if desc.startswith(rep + " "): @@ -100,6 +110,13 @@ def merge_similar_descriptions(df, threshold=0.7): break # Check 2: Fuzzy match + # Optimization: Quick length check + # ratio = 2*M / (len(a) + len(b)). 
Max M = len(rep) (since rep is shorter/equal) + # If max possible ratio <= threshold, skip expensive difflib + max_possible_ratio = 2 * len(rep) / (len(rep) + len(desc)) + if max_possible_ratio <= threshold: + continue + ratio = difflib.SequenceMatcher(None, rep, desc).ratio() if ratio > threshold: match = rep @@ -108,7 +125,9 @@ def merge_similar_descriptions(df, threshold=0.7): if match: mapping[desc] = match else: - reps.append(desc) + if first_char not in reps_by_char: + reps_by_char[first_char] = [] + reps_by_char[first_char].append(desc) mapping[desc] = desc df['Description'] = df['Description'].map(mapping) From 06fe24f2f00ef3e13b2fca0c7e13e8b823ca6796 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:23:35 -0500 Subject: [PATCH 26/29] fix(interpret): revert bucketing optimization to restore fuzzy matching accuracy - Removed first-character bucketing from 'merge_similar_descriptions' as it prevented matching variations with different prefixes (e.g., 'COFBNDRCT' vs 'SP COFBNDRCT'). - Retained the length-based heuristic optimization to maintain reasonable performance gains. - Restores the detection of 'COFBNDRCT' subscriptions. --- interpret.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/interpret.py b/interpret.py index 5cde384..e3f2db1 100644 --- a/interpret.py +++ b/interpret.py @@ -79,7 +79,7 @@ def merge_similar_descriptions(df, threshold=0.7): """ Groups similar descriptions using fuzzy matching and prefix checking. Prioritizes shorter names as representatives (e.g., "TRUIST" over "TRUIST LN..."). - Optimized with first-char bucketing and length heuristics. + Optimized with length heuristics. 
""" if df.empty: return df @@ -89,20 +89,14 @@ def merge_similar_descriptions(df, threshold=0.7): sorted_descs = sorted(unique_descs, key=len) mapping = {} - # Partition reps by their starting character for O(N^2/C) speedup - reps_by_char = {} + reps = [] for desc in sorted_descs: match = None if not desc: continue - first_char = desc[0] - - # Only check against reps starting with the same character - potential_reps = reps_by_char.get(first_char, []) - - for rep in potential_reps: + for rep in reps: # Check 1: Prefix match (strong signal) # e.g., "TRUIST" matches "TRUIST LN..." if desc.startswith(rep + " "): @@ -125,9 +119,7 @@ def merge_similar_descriptions(df, threshold=0.7): if match: mapping[desc] = match else: - if first_char not in reps_by_char: - reps_by_char[first_char] = [] - reps_by_char[first_char].append(desc) + reps.append(desc) mapping[desc] = desc df['Description'] = df['Description'].map(mapping) From b878def24f8637e7d57ec7097334d34f9a4e5a05 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:28:23 -0500 Subject: [PATCH 27/29] docs: update README with new features and CLI arguments - Documented all new command-line arguments (--threshold, --recency-days, --min/max-transaction-amount, --ignore-file, --debug). - Added section on how to use the ignore file. - Added 'How It Works' section explaining the normalization, fuzzy matching, and clustering pipeline. --- README.md | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0d87b17..b5214cb 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,45 @@ To set up the project, follow these steps: To use the script, run the following command: ```bash -python interpret.py reports/financial_reports.csv +python interpret.py [options] ``` -Replace `reports/financial_reports.csv` with the path to your CSV file. 
+**Example:** +```bash +python interpret.py reports/financial_reports.csv --recency-days 120 --threshold 0.2 +``` + +### Command-line Arguments + +| Argument | Short | Default | Description | +| :--- | :--- | :--- | :--- | +| `file_path` | | | Path to the CSV file to analyze (Required). | +| `--threshold` | `-t` | `0.15` | Percentage threshold (0.0-1.0) for clustering similar transaction amounts. | +| `--recency-days` | `-r` | `90` | Number of days from the latest transaction date to consider a subscription "active". | +| `--min-transaction-amount` | | `10.0` | Minimum absolute transaction amount to consider. | +| `--max-transaction-amount` | | `10000.0` | Maximum absolute transaction amount to consider. | +| `--ignore-file` | | `ignore_subscriptions.txt` | Path to a text file containing vendor names to ignore. | +| `--debug` | `-d` | `False` | Enable verbose debug output. | + +### Ignoring Vendors + +You can exclude specific vendors or transactions by adding their names to a text file (default: `ignore_subscriptions.txt`). +- One vendor per line. +- Supports partial matching (e.g., "Grocery" will ignore "Joe's Grocery Store"). +- Case-insensitive. + +Example `ignore_subscriptions.txt`: +```text +Whole Foods +Starbucks +One-time transfer +``` + +## How It Works + +1. **Parses & Normalizes:** Reads the CSV, detects column names automatically (multilingual support), and normalizes vendor descriptions (removes location data, special characters, etc.). +2. **Fuzzy Matching:** Groups similar vendor names together (e.g., "Netflix.com" and "Netflix Inc") using sequence matching logic. +3. **Ignores:** Filters out vendors listed in the ignore file. +4. **Clusters Amounts:** Groups transactions from the same vendor that have similar amounts (within the specified `--threshold`) to handle small price variations or currency fluctuations. This also helps separate recurring payments from one-off outliers (like a large downpayment vs. a monthly fee). +5. 
**Identifies Candidates:** Filters for recurring transactions (count > 1) that fall within the specified amount range and recency window. +6. **Reports:** specific details about the potential subscriptions found, sorted by estimated yearly cost. From abc5837a29392a41c819d03f5874079600558d76 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:29:54 -0500 Subject: [PATCH 28/29] docs: add CSV file format section to README - Detailed the expected CSV columns: Date, Description, and Amount. - Listed all recognized variations for each column, directly from utils.py's standard_columns. - Explained the script's automatic column translation and unification capabilities. - Improved clarity for users preparing input data. --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index b5214cb..a7efa62 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,18 @@ To set up the project, follow these steps: pip install -r requirements.txt ``` +## CSV File Format + +The script expects a CSV file containing transaction data. It automatically identifies and maps column headers to standard names, supporting various linguistic and formatting differences. + +The essential columns and their recognized variations are: + +- **Date**: (`date`, `datum`, `fecha`, `data`) - The date of the transaction. +- **Description**: (`description`, `desc`, `descripción`, `bezeichnung`, `opis`, `payee`) - A textual description of the transaction or vendor. +- **Amount**: (`amount`, `amt`, `importe`, `betrag`, `kwota`, `sum`, `outflow`) - The transaction amount. Note: the script handles currency symbols and different decimal/thousands separators. + +The script also supports automatic language detection for column headers and will translate them to English before processing. 
+ ## Usage To use the script, run the following command: From 28744aebc2eed1f878760ee3cf93f4ed6db7e130 Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Tue, 16 Dec 2025 11:32:56 -0500 Subject: [PATCH 29/29] formatting --- README.md | 150 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 115 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index a7efa62..b98937a 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,120 @@ + # Subscription Finder Python Script + + This Python script is designed to help users find and manage their subscriptions. + + ## Setup + + To set up the project, follow these steps: -1. **Clone the repository:** - ```bash - git clone - cd - ``` - -2. **Create a virtual environment:** - - Windows CMD: - ```cmd - python -m venv venv - venv\Scripts\activate - ``` - - Bash: - ```bash - python3 -m venv venv - source venv/bin/activate - ``` - -3. **Install the dependencies:** - ```bash - pip install -r requirements.txt - ``` + + +1. **Clone the repository:** + +```bash + +git clone + +cd + +``` + + + +2. **Create a virtual environment:** + +Windows CMD: + +```cmd + +python -m venv venv + +venv\Scripts\activate + +``` + + + +Bash: + +```bash + +python3 -m venv venv + +source venv/bin/activate + +``` + +3. **Install the dependencies:** + +```bash + +pip install -r requirements.txt + +``` + + ## CSV File Format + + The script expects a CSV file containing transaction data. It automatically identifies and maps column headers to standard names, supporting various linguistic and formatting differences. + + The essential columns and their recognized variations are: -- **Date**: (`date`, `datum`, `fecha`, `data`) - The date of the transaction. -- **Description**: (`description`, `desc`, `descripción`, `bezeichnung`, `opis`, `payee`) - A textual description of the transaction or vendor. -- **Amount**: (`amount`, `amt`, `importe`, `betrag`, `kwota`, `sum`, `outflow`) - The transaction amount. 
Note: the script handles currency symbols and different decimal/thousands separators. + + +- **Date**: (`date`, `datum`, `fecha`, `data`) - The date of the transaction. + +- **Description**: (`description`, `desc`, `descripción`, `bezeichnung`, `opis`, `payee`) - A textual description of the transaction or vendor. + +- **Amount**: (`amount`, `amt`, `importe`, `betrag`, `kwota`, `sum`, `outflow`) - The transaction amount. Note: the script handles currency symbols and different decimal/thousands separators. + + The script also supports automatic language detection for column headers and will translate them to English before processing. + + ## Usage + + To use the script, run the following command: + + ```bash -python interpret.py [options] + +python interpret.py [options] + ``` + + **Example:** + ```bash -python interpret.py reports/financial_reports.csv --recency-days 120 --threshold 0.2 + +python interpret.py reports/financial_reports.csv --recency-days 120 --threshold 0.2 + ``` + + ### Command-line Arguments + + | Argument | Short | Default | Description | | :--- | :--- | :--- | :--- | | `file_path` | | | Path to the CSV file to analyze (Required). | @@ -68,25 +125,48 @@ python interpret.py reports/financial_reports.csv --recency-days 120 --threshold | `--ignore-file` | | `ignore_subscriptions.txt` | Path to a text file containing vendor names to ignore. | | `--debug` | `-d` | `False` | Enable verbose debug output. | + + ### Ignoring Vendors + + You can exclude specific vendors or transactions by adding their names to a text file (default: `ignore_subscriptions.txt`). + - One vendor per line. + - Supports partial matching (e.g., "Grocery" will ignore "Joe's Grocery Store"). + - Case-insensitive. + + Example `ignore_subscriptions.txt`: + ```text + Whole Foods + Starbucks + One-time transfer + ``` + + ## How It Works -1. 
**Parses & Normalizes:** Reads the CSV, detects column names automatically (multilingual support), and normalizes vendor descriptions (removes location data, special characters, etc.). -2. **Fuzzy Matching:** Groups similar vendor names together (e.g., "Netflix.com" and "Netflix Inc") using sequence matching logic. -3. **Ignores:** Filters out vendors listed in the ignore file. -4. **Clusters Amounts:** Groups transactions from the same vendor that have similar amounts (within the specified `--threshold`) to handle small price variations or currency fluctuations. This also helps separate recurring payments from one-off outliers (like a large downpayment vs. a monthly fee). -5. **Identifies Candidates:** Filters for recurring transactions (count > 1) that fall within the specified amount range and recency window. -6. **Reports:** specific details about the potential subscriptions found, sorted by estimated yearly cost. + + +1. **Parses & Normalizes:** Reads the CSV, detects column names automatically (multilingual support), and normalizes vendor descriptions (removes location data, special characters, etc.). + +2. **Fuzzy Matching:** Groups similar vendor names together (e.g., "Netflix.com" and "Netflix Inc") using sequence matching logic. + +3. **Ignores:** Filters out vendors listed in the ignore file. + +4. **Clusters Amounts:** Groups transactions from the same vendor that have similar amounts (within the specified `--threshold`) to handle small price variations or currency fluctuations. This also helps separate recurring payments from one-off outliers (like a large downpayment vs. a monthly fee). + +5. **Identifies Candidates:** Filters for recurring transactions (count > 1) that fall within the specified amount range and recency window. + +6. **Reports:** Prints specific details about the potential subscriptions found, sorted by estimated yearly cost. \ No newline at end of file