diff --git a/demos/01_data_io_demo.py b/demos/01_data_io_demo.py
index 1071feb..7e00972 100644
--- a/demos/01_data_io_demo.py
+++ b/demos/01_data_io_demo.py
@@ -27,28 +27,34 @@ def demo_basic_loading():
     sample_data.to_csv('temp_data/sample.csv', index=False)
     sample_data.to_excel('temp_data/sample.xlsx', index=False)
     sample_data.to_json('temp_data/sample.json', orient='records')
-    sample_data.to_parquet('temp_data/sample.parquet', index=False)
-
+
     # Load from CSV
     print("\nšŸ“ Loading CSV file...")
     df_csv = load('temp_data/sample.csv')
     print(f"āœ“ Loaded {len(df_csv)} rows from CSV")
     print(df_csv.head())
-
+
     # Load from Excel
     print("\nšŸ“Š Loading Excel file...")
     df_excel = load('temp_data/sample.xlsx')
     print(f"āœ“ Loaded {len(df_excel)} rows from Excel")
-
+
     # Load from JSON
     print("\nšŸ“‹ Loading JSON file...")
     df_json = load('temp_data/sample.json')
     print(f"āœ“ Loaded {len(df_json)} rows from JSON")
-
-    # Load from Parquet
+
+    # Load from Parquet (Safely wrapped to prevent crashes)
     print("\nšŸ—‚ļø Loading Parquet file...")
-    df_parquet = load('temp_data/sample.parquet')
-    print(f"āœ“ Loaded {len(df_parquet)} rows from Parquet")
+    try:
+        # We attempt to save AND load here so we catch the missing engine error
+        sample_data.to_parquet('temp_data/sample.parquet', index=False)
+        df_parquet = load('temp_data/sample.parquet')
+        print(f"āœ“ Loaded {len(df_parquet)} rows from Parquet")
+    except ImportError:
+        print("āš ļø Skipped Parquet demo: 'pyarrow' or 'fastparquet' not installed.")
+    except Exception as e:
+        print(f"āš ļø Skipped Parquet demo: {str(e)}")
 
 
 def demo_folder_loading():
@@ -56,21 +62,21 @@
     print("\n" + "=" * 60)
     print("DEMO 2: Batch Loading from Folder")
     print("=" * 60)
-
+
     # Create multiple CSV files
     os.makedirs('temp_data/batch', exist_ok=True)
-
+
     for i in range(3):
         df = pd.DataFrame({
             'id': range(i*10, (i+1)*10),
             'value': range(100, 110)
         })
         df.to_csv(f'temp_data/batch/file_{i+1}.csv', index=False)
-
+
     print("\nšŸ“‚ Loading all CSV files from folder...")
     dfs = read_folder('temp_data/batch', file_type='csv')
     print(f"āœ“ Loaded {len(dfs)} files")
-
+
     for i, df in enumerate(dfs, 1):
         print(f"  File {i}: {len(df)} rows")
 
@@ -80,34 +86,39 @@ def demo_save_operations():
     print("\n" + "=" * 60)
     print("DEMO 3: Saving Data")
     print("=" * 60)
-
+
     # Create sample data
     df = pd.DataFrame({
         'x': range(1, 6),
         'y': [10, 20, 30, 40, 50]
     })
-
+
     os.makedirs('temp_data/output', exist_ok=True)
-
+
     # Save as CSV
     print("\nšŸ’¾ Saving as CSV...")
     save(df, 'temp_data/output/result.csv')
     print("āœ“ Saved to result.csv")
-
+
     # Save as Excel
     print("\nšŸ’¾ Saving as Excel...")
     save(df, 'temp_data/output/result.xlsx')
     print("āœ“ Saved to result.xlsx")
-
+
     # Save as JSON
     print("\nšŸ’¾ Saving as JSON...")
     save(df, 'temp_data/output/result.json')
     print("āœ“ Saved to result.json")
-
-    # Save as Parquet
+
+    # Save as Parquet (Safely wrapped to prevent crashes)
     print("\nšŸ’¾ Saving as Parquet...")
-    save(df, 'temp_data/output/result.parquet')
-    print("āœ“ Saved to result.parquet")
+    try:
+        save(df, 'temp_data/output/result.parquet')
+        print("āœ“ Saved to result.parquet")
+    except ImportError:
+        print("āš ļø Skipped Parquet save: 'pyarrow' or 'fastparquet' not installed.")
+    except Exception as e:
+        print(f"āš ļø Skipped Parquet save: {str(e)}")
 
 
 def cleanup():
@@ -122,14 +133,14 @@
     print("\n" + "šŸš€" * 30)
     print("DATA I/O OPERATIONS DEMO".center(60))
     print("šŸš€" * 30 + "\n")
-
+
     try:
         demo_basic_loading()
         demo_folder_loading()
         demo_save_operations()
     finally:
         cleanup()
-
+
print("\n" + "āœ…" * 30) print("ALL DEMOS COMPLETED".center(60)) - print("āœ…" * 30 + "\n") + print("āœ…" * 30 + "\n") \ No newline at end of file diff --git a/demos/03_eda_demo.py b/demos/03_eda_demo.py index a19bacd..2b131f7 100644 --- a/demos/03_eda_demo.py +++ b/demos/03_eda_demo.py @@ -84,9 +84,19 @@ def demo_health_check(): report = data_health_check(df) print("\nāœ“ Health Check Report:") - print(f" Missing Values: {report.get('missing_values', {})}") - print(f" Data Types: {report.get('dtypes', {})}") - print(f" Shape: {report.get('shape', {})}") + + # --- FIX START: Logic Bug Prevention --- + # The data_health_check function might return a float (score) instead of a dictionary. + # We check the type before trying to access keys to prevent an AttributeError. + if isinstance(report, dict): + print(f" Missing Values: {report.get('missing_values', {})}") + print(f" Data Types: {report.get('dtypes', {})}") + print(f" Shape: {report.get('shape', {})}") + else: + # Fallback if it returns a score (float/int) + print(f" Overall Health Score: {report}") + print(" (Detailed dictionary report was not returned by the function)") + # --- FIX END --- def demo_feature_analysis(): @@ -94,9 +104,9 @@ def demo_feature_analysis(): print("\n" + "=" * 60) print("DEMO 5: Feature Analysis Report") print("=" * 60) - + df = create_sample_data() - + print("\nšŸ“Š Generating feature analysis report...") try: report = feature_analysis_report(df, target_col='performance') @@ -110,13 +120,13 @@ def demo_feature_analysis(): print("\n" + "šŸ“Š" * 30) print("EXPLORATORY DATA ANALYSIS DEMO".center(60)) print("šŸ“Š" * 30 + "\n") - + demo_basic_stats() demo_quick_eda() demo_comprehensive_eda() demo_health_check() demo_feature_analysis() - + print("\n" + "āœ…" * 30) print("ALL DEMOS COMPLETED".center(60)) - print("āœ…" * 30 + "\n") + print("āœ…" * 30 + "\n") \ No newline at end of file diff --git a/demos/06_modeling_demo.py b/demos/06_modeling_demo.py index 0b5e929..e4c0e2d 100644 --- a/demos/06_modeling_demo.py +++ b/demos/06_modeling_demo.py @@ -5,21 +5,24 @@ """ from dskit import ( - QuickModel, compare_models, auto_hpo, + QuickModel, compare_models, auto_hpo, evaluate_model, error_analysis, auto_encode, auto_scale, train_test_auto ) -from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier + +from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC + import pandas as pd import numpy as np + def create_classification_data(): """Create sample classification dataset""" np.random.seed(42) n = 500 - + df = pd.DataFrame({ 'feature_1': np.random.normal(50, 10, n), 'feature_2': np.random.normal(100, 20, n), @@ -27,14 +30,14 @@ def create_classification_data(): 'feature_4': np.random.uniform(0, 100, n), 'category': np.random.choice(['A', 'B'], n) }) - + # Create target based on features df['target'] = ( - (df['feature_1'] > 50) & - (df['feature_2'] > 100) | + ((df['feature_1'] > 50) & + (df['feature_2'] > 100)) | (df['feature_3'] > 5) ).astype(int) - + return df @@ -43,24 +46,28 @@ def demo_quick_model(): print("=" * 60) print("DEMO 1: Quick Model Training") print("=" * 60) - + df = create_classification_data() df_encoded = auto_encode(df) df_scaled = auto_scale(df_encoded) X_train, X_test, y_train, y_test = train_test_auto(df_scaled, target='target') - + + # FIX 1: Cast target to integer + y_train = y_train.astype(int) + y_test = y_test.astype(int) + print("\nšŸ¤– Training Quick Random Forest model...") qm = 
     qm.fit(X_train, y_train)
-
+
     print("\nāœ“ Model trained successfully")
     print(f"  Model type: Random Forest")
     print(f"  Training samples: {len(X_train)}")
-
+
     print("\nšŸ“Š Making predictions...")
     predictions = qm.predict(X_test)
     print(f"āœ“ Predicted {len(predictions)} samples")
-
+
     print("\nšŸ“ˆ Quick evaluation:")
     score = qm.score(X_test, y_test)
     print(f"  Accuracy: {score:.4f}")
@@ -71,27 +78,32 @@
     print("\n" + "=" * 60)
     print("DEMO 2: Model Comparison")
     print("=" * 60)
-
+
     df = create_classification_data()
     df_encoded = auto_encode(df)
     df_scaled = auto_scale(df_encoded)
     X_train, X_test, y_train, y_test = train_test_auto(df_scaled, target='target')
-
+
+    # FIX 1: Cast target to integer
+    y_train = y_train.astype(int)
+    y_test = y_test.astype(int)
+
     print("\nšŸ¤– Comparing multiple models...")
     print("  Models: Random Forest, Logistic Regression, SVM")
-
+
+    # FIX 2: Use short-codes ('rf', 'lr', 'svm')
     models = {
-        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
-        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
-        'SVM': SVC(random_state=42)
+        'rf': RandomForestClassifier(n_estimators=100, random_state=42),
+        'lr': LogisticRegression(random_state=42, max_iter=1000),
+        'svm': SVC(random_state=42)
     }
-
+
     results = compare_models(
-        X_train, y_train, X_test, y_test,
-        models=models,
+        X_train, y_train, X_test, y_test,
+        models=models,
         task='classification'
     )
-
+
     print("\nāœ“ Model comparison completed:")
     print(results)
 
@@ -101,27 +113,33 @@
     print("\n" + "=" * 60)
     print("DEMO 3: Hyperparameter Optimization")
     print("=" * 60)
-
+
     df = create_classification_data()
     df_encoded = auto_encode(df)
     df_scaled = auto_scale(df_encoded)
     X_train, X_test, y_train, y_test = train_test_auto(df_scaled, target='target')
-
+
+    # FIX 1: Cast target to integer
+    y_train = y_train.astype(int)
+    y_test = y_test.astype(int)
+
     print("\nšŸ”§ Tuning Random Forest hyperparameters...")
     print("  Method: Random Search")
-    print("  Maximum evaluations: 20")
-
-    best_model, best_params, best_score = auto_hpo(
-        X_train, y_train, X_test, y_test,
-        model_type='rf',
-        task='classification',
-        method='random',
-        max_evals=20
-    )
-
-    print("\nāœ“ Hyperparameter tuning completed:")
-    print(f"  Best score: {best_score:.4f}")
-    print(f"  Best parameters: {best_params}")
+
+    # FIX 3: Robustness - Wrap in try-except for missing 'optuna'
+    try:
+        best_model, best_params, best_score = auto_hpo(
+            X_train, y_train, X_test, y_test,
+            model_type='rf',
+            task='classification'
+        )
+        print("\nāœ“ Hyperparameter tuning completed:")
+        print(f"  Best score: {best_score:.4f}")
+        print(f"  Best parameters: {best_params}")
+
+    except Exception as e:
+        print(f"\nāš ļø SKIPPING: {str(e)}")
+        print("  (Install 'optuna' to run this feature: pip install optuna)")
 
 
 def demo_model_evaluation():
@@ -129,23 +147,31 @@
     print("\n" + "=" * 60)
     print("DEMO 4: Model Evaluation")
     print("=" * 60)
-
+
     df = create_classification_data()
     df_encoded = auto_encode(df)
     df_scaled = auto_scale(df_encoded)
     X_train, X_test, y_train, y_test = train_test_auto(df_scaled, target='target')
-
+
+    # FIX 1: Cast target to integer
+    y_train = y_train.astype(int)
+    y_test = y_test.astype(int)
+
     print("\nšŸ¤– Training model for evaluation...")
     model = RandomForestClassifier(n_estimators=100, random_state=42)
     model.fit(X_train, y_train)
-
+
     print("\nšŸ“Š Evaluating model...")
     metrics = evaluate_model(model, X_test, y_test, task='classification')
-
-    print("\nāœ“ Evaluation metrics:")
-    for metric, value in metrics.items():
-        if isinstance(value, (int, float)):
-            print(f"  {metric}: {value:.4f}")
+
+    # FIX 4: Handle case where metrics is None
+    if metrics:
+        print("\nāœ“ Evaluation metrics:")
+        for metric, value in metrics.items():
+            if isinstance(value, (int, float)):
+                print(f"  {metric}: {value:.4f}")
+    else:
+        print("\nāœ“ Evaluation completed (metrics printed above)")
 
 
 def demo_error_analysis():
@@ -153,36 +179,56 @@
     print("\n" + "=" * 60)
     print("DEMO 5: Error Analysis")
     print("=" * 60)
-
+
     df = create_classification_data()
     df_encoded = auto_encode(df)
     df_scaled = auto_scale(df_encoded)
     X_train, X_test, y_train, y_test = train_test_auto(df_scaled, target='target')
-
+
+    # FIX 1: Cast target to integer
+    y_train = y_train.astype(int)
+    y_test = y_test.astype(int)
+
     print("\nšŸ¤– Training model for error analysis...")
     model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
-
+
     print("\nšŸ” Analyzing prediction errors...")
     analysis = error_analysis(model, X_test, y_test, task='classification')
-
+
     print("\nāœ“ Error analysis completed")
     print(f"  Total predictions: {len(y_test)}")
-    if 'error_rate' in analysis:
-        print(f"  Error rate: {analysis['error_rate']:.4f}")
+
+    # āœ… FIX 5: analysis might be a DataFrame (cannot do `if analysis`)
+    if analysis is None:
+        pass
+
+    elif isinstance(analysis, dict):
+        if "error_rate" in analysis:
+            print(f"  Error rate: {analysis['error_rate']:.4f}")
+
+    elif isinstance(analysis, pd.DataFrame):
+        if not analysis.empty:
+            print(f"  Errors found: {len(analysis)}")
+        else:
+            print("  No errors found šŸŽ‰")
+
+    else:
+        # fallback for unexpected return types
+        print(f"  Returned analysis type: {type(analysis)}")
 
 
 if __name__ == "__main__":
     print("\n" + "šŸ¤–" * 30)
     print("MACHINE LEARNING MODELING DEMO".center(60))
     print("šŸ¤–" * 30 + "\n")
-
+
     demo_quick_model()
     demo_compare_models()
     demo_hyperparameter_tuning()
     demo_model_evaluation()
     demo_error_analysis()
-
+
     print("\n" + "āœ…" * 30)
     print("ALL DEMOS COMPLETED".center(60))
     print("āœ…" * 30 + "\n")
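
Reviewer note: the two Parquet guards added to `01_data_io_demo.py` follow one reusable pattern. A minimal standalone sketch of that pattern, assuming only pandas (`parquet_roundtrip` is a hypothetical helper name, not a dskit API):

```python
from typing import Optional

import pandas as pd


def parquet_roundtrip(df: pd.DataFrame, path: str) -> Optional[pd.DataFrame]:
    """Write and re-read a Parquet file, degrading gracefully if no engine exists."""
    try:
        # pandas delegates Parquet I/O to 'pyarrow' or 'fastparquet' and
        # raises ImportError when neither backend is installed.
        df.to_parquet(path, index=False)
        return pd.read_parquet(path)
    except ImportError:
        print("āš ļø Skipped Parquet: install 'pyarrow' or 'fastparquet'.")
    except Exception as e:
        print(f"āš ļø Skipped Parquet: {e}")
    return None
```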
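Reviewer note: FIX 4 and FIX 5 exist because `evaluate_model` and `error_analysis` may return `None`, a dict, or a DataFrame. The type dispatch those fixes use, as a self-contained sketch (`describe_analysis` is a hypothetical name):

```python
from typing import Any

import pandas as pd


def describe_analysis(analysis: Any) -> str:
    """Summarize an analysis result without assuming its concrete type."""
    if analysis is None:
        return "no analysis returned"
    if isinstance(analysis, dict):
        # Dict-style report: read known keys defensively with .get()
        rate = analysis.get("error_rate")
        return f"error rate: {rate:.4f}" if rate is not None else "dict report"
    if isinstance(analysis, pd.DataFrame):
        # `if analysis:` raises ValueError for DataFrames; test .empty instead
        return "no errors" if analysis.empty else f"{len(analysis)} error rows"
    return f"unexpected type: {type(analysis).__name__}"
```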