"""Load .exnode point-cloud files and explore them via 3D plots and clustering.

An .exnode file is plain text in which a line ``Node: <id>`` opens a record
that is followed by four numeric lines (x, y, z, avg_intensity).  This module
loads all such files from an ``inputs`` folder into a pandas DataFrame and
offers interactive/static 3D scatter plots plus K-means, DBSCAN and
agglomerative clustering over the points.
"""

# system / path handling
import os
from pathlib import Path

# data analysis
import pandas as pd

# 3D interactive plotting
import plotly.express as px

# static plotting (importing Axes3D registers the '3d' projection)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# clustering
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage


def _append_node(points, file_name, node_id, values, last=False):
    """Append one completed node record to *points*.

    Warns and skips the node when it does not carry exactly the four expected
    values (x, y, z, avg_intensity).  *last* only changes the warning wording
    for the final node of a file.
    """
    if len(values) == 4:
        points.append({
            'file_source': file_name,
            'node_id': node_id,
            'x': values[0],
            'y': values[1],
            'z': values[2],
            'avg_intensity': values[3],
        })
    elif last:
        print(f"Warning in {file_name}: Last node {node_id} has {len(values)} values, expected 4. Skipping this node's data.")
    else:
        print(f"Warning in {file_name}: Node {node_id} has {len(values)} values, expected 4. Skipping this node's data.")


def _parse_exnode_file(file_path_full, file_name):
    """Parse a single .exnode file into a list of point dicts.

    Returns an empty list (after printing an error) if the file is missing.
    Non-numeric data lines inside a node block are warned about and skipped.
    """
    data_points = []
    current_node_id = None
    current_node_values = []  # x, y, z, avg_intensity collected in order

    try:
        with open(file_path_full, 'r') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()

                if line.startswith('Node:'):
                    # Flush the previous node (if any) before starting a new one.
                    if current_node_id is not None:
                        _append_node(data_points, file_name, current_node_id, current_node_values)
                    current_node_id = int(line.split(':')[1].strip())
                    current_node_values = []
                elif current_node_id is not None and line != '':
                    # Inside a node's data block: each non-empty line should be a float.
                    try:
                        current_node_values.append(float(line))
                    except ValueError:
                        print(f"Warning in {file_name}: Non-numeric or unexpected data '{line}' found for Node {current_node_id} at line {line_num}. Skipping value.")
    except FileNotFoundError:
        print(f"Error: File {file_path_full} not found. Skipping.")

    # The loop only flushes on seeing the *next* 'Node:' line, so the last
    # node of the file must be flushed here.
    if current_node_id is not None:
        _append_node(data_points, file_name, current_node_id, current_node_values, last=True)

    return data_points


def load_exnode_data(folder_path_arg=None):
    """Read every .exnode file from an ``inputs`` folder into one DataFrame.

    Args:
        folder_path_arg: Optional path whose *parent* directory is expected to
            contain an ``inputs`` folder.  When None, the parent of the
            current working directory is used instead.

    Returns:
        pd.DataFrame with columns file_source, node_id, x, y, z,
        avg_intensity; empty when no .exnode files (or no valid nodes) exist.
    """
    if folder_path_arg is None:
        effective_folder_path = Path(os.getcwd()).parent / "inputs"
    else:
        effective_folder_path = Path(folder_path_arg).parent / "inputs"

    print(f"Using data folder path: {effective_folder_path}")

    all_data_points = []

    # Get a list of all .exnode files in the folder.
    file_lists = [f for f in os.listdir(effective_folder_path) if f.endswith('.exnode')]

    if not file_lists:
        print(f"No .exnode files found in {effective_folder_path}")
        return pd.DataFrame()  # Empty DataFrame when there is nothing to read
    print(f"Found {len(file_lists)} .exnode files: {file_lists}")

    for file_name in file_lists:
        file_path_full = os.path.join(effective_folder_path, file_name)
        print(f"\nProcessing file: {file_name}")

        data_points_current_file = _parse_exnode_file(file_path_full, file_name)

        all_data_points.extend(data_points_current_file)
        print(f"Finished processing {file_name}. Added {len(data_points_current_file)} data points.")

    df = pd.DataFrame(all_data_points)

    # Display a quick summary of what was loaded.
    print("\n--- Combined DataFrame Info ---")
    if not df.empty:
        print("First 5 data points:")
        print(df.head())
        print("last 5 data points:")
        print(df.tail())
        print(f"\nTotal data points read from all files: {len(df)}")
    else:
        print("No data points were successfully read.")

    return df


def plot_plotly_3d_scatter(df, x_col='x', y_col='y', z_col='z', color_col='avg_intensity', title='Interactive 3D Visualization of Data Points by Intensity (Plotly)'):
    """
    Generates an interactive 3D scatter plot using plotly.express.

    Args:
        df (pd.DataFrame): The input DataFrame.
        x_col (str): The column name for the x-axis. Defaults to 'x'.
        y_col (str): The column name for the y-axis. Defaults to 'y'.
        z_col (str): The column name for the z-axis. Defaults to 'z'.
        color_col (str): The column name to use for coloring the points. Defaults to 'avg_intensity'.
        title (str): The title of the plot.
    """
    if df.empty:
        print("DataFrame is empty. No data to visualize interactively.")
        return

    fig_interactive = px.scatter_3d(df,
                                    x=x_col,
                                    y=y_col,
                                    z=z_col,
                                    color=color_col,
                                    title=title)
    fig_interactive.show()


def plot_mpl(df, x_col='x', y_col='y', z_col='z', color_col='avg_intensity', title='3D Visualization of Data Points by Intensity (Matplotlib)'):
    """
    Generates a static 3D scatter plot using matplotlib.

    Args:
        df (pd.DataFrame): The input DataFrame.
        x_col (str): The column name for the x-axis. Defaults to 'x'.
        y_col (str): The column name for the y-axis. Defaults to 'y'.
        z_col (str): The column name for the z-axis. Defaults to 'z'.
        color_col (str): The column name to use for coloring the points. Defaults to 'avg_intensity'.
        title (str): The title of the plot.
    """
    if df.empty:
        print("DataFrame is empty. No data to visualize.")
        return

    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    scatter = ax.scatter(df[x_col], df[y_col], df[z_col], c=df[color_col], cmap='viridis', s=5)

    ax.set_xlabel('X Coordinate')
    ax.set_ylabel('Y Coordinate')
    ax.set_zlabel('Z Coordinate')
    ax.set_title(title)

    # Add a color bar keyed to the intensity values.
    cbar = fig.colorbar(scatter, ax=ax, pad=0.1)
    cbar.set_label('Average Intensity')

    plt.show()


def perform_and_plot_all_kmeans_clusters(dataframe, n_clusters=5):
    """
    Performs K-means clustering for various feature sets and visualizes the results.

    Adds the columns 'intensity_cluster_label', 'xyz_cluster_label' and
    'xyzi_cluster_label' to *dataframe* in place.

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the data.
        n_clusters (int): The number of clusters to use for K-means.
    """
    print(f"Performing K-means clustering with {n_clusters} clusters...")

    # 1. K-means on average intensity only.
    kmeans_intensity = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    dataframe['intensity_cluster_label'] = kmeans_intensity.fit_predict(dataframe[['avg_intensity']])
    print("Intensity K-means clustering completed.")
    plot_plotly_3d_scatter(dataframe, x_col='x', y_col='y', z_col='z',
                           color_col='intensity_cluster_label',
                           title=f'K-means Clustering ({n_clusters} Clusters - Avg Intensity Only)')

    # 2. K-means on spatial coordinates only.
    kmeans_xyz = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    dataframe['xyz_cluster_label'] = kmeans_xyz.fit_predict(dataframe[['x', 'y', 'z']])
    print("Location (X,Y,Z) K-means clustering completed.")
    plot_plotly_3d_scatter(dataframe, x_col='x', y_col='y', z_col='z',
                           color_col='xyz_cluster_label',
                           title=f'K-means Clustering ({n_clusters} Clusters - X, Y, Z Coordinates Only)')

    # 3. K-means on coordinates + intensity combined; features are standardized
    #    first so intensity and position contribute on comparable scales.
    kmeans_xyzi = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    features_xyzi = dataframe[['x', 'y', 'z', 'avg_intensity']]
    scaler_xyzi = StandardScaler()
    scaled_features_xyzi_kmean = scaler_xyzi.fit_transform(features_xyzi)
    dataframe['xyzi_cluster_label'] = kmeans_xyzi.fit_predict(scaled_features_xyzi_kmean)
    print("Combined (X,Y,Z,Intensity) K-means clustering completed.")
    plot_plotly_3d_scatter(dataframe, x_col='x', y_col='y', z_col='z',
                           color_col='xyzi_cluster_label',
                           title=f'K-means Clustering ({n_clusters} Clusters - X, Y, Z, and Avg Intensity)')

    print("All K-means clustering and plotting processes finished.")


def perform_and_plot_all_dbscan_clusters(dataframe, eps_val=0.5, min_samples_val=8):
    """
    Performs DBSCAN clustering for the combined feature set and visualizes the results.

    Adds the column 'dbscan_xyzi_cluster_label' to *dataframe* in place
    (label -1 marks noise points).

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the data.
        eps_val (float): The maximum distance between two samples for one to be considered
                         as in the neighborhood of the other.
        min_samples_val (int): The number of samples (or total weight) in a neighborhood for
                               a point to be considered as a core point.
    """
    print(f"Performing DBSCAN clustering with eps={eps_val} and min_samples={min_samples_val}...")

    # DBSCAN on standardized coordinates + intensity.
    features_xyzi = dataframe[['x', 'y', 'z', 'avg_intensity']]
    scaler_xyzi = StandardScaler()
    scaled_features_xyzi = scaler_xyzi.fit_transform(features_xyzi)
    dbscan_xyzi = DBSCAN(eps=eps_val, min_samples=min_samples_val)
    dataframe['dbscan_xyzi_cluster_label'] = dbscan_xyzi.fit_predict(scaled_features_xyzi)
    print("DBSCAN (X,Y,Z,Intensity) clustering completed.")

    # DBSCAN labels noise as -1, so exclude it from the cluster count.
    num_clusters_xyzi = len(set(dataframe['dbscan_xyzi_cluster_label'])) - (1 if -1 in dataframe['dbscan_xyzi_cluster_label'].values else 0)
    num_noise_points_xyzi = (dataframe['dbscan_xyzi_cluster_label'] == -1).sum()
    print(f"Number of clusters (Combined): {num_clusters_xyzi}, Noise points: {num_noise_points_xyzi}")
    plot_plotly_3d_scatter(dataframe, x_col='x', y_col='y', z_col='z',
                           color_col='dbscan_xyzi_cluster_label',
                           title=f'DBSCAN Clustering (eps={eps_val}, min_samples={min_samples_val} - X, Y, Z, and Avg Intensity Combined)')

    print("All DBSCAN clustering and plotting processes finished.")


def perform_and_plot_all_agglomerative_clusters(dataframe, n_clusters=5, linkage='ward'):
    """
    Performs Agglomerative Clustering for the combined feature set and visualizes the results.

    Adds the column 'agglomerative_cluster_label_<n_clusters>' to *dataframe*
    in place.  NOTE: the *linkage* parameter intentionally keeps its public
    name for callers, even though it shadows scipy's ``linkage`` inside this
    function (which does not use it).

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the data.
        n_clusters (int): The number of clusters to form.
        linkage (str): Which linkage criterion to use. E.g., 'ward', 'complete', 'average', 'single'.
    """
    print(f"Performing Agglomerative Clustering with {n_clusters} clusters and linkage='{linkage}'...")

    features_agglomerative = dataframe[['x', 'y', 'z', 'avg_intensity']]

    # Standardize so all four features contribute on comparable scales.
    scaler = StandardScaler()
    scaled_features_agglomerative = scaler.fit_transform(features_agglomerative)

    agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    dataframe[f'agglomerative_cluster_label_{n_clusters}'] = agg_clustering.fit_predict(scaled_features_agglomerative)

    print("Agglomerative Clustering applied using 'x', 'y', 'z', and 'avg_intensity' features.")
    num_clusters_agglomerative = len(set(dataframe[f'agglomerative_cluster_label_{n_clusters}']))
    print("Number of clusters found:", num_clusters_agglomerative)

    plot_plotly_3d_scatter(dataframe, x_col='x', y_col='y', z_col='z',
                           color_col=f'agglomerative_cluster_label_{n_clusters}',
                           title=f'Agglomerative Clustering ({n_clusters} Clusters, Linkage: {linkage} - X, Y, Z, and Avg Intensity)')

    print("Agglomerative clustering and plotting process finished.")


def plot_agglomerative_dendrogram(dataframe, linkage_method='ward', p_val=30):
    """
    Generates and plots a dendrogram for Agglomerative Clustering using combined features.

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the data.
        linkage_method (str): Which linkage criterion to use for the dendrogram.
        p_val (int): The number of last merged clusters to show when truncating the dendrogram.
    """
    # BUG FIX: this message previously interpolated the scipy `linkage`
    # function object instead of the `linkage_method` parameter.
    print(f"\nGenerating dendrogram for Combined (X,Y,Z,Intensity) Agglomerative Clustering with linkage='{linkage_method}'...")

    features_xyzi = dataframe[['x', 'y', 'z', 'avg_intensity']]
    scaler_xyzi = StandardScaler()
    scaled_features_xyzi = scaler_xyzi.fit_transform(features_xyzi)

    linkage_matrix = linkage(scaled_features_xyzi, method=linkage_method)

    plt.figure(figsize=(15, 7))
    plt.title(f'Hierarchical Clustering Dendrogram (Linkage: {linkage_method} - X, Y, Z, Avg Intensity)')
    plt.xlabel('Sample Index or Cluster Size')
    plt.ylabel('Distance')
    dendrogram(
        linkage_matrix,
        leaf_rotation=90.,       # rotates the x axis labels
        leaf_font_size=8.,       # font size for the x axis labels
        truncate_mode='lastp',   # show only the last p merged clusters
        p=p_val,
        show_leaf_counts=True
    )
    plt.show()
    print("Dendrogram generated.")