Waste-Image-Classifier/eda_utils.py at master · ana-morais57/Waste-Image-Classifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import shutil
import matplotlib.pyplot as plt
from PIL import Image
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split
import random

def create_directory(path):
    """Create a directory (and any missing parents) if it doesn't exist.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    # exist_ok=True replaces the previous "check os.path.exists, then create"
    # sequence, which could fail if the directory appeared between the check
    # and the makedirs() call, and which silently did nothing when ``path``
    # was an existing regular file.
    os.makedirs(path, exist_ok=True)

def copy_images(source_folder, target_folder, extensions=('.png',)):
    """Copy image files from the source folder into the target folder.

    Only files located directly inside ``source_folder`` are considered
    (sub-folders are not traversed). If ``source_folder`` does not exist the
    function returns without doing anything.

    Args:
        source_folder: Folder to copy images from.
        target_folder: Folder to copy images into. It is created when missing.
        extensions: A file-name suffix, or an iterable of suffixes, selecting
            which files are copied. Matching is case-sensitive. Defaults to
            PNG files only, which is what this function always copied.
    """
    if not os.path.exists(source_folder):
        return

    # str.endswith accepts a str or a tuple, so normalise other iterables.
    suffixes = (extensions,) if isinstance(extensions, str) else tuple(extensions)

    # shutil.copy2 raises FileNotFoundError when the destination directory
    # is missing, so make sure it exists before copying.
    os.makedirs(target_folder, exist_ok=True)

    for file_name in os.listdir(source_folder):
        if file_name.endswith(suffixes):
            source_file = os.path.join(source_folder, file_name)
            target_file = os.path.join(target_folder, file_name)
            shutil.copy2(source_file, target_file)

def analyze_folder(folder_path):
    """Walk a folder tree and collect basic statistics about its PNG files.

    Returns:
        A ``(file_counts, image_sizes)`` tuple. ``file_counts`` is a Counter
        mapping each directory that contains ``.png`` files to how many it
        holds; ``image_sizes`` is a list with the ``size`` of every PNG that
        could be opened. Files that fail to open are still counted, and the
        failure is reported on stdout.
    """
    per_dir_counts = Counter()
    sizes = []

    for current_dir, _subdirs, names in os.walk(folder_path):
        png_names = [name for name in names if name.endswith('.png')]
        if not png_names:
            # Skip so directories without PNGs never get a zero-count entry.
            continue

        per_dir_counts[current_dir] += len(png_names)
        for name in png_names:
            image_path = os.path.join(current_dir, name)
            try:
                with Image.open(image_path) as img:
                    sizes.append(img.size)
            except Exception as e:
                print(f"Error reading image {image_path}: {e}")

    return per_dir_counts, sizes

def plot_image_size_distribution(image_sizes):
    """Plot overlaid histograms (with KDE curves) of image widths and heights.

    Args:
        image_sizes: Iterable of ``(width, height)`` pairs, e.g. the size
            list returned by ``analyze_folder``. When it is empty a message
            is printed and nothing is plotted.
    """
    image_sizes = list(image_sizes)
    if not image_sizes:
        # zip(*[]) produces nothing, so the two-name unpacking below would
        # raise ValueError for a folder without readable images.
        print("No image sizes to plot.")
        return

    widths, heights = zip(*image_sizes)
    plt.figure(figsize=(12, 6))
    sns.histplot(widths, kde=True, color='blue', label='Width')
    sns.histplot(heights, kde=True, color='orange', label='Height')
    plt.xlabel('Pixels')
    plt.ylabel('Frequency')
    plt.title('Image Size Distribution')
    plt.legend()
    plt.show()

def eda(recyclable_folder=None, household_folder=None):
    """Perform exploratory data analysis on the dataset.

    For each of the two dataset folders, prints the per-directory image
    counts from ``analyze_folder`` and then plots the image size distribution
    with ``plot_image_size_distribution``.

    Args:
        recyclable_folder: Path of the recyclable-waste folder. When omitted,
            the module-level name ``RECYCLABLE_FOLDER`` is used.
        household_folder: Path of the household-waste folder. When omitted,
            the module-level name ``HOUSEHOLDWASTE_FOLDER`` is used.

    Raises:
        NameError: If a folder is omitted and the corresponding module-level
            name has not been defined.
    """
    # NOTE(review): neither fallback constant is defined in the visible part
    # of this module, so passing the folders explicitly is the reliable way
    # to call this function.
    if recyclable_folder is None:
        recyclable_folder = RECYCLABLE_FOLDER
    if household_folder is None:
        household_folder = HOUSEHOLDWASTE_FOLDER

    print("Analyzing Recyclable Folder...")
    recyclable_counts, recyclable_sizes = analyze_folder(recyclable_folder)
    print("Recyclable counts by folder:", recyclable_counts)

    print("Analyzing Household Waste Folder...")
    household_counts, household_sizes = analyze_folder(household_folder)
    print("Household waste counts by folder:", household_counts)

    print("Plotting image size distribution for Recyclable...")
    plot_image_size_distribution(recyclable_sizes)

    print("Plotting image size distribution for Household Waste...")
    plot_image_size_distribution(household_sizes)

def check_image_formats(folder):
    """Collect the PIL mode of every readable image under ``folder``.

    The folder is walked recursively. Files whose extension is ``.jpg``,
    ``.jpeg`` or ``.png`` (in any letter case) are opened with PIL; files
    that cannot be opened are reported on stdout and left out of the result.

    Args:
        folder: Root folder to scan.

    Returns:
        A dict mapping each readable image's path to its ``mode`` string.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    formats = {}
    for root, _, files in os.walk(folder):
        for file in files:
            # Compare lower-cased names against dotted suffixes so that
            # 'photo.JPG' is included and a name such as 'notapng' is not.
            if not file.lower().endswith(image_extensions):
                continue
            file_path = os.path.join(root, file)
            try:
                with Image.open(file_path) as img:
                    formats[file_path] = img.mode  # e.g., 'RGB', 'L' (grayscale)
            except Exception as e:
                print(f"Corrupted or unreadable image: {file_path} - {e}")
    return formats

def check_corrupted_images(folder):
    """Return the paths of image files under ``folder`` that fail verification.

    The folder is walked recursively. Files whose extension is ``.jpg``,
    ``.jpeg`` or ``.png`` (in any letter case) are opened with PIL and passed
    through ``verify()``; any file for which opening or verifying raises is
    reported as corrupted.

    Args:
        folder: Root folder to scan.

    Returns:
        A list of paths of the files that could not be opened or verified.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    corrupted = []
    for root, _, files in os.walk(folder):
        for file in files:
            # Compare lower-cased names against dotted suffixes so that
            # 'photo.JPG' is checked and a name such as 'notapng' is not.
            if not file.lower().endswith(image_extensions):
                continue
            file_path = os.path.join(root, file)
            try:
                with Image.open(file_path) as img:
                    img.verify()  # Verify image integrity
            except Exception:
                # Any failure to open or verify marks the file as corrupted.
                corrupted.append(file_path)
    return corrupted

def split_dataset(source_folder, train_folder, val_folder, test_folder, test_size=0.15, val_size=0.15):
    """Split a class-per-subfolder dataset into train, validation, and test sets.

    Every sub-directory of ``source_folder`` is treated as one category. Its
    ``.png`` files are split with ``train_test_split`` (``random_state=42``)
    and copied into ``<train|val|test>_folder/<category>/``. Entries of
    ``source_folder`` that are not directories, and categories without PNG
    files, are skipped.

    Args:
        source_folder: Folder containing one sub-folder per category.
        train_folder: Destination root for the training images.
        val_folder: Destination root for the validation images.
        test_folder: Destination root for the test images.
        test_size: Fraction of each category assigned to the test set.
        val_size: Fraction of each category assigned to the validation set.

    Raises:
        ValueError: Presumably raised by ``train_test_split`` when a category
            has too few images to produce non-empty splits — confirm against
            the scikit-learn documentation.
    """
    # Defined once here rather than being re-created for every category.
    def copy_files(file_list, target_folder, category_name):
        """Copy the given files into ``target_folder/category_name``."""
        target_category_path = os.path.join(target_folder, category_name)
        os.makedirs(target_category_path, exist_ok=True)
        for img_path in file_list:
            shutil.copy(img_path, target_category_path)

    holdout_size = test_size + val_size

    for category in sorted(os.listdir(source_folder)):
        category_path = os.path.join(source_folder, category)
        if not os.path.isdir(category_path):
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded split select the same files on every run and platform.
        images = sorted(
            os.path.join(category_path, img)
            for img in os.listdir(category_path)
            if img.endswith('.png')
        )
        print(f"Found {len(images)} images in category: {category}")

        if not images:
            print(f"No images found in {category}. Skipping...")
            continue

        # First carve off the combined val+test share, then divide that share
        # so the test set ends up with ``test_size`` of the whole category.
        train_imgs, temp_imgs = train_test_split(images, test_size=holdout_size, random_state=42)
        val_imgs, test_imgs = train_test_split(temp_imgs, test_size=test_size / holdout_size, random_state=42)

        # Copy images into respective folders
        copy_files(train_imgs, train_folder, category)
        copy_files(val_imgs, val_folder, category)
        copy_files(test_imgs, test_folder, category)


def visualize_random_images(folder, num_samples=9):
    """Display a grid of randomly chosen images found under ``folder``.

    The folder is walked recursively and files whose extension is ``.jpg``,
    ``.jpeg`` or ``.png`` (in any letter case) are candidates. At most
    ``num_samples`` of them are drawn without replacement and shown on a
    square grid of subplots.

    Args:
        folder: Root folder to search for images.
        num_samples: Maximum number of images to display.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    all_images = []
    for root, _, files in os.walk(folder):
        all_images.extend(
            os.path.join(root, file)
            for file in files
            if file.lower().endswith(image_extensions)
        )

    sample_images = random.sample(all_images, min(num_samples, len(all_images)))
    if not sample_images:
        # Nothing to draw; avoid popping up an empty figure.
        print("No images to display.")
        return

    # Keep the 3x3 layout for up to nine images and grow the square grid only
    # when more are requested; a fixed 3x3 grid rejects a tenth subplot.
    grid_side = 3
    while grid_side * grid_side < len(sample_images):
        grid_side += 1

    plt.figure(figsize=(10, 10))
    for i, img_path in enumerate(sample_images):
        plt.subplot(grid_side, grid_side, i + 1)
        # Open inside a context manager so the file handle is released as
        # soon as the image has been handed to matplotlib.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.axis('off')
    plt.show()