DynamicEarth/eval/evaluate.py at main · likyoo/DynamicEarth · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
Change Detection Evaluation Metric Calculation

This script implements a standardized evaluation pipeline for change detection tasks,
calculating common metrics including mIoU, OA, F1-score, etc.

python eval/evaluate.py --gt [GROUND_TRUTH_DIR] --pred [PREDICTION_DIR] --threshold [0.5]

"""

import os
from typing import Dict, Optional
import argparse

import cv2
import numpy as np
from tqdm import tqdm


class ChangeDetectionMetrics:
    """
    Change Detection Evaluation Metric Calculator

    Attributes:
        threshold (float): Binarization threshold (0-1 scale)
        eps (float): Numerical stability constant
        tp (float): Accumulated true positives
        tn (float): Accumulated true negatives
        fp (float): Accumulated false positives
        fn (float): Accumulated false negatives
        results (dict): Dictionary storing final evaluation metrics

    Methods:
        reset(): Resets all accumulators
        update(): Updates metric calculations with new batch
        compute(): Computes and returns all metrics
    """

    def __init__(self, threshold: float = 0.5) -> None:
        """
        Initialize metric calculator

        Args:
            threshold: Binarization threshold (0-1 scale), default 0.5
        """
        self.threshold = threshold * 255.0  # Convert to pixel value
        self.eps = 1e-7  # Numerical stability constant

        # Initialize accumulators
        self.tp = 0.0
        self.tn = 0.0
        self.fp = 0.0
        self.fn = 0.0

        # Store final results
        self.results: Optional[Dict[str, float]] = None

    def reset(self) -> None:
        """Resets all accumulators to zero"""
        self.tp = 0.0
        self.tn = 0.0
        self.fp = 0.0
        self.fn = 0.0

    def update(self, prediction: np.ndarray, target: np.ndarray) -> None:
        """
        Update metrics with new data pair

        Args:
            prediction: Model prediction (grayscale image, 0-255)
            target: Ground truth (grayscale image, 0-255)
        """
        # Convert to binary masks
        pred_binary = (prediction > self.threshold)
        target_binary = (target > self.threshold)

        # Update confusion matrix elements
        self.tp += np.sum(pred_binary & target_binary)
        self.tn += np.sum(~pred_binary & ~target_binary)
        self.fp += np.sum(pred_binary & ~target_binary)
        self.fn += np.sum(~pred_binary & target_binary)

    def compute(self) -> Dict[str, float]:
        """Compute and return all evaluation metrics"""
        # Calculate IoU for both classes
        iou_change = self.tp / (self.tp + self.fp + self.fn + self.eps)
        iou_nochange = self.tn / (self.tn + self.fp + self.fn + self.eps)

        # Calculate mean IoU
        miou = 0.5 * (iou_change + iou_nochange)

        # Calculate overall accuracy
        oa = (self.tp + self.tn) / (self.tp + self.tn + self.fp + self.fn + self.eps)

        # Calculate precision/recall/F1-score
        precision = self.tp / (self.tp + self.fp + self.eps)
        recall = self.tp / (self.tp + self.fn + self.eps)
        f1_score = (2 * precision * recall) / (precision + recall + self.eps)

        # Organize results
        self.results = {
            'miou': miou,
            'oa': oa,
            'iou_change': iou_change,
            'iou_nochange': iou_nochange,
            'f1_score_change': f1_score,
            'precision_change': precision,
            'recall_change': recall
        }
        return self.results


def evaluate_metrics(
    ground_truth_dir: str,
    prediction_dir: str,
    threshold: float = 0.5
) -> Dict[str, float]:
    """
    Execute full evaluation pipeline

    Args:
        ground_truth_dir: Path to ground truth directory
        prediction_dir: Path to prediction directory
        threshold: Binarization threshold (0-1 scale)

    Returns:
        Dictionary containing evaluation metrics
    """
    # Initialize metric calculator
    metric_calculator = ChangeDetectionMetrics(threshold=threshold)

    # Get sorted file list
    image_list = sorted(os.listdir(prediction_dir))

    # Process all image pairs
    for filename in tqdm(image_list, desc="Processing Images"):
        # Construct file paths
        pred_path = os.path.join(prediction_dir, filename)
        gt_path = os.path.join(ground_truth_dir, filename)

        try:
            # Read images as grayscale
            pred = cv2.imread(pred_path, cv2.IMREAD_GRAYSCALE)
            gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)

            # Validate inputs
            if pred is None or gt is None:
                raise ValueError(f"Invalid image file: {filename}")
            if pred.shape != gt.shape:
                raise ValueError(f"Size mismatch: {filename}")

            # Update metrics
            metric_calculator.update(pred, gt)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

    # Return computed metrics
    return metric_calculator.compute()


if __name__ == "__main__":
    # Configure argument parser
    parser = argparse.ArgumentParser(description="Change Detection Evaluation")
    parser.add_argument('--gt', type=str, required=True, help="Path to ground truth directory")
    parser.add_argument('--pred', type=str, required=True, help="Path to prediction directory")
    parser.add_argument('--threshold', type=float, default=0.5, help="Binarization threshold (0-1)")
    args = parser.parse_args()

    # Run evaluation
    results = evaluate_metrics(
        ground_truth_dir=args.gt,
        prediction_dir=args.pred,
        threshold=args.threshold
    )

    # Print formatted results
    print("\nEvaluation Results:")
    for metric, value in results.items():
        print(f"{metric:15}: {value:.4f}")