################
#
# detection_api.py
#
# Internal API for running the FasterRCNN framework for the wildlife classification project.
#
# The Detector class is the external entry point.
#
################
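
# Example usage (an illustrative sketch, not part of the original module header;
# the checkpoint path and image name below are placeholders):
#
#   det = Detector('checkpoints/<trained-model-file>', useGPU=True)
#   bboxes, confs = det.predict_image(PIL.Image.open('photo.jpg'), topk=5)
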
import os
import matplotlib
from tqdm import tqdm
import numpy as np
from model import FasterRCNNVGG16
from trainer import FasterRCNNTrainer
from utils.config import opt
import data.dataset
import data.util
import torch
import torch.utils.data
from torch.autograd import Variable
from torch.utils import data as data_
import torchvision.transforms as transforms
from utils import array_tool as at
from utils.vis_tool import visdom_bbox
import PIL
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFont


class Detector:

    def __init__(self, model_path, useGPU, n_fg_classes=1):
        '''
        Creates a new detection model using the weights stored in the file
        MODEL_PATH and initializes the GPU if USEGPU is set to true.

        MODEL_PATH: path to a trained detection model.
        USEGPU: if true, the GPU will be used for faster computations.
        N_FG_CLASSES: number of foreground classes the model was trained to detect.
        '''
        torch.set_num_threads(1)
        opt.load_path = model_path
        self.faster_rcnn = FasterRCNNVGG16(n_fg_class=n_fg_classes)
        self.trainer = FasterRCNNTrainer(self.faster_rcnn, n_fg_class=n_fg_classes)
        if useGPU:
            self.trainer = self.trainer.cuda()
        state_dict = torch.load(opt.load_path)
        self.trainer.load(state_dict)
        self.transforms = transforms.ToTensor()
        self.useGPU = useGPU

    def predict_image(self, img, topk=1):
        '''
        Detects objects in the provided test image.

        IMG: PIL image matching the input of the trained model.
        TOPK: the number of bounding boxes to return. The most confident
        bounding boxes are returned first.
        RETURNS: (BBOXES, CONFS) where BBOXES is an n x 4 array in which each
        row corresponds to one bounding box. The bounding box coordinates are
        stored in the format [x_min, y_min, x_max, y_max], where x corresponds
        to the width and y to the height. CONFS are the confidence values for
        each bounding box, stored as an n x m array. Each row corresponds to
        the bounding box in the same row of BBOXES and provides the scores for
        the m classes that the model was trained to detect.
        '''
        pred_bboxes, pred_labels, pred_scores = self._run_prediction(img)
        # The model predicts boxes as [y_min, x_min, y_max, x_max]; reorder to
        # [x_min, y_min, x_max, y_max] before returning.
        return pred_bboxes[:topk, [1, 0, 3, 2]], pred_scores[:topk]

    def annotate_image(self, img, topk):
        '''
        Detects objects in the provided test image.

        IMG: PIL image matching the input of the trained model.
        TOPK: the number of bounding boxes to draw. The most confident
        bounding boxes are drawn first.
        RETURNS: IMG: a PIL image with the detected bounding boxes annotated
        as rectangles.
        '''
        pred_bboxes, pred_labels, pred_scores = self._run_prediction(img)
        draw = PIL.ImageDraw.Draw(img)
        colors = [(255, 0, 0), (0, 255, 0)]
        # Only annotate the TOPK most confident boxes, as documented above.
        for bbox, label, score in zip(pred_bboxes[:topk], pred_labels[:topk], pred_scores[:topk]):
            # Boxes come back as [y_min, x_min, y_max, x_max]; reorder to the
            # [x_min, y_min, x_max, y_max] order expected by ImageDraw.
            draw.rectangle(bbox[[1, 0, 3, 2]], outline=colors[label])
            # font = PIL.ImageFont.truetype("sans-serif.ttf", 16)
            # draw.text(bbox[[1, 0]], "Sample Text", colors[label])
        return img

    def _run_prediction(self, img, confidence_threshold=0.7):
        '''
        Preprocesses IMG and runs the detector on it.

        IMG: PIL image.
        CONFIDENCE_THRESHOLD: detections with a score below this value are discarded.
        RETURNS: (BBOXES, LABELS, SCORES) for all detections whose score exceeds
        CONFIDENCE_THRESHOLD. The image is converted to a 1xCxHxW tensor and
        normalized according to data.dataset.caffe_normalize before being passed
        to the model.
        '''
        img = img.convert('RGB')
        img = np.asarray(img, dtype=np.float32)
        if img.ndim == 2:
            # reshape (H, W) -> (1, H, W)
            img = img[np.newaxis]
        else:
            # transpose (H, W, C) -> (C, H, W)
            img = img.transpose((2, 0, 1))
        proc_img = data.dataset.caffe_normalize(img / 255.)
        tensor_img = torch.from_numpy(proc_img).unsqueeze(0)
        if self.useGPU:
            tensor_img = tensor_img.cuda()
        # This preset filters bounding boxes with a score < *confidence_threshold*
        # and has to be set every time before using predict().
        self.faster_rcnn.use_preset('visualize')
        pred_bboxes, pred_labels, pred_scores = self.faster_rcnn.predict(
            tensor_img, [(img.shape[1], img.shape[2])])
        box_filter = np.array(pred_scores[0]) > confidence_threshold
        return pred_bboxes[0][box_filter], pred_labels[0][box_filter], pred_scores[0][box_filter]

# ...class Detector
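

# Sketch of a batch-prediction helper built on the Detector API above. This is an
# illustrative addition, not part of the original module: the function name and
# the skip-on-error behaviour are assumptions, and it expects the same
# PIL-compatible input images as Detector.predict_image.
def predict_images(detector, image_paths, topk=5):
    '''
    Runs DETECTOR.predict_image on every file in IMAGE_PATHS and returns a list
    of (path, bboxes, confs) tuples, skipping files that cannot be opened as images.
    '''
    results = []
    for path in image_paths:
        try:
            image = PIL.Image.open(path)
        except (IOError, OSError):
            # Not a readable image file; skip it.
            continue
        bboxes, confs = detector.predict_image(image, topk)
        results.append((path, bboxes, confs))
    return results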


if __name__ == '__main__':
    det = Detector('checkpoints/fasterrcnn_07122125_0.5273599762268979', True)
    print('Loaded model.')
    image_path = 'misc/demo.jpg'
    test_image = PIL.Image.open(image_path)
    print('Working on image {}'.format(image_path))
    print(det.predict_image(test_image, 5))
    pred_bboxes, pred_scores = det.predict_image(test_image, 1000)
    pred_img = visdom_bbox(np.array(test_image.convert('RGB')).transpose((2, 0, 1)),
                           at.tonumpy(pred_bboxes[:, [1, 0, 3, 2]]),
                           at.tonumpy([1 for _ in pred_bboxes]),
                           at.tonumpy(pred_scores),
                           label_names=['Animal', 'BG'])
    PIL.Image.fromarray((255 * pred_img).transpose((1, 2, 0)).astype(np.uint8)).save('output.jpg')
    det.annotate_image(test_image, 5).save('output-annotate.jpg')
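
    # Sketch (not part of the original demo): print each retained detection to
    # illustrate the [x_min, y_min, x_max, y_max] box format documented in
    # predict_image().
    for box, conf in zip(pred_bboxes, pred_scores):
        print('box [x_min, y_min, x_max, y_max] = {}, confidence = {}'.format(box, conf))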