diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3592172 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +# file: ~/.gitignore_global +.DS_Store +.idea +__pycache__ +model_processing/__pycache__ \ No newline at end of file diff --git a/README.md b/README.md index 614ca01..2579cf9 100644 --- a/README.md +++ b/README.md @@ -1 +1,3 @@ -# ml-diplom \ No newline at end of file +такс, тут выяснилось (хорошо, что заранее), что у одной картинки может быть несколько ответов +поэтому давайте для каждой тренировочной картинки сделаем набор из лэйблов +затем оставим только те лэйблы, у которых лэйбл-карта не 0 \ No newline at end of file diff --git a/heh2 b/heh2 deleted file mode 100644 index e69de29..0000000 diff --git a/image_loader.py b/image_loader.py new file mode 100644 index 0000000..dfd0c53 --- /dev/null +++ b/image_loader.py @@ -0,0 +1,151 @@ +from sklearn.utils import shuffle +import torch.utils.data.dataset +import os +from torchvision import transforms +from PIL import Image +import torch +from multiprocessing.pool import ThreadPool +from properties import classes, field_id, field_train, image_size, num_workers + +name_delimiter = '_.' 
def images_filter(dir_path: str, suffix: str, id_prefix=None):
    """
    Collect every file below ``dir_path`` whose name ends with ``suffix``.

    The id of a file is the part of its name that follows the dataset prefix,
    with the suffix removed (``ISIC_0000001.jpg`` -> ``0000001``).

    :param dir_path: directory to scan; sub-directories are scanned too
    :param suffix: file-name ending to keep, e.g. ``'.jpg'`` or ``'.png'``
    :param id_prefix: marker that precedes the id inside the file name;
        defaults to the module-level ``prefix`` constant
    :return: list of ``(id, path)`` tuples
    """
    if id_prefix is None:
        id_prefix = prefix

    found = []
    for root_dir, _, file_names in os.walk(dir_path):
        for file_name in file_names:
            if not file_name.endswith(suffix):
                continue
            name_parts = file_name.split(suffix)[0].split(id_prefix)
            if len(name_parts) < 2:
                # The name carries no dataset prefix, so no id can be derived
                # from it; skip it instead of failing with IndexError.
                continue
            # os.walk descends into sub-directories, therefore the path has to
            # be built from the directory currently visited, not from dir_path.
            found.append((name_parts[1], os.path.join(root_dir, file_name)))
    return found
def remove_empty_mask(cell: dict, class_names=None):
    """
    Drop the attribute masks of ``cell`` that contain no annotation.

    A mask is treated as empty when the sum of its values is not greater
    than 1. The original comparison used the same threshold but removed the
    non-empty masks, which is the opposite of what the function name says.

    :param cell: dict that maps attribute names to mask tensors (any object
        with a ``sum()`` method works); it is modified in place
    :param class_names: attribute names to inspect; defaults to the
        module-level ``classes`` list
    :return: the same ``cell`` object, without its empty masks
    """
    if class_names is None:
        class_names = classes

    for name in class_names:
        if name not in cell:
            # Already removed (e.g. the function ran twice on this cell).
            continue
        if not float(cell[name].sum()) > 1:
            cell.pop(name)
    return cell
class TrainModel:
    """
    Runs the training loop and delegates per-epoch evaluation to a validator.

    The validator must provide ``flush()``,
    ``validate_model_single_epoch(model, loader)`` and the attributes
    ``best_model_weights`` / ``best_accuracy``.
    """

    def __init__(self, validator: "ValidateModel", device='cpu'):
        self.device = device
        self.validator = validator

    def train_model(self, m: nn.Module, train_loader: torch.utils.data.DataLoader,
                    validate_loader: torch.utils.data.DataLoader,
                    criterion: nn.Module,
                    optimizer: Optimizer, epochs: int) -> nn.Module:
        """
        Train ``m`` for ``epochs`` epochs, validating after each one.

        :param m: model to train; it is updated in place
        :param train_loader: iterable of ``(inputs, labels)`` training batches
        :param validate_loader: iterable passed to the validator every epoch
        :param criterion: loss called as ``criterion(model_output, labels)``
        :param optimizer: optimizer that holds the parameters of ``m``
        :param epochs: number of passes over ``train_loader``
        :return: ``m`` carrying the best weights recorded by the validator
        """
        self.validator.flush()

        for epoch in range(epochs):
            print("Epoch: {}/{}".format(epoch + 1, epochs))
            m = self._train_model_single_epoch(m, train_loader, criterion, optimizer)
            acc = self.validator.validate_model_single_epoch(m, validate_loader)
            print("acc: {}".format(acc))
            print("-" * 20)

        # load_state_dict() returns a report about matched keys, not the
        # model, so its result must not be handed back to the caller.
        best_weights = self.validator.best_model_weights
        if best_weights is not None:
            # None means that no snapshot was stored (for example zero
            # epochs); keep the current weights in that case.
            m.load_state_dict(best_weights)
        print("best acc: {}".format(self.validator.best_accuracy))
        return m

    def _train_model_single_epoch(self, m: nn.Module, data_loader: torch.utils.data.DataLoader,
                                  criterion: nn.Module, optimizer: Optimizer) -> nn.Module:
        """Run one optimisation pass over ``data_loader`` and return ``m``."""
        m.train()
        for inputs, labels in data_loader:
            inputs = inputs.to(self.device)
            # Convert and move in a single step; going through
            # torch.FloatTensor would force a CPU round trip first.
            labels = labels.to(self.device, dtype=torch.float)
            optimizer.zero_grad()

            model_result = m(inputs)
            loss = criterion(model_result, labels)
            loss.backward()
            optimizer.step()
        return m
= [[] for _ in range(self.labels_number)] + model_answer = [[] for _ in range(self.labels_number)] + m.eval() + for inputs, labels in data_loader: + inputs = inputs.to(self.device) + labels = labels.to(self.device) + + model_result = m(inputs) + # all vectors size 5 + for i, t in enumerate(model_result): + # all classes + for j, v in enumerate(t): + dst_0 = abs(v.item()) + dst_1 = abs(1 - v.item()) + trust = labels[i][j].item() + ma = 1 if dst_1 < dst_0 else 0 + trust_answer[j].append(trust) + model_answer[j].append(ma) + + res = sum([f1_score(t, m) for t, m in zip(trust_answer, model_answer)]) / self.labels_number + if res > self.best_accuracy: + self.best_accuracy = res + self.best_model_weights = copy.deepcopy(m.state_dict()) + self.iterate_accuracy.append(res) + return res + + def flush(self): + self.iterate_accuracy = [] + self.best_accuracy = 0.0 + self.best_model_weights = None diff --git a/notebooks/classifier.ipynb b/notebooks/classifier.ipynb new file mode 100644 index 0000000..c4369b9 --- /dev/null +++ b/notebooks/classifier.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "name": "stderr", + "text": [ + "/home/nikita/anaconda3/envs/ml-diplom/lib/python3.7/site-packages/torchvision/transforms/transforms.py:210: UserWarning: The use of the transforms.Scale transform is deprecated, please use transforms.Resize instead.\n warnings.warn(\"The use of the transforms.Scale transform is deprecated, \" +\n" + ], + "output_type": "stream" + }, + { + "name": "stdout", + "text": [ + "prepared data is exists\n" + ], + "output_type": "stream" + } + ], + "source": [ + "from torchvision import models\n", + "import torch.utils.data.dataset\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optimize\n", + "import image_loader as il\n", + "import properties as pr\n", + "from torch.multiprocessing import Pool\n", + "import os \n", + "\n", + "device = 'cuda' if torch.cuda.is_available() 
else \"cpu\"\n", + "\n", + "if device == 'cuda':\n", + " torch.multiprocessing.set_start_method('spawn', force=True)\n", + " torch.set_default_tensor_type('torch.cuda.FloatTensor')\n", + "\n", + "if os.path.isdir(pr.tensor_train_path) and os.path.isdir(pr.tensor_validate_path):\n", + " print(\"prepared data is exists\")\n", + " train_tensor = il.load_tensors(pr.tensor_train_path)\n", + " validate_tensor = il.load_tensors(pr.tensor_validate_path)\n", + "else:\n", + " print(\"prepared data doesn't exists\")\n", + " r = il.map_inputs_labels(pr.data_inputs_path, pr.data_labels_path)\n", + " train, validate = il.split_set(r, pr.train_size, pr.test_size)\n", + " il.prepare_data(train, validate)\n", + " for i in train:\n", + " il.save_data(pr.tensor_train_path, i)\n", + " for j in validate:\n", + " il.save_data(pr.tensor_validate_path, j)\n", + " \n", + " train_tensor = il.load_tensors(pr.tensor_train_path)\n", + " validate_tensor = il.load_tensors(pr.tensor_validate_path)\n", + "\"\"\"\n", + "train_tensor = list(map(il.apply_resize, train_tensor))\n", + "validate_tensor = list(map(il.apply_resize, validate_tensor))\n", + "\n", + "for i in train_tensor:\n", + " il.save_data(pr.tensor_train_path, i)\n", + "for j in validate_tensor:\n", + " il.save_data(pr.tensor_validate_path, j)\n", + "\"\"\"\n", + "\n", + "train_tensor = list(map(il.remove_empty_mask, train_tensor))\n", + "validate_tensor = list(map(il.remove_empty_mask, validate_tensor))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "def get_model_parameters_to_update(m:models.AlexNet) -> list:\n", + " params = []\n", + " for name, param in m.named_parameters():\n", + " # maybe here better check require gradient back propagation\n", + " # and then update\n", + " # print(\"name={}\".format(name))\n", + " params.append(param)\n", + " return params\n", + "\n", + "def 
set_model_last_layer(m:models.AlexNet) -> models.AlexNet:\n", + " num_features = m.classifier[6].in_features\n", + " m.classifier[6] = nn.Linear(num_features, pr.labels_number) # here 2 for each label \n", + " return m" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [], + "source": [ + "### EXECUTE ONLY ONCE!!!!!!!!!!!\n", + "model = models.alexnet(pretrained=True, progress=False)\n", + "model = set_model_last_layer(model)\n", + "\n", + "parameters = get_model_parameters_to_update(model)\n", + "data_train = torch.utils.data.DataLoader(il.CustomDataset(train_tensor), batch_size=4, shuffle=True, num_workers=4)\n", + "data_validate = torch.utils.data.DataLoader(il.CustomDataset(validate_tensor), batch_size=4, shuffle=True, num_workers=4)\n", + "crit = nn.SmoothL1Loss()\n", + "optim = optimize.SGD(parameters, lr=0.001, momentum=0.9)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% \n", + "is_executing": false + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [], + "source": [ + "from model_processing.train_model import TrainModel\n", + "\n", + "from model_processing.validate_model import ValidateModel" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "name": "stdout", + "text": [ + "Epoch: 1/10\n", + "acc: 0.851646648908066\n--------------------\nEpoch: 2/10\n", + "acc: 0.8464873758965359\n--------------------\nEpoch: 3/10\n", + "acc: 0.8395762580569832\n--------------------\nEpoch: 4/10\n", + "acc: 0.7985506170313421\n--------------------\nEpoch: 5/10\n", + "acc: 0.7958712400146775\n--------------------\nEpoch: 6/10\n", + "acc: 0.8139802083403304\n--------------------\nEpoch: 7/10\n", + "acc: 0.8478839503646756\n--------------------\nEpoch: 8/10\n", + 
"acc: 0.7804984863489599\n--------------------\nEpoch: 9/10\n", + "acc: 0.808924053468127\n--------------------\nEpoch: 10/10\n", + "acc: 0.7809302213536918\n--------------------\nbest acc: 0.851646648908066\n" + ], + "output_type": "stream" + }, + { + "data": { + "text/plain": "" + }, + "metadata": {}, + "output_type": "execute_result", + "execution_count": 5 + } + ], + "source": [ + "validate_model = ValidateModel(pr.labels_number, device)\n", + "train_model = TrainModel(validate_model, device)\n", + "\n", + "train_model.train_model(model, data_train,data_validate, crit, optim, 10)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/notebooks/init.ipynb b/notebooks/init.ipynb deleted file mode 100644 index a9b2291..0000000 --- a/notebooks/init.ipynb +++ /dev/null @@ -1,46 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - }, - 
def image_data(dir_path: str, batch_size: int, input_size: int = 224, num_workers: int = 2):
    """
    Build data loaders for the ``train`` and ``val`` sub-folders of ``dir_path``.

    The training images are augmented (random resized crop and horizontal
    flip); the validation images are only resized and centre-cropped. Both
    are converted to tensors and normalised with the same mean/std values.

    :param dir_path: directory that contains the ``train`` and ``val`` folders
    :param batch_size: batch size used by both loaders
    :param input_size: side length of the produced images (previously the
        fixed value 224)
    :param num_workers: worker count of each loader (previously the fixed
        value 2)
    :return: tuple ``(sizes, loaders)``; both are dicts keyed by ``'train'``
        and ``'val'`` that hold the dataset length and the ``DataLoader``
    """
    phases = ['train', 'val']
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # Data augmentation and normalization for training,
    # just normalization for validation.
    composites = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]),
        'val': transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize,
        ]),
    }

    image_data_sets = {
        phase: datasets.ImageFolder(os.path.join(dir_path, phase), composites[phase])
        for phase in phases
    }
    sizes = {phase: len(image_data_sets[phase]) for phase in phases}
    loaders = {
        phase: torch.utils.data.DataLoader(image_data_sets[phase], batch_size=batch_size,
                                           shuffle=True, num_workers=num_workers)
        for phase in phases
    }
    return sizes, loaders