diff --git a/examples/healthcare/application/Kidney_Disease/README.md b/examples/healthcare/application/Kidney_Disease/README.md index 05a97bc32..0a3979e79 100644 --- a/examples/healthcare/application/Kidney_Disease/README.md +++ b/examples/healthcare/application/Kidney_Disease/README.md @@ -19,25 +19,28 @@ # Singa for Kidney Disease Prediction -## Kidney Disease Prediction Task +## Kidney disease Prediction Task -Kidney disease prediction is an important tool that uses data science and machine learning techniques to predict the likelihood of a patient suffering from Kidney disease. The goal is to judge whether a patient suffers from kidney disease by analyzing multiple data such as a patient’s medical history, physiological indicators, diagnostic information, treatment options, and socioeconomic factors, so as to take appropriate interventions in advance to provide treatment. +Kidney disease prediction is an important tool that uses data science and machine learning techniques to predict the likelihood of a patient suffering from Kidney disease. The core goal of this technology is to judge whether a patient suffers from kidney disease by analyzing multiple data such as a patient’s medical history, physiological indicators, diagnostic information, treatment options, and socioeconomic factors, so as to take appropriate interventions in advance to provide treatment. + +The dataset used in this task is the preprocessed MIMIC-III dataset. The features are data containing 6 visit windows, with 2549 frequent diagnoses, procedures and drugs for each window. Each item in the features is the data for one patient, and these features are one-hot encoded. The labels are corresponding flags to mark whether the patient suffered from kidney disease, where the label equals "1" if the patient had kidney disease, and the label equals "0" if not. -The dataset used in this task is MIMIC-III. 
The features are data containing 6 visit windows, with 2549 frequent diagnoses, procedures and drugs for each window. These features are encoded by one-hot. The labels are corresponding flags to mark whether the patient suffered from kidney disease, where the label equals "1" if the patient had kidney disease, and the label equals "0" if not. ## Structure -* `data` includes the load of mimic-iii data to be utilized. +* `kidney.py` in folder `healthcare/data` includes the loader for the pre-processed kidney data to be utilized. + +* `kidney_net.py` in folder `healthcare/models` includes the construction code of the KidneyNet model to be applied for kidney disease prediction. + +* `train.py` is the training script, which controls the training flow by doing BackPropagation and the SGD update. -* `model` includes the MLP model construction codes by creating - a subclass of `Module` to wrap the neural network operations - of each model. +## Instruction +Before starting to use this model for kidney disease prediction, download the sample dataset for kidney disease prediction: https://github.com/lzjpaul/singa-healthcare/tree/main/data/kidney -* `train_kidney_mlp.py` is the training script, which controls the training flow by - doing BackPropagation and the SGD update. +The provided dataset is from MIMIC-III and has been pre-processed; it contains 100 samples for model testing. 
-## Command +Please download the dataset to a folder(pathToDataset), and then pass the path to run the codes using the following command: ```bash -python train_kidney_mlp.py mlp kidney-disease -dir pathToDataset -``` \ No newline at end of file +python train.py kidneynet -dir pathToDataset +``` diff --git a/examples/healthcare/application/Kidney_Disease/run.sh b/examples/healthcare/application/Kidney_Disease/run.sh index 27de7eae3..d86c83bf4 100644 --- a/examples/healthcare/application/Kidney_Disease/run.sh +++ b/examples/healthcare/application/Kidney_Disease/run.sh @@ -17,4 +17,4 @@ # ### kidney disease dataset -python train_kidney_mlp.py mlp kidney-disease -dir pathToDataset \ No newline at end of file +python train.py kidneynet -dir pathToDataset diff --git a/examples/healthcare/application/Kidney_Disease/train_kidney_mlp.py b/examples/healthcare/application/Kidney_Disease/train.py similarity index 93% rename from examples/healthcare/application/Kidney_Disease/train_kidney_mlp.py rename to examples/healthcare/application/Kidney_Disease/train.py index 474858066..1af8c24ef 100644 --- a/examples/healthcare/application/Kidney_Disease/train_kidney_mlp.py +++ b/examples/healthcare/application/Kidney_Disease/train.py @@ -25,6 +25,10 @@ import time import argparse from PIL import Image +import sys +sys.path.append("../../..") +from healthcare.data import kidney +from healthcare.models import kidney_net np_dtype = {"float16": np.float16, "float32": np.float32} @@ -107,6 +111,7 @@ def run(global_rank, sgd, graph, verbosity, + dir_path, dist_option='plain', spars=None, precision='float32'): @@ -115,9 +120,9 @@ def run(global_rank, dev.SetRandSeed(0) np.random.seed(0) - if data == 'kidney-disease': - from data import load_kidneydata - train_x, train_y, val_x, val_y = load_kidneydata.load() + if data == 'kidney': + + train_x, train_y, val_x, val_y = kidney.load(dir_path) else: print('Wrong Dataset!') sys.exit(0) @@ -130,14 +135,14 @@ def run(global_rank, 
print(num_channels,image_size) - if model == 'mlp': + if model == 'kidneynet': import os, sys, inspect current = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) parent = os.path.dirname(current) sys.path.insert(0, parent) - from mlp import model - model = model.create_model(data_size=data_size, + + model = kidney_net.create_model(data_size=data_size, num_classes=num_classes) else: print('Wrong model!') @@ -256,11 +261,11 @@ def run(global_rank, description='Training using the autograd and graph.') parser.add_argument( 'model', - choices=['cnn', 'resnet', 'xceptionnet', 'mlp', 'alexnet'], - default='cnn') - parser.add_argument('data', - choices=['mnist', 'cifar10', 'cifar100','mimic-iii','kidney-disease'], - default='kidney-disease') + choices=[ 'cardionet', 'diabeticnet', 'drnet', 'hematologicnet', 'kidneynet', 'malarianet', 'tedctnet'], + default='kidneynet') + parser.add_argument('-data', + choices=['mnist', 'cifar10', 'cifar100','kidney'], + default='kidney') parser.add_argument('-p', choices=['float32', 'float16'], default='float32', @@ -302,7 +307,12 @@ def run(global_rank, type=int, help='logging verbosity', dest='verbosity') - + parser.add_argument('-dir', + '--dir-path', + default="/tmp/kidney", + type=str, + help='the directory to store the kidney dataset', + dest='dir_path') args = parser.parse_args() sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype[args.precision]) @@ -316,4 +326,5 @@ def run(global_rank, sgd, args.graph, args.verbosity, + args.dir_path, precision=args.precision) diff --git a/examples/healthcare/data/kidney.py b/examples/healthcare/data/kidney.py new file mode 100644 index 000000000..5956aae4a --- /dev/null +++ b/examples/healthcare/data/kidney.py @@ -0,0 +1,41 @@ +import numpy as np + +import pickle +import sys +import os + +def load_dataset(dir_path="/tmp/kidney"): + dir_path = check_dataset_exist(dir_path=dir_path) + feature_path = os.path.join(dir_path, "kidney_features.pkl") + 
label_path = os.path.join(dir_path, "kidney_labels.pkl") + with open(feature_path,'rb') as f: + features = pickle.load(f) + with open(label_path,'rb') as f: + labels = pickle.load(f) + + + split_train_point = int(len(features) * 8/ 10) + train_x, train_y = features[:split_train_point], labels[:split_train_point] + val_x, val_y = features[split_train_point:], labels[split_train_point:] + + return train_x,train_y,val_x,val_y + +def check_dataset_exist(dir_path): + if not os.path.exists(dir_path): + print( + 'Please download the kidney dataset first' + ) + sys.exit(0) + return dir_path + + +def load(dir_path): + train_x,train_y,val_x,val_y = load_dataset(dir_path) + + train_x = train_x.astype(np.float32) + val_x = val_x.astype(np.float32) + train_y = train_y.astype(np.int32) + val_y = val_y.astype(np.int32) + + return train_x,train_y,val_x,val_y + diff --git a/examples/healthcare/data/kidneydata.py b/examples/healthcare/data/kidneydata.py deleted file mode 100644 index 4144acf70..000000000 --- a/examples/healthcare/data/kidneydata.py +++ /dev/null @@ -1,57 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import numpy as np -import torch -from tqdm import tqdm -import pickle - - -def load_dataset(): - with open('/home/kidney_disease/kidney_features.pkl','rb') as f: # change the path to load dataset - features = pickle.load(f) - with open('/home/kidney_disease/kidney_labels.pkl','rb') as f: # change the path to load dataset - labels = pickle.load(f) - - - split_train_point = int(len(features) * 8/ 10) - train_x, train_y = features[:split_train_point], labels[:split_train_point] - val_x, val_y = features[split_train_point:], labels[split_train_point:] - - return train_x,train_y,val_x,val_y - -def process_label(data): - new_labels = [] - for i in tqdm(data, total=len(data)): - label = torch.squeeze(i, dim=0) - new_labels.append(label) - return new_labels - - - -def load(): - train_x,train_y,val_x,val_y = load_dataset() - - - train_x = train_x.astype(np.float32) - val_x = val_x.astype(np.float32) - train_y = train_y.astype(np.int32) - val_y = val_y.astype(np.int32) - - return train_x,train_y,val_x,val_y \ No newline at end of file diff --git a/examples/healthcare/models/kidney_net.py b/examples/healthcare/models/kidney_net.py index c4c49764b..67d472412 100644 --- a/examples/healthcare/models/kidney_net.py +++ b/examples/healthcare/models/kidney_net.py @@ -30,10 +30,10 @@ singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} -class MLP(model.Model): +class KidneyNet(model.Model): def __init__(self, data_size=10, perceptron_size=100, num_classes=10): - super(MLP, self).__init__() + super(KidneyNet, self).__init__() self.num_classes = num_classes self.dimension = 2 @@ -73,20 +73,13 @@ def set_optimizer(self, optimizer): def create_model(pretrained=False, **kwargs): - """Constructs a CNN model. - Args: - pretrained (bool): If True, returns a pre-trained model. - - Returns: - The created CNN model. 
- """ - model = MLP(**kwargs) + model = KidneyNet(**kwargs) return model -__all__ = ['MLP', 'create_model'] +__all__ = ['KidneyNet', 'create_model'] if __name__ == "__main__": np.random.seed(0) @@ -131,7 +124,7 @@ def create_model(pretrained=False, **kwargs): sgd = opt.SGD(0.1, 0.9, 1e-5, dtype=singa_dtype[args.precision]) tx = tensor.Tensor((400, 2), dev, precision) ty = tensor.Tensor((400,), dev, tensor.int32) - model = MLP(data_size=2, perceptron_size=3, num_classes=2) + model = KidneyNet(data_size=2, perceptron_size=3, num_classes=2) # attach model to graph model.set_optimizer(sgd) @@ -145,5 +138,3 @@ def create_model(pretrained=False, **kwargs): if i % 100 == 0: print("training loss = ", tensor.to_numpy(loss)[0]) - -