csseries · kickermeister · Aug 27, 2021 · Aug 27, 2021 · Aug 27, 2021 · Aug 27, 2021
diff --git a/Makefile b/Makefile
@@ -72,3 +72,31 @@ pypi:
 
 run_locally:
 	@uvicorn api.fast:app --reload --host 0.0.0.0
+
+# ----------------------------------
+#      GCLOUD TRAINING
+# ----------------------------------
+
+# Bucket, and folder inside the bucket, passed to the job as --job-dir.
+# Values are unquoted: make keeps quotes as literal characters of the value.
+BUCKET_NAME=taxifare_bucket_fast-drake-318911
+BUCKET_TRAINING_FOLDER=trainings
+REGION=europe-west1
+PYTHON_VERSION=3.7
+RUNTIME_VERSION=1.15
+
+# The job runs the module ${PACKAGE_NAME}.${FILENAME} from the package in ${PACKAGE_NAME}/
+PACKAGE_NAME=cookit
+FILENAME=trainer
+
+# ':=' expands the timestamp exactly once per make invocation, so every
+# reference to JOB_NAME within one run sees the same job name.
+JOB_NAME:=cookit_training_$(shell date +'%Y%m%d_%H%M%S')
+
+# Not a file target: always run, even if a file with this name exists.
+.PHONY: gcp_submit_training
+gcp_submit_training:
+	gcloud ai-platform jobs submit training ${JOB_NAME} \
+		--job-dir gs://${BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \
+		--package-path ${PACKAGE_NAME} \
+		--module-name ${PACKAGE_NAME}.${FILENAME} \
+		--python-version=${PYTHON_VERSION} \
+		--runtime-version=${RUNTIME_VERSION} \
+		--region ${REGION} \
+		--stream-logs
diff --git a/api/fast.py b/api/fast.py
@@ -33,7 +33,7 @@ def index():
 # response = requests.post('http://localhost:8000/predict', files=files, data=payload)
 # --> make sure to close the opened file again or del the files dictionary in this example case!
 @app.post("/predict")
-async def predict(image: UploadFile = File(...), threshold: float = Form(0.5)):
+async def predict(image: UploadFile = File(...), threshold: float = Form(0.1)):
     """ Executes prediction based on sent file and threshold
         image: a file in multipart/form-data format
-        threshold: a float value (default: 0.25) to filter predicted classes
+        threshold: a float value (default: 0.1) to filter predicted classes

diff --git a/class_labels.pkl b/class_labels.pkl
@@ -0,0 +1,128 @@
+(dp0
+I1
+Vlabel
+p1
+sI2
+VPumpkin
+p2
+sI3
+VIce cream
+p3
+sI4
+VSalad
+p4
+sI5
+VBread
+p5
+sI6
+VCoconut
+p6
+sI7
+VGrape
+p7
+sI8
+VMushroom
+p8
+sI9
+VHoneycomb
+p9
+sI10
+VFish
+p10
+sI11
+VOyster
+p11
+sI12
+VPomegranate
+p12
+sI13
+VRadish
+p13
+sI14
+VWatermelon
+p14
+sI15
+VPasta
+p15
+sI16
+VCabbage
+p16
+sI17
+VStrawberry
+p17
+sI18
+VApple
+p18
+sI19
+VOrange
+p19
+sI20
+VPotato
+p20
+sI21
+VBanana
+p21
+sI22
+VPear
+p22
+sI23
+VShellfish
+p23
+sI24
+VTomato
+p24
+sI25
+VCheese
+p25
+sI26
+VCarrot
+p26
+sI27
+VShrimp
+p27
+sI28
+VLemon
+p28
+sI29
+VArtichoke
+p29
+sI30
+VBroccoli
+p30
+sI31
+VBell pepper
+p31
+sI32
+VPineapple
+p32
+sI33
+VLobster
+p33
+sI34
+VMilk
+p34
+sI35
+VMango
+p35
+sI36
+VGrapefruit
+p36
+sI37
+VCantaloupe
+p37
+sI38
+VPeach
+p38
+sI39
+VCream
+p39
+sI40
+VZucchini
+p40
+sI41
+VCucumber
+p41
+sI42
+VWinter melon
+p42
+s.
diff --git a/cookit/data.py b/cookit/data.py
@@ -1,8 +1,142 @@
+import csv
+import json
+import math
+import pandas as pd
+from google.cloud import storage
+from cookit.utils import OIv4_INGREDIENTS_ONLY, TEST_FOOD_CLASSES, OIv4_MIN_SET
+from cookit.params import BUCKET_NAME
 
 
-def get_data():
-    pass
+
+def download_file_from_bucket(path='labels.json'):
+    """ Downloads a blob from the bucket BUCKET_NAME to the local file system
+        path: blob name inside the bucket, also used as the local target path [String]
+    """
+    remote_blob = storage.Client().bucket(BUCKET_NAME).blob(path)
+    remote_blob.download_to_filename(path)
+
+
+def upload_file_to_bucket(path):
+    """ Uploads a local file to the bucket BUCKET_NAME
+        path: local file path, also used as the blob name inside the bucket [String]
+    """
+    remote_blob = storage.Client().bucket(BUCKET_NAME).blob(path)
+    remote_blob.upload_from_filename(path)
+
+
+def get_oi_dataset_df(path='oi_food.csv', nrows=1000):
+    """ Reads the training data (or its first rows) from the google cloud bucket
+        path: name of the csv file inside the bucket BUCKET_NAME [String]
+        nrows: number of rows to read, forwarded to pandas.read_csv [int]
+
+        Returns the rows as a pandas DataFrame
+    """
+    csv_url = f"gs://{BUCKET_NAME}/{path}"
+    return pd.read_csv(csv_url, nrows=nrows)
+
+
+# Should we perhaps keep non-food-related labels in the test set?
+def convert_oi_metadata(labelfile_path, baseurl=f'gs://{BUCKET_NAME}/data', csv_path='tf_training.csv',
+                        test_split=0.2, val_split=0.1, train_classes=OIv4_INGREDIENTS_ONLY):
+    """ Converts a json file in format fiftyone.types.FiftyOneImageDetectionDataset
+        to a format as it is expected by the tflite_model_maker.object_detector
+
+        One csv row is written per bounding box whose class is in train_classes.
+        Images are assigned to a split by their position in the json file: the
+        first val_split share becomes VALIDATION, the following test_split share
+        becomes TEST and all remaining images become TRAINING.
+
+        labelfile_path: path to json file [String]
+        baseurl: url to files referenced in the json file [String]
+        csv_path: path to output csv file [String]
+        test_split: share of images for test [float]
+        val_split: share of images for validation [float]
+        train_classes: class names to keep; boxes of other classes are dropped
+
+        Returns dict containing basic dataset information
+    """
+    with open(labelfile_path) as json_file:
+        oi_data = json.load(json_file)
+
+    classes = oi_data['classes']
+    wanted_classes = set(train_classes)  # built once for O(1) membership tests
+    food_classes = set()
+
+    total_images = len(oi_data['labels'])
+    val_count = math.floor(total_images * val_split)
+    test_count = math.floor(total_images * test_split)
+
+    # Number of written bounding boxes per split
+    split_bboxes = {"VALIDATION": 0, "TEST": 0, "TRAINING": 0}
+
+    # newline='' leaves line endings to the csv module (avoids '\r\r\n' on Windows)
+    with open(csv_path, "w", newline='') as csv_file:
+        writer = csv.writer(csv_file, delimiter=',', dialect='excel')
+        # The eight coordinate columns hold the four box vertices; only the
+        # first (x_min, y_min) and third (x_max, y_max) vertex are filled below.
+        writer.writerow(['set', 'path', 'label', 'x_min', 'y_min', 'x_max', 'y_min',
+                         'x_max', 'y_max', 'x_min', 'y_max'])
+        for image_index, (uuid, labels) in enumerate(oi_data['labels'].items(), start=1):
+            # The split depends on the image only, so decide it once per image.
+            # Bucket sizes are exactly val_count and test_count images.
+            if image_index <= val_count:
+                split = "VALIDATION"
+            elif image_index <= val_count + test_count:
+                split = "TEST"
+            else:
+                split = "TRAINING"
+
+            for label_items in labels:
+                label = classes[label_items['label']]
+                if label not in wanted_classes:
+                    continue
+                food_classes.add(label)
+                split_bboxes[split] += 1
+                bb = label_items['bounding_box']
+                # bb[0], bb[1] are written as x_min, y_min; adding bb[2], bb[3] gives
+                # x_max, y_max. min() should not be necessary, but guards against
+                # values slightly above 1.0.
+                writer.writerow([split, f"{baseurl}/{uuid}.jpg", label, bb[0], bb[1], "", "",
+                                 min(bb[0] + bb[2], 1.0), min(bb[1] + bb[3], 1.0), "", ""])
+
+    unique_food_classes = sorted(food_classes)
+    train_bboxes = split_bboxes["TRAINING"]
+    val_bboxes = split_bboxes["VALIDATION"]
+    test_bboxes = split_bboxes["TEST"]
+    bbox_count = train_bboxes + val_bboxes + test_bboxes
+
+    print(f"Found {len(unique_food_classes)} food-related classes of total {len(classes)} classes.")
+    print(f"Found {bbox_count} bounding boxes. Train/Val/Test split: {train_bboxes} / {val_bboxes} / {test_bboxes}")
+    print(f"Total number of images in dataset: {total_images}")
+
+    return {
+        'food_classes': unique_food_classes,
+        'bbox_count': bbox_count,
+        'train_bbox_count': train_bboxes,
+        'val_bbox_count': val_bboxes,
+        'test_bbox_count': test_bboxes,
+        'images_count': total_images
+    }
+
+
+def get_random_slice(csv_path=f'gs://{BUCKET_NAME}/oi_food.csv',
+                     out_csv_name='oi_food_sample.csv',
+                     size=1000,
+                     return_df=True):
+    """ Draws a random sample of rows from a csv file and uploads it to the bucket
+        csv_path: path or url of the csv file to sample from [String]
+        out_csv_name: local file name of the sample, also used as blob name [String]
+        size: number of rows to draw [int]
+        return_df: whether to return the sampled rows [bool]
+
+        Returns the sample as pandas DataFrame if return_df is set, otherwise None
+    """
+    picked_rows = pd.read_csv(csv_path).sample(size)
+    picked_rows.to_csv(out_csv_name, index=False)
+    upload_file_to_bucket(out_csv_name)
+    print(f"Uploaded CSV file containing {size} samples to gs://{BUCKET_NAME}/{out_csv_name}")
+    return picked_rows if return_df else None
+
+def get_random_slice_balanced(csv_file='oi_food_minimal.csv',
+                              out_csv_file='oi_food_minimal_balanced.csv',
+                              label_size=50,
+                              gcloud_upload=False):
+    """ Builds a dataset with the same number of rows for every label
+
+        For each label, rows are drawn with replacement from each split:
+        70% of label_size from TRAINING, 20% from VALIDATION and 10% from TEST.
+        A label/split combination without any rows is skipped instead of
+        raising an error.
+
+        csv_file: name of the source csv file inside the bucket [String]
+        out_csv_file: local path of the balanced csv file [String]
+        label_size: number of rows per label across all splits [int]
+        gcloud_upload: whether to upload out_csv_file to the bucket [bool]
+
+        Returns the balanced rows as pandas DataFrame
+    """
+    df = get_oi_dataset_df(csv_file, nrows=100_000)
+    split_shares = (('TRAINING', 0.7), ('VALIDATION', 0.2), ('TEST', 0.1))
+
+    samples = []
+    for label in df.label.unique():
+        for split_name, share in split_shares:
+            candidates = df[(df.set == split_name) & (df.label == label)]
+            rows_wanted = int(label_size * share)
+            # Sampling from an empty frame raises, even with replace=True
+            if candidates.empty or rows_wanted == 0:
+                continue
+            samples.append(candidates.sample(rows_wanted, replace=True))
+
+    # Concatenate once instead of growing the frame inside the loop;
+    # df.iloc[0:0] keeps the column layout when nothing was sampled.
+    out_df = pd.concat(samples) if samples else df.iloc[0:0]
+    out_df.to_csv(out_csv_file, index=False)
+    if gcloud_upload:
+        upload_file_to_bucket(out_csv_file)
+    return out_df
+
+
+
+
+def create_dataset(json_path='labels.json', csv_path='oi_food.csv', classes=OIv4_MIN_SET):
+    """ Downloads the label file, converts it to a training csv and uploads the csv
+        json_path: blob name of the label json file, also used as local path [String]
+        csv_path: local path of the generated csv, also used as blob name [String]
+        classes: class names to keep in the dataset
+
+        Returns the dataset information dict produced by convert_oi_metadata
+    """
+    download_file_from_bucket(json_path)
+    ds = convert_oi_metadata(json_path, csv_path=csv_path, train_classes=classes)
+    upload_file_to_bucket(csv_path)
+    print(f"Uploaded new dataset to gs://{BUCKET_NAME}/{csv_path}")
+    # Hand the summary to the caller instead of dropping it
+    return ds
 
 
 if __name__ == '__main__':
-    print("Nothing to do here...")
+    # Rebuild the dataset csv and upload it, then read a few rows back from the bucket
+    create_dataset()
+    # nrows=5 reads the first five rows of the csv; they are not randomly drawn
+    print("Print the first 5 rows")
+    print(get_oi_dataset_df('oi_food.csv', 5))
+    #sample = get_random_slice('oi_food.csv', 'labels_slice.csv', 1000)
diff --git a/cookit/params.py b/cookit/params.py
@@ -0,0 +1,2 @@
+# Project identifier; it also appears as the suffix of BUCKET_NAME
+PROJECT_NAME = 'fast-drake-318911'
+# Storage bucket used by cookit/data.py; the Makefile defines the same value as BUCKET_NAME
+BUCKET_NAME = 'taxifare_bucket_fast-drake-318911'
diff --git a/cookit/predict.py b/cookit/predict.py
@@ -2,7 +2,6 @@
 import tensorflow as tf
 import tensorflow_hub as hub
 from PIL import Image, ImageOps
-from cookit.data import get_data
 from cookit.utils import OIv4_FOOD_CLASSES
 
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

diff --git a/cookit/predict_lite.py b/cookit/predict_lite.py
@@ -1,8 +1,7 @@
 import tensorflow as tf
 import numpy as np
 import os
-
-from cookit.data import get_data
+import pickle
 from PIL import Image
 from cookit.utils import OIv4_FOOD_CLASSES
 
@@ -15,15 +14,12 @@ def __init__(self):
         """
         A basic call for predictions.
         """
-        self.model = self._get_model()
+        self.model = self._get_model('oi_food_balanced_400_lite4_ll.tflite')
+
+        with open('class_labels.pkl', 'rb') as f:
+            self.classes_map = pickle.load(f)
+        #self.classes = ['Baked Goods', 'Salad', 'Cheese', 'Seafood', 'Tomato']
 
-        # NOTE: The order of this list hardcoded here, and needs to be changed when re-training the model!
-        # When exporting the model in tflite format, the model_spec is lost, so we cannot do it like that:
-        # classes = ['???'] * model.model_spec.config.num_classes
-        # label_map = model.model_spec.config.label_map
-        # for label_id, label_name in label_map.as_dict().items():
-        #   classes[label_id-1] = label_name
-        self.classes = ['Baked Goods', 'Salad', 'Cheese', 'Seafood', 'Tomato']
 
     def _get_model(self, model_path='model.tflite'):
         """ Load the model from a local path """
@@ -59,10 +55,10 @@ def detect_objects(self, image):
         self.model.invoke()
 
         # Get all outputs from the model
-        boxes = self.get_output_tensor(0)
-        classes = self.get_output_tensor(1)
-        scores = self.get_output_tensor(2)
-        count = int(self.get_output_tensor(3))
+        boxes = self.get_output_tensor(1)
+        classes = self.get_output_tensor(3)
+        scores = self.get_output_tensor(0)
+        count = int(self.get_output_tensor(2))
 
         results = []
         for i in range(count):
@@ -86,7 +82,7 @@ def run_detection(self, image_path, threshold=0.5):
         results = self.detect_objects(preprocessed_image)
         return results
 
-    def predict(self, image_path, threshold=0.5):
+    def predict(self, image_path, threshold=0.1):
         detection_result_image = self.run_detection(image_path, threshold)
         print(f"Received file for prediction: {image_path}")
 
@@ -96,7 +92,7 @@ def predict(self, image_path, threshold=0.5):
 
         for result in detection_result_image:
             if result['score'] > threshold:
-                res_class = self.classes[result['class_id']]
+                res_class = self.classes_map[result['class_id']]
                 # do not return redundant ingredients
                 if res_class not in ingredients:
                     ingredients.append(res_class)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		PROJECT_NAME = 'fast-drake-318911'
		BUCKET_NAME = 'taxifare_bucket_fast-drake-318911'