# main.py — flagx tutorial workflows (692 lines, 554 loc, 28.7 KB)
def workflow_pipeline_unsupervised():
    """Run the fully automated unsupervised workflow via GatingPipeline.

    Trains a SOM on unlabeled training data, annotates the test data with
    SOM and UMAP embeddings, exports one concatenated FCS file, and saves
    the fitted pipeline to disk.
    """
    import os
    from flagx import GatingPipeline

    out_dir = './results/workflow_pipeline_unsupervised'
    os.makedirs(out_dir, exist_ok=True)

    # Channels used for training: forward/side scatter plus 10 fluorescence channels.
    channel_list = [
        'FS INT', 'SS INT', 'FL1 INT_CD14-FITC', 'FL2 INT_CD19-PE', 'FL3 INT_CD13-ECD', 'FL4 INT_CD33-PC5.5',
        'FL5 INT_CD34-PC7', 'FL6 INT_CD117-APC', 'FL7 INT_CD7-APC700', 'FL8 INT_CD16-APC750', 'FL9 INT_HLA-PB',
        'FL10 INT_CD45-KO',
    ]
    # Per-channel cutoffs fed to the log10 preprocessing transform.
    channel_cutoffs = {
        'FS INT': 1000, 'SS INT': 500,
        'FL1 INT_CD14-FITC': 300, 'FL2 INT_CD19-PE': 300, 'FL3 INT_CD13-ECD': 300, 'FL4 INT_CD33-PC5.5': 300,
        'FL5 INT_CD34-PC7': 400, 'FL6 INT_CD117-APC': 400, 'FL7 INT_CD7-APC700': 400, 'FL8 INT_CD16-APC750': 400,
        'FL9 INT_HLA-PB': 100, 'FL10 INT_CD45-KO': 100,
    }
    preprocessing_cfg = {'flavour': 'log10_w_custom_cutoffs', 'flavour_kwargs': {'cutoffs': channel_cutoffs}}
    # SOM hyperparameters: deliberately small grid / few epochs for a quick example run.
    som_cfg = {
        'som_topology': 'planar',
        'som_grid_type': 'rectangular',
        'som_dimensions': (6, 6),
        'neighborhood': 'gaussian',
        'gaussian_neighborhood_sigma': 0.1,
        'initialization': 'pca',
        'n_epochs': 10,
        'verbosity': 3,
    }

    # Set up the pipeline; label_key=None means training is unsupervised.
    pipeline = GatingPipeline(
        train_data_file_path='./data/training',
        train_data_file_names=None,
        train_data_file_type=None,
        save_path='./results/workflow_pipeline_unsupervised/gp_output',
        channels=channel_list,
        label_key=None,  # No labels available
        compensate=False,
        channel_names_alignment_kwargs=None,
        relabel_data_kwargs=None,
        preprocessing_kwargs=preprocessing_cfg,
        downsampling_kwargs={'target_num_events': 100},
        gating_method='som',
        gating_method_kwargs=som_cfg,
    )
    # Fit the underlying ML model (the SOM).
    pipeline.train()
    # Annotate the test data and export everything into a single FCS file.
    pipeline.inference(
        data_file_path='./data/testing',
        data_file_names=None,
        sample_wise=False,  # Concatenate all samples into one output file
        gate=False,  # No gating possible: the SOM was trained on unlabeled data
        dim_red_methods=('som', 'umap'),
        dim_red_method_kwargs=(None, {'n_jobs': -1}),
        save_path=out_dir,
        save_filename='annotated_data.fcs',
        val_range=(0, 2**20),
        keep_unscaled=False,
    )
    # Persist the trained pipeline for later reuse.
    pipeline.save(filepath=out_dir)
def workflow_step_wise_unsupervised_som_training():
    """Step-by-step unsupervised SOM training on the tutorial CSV data.

    Loads the training CSVs, checks channel consistency and sample sizes,
    applies a preprocessing transform, downsamples, trains a SomClassifier
    without labels (using a dummy label vector), and saves the fitted model.
    """
    import os
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from flagx.io import FlowDataManager
    from flagx.gating import SomClassifier
    # --- Define path where results are saved to
    save_path = './results/workflow_step_wise_unsupervised_som_training'
    save_path_data_handling = './results/workflow_step_wise_unsupervised_som_training/data_handling'
    os.makedirs(save_path_data_handling, exist_ok=True)
    # --- Specify where training data is saved and specify the corresponding filenames
    # Define path to training data
    training_data_path = './data/flagx_tutorial_data/training'
    # Get list of files in the data directory (only include ones ending with .csv)
    training_files = sorted([fn for fn in os.listdir(training_data_path) if fn.endswith('.csv')])
    # --- Before data processing with the FlowDataManager, check whether each file contains the same number of channels
    # Load the training files into pandas dataframes
    training_data_dfs = [pd.read_csv(os.path.join(training_data_path, fn)) for fn in training_files]
    # For each file print the number of channels
    for fn, data_df in zip(training_files, training_data_dfs):
        print(f'# --- {fn}, number of channels: {data_df.shape[1]}')
    print('\n')
    # As an additional check, also print the channel names
    for fn, data_df in zip(training_files, training_data_dfs):
        print(f'# --- {fn}:\n{data_df.columns.to_list()}')
    print('\n')
    # RESULT:
    # - Number of channels is consistent across samples
    # - If one sample had 14 instead of 13 channels (e.g. FS Peak),
    #   this channel should be removed to match the other samples.
    # --- Data loading and processing
    # Initialize the data manager
    fdm = FlowDataManager(
        data_file_names=training_files,
        data_file_type=None,  # Is inferred from the filename ending of the 1st file in the 'training_files' list
        data_file_path=training_data_path,
        save_path=save_path_data_handling,
        verbosity=1
    )
    # Load data into memory
    # Data samples are now stored not in a Pandas DataFrames, but in a list of AnnData object.
    # This list is an attribute of the FlowDataManager class and can be accessed via FlowDataManager.anndata_list_.
    # AnnData is a Python class similar to Pandas DataFrames but with more options and functions for data annotation.
    # see: https://anndata.readthedocs.io/en/stable/
    fdm.load_data_files_to_anndata()
    # Print the loaded AnnData objects containing the loaded training data.
    # NOTE: double quotes inside the f-string; re-using single quotes here is a
    # SyntaxError on Python < 3.12 (nested same-quote f-strings need PEP 701).
    for adata in fdm.anndata_list_:
        print(f'# --- {adata.uns["filename"]}:\n{adata}')
    # --- Check the number of events per sample
    # Create a dataframe with the columns sample and n_events, this df is an attribute of the FDM instance
    # and can be accessed via FDM.sample_sizes_. Since a filename is passed as well, the df is also saved in the
    # directory specified via 'save_path_data_handling'.
    fdm.check_sample_sizes(filename_sample_sizes_df='sample_sizes.csv')
    # Use a built-in plotting function to visualize the number of events per sample.
    # Resulting plot also saved to 'save_path_data_handling'.
    fig, ax = plt.subplots(dpi=300)
    fdm.plot_sample_size_df(sample_size_df=fdm.sample_sizes_, ax=ax)
    fig.savefig(os.path.join(save_path_data_handling, 'sample_sizes.png'))
    plt.close(fig)
    # --- Apply preprocessing transformation to each sample
    # Example 1: Apply arcsinh with cofactor 150,
    # Example 2: Apply log transformation with custom cutoffs
    # In both cases, store non-transformed data in a separate layer of the AnnData object that we call 'no_trafo'.
    example_1 = True
    if example_1:
        preprocessing_kwargs = {'cofactor': 150}
        fdm.sample_wise_preprocessing(flavour='arcsinh', save_raw_to_layer='no_trafo', **preprocessing_kwargs)
    else:
        # Define python dictionary mapping channel names to cutoffs (arbitrarily chosen here, adjust if needed)
        channel_name_to_cutoff = {
            'FS INT': 1000, 'SS INT': 800,
            '15-FITC': 300, '13-PE': 300, '34-ECD': 300, '117-PC5.5': 300, '33-PC7': 300,
            '2-APC': 200, '7-APC-AF700': 200, '-APC-AF750': 200, 'HLADR-PB': 200, '45-CO': 200,
        }
        preprocessing_kwargs = {'cutoffs': channel_name_to_cutoff}
        fdm.sample_wise_preprocessing(
            flavour='log10_w_custom_cutoffs', save_raw_to_layer='no_trafo', **preprocessing_kwargs
        )
    # --- Downsample each sample to a target number of events
    # Set target_num_events to 1000 for fast model training in this example
    fdm.sample_wise_downsampling(data_set='all', target_num_events=1000)
    # --- Extract concatenated data matrix for model training
    # Define channels to be used for model training
    channels = [
        'FS INT', 'SS INT',
        '15-FITC', '13-PE', '34-ECD', '117-PC5.5', '33-PC7', '2-APC', '7-APC-AF700', '-APC-AF750', 'HLADR-PB', '45-CO'
    ]
    # Extract the processed data matrices from the AnnData objects
    data_matrices = [adata[:, channels].X for adata in fdm.anndata_list_]
    # Concatenate
    x_train = np.concatenate(data_matrices, axis=0)
    # Shuffle
    idx_shuffle = np.random.permutation(x_train.shape[0])
    x_train = x_train[idx_shuffle]
    # --- SOM training
    # Instantiate the SOMClassifier, set hyperparameters
    som_clf = SomClassifier(
        som_topology='planar',
        som_grid_type='rectangular',
        som_dimensions=(6, 3),  # (25, 25)
        neighborhood='gaussian',
        gaussian_neighborhood_sigma=0.1,
        initialization='pca',
        n_epochs=100,  # 1000,
        radius_0=-0.25,
        radius_n=0.1,
        radius_cooling='exponential',
        learning_rate_0=0.1,
        learning_rate_n=0.001,
        learning_rate_decay='exponential',
        unlabeled_label=-999,
        verbosity=2
    )
    # Model fitting. Since no labels are available only the SOM component is trained in an unsupervised fashion.
    # Still, a dummy label vector must be passed containing the label chosen to indicate unlabeled events
    y_dummy = np.ones(x_train.shape[0]) * -999
    som_clf.fit(X=x_train, y=y_dummy)
    # Save the trained model
    som_clf.save(filename='som_classifier.pkl', filepath=save_path)
def workflow_step_wise_unsupervised_inference():
    """Step-by-step inference workflow for the unsupervised SOM model.

    Loads the test CSVs, repairs a known channel-naming artifact, applies the
    same preprocessing as during training, computes SOM/UMAP/t-SNE embeddings
    on the concatenated test events, and exports one annotated FCS file.
    """
    import os
    import numpy as np
    import pandas as pd
    from flagx.io import FlowDataManager, export_to_fcs
    from flagx.gating import SomClassifier
    from flagx.dimred import TSNE, UMAP

    # --- Output location
    save_path = './results/workflow_step_wise_unsupervised_inference'
    os.makedirs(save_path, exist_ok=True)

    # --- Locate the test CSV files
    test_data_path = './data/flagx_tutorial_data/testing'
    test_files = sorted(fn for fn in os.listdir(test_data_path) if fn.endswith('.csv'))

    # --- Sanity-check channel counts and names before handing data to the FlowDataManager
    test_data_dfs = [pd.read_csv(os.path.join(test_data_path, fn)) for fn in test_files]
    for fn, frame in zip(test_files, test_data_dfs):
        print(f'# --- {fn}, number of channels: {frame.shape[1]}')
    print('\n')
    for fn, frame in zip(test_files, test_data_dfs):
        print(f'# --- {fn}:\n{frame.columns.to_list()}')
    print('\n')

    # The raw test files contain a naming artifact: channel '-APC-AF750' appears
    # as '#NAME?' (probably introduced by manual renaming of the channels).
    # Repair the name and write corrected copies into a fresh directory.
    for frame in test_data_dfs:
        frame.rename(columns={'#NAME?': '-APC-AF750'}, inplace=True)
    test_data_path_correct = './data/flagx_tutorial_data/testing_corrected'
    os.makedirs(test_data_path_correct, exist_ok=True)
    for fn, frame in zip(test_files, test_data_dfs):
        frame.to_csv(os.path.join(test_data_path_correct, fn), index=False)

    # --- Data loading; processing below must mirror what was done at training time
    fdm = FlowDataManager(
        data_file_names=test_files,
        data_file_type=None,  # Inferred from the extension of the first filename
        data_file_path=test_data_path_correct,
        verbosity=1
    )
    fdm.load_data_files_to_anndata()

    # --- Preprocess each sample, same transform as used for training.
    # Example 1: arcsinh with cofactor 150; Example 2: log10 with custom cutoffs.
    # Either way the untransformed data is kept in AnnData layer 'no_trafo'.
    example_1 = True
    if example_1:
        fdm.sample_wise_preprocessing(flavour='arcsinh', save_raw_to_layer='no_trafo', cofactor=150)
    else:
        # Channel-name -> cutoff mapping (arbitrarily chosen here, adjust if needed)
        channel_name_to_cutoff = {
            'FS INT': 1000, 'SS INT': 800,
            '15-FITC': 300, '13-PE': 300, '34-ECD': 300, '117-PC5.5': 300, '33-PC7': 300,
            '2-APC': 200, '7-APC-AF700': 200, '-APC-AF750': 200, 'HLADR-PB': 200, '45-CO': 200,
        }
        fdm.sample_wise_preprocessing(
            flavour='log10_w_custom_cutoffs', save_raw_to_layer='no_trafo', cutoffs=channel_name_to_cutoff
        )

    # NOTE: typically no downsampling at inference time; done here only so
    # UMAP and t-SNE run quickly in this example.
    fdm.sample_wise_downsampling(data_set='all', target_num_events=100)

    # Channels that were used for model training
    channels = [
        'FS INT', 'SS INT',
        '15-FITC', '13-PE', '34-ECD', '117-PC5.5', '33-PC7', '2-APC', '7-APC-AF700', '-APC-AF750', 'HLADR-PB', '45-CO'
    ]
    # Processed per-sample matrices, their sizes, and the boundaries of each
    # sample inside the concatenated matrix
    data_matrices = [adata[:, channels].X for adata in fdm.anndata_list_]
    num_events = [m.shape[0] for m in data_matrices]
    bounds = np.cumsum([0] + num_events)
    x_test = np.concatenate(data_matrices, axis=0)

    def per_sample(values):
        # Split a concatenated 1-D array back into one chunk per test sample.
        return [values[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    # --- Dimensionality reductions on the concatenated test events
    # SOM: load the previously trained model and project onto the grid
    som_clf = SomClassifier.load(
        filename='som_classifier.pkl',
        filepath='./results/workflow_step_wise_unsupervised_som_training'
    )
    _, x_som, _, _ = som_clf.transform(x_test)
    # UMAP and t-SNE embeddings into 2D
    x_umap = UMAP(n_components=2, n_jobs=-1).fit_transform(x_test)
    x_tsne = TSNE(n_components=2, n_jobs=-1).fit_transform(x_test)

    # Back into sample-wise format (input format required by the export function)
    x_soms_1, x_soms_2 = per_sample(x_som[:, 0]), per_sample(x_som[:, 1])
    x_umaps_1, x_umaps_2 = per_sample(x_umap[:, 0]), per_sample(x_umap[:, 1])
    x_tsnes_1, x_tsnes_2 = per_sample(x_tsne[:, 0]), per_sample(x_tsne[:, 1])

    # --- Export to FCS
    export_to_fcs(
        data_list=fdm.anndata_list_,  # Export the test samples
        layer_key='no_trafo',  # Export the non-transformed data
        sample_wise=False,  # One FCS file with all test samples concatenated
        add_columns=[
            x_soms_1, x_soms_2,
            x_umaps_1, x_umaps_2,
            x_tsnes_1, x_tsnes_2
        ],  # 1st and 2nd dimension of each 2D dimensionality reduction
        add_columns_names=['SOM_1', 'SOM_2', 'UMAP_1', 'UMAP_2', 'TSNE_1', 'TSNE_2'],
        scale_columns=['SOM_1', 'SOM_2', 'UMAP_1', 'UMAP_2', 'TSNE_1', 'TSNE_2'],  # Added columns to scale
        val_range=(0, 2**20),  # Range the selected columns are scaled to
        save_path=save_path,
        save_filenames='annotated_test_data.fcs'
    )
def example_concatenating_csv_files():
    """Create small per-population example CSVs, then concatenate them into one
    labeled CSV plus a metadata file mapping each integer label to its source file.
    """
    import os
    import numpy as np
    import pandas as pd

    # --- Create example CSV files ---
    data_path = './data/example_csv'
    os.makedirs(data_path, exist_ok=True)

    # Four gated populations
    population_names = ['others', 'blasts', 'laip1', 'laip2']
    # Three measurement channels plus a sample-ID column
    num_channels = 3
    channel_names = [f'C_{i}' for i in range(num_channels)] + ['sample_id']

    for population_name in population_names:
        n_events = 10
        # Random standard-normal channel data, rounded for compact files
        values = np.random.normal(size=(n_events, num_channels), loc=0, scale=1).round(decimals=4)
        # Random sample IDs in {1, ..., 5}, appended as an extra column
        ids = np.random.randint(low=1, high=6, size=n_events).reshape(-1, 1)
        frame = pd.DataFrame(np.concatenate((values, ids), axis=1), columns=channel_names)
        frame.to_csv(os.path.join(data_path, f'{population_name}.csv'), index=False)

    # --- Concatenate CSVs into one file and add population labels ---
    filenames = ['others.csv', 'blasts.csv', 'laip1.csv', 'laip2.csv']
    labeled_frames = []
    label_records = []
    for label, filename in enumerate(filenames):
        frame = pd.read_csv(os.path.join(data_path, filename))
        # Integer code identifying the population this file holds
        frame['label'] = label
        labeled_frames.append(frame)
        # Remember which integer encodes which population
        label_records.append({'filename': filename, 'integer_label': label})

    # One big table containing all populations with labels attached
    pd.concat(labeled_frames, axis=0, ignore_index=True).to_csv(
        os.path.join(data_path, 'data_concatenated_with_population_labels.csv'), index=False
    )
    # Label-to-file mapping for later reference
    pd.DataFrame(label_records).to_csv(os.path.join(data_path, 'meta_data.csv'), index=False)
def workflow_step_wise_supervised_training():
    """Step-by-step supervised training on synthetic labeled data.

    Generates artificial labeled CSV samples, loads and preprocesses them with
    the FlowDataManager, then trains either a SomClassifier or an MLPClassifier
    and saves the fitted model to disk.
    """
    import os
    import torch
    import numpy as np
    import pandas as pd
    from flagx.io import FlowDataManager
    from flagx.gating import SomClassifier, MLPClassifier

    # --- Output locations
    save_path = './results/workflow_step_wise_supervised_training'
    save_path_data_handling = './results/workflow_step_wise_supervised_training/data_handling'
    os.makedirs(save_path_data_handling, exist_ok=True)

    # --- Generate artificial labeled training data for this example
    training_data_path = './data/supervised_example'
    os.makedirs(training_data_path, exist_ok=True)
    num_channels = 5
    num_events = 100
    channel_names = [f'C_{i}' for i in range(num_channels)] + ['label']
    for sample_idx in range(6):
        # Tight Gaussian blob centered at the sample index
        features = np.random.normal(size=(num_events, num_channels), loc=sample_idx, scale=0.01)
        # Random integer labels drawn from {sample_idx, sample_idx + 1},
        # attached as the last column of the data matrix
        labels = np.random.randint(low=sample_idx, high=sample_idx + 2, size=num_events).reshape(-1, 1)
        sample_df = pd.DataFrame(np.concatenate((features, labels), axis=1), columns=channel_names)
        sample_df.to_csv(os.path.join(training_data_path, f'sample_{sample_idx}.csv'), index=False)

    # Collect the CSV filenames just written
    training_files = sorted(fn for fn in os.listdir(training_data_path) if fn.endswith('.csv'))

    # --- Data loading and processing.
    # Samples are stored as AnnData objects (https://anndata.readthedocs.io/en/stable/)
    # in the list attribute FlowDataManager.anndata_list_.
    fdm = FlowDataManager(
        data_file_names=training_files,
        data_file_type=None,  # Inferred from the extension of the first filename
        data_file_path=training_data_path,
        save_path=save_path_data_handling,
        verbosity=1
    )
    fdm.load_data_files_to_anndata()

    # --- Preprocess: arcsinh with cofactor 150; raw data kept in layer 'no_trafo'
    fdm.sample_wise_preprocessing(flavour='arcsinh', save_raw_to_layer='no_trafo', cofactor=150)

    # --- Downsample each sample for fast model training in this example
    fdm.sample_wise_downsampling(data_set='all', target_num_events=50)

    # --- Extract the concatenated training matrix and label vector.
    # batch_size=-1 yields everything in a single batch. Labels are read from
    # the untransformed layer ('no_trafo'); otherwise the label column would be
    # arcsinh-transformed as well.
    channels = [f'C_{i}' for i in range(num_channels)]
    data_loader = fdm.get_data_loader(
        data_set='all',
        channels=channels,
        label_key='label',
        label_layer_key='no_trafo',
        batch_size=-1,
    )
    x_train, y_train = next(iter(data_loader))

    TRAIN_SOM = True
    if TRAIN_SOM:
        # SOM classifier with a small grid / few epochs for a quick example
        som_clf = SomClassifier(
            som_topology='planar',
            som_grid_type='rectangular',
            som_dimensions=(3, 3),  # (25, 25)
            neighborhood='gaussian',
            gaussian_neighborhood_sigma=0.1,
            initialization='pca',
            n_epochs=100,  # 1000,
            radius_0=-0.25,
            radius_n=0.1,
            radius_cooling='exponential',
            learning_rate_0=0.1,
            learning_rate_n=0.001,
            learning_rate_decay='exponential',
            unlabeled_label=-999,
            verbosity=2
        )
        som_clf.fit(X=x_train, y=y_train)
        som_clf.save(filename='som_classifier.pkl', filepath=save_path)
    else:
        # Small MLP alternative; trains on the GPU when one is available
        mlp_clf = MLPClassifier(
            layer_sizes=(128, 64, 32),
            n_epochs=100,
            data_loader_params={'batch_size': 128, 'shuffle': True, 'num_workers': 1},
            device='cuda' if torch.cuda.is_available() else 'cpu',
            verbosity=2
        )
        mlp_clf.fit(X=x_train, y=y_train)
        mlp_clf.save(filename='mlp_classifier.pkl', filepath=save_path)
def workflow_step_wise_supervised_inference():
    """Step-by-step inference with the supervised SOM and MLP models.

    Loads the synthetic samples, applies the training-time preprocessing,
    predicts labels with both saved classifiers, optionally adds SOM/UMAP/
    t-SNE embeddings, and exports one annotated FCS file.
    """
    import os
    import numpy as np
    from flagx.io import FlowDataManager, export_to_fcs
    from flagx.gating import SomClassifier, MLPClassifier
    from flagx.dimred import UMAP, TSNE

    # --- Output location
    save_path = './results/workflow_step_wise_supervised_inference'
    os.makedirs(save_path, exist_ok=True)

    # --- Locate the inference CSV files
    inference_data_path = './data/supervised_example'
    inference_files = sorted(fn for fn in os.listdir(inference_data_path) if fn.endswith('.csv'))

    # --- Load the data
    fdm = FlowDataManager(
        data_file_names=inference_files,
        data_file_type=None,  # Inferred from the extension of the first filename
        data_file_path=inference_data_path,
        verbosity=1
    )
    fdm.load_data_files_to_anndata()

    # --- Preprocess exactly as during training: arcsinh with cofactor 150;
    #     untransformed values kept in AnnData layer 'no_trafo'
    fdm.sample_wise_preprocessing(flavour='arcsinh', save_raw_to_layer='no_trafo', cofactor=150)

    # --- Build the concatenated test matrix from the channels used in training
    channels = [f'C_{i}' for i in range(5)]
    data_matrices = [adata[:, channels].X for adata in fdm.anndata_list_]
    # Per-sample sizes and the boundaries of each sample within the concatenation
    num_events = [m.shape[0] for m in data_matrices]
    bounds = np.cumsum([0] + num_events)
    x_test = np.concatenate(data_matrices, axis=0)

    def per_sample(values):
        # Split a concatenated 1-D array back into one chunk per sample
        # (the sample-wise format required by the export function).
        return [values[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    # --- Load the previously trained models
    som_clf = SomClassifier.load(
        filename='som_classifier.pkl',
        filepath='./results/workflow_step_wise_supervised_training'
    )
    mlp_clf = MLPClassifier.load(
        filename='mlp_classifier.pkl',
        filepath='./results/workflow_step_wise_supervised_training'
    )

    # --- Predict with both models and return to sample-wise format
    add_columns = [per_sample(som_clf.predict(x_test)), per_sample(mlp_clf.predict(x_test))]
    add_columns_names = ['pred_som', 'pred_mlp']

    # Set compute_dim_red = False to skip the dimensionality reductions
    compute_dim_red = True
    if compute_dim_red:
        # SOM grid coordinates plus 2D UMAP and t-SNE embeddings
        _, x_som, _, _ = som_clf.transform(x_test)
        x_umap = UMAP(n_components=2, n_jobs=-1).fit_transform(x_test)
        x_tsne = TSNE(n_components=2, n_jobs=-1).fit_transform(x_test)
        # Append each embedding dimension as its own per-sample column list
        for embedding in (x_som, x_umap, x_tsne):
            add_columns.append(per_sample(embedding[:, 0]))
            add_columns.append(per_sample(embedding[:, 1]))
        add_columns_names += ['SOM_1', 'SOM_2', 'UMAP_1', 'UMAP_2', 'TSNE_1', 'TSNE_2']

    # --- Export everything into one FCS file built from the raw ('no_trafo') data
    export_to_fcs(
        data_list=fdm.anndata_list_,  # Export the test samples
        layer_key='no_trafo',  # Export the non-transformed data
        sample_wise=False,  # One FCS file with all test samples concatenated
        add_columns=add_columns,  # Predictions plus (optionally) the embeddings
        add_columns_names=add_columns_names,  # Names for the added columns
        scale_columns=add_columns_names,  # Scale every added column
        val_range=(0, 2 ** 20),  # Range the selected columns are scaled to
        save_path=save_path,
        save_filenames='annotated_test_data.fcs'
    )
if __name__ == '__main__':
    # Tutorial entry point: each workflow below is self-contained;
    # uncomment the one(s) you want to run. Note that the two inference
    # workflows expect their matching training workflow to have run first.
    # workflow_pipeline_unsupervised()
    # workflow_step_wise_unsupervised_som_training()
    # workflow_step_wise_unsupervised_inference()
    # example_concatenating_csv_files()
    # workflow_step_wise_supervised_training()
    workflow_step_wise_supervised_inference()
    print('done')