liver-stator/doublet.py at main · whimsial/liver-stator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python
# coding: utf-8

import scanpy as sc
import scrublet as scr
import matplotlib.pyplot as plt
import numpy as np
import os
import argparse
import glob

## set graphics parameters
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rc('font', size = 14)
plt.rcParams['pdf.fonttype'] = 42

## get input parameters
parser = argparse.ArgumentParser(description="Doublet detection.")

parser.add_argument(
    '--sample_dir', type=str,
    help='A directory path for the sample'
)

parser.add_argument(
    '--data_type', type=str, required=True,
    help='Specify type of sample files (mtx or h5)'
)

parser.add_argument(
    '--doublet_threshold', type=float, required=True,
    help='Specify doublet threshold after examining distributions'
)

args = parser.parse_args()

print("You have passed the following directory:")
s = args.sample_dir
print(s)
if os.path.isdir(s):
    print(f"Directory: {s}")
else:
    print(f"Invalid directory: {s}")

## load count matrix
if args.data_type=="h5":
    ## find h5 file within sample directory
    full_path = os.path.join(s, "*.h5")
    sample_files = glob.glob(full_path)
    if sample_files:
        sample_file = sample_files[0]
    else:
        raise FileNotFoundError(f"No *.h5 files found in '{s}'")
    counts_matrix = sc.read_10x_h5(sample_file)
else:
    counts_matrix = sc.read_10x_mtx(s, var_names='gene_symbols', cache=True)

## doublet rates in 1000 cells normally ~0.8% per 1k cells
edr = 0.008
expected_doublet_rate = float(edr) * counts_matrix.shape[0] / 1000
print('EDR: {}'.format(expected_doublet_rate))

scrub = scr.Scrublet(counts_matrix.X,
                     expected_doublet_rate = float(expected_doublet_rate))

doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=60,
    n_prin_comps=30,
    log_transform=True,
    mean_center=True,
    normalize_variance=True,
    synthetic_doublet_umi_subsampling=1)

scrub.call_doublets(threshold=args.doublet_threshold)

outf1 = s + "/scrublet_EDR" + str(edr) + "_DoubletScores.csv"
outf2 = s + "/scrublet_EDR" + str(edr) + "_PredictedDoublets.csv"
np.savetxt(outf1, doublet_scores, delimiter=',')
np.savetxt(outf2, predicted_doublets, delimiter=',')
print("Saved " + outf1)
print("Saved " + outf2)