Skip to content

Commit fbb1fa9

Browse files
author
Marc Maxmeister
authored
v1.2.2 added --save_control CLI option (#42)
1 parent e132dac commit fbb1fa9

File tree

15 files changed

+355
-47
lines changed

15 files changed

+355
-47
lines changed

methylprep/cli.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,14 @@ def cli_process(cmd_args):
176176
help="Change the processed beta or m_value data_type output from float64 to float16 or float32, to save disk space.",
177177
)
178178

179+
parser.add_argument(
180+
'-c', '--save_control',
181+
required=False,
182+
action='store_true',
183+
default=False,
184+
help='If specified, saves an additional "control_probes.pkl" file that contains Control and SNP-I probe data in the data_dir.'
185+
)
186+
179187
args = parser.parse_args(cmd_args)
180188

181189
array_type = args.array_type
@@ -200,6 +208,7 @@ def cli_process(cmd_args):
200208
export=args.no_export, # flag flips here
201209
meta_data_frame=args.no_meta_export, # flag flips here
202210
bit=args.bit,
211+
save_control=args.save_control,
203212
)
204213

205214

methylprep/files/idat.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ class IdatHeaderLocation(IntEnum):
3939
class IdatSectionCode(IntEnum):
4040
"""Unique IntEnum defining constant values for byte offsets of IDAT headers.
4141
These values come from the field codes of the Bioconductor illuminaio package.
42+
43+
MM: refer to https://bioconductor.org/packages/release/bioc/vignettes/illuminaio/inst/doc/EncryptedFormat.pdf
44+
and https://bioconductor.org/packages/release/bioc/vignettes/illuminaio/inst/doc/illuminaio.pdf
4245
"""
4346
ILLUMINA_ID = 102
4447
STD_DEV = 103
@@ -239,7 +242,7 @@ def seek_to_section(section_code):
239242
data=probe_records,
240243
orient='index',
241244
columns=['mean_value'],
242-
dtype='float64',
245+
dtype='float32',
243246
)
244247

245248
data_frame.index.name = 'illumina_id'

methylprep/files/manifests.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
'Control_Type',
5151
'Color',
5252
'Extended_Type',
53+
# control probes don't have 'IlmnID' values set -- these probes are not locii specific
5354
)
5455

5556

@@ -80,6 +81,7 @@ def __init__(self, array_type, filepath_or_buffer=None, on_lambda=False):
8081
with get_file_object(filepath_or_buffer) as manifest_file:
8182
self.__data_frame = self.read_probes(manifest_file)
8283
self.__control_data_frame = self.read_control_probes(manifest_file)
84+
self.__snp_data_frame = self.read_snp_probes(manifest_file)
8385

8486
@property
8587
def columns(self):
@@ -93,6 +95,10 @@ def data_frame(self):
9395
def control_data_frame(self):
9496
return self.__control_data_frame
9597

98+
@property
99+
def snp_data_frame(self):
100+
return self.__snp_data_frame
101+
96102
@staticmethod
97103
def download_default(array_type, on_lambda=False):
98104
"""Downloads the appropriate manifest file if one does not already exist.
@@ -150,6 +156,13 @@ def read_probes(self, manifest_file):
150156
)
151157

152158
def get_probe_type(name, infinium_type):
159+
"""what:
160+
returns one of (I, II, SnpI, SnpII, Control)
161+
162+
how:
163+
.from_manifest_values() returns probe type using either
164+
the Infinium_Design_Type (I or II) or the name (starts with 'rs' == SnpI)
165+
and 'Control' is none of the above."""
153166
probe_type = ProbeType.from_manifest_values(name, infinium_type)
154167
return probe_type.value
155168

@@ -163,23 +176,37 @@ def get_probe_type(name, infinium_type):
163176
return data_frame
164177

165178
def read_control_probes(self, manifest_file):
166-
LOGGER.info(f'Reading control probes: {Path(manifest_file.name).stem}')
179+
""" Unlike other probes, control probes have no IlmnID because they're not locus-specific. """
180+
#LOGGER.info(f'Reading control probes: {Path(manifest_file.name).stem}')
167181

168182
self.seek_to_start(manifest_file)
169183

170-
num_headers = 4
184+
#num_headers = 4 -- removed on Jan 21 2020.
185+
# Steve Byerly added this =4, but the manifests seem to start controls at the point specified, with no extra columns based on .num_probes stored.
186+
num_headers = 0
171187

172188
return pd.read_csv(
173189
manifest_file,
174190
comment='[',
175191
header=None,
176-
index_col=0,
192+
index_col=0, # illumina_id, not IlmnID here
177193
names=CONTROL_COLUMNS,
178194
nrows=self.array_type.num_controls,
179195
skiprows=self.array_type.num_probes + num_headers,
180196
usecols=range(len(CONTROL_COLUMNS)),
181197
)
182198

199+
def read_snp_probes(self, manifest_file):
200+
""" Unlike cpg and control probes, these rs probes are NOT sequential in all arrays. """
201+
#LOGGER.info(f'Reading snp probes: {Path(manifest_file.name).stem} --> {snp_df.shape[0]} found')
202+
self.seek_to_start(manifest_file)
203+
# since these are not sequential, loading everything and filtering by IlmnID.
204+
snp_df = pd.read_csv(
205+
manifest_file,
206+
low_memory=False)
207+
snp_df = snp_df[snp_df['IlmnID'].str.match('rs', na=False)]
208+
return snp_df
209+
183210
def map_to_genome(self, data_frame):
184211
genome_df = self.get_genome_data()
185212
merged_df = inner_join_data(data_frame, genome_df)
@@ -221,6 +248,9 @@ def get_loci_names(self):
221248
return self.data_frame['Name'].unique()
222249

223250
def get_probe_details(self, probe_type, channel=None):
251+
"""given a probe type (I, II, SnpI, SnpII, Control) and a channel (Channel.RED | Channel.GREEN),
252+
This will return info needed to map probes to their names (e.g. cg0031313 or rs00542420),
253+
which are NOT in the idat files."""
224254
if not isinstance(probe_type, ProbeType):
225255
raise Exception('probe_type is not a valid ProbeType')
226256

methylprep/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
FG_RED_PROBE_SUBSETS,
77
METHYLATED_PROBE_SUBSETS,
88
UNMETHYLATED_PROBE_SUBSETS,
9+
METHYLATED_SNP_PROBES,
10+
UNMETHYLATED_SNP_PROBES,
911
Channel,
1012
Probe,
1113
ProbeAddress,

methylprep/models/arrays.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def __str__(self):
1515

1616
@classmethod
1717
def from_probe_count(cls, probe_count):
18-
"""Determines array type using number of probes. Returns array string."""
18+
"""Determines array type using number of probes counted in raw idat file. Returns array string."""
1919
if probe_count == 1055583:
2020
return cls.ILLUMINA_EPIC_PLUS
2121

@@ -36,20 +36,28 @@ def num_probes(self):
3636
ArrayType.ILLUMINA_27K: 27578,
3737
ArrayType.ILLUMINA_450K: 485578,
3838
ArrayType.ILLUMINA_EPIC: 865919,
39-
ArrayType.ILLUMINA_EPIC_PLUS: 868699,
39+
ArrayType.ILLUMINA_EPIC_PLUS: 868698, # was 868699 until Jan 21, 2020. corrected.
4040
}
41-
4241
return probe_counts.get(self)
4342

4443
@property
4544
def num_controls(self):
4645
probe_counts = {
47-
ArrayType.ILLUMINA_27K: 144,
46+
ArrayType.ILLUMINA_27K: 144, # the manifest does not contain control probe data (illumina's site included)
4847
ArrayType.ILLUMINA_450K: 850,
4948
ArrayType.ILLUMINA_EPIC: 635,
5049
ArrayType.ILLUMINA_EPIC_PLUS: 635,
5150
}
51+
return probe_counts.get(self)
5252

53+
@property
54+
def num_snps(self):
55+
probe_counts = {
56+
ArrayType.ILLUMINA_27K: 0,
57+
ArrayType.ILLUMINA_450K: 65,
58+
ArrayType.ILLUMINA_EPIC: 59,
59+
ArrayType.ILLUMINA_EPIC_PLUS: 120,
60+
}
5361
return probe_counts.get(self)
5462

5563
''' # doesn't appear to be used anywhere. -- and pipeline Array model conflicts with it.

methylprep/models/probes.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
@unique
66
class Channel(Enum):
7+
""" idat probes measure either a red or green fluorescence.
8+
This specifies which to return within idat.py: red_idat or green_idat."""
79
RED = 'Red'
810
GREEN = 'Grn'
911

@@ -21,6 +23,13 @@ def is_red(self):
2123

2224
@unique
2325
class ProbeAddress(Enum):
26+
"""AddressA_ID and AddressB_ID are columns in the manifest csv that contain internal Illumina probe identifiers.
27+
28+
Type II probes use AddressA_ID; Type I uses both, because there are two probes, two colors.
29+
30+
probe intensities in .idat files are keyed to one of these ids, but processed data is always keyed to the
31+
IlmnID probe "names" -- so this is used in converting between IDs. It is used to define probe sets below
32+
in this probes.py"""
2433
A = 'A'
2534
B = 'B'
2635

@@ -33,6 +42,11 @@ def header_name(self):
3342

3443
@unique
3544
class ProbeType(Enum):
45+
""" probes can either be type I or type II for CpG or Snp sequences. Control probes are used for background
46+
correction in different fluorescence ranges and staining efficiency.
47+
Type I probes record EITHER a red or a green value.
48+
Type II probes record both values together.
49+
NOOB uses the red fluorescence on a green probe and vice versa to calculate background fluorescence."""
3650
ONE = 'I'
3751
TWO = 'II'
3852
SNP_ONE = 'SnpI'
@@ -44,6 +58,9 @@ def __str__(self):
4458

4559
@staticmethod
4660
def from_manifest_values(name, infinium_type):
61+
""" this function determines which of four kinds of probe goes with this name, using either
62+
the Infinium_Design_Type (I or II) or the name (starts with 'rs')
63+
and decides 'Control' is non of the above."""
4764
is_snp = name.startswith('rs')
4865

4966
if infinium_type == 'I':
@@ -56,6 +73,7 @@ def from_manifest_values(name, infinium_type):
5673

5774

5875
class Probe():
76+
""" this doesn't appear to be instantiated anywhere in methylprep """
5977
__slots__ = [
6078
'address',
6179
'illumina_id',
@@ -69,6 +87,9 @@ def __init__(self, address, illumina_id, probe_type):
6987

7088

7189
class ProbeSubset():
90+
""" used below in probes.py to define sub-sets of probes:
91+
foreground-(red|green|all), or (un)methylated probes
92+
"""
7293
__slots__ = [
7394
'data_channel',
7495
'probe_address',
@@ -104,6 +125,81 @@ def get_probe_details(self, manifest):
104125
)
105126

106127

128+
'''
129+
# notebook equivalent of SNP_PROBES below.
130+
#manifest = data_containers[0].manifest.data_frame
131+
#snpProbesI = manifest[manifest['probe_type']=='SnpI']
132+
#snpProbesI_Grn = snpProbesI[snpProbesI['Color_Channel']=='Grn']
133+
#snpProbesI_Red = snpProbesI[snpProbesI['Color_Channel']=='Red']
134+
#snpProbesII = manifest[manifest['probe_type']=='SnpII']
135+
136+
SNP_PROBES = (
137+
# SNP_II_PROBES
138+
ProbeSubset(
139+
data_channel=Channel.GREEN,
140+
probe_address=ProbeAddress.A,
141+
probe_channel=None,
142+
probe_type=ProbeType.SNP_TWO,
143+
),
144+
# SNP_I_GREEN_PROBES
145+
ProbeSubset(
146+
data_channel=Channel.GREEN,
147+
probe_address=ProbeAddress.A,
148+
probe_channel=Channel.GREEN,
149+
probe_type=ProbeType.SNP_ONE,
150+
),
151+
# SNP_I_RED_PROBES
152+
ProbeSubset(
153+
data_channel=Channel.RED,
154+
probe_address=ProbeAddress.B,
155+
probe_channel=Channel.RED,
156+
probe_type=ProbeType.SNP_ONE,
157+
)
158+
)
159+
'''
160+
161+
METHYLATED_SNP_PROBES = (
162+
ProbeSubset(
163+
data_channel=Channel.GREEN,
164+
probe_address=ProbeAddress.A,
165+
probe_channel=None,
166+
probe_type=ProbeType.SNP_TWO,
167+
),
168+
ProbeSubset(
169+
data_channel=Channel.GREEN,
170+
probe_address=ProbeAddress.B,
171+
probe_channel=Channel.GREEN,
172+
probe_type=ProbeType.SNP_ONE,
173+
),
174+
ProbeSubset(
175+
data_channel=Channel.RED,
176+
probe_address=ProbeAddress.B,
177+
probe_channel=Channel.RED,
178+
probe_type=ProbeType.SNP_ONE,
179+
),
180+
)
181+
182+
UNMETHYLATED_SNP_PROBES = (
183+
ProbeSubset(
184+
data_channel=Channel.RED,
185+
probe_address=ProbeAddress.A,
186+
probe_channel=None,
187+
probe_type=ProbeType.SNP_TWO,
188+
),
189+
ProbeSubset(
190+
data_channel=Channel.GREEN,
191+
probe_address=ProbeAddress.A,
192+
probe_channel=Channel.GREEN,
193+
probe_type=ProbeType.SNP_ONE,
194+
),
195+
ProbeSubset(
196+
data_channel=Channel.RED,
197+
probe_address=ProbeAddress.A,
198+
probe_channel=Channel.RED,
199+
probe_type=ProbeType.SNP_ONE,
200+
),
201+
)
202+
107203
FG_GREEN_PROBE_SUBSETS = (
108204
ProbeSubset(
109205
data_channel=Channel.GREEN,

methylprep/processing/meth_dataset.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,12 @@
22
import logging
33
import pandas as pd
44
# App
5-
from ..models import METHYLATED_PROBE_SUBSETS, UNMETHYLATED_PROBE_SUBSETS
5+
from ..models import (
6+
METHYLATED_PROBE_SUBSETS,
7+
UNMETHYLATED_PROBE_SUBSETS,
8+
METHYLATED_SNP_PROBES,
9+
UNMETHYLATED_SNP_PROBES,
10+
)
611

712

813
__all__ = ['MethylationDataset']
@@ -25,7 +30,7 @@ class MethylationDataset():
2530
__preprocessed = False
2631

2732
def __init__(self, raw_dataset, manifest, probe_subsets):
28-
LOGGER.info('Preprocessing methylation dataset: %s', raw_dataset.sample)
33+
#LOGGER.info('Preprocessing methylation dataset: %s', raw_dataset.sample)
2934

3035
self.probe_subsets = probe_subsets
3136
self.raw_dataset = raw_dataset
@@ -39,12 +44,24 @@ def __init__(self, raw_dataset, manifest, probe_subsets):
3944

4045
@classmethod
4146
def methylated(cls, raw_dataset, manifest):
47+
""" convenience method that feeds in a pre-defined list of methylated CpG locii probes """
4248
return cls(raw_dataset, manifest, METHYLATED_PROBE_SUBSETS)
4349

4450
@classmethod
4551
def unmethylated(cls, raw_dataset, manifest):
52+
""" convenience method that feeds in a pre-defined list of UNmethylated CpG locii probes """
4653
return cls(raw_dataset, manifest, UNMETHYLATED_PROBE_SUBSETS)
4754

55+
@classmethod
56+
def snp_methylated(cls, raw_dataset, manifest):
57+
""" convenience method that feeds in a pre-defined list of methylated Snp locii probes """
58+
return cls(raw_dataset, manifest, METHYLATED_SNP_PROBES)
59+
60+
@classmethod
61+
def snp_unmethylated(cls, raw_dataset, manifest):
62+
""" convenience method that feeds in a pre-defined list of methylated Snp locii probes """
63+
return cls(raw_dataset, manifest, UNMETHYLATED_SNP_PROBES)
64+
4865
def build_data_frame(self):
4966
return pd.concat(self.data_frames.values())
5067

0 commit comments

Comments
 (0)