From 893da488cd48ec0a1876b7074ca73c49e02167e4 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 23 Apr 2025 16:42:44 +0200 Subject: [PATCH 01/41] Add base collections to Base flag --- mewtwo/embeddings/bases.py | 8 ++++---- mewtwo/embeddings/sequence.py | 6 +++--- mewtwo/embeddings/terminator/stem.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mewtwo/embeddings/bases.py b/mewtwo/embeddings/bases.py index 732079f..12e1698 100644 --- a/mewtwo/embeddings/bases.py +++ b/mewtwo/embeddings/bases.py @@ -9,14 +9,14 @@ class Base(Flag): G = 4 T = 8 U = 16 + DNA = A | T | G | C + RNA = A | U | G | C + PURINES = A | G + PYRIMIDINES = C | T | U def __repr__(self): return self.name - -DNA_BASES = Base.A | Base.T | Base.G | Base.C -RNA_BASES = Base.A | Base.U | Base.G | Base.C - WATSON_CRICK_PAIRS = [Base.A | Base.T, Base.A | Base.U, Base.C | Base.G] diff --git a/mewtwo/embeddings/sequence.py b/mewtwo/embeddings/sequence.py index 5335bbb..bd433ef 100644 --- a/mewtwo/embeddings/sequence.py +++ b/mewtwo/embeddings/sequence.py @@ -1,6 +1,6 @@ from enum import Flag from typing import Union -from mewtwo.embeddings.bases import Base, DNA_BASES, RNA_BASES +from mewtwo.embeddings.bases import Base class SeqType(Flag): @@ -40,14 +40,14 @@ def _check_sequence(self): if self.seq_type == SeqType.DNA: try: base = Base[character] - if base not in DNA_BASES: + if base not in Base.DNA: raise ValueError(f"DNA sequence must be comprised of bases A, T, C, and G. Found {character} in {self.sequence}") except KeyError: raise ValueError(f"DNA sequence must be comprised of bases A, T, C, and G. Found {character} in {self.sequence}") elif self.seq_type == SeqType.RNA: try: base = Base[character] - if base not in RNA_BASES: + if base not in Base.RNA: raise ValueError(f"RNA sequence must be comprised of bases A, C, G and U. Found {character} in {self.sequence}") except KeyError: raise ValueError(f"RNA sequence must be comprised of bases A, C, G, and U. Found {character} in {self.sequence}") diff --git a/mewtwo/embeddings/terminator/stem.py b/mewtwo/embeddings/terminator/stem.py index 0e0a09d..0530b6c 100644 --- a/mewtwo/embeddings/terminator/stem.py +++ b/mewtwo/embeddings/terminator/stem.py @@ -1,4 +1,4 @@ -from mewtwo.embeddings.bases import BasePair, Base +from mewtwo.embeddings.bases import BasePair from mewtwo.embeddings.sequence import RNASequence From 3d98f1ca267f14ae20cf9ca8b981300c3d9bb5ef Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 23 Apr 2025 17:23:17 +0200 Subject: [PATCH 02/41] Add tests for bases --- mewtwo/embeddings/bases.py | 43 ++++++++++++++++++++++++++-- mewtwo/embeddings/test/test_bases.py | 38 ++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 mewtwo/embeddings/test/test_bases.py diff --git a/mewtwo/embeddings/bases.py b/mewtwo/embeddings/bases.py index 12e1698..331fc92 100644 --- a/mewtwo/embeddings/bases.py +++ b/mewtwo/embeddings/bases.py @@ -9,13 +9,52 @@ class Base(Flag): G = 4 T = 8 U = 16 + ZERO_PADDING = 32 DNA = A | T | G | C RNA = A | U | G | C PURINES = A | G PYRIMIDINES = C | T | U + TWO_H_BONDS = A | T | U + THREE_H_BONDS = C | G + + +BASE_TO_ONEHOT = {Base.A: [1, 0, 0, 0], + Base.C: [0, 1, 0, 0], + Base.G: [0, 0, 1, 0], + Base.T: [0, 0, 0, 1], + Base.U: [0, 0, 0, 1], + Base.ZERO_PADDING: [0, 0, 0, 0]} + + +def base_to_vector(base: Base, one_hot: bool = False) -> list[int]: + + if one_hot: + if base not in BASE_TO_ONEHOT: + raise ValueError(f"Not a base: {base}") + else: + return BASE_TO_ONEHOT[base] + + else: + if base in Base.PYRIMIDINES: + element_1 = 1 + elif base in Base.PURINES: + element_1 = 2 + elif base == Base.ZERO_PADDING: + element_1 = 0 + else: + raise ValueError(f"Unknown base: {base}") + + if base in Base.TWO_H_BONDS: + element_2 = 2 + elif base in Base.THREE_H_BONDS: + element_2 = 3 + elif base == Base.ZERO_PADDING: + element_2 = 0 + else: + raise ValueError(f"Unknown base: {base}") + + return [element_1, element_2] - def __repr__(self): - return self.name WATSON_CRICK_PAIRS = [Base.A | Base.T, Base.A | Base.U, diff --git a/mewtwo/embeddings/test/test_bases.py b/mewtwo/embeddings/test/test_bases.py new file mode 100644 index 0000000..bc53d7a --- /dev/null +++ b/mewtwo/embeddings/test/test_bases.py @@ -0,0 +1,38 @@ +import unittest + +from mewtwo.embeddings.bases import Base, base_to_vector, BasePair + + +class TestBase(unittest.TestCase): + def test_to_vector(self): + self.assertEqual(base_to_vector(Base.A), [2, 2]) + self.assertEqual(base_to_vector(Base.U), [1, 2]) + self.assertEqual(base_to_vector(Base.G), [2, 3]) + self.assertEqual(base_to_vector(Base.C), [1, 3]) + self.assertEqual(base_to_vector(Base.T), [1, 2]) + self.assertEqual(base_to_vector(Base.ZERO_PADDING), [0, 0]) + + with self.assertRaises(ValueError): + base_to_vector(Base.DNA) + + +class TestBasePair(unittest.TestCase): + def test_is_watson_crick(self): + self.assertTrue(BasePair(Base.C, Base.G, True).is_watson_crick()) + self.assertTrue(BasePair(Base.G, Base.C, True).is_watson_crick()) + self.assertTrue(BasePair(Base.A, Base.T, True).is_watson_crick()) + self.assertTrue(BasePair(Base.T, Base.A, True).is_watson_crick()) + self.assertTrue(BasePair(Base.A, Base.U, True).is_watson_crick()) + self.assertTrue(BasePair(Base.U, Base.A, True).is_watson_crick()) + self.assertFalse(BasePair(Base.G, Base.U, True).is_watson_crick()) + self.assertFalse(BasePair(Base.U, Base.U, True).is_watson_crick()) + + def test_is_wobble(self): + self.assertTrue(BasePair(Base.G, Base.U, True).is_wobble()) + self.assertTrue(BasePair(Base.U, Base.G, True).is_wobble()) + self.assertFalse(BasePair(Base.G, Base.T, True).is_wobble()) + self.assertFalse(BasePair(Base.T, Base.G, True).is_wobble()) + + +if __name__ == '__main__': + unittest.main() From 61e3a9a47c6e8420a2aed84fd0d4fc1df48b85f3 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 23 Apr 2025 17:30:49 +0200 Subject: [PATCH 03/41] Add negative test for Base --- mewtwo/embeddings/bases.py | 3 --- mewtwo/embeddings/test/test_bases.py | 9 +++++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/mewtwo/embeddings/bases.py b/mewtwo/embeddings/bases.py index 331fc92..e1a119c 100644 --- a/mewtwo/embeddings/bases.py +++ b/mewtwo/embeddings/bases.py @@ -113,6 +113,3 @@ def is_wobble(self): return True return False - - - diff --git a/mewtwo/embeddings/test/test_bases.py b/mewtwo/embeddings/test/test_bases.py index bc53d7a..976a565 100644 --- a/mewtwo/embeddings/test/test_bases.py +++ b/mewtwo/embeddings/test/test_bases.py @@ -12,6 +12,15 @@ def test_to_vector(self): self.assertEqual(base_to_vector(Base.T), [1, 2]) self.assertEqual(base_to_vector(Base.ZERO_PADDING), [0, 0]) + self.assertEqual(base_to_vector(Base.A, one_hot=True), [1, 0, 0, 0]) + self.assertEqual(base_to_vector(Base.U, one_hot=True), [0, 0, 0, 1]) + self.assertEqual(base_to_vector(Base.G, one_hot=True), [0, 0, 1, 0]) + self.assertEqual(base_to_vector(Base.C, one_hot=True), [0, 1, 0, 0]) + self.assertEqual(base_to_vector(Base.T, one_hot=True), [0, 0, 0, 1]) + self.assertEqual(base_to_vector(Base.ZERO_PADDING, one_hot=True), [0, 0, 0, 0]) + + self.assertNotEqual(base_to_vector(Base.A, one_hot=True), [2, 2]) + with self.assertRaises(ValueError): base_to_vector(Base.DNA) From e664d6251e67b7aea8e671d786755166e86326b7 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 24 Apr 2025 09:53:41 +0200 Subject: [PATCH 04/41] Add base pair featurisation --- mewtwo/embeddings/bases.py | 34 ++++++++++++++++++++++++++++ mewtwo/embeddings/test/test_bases.py | 19 +++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/mewtwo/embeddings/bases.py b/mewtwo/embeddings/bases.py index e1a119c..da36828 100644 --- a/mewtwo/embeddings/bases.py +++ b/mewtwo/embeddings/bases.py @@ -3,6 +3,13 @@ from enum import Flag +class PairingType(Flag): + STRUCTURE_BASED = 1 + WATSON_CRICK = 2 + WOBBLE = 4 + WOBBLE_OR_WATSON_CRICK = WOBBLE | WATSON_CRICK + + class Base(Flag): A = 1 C = 2 @@ -113,3 +120,30 @@ def is_wobble(self): return True return False + + def to_vector(self, one_hot: bool = False, + pairing_type: PairingType = PairingType.STRUCTURE_BASED) -> list[int]: + + vector = base_to_vector(self.base_1, one_hot) + vector.extend(base_to_vector(self.base_2, one_hot)) + if pairing_type == PairingType.WOBBLE_OR_WATSON_CRICK: + + if self.is_watson_crick() or self.is_wobble(): + vector.append(1) + else: + vector.append(0) + + elif pairing_type == PairingType.STRUCTURE_BASED: + if self.h_bonded: + vector.append(1) + else: + vector.append(0) + elif pairing_type == PairingType.WATSON_CRICK: + if self.is_watson_crick(): + vector.append(1) + else: + vector.append(0) + else: + raise ValueError(f"Pairing type must be structure-based, Watson-Crick, or a combination of Watson-Crick and Wobble. Got {pairing_type.name}") + + return vector diff --git a/mewtwo/embeddings/test/test_bases.py b/mewtwo/embeddings/test/test_bases.py index 976a565..94c0f9b 100644 --- a/mewtwo/embeddings/test/test_bases.py +++ b/mewtwo/embeddings/test/test_bases.py @@ -1,6 +1,6 @@ import unittest -from mewtwo.embeddings.bases import Base, base_to_vector, BasePair +from mewtwo.embeddings.bases import Base, base_to_vector, BasePair, PairingType class TestBase(unittest.TestCase): @@ -42,6 +42,23 @@ def test_is_wobble(self): self.assertFalse(BasePair(Base.G, Base.T, True).is_wobble()) self.assertFalse(BasePair(Base.T, Base.G, True).is_wobble()) + def test_to_vector(self): + base_pair_1 = BasePair(Base.G, Base.U, False) + base_pair_2 = BasePair(Base.A, Base.U, True) + + self.assertEqual(base_pair_1.to_vector(), [2, 3, 1, 2, 0]) + self.assertEqual(base_pair_1.to_vector(one_hot=True), [0, 0, 1, 0, 0, 0, 0, 1, 0]) + self.assertEqual(base_pair_1.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [2, 3, 1, 2, 1]) + self.assertEqual(base_pair_1.to_vector(pairing_type=PairingType.WATSON_CRICK), [2, 3, 1, 2, 0]) + + with self.assertRaises(ValueError): + base_pair_1.to_vector(pairing_type=PairingType.WOBBLE) + + self.assertEqual(base_pair_2.to_vector(), [2, 2, 1, 2, 1]) + self.assertEqual(base_pair_2.to_vector(one_hot=True), [1, 0, 0, 0, 0, 0, 0, 1, 1]) + self.assertEqual(base_pair_2.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [2, 2, 1, 2, 1]) + self.assertEqual(base_pair_2.to_vector(pairing_type=PairingType.WATSON_CRICK), [2, 2, 1, 2, 1]) + if __name__ == '__main__': unittest.main() From 6e46de7fce25db6374f47732a6a55e193c66cf52 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 24 Apr 2025 10:03:22 +0200 Subject: [PATCH 05/41] Add test for zero-padding base --- mewtwo/embeddings/test/test_bases.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mewtwo/embeddings/test/test_bases.py b/mewtwo/embeddings/test/test_bases.py index 94c0f9b..e4c8fa3 100644 --- a/mewtwo/embeddings/test/test_bases.py +++ b/mewtwo/embeddings/test/test_bases.py @@ -45,6 +45,7 @@ def test_is_wobble(self): def test_to_vector(self): base_pair_1 = BasePair(Base.G, Base.U, False) base_pair_2 = BasePair(Base.A, Base.U, True) + base_pair_3 = BasePair(Base.ZERO_PADDING, Base.ZERO_PADDING, True) self.assertEqual(base_pair_1.to_vector(), [2, 3, 1, 2, 0]) self.assertEqual(base_pair_1.to_vector(one_hot=True), [0, 0, 1, 0, 0, 0, 0, 1, 0]) @@ -59,6 +60,8 @@ def test_to_vector(self): self.assertEqual(base_pair_2.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [2, 2, 1, 2, 1]) self.assertEqual(base_pair_2.to_vector(pairing_type=PairingType.WATSON_CRICK), [2, 2, 1, 2, 1]) + self.assertEqual(base_pair_3.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [0, 0, 0, 0, 0]) + if __name__ == '__main__': unittest.main() From e5a5a86714b89b98950e2e950fe082303872e0e6 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 24 Apr 2025 14:56:31 +0200 Subject: [PATCH 06/41] Bugfix: prevent aliasing of reference dictionary when one-hot encoding --- mewtwo/embeddings/bases.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mewtwo/embeddings/bases.py b/mewtwo/embeddings/bases.py index da36828..283c3b6 100644 --- a/mewtwo/embeddings/bases.py +++ b/mewtwo/embeddings/bases.py @@ -39,7 +39,7 @@ def base_to_vector(base: Base, one_hot: bool = False) -> list[int]: if base not in BASE_TO_ONEHOT: raise ValueError(f"Not a base: {base}") else: - return BASE_TO_ONEHOT[base] + return BASE_TO_ONEHOT[base][:] else: if base in Base.PYRIMIDINES: @@ -72,18 +72,18 @@ def base_to_vector(base: Base, one_hot: bool = False) -> list[int]: @dataclass class BasePair: - base_1: Optional[Base] - base_2: Optional[Base] + base_1: Base + base_2: Base h_bonded: bool def __repr__(self): - if self.base_1 is None: + if self.base_1 == Base.ZERO_PADDING: repr_base_1 = ' ' else: repr_base_1 = self.base_1.name - if self.base_2 is None: + if self.base_2 == Base.ZERO_PADDING: repr_base_2 = ' ' else: repr_base_2 = self.base_2.name @@ -110,13 +110,13 @@ def __hash__(self): return hash((self.base_1, self.base_2, self.h_bonded)) def is_watson_crick(self): - if self.base_1 and self.base_2 and self.base_1 | self.base_2 in WATSON_CRICK_PAIRS: + if self.base_1 | self.base_2 in WATSON_CRICK_PAIRS: return True return False def is_wobble(self): - if self.base_1 and self.base_2 and self.base_1 | self.base_2 in WOBBLE_PAIRS: + if self.base_1 | self.base_2 in WOBBLE_PAIRS: return True return False From df0ea145a14d8ddda6ec33722225906e01ebfe5e Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 24 Apr 2025 15:03:08 +0200 Subject: [PATCH 07/41] Assert that the max stem size is equal to or greater than number of stem bps --- mewtwo/embeddings/terminator/stem.py | 22 ++++-- .../embeddings/terminator/test/test_stem.py | 70 +++++++++++++++---- 2 files changed, 71 insertions(+), 21 deletions(-) diff --git a/mewtwo/embeddings/terminator/stem.py b/mewtwo/embeddings/terminator/stem.py index 0530b6c..cf82506 100644 --- a/mewtwo/embeddings/terminator/stem.py +++ b/mewtwo/embeddings/terminator/stem.py @@ -1,4 +1,4 @@ -from mewtwo.embeddings.bases import BasePair +from mewtwo.embeddings.bases import BasePair, Base, PairingType from mewtwo.embeddings.sequence import RNASequence @@ -23,7 +23,7 @@ def get_basepairs(self): for i, character in enumerate(self.upstream_structure): if character == '(': while reverse_downstream_structure[downstream_index] != ')': - basepairs.append(BasePair(None, reverse_downstream_sequence[downstream_index], False)) + basepairs.append(BasePair(Base.ZERO_PADDING, reverse_downstream_sequence[downstream_index], False)) downstream_index += 1 basepairs.append( BasePair(self.upstream_sequence[i], reverse_downstream_sequence[downstream_index], True)) @@ -35,10 +35,20 @@ def get_basepairs(self): reverse_downstream_sequence[downstream_index], False)) downstream_index += 1 else: - basepairs.append(BasePair(self.upstream_sequence[i], None, False)) + basepairs.append(BasePair(self.upstream_sequence[i], Base.ZERO_PADDING, False)) return basepairs - - def to_vector(self, max_stem_size): - pass + def to_vector(self, max_stem_size: int, one_hot: bool = False, + pairing_type: PairingType = PairingType.STRUCTURE_BASED) -> list[int]: + vector = [] + basepairs = self.get_basepairs() + assert max_stem_size >= len(basepairs) + for i in range(max_stem_size): + try: + basepair = basepairs[i] + except IndexError: + basepair = BasePair(Base.ZERO_PADDING, Base.ZERO_PADDING, False) + vector.extend(basepair.to_vector(one_hot=one_hot, pairing_type=pairing_type)) + + return vector diff --git a/mewtwo/embeddings/terminator/test/test_stem.py b/mewtwo/embeddings/terminator/test/test_stem.py index 488c778..d61f5c4 100644 --- a/mewtwo/embeddings/terminator/test/test_stem.py +++ b/mewtwo/embeddings/terminator/test/test_stem.py @@ -1,6 +1,6 @@ import unittest from mewtwo.embeddings.terminator.stem import Stem -from mewtwo.embeddings.bases import BasePair, Base +from mewtwo.embeddings.bases import BasePair, Base, PairingType from mewtwo.embeddings.sequence import RNASequence @@ -9,37 +9,42 @@ def test_get_basepairs(self): simple_stem = [BasePair(Base.G, Base.C, True), BasePair(Base.A, Base.U, True), BasePair(Base.A, Base.U, True)] upstream_bulge = [BasePair(Base.G, Base.C, True), BasePair(Base.A, Base.U, True), - BasePair(Base.A, None, False), + BasePair(Base.A, Base.ZERO_PADDING, False), BasePair(Base.U, Base.A, True)] downstream_bulge = [BasePair(Base.G, Base.C, True), BasePair(Base.A, Base.U, True), - BasePair(None, Base.U, False), + BasePair(Base.ZERO_PADDING, Base.U, False), BasePair(Base.U, Base.A, True)] mismatch = [BasePair(Base.G, Base.C, True), BasePair(Base.A, Base.U, True), BasePair(Base.U, Base.U, False), BasePair(Base.U, Base.A, True)] mismatched_closing_stack = [BasePair(Base.G, Base.G, False), BasePair(Base.A, Base.U, True), BasePair(Base.A, Base.U, True), BasePair(Base.U, Base.A, True)] double_upstream_bulge = [BasePair(Base.G, Base.C, True), BasePair(Base.A, Base.U, True), - BasePair(Base.A, None, False), - BasePair(Base.A, None, False), BasePair(Base.U, Base.A, True)] + BasePair(Base.A, Base.ZERO_PADDING, False), + BasePair(Base.A, Base.ZERO_PADDING, False), BasePair(Base.U, Base.A, True)] double_downstream_bulge = [BasePair(Base.G, Base.C, True), BasePair(Base.A, Base.U, True), - BasePair(None, Base.U, False), - BasePair(None, Base.U, False), BasePair(Base.U, Base.A, True)] + BasePair(Base.ZERO_PADDING, Base.U, False), + BasePair(Base.ZERO_PADDING, Base.U, False), BasePair(Base.U, Base.A, True)] upstream_mismatch_bulge = [BasePair(Base.G, Base.C, True), BasePair(Base.A, Base.U, True), BasePair(Base.A, Base.C, False), - BasePair(Base.A, None, False), BasePair(Base.U, Base.A, True)] + BasePair(Base.A, Base.ZERO_PADDING, False), BasePair(Base.U, Base.A, True)] downstream_mismatch_bulge = [BasePair(Base.G, Base.C, True), BasePair(Base.A, Base.U, True), BasePair(Base.C, Base.U, False), - BasePair(None, Base.U, False), BasePair(Base.U, Base.A, True)] + BasePair(Base.ZERO_PADDING, Base.U, False), BasePair(Base.U, Base.A, True)] self.assertEqual(simple_stem, Stem(RNASequence('GAA'), '(((', RNASequence('UUC'), ')))').get_basepairs()) self.assertEqual(upstream_bulge, Stem(RNASequence('GAAU'), '((.(', RNASequence('AUC'), ')))').get_basepairs()) self.assertEqual(downstream_bulge, Stem(RNASequence('GAU'), '(((', RNASequence('AUUC'), ').))').get_basepairs()) self.assertEqual(mismatch, Stem(RNASequence('GAUU'), '((.(', RNASequence('AUUC'), ').))').get_basepairs()) - self.assertEqual(mismatched_closing_stack, Stem(RNASequence('GAAU'), '.(((', RNASequence('AUUG'), '))).').get_basepairs()) - self.assertEqual(double_upstream_bulge, Stem(RNASequence('GAAAU'), '((..(', RNASequence('AUC'), ')))').get_basepairs()) - self.assertEqual(double_downstream_bulge, Stem(RNASequence('GAU'), '(((', RNASequence('AUUUC'), ')..))').get_basepairs()) - self.assertEqual(upstream_mismatch_bulge, Stem(RNASequence('GAAAU'), '((..(', RNASequence('ACUC'), ').))').get_basepairs()) - self.assertEqual(downstream_mismatch_bulge, Stem(RNASequence('GACU'), '((.(', RNASequence('AUUUC'), ')..))').get_basepairs()) + self.assertEqual(mismatched_closing_stack, + Stem(RNASequence('GAAU'), '.(((', RNASequence('AUUG'), '))).').get_basepairs()) + self.assertEqual(double_upstream_bulge, + Stem(RNASequence('GAAAU'), '((..(', RNASequence('AUC'), ')))').get_basepairs()) + self.assertEqual(double_downstream_bulge, + Stem(RNASequence('GAU'), '(((', RNASequence('AUUUC'), ')..))').get_basepairs()) + self.assertEqual(upstream_mismatch_bulge, + Stem(RNASequence('GAAAU'), '((..(', RNASequence('ACUC'), ').))').get_basepairs()) + self.assertEqual(downstream_mismatch_bulge, + Stem(RNASequence('GACU'), '((.(', RNASequence('AUUUC'), ')..))').get_basepairs()) with self.assertRaises(AssertionError): Stem(RNASequence('GAAU'), '.((', RNASequence('AUUG'), '))).') @@ -47,7 +52,42 @@ def test_get_basepairs(self): Stem(RNASequence('GAAU'), '.(((', RNASequence('AUUG'), ')).') def test_to_vector(self): - pass + simple_stem = Stem(RNASequence('GAA'), '(((', RNASequence('UUC'), ')))') + upstream_bulge = Stem(RNASequence('GAAU'), '((.(', RNASequence('AUC'), ')))') + gu_mismatch = Stem(RNASequence('GAGU'), '((.(', RNASequence('AUUC'), ').))') + + self.assertEqual(simple_stem.to_vector(3), [2, 3, 1, 3, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1]) + self.assertEqual(simple_stem.to_vector(4), [2, 3, 1, 3, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 0, 0, 0, 0, 0]) + self.assertEqual(simple_stem.to_vector(3, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1]) + self.assertEqual(upstream_bulge.to_vector(4, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, 1]) + self.assertEqual(gu_mismatch.to_vector(4, pairing_type=PairingType.WATSON_CRICK), + [2, 3, 1, 3, 1, + 2, 2, 1, 2, 1, + 2, 3, 1, 2, 0, + 1, 2, 2, 2, 1]) + + self.assertEqual(gu_mismatch.to_vector(4, pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), + [2, 3, 1, 3, 1, + 2, 2, 1, 2, 1, + 2, 3, 1, 2, 1, + 1, 2, 2, 2, 1]) + + self.assertEqual(gu_mismatch.to_vector(5, pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), + [2, 3, 1, 3, 1, + 2, 2, 1, 2, 1, + 2, 3, 1, 2, 1, + 1, 2, 2, 2, 1, + 0, 0, 0, 0, 0]) + + with self.assertRaises(AssertionError): + upstream_bulge.to_vector(3, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, 1] if __name__ == '__main__': From 5fbde282eab4c6941d9e10f59f08f583eb8fb73f Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Fri, 25 Apr 2025 09:15:32 +0200 Subject: [PATCH 08/41] Max stem size can now be smaller than the stem --- mewtwo/embeddings/terminator/stem.py | 2 +- mewtwo/embeddings/terminator/test/test_stem.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/mewtwo/embeddings/terminator/stem.py b/mewtwo/embeddings/terminator/stem.py index cf82506..01a4645 100644 --- a/mewtwo/embeddings/terminator/stem.py +++ b/mewtwo/embeddings/terminator/stem.py @@ -43,7 +43,7 @@ def to_vector(self, max_stem_size: int, one_hot: bool = False, pairing_type: PairingType = PairingType.STRUCTURE_BASED) -> list[int]: vector = [] basepairs = self.get_basepairs() - assert max_stem_size >= len(basepairs) + for i in range(max_stem_size): try: basepair = basepairs[i] diff --git a/mewtwo/embeddings/terminator/test/test_stem.py b/mewtwo/embeddings/terminator/test/test_stem.py index d61f5c4..b08919d 100644 --- a/mewtwo/embeddings/terminator/test/test_stem.py +++ b/mewtwo/embeddings/terminator/test/test_stem.py @@ -83,11 +83,9 @@ def test_to_vector(self): 1, 2, 2, 2, 1, 0, 0, 0, 0, 0]) - with self.assertRaises(AssertionError): - upstream_bulge.to_vector(3, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, - 1, 0, 0, 0, 0, 0, 0, 1, 1, - 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 1, 0, 0, 0, 1] + self.assertEqual(upstream_bulge.to_vector(3, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0]) if __name__ == '__main__': From 45e1dc8eedd15f2a64ccfa9ea07c9d52a9f986f3 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 12:32:30 +0200 Subject: [PATCH 09/41] Add script to obtain feature labels from a feature index --- mewtwo/embeddings/feature_labels.py | 135 ++++++++++++++++++ mewtwo/embeddings/terminator/a_tract.py | 0 mewtwo/embeddings/terminator/terminator.py | 0 mewtwo/embeddings/terminator/u_tract.py | 0 mewtwo/embeddings/test/test_feature_labels.py | 133 +++++++++++++++++ mewtwo/embeddings/test/test_terminator.py | 0 6 files changed, 268 insertions(+) create mode 100644 mewtwo/embeddings/feature_labels.py create mode 100644 mewtwo/embeddings/terminator/a_tract.py create mode 100644 mewtwo/embeddings/terminator/terminator.py create mode 100644 mewtwo/embeddings/terminator/u_tract.py create mode 100644 mewtwo/embeddings/test/test_feature_labels.py create mode 100644 mewtwo/embeddings/test/test_terminator.py diff --git a/mewtwo/embeddings/feature_labels.py b/mewtwo/embeddings/feature_labels.py new file mode 100644 index 0000000..9563358 --- /dev/null +++ b/mewtwo/embeddings/feature_labels.py @@ -0,0 +1,135 @@ +from typing import Optional + +from mewtwo.embeddings.sequence import SeqType +from mewtwo.embeddings.bases import Base + + +class FeatureLabel: + + def __init__(self, feature_position: int, max_a_tract_length: int, + max_stem_length: int, max_loop_length: int, + max_u_tract_length: int, seq_type: SeqType = SeqType.RNA, + one_hot: bool = False) -> None: + + self.seq_type = seq_type + self.one_hot = one_hot + + self.base_index: Optional[int] = None + self.base_identity: Optional[Base] = None + self.base_hydrogen_bond_count: bool = False + + self.check_pot: bool = False + + self.base_pair_index: Optional[int] = None + self.stem_shoulder: Optional[str] = None + self.check_pairing: bool = False + + if self.one_hot: + nr_a_tract_features = 4 * max_a_tract_length + nr_stem_features = 9 * max_stem_length + nr_loop_features = 4 * max_loop_length + + # Assumes separate feature for POT + nr_u_tract_features = 5 * max_u_tract_length + + else: + nr_a_tract_features = 3 * max_a_tract_length + nr_stem_features = 7 * max_stem_length + nr_loop_features = 3 * max_loop_length + + # Assumes separate feature for POT + nr_u_tract_features = 4 * max_u_tract_length + + if feature_position < nr_a_tract_features: + self.feature_type = "A-tract" + self.relative_feature_position = feature_position + self.set_base_information() + elif feature_position < nr_a_tract_features + nr_stem_features: + self.feature_type = "stem" + self.relative_feature_position = feature_position - nr_a_tract_features + self.set_basepair_information() + elif feature_position < nr_a_tract_features + nr_stem_features + nr_loop_features: + self.feature_type = "loop" + self.relative_feature_position = feature_position - nr_a_tract_features - nr_stem_features + self.set_base_information() + elif feature_position < nr_a_tract_features + nr_stem_features + nr_loop_features + nr_u_tract_features: + self.feature_type = "U-tract" + self.relative_feature_position = \ + feature_position - nr_a_tract_features - nr_stem_features - nr_loop_features + self.set_base_information() + else: + raise IndexError(f"Feature position {feature_position} does not exist.") + + def set_base_information(self) -> None: + + if self.one_hot: + if self.seq_type == SeqType.RNA: + features = [Base.A, Base.C, Base.G, Base.U] + elif self.seq_type == SeqType.DNA: + features = [Base.A, Base.C, Base.G, Base.T] + else: + raise ValueError(f"Unsupported sequence type: {self.seq_type}") + + else: + features = [Base.PURINES, Base.PYRIMIDINES, "hydrogen bonds"] + + if self.feature_type == 'U-tract': + features.append('POT') + + feature_nr = len(features) + + for i in range(feature_nr): + if self.relative_feature_position % feature_nr == i: + feature = features[i] + if feature == 'POT': + self.check_pot = True + elif feature == 'hydrogen bonds': + self.base_hydrogen_bond_count = True + else: + self.base_identity = feature + + self.base_index = self.relative_feature_position // feature_nr + 1 + break + + else: + raise ValueError(f"Could not find feature type for feature at {self.relative_feature_position}") + + def set_basepair_information(self) -> None: + assert self.feature_type == "stem" + if self.one_hot: + feature_nr = 9 + if self.seq_type == SeqType.RNA: + features = [Base.A, Base.C, Base.G, Base.U, Base.A, Base.C, Base.G, Base.U, 'bonded'] + elif self.seq_type == SeqType.DNA: + features = [Base.A, Base.C, Base.G, Base.T, Base.A, Base.C, Base.G, Base.T, 'bonded'] + else: + raise ValueError(f"Unsupported sequence type: {self.seq_type}") + + else: + feature_nr = 7 + features = [Base.PURINES, Base.PYRIMIDINES, 'hydrogen bonds', + Base.PURINES, Base.PYRIMIDINES, 'hydrogen bonds', 'bonded'] + + for i in range(feature_nr): + if self.relative_feature_position % feature_nr == i: + feature = features[i] + self.base_pair_index = self.relative_feature_position // feature_nr + 1 + + if feature != 'bonded': + if i < len(features) // 2: + self.stem_shoulder = 'upstream' + + else: + self.stem_shoulder = 'downstream' + + if feature != 'hydrogen bonds': + self.base_identity = feature + else: + self.base_hydrogen_bond_count = True + + else: + self.check_pairing = True + break + else: + + raise ValueError(f"Could not find feature type for feature at {self.relative_feature_position}") diff --git a/mewtwo/embeddings/terminator/a_tract.py b/mewtwo/embeddings/terminator/a_tract.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/embeddings/terminator/terminator.py b/mewtwo/embeddings/terminator/terminator.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/embeddings/terminator/u_tract.py b/mewtwo/embeddings/terminator/u_tract.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/embeddings/test/test_feature_labels.py b/mewtwo/embeddings/test/test_feature_labels.py new file mode 100644 index 0000000..476aa2d --- /dev/null +++ b/mewtwo/embeddings/test/test_feature_labels.py @@ -0,0 +1,133 @@ +import unittest +from mewtwo.embeddings.feature_labels import FeatureLabel +from mewtwo.embeddings.bases import Base + + +class TestFeatureLabel(unittest.TestCase): + + def test_set_base_information(self): + + # A-tract features + + feature_a01 = FeatureLabel(0, 10, 1, 1, 1) + feature_a02 = FeatureLabel(1, 10, 1, 1, 1) + feature_a03 = FeatureLabel(2, 10, 1, 1, 1) + feature_a04 = FeatureLabel(3, 10, 1, 1, 1) + feature_a05 = FeatureLabel(4, 10, 1, 1, 1) + feature_a06 = FeatureLabel(5, 10, 1, 1, 1) + + feature_a07 = FeatureLabel(0, 10, 1, 1, 1, one_hot=True) + feature_a08 = FeatureLabel(1, 10, 1, 1, 1, one_hot=True) + feature_a09 = FeatureLabel(2, 10, 1, 1, 1, one_hot=True) + feature_a10 = FeatureLabel(3, 10, 1, 1, 1, one_hot=True) + feature_a11 = FeatureLabel(4, 10, 1, 1, 1, one_hot=True) + feature_a12 = FeatureLabel(5, 10, 1, 1, 1, one_hot=True) + feature_a13 = FeatureLabel(6, 10, 1, 1, 1, one_hot=True) + feature_a14 = FeatureLabel(7, 10, 1, 1, 1, one_hot=True) + + self.assertEqual(feature_a01.base_identity, Base.PURINES) + self.assertEqual(feature_a01.base_index, 1) + self.assertEqual(feature_a01.base_pair_index, None) + self.assertEqual(feature_a01.base_hydrogen_bond_count, False) + self.assertEqual(feature_a01.feature_type, 'A-tract') + + self.assertEqual(feature_a02.base_identity, Base.PYRIMIDINES) + self.assertEqual(feature_a02.base_index, 1) + self.assertEqual(feature_a02.base_pair_index, None) + self.assertEqual(feature_a02.base_hydrogen_bond_count, False) + + self.assertEqual(feature_a03.base_identity, None) + self.assertEqual(feature_a03.base_index, 1) + self.assertEqual(feature_a03.base_pair_index, None) + self.assertEqual(feature_a03.base_hydrogen_bond_count, True) + + self.assertEqual(feature_a04.base_identity, Base.PURINES) + self.assertEqual(feature_a04.base_index, 2) + self.assertEqual(feature_a04.base_pair_index, None) + self.assertEqual(feature_a04.base_hydrogen_bond_count, False) + + self.assertEqual(feature_a05.base_identity, Base.PYRIMIDINES) + self.assertEqual(feature_a05.base_index, 2) + self.assertEqual(feature_a05.base_pair_index, None) + self.assertEqual(feature_a05.base_hydrogen_bond_count, False) + + self.assertEqual(feature_a06.base_identity, None) + self.assertEqual(feature_a06.base_index, 2) + self.assertEqual(feature_a06.base_pair_index, None) + self.assertEqual(feature_a06.base_hydrogen_bond_count, True) + + self.assertEqual(feature_a07.base_identity, Base.A) + self.assertEqual(feature_a07.base_index, 1) + self.assertEqual(feature_a07.base_pair_index, None) + self.assertEqual(feature_a07.base_hydrogen_bond_count, False) + + self.assertEqual(feature_a08.base_identity, Base.C) + self.assertEqual(feature_a09.base_identity, Base.G) + self.assertEqual(feature_a10.base_identity, Base.U) + self.assertEqual(feature_a11.base_identity, Base.A) + self.assertEqual(feature_a12.base_identity, Base.C) + self.assertEqual(feature_a13.base_identity, Base.G) + self.assertEqual(feature_a14.base_identity, Base.U) + + # Loop features + + def test_set_basepair_information(self): + # Stem features + + feature_s01 = FeatureLabel(30, 10, 10, 1, 1) + feature_s02 = FeatureLabel(31, 10, 10, 1, 1) + feature_s03 = FeatureLabel(32, 10, 10, 1, 1) + feature_s04 = FeatureLabel(33, 10, 10, 1, 1) + feature_s05 = FeatureLabel(34, 10, 10, 1, 1) + feature_s06 = FeatureLabel(35, 10, 10, 1, 1) + feature_s07 = FeatureLabel(36, 10, 10, 1, 1) + feature_s08 = FeatureLabel(37, 10, 10, 1, 1) + + feature_s09 = FeatureLabel(40, 10, 10, 1, 1, one_hot=True) + feature_s10 = FeatureLabel(41, 10, 10, 1, 1, one_hot=True) + feature_s11 = FeatureLabel(42, 10, 10, 1, 1, one_hot=True) + feature_s12 = FeatureLabel(43, 10, 10, 1, 1, one_hot=True) + feature_s13 = FeatureLabel(44, 10, 10, 1, 1, one_hot=True) + feature_s14 = FeatureLabel(45, 10, 10, 1, 1, one_hot=True) + feature_s15 = FeatureLabel(46, 10, 10, 1, 1, one_hot=True) + feature_s16 = FeatureLabel(47, 10, 10, 1, 1, one_hot=True) + feature_s17 = FeatureLabel(48, 10, 10, 1, 1, one_hot=True) + + self.assertEqual(feature_s01.base_identity, Base.PURINES) + self.assertEqual(feature_s01.base_index, None) + self.assertEqual(feature_s01.base_pair_index, 1) + self.assertEqual(feature_s01.stem_shoulder, 'upstream') + self.assertEqual(feature_s01.base_hydrogen_bond_count, False) + self.assertEqual(feature_s01.feature_type, 'stem') + + self.assertEqual(feature_s02.base_identity, Base.PYRIMIDINES) + self.assertEqual(feature_s03.base_identity, None) + self.assertEqual(feature_s03.base_hydrogen_bond_count, True) + + self.assertEqual(feature_s04.base_identity, Base.PURINES) + self.assertEqual(feature_s04.stem_shoulder, "downstream") + + self.assertEqual(feature_s05.base_identity, Base.PYRIMIDINES) + self.assertEqual(feature_s06.base_hydrogen_bond_count, True) + + self.assertEqual(feature_s07.base_pair_index, 1) + self.assertEqual(feature_s07.stem_shoulder, None) + self.assertEqual(feature_s07.check_pairing, True) + + self.assertEqual(feature_s08.base_pair_index, 2) + self.assertEqual(feature_s08.base_identity, Base.PURINES) + + self.assertEqual(feature_s09.base_identity, Base.A) + self.assertEqual(feature_s10.base_identity, Base.C) + self.assertEqual(feature_s11.base_identity, Base.G) + self.assertEqual(feature_s12.base_identity, Base.U) + self.assertEqual(feature_s13.base_identity, Base.A) + self.assertEqual(feature_s14.base_identity, Base.C) + self.assertEqual(feature_s15.base_identity, Base.G) + self.assertEqual(feature_s16.base_identity, Base.U) + self.assertEqual(feature_s17.base_identity, None) + self.assertEqual(feature_s17.check_pairing, True) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/mewtwo/embeddings/test/test_terminator.py b/mewtwo/embeddings/test/test_terminator.py new file mode 100644 index 0000000..e69de29 From 9dc77f83b8ba9e1635384fe7bcd26bc7e205fa77 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 12:33:28 +0200 Subject: [PATCH 10/41] Separate embedding of purines and pyrimidines into two features --- mewtwo/embeddings/bases.py | 19 ++++++++++++------- mewtwo/embeddings/test/test_bases.py | 28 ++++++++++++++-------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/mewtwo/embeddings/bases.py b/mewtwo/embeddings/bases.py index 283c3b6..da82a4f 100644 --- a/mewtwo/embeddings/bases.py +++ b/mewtwo/embeddings/bases.py @@ -42,25 +42,29 @@ def base_to_vector(base: Base, one_hot: bool = False) -> list[int]: return BASE_TO_ONEHOT[base][:] else: - if base in Base.PYRIMIDINES: + if base in Base.PURINES: element_1 = 1 - elif base in Base.PURINES: - element_1 = 2 + element_2 = 0 + elif base in Base.PYRIMIDINES: + element_1 = 0 + element_2 = 1 + elif base == Base.ZERO_PADDING: element_1 = 0 + element_2 = 0 else: raise ValueError(f"Unknown base: {base}") if base in Base.TWO_H_BONDS: - element_2 = 2 + element_3 = 2 elif base in Base.THREE_H_BONDS: - element_2 = 3 + element_3 = 3 elif base == Base.ZERO_PADDING: - element_2 = 0 + element_3 = 0 else: raise ValueError(f"Unknown base: {base}") - return [element_1, element_2] + return [element_1, element_2, element_3] WATSON_CRICK_PAIRS = [Base.A | Base.T, @@ -138,6 +142,7 @@ def to_vector(self, one_hot: bool = False, vector.append(1) else: vector.append(0) + elif pairing_type == PairingType.WATSON_CRICK: if self.is_watson_crick(): vector.append(1) diff --git a/mewtwo/embeddings/test/test_bases.py b/mewtwo/embeddings/test/test_bases.py index e4c8fa3..6976881 100644 --- a/mewtwo/embeddings/test/test_bases.py +++ b/mewtwo/embeddings/test/test_bases.py @@ -5,12 +5,12 @@ class TestBase(unittest.TestCase): def test_to_vector(self): - self.assertEqual(base_to_vector(Base.A), [2, 2]) - self.assertEqual(base_to_vector(Base.U), [1, 2]) - self.assertEqual(base_to_vector(Base.G), [2, 3]) - self.assertEqual(base_to_vector(Base.C), [1, 3]) - self.assertEqual(base_to_vector(Base.T), [1, 2]) - self.assertEqual(base_to_vector(Base.ZERO_PADDING), [0, 0]) + self.assertEqual(base_to_vector(Base.A), [1, 0, 2]) + self.assertEqual(base_to_vector(Base.U), [0, 1, 2]) + self.assertEqual(base_to_vector(Base.G), [1, 0, 3]) + self.assertEqual(base_to_vector(Base.C), [0, 1, 3]) + self.assertEqual(base_to_vector(Base.T), [0, 1, 2]) + self.assertEqual(base_to_vector(Base.ZERO_PADDING), [0, 0, 0]) self.assertEqual(base_to_vector(Base.A, one_hot=True), [1, 0, 0, 0]) self.assertEqual(base_to_vector(Base.U, one_hot=True), [0, 0, 0, 1]) @@ -19,7 +19,7 @@ def test_to_vector(self): self.assertEqual(base_to_vector(Base.T, one_hot=True), [0, 0, 0, 1]) self.assertEqual(base_to_vector(Base.ZERO_PADDING, one_hot=True), [0, 0, 0, 0]) - self.assertNotEqual(base_to_vector(Base.A, one_hot=True), [2, 2]) + self.assertNotEqual(base_to_vector(Base.A, one_hot=True), [1, 0, 2]) with self.assertRaises(ValueError): base_to_vector(Base.DNA) @@ -47,20 +47,20 @@ def test_to_vector(self): base_pair_2 = BasePair(Base.A, Base.U, True) base_pair_3 = BasePair(Base.ZERO_PADDING, Base.ZERO_PADDING, True) - self.assertEqual(base_pair_1.to_vector(), [2, 3, 1, 2, 0]) + self.assertEqual(base_pair_1.to_vector(), [1, 0, 3, 0, 1, 2, 0]) self.assertEqual(base_pair_1.to_vector(one_hot=True), [0, 0, 1, 0, 0, 0, 0, 1, 0]) - self.assertEqual(base_pair_1.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [2, 3, 1, 2, 1]) - self.assertEqual(base_pair_1.to_vector(pairing_type=PairingType.WATSON_CRICK), [2, 3, 1, 2, 0]) + self.assertEqual(base_pair_1.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [1, 0, 3, 0, 1, 2, 1]) + self.assertEqual(base_pair_1.to_vector(pairing_type=PairingType.WATSON_CRICK), [1, 0, 3, 0, 1, 2, 0]) with self.assertRaises(ValueError): base_pair_1.to_vector(pairing_type=PairingType.WOBBLE) - self.assertEqual(base_pair_2.to_vector(), [2, 2, 1, 2, 1]) + self.assertEqual(base_pair_2.to_vector(), [1, 0, 2, 0, 1, 2, 1]) self.assertEqual(base_pair_2.to_vector(one_hot=True), [1, 0, 0, 0, 0, 0, 0, 1, 1]) - self.assertEqual(base_pair_2.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [2, 2, 1, 2, 1]) - self.assertEqual(base_pair_2.to_vector(pairing_type=PairingType.WATSON_CRICK), [2, 2, 1, 2, 1]) + self.assertEqual(base_pair_2.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [1, 0, 2, 0, 1, 2, 1]) + self.assertEqual(base_pair_2.to_vector(pairing_type=PairingType.WATSON_CRICK), [1, 0, 2, 0, 1, 2, 1]) - self.assertEqual(base_pair_3.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [0, 0, 0, 0, 0]) + self.assertEqual(base_pair_3.to_vector(pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), [0, 0, 0, 0, 0, 0, 0]) if __name__ == '__main__': From 167ead44b3f485642b2ebdf7f1b0b2eb0ac1c00f Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 12:35:05 +0200 Subject: [PATCH 11/41] Backup --- mewtwo/embeddings/sequence.py | 10 ++++- mewtwo/embeddings/terminator/a_tract.py | 6 +++ mewtwo/embeddings/terminator/terminator.py | 16 ++++++++ .../embeddings/terminator/test/test_stem.py | 39 +++++++++++-------- mewtwo/embeddings/terminator/u_tract.py | 7 ++++ mewtwo/parsers/parse_termite_output.py | 4 +- 6 files changed, 63 insertions(+), 19 deletions(-) diff --git a/mewtwo/embeddings/sequence.py b/mewtwo/embeddings/sequence.py index bd433ef..69f174c 100644 --- a/mewtwo/embeddings/sequence.py +++ b/mewtwo/embeddings/sequence.py @@ -1,6 +1,6 @@ from enum import Flag from typing import Union -from mewtwo.embeddings.bases import Base +from mewtwo.embeddings.bases import Base, base_to_vector class SeqType(Flag): @@ -52,6 +52,14 @@ def _check_sequence(self): except KeyError: raise ValueError(f"RNA sequence must be comprised of bases A, C, G, and U. Found {character} in {self.sequence}") + def to_vector(self, one_hot: bool = False) -> list[int]: + vector = [] + for character in self.sequence: + base = Base[character] + vector.extend(base_to_vector(base, one_hot=one_hot)) + + return vector + class DNASequence(Sequence): def __init__(self, sequence): diff --git a/mewtwo/embeddings/terminator/a_tract.py b/mewtwo/embeddings/terminator/a_tract.py index e69de29..3571081 100644 --- a/mewtwo/embeddings/terminator/a_tract.py +++ b/mewtwo/embeddings/terminator/a_tract.py @@ -0,0 +1,6 @@ +from mewtwo.embeddings.sequence import RNASequence + + +class ATract: + def __init__(self, sequence: RNASequence) -> None: + self.sequence = sequence diff --git a/mewtwo/embeddings/terminator/terminator.py b/mewtwo/embeddings/terminator/terminator.py index e69de29..a9cbcb4 100644 --- a/mewtwo/embeddings/terminator/terminator.py +++ b/mewtwo/embeddings/terminator/terminator.py @@ -0,0 +1,16 @@ +from typing import Union + +from mewtwo.embeddings.terminator.hairpin import RNAFoldHairpin, TransTermHPHairpin +from mewtwo.embeddings.terminator.a_tract import ATract +from mewtwo.embeddings.terminator.u_tract import UTract + + +class Terminator: + def __init__(self, hairpin: Union[RNAFoldHairpin, TransTermHPHairpin], a_tract: ATract, u_tract: UTract): + self.hairpin = hairpin + self.a_tract = a_tract + self.u_tract = u_tract + + def to_vector(self, one_hot: bool = False) -> list[int]: + vector = [] + vector.extend(self.a_tract.sequence.to_vector(one_hot)) diff --git a/mewtwo/embeddings/terminator/test/test_stem.py b/mewtwo/embeddings/terminator/test/test_stem.py index b08919d..31d98bd 100644 --- a/mewtwo/embeddings/terminator/test/test_stem.py +++ b/mewtwo/embeddings/terminator/test/test_stem.py @@ -56,32 +56,39 @@ def test_to_vector(self): upstream_bulge = Stem(RNASequence('GAAU'), '((.(', RNASequence('AUC'), ')))') gu_mismatch = Stem(RNASequence('GAGU'), '((.(', RNASequence('AUUC'), ').))') - self.assertEqual(simple_stem.to_vector(3), [2, 3, 1, 3, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1]) - self.assertEqual(simple_stem.to_vector(4), [2, 3, 1, 3, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 0, 0, 0, 0, 0]) - self.assertEqual(simple_stem.to_vector(3, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, + self.assertEqual(simple_stem.to_vector(3), [1, 0, 3, 0, 1, 3, 1, + 1, 0, 2, 0, 1, 2, 1, + 1, 0, 2, 0, 1, 2, 1]) + self.assertEqual(simple_stem.to_vector(4), [1, 0, 3, 0, 1, 3, 1, + 1, 0, 2, 0, 1, 2, 1, + 1, 0, 2, 0, 1, 2, 1, + 0, 0, 0, 0, 0, 0, 0]) + self.assertEqual(simple_stem.to_vector(3, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1]) + self.assertEqual(upstream_bulge.to_vector(4, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1]) self.assertEqual(gu_mismatch.to_vector(4, pairing_type=PairingType.WATSON_CRICK), - [2, 3, 1, 3, 1, - 2, 2, 1, 2, 1, - 2, 3, 1, 2, 0, - 1, 2, 2, 2, 1]) + [1, 0, 3, 0, 1, 3, 1, + 1, 0, 2, 0, 1, 2, 1, + 1, 0, 3, 0, 1, 2, 0, + 0, 1, 2, 1, 0, 2, 1]) self.assertEqual(gu_mismatch.to_vector(4, pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), - [2, 3, 1, 3, 1, - 2, 2, 1, 2, 1, - 2, 3, 1, 2, 1, - 1, 2, 2, 2, 1]) + [1, 0, 3, 0, 1, 3, 1, + 1, 0, 2, 0, 1, 2, 1, + 1, 0, 3, 0, 1, 2, 1, + 0, 1, 2, 1, 0, 2, 1]) self.assertEqual(gu_mismatch.to_vector(5, pairing_type=PairingType.WOBBLE_OR_WATSON_CRICK), - [2, 3, 1, 3, 1, - 2, 2, 1, 2, 1, - 2, 3, 1, 2, 1, - 1, 2, 2, 2, 1, - 0, 0, 0, 0, 0]) + [1, 0, 3, 0, 1, 3, 1, + 1, 0, 2, 0, 1, 2, 1, + 1, 0, 3, 0, 1, 2, 1, + 0, 1, 2, 1, 0, 2, 1, + 0, 0, 0, 0, 0, 0, 0]) self.assertEqual(upstream_bulge.to_vector(3, one_hot=True), [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, diff --git a/mewtwo/embeddings/terminator/u_tract.py b/mewtwo/embeddings/terminator/u_tract.py index e69de29..e7dcd37 100644 --- a/mewtwo/embeddings/terminator/u_tract.py +++ b/mewtwo/embeddings/terminator/u_tract.py @@ -0,0 +1,7 @@ +from mewtwo.embeddings.sequence import RNASequence + + +class UTract: + def __init__(self, sequence: RNASequence, pot: int) -> None: + self.sequence = sequence + self.pot = pot diff --git a/mewtwo/parsers/parse_termite_output.py b/mewtwo/parsers/parse_termite_output.py index cd86231..ad824a6 100644 --- a/mewtwo/parsers/parse_termite_output.py +++ b/mewtwo/parsers/parse_termite_output.py @@ -1,6 +1,6 @@ from mewtwo.parsers.tabular import Tabular from mewtwo.embeddings.terminator.hairpin import RNAFoldHairpin, TransTermHPHairpin -from mewtwo.embeddings.bases import BasePair +from mewtwo.embeddings.bases import BasePair, Base from sys import argv from pprint import pprint @@ -79,7 +79,7 @@ def rnafold_hairpins_from_termite(input_file, get_rnafold: bool = True, get_tran if basepair not in basepairs_to_counts_transtermhp: basepairs_to_counts_transtermhp[basepair] = 0 basepairs_to_counts_transtermhp[basepair] += 1 - if basepair == BasePair('A', 'A', True): + if basepair == BasePair(Base.A, Base.A, True): print(hairpin.hairpin_sequence) print(hairpin.hairpin_structure) From cb57b88ec2b867f64c33cf5ebf5bf5e72b61cc5f Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 13:21:37 +0200 Subject: [PATCH 12/41] Add __next__ and __iter__ methods to class Sequence (untested) --- mewtwo/embeddings/sequence.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mewtwo/embeddings/sequence.py b/mewtwo/embeddings/sequence.py index 69f174c..23be611 100644 --- a/mewtwo/embeddings/sequence.py +++ b/mewtwo/embeddings/sequence.py @@ -13,6 +13,8 @@ def __init__(self, sequence: str, seq_type: SeqType = SeqType['DNA']): self.sequence = sequence.upper() self.seq_type = seq_type self._check_sequence() + self.seq_length = len(sequence) + self.__current_index = 0 def __eq__(self, other): if self.sequence == other.sequence and type(self) == type(other): @@ -26,6 +28,17 @@ def __hash__(self): def __repr__(self): return self.sequence + def __iter__(self): + return type(self)(self.sequence) + + def __next__(self) -> Base: + if self.__current_index < self.seq_length: + base = Base[self.sequence[self.__current_index]] + self.__current_index += 1 + return base + else: + raise StopIteration + def __getitem__(self, index: Union[slice, int]) -> Union['Sequence', 'RNASequence', 'DNASequence', Base]: if isinstance(index, int): return Base[self.sequence[index]] From ddd81055b4c0668c982d324c203a6c90e53ad744 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 13:22:26 +0200 Subject: [PATCH 13/41] Add vector method and tests for class Loop --- mewtwo/embeddings/terminator/loop.py | 45 +++++++++++++++++ .../embeddings/terminator/test/test_loop.py | 48 ++++++++++++++++++- 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/mewtwo/embeddings/terminator/loop.py b/mewtwo/embeddings/terminator/loop.py index 2d466f0..00be7ed 100644 --- a/mewtwo/embeddings/terminator/loop.py +++ b/mewtwo/embeddings/terminator/loop.py @@ -1,7 +1,52 @@ from mewtwo.embeddings.sequence import RNASequence +from mewtwo.embeddings.bases import Base, base_to_vector class Loop: def __init__(self, loop_sequence: RNASequence, loop_structure: str) -> None: self.sequence = loop_sequence self.structure = loop_structure + + def to_vector(self, max_loop_size, one_hot=False): + assert len(self.sequence) <= max_loop_size + if len(self.sequence) % 2 == 0: + center_base = Base.ZERO_PADDING + else: + center_base = self.sequence[len(self.sequence) // 2] + center_vector = base_to_vector(center_base, one_hot=one_hot) + padding = (max_loop_size - len(self.sequence)) // 2 + left_padding = [] + right_padding = [] + for i in range(padding): + left_padding.extend(base_to_vector(Base.ZERO_PADDING, one_hot=one_hot)) + right_padding.extend(base_to_vector(Base.ZERO_PADDING, one_hot=one_hot)) + + left_vector = [] + right_vector = [] + for i, base in enumerate(self.sequence): + if i < len(self.sequence) // 2: + + left_vector.extend(base_to_vector(base, one_hot=one_hot)) + elif i >= len(self.sequence) / 2: + right_vector.extend(base_to_vector(base, one_hot=one_hot)) + + vector = left_padding + left_vector + center_vector + right_vector + right_padding + print(left_padding) + print(left_vector) + print(center_vector) + print(right_vector) + print(right_padding) + return vector + + +def get_max_loop_size(loops: list[Loop]) -> int: + max_loop_size = 0 + for loop in loops: + stem_size = len(loop.sequence) + if stem_size > max_loop_size: + max_loop_size += 1 + + if max_loop_size % 2 == 0: + max_loop_size += 1 + + return max_loop_size diff --git a/mewtwo/embeddings/terminator/test/test_loop.py b/mewtwo/embeddings/terminator/test/test_loop.py index fdf3312..47ad35c 100644 --- a/mewtwo/embeddings/terminator/test/test_loop.py +++ b/mewtwo/embeddings/terminator/test/test_loop.py @@ -1,6 +1,52 @@ import unittest +from mewtwo.embeddings.sequence import RNASequence +from mewtwo.embeddings.terminator.loop import Loop + + +class TestLoop(unittest.TestCase): + def test_to_vector(self): + loop_01 = Loop(RNASequence("AGCGU"), '.....') + + self.assertEqual(loop_01.to_vector(max_loop_size=5), [1, 0, 2, + 1, 0, 3, + 0, 1, 3, + 1, 0, 3, + 0, 1, 2]) + + self.assertEqual(loop_01.to_vector(max_loop_size=9), [0, 0, 0, + 0, 0, 0, + 1, 0, 2, + 1, 0, 3, + 0, 1, 3, + 1, 0, 3, + 0, 1, 2, + 0, 0, 0, + 0, 0, 0]) + + loop_02 = Loop(RNASequence("AGCCGU"), '......') + + with self.assertRaises(AssertionError): + loop_02.to_vector(max_loop_size=5) + + self.assertEqual(loop_02.to_vector(max_loop_size=7), [1, 0, 2, + 1, 0, 3, + 0, 1, 3, + 0, 0, 0, + 0, 1, 3, + 1, 0, 3, + 0, 1, 2]) + + self.assertEqual(loop_02.to_vector(max_loop_size=9), [0, 0, 0, + 1, 0, 2, + 1, 0, 3, + 0, 1, 3, + 0, 0, 0, + 0, 1, 3, + 1, 0, 3, + 0, 1, 2, + 0, 0, 0]) + if __name__ == '__main__': unittest.main() - From 6c2a34807111a9c76f42fa9c224a8402130e6344 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 13:24:06 +0200 Subject: [PATCH 14/41] Add check and test for odd max loop size --- mewtwo/embeddings/terminator/loop.py | 2 ++ mewtwo/embeddings/terminator/test/test_loop.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/mewtwo/embeddings/terminator/loop.py b/mewtwo/embeddings/terminator/loop.py index 00be7ed..b5498d6 100644 --- a/mewtwo/embeddings/terminator/loop.py +++ b/mewtwo/embeddings/terminator/loop.py @@ -9,6 +9,8 @@ def __init__(self, loop_sequence: RNASequence, loop_structure: str) -> None: def to_vector(self, max_loop_size, one_hot=False): assert len(self.sequence) <= max_loop_size + assert max_loop_size % 2 == 1 + if len(self.sequence) % 2 == 0: center_base = Base.ZERO_PADDING else: diff --git a/mewtwo/embeddings/terminator/test/test_loop.py b/mewtwo/embeddings/terminator/test/test_loop.py index 47ad35c..d333534 100644 --- a/mewtwo/embeddings/terminator/test/test_loop.py +++ b/mewtwo/embeddings/terminator/test/test_loop.py @@ -29,6 +29,9 @@ def test_to_vector(self): with self.assertRaises(AssertionError): loop_02.to_vector(max_loop_size=5) + with self.assertRaises(AssertionError): + loop_02.to_vector(max_loop_size=6) + self.assertEqual(loop_02.to_vector(max_loop_size=7), [1, 0, 2, 1, 0, 3, 0, 1, 3, From 3d5e59f6ef857cc610b7466ce70f0d49d73659a0 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 13:25:39 +0200 Subject: [PATCH 15/41] Change get_hairpin_parts method to set_hairpin_parts method --- mewtwo/embeddings/terminator/hairpin.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mewtwo/embeddings/terminator/hairpin.py b/mewtwo/embeddings/terminator/hairpin.py index c7138d1..eebe63d 100644 --- a/mewtwo/embeddings/terminator/hairpin.py +++ b/mewtwo/embeddings/terminator/hairpin.py @@ -15,6 +15,8 @@ def __init__(self, hairpin_id: str, distance_to_pot: int, self.prediction_software = prediction_software self.hairpin_sequence = None self.hairpin_structure = None + self.loop = None + self.stem = None def __eq__(self, other): if type(self) != other(type): @@ -36,7 +38,7 @@ def contains_multiple_hairpins(self): return False - def get_hairpin_parts(self): + def set_hairpin_parts(self): assert not self.contains_multiple_hairpins() last_left_shoulder = 0 @@ -55,10 +57,8 @@ def get_hairpin_parts(self): right_shoulder_structure = self.hairpin_structure[first_right_shoulder:] right_shoulder_sequence = self.hairpin_sequence[first_right_shoulder:] - loop = Loop(loop_sequence, loop_structure) - stem = Stem(left_shoulder_sequence, left_shoulder_structure, right_shoulder_sequence, right_shoulder_structure) - - return loop, stem + self.loop = Loop(loop_sequence, loop_structure) + self.stem = Stem(left_shoulder_sequence, left_shoulder_structure, right_shoulder_sequence, right_shoulder_structure) def to_vector(self, max_stem_size, max_loop_size): assert self.hairpin_sequence is not None and self.hairpin_structure is not None @@ -76,6 +76,8 @@ def __init__(self, hairpin_id, distance_to_pot, free_energy, hairpin_sequence, h self.hairpin_sequence = sequence self.hairpin_structure = hairpin_structure self.free_energy = free_energy + if not self.contains_multiple_hairpins(): + self.set_hairpin_parts() class TransTermHPHairpin(Hairpin): @@ -85,6 +87,8 @@ def __init__(self, hairpin_id, distance_to_pot, hairpin_score, hairpin): self.set_hairpin_sequence(hairpin) self.set_hairpin_structure(hairpin) self.hairpin_score = hairpin_score + if not self.contains_multiple_hairpins(): + self.set_hairpin_parts() def set_hairpin_sequence(self, hairpin): From 2fe5daf58cf76a4dc75225863c5f615d71c28363 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 13:27:27 +0200 Subject: [PATCH 16/41] Call get_basepairs method in __init__ of class Stem --- mewtwo/embeddings/terminator/stem.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mewtwo/embeddings/terminator/stem.py b/mewtwo/embeddings/terminator/stem.py index 01a4645..e8325df 100644 --- a/mewtwo/embeddings/terminator/stem.py +++ b/mewtwo/embeddings/terminator/stem.py @@ -12,6 +12,7 @@ def __init__(self, upstream_sequence: RNASequence, upstream_structure: str, self.upstream_structure = upstream_structure self.downstream_sequence = downstream_sequence self.downstream_structure = downstream_structure + self.basepairs = self.get_basepairs() def get_basepairs(self): basepairs = [] @@ -42,13 +43,22 @@ def get_basepairs(self): def to_vector(self, max_stem_size: int, one_hot: bool = False, pairing_type: PairingType = PairingType.STRUCTURE_BASED) -> list[int]: vector = [] - basepairs = self.get_basepairs() for i in range(max_stem_size): try: - basepair = basepairs[i] + basepair = self.basepairs[i] except IndexError: basepair = BasePair(Base.ZERO_PADDING, Base.ZERO_PADDING, False) vector.extend(basepair.to_vector(one_hot=one_hot, pairing_type=pairing_type)) return vector + + +def get_max_stem_size(stems: list[Stem]) -> int: + max_stem_size = 0 + for stem in stems: + stem_size = len(stem.basepairs) + if stem_size > max_stem_size: + max_stem_size += 1 + + return max_stem_size From 27d676e620821e450af6b0b2ca7b63d06874a820 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 13:28:57 +0200 Subject: [PATCH 17/41] Code cleanup --- mewtwo/embeddings/test/test_feature_labels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mewtwo/embeddings/test/test_feature_labels.py b/mewtwo/embeddings/test/test_feature_labels.py index 476aa2d..5dca8e0 100644 --- a/mewtwo/embeddings/test/test_feature_labels.py +++ b/mewtwo/embeddings/test/test_feature_labels.py @@ -130,4 +130,4 @@ def test_set_basepair_information(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() From 698207781668bed766f25a0f06385a6f5b7d54db Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 14:51:15 +0200 Subject: [PATCH 18/41] Add hairpin type enum --- mewtwo/embeddings/terminator/hairpin.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mewtwo/embeddings/terminator/hairpin.py b/mewtwo/embeddings/terminator/hairpin.py index eebe63d..64772cf 100644 --- a/mewtwo/embeddings/terminator/hairpin.py +++ b/mewtwo/embeddings/terminator/hairpin.py @@ -1,13 +1,19 @@ +from enum import Enum + from mewtwo.embeddings.terminator.loop import Loop from mewtwo.embeddings.terminator.stem import Stem from mewtwo.embeddings.bases import BasePair, Base from mewtwo.embeddings.sequence import DNASequence, RNASequence, get_sequence_type, convert_to_rna, SeqType +class HairpinType(Enum): + RNAFOLD = 1 + TRANSTERMHP = 2 + + class Hairpin: def __init__(self, hairpin_id: str, distance_to_pot: int, - prediction_software: str): - assert prediction_software in ["RNAFold", "TransTermHP"] + prediction_software: HairpinType): self.hairpin_id = hairpin_id self.distance_to_pot = distance_to_pot @@ -67,7 +73,7 @@ def to_vector(self, max_stem_size, max_loop_size): class RNAFoldHairpin(Hairpin): def __init__(self, hairpin_id, distance_to_pot, free_energy, hairpin_sequence, hairpin_structure): - super().__init__(hairpin_id, distance_to_pot, "RNAFold") + super().__init__(hairpin_id, distance_to_pot, HairpinType.RNAFOLD) seq_type = get_sequence_type(hairpin_sequence) if SeqType.RNA in seq_type: sequence = RNASequence(hairpin_sequence) @@ -83,7 +89,7 @@ def __init__(self, hairpin_id, distance_to_pot, free_energy, hairpin_sequence, h class TransTermHPHairpin(Hairpin): def __init__(self, hairpin_id, distance_to_pot, hairpin_score, hairpin): - super().__init__(hairpin_id, distance_to_pot, "TransTermHP") + super().__init__(hairpin_id, distance_to_pot, HairpinType.TRANSTERMHP) self.set_hairpin_sequence(hairpin) self.set_hairpin_structure(hairpin) self.hairpin_score = hairpin_score From 5256f529b0b2670720ef7fe47b662731a9349cb7 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 15:37:11 +0200 Subject: [PATCH 19/41] Bugfix: a tract vector now uses a_tract_size argument for indexing sequence --- mewtwo/embeddings/terminator/a_tract.py | 11 +++++++++++ .../embeddings/terminator/test/test_a_tract.py | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 mewtwo/embeddings/terminator/test/test_a_tract.py diff --git a/mewtwo/embeddings/terminator/a_tract.py b/mewtwo/embeddings/terminator/a_tract.py index 3571081..fbfbf7a 100644 --- a/mewtwo/embeddings/terminator/a_tract.py +++ b/mewtwo/embeddings/terminator/a_tract.py @@ -1,6 +1,17 @@ from mewtwo.embeddings.sequence import RNASequence +from mewtwo.embeddings.bases import Base, base_to_vector class ATract: def __init__(self, sequence: RNASequence) -> None: self.sequence = sequence + + def to_vector(self, a_tract_size: int = 10, one_hot: bool = False) -> list[int]: + vector = [] + if len(self.sequence) < a_tract_size: + for i in range(a_tract_size - len(self.sequence)): + vector.extend(base_to_vector(Base.ZERO_PADDING, one_hot)) + for base in self.sequence[-a_tract_size:]: + vector.extend(base_to_vector(base, one_hot)) + + return vector diff --git a/mewtwo/embeddings/terminator/test/test_a_tract.py b/mewtwo/embeddings/terminator/test/test_a_tract.py new file mode 100644 index 0000000..e849c75 --- /dev/null +++ b/mewtwo/embeddings/terminator/test/test_a_tract.py @@ -0,0 +1,17 @@ +import unittest + +from mewtwo.embeddings.terminator.a_tract import ATract +from mewtwo.embeddings.sequence import RNASequence + + +class TestATract(unittest.TestCase): + def test_to_vector(self): + a_tract_1 = ATract(RNASequence("ACGU")) + + self.assertEqual(a_tract_1.to_vector(4), [1, 0, 2, 0, 1, 3, 1, 0, 3, 0, 1, 2]) + self.assertEqual(a_tract_1.to_vector(3), [0, 1, 3, 1, 0, 3, 0, 1, 2]) + self.assertEqual(a_tract_1.to_vector(5), [0, 0, 0, 1, 0, 2, 0, 1, 3, 1, 0, 3, 0, 1, 2]) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From fd1ba16726a931cd7a33889b6362c8081f0f9f19 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 15:47:14 +0200 Subject: [PATCH 20/41] Add tests for a-tract and u-tract embeddings --- mewtwo/embeddings/terminator/a_tract.py | 2 +- .../terminator/test/test_a_tract.py | 30 ++++++++++-- .../terminator/test/test_u_tract.py | 46 +++++++++++++++++++ mewtwo/embeddings/terminator/u_tract.py | 18 ++++++++ 4 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 mewtwo/embeddings/terminator/test/test_u_tract.py diff --git a/mewtwo/embeddings/terminator/a_tract.py b/mewtwo/embeddings/terminator/a_tract.py index fbfbf7a..3b9f7fb 100644 --- a/mewtwo/embeddings/terminator/a_tract.py +++ b/mewtwo/embeddings/terminator/a_tract.py @@ -7,7 +7,7 @@ def __init__(self, sequence: RNASequence) -> None: self.sequence = sequence def to_vector(self, a_tract_size: int = 10, one_hot: bool = False) -> list[int]: - vector = [] + vector: list[int] = [] if len(self.sequence) < a_tract_size: for i in range(a_tract_size - len(self.sequence)): vector.extend(base_to_vector(Base.ZERO_PADDING, one_hot)) diff --git a/mewtwo/embeddings/terminator/test/test_a_tract.py b/mewtwo/embeddings/terminator/test/test_a_tract.py index e849c75..6f4e606 100644 --- a/mewtwo/embeddings/terminator/test/test_a_tract.py +++ b/mewtwo/embeddings/terminator/test/test_a_tract.py @@ -8,10 +8,32 @@ class TestATract(unittest.TestCase): def test_to_vector(self): a_tract_1 = ATract(RNASequence("ACGU")) - self.assertEqual(a_tract_1.to_vector(4), [1, 0, 2, 0, 1, 3, 1, 0, 3, 0, 1, 2]) - self.assertEqual(a_tract_1.to_vector(3), [0, 1, 3, 1, 0, 3, 0, 1, 2]) - self.assertEqual(a_tract_1.to_vector(5), [0, 0, 0, 1, 0, 2, 0, 1, 3, 1, 0, 3, 0, 1, 2]) + self.assertEqual(a_tract_1.to_vector(4), [1, 0, 2, + 0, 1, 3, + 1, 0, 3, + 0, 1, 2]) + self.assertEqual(a_tract_1.to_vector(3), [0, 1, 3, + 1, 0, 3, + 0, 1, 2]) + self.assertEqual(a_tract_1.to_vector(5), [0, 0, 0, + 1, 0, 2, + 0, 1, 3, + 1, 0, 3, + 0, 1, 2]) + + self.assertEqual(a_tract_1.to_vector(4, one_hot=True), [1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1]) + self.assertEqual(a_tract_1.to_vector(3, one_hot=True), [0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1]) + self.assertEqual(a_tract_1.to_vector(5, one_hot=True), [0, 0, 0, 0, + 1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1]) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/mewtwo/embeddings/terminator/test/test_u_tract.py b/mewtwo/embeddings/terminator/test/test_u_tract.py new file mode 100644 index 0000000..47dc309 --- /dev/null +++ b/mewtwo/embeddings/terminator/test/test_u_tract.py @@ -0,0 +1,46 @@ +import unittest + +from mewtwo.embeddings.terminator.u_tract import UTract +from mewtwo.embeddings.sequence import RNASequence + + +class TestUTract(unittest.TestCase): + def test_to_vector(self): + u_tract_1 = UTract(RNASequence("ACGU"), 2) + + self.assertEqual(u_tract_1.to_vector(4), [1, 0, 2, 0, + 0, 1, 3, 0, + 1, 0, 3, 1, + 0, 1, 2, 0]) + self.assertEqual(u_tract_1.to_vector(3), [1, 0, 2, 0, + 0, 1, 3, 0, + 1, 0, 3, 1]) + self.assertEqual(u_tract_1.to_vector(5), [1, 0, 2, 0, + 0, 1, 3, 0, + 1, 0, 3, 1, + 0, 1, 2, 0, + 0, 0, 0, 0]) + + self.assertEqual(u_tract_1.to_vector(4, one_hot=True), [1, 0, 0, 0, 0, + 0, 1, 0, 0, 0, + 0, 0, 1, 0, 1, + 0, 0, 0, 1, 0]) + self.assertEqual(u_tract_1.to_vector(3, one_hot=True), [1, 0, 0, 0, 0, + 0, 1, 0, 0, 0, + 0, 0, 1, 0, 1]) + self.assertEqual(u_tract_1.to_vector(5, one_hot=True), [1, 0, 0, 0, 0, + 0, 1, 0, 0, 0, + 0, 0, 1, 0, 1, + 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0]) + + def test_init(self): + u_tract_1 = UTract(RNASequence("ACGU"), 2) + self.assertEqual(2, u_tract_1.pot) + + with self.assertRaises(AssertionError): + UTract(RNASequence("ACGU"), 4) + + +if __name__ == "__main__": + unittest.main() diff --git a/mewtwo/embeddings/terminator/u_tract.py b/mewtwo/embeddings/terminator/u_tract.py index e7dcd37..3c84101 100644 --- a/mewtwo/embeddings/terminator/u_tract.py +++ b/mewtwo/embeddings/terminator/u_tract.py @@ -1,7 +1,25 @@ from mewtwo.embeddings.sequence import RNASequence +from mewtwo.embeddings.bases import Base, base_to_vector class UTract: def __init__(self, sequence: RNASequence, pot: int) -> None: self.sequence = sequence self.pot = pot + assert self.pot < len(self.sequence) + + def to_vector(self, u_tract_size: int = 10, one_hot: bool = False) -> list[int]: + vector = [] + for i in range(u_tract_size): + try: + base = self.sequence[i] + except IndexError: + base = Base.ZERO_PADDING + vector.extend(base_to_vector(base, one_hot)) + + if i == self.pot: + vector.append(1) + else: + vector.append(0) + + return vector From b0d34bc493511d317eef6724007d8ead6cd06c23 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 15:47:45 +0200 Subject: [PATCH 21/41] Add terminator class (untested) --- mewtwo/embeddings/terminator/terminator.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/mewtwo/embeddings/terminator/terminator.py b/mewtwo/embeddings/terminator/terminator.py index a9cbcb4..d4e28bd 100644 --- a/mewtwo/embeddings/terminator/terminator.py +++ b/mewtwo/embeddings/terminator/terminator.py @@ -1,16 +1,33 @@ -from typing import Union +from typing import Union, Optional from mewtwo.embeddings.terminator.hairpin import RNAFoldHairpin, TransTermHPHairpin from mewtwo.embeddings.terminator.a_tract import ATract from mewtwo.embeddings.terminator.u_tract import UTract +from mewtwo.embeddings.sequence import RNASequence class Terminator: - def __init__(self, hairpin: Union[RNAFoldHairpin, TransTermHPHairpin], a_tract: ATract, u_tract: UTract): + def __init__(self, start: int, end: int, pot: int, species: str, chromosome: str, strand: str, + sequence: RNASequence, termination_efficiency: Optional[float], + hairpin: Union[RNAFoldHairpin, TransTermHPHairpin], a_tract: ATract, u_tract: UTract): + self.start = start + self.end = end + self.pot = pot + self.species = species + self.chromosome = chromosome + self.te = termination_efficiency + self.sequence = sequence + self.strand = strand + self.hairpin = hairpin self.a_tract = a_tract self.u_tract = u_tract def to_vector(self, one_hot: bool = False) -> list[int]: + assert not self.hairpin.contains_multiple_hairpins() vector = [] vector.extend(self.a_tract.sequence.to_vector(one_hot)) + vector.extend(self.hairpin.stem.to_vector()) + vector.extend(self.hairpin.loop.to_vector()) + vector.extend(self.u_tract.to_vector()) + return vector From f7ea4f35015d4e4b2de6d3fc0ea260108c9337b5 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 28 Apr 2025 15:48:27 +0200 Subject: [PATCH 22/41] Parse terminators from termite output (untested) --- mewtwo/parsers/parse_termite_output.py | 135 +++++++++++++++++++++++-- 1 file changed, 127 insertions(+), 8 deletions(-) diff --git a/mewtwo/parsers/parse_termite_output.py b/mewtwo/parsers/parse_termite_output.py index ad824a6..66e9086 100644 --- a/mewtwo/parsers/parse_termite_output.py +++ b/mewtwo/parsers/parse_termite_output.py @@ -1,12 +1,122 @@ from mewtwo.parsers.tabular import Tabular from mewtwo.embeddings.terminator.hairpin import RNAFoldHairpin, TransTermHPHairpin from mewtwo.embeddings.bases import BasePair, Base +from mewtwo.embeddings.terminator.terminator import Terminator +from mewtwo.embeddings.sequence import DNASequence, convert_to_rna +from mewtwo.embeddings.terminator.a_tract import ATract +from mewtwo.embeddings.terminator.u_tract import UTract from sys import argv from pprint import pprint -def rnafold_hairpins_from_termite(input_file, get_rnafold: bool = True, get_transtermhp: bool = False, +def parse_termite_data(input_file: str, species_column: bool) -> Tabular: + if not species_column: + termite_data = Tabular(input_file, [0, 6]) + else: + termite_data = Tabular(input_file, [0, 1, 7]) + + return termite_data + + +def get_termite_terminators(input_file: str, prioritise_rnafold: bool = True, species_column: bool = False, + te_only: bool = True) -> list[Terminator]: + """ + + Parameters + ---------- + input_file: str, tabular termite output file + prioritise_rnafold: bool, if True, prioritise hairpins predicted with RNAFold. Otherwise, prioritise hairpins + predicted with TransTermHP + species_column: bool, if True, termite input contains an additional species column + te_only: bool, if True, only return terminators for which the termination efficiency is known + + Returns + ------- + list of terminator instances + + """ + termite_data = parse_termite_data(input_file, species_column) + + rnafold_terminators = {} + transtermhp_terminators = {} + + for datapoint in termite_data.data: + terminator_id = '|'.join(datapoint) + if not species_column: + species = "unknown" + else: + species = termite_data.get_value(datapoint, "Species") + chromosome = termite_data.get_value(datapoint, 'chromosome') + pot = int(termite_data.get_value(datapoint, 'POT')) + start = int(termite_data.get_value(datapoint, 'start')) + end = int(termite_data.get_value(datapoint, 'end')) + strand = termite_data.get_value(datapoint, 'strand') + sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, 'sequence'))) + + te = termite_data.get_value(datapoint, "termination efficiency") + if te == '.': + te = None + else: + te = float(te) + + if te_only and te is None: + continue + + if termite_data.get_value(datapoint, 'rnafold') == '+': + + hairpin = RNAFoldHairpin(terminator_id, + termite_data.get_value(datapoint, "rnafold POT distance to hairpin"), + termite_data.get_value(datapoint, "rnafold energy"), + termite_data.get_value(datapoint, "rnafold hairpin"), + termite_data.get_value(datapoint, "rnafold hairpin structure")) + if not hairpin.contains_multiple_hairpins(): + a_tract_sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, "rnafold a tract"))) + a_tract = ATract(a_tract_sequence) + u_tract_sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, "rnafold u tract"))) + relative_pot = int(termite_data.get_value(datapoint, "rnafold POT distance to hairpin")) + u_tract = UTract(u_tract_sequence, relative_pot) + + terminator = Terminator(start, end, pot, species, chromosome, strand, sequence, te, hairpin, a_tract, + u_tract) + rnafold_terminators[terminator_id] = terminator + + if termite_data.get_value(datapoint, 'transtermhp') == '+': + hairpin = TransTermHPHairpin(terminator_id, + termite_data.get_value(datapoint, "transtermhp POT distance to hairpin"), + termite_data.get_value(datapoint, "transtermhp hairpin score"), + termite_data.get_value(datapoint, "transtermhp hairpin")) + if not hairpin.contains_multiple_hairpins(): + a_tract_sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, "transtermhp a tract"))) + a_tract = ATract(a_tract_sequence) + u_tract_sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, "transtermhp u tract"))) + relative_pot = int(termite_data.get_value(datapoint, "transtermhp POT distance to hairpin")) + u_tract = UTract(u_tract_sequence, relative_pot) + + terminator = Terminator(start, end, pot, species, chromosome, strand, sequence, te, hairpin, a_tract, + u_tract) + transtermhp_terminators[terminator_id] = terminator + + terminators = [] + + if prioritise_rnafold: + prioritised_terminators = rnafold_terminators + other_terminators = transtermhp_terminators + else: + prioritised_terminators = transtermhp_terminators + other_terminators = rnafold_terminators + + for terminator_id, terminator in prioritised_terminators.items(): + terminators.append(terminator) + + for terminator_id, terminator in other_terminators.items(): + if terminator_id not in prioritised_terminators: + terminators.append(terminator) + + return terminators + + +def rnafold_hairpins_from_termite(input_file: str, get_rnafold: bool = True, get_transtermhp: bool = False, get_mutually_exclusive: bool = False, species_column: bool = False): if get_rnafold and get_transtermhp: @@ -17,10 +127,8 @@ def rnafold_hairpins_from_termite(input_file, get_rnafold: bool = True, get_tran rnafold_hairpins = {} transtermhp_hairpins = {} - if not species_column: - termite_data = Tabular(input_file, [0, 6]) - else: - termite_data = Tabular(input_file, [0, 1, 7]) + termite_data = parse_termite_data(input_file, species_column) + for datapoint in termite_data.data: hairpin_id = '|'.join(datapoint) if termite_data.get_value(datapoint, 'rnafold') == '+': @@ -36,7 +144,9 @@ def rnafold_hairpins_from_termite(input_file, get_rnafold: bool = True, get_tran termite_data.get_value(datapoint, "transtermhp hairpin score"), termite_data.get_value(datapoint, "transtermhp hairpin")) transtermhp_hairpins[hairpin_id] = hairpin + hairpins = [] + if get_rnafold: for hairpin_id, hairpin in rnafold_hairpins.items(): hairpins.append(hairpin) @@ -67,8 +177,7 @@ def rnafold_hairpins_from_termite(input_file, get_rnafold: bool = True, get_tran print(hairpin.distance_to_pot, hairpin.free_energy, hairpin.hairpin_sequence, hairpin.hairpin_structure) else: - loop, stem = hairpin.get_hairpin_parts() - for basepair in stem.get_basepairs(): + for basepair in hairpin.stem.get_basepairs(): if hairpin.prediction_software == 'RNAFold': if basepair not in basepairs_to_counts_rnafold: @@ -79,14 +188,24 @@ def rnafold_hairpins_from_termite(input_file, get_rnafold: bool = True, get_tran if basepair not in basepairs_to_counts_transtermhp: basepairs_to_counts_transtermhp[basepair] = 0 basepairs_to_counts_transtermhp[basepair] += 1 - if basepair == BasePair(Base.A, Base.A, True): + if basepair == BasePair(Base.A, Base.A, False): print(hairpin.hairpin_sequence) print(hairpin.hairpin_structure) + print(hairpin.distance_to_pot) print(f"{counter} structures contain multiple hairpins") pprint(basepairs_to_counts_rnafold) pprint(basepairs_to_counts_transtermhp) + quantified_terminators = get_termite_terminators(argv[1], species_column=True, te_only=True) + print(len(quantified_terminators)) + + for terminator in quantified_terminators: + print(terminator.te) + print(terminator.hairpin.stem.to_vector(10)) + # all_terminators = get_termite_terminators(argv[1], species_column=True, te_only=False) + # print(len(all_terminators)) + From 4f17651c64890e4967aa037145d4fdc8f1d9159a Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:04:19 +0200 Subject: [PATCH 23/41] Add enum for feature type --- mewtwo/embeddings/feature_labels.py | 48 ++++++++++++++++--- mewtwo/embeddings/test/test_feature_labels.py | 4 +- .../__init__.py} | 0 mewtwo/machine_learning/__init__.py | 0 mewtwo/machine_learning/neural_network.py | 0 mewtwo/machine_learning/prepare_data.py | 0 .../random_forest/__init__.py | 0 .../random_forest/train_random_forest.py | 0 .../test/test_train_test_split.py | 0 .../machine_learning/transformer/__init__.py | 0 .../transformer/dnabert_regressor.py | 32 +++++++++++++ .../transformer/finetune_bert.py | 0 .../transformer/prepare_data.py | 0 mewtwo/parsers/parse_dnabert_data.py | 0 14 files changed, 76 insertions(+), 8 deletions(-) rename mewtwo/{embeddings/test/test_terminator.py => external_code/__init__.py} (100%) create mode 100644 mewtwo/machine_learning/__init__.py create mode 100644 mewtwo/machine_learning/neural_network.py create mode 100644 mewtwo/machine_learning/prepare_data.py create mode 100644 mewtwo/machine_learning/random_forest/__init__.py create mode 100644 mewtwo/machine_learning/random_forest/train_random_forest.py create mode 100644 mewtwo/machine_learning/test/test_train_test_split.py create mode 100644 mewtwo/machine_learning/transformer/__init__.py create mode 100644 mewtwo/machine_learning/transformer/dnabert_regressor.py create mode 100644 mewtwo/machine_learning/transformer/finetune_bert.py create mode 100644 mewtwo/machine_learning/transformer/prepare_data.py create mode 100644 mewtwo/parsers/parse_dnabert_data.py diff --git a/mewtwo/embeddings/feature_labels.py b/mewtwo/embeddings/feature_labels.py index 9563358..7e0fe13 100644 --- a/mewtwo/embeddings/feature_labels.py +++ b/mewtwo/embeddings/feature_labels.py @@ -2,6 +2,31 @@ from mewtwo.embeddings.sequence import SeqType from mewtwo.embeddings.bases import Base +from enum import Enum + +class FeatureType(Enum): + IS_A = 1 + IS_C = 2 + IS_G = 3 + IS_U = 4 + IS_PURINE = 5 + IS_PYRIMIDINE = 6 + NR_H_BONDS = 7 + IS_BONDED = 8 + IS_POT = 9 + + @staticmethod + def from_base(base: Base) -> "FeatureType": + base_to_feature_type = {Base.A: FeatureType.IS_A, + Base.C: FeatureType.IS_C, + Base.G: FeatureType.IS_G, + Base.U: FeatureType.IS_U, + Base.PURINES: FeatureType.IS_PURINE, + Base.PYRIMIDINES: FeatureType.IS_PYRIMIDINE} + + assert base in base_to_feature_type + + return base_to_feature_type[base] class FeatureLabel: @@ -14,6 +39,8 @@ def __init__(self, feature_position: int, max_a_tract_length: int, self.seq_type = seq_type self.one_hot = one_hot + self.feature_type: Optional[FeatureType] = None + self.base_index: Optional[int] = None self.base_identity: Optional[Base] = None self.base_hydrogen_bond_count: bool = False @@ -41,25 +68,28 @@ def __init__(self, feature_position: int, max_a_tract_length: int, nr_u_tract_features = 4 * max_u_tract_length if feature_position < nr_a_tract_features: - self.feature_type = "A-tract" + self.feature_category = "A-tract" self.relative_feature_position = feature_position self.set_base_information() elif feature_position < nr_a_tract_features + nr_stem_features: - self.feature_type = "stem" + self.feature_category = "stem" self.relative_feature_position = feature_position - nr_a_tract_features self.set_basepair_information() elif feature_position < nr_a_tract_features + nr_stem_features + nr_loop_features: - self.feature_type = "loop" + self.feature_category = "loop" self.relative_feature_position = feature_position - nr_a_tract_features - nr_stem_features self.set_base_information() elif feature_position < nr_a_tract_features + nr_stem_features + nr_loop_features + nr_u_tract_features: - self.feature_type = "U-tract" + self.feature_category = "U-tract" self.relative_feature_position = \ feature_position - nr_a_tract_features - nr_stem_features - nr_loop_features self.set_base_information() else: raise IndexError(f"Feature position {feature_position} does not exist.") + def __str__(self): + pass + def set_base_information(self) -> None: if self.one_hot: @@ -73,7 +103,7 @@ def set_base_information(self) -> None: else: features = [Base.PURINES, Base.PYRIMIDINES, "hydrogen bonds"] - if self.feature_type == 'U-tract': + if self.feature_category == 'U-tract': features.append('POT') feature_nr = len(features) @@ -83,10 +113,13 @@ def set_base_information(self) -> None: feature = features[i] if feature == 'POT': self.check_pot = True + self.feature_type = FeatureType.IS_POT elif feature == 'hydrogen bonds': self.base_hydrogen_bond_count = True + self.feature_type = FeatureType.NR_H_BONDS else: self.base_identity = feature + self.feature_type = FeatureType.from_base(feature) self.base_index = self.relative_feature_position // feature_nr + 1 break @@ -95,7 +128,7 @@ def set_base_information(self) -> None: raise ValueError(f"Could not find feature type for feature at {self.relative_feature_position}") def set_basepair_information(self) -> None: - assert self.feature_type == "stem" + assert self.feature_category == "stem" if self.one_hot: feature_nr = 9 if self.seq_type == SeqType.RNA: @@ -124,11 +157,14 @@ def set_basepair_information(self) -> None: if feature != 'hydrogen bonds': self.base_identity = feature + self.feature_type = FeatureType.from_base(feature) else: self.base_hydrogen_bond_count = True + self.feature_type = FeatureType.NR_H_BONDS else: self.check_pairing = True + self.feature_type = FeatureType.IS_BONDED break else: diff --git a/mewtwo/embeddings/test/test_feature_labels.py b/mewtwo/embeddings/test/test_feature_labels.py index 5dca8e0..5ba63ff 100644 --- a/mewtwo/embeddings/test/test_feature_labels.py +++ b/mewtwo/embeddings/test/test_feature_labels.py @@ -29,7 +29,7 @@ def test_set_base_information(self): self.assertEqual(feature_a01.base_index, 1) self.assertEqual(feature_a01.base_pair_index, None) self.assertEqual(feature_a01.base_hydrogen_bond_count, False) - self.assertEqual(feature_a01.feature_type, 'A-tract') + self.assertEqual(feature_a01.feature_category, 'A-tract') self.assertEqual(feature_a02.base_identity, Base.PYRIMIDINES) self.assertEqual(feature_a02.base_index, 1) @@ -98,7 +98,7 @@ def test_set_basepair_information(self): self.assertEqual(feature_s01.base_pair_index, 1) self.assertEqual(feature_s01.stem_shoulder, 'upstream') self.assertEqual(feature_s01.base_hydrogen_bond_count, False) - self.assertEqual(feature_s01.feature_type, 'stem') + self.assertEqual(feature_s01.feature_category, 'stem') self.assertEqual(feature_s02.base_identity, Base.PYRIMIDINES) self.assertEqual(feature_s03.base_identity, None) diff --git a/mewtwo/embeddings/test/test_terminator.py b/mewtwo/external_code/__init__.py similarity index 100% rename from mewtwo/embeddings/test/test_terminator.py rename to mewtwo/external_code/__init__.py diff --git a/mewtwo/machine_learning/__init__.py b/mewtwo/machine_learning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/neural_network.py b/mewtwo/machine_learning/neural_network.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/prepare_data.py b/mewtwo/machine_learning/prepare_data.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/random_forest/__init__.py b/mewtwo/machine_learning/random_forest/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/random_forest/train_random_forest.py b/mewtwo/machine_learning/random_forest/train_random_forest.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/test/test_train_test_split.py b/mewtwo/machine_learning/test/test_train_test_split.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/transformer/__init__.py b/mewtwo/machine_learning/transformer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/transformer/dnabert_regressor.py b/mewtwo/machine_learning/transformer/dnabert_regressor.py new file mode 100644 index 0000000..cdce2f9 --- /dev/null +++ b/mewtwo/machine_learning/transformer/dnabert_regressor.py @@ -0,0 +1,32 @@ +import torch +import sys +from transformers import AutoTokenizer, AutoModel +import os +import torch.nn as nn + +import mewtwo.external_code + +sys.path.append(os.path.join(mewtwo.external_code.__file__, "DNABERT-2-117M")) + +from mewtwo.external_code.dnabert.bert_layers import BertModel + +tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) +model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") +model.to("cpu") + + +class DNABERTRegressor(nn.Module): + def __init__(self, base_model, hidden_size=768): + super().__init__() + self.base = base_model + self.dropout = nn.Dropout(0.1) + # Regression output: predicting continuous values + self.regressor = nn.Linear(hidden_size, 1) + + def forward(self, input_ids, attention_mask=None): + # Pass through the model + outputs = self.base(input_ids=input_ids, attention_mask=attention_mask) + cls_token = outputs.last_hidden_state[:, 0] # Use the CLS token for regression + logits = self.regressor(self.dropout(cls_token)) + return logits + diff --git a/mewtwo/machine_learning/transformer/finetune_bert.py b/mewtwo/machine_learning/transformer/finetune_bert.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/transformer/prepare_data.py b/mewtwo/machine_learning/transformer/prepare_data.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/parsers/parse_dnabert_data.py b/mewtwo/parsers/parse_dnabert_data.py new file mode 100644 index 0000000..e69de29 From ce8cf7395774708ae331bbde619ed1d6e84fe3a6 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:05:34 +0200 Subject: [PATCH 24/41] Bugfix: loop embeddings are now properly aligned --- mewtwo/embeddings/terminator/loop.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mewtwo/embeddings/terminator/loop.py b/mewtwo/embeddings/terminator/loop.py index b5498d6..378a67a 100644 --- a/mewtwo/embeddings/terminator/loop.py +++ b/mewtwo/embeddings/terminator/loop.py @@ -33,22 +33,20 @@ def to_vector(self, max_loop_size, one_hot=False): right_vector.extend(base_to_vector(base, one_hot=one_hot)) vector = left_padding + left_vector + center_vector + right_vector + right_padding - print(left_padding) - print(left_vector) - print(center_vector) - print(right_vector) - print(right_padding) return vector def get_max_loop_size(loops: list[Loop]) -> int: max_loop_size = 0 for loop in loops: - stem_size = len(loop.sequence) - if stem_size > max_loop_size: - max_loop_size += 1 + loop_size = len(loop.sequence) + if loop_size > max_loop_size: + + max_loop_size = loop_size + if max_loop_size % 2 == 0: + max_loop_size += 1 return max_loop_size From f13608396e559ec368e2e1c8e48ba2c7d8113c57 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:06:09 +0200 Subject: [PATCH 25/41] Bugfix: terminator featurisation --- mewtwo/embeddings/terminator/terminator.py | 31 ++++++++++++++++++---- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/mewtwo/embeddings/terminator/terminator.py b/mewtwo/embeddings/terminator/terminator.py index d4e28bd..56c6cae 100644 --- a/mewtwo/embeddings/terminator/terminator.py +++ b/mewtwo/embeddings/terminator/terminator.py @@ -4,6 +4,7 @@ from mewtwo.embeddings.terminator.a_tract import ATract from mewtwo.embeddings.terminator.u_tract import UTract from mewtwo.embeddings.sequence import RNASequence +from mewtwo.embeddings.terminator.loop import get_max_loop_size class Terminator: @@ -23,11 +24,31 @@ def __init__(self, start: int, end: int, pot: int, species: str, chromosome: str self.a_tract = a_tract self.u_tract = u_tract - def to_vector(self, one_hot: bool = False) -> list[int]: + def to_vector(self, max_loop_size: int, max_stem_size: int = 10, a_tract_size: int = 10, u_tract_size: int = 10, + one_hot: bool = False) -> list[int]: assert not self.hairpin.contains_multiple_hairpins() vector = [] - vector.extend(self.a_tract.sequence.to_vector(one_hot)) - vector.extend(self.hairpin.stem.to_vector()) - vector.extend(self.hairpin.loop.to_vector()) - vector.extend(self.u_tract.to_vector()) + vector.extend(self.a_tract.to_vector(a_tract_size=a_tract_size, one_hot=one_hot)) + vector.extend(self.hairpin.stem.to_vector(max_stem_size=max_stem_size, one_hot=one_hot)) + vector.extend(self.hairpin.loop.to_vector(max_loop_size=max_loop_size, one_hot=one_hot)) + vector.extend(self.u_tract.to_vector(u_tract_size=u_tract_size, one_hot=one_hot)) return vector + + +def get_terminator_part_sizes(terminators: list[Terminator]) -> tuple[int, int, int, int]: + loops = [] + a_tracts = [] + u_tracts = [] + stems = [] + for terminator in terminators: + loops.append(terminator.hairpin.loop) + a_tracts.append(terminator.a_tract) + u_tracts.append(terminator.u_tract) + stems.append(terminator.hairpin.stem) + + max_loop_size = get_max_loop_size(loops) + max_stem_size = max([len(stem.basepairs) for stem in stems]) + max_a_tract_size = max([len(a_tract.sequence) for a_tract in a_tracts]) + max_u_tract_size = max([len(u_tract.sequence) for u_tract in u_tracts]) + + return max_loop_size, max_stem_size, max_a_tract_size, max_u_tract_size From 0ef6ebbc8b24f033863cb0adb6f5200d4c05c3d9 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:07:10 +0200 Subject: [PATCH 26/41] Add machine learning code --- mewtwo/machine_learning/neural_network.py | 12 ++++ mewtwo/machine_learning/prepare_data.py | 21 +++++++ .../random_forest/train_random_forest.py | 44 ++++++++++++++ .../test/test_train_test_split.py | 59 +++++++++++++++++++ .../transformer/dnabert_regressor.py | 28 +++------ .../transformer/finetune_bert.py | 33 +++++++++++ .../transformer/prepare_data.py | 36 +++++++++++ 7 files changed, 213 insertions(+), 20 deletions(-) diff --git a/mewtwo/machine_learning/neural_network.py b/mewtwo/machine_learning/neural_network.py index e69de29..d4891b5 100644 --- a/mewtwo/machine_learning/neural_network.py +++ b/mewtwo/machine_learning/neural_network.py @@ -0,0 +1,12 @@ +from sklearn.neural_network import MLPRegressor + +from mewtwo.machine_learning.prepare_data import terminators_to_ml_input + + +def train_nn(train_terminators, test_terminators, one_hot=True): + train_x, train_y, test_x, test_y = terminators_to_ml_input(train_terminators, test_terminators, one_hot=one_hot) + print(len(train_x)) + nn = MLPRegressor(hidden_layer_sizes=(500, 1000, 1000, 500, 100), random_state=100125, max_iter=1000, solver='adam') + nn.fit(train_x, train_y) + + print(nn.score(test_x, test_y)) \ No newline at end of file diff --git a/mewtwo/machine_learning/prepare_data.py b/mewtwo/machine_learning/prepare_data.py index e69de29..e723521 100644 --- a/mewtwo/machine_learning/prepare_data.py +++ b/mewtwo/machine_learning/prepare_data.py @@ -0,0 +1,21 @@ +from mewtwo.embeddings.terminator.terminator import get_terminator_part_sizes, Terminator + + +def terminators_to_ml_input(train_terminators: list[Terminator], test_terminators: list[Terminator], one_hot=True) -> \ + tuple[list[list[int]], list[float], list[list[int]], list[float]]: + max_loop, max_stem, max_a, max_u = get_terminator_part_sizes(train_terminators + test_terminators) + + train_x = [] + train_y = [] + test_x = [] + test_y = [] + + for terminator in train_terminators: + train_x.append(terminator.to_vector(max_loop, max_stem, max_a, max_u, one_hot=one_hot)) + train_y.append(terminator.te) + + for terminator in test_terminators: + test_x.append(terminator.to_vector(max_loop, max_stem, max_a, max_u, one_hot=one_hot)) + test_y.append(terminator.te) + + return train_x, train_y, test_x, test_y diff --git a/mewtwo/machine_learning/random_forest/train_random_forest.py b/mewtwo/machine_learning/random_forest/train_random_forest.py index e69de29..693f6ff 100644 --- a/mewtwo/machine_learning/random_forest/train_random_forest.py +++ b/mewtwo/machine_learning/random_forest/train_random_forest.py @@ -0,0 +1,44 @@ +from sklearn.ensemble import RandomForestRegressor + +from mewtwo.embeddings.terminator.terminator import get_terminator_part_sizes +from mewtwo.embeddings.feature_labels import FeatureLabel + + +def train_random_forest(train_terminators, test_terminators, one_hot=False): + all_terminators = train_terminators + test_terminators + max_loop, max_stem, max_a, max_u = get_terminator_part_sizes(all_terminators) + train_x = [] + train_y = [] + test_x = [] + test_y = [] + for terminator in train_terminators: + train_x.append(terminator.to_vector(max_loop, max_stem, max_a, max_u, one_hot=one_hot)) + train_y.append(terminator.te) + + for terminator in test_terminators: + test_x.append(terminator.to_vector(max_loop, max_stem, max_a, max_u, one_hot=one_hot)) + test_y.append(terminator.te) + + random_forest = RandomForestRegressor(n_estimators=1000, oob_score=True) + random_forest.fit(train_x, train_y) + + print("oob", random_forest.oob_score_) + importance_and_label = [] + for i, feature_importance in enumerate(random_forest.feature_importances_): + feature_label = FeatureLabel(i, max_a, max_stem, max_loop, max_u, one_hot=one_hot) + importance_and_label.append((feature_importance, feature_label)) + + importance_and_label.sort(key=lambda x: x[0], reverse=True) + + for importance, label in importance_and_label[:20]: + if label.feature_category == 'stem': + print(label.feature_category, label.feature_type, importance, label.base_pair_index, label.base_identity, label.stem_shoulder) + else: + print(label.feature_category, label.feature_type, importance, label.base_index, label.base_identity) + + print("test score", random_forest.score(test_x, test_y)) + + return random_forest + + + diff --git a/mewtwo/machine_learning/test/test_train_test_split.py b/mewtwo/machine_learning/test/test_train_test_split.py index e69de29..57a58e2 100644 --- a/mewtwo/machine_learning/test/test_train_test_split.py +++ b/mewtwo/machine_learning/test/test_train_test_split.py @@ -0,0 +1,59 @@ +import unittest +import os +from shutil import rmtree + + +from mewtwo.machine_learning.train_test_split import split_data, split_data_from_file, bin_data +from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data + +BASE_DIR = os.path.dirname(__file__) + + +class TestTrainTestSplit(unittest.TestCase): + def test_bin_data(self): + dummy_data_1 = [0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] + dummy_bins_1 = [0, 0, 0, 1, 1, 8, 9, 9] + + self.assertEqual(bin_data(dummy_data_1), dummy_bins_1) + + dummy_data_2 = [1.0, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] + dummy_bins_2 = [9, 0, 0, 0, 1, 1, 8, 9, 9] + + self.assertEqual(bin_data(dummy_data_2), dummy_bins_2) + + dummy_data_3 = [1.1, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] + + with self.assertRaises(ValueError): + bin_data(dummy_data_3) + + dummy_data_4 = [-0.1, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] + + with self.assertRaises(ValueError): + bin_data(dummy_data_4) + + def test_split_data_from_file(self): + + + test_data_1 = os.path.abspath(os.path.join(BASE_DIR, 'data', 'mock_data_1.txt')) + test_output_1 = os.path.abspath(os.path.join(BASE_DIR, 'output', 'mock_output_1')) + + test_data_2 = os.path.abspath(os.path.join(BASE_DIR, 'data', 'mock_data_2.txt')) + test_output_2 = os.path.abspath(os.path.join(BASE_DIR, 'output', 'mock_output_2')) + + split_data_from_file(test_data_1, test_output_1) + + self.assertEqual(os.path.exists(test_output_1), True) + self.assertEqual(os.path.exists(os.path.join(test_output_1, "validation.txt")), True) + self.assertEqual(os.path.exists(os.path.join(test_output_1, "train.txt")), True) + self.assertEqual(os.path.exists(os.path.join(test_output_1, "test.txt")), True) + + x_train_1, y_train_1 = parse_dnabert_data(os.path.join(test_output_1, "train.txt")) + self.assertEqual(len(x_train_1), 50) + + x_test_1, y_test_1 = parse_dnabert_data(os.path.join(test_output_1, "test.txt")) + self.assertEqual(len(x_test_1), 25) + + x_val_1, y_val_1 = parse_dnabert_data(os.path.join(test_output_1, "validation.txt")) + self.assertEqual(len(x_val_1), 25) + + rmtree(test_output_1) diff --git a/mewtwo/machine_learning/transformer/dnabert_regressor.py b/mewtwo/machine_learning/transformer/dnabert_regressor.py index cdce2f9..771aaa9 100644 --- a/mewtwo/machine_learning/transformer/dnabert_regressor.py +++ b/mewtwo/machine_learning/transformer/dnabert_regressor.py @@ -1,32 +1,20 @@ -import torch -import sys -from transformers import AutoTokenizer, AutoModel -import os import torch.nn as nn -import mewtwo.external_code - -sys.path.append(os.path.join(mewtwo.external_code.__file__, "DNABERT-2-117M")) - -from mewtwo.external_code.dnabert.bert_layers import BertModel - -tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) -model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") -model.to("cpu") - class DNABERTRegressor(nn.Module): - def __init__(self, base_model, hidden_size=768): + def __init__(self, base_model, hidden_size=768, dropout: float = 0.2, + tuning_mode: str = 'linear_head'): super().__init__() self.base = base_model - self.dropout = nn.Dropout(0.1) - # Regression output: predicting continuous values - self.regressor = nn.Linear(hidden_size, 1) + self.dropout = nn.Dropout(dropout) + if tuning_mode == 'linear_head': + + self.regressor = nn.Sequential(self.dropout, nn.Linear(hidden_size, 1), nn.Sigmoid()) def forward(self, input_ids, attention_mask=None): # Pass through the model outputs = self.base(input_ids=input_ids, attention_mask=attention_mask) - cls_token = outputs.last_hidden_state[:, 0] # Use the CLS token for regression - logits = self.regressor(self.dropout(cls_token)) + output = outputs.last_hidden_state[:, 0, :] + logits = self.regressor(output).squeeze(-1) return logits diff --git a/mewtwo/machine_learning/transformer/finetune_bert.py b/mewtwo/machine_learning/transformer/finetune_bert.py index e69de29..dfedbcf 100644 --- a/mewtwo/machine_learning/transformer/finetune_bert.py +++ b/mewtwo/machine_learning/transformer/finetune_bert.py @@ -0,0 +1,33 @@ +import argparse + +from transformers import AutoTokenizer, AutoModel + +from mewtwo.external_code.dnabert.bert_layers import BertModel +from mewtwo.machine_learning.transformer.dnabert_regressor import DNABERTRegressor +from torch.utils.data import DataLoader +from mewtwo.machine_learning.transformer.prepare_data import RegressionDataset + + +def parse_arguments() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("-i", type=str, required=True, help="Tabular input data, with sequence in one column and efficiency in the second") + parser.add_argument("-o", type=str, help="Output directoru") + parser.add_argument("-f", type=str, default='linear_head', + help="Finetuning mode, must be one of 'linear_head', 'partial', and 'adapter'") + args = parser.parse_args() + + assert args.f in ['linear_head', 'partial', 'adapter'] + + return args + +if __name__ == "__main__": + + tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) + model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") + model.to("cpu") + + dataset = RegressionDataset(sequences, labels, tokenizer) + dataloader = DataLoader(dataset, batch_size=2, shuffle=True) + + for name, param in model.named_parameters(): + print(name, param) diff --git a/mewtwo/machine_learning/transformer/prepare_data.py b/mewtwo/machine_learning/transformer/prepare_data.py index e69de29..e2600cb 100644 --- a/mewtwo/machine_learning/transformer/prepare_data.py +++ b/mewtwo/machine_learning/transformer/prepare_data.py @@ -0,0 +1,36 @@ +import torch + +from torch.utils.data import Dataset, DataLoader + + +class RegressionDataset(Dataset): + def __init__(self, sequences, labels, tokenizer, max_length=512): + self.sequences = sequences + self.labels = labels + self.tokenizer = tokenizer + self.max_length = max_length + + def __len__(self): + return len(self.sequences) + + def __getitem__(self, idx): + sequence = self.sequences[idx] + label = self.labels[idx] + + # Tokenize the sequence + encoding = self.tokenizer(sequence, truncation=True, padding="max_length", max_length=self.max_length, + return_tensors="pt") + input_ids = encoding['input_ids'].squeeze(0) # Squeeze the batch dimension + attention_mask = encoding['attention_mask'].squeeze(0) # Squeeze the batch dimension + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'labels': torch.tensor(label, dtype=torch.float) # For regression, labels are continuous values + } + + +# Example sequences and labels +sequences = ["ACGTACGT", "AGCTAGCT", "CGTACGTA"] +labels = [0.5, 1.2, 0.8] # Continuous labels for regression + From baea83c069ce708581aaf7616852077dd3d858d6 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:07:39 +0200 Subject: [PATCH 27/41] Add parser for DNABERT data --- mewtwo/parsers/parse_dnabert_data.py | 11 ++ mewtwo/parsers/parse_termite_output.py | 133 ++++++++++++++++++------- 2 files changed, 108 insertions(+), 36 deletions(-) diff --git a/mewtwo/parsers/parse_dnabert_data.py b/mewtwo/parsers/parse_dnabert_data.py index e69de29..da32154 100644 --- a/mewtwo/parsers/parse_dnabert_data.py +++ b/mewtwo/parsers/parse_dnabert_data.py @@ -0,0 +1,11 @@ +def parse_dnabert_data(input_file: str) -> tuple[list[str], list[float]]: + seqs = [] + tes = [] + with open(input_file, 'r') as seq_data: + for line in seq_data: + line = line.strip() + seq, te = line.split('\t') + te = float(te) + seqs.append(seq) + tes.append(te / 100) + return seqs, tes diff --git a/mewtwo/parsers/parse_termite_output.py b/mewtwo/parsers/parse_termite_output.py index 66e9086..5f234c7 100644 --- a/mewtwo/parsers/parse_termite_output.py +++ b/mewtwo/parsers/parse_termite_output.py @@ -1,13 +1,52 @@ from mewtwo.parsers.tabular import Tabular from mewtwo.embeddings.terminator.hairpin import RNAFoldHairpin, TransTermHPHairpin from mewtwo.embeddings.bases import BasePair, Base -from mewtwo.embeddings.terminator.terminator import Terminator -from mewtwo.embeddings.sequence import DNASequence, convert_to_rna +from mewtwo.embeddings.terminator.terminator import Terminator, get_terminator_part_sizes +from mewtwo.embeddings.sequence import DNASequence, convert_to_rna, convert_to_dna from mewtwo.embeddings.terminator.a_tract import ATract from mewtwo.embeddings.terminator.u_tract import UTract +from mewtwo.machine_learning.random_forest.train_random_forest import train_random_forest +from mewtwo.machine_learning.train_test_split import split_data +from mewtwo.machine_learning.prepare_data import terminators_to_ml_input +from mewtwo.machine_learning.neural_network import train_nn from sys import argv +import os from pprint import pprint +from statistics import median + +def termite_to_dnabert_input(input_file: str, output_dir: str, species_column: bool = True) -> None: + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + terminators = get_termite_terminators(input_file, species_column=True, te_only=True) + spec_to_term = sort_by_species(terminators) + + bacillus_terminators = [] + ecoli_terminators = [] + + for spec, term in spec_to_term.items(): + if 'Bacillus' in spec and '(d)' in spec: + bacillus_terminators.extend(term) + elif 'Escherichia' in spec and '(a)' in spec: + ecoli_terminators.extend(term) + + all_out_file = os.path.join(output_dir, 'all.txt') + ecoli_out_file = os.path.join(output_dir, 'ecoli.txt') + bacillus_out_file = os.path.join(output_dir, 'bacillus.txt') + + with open(all_out_file, 'w') as all_out: + with open(ecoli_out_file, 'w') as ecoli_out: + with open(bacillus_out_file, 'w') as bacillus_out: + for terminator in ecoli_terminators: + terminator_data = f"{convert_to_dna(terminator.sequence).sequence}\t{terminator.te}\n" + all_out.write(terminator_data) + ecoli_out.write(terminator_data) + for terminator in bacillus_terminators: + terminator_data = f"{convert_to_dna(terminator.sequence).sequence}\t{terminator.te}\n" + all_out.write(terminator_data) + bacillus_out.write(terminator_data) + def parse_termite_data(input_file: str, species_column: bool) -> Tabular: @@ -19,6 +58,16 @@ def parse_termite_data(input_file: str, species_column: bool) -> Tabular: return termite_data +def sort_by_species(terminators: list[Terminator]) -> dict[str, list[Terminator]]: + + species_to_terminators: dict[str, list[Terminator]] = {} + for terminator in terminators: + if terminator.species not in species_to_terminators: + species_to_terminators[terminator.species] = [] + species_to_terminators[terminator.species].append(terminator) + + return species_to_terminators + def get_termite_terminators(input_file: str, prioritise_rnafold: bool = True, species_column: bool = False, te_only: bool = True) -> list[Terminator]: """ @@ -57,8 +106,11 @@ def get_termite_terminators(input_file: str, prioritise_rnafold: bool = True, sp te = termite_data.get_value(datapoint, "termination efficiency") if te == '.': te = None - else: + elif 0 <= float(te) <= 100: te = float(te) + else: + print(datapoint, te) + te = None if te_only and te is None: continue @@ -167,42 +219,51 @@ def rnafold_hairpins_from_termite(input_file: str, get_rnafold: bool = True, get if __name__ == "__main__": - hairpins = rnafold_hairpins_from_termite(argv[1], get_rnafold=True, get_transtermhp=True, species_column=True) - counter = 0 - basepairs_to_counts_rnafold = {} - basepairs_to_counts_transtermhp = {} - for hairpin in hairpins: - if hairpin.contains_multiple_hairpins(): - counter += 1 - print(hairpin.distance_to_pot, hairpin.free_energy, hairpin.hairpin_sequence, hairpin.hairpin_structure) - - else: - for basepair in hairpin.stem.get_basepairs(): - if hairpin.prediction_software == 'RNAFold': - - if basepair not in basepairs_to_counts_rnafold: - - basepairs_to_counts_rnafold[basepair] = 0 - basepairs_to_counts_rnafold[basepair] += 1 - else: - if basepair not in basepairs_to_counts_transtermhp: - basepairs_to_counts_transtermhp[basepair] = 0 - basepairs_to_counts_transtermhp[basepair] += 1 - if basepair == BasePair(Base.A, Base.A, False): - print(hairpin.hairpin_sequence) - print(hairpin.hairpin_structure) - print(hairpin.distance_to_pot) - - print(f"{counter} structures contain multiple hairpins") - pprint(basepairs_to_counts_rnafold) - pprint(basepairs_to_counts_transtermhp) quantified_terminators = get_termite_terminators(argv[1], species_column=True, te_only=True) - print(len(quantified_terminators)) + species_to_terminators = sort_by_species(quantified_terminators) + bacillus_terminators = [] + ecoli_terminators = [] + + + for species, species_terminators in species_to_terminators.items(): + if 'Bacillus' in species and '(d)' in species: + bacillus_terminators.extend(species_terminators) + elif 'Escherichia' in species and '(a)' in species: + ecoli_terminators.extend(species_terminators) + + all_terminators = bacillus_terminators + ecoli_terminators + train_terminators, test_terminators, crossvalidation_sets = split_data(all_terminators, test_size=0.1) + + for crossval_nr, crossvalidation_set in crossvalidation_sets.items(): + train_random_forest(crossvalidation_set.train, crossvalidation_set.test, one_hot=True) + + # train_terminators, test_terminators, _ = split_data(ecoli_terminators, test_size=0.1) + # + # train_random_forest(train_terminators, test_terminators, one_hot=True) + # + # train_terminators, test_terminators, _ = split_data(all_terminators, test_size=0.1) + # + # train_random_forest(train_terminators, test_terminators, one_hot=True) + + print(min([t.te for t in all_terminators]), max([t.te for t in all_terminators])) + + termite_to_dnabert_input(argv[1], argv[2], species_column=True) + + # train_nn(train_terminators, test_terminators) + + + + + + # for terminator in quantified_terminators: + # print(terminator.te) + # print(terminator.to_vector(max_loop_size=max_loop, max_stem_size=max_stem, a_tract_size=max_a, u_tract_size=max_u)) + # print(terminator.to_vector(max_loop_size=max_loop, max_stem_size=max_stem, a_tract_size=max_a, + # u_tract_size=max_u, one_hot=True)) + + # print(max_loop, max_stem, max_a, max_u) - for terminator in quantified_terminators: - print(terminator.te) - print(terminator.hairpin.stem.to_vector(10)) # all_terminators = get_termite_terminators(argv[1], species_column=True, te_only=False) # print(len(all_terminators)) From d415734fa4be9e0768426453c32475fb36c6157b Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:16:02 +0200 Subject: [PATCH 28/41] Add scikit-learn to workflow --- .github/workflows/pytest.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 8366308..7077ffd 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -36,6 +36,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install pytest + python -m pip install scikit-learn==1.6.1 python -m pip install . - name: Test with pytest run: | From 5e9327a241ea5f5251702d4a5243d6e081c5dca9 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:21:01 +0200 Subject: [PATCH 29/41] Add code for creating train test splits --- mewtwo/machine_learning/train_test_split.py | 166 ++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 mewtwo/machine_learning/train_test_split.py diff --git a/mewtwo/machine_learning/train_test_split.py b/mewtwo/machine_learning/train_test_split.py new file mode 100644 index 0000000..c8d67b1 --- /dev/null +++ b/mewtwo/machine_learning/train_test_split.py @@ -0,0 +1,166 @@ +from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold +from mewtwo.embeddings.terminator.terminator import get_terminator_part_sizes, Terminator +from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data + +import os +from sys import argv +from math import isclose + + +class CrossvalidationFold: + def __init__(self, train: list[Terminator], test: list[Terminator]): + self.train = train + self.test = test + + +def split_data(terminators, test_size: float = 0.5, n_crossval_sets: int = 5): + max_loop, max_stem, max_a, max_u = get_terminator_part_sizes(terminators) + x = [] + y = [] + labels = [] + for terminator in terminators: + x.append(terminator.to_vector(max_loop, max_stem, max_a, max_u)) + y.append(terminator.te) + labels.append(terminator.species) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=250589) + sss.get_n_splits(x, labels) + train_indices, test_indices = next(sss.split(x, labels)) + + train_x = [] + train_terminators = [] + train_labels = [] + + for index in train_indices: + train_x.append(x[index]) + train_terminators.append(terminators[index]) + train_labels.append(labels[index]) + + test_terminators = [] + + for index in test_indices: + test_terminators.append(terminators[index]) + + skf = StratifiedKFold(n_splits=n_crossval_sets, shuffle=True, random_state=100125) + crossvalidation_sets = {} + + for i, (train_i, test_i) in enumerate(skf.split(train_x, train_labels)): + train_terminators_c = [train_terminators[j] for j in train_i] + test_terminators_c = [train_terminators[j] for j in test_i] + + crossvalidation_sets[i] = CrossvalidationFold(train_terminators_c, test_terminators_c) + + return train_terminators, test_terminators, crossvalidation_sets + +def bin_data(y, n_bins: int = 10): + bin_ranges = [] + range_start = 0.0 + step = 1.0 / n_bins + bins = [] + + for i in range(n_bins): + range_end = range_start + step + bin_ranges.append((range_start, range_end)) + range_start = range_end + + for i, y_data in enumerate(y): + bin_index = 0 + bin_range = bin_ranges[bin_index] + + if y_data < 0.0 and not isclose(y_data, 0.0): + raise ValueError(f"Expected value between 0.0 and 1.0. Got {y_data}") + + while bin_range[1] <= y_data and not isclose(y_data, bin_range[1]): + bin_index += 1 + try: + bin_range = bin_ranges[bin_index] + except IndexError: + raise ValueError(f"Expected value between 0.0 and 1.0. Got {y_data}") + + + bins.append(bin_index) + + return bins + +def split_data_from_file(input_file: str, output_dir: str, test_size: float = 0.25, validation_size: float = 0.33333, + nr_stratification_bins: int = 10): + """ + + Parameters + ---------- + input_file: input tabular file with sequences in column one and float in column 2 + output_dir: output directory + test_size: proportion of ALL data + validation_size: proportion of TRAINING data + nr_stratification_bins: number of bins between 0 and 1 to perform stratification on + + Returns + ------- + + """ + + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + train_out = os.path.join(output_dir, 'train.txt') + validation_out = os.path.join(output_dir, 'validation.txt') + test_out = os.path.join(output_dir, 'test.txt') + + x, y = parse_dnabert_data(input_file) + bins = bin_data(y, nr_stratification_bins) + + # Split train and test data + tts = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=250589) + tts.get_n_splits(x, bins) + + seen_indices, test_indices = next(tts.split(x, bins)) + + seen_x = [] + seen_y = [] + seen_bins = [] + + test_x = [] + test_y = [] + + for index in seen_indices: + seen_x.append(x[index]) + seen_y.append(y[index]) + seen_bins.append(bins[index]) + + for index in test_indices: + test_x.append(x[index]) + test_y.append(y[index]) + + tvs = StratifiedShuffleSplit(n_splits=1, test_size=validation_size, random_state=100125) + tvs.get_n_splits(seen_x, seen_bins) + + train_indices, validation_indices = next(tvs.split(seen_x, seen_bins)) + + train_x = [] + train_y = [] + + for index in train_indices: + train_x.append(seen_x[index]) + train_y.append(seen_y[index]) + + validation_x = [] + validation_y = [] + + for index in validation_indices: + validation_x.append(seen_x[index]) + validation_y.append(seen_y[index]) + + out_files = test_out, train_out, validation_out + out_x = test_x, train_x, validation_x + out_y = test_y, train_y, validation_y + + for i, out_file in enumerate(out_files): + + with open(out_file, 'w') as out: + for j, seq in enumerate(out_x[i]): + te = out_y[i][j] + out.write(f"{seq}\t{te}\n") + + +if __name__ == "__main__": + split_data_from_file(argv[1], argv[2]) From 0ebc7078d9fd5d35ae6223a2ce0ef4564f581478 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:22:44 +0200 Subject: [PATCH 30/41] Add mock test data --- .../test/data/mock_data_1.txt | 100 ++++++++++++++++++ .../test/data/mock_data_2.txt | 100 ++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 mewtwo/machine_learning/test/data/mock_data_1.txt create mode 100644 mewtwo/machine_learning/test/data/mock_data_2.txt diff --git a/mewtwo/machine_learning/test/data/mock_data_1.txt b/mewtwo/machine_learning/test/data/mock_data_1.txt new file mode 100644 index 0000000..4ee4e93 --- /dev/null +++ b/mewtwo/machine_learning/test/data/mock_data_1.txt @@ -0,0 +1,100 @@ +AGGA 1.0 +AGGTCC 2.0 +GGTCCGT 3.0 +AGTGGACC 4.0 +GGTGCCCCGT 5.0 +ATGGC 6.0 +AGGA 7.0 +AGGTCC 8.0 +GGTCCGT 9.0 +AGTGGACC 10.0 +GGTGCCCCGT 11.0 +ATGGC 12.0 +AGGA 13.0 +AGGTCC 14.0 +GGTCCGT 15.0 +AGTGGACC 16.0 +GGTGCCCCGT 17.0 +ATGGC 18.0 +AGGA 19.0 +AGGTCC 20.0 +GGTCCGT 21.0 +AGTGGACC 22.0 +GGTGCCCCGT 23.0 +ATGGC 24.0 +AGGA 25.0 +AGGTCC 26.0 +GGTCCGT 27.0 +AGTGGACC 28.0 +GGTGCCCCGT 29.0 +ATGGC 30.0 +AGGA 31.0 +AGGTCC 32.0 +GGTCCGT 33.0 +AGTGGACC 34.0 +GGTGCCCCGT 35.0 +ATGGC 36.0 +AGGA 37.0 +AGGTCC 38.0 +GGTCCGT 39.0 +ATGGC 40.0 +AGGA 41.0 +AGGTCC 42.0 +GGTCCGT 43.0 +AGTGGACC 44.0 +GGTGCCCCGT 45.0 +ATGGC 46.0 +AGGA 47.0 +AGGTCC 48.0 +GGTCCGT 49.0 +ATGGC 50.0 +AGGA 51.0 +AGGTCC 52.0 +GGTCCGT 53.0 +AGTGGACC 54.0 +GGTGCCCCGT 55.0 +ATGGC 56.0 +AGGA 57.0 +AGGTCC 58.0 +GGTCCGT 59.0 +ATGGC 60.0 +AGGA 61.0 +AGGTCC 62.0 +GGTCCGT 63.0 +AGTGGACC 64.0 +GGTGCCCCGT 65.0 +ATGGC 66.0 +AGGA 67.0 +AGGTCC 68.0 +GGTCCGT 69.0 +ATGGC 70.0 +AGGA 71.0 +AGGTCC 72.0 +GGTCCGT 73.0 +AGTGGACC 74.0 +GGTGCCCCGT 75.0 +ATGGC 76.0 +AGGA 77.0 +AGGTCC 78.0 +GGTCCGT 79.0 +ATGGC 80.0 +AGGA 81.0 +AGGTCC 82.0 +GGTCCGT 83.0 +AGTGGACC 84.0 +GGTGCCCCGT 85.0 +ATGGC 86.0 +AGGA 87.0 +AGGTCC 88.0 +GGTCCGT 89.0 +ATGGC 90.0 +AGGA 91.0 +AGGTCC 92.0 +GGTCCGT 93.0 +AGTGGACC 94.0 +GGTGCCCCGT 95.0 +ATGGC 96.0 +AGGA 97.0 +AGGTCC 98.0 +GGTCCGT 99.0 +GGTCCGT 100.0 \ No newline at end of file diff --git a/mewtwo/machine_learning/test/data/mock_data_2.txt b/mewtwo/machine_learning/test/data/mock_data_2.txt new file mode 100644 index 0000000..4ee4e93 --- /dev/null +++ b/mewtwo/machine_learning/test/data/mock_data_2.txt @@ -0,0 +1,100 @@ +AGGA 1.0 +AGGTCC 2.0 +GGTCCGT 3.0 +AGTGGACC 4.0 +GGTGCCCCGT 5.0 +ATGGC 6.0 +AGGA 7.0 +AGGTCC 8.0 +GGTCCGT 9.0 +AGTGGACC 10.0 +GGTGCCCCGT 11.0 +ATGGC 12.0 +AGGA 13.0 +AGGTCC 14.0 +GGTCCGT 15.0 +AGTGGACC 16.0 +GGTGCCCCGT 17.0 +ATGGC 18.0 +AGGA 19.0 +AGGTCC 20.0 +GGTCCGT 21.0 +AGTGGACC 22.0 +GGTGCCCCGT 23.0 +ATGGC 24.0 +AGGA 25.0 +AGGTCC 26.0 +GGTCCGT 27.0 +AGTGGACC 28.0 +GGTGCCCCGT 29.0 +ATGGC 30.0 +AGGA 31.0 +AGGTCC 32.0 +GGTCCGT 33.0 +AGTGGACC 34.0 +GGTGCCCCGT 35.0 +ATGGC 36.0 +AGGA 37.0 +AGGTCC 38.0 +GGTCCGT 39.0 +ATGGC 40.0 +AGGA 41.0 +AGGTCC 42.0 +GGTCCGT 43.0 +AGTGGACC 44.0 +GGTGCCCCGT 45.0 +ATGGC 46.0 +AGGA 47.0 +AGGTCC 48.0 +GGTCCGT 49.0 +ATGGC 50.0 +AGGA 51.0 +AGGTCC 52.0 +GGTCCGT 53.0 +AGTGGACC 54.0 +GGTGCCCCGT 55.0 +ATGGC 56.0 +AGGA 57.0 +AGGTCC 58.0 +GGTCCGT 59.0 +ATGGC 60.0 +AGGA 61.0 +AGGTCC 62.0 +GGTCCGT 63.0 +AGTGGACC 64.0 +GGTGCCCCGT 65.0 +ATGGC 66.0 +AGGA 67.0 +AGGTCC 68.0 +GGTCCGT 69.0 +ATGGC 70.0 +AGGA 71.0 +AGGTCC 72.0 +GGTCCGT 73.0 +AGTGGACC 74.0 +GGTGCCCCGT 75.0 +ATGGC 76.0 +AGGA 77.0 +AGGTCC 78.0 +GGTCCGT 79.0 +ATGGC 80.0 +AGGA 81.0 +AGGTCC 82.0 +GGTCCGT 83.0 +AGTGGACC 84.0 +GGTGCCCCGT 85.0 +ATGGC 86.0 +AGGA 87.0 +AGGTCC 88.0 +GGTCCGT 89.0 +ATGGC 90.0 +AGGA 91.0 +AGGTCC 92.0 +GGTCCGT 93.0 +AGTGGACC 94.0 +GGTGCCCCGT 95.0 +ATGGC 96.0 +AGGA 97.0 +AGGTCC 98.0 +GGTCCGT 99.0 +GGTCCGT 100.0 \ No newline at end of file From 8d4b128936536f6efbdb2590c0995cf1922a1ff2 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Wed, 7 May 2025 11:27:00 +0200 Subject: [PATCH 31/41] Add output folder for testing --- mewtwo/machine_learning/test/output/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mewtwo/machine_learning/test/output/README.md diff --git a/mewtwo/machine_learning/test/output/README.md b/mewtwo/machine_learning/test/output/README.md new file mode 100644 index 0000000..e69de29 From ebbc4a7fa4c1a982dfdd5c19a12e2348a03c33a2 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 8 May 2025 13:02:55 +0200 Subject: [PATCH 32/41] Code refactor: move binning and train test split to separate files --- .../__init__.py} | 0 .../data_preparation/binning.py | 33 +++++++++++++++++ .../test/data/mock_data_1.txt | 0 .../test/data/mock_data_2.txt | 0 .../data_preparation/test/output/README.md | 0 .../data_preparation/test/test_binning.py | 28 +++++++++++++++ .../test/test_train_test_split.py | 26 ++------------ .../train_test_split.py | 35 ++----------------- .../transformer/loss_functions.py | 0 9 files changed, 67 insertions(+), 55 deletions(-) rename mewtwo/machine_learning/{test/output/README.md => data_preparation/__init__.py} (100%) create mode 100644 mewtwo/machine_learning/data_preparation/binning.py rename mewtwo/machine_learning/{ => data_preparation}/test/data/mock_data_1.txt (100%) rename mewtwo/machine_learning/{ => data_preparation}/test/data/mock_data_2.txt (100%) create mode 100644 mewtwo/machine_learning/data_preparation/test/output/README.md create mode 100644 mewtwo/machine_learning/data_preparation/test/test_binning.py rename mewtwo/machine_learning/{ => data_preparation}/test/test_train_test_split.py (56%) rename mewtwo/machine_learning/{ => data_preparation}/train_test_split.py (80%) create mode 100644 mewtwo/machine_learning/transformer/loss_functions.py diff --git a/mewtwo/machine_learning/test/output/README.md b/mewtwo/machine_learning/data_preparation/__init__.py similarity index 100% rename from mewtwo/machine_learning/test/output/README.md rename to mewtwo/machine_learning/data_preparation/__init__.py diff --git a/mewtwo/machine_learning/data_preparation/binning.py b/mewtwo/machine_learning/data_preparation/binning.py new file mode 100644 index 0000000..00eee47 --- /dev/null +++ b/mewtwo/machine_learning/data_preparation/binning.py @@ -0,0 +1,33 @@ +from math import isclose + + + +def bin_data(y, n_bins: int = 10): + bin_ranges = [] + range_start = 0.0 + step = 1.0 / n_bins + bins = [] + + for i in range(n_bins): + range_end = range_start + step + bin_ranges.append((range_start, range_end)) + range_start = range_end + + for i, y_data in enumerate(y): + bin_index = 0 + bin_range = bin_ranges[bin_index] + + if y_data < 0.0 and not isclose(y_data, 0.0): + raise ValueError(f"Expected value between 0.0 and 1.0. Got {y_data}") + + while bin_range[1] <= y_data and not isclose(y_data, bin_range[1]): + bin_index += 1 + try: + bin_range = bin_ranges[bin_index] + except IndexError: + raise ValueError(f"Expected value between 0.0 and 1.0. Got {y_data}") + + + bins.append(bin_index) + + return bins \ No newline at end of file diff --git a/mewtwo/machine_learning/test/data/mock_data_1.txt b/mewtwo/machine_learning/data_preparation/test/data/mock_data_1.txt similarity index 100% rename from mewtwo/machine_learning/test/data/mock_data_1.txt rename to mewtwo/machine_learning/data_preparation/test/data/mock_data_1.txt diff --git a/mewtwo/machine_learning/test/data/mock_data_2.txt b/mewtwo/machine_learning/data_preparation/test/data/mock_data_2.txt similarity index 100% rename from mewtwo/machine_learning/test/data/mock_data_2.txt rename to mewtwo/machine_learning/data_preparation/test/data/mock_data_2.txt diff --git a/mewtwo/machine_learning/data_preparation/test/output/README.md b/mewtwo/machine_learning/data_preparation/test/output/README.md new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/data_preparation/test/test_binning.py b/mewtwo/machine_learning/data_preparation/test/test_binning.py new file mode 100644 index 0000000..0296dc4 --- /dev/null +++ b/mewtwo/machine_learning/data_preparation/test/test_binning.py @@ -0,0 +1,28 @@ +import unittest + +from mewtwo.machine_learning.data_preparation.binning import bin_data + +class TestBinning(unittest.TestCase): + def test_bin_data(self): + dummy_data_1 = [0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] + dummy_bins_1 = [0, 0, 0, 1, 1, 8, 9, 9] + + self.assertEqual(bin_data(dummy_data_1), dummy_bins_1) + + dummy_data_2 = [1.0, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] + dummy_bins_2 = [9, 0, 0, 0, 1, 1, 8, 9, 9] + + self.assertEqual(bin_data(dummy_data_2), dummy_bins_2) + + dummy_data_3 = [1.1, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] + + with self.assertRaises(ValueError): + bin_data(dummy_data_3) + + dummy_data_4 = [-0.1, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] + + with self.assertRaises(ValueError): + bin_data(dummy_data_4) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/mewtwo/machine_learning/test/test_train_test_split.py b/mewtwo/machine_learning/data_preparation/test/test_train_test_split.py similarity index 56% rename from mewtwo/machine_learning/test/test_train_test_split.py rename to mewtwo/machine_learning/data_preparation/test/test_train_test_split.py index 57a58e2..e208e1b 100644 --- a/mewtwo/machine_learning/test/test_train_test_split.py +++ b/mewtwo/machine_learning/data_preparation/test/test_train_test_split.py @@ -3,33 +3,13 @@ from shutil import rmtree -from mewtwo.machine_learning.train_test_split import split_data, split_data_from_file, bin_data +from mewtwo.machine_learning.data_preparation.train_test_split import split_data_from_file from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data BASE_DIR = os.path.dirname(__file__) class TestTrainTestSplit(unittest.TestCase): - def test_bin_data(self): - dummy_data_1 = [0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] - dummy_bins_1 = [0, 0, 0, 1, 1, 8, 9, 9] - - self.assertEqual(bin_data(dummy_data_1), dummy_bins_1) - - dummy_data_2 = [1.0, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] - dummy_bins_2 = [9, 0, 0, 0, 1, 1, 8, 9, 9] - - self.assertEqual(bin_data(dummy_data_2), dummy_bins_2) - - dummy_data_3 = [1.1, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] - - with self.assertRaises(ValueError): - bin_data(dummy_data_3) - - dummy_data_4 = [-0.1, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] - - with self.assertRaises(ValueError): - bin_data(dummy_data_4) def test_split_data_from_file(self): @@ -37,8 +17,8 @@ def test_split_data_from_file(self): test_data_1 = os.path.abspath(os.path.join(BASE_DIR, 'data', 'mock_data_1.txt')) test_output_1 = os.path.abspath(os.path.join(BASE_DIR, 'output', 'mock_output_1')) - test_data_2 = os.path.abspath(os.path.join(BASE_DIR, 'data', 'mock_data_2.txt')) - test_output_2 = os.path.abspath(os.path.join(BASE_DIR, 'output', 'mock_output_2')) + # test_data_2 = os.path.abspath(os.path.join(BASE_DIR, 'data', 'mock_data_2.txt')) + # test_output_2 = os.path.abspath(os.path.join(BASE_DIR, 'output', 'mock_output_2')) split_data_from_file(test_data_1, test_output_1) diff --git a/mewtwo/machine_learning/train_test_split.py b/mewtwo/machine_learning/data_preparation/train_test_split.py similarity index 80% rename from mewtwo/machine_learning/train_test_split.py rename to mewtwo/machine_learning/data_preparation/train_test_split.py index c8d67b1..abe5519 100644 --- a/mewtwo/machine_learning/train_test_split.py +++ b/mewtwo/machine_learning/data_preparation/train_test_split.py @@ -1,10 +1,10 @@ from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold from mewtwo.embeddings.terminator.terminator import get_terminator_part_sizes, Terminator from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data +from mewtwo.machine_learning.data_preparation.binning import bin_data import os from sys import argv -from math import isclose class CrossvalidationFold: @@ -52,38 +52,9 @@ def split_data(terminators, test_size: float = 0.5, n_crossval_sets: int = 5): return train_terminators, test_terminators, crossvalidation_sets -def bin_data(y, n_bins: int = 10): - bin_ranges = [] - range_start = 0.0 - step = 1.0 / n_bins - bins = [] - - for i in range(n_bins): - range_end = range_start + step - bin_ranges.append((range_start, range_end)) - range_start = range_end - - for i, y_data in enumerate(y): - bin_index = 0 - bin_range = bin_ranges[bin_index] - - if y_data < 0.0 and not isclose(y_data, 0.0): - raise ValueError(f"Expected value between 0.0 and 1.0. Got {y_data}") - - while bin_range[1] <= y_data and not isclose(y_data, bin_range[1]): - bin_index += 1 - try: - bin_range = bin_ranges[bin_index] - except IndexError: - raise ValueError(f"Expected value between 0.0 and 1.0. Got {y_data}") - - - bins.append(bin_index) - - return bins def split_data_from_file(input_file: str, output_dir: str, test_size: float = 0.25, validation_size: float = 0.33333, - nr_stratification_bins: int = 10): + nr_stratification_bins: int = 5): """ Parameters @@ -159,7 +130,7 @@ def split_data_from_file(input_file: str, output_dir: str, test_size: float = 0. with open(out_file, 'w') as out: for j, seq in enumerate(out_x[i]): te = out_y[i][j] - out.write(f"{seq}\t{te}\n") + out.write(f"{seq}\t{te * 100}\n") if __name__ == "__main__": diff --git a/mewtwo/machine_learning/transformer/loss_functions.py b/mewtwo/machine_learning/transformer/loss_functions.py new file mode 100644 index 0000000..e69de29 From c8f4848adda091d950cc1f9e57d50820b4a07537 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 8 May 2025 13:14:30 +0200 Subject: [PATCH 33/41] Turn TE percentage into number from 0-1 when reading in termite data --- .../test/data/mock_data_1.txt | 200 +++++++++--------- .../test/data/mock_data_2.txt | 100 --------- .../test/test_train_test_split.py | 3 - .../data_preparation/train_test_split.py | 2 +- mewtwo/parsers/parse_termite_output.py | 16 +- .../parsers/test/test_parse_termite_output.py | 2 - 6 files changed, 107 insertions(+), 216 deletions(-) delete mode 100644 mewtwo/machine_learning/data_preparation/test/data/mock_data_2.txt diff --git a/mewtwo/machine_learning/data_preparation/test/data/mock_data_1.txt b/mewtwo/machine_learning/data_preparation/test/data/mock_data_1.txt index 4ee4e93..28d9bfc 100644 --- a/mewtwo/machine_learning/data_preparation/test/data/mock_data_1.txt +++ b/mewtwo/machine_learning/data_preparation/test/data/mock_data_1.txt @@ -1,100 +1,100 @@ -AGGA 1.0 -AGGTCC 2.0 -GGTCCGT 3.0 -AGTGGACC 4.0 -GGTGCCCCGT 5.0 -ATGGC 6.0 -AGGA 7.0 -AGGTCC 8.0 -GGTCCGT 9.0 -AGTGGACC 10.0 -GGTGCCCCGT 11.0 -ATGGC 12.0 -AGGA 13.0 -AGGTCC 14.0 -GGTCCGT 15.0 -AGTGGACC 16.0 -GGTGCCCCGT 17.0 -ATGGC 18.0 -AGGA 19.0 -AGGTCC 20.0 -GGTCCGT 21.0 -AGTGGACC 22.0 -GGTGCCCCGT 23.0 -ATGGC 24.0 -AGGA 25.0 -AGGTCC 26.0 -GGTCCGT 27.0 -AGTGGACC 28.0 -GGTGCCCCGT 29.0 -ATGGC 30.0 -AGGA 31.0 -AGGTCC 32.0 -GGTCCGT 33.0 -AGTGGACC 34.0 -GGTGCCCCGT 35.0 -ATGGC 36.0 -AGGA 37.0 -AGGTCC 38.0 -GGTCCGT 39.0 -ATGGC 40.0 -AGGA 41.0 -AGGTCC 42.0 -GGTCCGT 43.0 -AGTGGACC 44.0 -GGTGCCCCGT 45.0 -ATGGC 46.0 -AGGA 47.0 -AGGTCC 48.0 -GGTCCGT 49.0 -ATGGC 50.0 -AGGA 51.0 -AGGTCC 52.0 -GGTCCGT 53.0 -AGTGGACC 54.0 -GGTGCCCCGT 55.0 -ATGGC 56.0 -AGGA 57.0 -AGGTCC 58.0 -GGTCCGT 59.0 -ATGGC 60.0 -AGGA 61.0 -AGGTCC 62.0 -GGTCCGT 63.0 -AGTGGACC 64.0 -GGTGCCCCGT 65.0 -ATGGC 66.0 -AGGA 67.0 -AGGTCC 68.0 -GGTCCGT 69.0 -ATGGC 70.0 -AGGA 71.0 -AGGTCC 72.0 -GGTCCGT 73.0 -AGTGGACC 74.0 -GGTGCCCCGT 75.0 -ATGGC 76.0 -AGGA 77.0 -AGGTCC 78.0 -GGTCCGT 79.0 -ATGGC 80.0 -AGGA 81.0 -AGGTCC 82.0 -GGTCCGT 83.0 -AGTGGACC 84.0 -GGTGCCCCGT 85.0 -ATGGC 86.0 -AGGA 87.0 -AGGTCC 88.0 -GGTCCGT 89.0 -ATGGC 90.0 -AGGA 91.0 -AGGTCC 92.0 -GGTCCGT 93.0 -AGTGGACC 94.0 -GGTGCCCCGT 95.0 -ATGGC 96.0 -AGGA 97.0 -AGGTCC 98.0 -GGTCCGT 99.0 -GGTCCGT 100.0 \ No newline at end of file +AGGA 0.01 +AGGTCC 0.02 +GGTCCGT 0.03 +AGTGGACC 0.04 +GGTGCCCCGT 0.05 +ATGGC 0.06 +AGGA 0.07 +AGGTCC 0.08 +GGTCCGT 0.09 +AGTGGACC 0.10 +GGTGCCCCGT 0.11 +ATGGC 0.12 +AGGA 0.13 +AGGTCC 0.14 +GGTCCGT 0.15 +AGTGGACC 0.16 +GGTGCCCCGT 0.17 +ATGGC 0.18 +AGGA 0.19 +AGGTCC 0.20 +GGTCCGT 0.21 +AGTGGACC 0.22 +GGTGCCCCGT 0.23 +ATGGC 0.24 +AGGA 0.25 +AGGTCC 0.26 +GGTCCGT 0.27 +AGTGGACC 0.28 +GGTGCCCCGT 0.29 +ATGGC 0.30 +AGGA 0.31 +AGGTCC 0.32 +GGTCCGT 0.33 +AGTGGACC 0.34 +GGTGCCCCGT 0.35 +ATGGC 0.36 +AGGA 0.37 +AGGTCC 0.38 +GGTCCGT 0.39 +ATGGC 0.40 +AGGA 0.41 +AGGTCC 0.42 +GGTCCGT 0.43 +AGTGGACC 0.44 +GGTGCCCCGT 0.45 +ATGGC 0.46 +AGGA 0.47 +AGGTCC 0.48 +GGTCCGT 0.49 +ATGGC 0.50 +AGGA 0.51 +AGGTCC 0.52 +GGTCCGT 0.53 +AGTGGACC 0.54 +GGTGCCCCGT 0.55 +ATGGC 0.56 +AGGA 0.57 +AGGTCC 0.58 +GGTCCGT 0.59 +ATGGC 0.60 +AGGA 0.61 +AGGTCC 0.62 +GGTCCGT 0.63 +AGTGGACC 0.64 +GGTGCCCCGT 0.65 +ATGGC 0.66 +AGGA 0.67 +AGGTCC 0.68 +GGTCCGT 0.69 +ATGGC 0.70 +AGGA 0.71 +AGGTCC 0.72 +GGTCCGT 0.73 +AGTGGACC 0.74 +GGTGCCCCGT 0.75 +ATGGC 0.76 +AGGA 0.77 +AGGTCC 0.78 +GGTCCGT 0.79 +ATGGC 0.80 +AGGA 0.81 +AGGTCC 0.82 +GGTCCGT 0.83 +AGTGGACC 0.84 +GGTGCCCCGT 0.85 +ATGGC 0.86 +AGGA 0.87 +AGGTCC 0.88 +GGTCCGT 0.89 +ATGGC 0.90 +AGGA 0.91 +AGGTCC 0.92 +GGTCCGT 0.93 +AGTGGACC 0.94 +GGTGCCCCGT 0.95 +ATGGC 0.96 +AGGA 0.97 +AGGTCC 0.98 +GGTCCGT 0.99 +GGTCCGT 1.00 \ No newline at end of file diff --git a/mewtwo/machine_learning/data_preparation/test/data/mock_data_2.txt b/mewtwo/machine_learning/data_preparation/test/data/mock_data_2.txt deleted file mode 100644 index 4ee4e93..0000000 --- a/mewtwo/machine_learning/data_preparation/test/data/mock_data_2.txt +++ /dev/null @@ -1,100 +0,0 @@ -AGGA 1.0 -AGGTCC 2.0 -GGTCCGT 3.0 -AGTGGACC 4.0 -GGTGCCCCGT 5.0 -ATGGC 6.0 -AGGA 7.0 -AGGTCC 8.0 -GGTCCGT 9.0 -AGTGGACC 10.0 -GGTGCCCCGT 11.0 -ATGGC 12.0 -AGGA 13.0 -AGGTCC 14.0 -GGTCCGT 15.0 -AGTGGACC 16.0 -GGTGCCCCGT 17.0 -ATGGC 18.0 -AGGA 19.0 -AGGTCC 20.0 -GGTCCGT 21.0 -AGTGGACC 22.0 -GGTGCCCCGT 23.0 -ATGGC 24.0 -AGGA 25.0 -AGGTCC 26.0 -GGTCCGT 27.0 -AGTGGACC 28.0 -GGTGCCCCGT 29.0 -ATGGC 30.0 -AGGA 31.0 -AGGTCC 32.0 -GGTCCGT 33.0 -AGTGGACC 34.0 -GGTGCCCCGT 35.0 -ATGGC 36.0 -AGGA 37.0 -AGGTCC 38.0 -GGTCCGT 39.0 -ATGGC 40.0 -AGGA 41.0 -AGGTCC 42.0 -GGTCCGT 43.0 -AGTGGACC 44.0 -GGTGCCCCGT 45.0 -ATGGC 46.0 -AGGA 47.0 -AGGTCC 48.0 -GGTCCGT 49.0 -ATGGC 50.0 -AGGA 51.0 -AGGTCC 52.0 -GGTCCGT 53.0 -AGTGGACC 54.0 -GGTGCCCCGT 55.0 -ATGGC 56.0 -AGGA 57.0 -AGGTCC 58.0 -GGTCCGT 59.0 -ATGGC 60.0 -AGGA 61.0 -AGGTCC 62.0 -GGTCCGT 63.0 -AGTGGACC 64.0 -GGTGCCCCGT 65.0 -ATGGC 66.0 -AGGA 67.0 -AGGTCC 68.0 -GGTCCGT 69.0 -ATGGC 70.0 -AGGA 71.0 -AGGTCC 72.0 -GGTCCGT 73.0 -AGTGGACC 74.0 -GGTGCCCCGT 75.0 -ATGGC 76.0 -AGGA 77.0 -AGGTCC 78.0 -GGTCCGT 79.0 -ATGGC 80.0 -AGGA 81.0 -AGGTCC 82.0 -GGTCCGT 83.0 -AGTGGACC 84.0 -GGTGCCCCGT 85.0 -ATGGC 86.0 -AGGA 87.0 -AGGTCC 88.0 -GGTCCGT 89.0 -ATGGC 90.0 -AGGA 91.0 -AGGTCC 92.0 -GGTCCGT 93.0 -AGTGGACC 94.0 -GGTGCCCCGT 95.0 -ATGGC 96.0 -AGGA 97.0 -AGGTCC 98.0 -GGTCCGT 99.0 -GGTCCGT 100.0 \ No newline at end of file diff --git a/mewtwo/machine_learning/data_preparation/test/test_train_test_split.py b/mewtwo/machine_learning/data_preparation/test/test_train_test_split.py index e208e1b..5a59e71 100644 --- a/mewtwo/machine_learning/data_preparation/test/test_train_test_split.py +++ b/mewtwo/machine_learning/data_preparation/test/test_train_test_split.py @@ -17,9 +17,6 @@ def test_split_data_from_file(self): test_data_1 = os.path.abspath(os.path.join(BASE_DIR, 'data', 'mock_data_1.txt')) test_output_1 = os.path.abspath(os.path.join(BASE_DIR, 'output', 'mock_output_1')) - # test_data_2 = os.path.abspath(os.path.join(BASE_DIR, 'data', 'mock_data_2.txt')) - # test_output_2 = os.path.abspath(os.path.join(BASE_DIR, 'output', 'mock_output_2')) - split_data_from_file(test_data_1, test_output_1) self.assertEqual(os.path.exists(test_output_1), True) diff --git a/mewtwo/machine_learning/data_preparation/train_test_split.py b/mewtwo/machine_learning/data_preparation/train_test_split.py index abe5519..ba8ba1f 100644 --- a/mewtwo/machine_learning/data_preparation/train_test_split.py +++ b/mewtwo/machine_learning/data_preparation/train_test_split.py @@ -130,7 +130,7 @@ def split_data_from_file(input_file: str, output_dir: str, test_size: float = 0. with open(out_file, 'w') as out: for j, seq in enumerate(out_x[i]): te = out_y[i][j] - out.write(f"{seq}\t{te * 100}\n") + out.write(f"{seq}\t{te}\n") if __name__ == "__main__": diff --git a/mewtwo/parsers/parse_termite_output.py b/mewtwo/parsers/parse_termite_output.py index 5f234c7..8a5a726 100644 --- a/mewtwo/parsers/parse_termite_output.py +++ b/mewtwo/parsers/parse_termite_output.py @@ -1,25 +1,21 @@ from mewtwo.parsers.tabular import Tabular from mewtwo.embeddings.terminator.hairpin import RNAFoldHairpin, TransTermHPHairpin -from mewtwo.embeddings.bases import BasePair, Base -from mewtwo.embeddings.terminator.terminator import Terminator, get_terminator_part_sizes +from mewtwo.embeddings.terminator.terminator import Terminator from mewtwo.embeddings.sequence import DNASequence, convert_to_rna, convert_to_dna from mewtwo.embeddings.terminator.a_tract import ATract from mewtwo.embeddings.terminator.u_tract import UTract from mewtwo.machine_learning.random_forest.train_random_forest import train_random_forest -from mewtwo.machine_learning.train_test_split import split_data -from mewtwo.machine_learning.prepare_data import terminators_to_ml_input -from mewtwo.machine_learning.neural_network import train_nn +from mewtwo.machine_learning.data_preparation.train_test_split import split_data from sys import argv import os -from pprint import pprint -from statistics import median + def termite_to_dnabert_input(input_file: str, output_dir: str, species_column: bool = True) -> None: if not os.path.exists(output_dir): os.mkdir(output_dir) - terminators = get_termite_terminators(input_file, species_column=True, te_only=True) + terminators = get_termite_terminators(input_file, species_column=species_column, te_only=True) spec_to_term = sort_by_species(terminators) bacillus_terminators = [] @@ -39,11 +35,11 @@ def termite_to_dnabert_input(input_file: str, output_dir: str, species_column: b with open(ecoli_out_file, 'w') as ecoli_out: with open(bacillus_out_file, 'w') as bacillus_out: for terminator in ecoli_terminators: - terminator_data = f"{convert_to_dna(terminator.sequence).sequence}\t{terminator.te}\n" + terminator_data = f"{convert_to_dna(terminator.sequence).sequence}\t{terminator.te / 100}\n" all_out.write(terminator_data) ecoli_out.write(terminator_data) for terminator in bacillus_terminators: - terminator_data = f"{convert_to_dna(terminator.sequence).sequence}\t{terminator.te}\n" + terminator_data = f"{convert_to_dna(terminator.sequence).sequence}\t{terminator.te / 100}\n" all_out.write(terminator_data) bacillus_out.write(terminator_data) diff --git a/mewtwo/parsers/test/test_parse_termite_output.py b/mewtwo/parsers/test/test_parse_termite_output.py index e9d4fb7..1c35782 100644 --- a/mewtwo/parsers/test/test_parse_termite_output.py +++ b/mewtwo/parsers/test/test_parse_termite_output.py @@ -1,6 +1,4 @@ import unittest -from mewtwo.parsers.parse_termite_output import RNAFoldHairpin, TransTermHPHairpin - if __name__ == '__main__': unittest.main() \ No newline at end of file From a0919d5443e3fd8c85f4d58ccb2e8a7cd7ed0c7e Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 8 May 2025 13:15:51 +0200 Subject: [PATCH 34/41] Change DNABert data parser to remove 100 division --- mewtwo/parsers/parse_dnabert_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mewtwo/parsers/parse_dnabert_data.py b/mewtwo/parsers/parse_dnabert_data.py index da32154..651fc64 100644 --- a/mewtwo/parsers/parse_dnabert_data.py +++ b/mewtwo/parsers/parse_dnabert_data.py @@ -1,3 +1,4 @@ + def parse_dnabert_data(input_file: str) -> tuple[list[str], list[float]]: seqs = [] tes = [] @@ -7,5 +8,5 @@ def parse_dnabert_data(input_file: str) -> tuple[list[str], list[float]]: seq, te = line.split('\t') te = float(te) seqs.append(seq) - tes.append(te / 100) + tes.append(te) return seqs, tes From 5532edfb86144b5ec7625a3aa47fd63956397d99 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Fri, 30 May 2025 17:00:59 +0200 Subject: [PATCH 35/41] Backup --- mewtwo/embeddings/feature_labels.py | 1 + mewtwo/machine_learning/analysis/__init__.py | 0 .../analysis/get_performance_rf.py | 37 +++ .../data_preparation/binning.py | 2 +- .../calculate_sample_weights.py | 22 ++ .../data_preparation/test/test_binning.py | 4 +- .../test/test_calculate_sample_weights.py | 27 ++ .../feature_inference/__init__.py | 0 .../feature_inference/infer_features_rf.py | 43 +++ .../random_forest/train_random_forest.py | 28 +- .../transformer/config/__init__.py | 0 .../transformer/config/config_types.py | 39 +++ .../transformer/dnabert_regressor.py | 34 ++- .../transformer/finetune_bert.py | 269 +++++++++++++++++- .../hyperparameter_optimization.py | 5 + .../transformer/load_model.py | 45 +++ .../transformer/loss_functions.py | 129 +++++++++ .../{prepare_data.py => regressor_dataset.py} | 11 +- .../transformer/schedulers.py | 6 + mewtwo/parsers/parse_feature_file.py | 14 + mewtwo/parsers/parse_model_config.py | 200 +++++++++++++ mewtwo/parsers/parse_termite_output.py | 9 +- mewtwo/writers/__init__.py | 0 mewtwo/writers/write_feature_importances.py | 25 ++ mewtwo/writers/write_model_settings.py | 100 +++++++ 25 files changed, 1018 insertions(+), 32 deletions(-) create mode 100644 mewtwo/machine_learning/analysis/__init__.py create mode 100644 mewtwo/machine_learning/analysis/get_performance_rf.py create mode 100644 mewtwo/machine_learning/data_preparation/calculate_sample_weights.py create mode 100644 mewtwo/machine_learning/data_preparation/test/test_calculate_sample_weights.py create mode 100644 mewtwo/machine_learning/feature_inference/__init__.py create mode 100644 mewtwo/machine_learning/feature_inference/infer_features_rf.py create mode 100644 mewtwo/machine_learning/transformer/config/__init__.py create mode 100644 mewtwo/machine_learning/transformer/config/config_types.py create mode 100644 mewtwo/machine_learning/transformer/hyperparameter_optimization.py create mode 100644 mewtwo/machine_learning/transformer/load_model.py rename mewtwo/machine_learning/transformer/{prepare_data.py => regressor_dataset.py} (73%) create mode 100644 mewtwo/machine_learning/transformer/schedulers.py create mode 100644 mewtwo/parsers/parse_feature_file.py create mode 100644 mewtwo/parsers/parse_model_config.py create mode 100644 mewtwo/writers/__init__.py create mode 100644 mewtwo/writers/write_feature_importances.py create mode 100644 mewtwo/writers/write_model_settings.py diff --git a/mewtwo/embeddings/feature_labels.py b/mewtwo/embeddings/feature_labels.py index 7e0fe13..1dd4ef6 100644 --- a/mewtwo/embeddings/feature_labels.py +++ b/mewtwo/embeddings/feature_labels.py @@ -4,6 +4,7 @@ from mewtwo.embeddings.bases import Base from enum import Enum + class FeatureType(Enum): IS_A = 1 IS_C = 2 diff --git a/mewtwo/machine_learning/analysis/__init__.py b/mewtwo/machine_learning/analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/analysis/get_performance_rf.py b/mewtwo/machine_learning/analysis/get_performance_rf.py new file mode 100644 index 0000000..9340196 --- /dev/null +++ b/mewtwo/machine_learning/analysis/get_performance_rf.py @@ -0,0 +1,37 @@ +import os +from sys import argv +from statistics import mean, stdev + +from mewtwo.data_processing.iterate_over_dir import iterate_over_dir + + +def write_average_performance(crossvalidation_dir: str, out_file: str) -> None: + test_scores: list[float] = [] + pearsons: list[float] = [] + spearmans: list[float] = [] + + for folder_name, folder_path in iterate_over_dir(crossvalidation_dir, get_dirs=True): + if 'crossvalidation_results' in folder_name: + performance_file = os.path.join(folder_path, "performance.txt") + with open(performance_file, 'r') as performances: + performances.readline() + test_score, pearson, spearman = performances.readline().strip().split('\t') + test_scores.append(float(test_score)) + pearsons.append(float(pearson)) + spearmans.append(float(spearman)) + + av_test = mean(test_scores) + av_pearson = mean(pearsons) + av_spearman = mean(spearmans) + + stdev_test = stdev(test_scores) + stdev_pearson = stdev(pearsons) + stdev_spearman = stdev(spearmans) + + with open(out_file, 'w') as out: + out.write(f"\ttest_score\tpearson\tspearman\n") + out.write(f"Mean\t{av_test}\t{av_pearson}\t{av_spearman}\n") + out.write(f"Stdev\t{stdev_test}\t{stdev_pearson}\t{stdev_spearman}\n") + +if __name__ == "__main__": + write_average_performance(argv[1], argv[2]) diff --git a/mewtwo/machine_learning/data_preparation/binning.py b/mewtwo/machine_learning/data_preparation/binning.py index 00eee47..e62661a 100644 --- a/mewtwo/machine_learning/data_preparation/binning.py +++ b/mewtwo/machine_learning/data_preparation/binning.py @@ -2,7 +2,7 @@ -def bin_data(y, n_bins: int = 10): +def bin_data(y, n_bins: int = 5): bin_ranges = [] range_start = 0.0 step = 1.0 / n_bins diff --git a/mewtwo/machine_learning/data_preparation/calculate_sample_weights.py b/mewtwo/machine_learning/data_preparation/calculate_sample_weights.py new file mode 100644 index 0000000..ef740f1 --- /dev/null +++ b/mewtwo/machine_learning/data_preparation/calculate_sample_weights.py @@ -0,0 +1,22 @@ +from statistics import mean + +from mewtwo.machine_learning.data_preparation.binning import bin_data + + +def get_sample_weights(targets: list[float], n_bins=5) -> list[float]: + bins = bin_data(targets, n_bins=n_bins) + weights = [] + for i in range(len(targets)): + count = bins.count(bins[i]) + bin_weight = 1.0 / (count + 1e-6) + + weights.append(bin_weight) + + normalized_weights = [] + + for weight in weights: + normalized_weights.append(weight / mean(weights)) + + return normalized_weights + + diff --git a/mewtwo/machine_learning/data_preparation/test/test_binning.py b/mewtwo/machine_learning/data_preparation/test/test_binning.py index 0296dc4..5944729 100644 --- a/mewtwo/machine_learning/data_preparation/test/test_binning.py +++ b/mewtwo/machine_learning/data_preparation/test/test_binning.py @@ -7,12 +7,12 @@ def test_bin_data(self): dummy_data_1 = [0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] dummy_bins_1 = [0, 0, 0, 1, 1, 8, 9, 9] - self.assertEqual(bin_data(dummy_data_1), dummy_bins_1) + self.assertEqual(bin_data(dummy_data_1, n_bins=10), dummy_bins_1) dummy_data_2 = [1.0, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] dummy_bins_2 = [9, 0, 0, 0, 1, 1, 8, 9, 9] - self.assertEqual(bin_data(dummy_data_2), dummy_bins_2) + self.assertEqual(bin_data(dummy_data_2, n_bins=10), dummy_bins_2) dummy_data_3 = [1.1, 0.0, 0.05, 0.1, 0.11, 0.15, 0.9, 0.95, 1.0] diff --git a/mewtwo/machine_learning/data_preparation/test/test_calculate_sample_weights.py b/mewtwo/machine_learning/data_preparation/test/test_calculate_sample_weights.py new file mode 100644 index 0000000..30f5305 --- /dev/null +++ b/mewtwo/machine_learning/data_preparation/test/test_calculate_sample_weights.py @@ -0,0 +1,27 @@ +import unittest +from math import isclose + +from mewtwo.machine_learning.data_preparation.calculate_sample_weights import get_sample_weights + +class CalculateSampleWeights(unittest.TestCase): + def test_get_sample_weights(self): + dataset_1 = [0.1, 0.3, 0.5, 0.7, 0.9] + weights_1 = [1.0, 1.0, 1.0, 1.0, 1.0] + + self.assertNearlyEqual(weights_1, get_sample_weights(dataset_1)) + + dataset_2 = [0.1, 0.1, 0.3, 0.3, 0.5, 0.5, 0.5, 0.7, 0.9, 0.9] + weights_2 = [1.0, 1.0, 1.0, 1.0, 0.66666, 0.666666, 0.66666, 2.0, 1.0, 1.0] + + self.assertNearlyEqual(weights_2, get_sample_weights(dataset_2)) + + def assertNearlyEqual(self, list_1, list_2): + for i, element_1 in enumerate(list_1): + element_2 = list_2[i] + if not isclose(element_1, element_2, abs_tol=0.00001): + self.fail(f"Lists are not equal: {list_1}, {list_2}. \n First mismatching element: {i} ([{element_1}], [{element_2}])") + + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/mewtwo/machine_learning/feature_inference/__init__.py b/mewtwo/machine_learning/feature_inference/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/feature_inference/infer_features_rf.py b/mewtwo/machine_learning/feature_inference/infer_features_rf.py new file mode 100644 index 0000000..3d90b3c --- /dev/null +++ b/mewtwo/machine_learning/feature_inference/infer_features_rf.py @@ -0,0 +1,43 @@ +import os +from statistics import mean, stdev + +from mewtwo.parsers.parse_feature_file import parse_feature_file +from mewtwo.data_processing.iterate_over_dir import iterate_over_dir +from mewtwo.writers.write_feature_importances import write_feature_importances + + +def get_average_features_from_crossvalidation(input_folder, get_stdev=True): + feature_to_importance_list = {} + feature_to_stdev = {} + for folder_name, folder_path in iterate_over_dir(input_folder, get_dirs=True): + if 'crossvalidation_results' in folder_name: + feature_file = os.path.join(folder_path, "feature_importances.txt") + feature_to_importance = parse_feature_file(feature_file) + for feature, importance in feature_to_importance.items(): + if feature not in feature_to_importance_list: + feature_to_importance_list[feature] = [] + + feature_to_importance_list[feature].append(importance) + feature_to_average_importance = {} + + for feature, importances in feature_to_importance_list.items(): + if not importances: + raise ValueError(f"No importance values found for feature {feature}") + elif get_stdev and len(importances) < 2: + raise ValueError(f"Too few importance values for feature {feature} to calculate stdev") + feature_to_average_importance[feature] = mean(importances) + if get_stdev: + feature_to_stdev[feature] = stdev(importances) + + if get_stdev: + return feature_to_average_importance, feature_to_stdev + + else: + return feature_to_average_importance + + +def write_average_importances(input_folder: str, out_file: str) -> None: + feature_to_average_importance, feature_to_stdev = get_average_features_from_crossvalidation(input_folder) + write_feature_importances(feature_to_average_importance, out_file, feature_to_stdev=feature_to_stdev, + sort_by_importance=True) + diff --git a/mewtwo/machine_learning/random_forest/train_random_forest.py b/mewtwo/machine_learning/random_forest/train_random_forest.py index 693f6ff..24c4e33 100644 --- a/mewtwo/machine_learning/random_forest/train_random_forest.py +++ b/mewtwo/machine_learning/random_forest/train_random_forest.py @@ -1,10 +1,17 @@ +import os + from sklearn.ensemble import RandomForestRegressor +from scipy.stats import pearsonr, spearmanr from mewtwo.embeddings.terminator.terminator import get_terminator_part_sizes from mewtwo.embeddings.feature_labels import FeatureLabel -def train_random_forest(train_terminators, test_terminators, one_hot=False): +def train_random_forest(train_terminators, test_terminators, out_dir=None, one_hot=True): + + features_out = os.path.join(out_dir, "feature_importances.txt") + performance_out = os.path.join(out_dir, "performance.txt") + all_terminators = train_terminators + test_terminators max_loop, max_stem, max_a, max_u = get_terminator_part_sizes(all_terminators) train_x = [] @@ -37,6 +44,25 @@ def train_random_forest(train_terminators, test_terminators, one_hot=False): print(label.feature_category, label.feature_type, importance, label.base_index, label.base_identity) print("test score", random_forest.score(test_x, test_y)) + print("Pearson correlation: ", pearsonr(random_forest.predict(test_x), test_y)) + + if out_dir is not None: + if not os.path.exists(out_dir): + os.mkdir(out_dir) + + with open(features_out, 'w') as features: + features.write("feature_name\tfeature_importance\n") + for importance, label in importance_and_label: + if label.feature_category == 'stem': + index = f"basepair_{label.base_pair_index}_{label.stem_shoulder}" + else: + index = f"basepair_{label.base_index}" + + features.write(f"{label.feature_category}|{index}|{label.feature_type.name}\t{importance:.10f}\n") + + with open(performance_out, 'w') as out: + out.write("test_score\tpearson\tspearman\n") + out.write(f"{random_forest.score(test_x, test_y):.10f}\t{pearsonr(random_forest.predict(test_x), test_y).statistic:.10f}\t{spearmanr(random_forest.predict(test_x), test_y).statistic}") return random_forest diff --git a/mewtwo/machine_learning/transformer/config/__init__.py b/mewtwo/machine_learning/transformer/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/transformer/config/config_types.py b/mewtwo/machine_learning/transformer/config/config_types.py new file mode 100644 index 0000000..90de47a --- /dev/null +++ b/mewtwo/machine_learning/transformer/config/config_types.py @@ -0,0 +1,39 @@ +from enum import Enum, Flag + + +class LossFunctionType(Flag): + MSE = 1 + PEARSON = 2 + MSE_PEARSON = 4 + WEIGHTED_MSE = 8 + WEIGHTED_MSE_PEARSON = 16 + SPEARMAN = 32 + MSE_SPEARMAN = 64 + WEIGHTED_MSE_SPEARMAN = 128 + + WEIGHTED = WEIGHTED_MSE | WEIGHTED_MSE_PEARSON | WEIGHTED_MSE_SPEARMAN + NEEDS_ALPHA = MSE_PEARSON | WEIGHTED_MSE_PEARSON | MSE_SPEARMAN | WEIGHTED_MSE_SPEARMAN + CORRELATION_ONLY = PEARSON | SPEARMAN + USES_SPEARMAN = WEIGHTED_MSE_SPEARMAN | SPEARMAN | MSE_SPEARMAN + + @staticmethod + def from_string_description(string_description) -> "LossFunctionType": + return LossFunctionType[string_description.upper()] + + +class FinetuningType(Enum): + LINEAR_HEAD = 1 + ADAPTER = 2 + + @staticmethod + def from_string_description(string_description) -> "FinetuningType": + return FinetuningType[string_description.upper()] + + +class SchedulerType(Enum): + REDUCE_ON_PLATEAU = 1 + COS_ANNEAL_WARMUP = 2 # Cosine annealing with warmup + + @staticmethod + def from_string_description(string_description) -> "SchedulerType": + return SchedulerType[string_description.upper()] diff --git a/mewtwo/machine_learning/transformer/dnabert_regressor.py b/mewtwo/machine_learning/transformer/dnabert_regressor.py index 771aaa9..cb43cf9 100644 --- a/mewtwo/machine_learning/transformer/dnabert_regressor.py +++ b/mewtwo/machine_learning/transformer/dnabert_regressor.py @@ -1,20 +1,42 @@ import torch.nn as nn +from peft import get_peft_model, LoraConfig, TaskType class DNABERTRegressor(nn.Module): - def __init__(self, base_model, hidden_size=768, dropout: float = 0.2, - tuning_mode: str = 'linear_head'): + def __init__(self, base_model, dropout: float = 0.2, + tuning_mode: str = 'linear_head', use_adapters=False, lora_alpha: int = 16, lora_r: int = 8, + lora_dropout: float = 0.1): super().__init__() self.base = base_model + self.current_epoch = 0 + + if use_adapters: + peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, + inference_mode=False, + r=lora_r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + target_modules=["attention.self.Wqkv"]) + self.base = get_peft_model(self.base, peft_config) + self.dropout = nn.Dropout(dropout) + if tuning_mode == 'linear_head': - self.regressor = nn.Sequential(self.dropout, nn.Linear(hidden_size, 1), nn.Sigmoid()) + self.regressor = nn.Sequential(self.dropout, nn.Linear(self.base.base_model.config.hidden_size, 1), + nn.Sigmoid()) + + def update_epoch(self, epoch): + self.current_epoch = epoch def forward(self, input_ids, attention_mask=None): # Pass through the model outputs = self.base(input_ids=input_ids, attention_mask=attention_mask) - output = outputs.last_hidden_state[:, 0, :] - logits = self.regressor(output).squeeze(-1) - return logits + if isinstance(outputs, tuple): + last_hidden_state = outputs[0] + else: + last_hidden_state = outputs.last_hidden_state + cls_token = last_hidden_state[:, 0] # CLS token for regression + predictions = self.regressor(cls_token).squeeze(-1) + return predictions diff --git a/mewtwo/machine_learning/transformer/finetune_bert.py b/mewtwo/machine_learning/transformer/finetune_bert.py index dfedbcf..ecd1a30 100644 --- a/mewtwo/machine_learning/transformer/finetune_bert.py +++ b/mewtwo/machine_learning/transformer/finetune_bert.py @@ -1,33 +1,278 @@ import argparse +import os +from enum import Enum -from transformers import AutoTokenizer, AutoModel +import torch +import torch.optim as optim +import torch.nn as nn +from torch.utils.data import DataLoader +from torch.optim.lr_scheduler import ReduceLROnPlateau +from transformers import AutoTokenizer, get_cosine_schedule_with_warmup from mewtwo.external_code.dnabert.bert_layers import BertModel from mewtwo.machine_learning.transformer.dnabert_regressor import DNABERTRegressor -from torch.utils.data import DataLoader -from mewtwo.machine_learning.transformer.prepare_data import RegressionDataset +from mewtwo.machine_learning.transformer.regressor_dataset import RegressionDataset +from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data +from mewtwo.machine_learning.data_preparation.calculate_sample_weights import get_sample_weights +from mewtwo.machine_learning.transformer.loss_functions import CombinedMSEPearsonLoss, WeightedMSELoss def parse_arguments() -> argparse.Namespace: parser = argparse.ArgumentParser() - parser.add_argument("-i", type=str, required=True, help="Tabular input data, with sequence in one column and efficiency in the second") - parser.add_argument("-o", type=str, help="Output directoru") + parser.add_argument("-i", type=str, required=True, help="Tabular input data, with sequence in one column and \ + efficiency in the second") + parser.add_argument("-v", type=str, required=True, + help="Tabular input data, with sequence in one column and efficiency in the second") + parser.add_argument('-a', type=float, default=0.5, help="Alpha value used for Pearson loss. The higher alpha, \ + the lower the contribution of Pearson correlation to the loss function") + + parser.add_argument("-o", type=str, required=True, help="Output directoru") parser.add_argument("-f", type=str, default='linear_head', help="Finetuning mode, must be one of 'linear_head', 'partial', and 'adapter'") + parser.add_argument("-e", type=int, default=15, help="Nr of epochs") + parser.add_argument("-lf", type=str, default="mse", help="Loss function. Must be one of: 'mse', 'weighted_mse', \ + 'combined_mse_pearson', 'combined_weighted_mse_pearson'") + parser.add_argument("-lr", type=float, default=1e-3, help="Learning rate. Starting learning rate if scheduler \ + is used.") + parser.add_argument("-sc", type=str, default=None, help="Learning rate scheduler. If given, must be one of \ + 'reduce_on_plateau', 'cos_anneal_warmup'") + parser.add_argument("-s", type=str, default=None, help="If given, save model to given location") + parser.add_argument("-m", type=str, default=None, help="If given, train from this model") + parser.add_argument("-num_expected_epochs", type=int, default=100, help="Number of total epochs estimated to use \ + for training in total. Used to determine nr of warmup steps.") + parser.add_argument("-lora_r", type=int, default=8, help="Rank for LoRA adapters") + parser.add_argument("-lora_alpha", type=int, default=16, help="Alpha scaling factor for LoRA adapters") + parser.add_argument("-lora_dropout", type=float, default=0.1, help="Dropout for LoRA adapters") + parser.add_argument("-config", type=str, default=None, help="Path to model config file") + args = parser.parse_args() + # TODO: Turn into enums + assert args.sc in [None, 'reduce_on_plateau', 'cos_anneal_warmup'] assert args.f in ['linear_head', 'partial', 'adapter'] + assert args.lf in ['mse', 'weighted_mse', 'combined_mse_pearson', 'combined_weighted_mse_pearson'] return args -if __name__ == "__main__": +def evaluate_model(model, dataloader, loss_fn, weighted: bool = False, device="cpu"): + model.eval() # Set to evaluation mode + total_loss = 0.0 + all_preds = [] + all_labels = [] + + with torch.no_grad(): + for batch in dataloader: + input_ids = batch["input_ids"].to(device) + attention_mask = batch["attention_mask"].to(device) + labels = batch["labels"].to(device) + weights = batch["weights"].to(device) + + outputs = model(input_ids=input_ids, attention_mask=attention_mask) + if weighted: + loss = loss_fn(outputs, labels, weights) + else: + loss = loss_fn(outputs, labels) + + total_loss += loss.item() + all_preds.extend(outputs.cpu().numpy()) + all_labels.extend(labels.cpu().numpy()) + + avg_loss = total_loss / len(dataloader) + return avg_loss, all_preds, all_labels + + +def train_model(model, dataloader, optimizer, loss_fn, scheduler, scheduler_type, weighted=False, device='cpu', ): + model.train() # set to training mode + model.to(device) + + total_loss = 0.0 + for batch in dataloader: + input_ids = batch["input_ids"].to(device) + attention_mask = batch["attention_mask"].to(device) + labels = batch["labels"].to(device) + weights = batch["weights"].to(device) + + optimizer.zero_grad() + outputs = model(input_ids=input_ids, attention_mask=attention_mask) + + if weighted: + loss = loss_fn(outputs, labels, weights) + else: + loss = loss_fn(outputs, labels) + + loss.backward() + optimizer.step() + if scheduler is not None: + if scheduler_type != 'reduce_on_plateau': + scheduler.step() + + total_loss += loss.item() + + avg_loss = total_loss / len(dataloader) + return avg_loss + + +def initialise(finetuning_mode: FinetuningType, lora_r, lora_alpha, lora_dropout): tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) - model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") - model.to("cpu") + base_model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") + + if finetuning_mode == FinetuningType.LINEAR_HEAD or finetuning_mode == FinetuningType.ADAPTER: + for param in base_model.parameters(): + param.requires_grad = False + + use_adapters = False + + if finetuning_mode == FinetuningType.ADAPTER: + use_adapters = True + + model = DNABERTRegressor(base_model, use_adapters=use_adapters, lora_r=lora_r, lora_alpha=lora_alpha, + lora_dropout=lora_dropout) + + return model, tokenizer + + +def prepare_data(input_file: str, tokenizer: AutoTokenizer, shuffle: bool, batch_size: int = 5) \ + -> DataLoader: + sequences, labels = parse_dnabert_data(input_file) + sample_weights = get_sample_weights(labels) + dataset = RegressionDataset(sequences, labels, sample_weights, tokenizer) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle) + + return dataloader + + +def get_loss_function(string_description, alpha=0.5): + + if string_description == 'mse': + loss_fn = nn.MSELoss() + elif string_description == 'weighted_mse': + loss_fn = WeightedMSELoss() + elif string_description in ['combined_mse_pearson', 'combined_weighted_mse_pearson']: + loss_fn = CombinedMSEPearsonLoss(alpha=alpha) + else: + raise ValueError(f"Unknown loss function type: {string_description}") + + return loss_fn + - dataset = RegressionDataset(sequences, labels, tokenizer) - dataloader = DataLoader(dataset, batch_size=2, shuffle=True) +def main(): - for name, param in model.named_parameters(): - print(name, param) + args = parse_arguments() + + if not os.path.exists(args.o): + os.mkdir(args.o) + + finetuning_mode = FinetuningType.from_string_description(args.f) + model, tokenizer = initialise(finetuning_mode, lora_r=args.lora_r, lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout) + + train_dataloader = prepare_data(args.i, tokenizer, True) + validation_dataloader = prepare_data(args.v, tokenizer, False) + + loss_fn = get_loss_function(args.lf, args.a) + + use_weights = False + + if 'weighted' in args.lf: + use_weights = True + + summary_file = os.path.join(args.o, "summary.txt") + + scheduler = None + warmup_steps = None + training_steps = None + + if args.m is not None: + summary = open(summary_file, 'a') + checkpoint = torch.load(args.m) + + model.load_state_dict(checkpoint["model_state_dict"]) + model.update_epoch(checkpoint["epoch"]) + + optimizer = optim.AdamW(model.parameters(), lr=args.lr) + + # TODO: Store as dataclass instead + optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + + if args.sc is not None: + if args.sc == 'cos_anneal_warmup': + warmup_steps = checkpoint["scheduler_num_warmup_steps"] + training_steps = checkpoint["scheduler_num_training_steps"] + scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, + num_training_steps=training_steps) + elif args.sc == 'reduce_on_plateau': + scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, + min_lr=5e-6) + else: + raise ValueError(f"Unrecognised scheduler: {args.sc}") + + if checkpoint["scheduler_state_dict"] is not None and scheduler is not None: + scheduler.load_state_dict(checkpoint["scheduler_state_dict"]) + for param_group in optimizer.param_groups: + param_group['lr'] = scheduler.get_last_lr()[0] + + else: + summary = open(summary_file, 'w') + summary.write("epoch\taverage_train_loss\taverage_eval_loss\n") + + optimizer = optim.AdamW(model.parameters(), lr=args.lr) + + if args.sc is not None: + if args.sc == 'cos_anneal_warmup': + + training_steps = int(len(train_dataloader) * args.num_expected_epochs) + warmup_steps = int(0.1 * training_steps) + + scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, + num_training_steps=training_steps) + elif args.sc == 'reduce_on_plateau': + scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, + min_lr=1e-5) + else: + raise ValueError(f"Unrecognised scheduler: {args.sc}") + + starting_epoch = model.current_epoch + current_epoch = starting_epoch + + if args.m is None: + avg_loss, all_preds, all_labels = evaluate_model(model, validation_dataloader, loss_fn, weighted=use_weights) + print(f"Epoch {starting_epoch}\t- Eval loss:\t{avg_loss:.4f}") + + for i in range(args.e): + current_epoch = starting_epoch + i + 1 + print(f"LR at epoch {current_epoch}: {optimizer.param_groups[0]['lr']}") + + out_file = os.path.join(args.o, f"epoch_{current_epoch:03d}.txt") + avg_train_loss = train_model(model, train_dataloader, optimizer, loss_fn, scheduler, scheduler_type=args.sc, + weighted=use_weights) + avg_loss, all_preds, all_labels = evaluate_model(model, validation_dataloader, loss_fn, + weighted=use_weights) + print(f"Epoch {current_epoch}\t- Train loss:\t{avg_train_loss:.4f}") + print(f" \t- Eval loss:\t{avg_loss:.4f}") + summary.write(f"{current_epoch}\t{avg_train_loss:.5f}\t{avg_loss:.5f}\n") + + with open(out_file, 'w') as out: + out.write("actual\tpredicted\n") + for j, prediction in enumerate(all_preds): + label = all_labels[j] + out.write(f"{label}\t{prediction}\n") + + model.update_epoch(current_epoch) + + if scheduler is not None and args.sc == 'reduce_on_plateau': + scheduler.step(avg_loss) + + summary.close() + + if args.s is not None: + torch.save({"model_state_dict": model.state_dict(), + "epoch": current_epoch, + "optimizer_state_dict": optimizer.state_dict(), + "scheduler_state_dict": scheduler.state_dict(), + "scheduler_num_warmup_steps": warmup_steps, + "scheduler_num_training_steps": training_steps}, args.s) + + +if __name__ == "__main__": + main() diff --git a/mewtwo/machine_learning/transformer/hyperparameter_optimization.py b/mewtwo/machine_learning/transformer/hyperparameter_optimization.py new file mode 100644 index 0000000..1d93417 --- /dev/null +++ b/mewtwo/machine_learning/transformer/hyperparameter_optimization.py @@ -0,0 +1,5 @@ + + +import argparse + +def parse_arguments() \ No newline at end of file diff --git a/mewtwo/machine_learning/transformer/load_model.py b/mewtwo/machine_learning/transformer/load_model.py new file mode 100644 index 0000000..c178628 --- /dev/null +++ b/mewtwo/machine_learning/transformer/load_model.py @@ -0,0 +1,45 @@ +import argparse +import os +from enum import Enum + +import torch +import torch.optim as optim +import torch.nn as nn +from torch.utils.data import DataLoader +from torch.optim.lr_scheduler import ReduceLROnPlateau +from transformers import AutoTokenizer, get_cosine_schedule_with_warmup + +from mewtwo.external_code.dnabert.bert_layers import BertModel +from mewtwo.machine_learning.transformer.dnabert_regressor import DNABERTRegressor +from mewtwo.machine_learning.transformer.regressor_dataset import RegressionDataset +from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data +from mewtwo.machine_learning.data_preparation.calculate_sample_weights import get_sample_weights +from mewtwo.machine_learning.transformer.loss_functions import CombinedMSEPearsonLoss, WeightedMSELoss +from mewtwo.machine_learning.transformer.config.config_types import FinetuningType +from mewtwo.parsers.parse_model_config import ModelConfig + + +def initialise(finetuning_mode: FinetuningType, dropout, adapter_config=None): + tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) + base_model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") + + if finetuning_mode == FinetuningType.LINEAR_HEAD or finetuning_mode == FinetuningType.ADAPTER: + for param in base_model.parameters(): + param.requires_grad = False + + if finetuning_mode == FinetuningType.ADAPTER: + assert adapter_config is not None + model = DNABERTRegressor(base_model, use_adapters=True, lora_r=adapter_config.rank, + lora_alpha=adapter_config.alpha, + lora_dropout=adapter_config.dropout) + else: + model = DNABERTRegressor(base_model, use_adapters=False) + + return model, tokenizer + + +def load_model(config_file, model_checkpoint=None): + model_config = ModelConfig.from_file(config_file) + model = initialise(model_config.finetuning_mode, model_config.adapter_config) + + diff --git a/mewtwo/machine_learning/transformer/loss_functions.py b/mewtwo/machine_learning/transformer/loss_functions.py index e69de29..69635b3 100644 --- a/mewtwo/machine_learning/transformer/loss_functions.py +++ b/mewtwo/machine_learning/transformer/loss_functions.py @@ -0,0 +1,129 @@ +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mewtwo.machine_learning.transformer.config.config_types import LossFunctionType + + +def weighted_mse_loss(preds, targets, weights): + return (weights * (preds - targets) ** 2).mean() + + +class WeightedMSELoss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, preds, targets, weights): + return weighted_mse_loss(preds, targets, weights) + + +class CombinedMSEPearsonLoss(nn.Module): + def __init__(self, alpha=0.5): + """ + alpha: weight for MSE loss (0 <= alpha <= 1) + (1 - alpha) will be used for the Pearson correlation loss + """ + super().__init__() + self.alpha = alpha + + def forward(self, preds: torch.Tensor, targets: torch.Tensor, weights: Optional[torch.Tensor] = None) -> torch.Tensor: + # Ensure preds and targets have the same shape + if preds.shape != targets.shape: + raise ValueError("Shape mismatch between predictions and targets.") + + if weights is not None: + if targets.shape != weights.shape: + raise ValueError("Shape mismatch between weights and targets.") + + if weights is None: + # MSE Loss + mse_loss = F.mse_loss(preds, targets) + else: + mse_loss = weighted_mse_loss(preds, targets, weights) + + # Pearson Correlation + preds_centered = preds - preds.mean() + targets_centered = targets - targets.mean() + + numerator = torch.sum(preds_centered * targets_centered) + denominator = torch.sqrt(torch.sum(preds_centered ** 2)) * torch.sqrt(torch.sum(targets_centered ** 2)) + 1e-8 + pearson_corr = numerator / denominator + + corr_loss = 1 - pearson_corr + + # Weighted combination + return self.alpha * mse_loss + (1 - self.alpha) * corr_loss + + +def soft_rank(x, regularization_strength=1e-3): + """ + Approximate the ranks of elements in x using a softmax-based method. + Returns a tensor of the same shape with values approximating the rank of each element. + """ + x = x.unsqueeze(-1) + diff = x - x.transpose(0, 1) + P = torch.sigmoid(-diff / regularization_strength) + soft_ranks = P.sum(dim=-1) + 0.5 # Adding 0.5 to center ranks correctly + return soft_ranks + + +class CombinedMSESpearmanLoss(nn.Module): + def __init__(self, alpha=0.5, reg_strength=1e-3): + """ + alpha: weight for MSE loss (0 <= alpha <= 1) + reg_strength: smoothness factor for soft ranking + """ + super().__init__() + self.alpha = alpha + self.reg_strength = reg_strength + + def forward(self, preds, targets): + if preds.shape != targets.shape: + raise ValueError("Shape mismatch between predictions and targets.") + + # MSE + mse_loss = F.mse_loss(preds, targets) + + # Soft ranks + preds_rank = soft_rank(preds.squeeze(), self.reg_strength) + targets_rank = soft_rank(targets.squeeze(), self.reg_strength) + + # Centered ranks for Spearman + preds_rank_centered = preds_rank - preds_rank.mean() + targets_rank_centered = targets_rank - targets_rank.mean() + + # Spearman correlation (same as Pearson but on ranks) + numerator = torch.sum(preds_rank_centered * targets_rank_centered) + denominator = ( + torch.sqrt(torch.sum(preds_rank_centered**2)) * torch.sqrt(torch.sum(targets_rank_centered**2)) + 1e-8 + ) + spearman_corr = numerator / denominator + spearman_loss = 1 - spearman_corr + + return self.alpha * mse_loss + (1 - self.alpha) * spearman_loss + + +TYPE_TO_LOSS_FN = {LossFunctionType.MSE: nn.MSELoss, + LossFunctionType.WEIGHTED_MSE: WeightedMSELoss, + LossFunctionType.MSE_PEARSON: CombinedMSEPearsonLoss, + LossFunctionType.MSE_SPEARMAN: CombinedMSESpearmanLoss, + LossFunctionType.WEIGHTED_MSE_PEARSON: CombinedMSEPearsonLoss, + LossFunctionType.WEIGHTED_MSE_SPEARMAN: CombinedMSESpearmanLoss, + LossFunctionType.PEARSON: CombinedMSEPearsonLoss, + LossFunctionType.SPEARMAN: CombinedMSESpearmanLoss} + + +def get_loss_function(string_description: str, alpha: float = 0.5): + loss_fn_type = LossFunctionType.from_string_description(string_description) + loss_fn = TYPE_TO_LOSS_FN[loss_fn_type] + if loss_fn_type in LossFunctionType.NEEDS_ALPHA: + loss_fn_instance = loss_fn(alpha=alpha) + elif loss_fn.type in LossFunctionType.CORRELATION_ONLY: + loss_fn_instance = loss_fn(alpha=0.0) + else: + loss_fn_instance = loss_fn() + + return loss_fn_instance + diff --git a/mewtwo/machine_learning/transformer/prepare_data.py b/mewtwo/machine_learning/transformer/regressor_dataset.py similarity index 73% rename from mewtwo/machine_learning/transformer/prepare_data.py rename to mewtwo/machine_learning/transformer/regressor_dataset.py index e2600cb..5f23620 100644 --- a/mewtwo/machine_learning/transformer/prepare_data.py +++ b/mewtwo/machine_learning/transformer/regressor_dataset.py @@ -4,9 +4,10 @@ class RegressionDataset(Dataset): - def __init__(self, sequences, labels, tokenizer, max_length=512): + def __init__(self, sequences, labels, weights, tokenizer, max_length=512): self.sequences = sequences self.labels = labels + self.weights = weights self.tokenizer = tokenizer self.max_length = max_length @@ -16,6 +17,7 @@ def __len__(self): def __getitem__(self, idx): sequence = self.sequences[idx] label = self.labels[idx] + weight = self.weights[idx] # Tokenize the sequence encoding = self.tokenizer(sequence, truncation=True, padding="max_length", max_length=self.max_length, @@ -26,11 +28,8 @@ def __getitem__(self, idx): return { 'input_ids': input_ids, 'attention_mask': attention_mask, - 'labels': torch.tensor(label, dtype=torch.float) # For regression, labels are continuous values + 'labels': torch.tensor(label, dtype=torch.float), + 'weights': torch.tensor(weight, dtype=torch.float) } -# Example sequences and labels -sequences = ["ACGTACGT", "AGCTAGCT", "CGTACGTA"] -labels = [0.5, 1.2, 0.8] # Continuous labels for regression - diff --git a/mewtwo/machine_learning/transformer/schedulers.py b/mewtwo/machine_learning/transformer/schedulers.py new file mode 100644 index 0000000..7a32e9f --- /dev/null +++ b/mewtwo/machine_learning/transformer/schedulers.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class SchedulerType(Enum): + REDUCE_ON_PLATEAU = 1 + COSINE_ANNEALING_WARM_START = 2 \ No newline at end of file diff --git a/mewtwo/parsers/parse_feature_file.py b/mewtwo/parsers/parse_feature_file.py new file mode 100644 index 0000000..5bbceaa --- /dev/null +++ b/mewtwo/parsers/parse_feature_file.py @@ -0,0 +1,14 @@ +from mewtwo.parsers.tabular import Tabular + + +def parse_feature_file(input_file: str) -> dict[str, float]: + + feature_to_importance = {} + feature_data = Tabular(input_file, [0]) + for feature in feature_data.data: + + feature_name = feature_data.get_value(feature, "feature_name") + importance = float(feature_data.get_value(feature, "feature_importance")) + feature_to_importance[feature_name] = importance + + return feature_to_importance diff --git a/mewtwo/parsers/parse_model_config.py b/mewtwo/parsers/parse_model_config.py new file mode 100644 index 0000000..564172b --- /dev/null +++ b/mewtwo/parsers/parse_model_config.py @@ -0,0 +1,200 @@ +from dataclasses import dataclass +from typing import Optional, Union +from math import isclose + + +from mewtwo.machine_learning.transformer.config.config_types import LossFunctionType, FinetuningType, SchedulerType + + +@dataclass +class AdapterConfig: + rank: int + alpha: int + dropout: float + + def __eq__(self, other): + if type(self) == type(other) and \ + self.rank == other.rank and \ + self.alpha == other.alpha and \ + isclose(self.dropout, other.dropout, rel_tol=0.01): + return True + + else: + return False + + @classmethod + def from_file(cls, input_file) -> Union["AdapterConfig", None]: + rank = None + alpha = None + dropout = None + + with open(input_file, 'r') as model_config: + for line in model_config: + line = line.strip() + field, value = line.split('\t') + if field == "adapter_r": + rank = int(value) + if field == "adapter_alpha": + alpha = int(value) + if field == "adapter_dropout": + dropout = float(value) + + if rank is None or alpha is None or dropout is None: + return None + else: + return AdapterConfig(rank, alpha, dropout) + + +@dataclass +class LossFunctionConfig: + type: LossFunctionType + alpha: Optional[float] + + def __post_init__(self): + if self.alpha is not None: + assert -0.00000001 < self.alpha < 1.000000001 + + def __eq__(self, other): + if type(self) == type(other) and self.type == other.type: + if self.alpha is not None and other.alpha is not None and isclose(self.alpha, other.alpha, rel_tol=0.01): + return True + elif self.alpha is None and other.alpha is None: + return True + + return False + + @classmethod + def from_file(cls, input_file) -> "LossFunctionConfig": + function_type = None + alpha = None + + with open(input_file, 'r') as model_config: + for line in model_config: + line = line.strip() + field, value = line.split('\t') + if field == "loss_function": + function_type = LossFunctionType[value] + elif field == "loss_function_alpha": + alpha = float(value) + + assert function_type is not None + + return LossFunctionConfig(function_type, alpha) + + +@dataclass +class SchedulerConfig: + type: SchedulerType + training_epochs: Optional[int] = None + warmup_epochs: Optional[int] = None + + def __post_init__(self): + if self.type == SchedulerType.COS_ANNEAL_WARMUP: + assert self.training_epochs is not None + assert self.warmup_epochs is not None + if self.warmup_epochs > self.training_epochs: + raise ValueError("Number of warmup steps is greater than the number of training steps.") + + def __eq__(self, other): + if type(self) == type(other) and \ + self.training_epochs == other.training_epochs and \ + self.type == other.type and \ + self.warmup_epochs == other.warmup_epochs: + return True + + return False + + @classmethod + def from_file(cls, input_file) -> Union["SchedulerConfig", None]: + scheduler_type = None + training_steps = None + warmup_steps = None + + with open(input_file, 'r') as model_config: + for line in model_config: + line = line.strip() + field, value = line.split('\t') + if field == "scheduler": + scheduler_type = SchedulerType[value] + if field == "scheduler_training_steps": + training_steps = int(value) + if field == "scheduler_warmup_steps": + warmup_steps = int(value) + + if scheduler_type is None: + return None + else: + return SchedulerConfig(scheduler_type, training_steps, warmup_steps) + + +@dataclass +class ModelConfig: + finetuning_mode: FinetuningType + learning_rate: float + hidden_layer_dropout: float + loss_function_config: LossFunctionConfig + epochs: int + batch_size: int + adapter_config: Optional[AdapterConfig] = None + scheduler_config: Optional[SchedulerConfig] = None + + def __eq__(self, other): + if self.finetuning_mode == other.finetuning_mode and \ + isclose(self.learning_rate, other.learning_rate, rel_tol=0.01) and \ + isclose(self.hidden_layer_dropout, other.hidden_layer_dropout, rel_tol=0.01) and \ + self.loss_function_config == other.loss_function_config and \ + self.adapter_config == other.adapter_config and \ + self.scheduler_config == other.scheduler_config: + + return True + else: + return False + + def write_model_config(self, out_file): + + with open(out_file, 'w') as out: + out.write(f"finetuning_mode\t{self.finetuning_mode.name}\n") + out.write(f"learning_rate\t{self.learning_rate:.10f}\n") + out.write(f"hidden_layer_dropout\t{self.hidden_layer_dropout:.2f}\n") + out.write(f"loss_function\t{self.loss_function_config.type.name}\n") + out.write(f"training_epochs\t{self.epochs}\n") + out.write(f"batch_size\t{self.batch_size}\n") + + if self.loss_function_config.alpha is not None: + out.write(f"loss_function_alpha\t{self.loss_function_config.alpha}\n") + + if self.adapter_config is not None: + out.write(f"adapter_r\t{self.adapter_config.rank}\n") + out.write(f"adapter_alpha\t{self.adapter_config.alpha}\n") + out.write(f"adapter_dropout\t{self.adapter_config.dropout}\n") + + if self.scheduler_config is not None: + out.write(f"scheduler\t{self.scheduler_config.type.name}\n") + if self.scheduler_config.training_epochs is not None: + out.write(f"scheduler_training_steps\t{self.scheduler_config.training_epochs}\n") + if self.scheduler_config.warmup_epochs is not None: + out.write(f"scheduler_warmup_steps\t{self.scheduler_config.warmup_epochs}\n") + + @classmethod + def from_file(cls, input_file): + + adapter_config = AdapterConfig.from_file(input_file) + scheduler_config = SchedulerConfig.from_file(input_file) + loss_function_config = LossFunctionConfig.from_file(input_file) + + field_to_value = {} + + with open(input_file, 'r') as model_config: + for line in model_config: + line = line.strip() + field, value = line.split('\t') + field_to_value[field] = value + + return cls(FinetuningType.from_string_description(field_to_value["finetuning_mode"]), + float(field_to_value["learning_rate"]), + float(field_to_value["hidden_layer_dropout"]), + loss_function_config, + int(field_to_value["training_epochs"]), + int(field_to_value["batch_size"]), + adapter_config, + scheduler_config) diff --git a/mewtwo/parsers/parse_termite_output.py b/mewtwo/parsers/parse_termite_output.py index 8a5a726..b0c7328 100644 --- a/mewtwo/parsers/parse_termite_output.py +++ b/mewtwo/parsers/parse_termite_output.py @@ -6,6 +6,7 @@ from mewtwo.embeddings.terminator.u_tract import UTract from mewtwo.machine_learning.random_forest.train_random_forest import train_random_forest from mewtwo.machine_learning.data_preparation.train_test_split import split_data +from mewtwo.machine_learning.feature_inference.infer_features_rf import write_average_importances from sys import argv import os @@ -44,7 +45,6 @@ def termite_to_dnabert_input(input_file: str, output_dir: str, species_column: b bacillus_out.write(terminator_data) - def parse_termite_data(input_file: str, species_column: bool) -> Tabular: if not species_column: termite_data = Tabular(input_file, [0, 6]) @@ -221,7 +221,6 @@ def rnafold_hairpins_from_termite(input_file: str, get_rnafold: bool = True, get bacillus_terminators = [] ecoli_terminators = [] - for species, species_terminators in species_to_terminators.items(): if 'Bacillus' in species and '(d)' in species: bacillus_terminators.extend(species_terminators) @@ -232,8 +231,10 @@ def rnafold_hairpins_from_termite(input_file: str, get_rnafold: bool = True, get train_terminators, test_terminators, crossvalidation_sets = split_data(all_terminators, test_size=0.1) for crossval_nr, crossvalidation_set in crossvalidation_sets.items(): - train_random_forest(crossvalidation_set.train, crossvalidation_set.test, one_hot=True) + out_dir = os.path.join(argv[2], f"crossvalidation_results_{crossval_nr}") + rf = train_random_forest(crossvalidation_set.train, crossvalidation_set.test, one_hot=True, out_dir=out_dir) + write_average_importances(argv[2], os.path.join(argv[2], "average_feature_importances.txt")) # train_terminators, test_terminators, _ = split_data(ecoli_terminators, test_size=0.1) # # train_random_forest(train_terminators, test_terminators, one_hot=True) @@ -244,7 +245,7 @@ def rnafold_hairpins_from_termite(input_file: str, get_rnafold: bool = True, get print(min([t.te for t in all_terminators]), max([t.te for t in all_terminators])) - termite_to_dnabert_input(argv[1], argv[2], species_column=True) + # termite_to_dnabert_input(argv[1], argv[2], species_column=True) # train_nn(train_terminators, test_terminators) diff --git a/mewtwo/writers/__init__.py b/mewtwo/writers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/writers/write_feature_importances.py b/mewtwo/writers/write_feature_importances.py new file mode 100644 index 0000000..a6bde76 --- /dev/null +++ b/mewtwo/writers/write_feature_importances.py @@ -0,0 +1,25 @@ +from typing import Optional + + +def write_feature_importances(feature_to_importance: dict[str, float], out_file: str, + feature_to_stdev: Optional[dict[str, float]] = None, + sort_by_importance: bool = True) -> None: + features_and_importances = list(feature_to_importance.items()) + + if sort_by_importance: + features_and_importances.sort(key=lambda x: x[1], reverse=True) + + with open(out_file, 'w') as out: + if feature_to_stdev: + out.write("feature_name\tfeature_importance\tstdev\n") + else: + out.write("feature_name\tfeature_importance\n") + + for feature, importance in features_and_importances: + if feature_to_stdev: + stdev = feature_to_stdev[feature] + out.write(f"{feature}\t{importance}\t{stdev}\n") + else: + + out.write(f"{feature}\t{importance}\n") + diff --git a/mewtwo/writers/write_model_settings.py b/mewtwo/writers/write_model_settings.py new file mode 100644 index 0000000..7e09ebd --- /dev/null +++ b/mewtwo/writers/write_model_settings.py @@ -0,0 +1,100 @@ + +from argparse import ArgumentParser +import os + +from mewtwo.parsers.parse_model_config import ModelConfig, AdapterConfig, SchedulerConfig, LossFunctionConfig + +from mewtwo.machine_learning.transformer.config.config_types import FinetuningType, LossFunctionType, SchedulerType + + +def parse_arguments(): + parser = ArgumentParser(description="Write config files for hyperparameter optimization") + parser.add_argument('-o', type=str, required=True, help="Output directory") + args = parser.parse_args() + return args + + +def get_hyperoptimization_configs(): + model_configs = [] + + batch_sizes = [4, 8] + learning_rates = [1e-5, 5e-5, 1e-4, 5e-4] + hidden_layer_dropouts = [0.1] + + for batch_size in batch_sizes: + for finetuning_type in FinetuningType: + if finetuning_type.name == "ADAPTER": + lora_rank_options = [4] + lora_dropout_options = [0.1] + else: + lora_rank_options = [None] + lora_dropout_options = [None] + + for lora_r in lora_rank_options: + if lora_r is not None: + lora_alpha = 2 * lora_r + else: + lora_alpha = None + + for lora_dropout in lora_dropout_options: + if lora_r: + adapter_config = AdapterConfig(lora_r, lora_alpha, lora_dropout) + else: + adapter_config = None + + for scheduler_option in [s for s in SchedulerType] + [None]: + if scheduler_option == SchedulerType.COS_ANNEAL_WARMUP: + + training_epochs = 20 + warmup_epochs_options = [10] + else: + training_epochs = None + warmup_epochs_options = [None] + + if scheduler_option == SchedulerType.COS_ANNEAL_WARMUP: + + for warmup_epochs_option in warmup_epochs_options: + if scheduler_option is None: + scheduler_config = None + else: + scheduler_config = SchedulerConfig(scheduler_option, training_epochs, + warmup_epochs_option) + + for loss_function in LossFunctionType: + if loss_function not in LossFunctionType.USES_SPEARMAN and loss_function not in LossFunctionType.WEIGHTED: + if loss_function not in LossFunctionType.NEEDS_ALPHA: + loss_function_alpha_options = [None] + else: + loss_function_alpha_options = [0.25, 0.5, 0.75] + + for alpha_option in loss_function_alpha_options: + loss_function_config = LossFunctionConfig(loss_function, alpha_option) + + for learning_rate in learning_rates: + for hidden_layer_dropout in hidden_layer_dropouts: + model_config = ModelConfig(finetuning_type, learning_rate, + hidden_layer_dropout, + loss_function_config, 0, batch_size, + adapter_config, + scheduler_config) + model_configs.append(model_config) + + return model_configs + + +def main(): + args = parse_arguments() + if not os.path.exists(args.o): + os.mkdir(args.o) + + model_configs = get_hyperoptimization_configs() + for i, model_config in enumerate(model_configs): + out_file = os.path.join(args.o, f"config_{i + 1:03}") + model_config.write_model_config(out_file) + + +if __name__ == "__main__": + + main() + + From 6e119617f942a9a2cb3dac1e8d0b95e63af2d75f Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 2 Jun 2025 14:51:34 +0200 Subject: [PATCH 36/41] Refactor transformer (untested) --- .../transformer/dnabert_regressor.py | 13 +- .../transformer/finetune_bert.py | 270 +++--------------- .../hyperparameter_optimization.py | 28 +- .../transformer/load_model.py | 45 --- .../transformer/loss_functions.py | 12 +- mewtwo/machine_learning/transformer/model.py | 185 ++++++++++++ mewtwo/parsers/parse_model_config.py | 19 +- mewtwo/writers/write_model_settings.py | 6 +- 8 files changed, 280 insertions(+), 298 deletions(-) delete mode 100644 mewtwo/machine_learning/transformer/load_model.py create mode 100644 mewtwo/machine_learning/transformer/model.py diff --git a/mewtwo/machine_learning/transformer/dnabert_regressor.py b/mewtwo/machine_learning/transformer/dnabert_regressor.py index cb43cf9..e4df108 100644 --- a/mewtwo/machine_learning/transformer/dnabert_regressor.py +++ b/mewtwo/machine_learning/transformer/dnabert_regressor.py @@ -3,12 +3,10 @@ class DNABERTRegressor(nn.Module): - def __init__(self, base_model, dropout: float = 0.2, - tuning_mode: str = 'linear_head', use_adapters=False, lora_alpha: int = 16, lora_r: int = 8, + def __init__(self, base_model, dropout: float = 0.2, use_adapters=False, lora_alpha: int = 16, lora_r: int = 8, lora_dropout: float = 0.1): super().__init__() self.base = base_model - self.current_epoch = 0 if use_adapters: peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, @@ -21,13 +19,8 @@ def __init__(self, base_model, dropout: float = 0.2, self.dropout = nn.Dropout(dropout) - if tuning_mode == 'linear_head': - - self.regressor = nn.Sequential(self.dropout, nn.Linear(self.base.base_model.config.hidden_size, 1), - nn.Sigmoid()) - - def update_epoch(self, epoch): - self.current_epoch = epoch + self.regressor = nn.Sequential(self.dropout, nn.Linear(self.base.base_model.config.hidden_size, 1), + nn.Sigmoid()) def forward(self, input_ids, attention_mask=None): # Pass through the model diff --git a/mewtwo/machine_learning/transformer/finetune_bert.py b/mewtwo/machine_learning/transformer/finetune_bert.py index ecd1a30..3e86d08 100644 --- a/mewtwo/machine_learning/transformer/finetune_bert.py +++ b/mewtwo/machine_learning/transformer/finetune_bert.py @@ -1,159 +1,68 @@ import argparse import os -from enum import Enum -import torch -import torch.optim as optim -import torch.nn as nn -from torch.utils.data import DataLoader -from torch.optim.lr_scheduler import ReduceLROnPlateau -from transformers import AutoTokenizer, get_cosine_schedule_with_warmup - -from mewtwo.external_code.dnabert.bert_layers import BertModel -from mewtwo.machine_learning.transformer.dnabert_regressor import DNABERTRegressor -from mewtwo.machine_learning.transformer.regressor_dataset import RegressionDataset -from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data -from mewtwo.machine_learning.data_preparation.calculate_sample_weights import get_sample_weights -from mewtwo.machine_learning.transformer.loss_functions import CombinedMSEPearsonLoss, WeightedMSELoss +from mewtwo.machine_learning.transformer.model import load_model +from mewtwo.machine_learning.transformer.config.config_types import SchedulerType def parse_arguments() -> argparse.Namespace: parser = argparse.ArgumentParser() - parser.add_argument("-i", type=str, required=True, help="Tabular input data, with sequence in one column and \ - efficiency in the second") + parser.add_argument("-i", type=str, required=True, + help="Tabular input data, with sequence in one column and efficiency in the second") parser.add_argument("-v", type=str, required=True, help="Tabular input data, with sequence in one column and efficiency in the second") - parser.add_argument('-a', type=float, default=0.5, help="Alpha value used for Pearson loss. The higher alpha, \ - the lower the contribution of Pearson correlation to the loss function") - - parser.add_argument("-o", type=str, required=True, help="Output directoru") - parser.add_argument("-f", type=str, default='linear_head', - help="Finetuning mode, must be one of 'linear_head', 'partial', and 'adapter'") + parser.add_argument("-c", type=str, default=None, help="Path to configuration file.") + parser.add_argument("-o", type=str, required=True, help="Output directory") parser.add_argument("-e", type=int, default=15, help="Nr of epochs") - parser.add_argument("-lf", type=str, default="mse", help="Loss function. Must be one of: 'mse', 'weighted_mse', \ - 'combined_mse_pearson', 'combined_weighted_mse_pearson'") - parser.add_argument("-lr", type=float, default=1e-3, help="Learning rate. Starting learning rate if scheduler \ - is used.") - parser.add_argument("-sc", type=str, default=None, help="Learning rate scheduler. If given, must be one of \ - 'reduce_on_plateau', 'cos_anneal_warmup'") - parser.add_argument("-s", type=str, default=None, help="If given, save model to given location") - parser.add_argument("-m", type=str, default=None, help="If given, train from this model") - parser.add_argument("-num_expected_epochs", type=int, default=100, help="Number of total epochs estimated to use \ - for training in total. Used to determine nr of warmup steps.") - parser.add_argument("-lora_r", type=int, default=8, help="Rank for LoRA adapters") - parser.add_argument("-lora_alpha", type=int, default=16, help="Alpha scaling factor for LoRA adapters") - parser.add_argument("-lora_dropout", type=float, default=0.1, help="Dropout for LoRA adapters") - parser.add_argument("-config", type=str, default=None, help="Path to model config file") - - args = parser.parse_args() - - # TODO: Turn into enums - assert args.sc in [None, 'reduce_on_plateau', 'cos_anneal_warmup'] - assert args.f in ['linear_head', 'partial', 'adapter'] - assert args.lf in ['mse', 'weighted_mse', 'combined_mse_pearson', 'combined_weighted_mse_pearson'] - - return args - - -def evaluate_model(model, dataloader, loss_fn, weighted: bool = False, device="cpu"): - model.eval() # Set to evaluation mode - total_loss = 0.0 - all_preds = [] - all_labels = [] - - with torch.no_grad(): - for batch in dataloader: - input_ids = batch["input_ids"].to(device) - attention_mask = batch["attention_mask"].to(device) - labels = batch["labels"].to(device) - weights = batch["weights"].to(device) - - outputs = model(input_ids=input_ids, attention_mask=attention_mask) - if weighted: - loss = loss_fn(outputs, labels, weights) - else: - loss = loss_fn(outputs, labels) - - total_loss += loss.item() - all_preds.extend(outputs.cpu().numpy()) - all_labels.extend(labels.cpu().numpy()) - - avg_loss = total_loss / len(dataloader) - return avg_loss, all_preds, all_labels - -def train_model(model, dataloader, optimizer, loss_fn, scheduler, scheduler_type, weighted=False, device='cpu', ): - model.train() # set to training mode - model.to(device) + parser.add_argument("-s", action="store_true", help="If given, save model to output_folder/checkpoint.pt") + parser.add_argument("-m", type=str, default=None, help="If given, train from this checkpoint") - total_loss = 0.0 - for batch in dataloader: - input_ids = batch["input_ids"].to(device) - attention_mask = batch["attention_mask"].to(device) - labels = batch["labels"].to(device) - weights = batch["weights"].to(device) - - optimizer.zero_grad() - outputs = model(input_ids=input_ids, attention_mask=attention_mask) - - if weighted: - loss = loss_fn(outputs, labels, weights) - else: - loss = loss_fn(outputs, labels) - - loss.backward() - optimizer.step() - if scheduler is not None: - if scheduler_type != 'reduce_on_plateau': - scheduler.step() + args = parser.parse_args() - total_loss += loss.item() + if args.c is None: + assert args.m - avg_loss = total_loss / len(dataloader) - return avg_loss + if args.c is not None and args.m is not None: + print("Warning: config file given alongside existing model. Config file will be ignored.") + if args.c is None and args.m is None: + raise ValueError("Model config or previous model checkpoint must be given.") -def initialise(finetuning_mode: FinetuningType, lora_r, lora_alpha, lora_dropout): - tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) - base_model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") + return args - if finetuning_mode == FinetuningType.LINEAR_HEAD or finetuning_mode == FinetuningType.ADAPTER: - for param in base_model.parameters(): - param.requires_grad = False - use_adapters = False +def finetune(model, summary, epochs, out_dir): + config_file = os.path.join(out_dir, "model.config") - if finetuning_mode == FinetuningType.ADAPTER: - use_adapters = True + current_epoch = model.config.epochs - model = DNABERTRegressor(base_model, use_adapters=use_adapters, lora_r=lora_r, lora_alpha=lora_alpha, - lora_dropout=lora_dropout) + for i in range(epochs): + current_epoch += i + 1 + print(f"LR at epoch {current_epoch}: {model.optimizer.param_groups[0]['lr']}") - return model, tokenizer + out_file = os.path.join(out_dir, f"epoch_{current_epoch:03d}.txt") + avg_train_loss = model.train_model() + avg_loss, all_preds, all_labels = model.evaluate_model() -def prepare_data(input_file: str, tokenizer: AutoTokenizer, shuffle: bool, batch_size: int = 5) \ - -> DataLoader: - sequences, labels = parse_dnabert_data(input_file) - sample_weights = get_sample_weights(labels) - dataset = RegressionDataset(sequences, labels, sample_weights, tokenizer) - dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle) + print(f"Epoch {current_epoch}\t- Train loss:\t{avg_train_loss:.4f}") + print(f" \t- Eval loss:\t{avg_loss:.4f}") - return dataloader + summary.write(f"{current_epoch}\t{avg_train_loss:.5f}\t{avg_loss:.5f}\n") + with open(out_file, 'w') as out: + out.write("actual\tpredicted\n") + for j, prediction in enumerate(all_preds): + label = all_labels[j] + out.write(f"{label}\t{prediction}\n") -def get_loss_function(string_description, alpha=0.5): + model.update_epoch(current_epoch) - if string_description == 'mse': - loss_fn = nn.MSELoss() - elif string_description == 'weighted_mse': - loss_fn = WeightedMSELoss() - elif string_description in ['combined_mse_pearson', 'combined_weighted_mse_pearson']: - loss_fn = CombinedMSEPearsonLoss(alpha=alpha) - else: - raise ValueError(f"Unknown loss function type: {string_description}") + if model.scheduler is not None and model.config.scheduler_config.type == SchedulerType.REDUCE_ON_PLATEAU: + model.scheduler.step(avg_loss) - return loss_fn + model.config.write_model_config(config_file) def main(): @@ -163,115 +72,30 @@ def main(): if not os.path.exists(args.o): os.mkdir(args.o) - finetuning_mode = FinetuningType.from_string_description(args.f) - model, tokenizer = initialise(finetuning_mode, lora_r=args.lora_r, lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout) - - train_dataloader = prepare_data(args.i, tokenizer, True) - validation_dataloader = prepare_data(args.v, tokenizer, False) - - loss_fn = get_loss_function(args.lf, args.a) - - use_weights = False - - if 'weighted' in args.lf: - use_weights = True - summary_file = os.path.join(args.o, "summary.txt") - scheduler = None - warmup_steps = None - training_steps = None - if args.m is not None: summary = open(summary_file, 'a') - checkpoint = torch.load(args.m) - - model.load_state_dict(checkpoint["model_state_dict"]) - model.update_epoch(checkpoint["epoch"]) - - optimizer = optim.AdamW(model.parameters(), lr=args.lr) - - # TODO: Store as dataclass instead - optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - - if args.sc is not None: - if args.sc == 'cos_anneal_warmup': - warmup_steps = checkpoint["scheduler_num_warmup_steps"] - training_steps = checkpoint["scheduler_num_training_steps"] - scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, - num_training_steps=training_steps) - elif args.sc == 'reduce_on_plateau': - scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, - min_lr=5e-6) - else: - raise ValueError(f"Unrecognised scheduler: {args.sc}") - - if checkpoint["scheduler_state_dict"] is not None and scheduler is not None: - scheduler.load_state_dict(checkpoint["scheduler_state_dict"]) - for param_group in optimizer.param_groups: - param_group['lr'] = scheduler.get_last_lr()[0] - - else: + model = load_model(args.i, args.v, model_checkpoint=args.m) + elif args.c is not None: summary = open(summary_file, 'w') summary.write("epoch\taverage_train_loss\taverage_eval_loss\n") + model = load_model(args.i, args.v, config_file=args.c) + else: + raise ValueError("Model or config file must be given") - optimizer = optim.AdamW(model.parameters(), lr=args.lr) - - if args.sc is not None: - if args.sc == 'cos_anneal_warmup': - - training_steps = int(len(train_dataloader) * args.num_expected_epochs) - warmup_steps = int(0.1 * training_steps) - - scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, - num_training_steps=training_steps) - elif args.sc == 'reduce_on_plateau': - scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, - min_lr=1e-5) - else: - raise ValueError(f"Unrecognised scheduler: {args.sc}") - - starting_epoch = model.current_epoch - current_epoch = starting_epoch + current_epoch = model.config.epochs if args.m is None: - avg_loss, all_preds, all_labels = evaluate_model(model, validation_dataloader, loss_fn, weighted=use_weights) - print(f"Epoch {starting_epoch}\t- Eval loss:\t{avg_loss:.4f}") - - for i in range(args.e): - current_epoch = starting_epoch + i + 1 - print(f"LR at epoch {current_epoch}: {optimizer.param_groups[0]['lr']}") - - out_file = os.path.join(args.o, f"epoch_{current_epoch:03d}.txt") - avg_train_loss = train_model(model, train_dataloader, optimizer, loss_fn, scheduler, scheduler_type=args.sc, - weighted=use_weights) - avg_loss, all_preds, all_labels = evaluate_model(model, validation_dataloader, loss_fn, - weighted=use_weights) - print(f"Epoch {current_epoch}\t- Train loss:\t{avg_train_loss:.4f}") - print(f" \t- Eval loss:\t{avg_loss:.4f}") - summary.write(f"{current_epoch}\t{avg_train_loss:.5f}\t{avg_loss:.5f}\n") - - with open(out_file, 'w') as out: - out.write("actual\tpredicted\n") - for j, prediction in enumerate(all_preds): - label = all_labels[j] - out.write(f"{label}\t{prediction}\n") - - model.update_epoch(current_epoch) + avg_loss, all_preds, all_labels = model.evaluate_model() + print(f"Epoch {current_epoch}\t- Eval loss:\t{avg_loss:.4f}") - if scheduler is not None and args.sc == 'reduce_on_plateau': - scheduler.step(avg_loss) + finetune(model, summary, args.e, args.o) summary.close() if args.s is not None: - torch.save({"model_state_dict": model.state_dict(), - "epoch": current_epoch, - "optimizer_state_dict": optimizer.state_dict(), - "scheduler_state_dict": scheduler.state_dict(), - "scheduler_num_warmup_steps": warmup_steps, - "scheduler_num_training_steps": training_steps}, args.s) + model.save_model_checkpoint(os.path.join(args.o, "checkpoint.pt")) if __name__ == "__main__": diff --git a/mewtwo/machine_learning/transformer/hyperparameter_optimization.py b/mewtwo/machine_learning/transformer/hyperparameter_optimization.py index 1d93417..f142120 100644 --- a/mewtwo/machine_learning/transformer/hyperparameter_optimization.py +++ b/mewtwo/machine_learning/transformer/hyperparameter_optimization.py @@ -1,5 +1,29 @@ +import os +from argparse import ArgumentParser +from mewtwo.data_processing.iterate_over_dir import iterate_over_dir +from mewtwo.machine_learning.transformer.model import load_model +from mewtwo.machine_learning.transformer.config.config_types import SchedulerType +from mewtwo.machine_learning.transformer.finetune_bert import finetune -import argparse -def parse_arguments() \ No newline at end of file +def parse_arguments(): + parser = ArgumentParser(description="Perform hyperparameter optimisation from directory of configuration files") + parser.add_argument('-c', type=str, required=True, help="Directory of model configuration files") + parser.add_argument('-i', type=str, required=True, help="Path to training data") + parser.add_argument('-v', type=str, required=True, help="Path to validation data") + parser.add_argument('-o', type=str, required=True, help="Output directory") + parser.add_argument('-e', type=int, default=20, help="Training epochs") + args = parser.parse_args() + + return args + + +def main(): + args = parse_arguments() + for file_name, file_path in iterate_over_dir(args.c, extension='.config'): + model = load_model(args.i, args.v, config_file=file_path) + summary_file = os.path.join(args.o, "summary.txt") + + with open(summary_file, 'w') as summary: + finetune(model, summary, args.e, args.o) diff --git a/mewtwo/machine_learning/transformer/load_model.py b/mewtwo/machine_learning/transformer/load_model.py deleted file mode 100644 index c178628..0000000 --- a/mewtwo/machine_learning/transformer/load_model.py +++ /dev/null @@ -1,45 +0,0 @@ -import argparse -import os -from enum import Enum - -import torch -import torch.optim as optim -import torch.nn as nn -from torch.utils.data import DataLoader -from torch.optim.lr_scheduler import ReduceLROnPlateau -from transformers import AutoTokenizer, get_cosine_schedule_with_warmup - -from mewtwo.external_code.dnabert.bert_layers import BertModel -from mewtwo.machine_learning.transformer.dnabert_regressor import DNABERTRegressor -from mewtwo.machine_learning.transformer.regressor_dataset import RegressionDataset -from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data -from mewtwo.machine_learning.data_preparation.calculate_sample_weights import get_sample_weights -from mewtwo.machine_learning.transformer.loss_functions import CombinedMSEPearsonLoss, WeightedMSELoss -from mewtwo.machine_learning.transformer.config.config_types import FinetuningType -from mewtwo.parsers.parse_model_config import ModelConfig - - -def initialise(finetuning_mode: FinetuningType, dropout, adapter_config=None): - tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) - base_model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") - - if finetuning_mode == FinetuningType.LINEAR_HEAD or finetuning_mode == FinetuningType.ADAPTER: - for param in base_model.parameters(): - param.requires_grad = False - - if finetuning_mode == FinetuningType.ADAPTER: - assert adapter_config is not None - model = DNABERTRegressor(base_model, use_adapters=True, lora_r=adapter_config.rank, - lora_alpha=adapter_config.alpha, - lora_dropout=adapter_config.dropout) - else: - model = DNABERTRegressor(base_model, use_adapters=False) - - return model, tokenizer - - -def load_model(config_file, model_checkpoint=None): - model_config = ModelConfig.from_file(config_file) - model = initialise(model_config.finetuning_mode, model_config.adapter_config) - - diff --git a/mewtwo/machine_learning/transformer/loss_functions.py b/mewtwo/machine_learning/transformer/loss_functions.py index 69635b3..7f10db8 100644 --- a/mewtwo/machine_learning/transformer/loss_functions.py +++ b/mewtwo/machine_learning/transformer/loss_functions.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from mewtwo.machine_learning.transformer.config.config_types import LossFunctionType +from mewtwo.parsers.parse_model_config import LossFunctionConfig def weighted_mse_loss(preds, targets, weights): @@ -115,12 +116,11 @@ def forward(self, preds, targets): LossFunctionType.SPEARMAN: CombinedMSESpearmanLoss} -def get_loss_function(string_description: str, alpha: float = 0.5): - loss_fn_type = LossFunctionType.from_string_description(string_description) - loss_fn = TYPE_TO_LOSS_FN[loss_fn_type] - if loss_fn_type in LossFunctionType.NEEDS_ALPHA: - loss_fn_instance = loss_fn(alpha=alpha) - elif loss_fn.type in LossFunctionType.CORRELATION_ONLY: +def get_loss_function(config: LossFunctionConfig): + loss_fn = TYPE_TO_LOSS_FN[config.type] + if config.type in LossFunctionType.NEEDS_ALPHA: + loss_fn_instance = loss_fn(alpha=config.alpha) + elif config.type in LossFunctionType.CORRELATION_ONLY: loss_fn_instance = loss_fn(alpha=0.0) else: loss_fn_instance = loss_fn() diff --git a/mewtwo/machine_learning/transformer/model.py b/mewtwo/machine_learning/transformer/model.py new file mode 100644 index 0000000..f5454ef --- /dev/null +++ b/mewtwo/machine_learning/transformer/model.py @@ -0,0 +1,185 @@ +from dataclasses import dataclass +from typing import Union + +import torch +import torch.optim as optim +import torch.nn as nn + +from torch.utils.data import DataLoader +from torch.optim.lr_scheduler import ReduceLROnPlateau +from transformers import AutoTokenizer, get_cosine_schedule_with_warmup + +from mewtwo.external_code.dnabert.bert_layers import BertModel +from mewtwo.machine_learning.transformer.dnabert_regressor import DNABERTRegressor +from mewtwo.machine_learning.transformer.regressor_dataset import RegressionDataset +from mewtwo.parsers.parse_dnabert_data import parse_dnabert_data +from mewtwo.machine_learning.data_preparation.calculate_sample_weights import get_sample_weights +from mewtwo.machine_learning.transformer.loss_functions import get_loss_function, WeightedMSELoss, \ + CombinedMSEPearsonLoss, CombinedMSESpearmanLoss +from mewtwo.machine_learning.transformer.config.config_types import FinetuningType, SchedulerType, LossFunctionType +from mewtwo.parsers.parse_model_config import ModelConfig + + +@dataclass +class Model: + model: DNABERTRegressor + train_dataloader: DataLoader + eval_dataloader: DataLoader + tokenizer: AutoTokenizer + optimizer: optim.AdamW + scheduler: Union[torch.optim.lr_scheduler.LambdaLR, ReduceLROnPlateau] + loss_function: Union[nn.MSELoss, WeightedMSELoss, CombinedMSEPearsonLoss, CombinedMSESpearmanLoss] + config: ModelConfig + + def save_model_checkpoint(self, out_file): + torch.save({"model_state_dict": self.model.state_dict(), + "config": self.config, + + + "optimizer_state_dict": self.optimizer.state_dict(), + "scheduler_state_dict": self.scheduler.state_dict()} + , out_file) + + def update_epoch(self, epoch): + self.config.epochs = epoch + + def train_model(self, device='cpu'): + self.model.train() # set to training mode + self.model.to(device) + + total_loss = 0.0 + for batch in self.train_dataloader: + input_ids = batch["input_ids"].to(device) + attention_mask = batch["attention_mask"].to(device) + labels = batch["labels"].to(device) + weights = batch["weights"].to(device) + + self.optimizer.zero_grad() + outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) + + if self.config.loss_function_config.type in LossFunctionType.WEIGHTED: + loss = self.loss_function(outputs, labels, weights) + else: + loss = self.loss_function(outputs, labels) + + loss.backward() + self.optimizer.step() + if self.scheduler is not None: + if self.config.scheduler_config.type == SchedulerType.COS_ANNEAL_WARMUP: + self.scheduler.step() + + total_loss += loss.item() + + avg_loss = total_loss / len(self.train_dataloader) + return avg_loss + + def evaluate_model(self, device="cpu"): + self.model.eval() # Set to evaluation mode + total_loss = 0.0 + all_preds = [] + all_labels = [] + + with torch.no_grad(): + for batch in self.eval_dataloader: + input_ids = batch["input_ids"].to(device) + attention_mask = batch["attention_mask"].to(device) + labels = batch["labels"].to(device) + weights = batch["weights"].to(device) + + outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) + if self.config.loss_function_config.type in LossFunctionType.WEIGHTED: + loss = self.loss_function(outputs, labels, weights) + else: + loss = self.loss_function(outputs, labels) + + total_loss += loss.item() + all_preds.extend(outputs.cpu().numpy()) + all_labels.extend(labels.cpu().numpy()) + + avg_loss = total_loss / len(self.eval_dataloader) + + return avg_loss, all_preds, all_labels + + +def initialise(finetuning_mode: FinetuningType, dropout, adapter_config): + tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) + base_model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") + + if finetuning_mode == FinetuningType.LINEAR_HEAD or finetuning_mode == FinetuningType.ADAPTER: + for param in base_model.parameters(): + param.requires_grad = False + + if finetuning_mode == FinetuningType.ADAPTER: + assert adapter_config is not None + model = DNABERTRegressor(base_model, dropout=dropout, use_adapters=True, lora_r=adapter_config.rank, + lora_alpha=adapter_config.alpha, + lora_dropout=adapter_config.dropout) + else: + model = DNABERTRegressor(base_model, dropout=dropout, use_adapters=False) + + return model, tokenizer + + +def prepare_data(input_file: str, tokenizer: AutoTokenizer, shuffle: bool, batch_size: int = 5) \ + -> DataLoader: + sequences, labels = parse_dnabert_data(input_file) + sample_weights = get_sample_weights(labels) + dataset = RegressionDataset(sequences, labels, sample_weights, tokenizer) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle) + + return dataloader + + +def load_model(input_training_data, input_validation_data, config_file=None, model_checkpoint=None): + checkpoint = None + if model_checkpoint is not None: + if config_file: + print("Warning: Model checkpoint and config file have been given. Config file is ignored.") + + checkpoint = torch.load(model_checkpoint) + model_config = checkpoint["config"] + + elif config_file is not None: + model_config = ModelConfig.from_file(config_file) + + else: + raise ValueError("Config file or model checkpoint must be given") + + model, tokenizer = initialise(model_config.finetuning_mode, model_config.hidden_layer_dropout, + model_config.adapter_config) + + train_dataloader = prepare_data(input_training_data, tokenizer, True, batch_size=model_config.batch_size) + eval_dataloader = prepare_data(input_validation_data, tokenizer, True, batch_size=model_config.batch_size) + + if checkpoint is not None: + model.load_state_dict(checkpoint["model_state_dict"]) + + optimizer = optim.AdamW(model.parameters(), lr=model_config.learning_rate) + optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + + else: + optimizer = optim.AdamW(model.parameters(), lr=model_config.learning_rate) + + if model_config.scheduler_config.type == SchedulerType.COS_ANNEAL_WARMUP: + warmup_steps = model_config.scheduler_config.warmup_epochs * len(train_dataloader) + training_steps = model_config.scheduler_config.training_epochs * len(train_dataloader) + + scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, + num_training_steps=training_steps) + elif model_config.scheduler_config.type == SchedulerType.REDUCE_ON_PLATEAU: + scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, + min_lr=5e-6) + + else: + raise ValueError(f"Unknown scheduler type: {model_config.scheduler_config.type.name}") + + if checkpoint is not None: + scheduler.load_state_dict(checkpoint["scheduler_state_dict"]) + for param_group in optimizer.param_groups: + param_group['lr'] = scheduler.get_last_lr()[0] + + loss_function = get_loss_function(model_config.loss_function_config) + training_model = Model(model, train_dataloader, eval_dataloader, tokenizer, optimizer, scheduler, loss_function, + model_config) + + return training_model diff --git a/mewtwo/parsers/parse_model_config.py b/mewtwo/parsers/parse_model_config.py index 564172b..4f79c51 100644 --- a/mewtwo/parsers/parse_model_config.py +++ b/mewtwo/parsers/parse_model_config.py @@ -107,8 +107,8 @@ def __eq__(self, other): @classmethod def from_file(cls, input_file) -> Union["SchedulerConfig", None]: scheduler_type = None - training_steps = None - warmup_steps = None + training_epochs = None + warmup_epochs = None with open(input_file, 'r') as model_config: for line in model_config: @@ -116,15 +116,15 @@ def from_file(cls, input_file) -> Union["SchedulerConfig", None]: field, value = line.split('\t') if field == "scheduler": scheduler_type = SchedulerType[value] - if field == "scheduler_training_steps": - training_steps = int(value) - if field == "scheduler_warmup_steps": - warmup_steps = int(value) + if field == "scheduler_training_epochs": + training_epochs = int(value) + if field == "scheduler_warmup_epochs": + warmup_epochs = int(value) if scheduler_type is None: return None else: - return SchedulerConfig(scheduler_type, training_steps, warmup_steps) + return SchedulerConfig(scheduler_type, training_epochs, warmup_epochs) @dataclass @@ -151,6 +151,7 @@ def __eq__(self, other): return False def write_model_config(self, out_file): + assert out_file.endswith('.config') with open(out_file, 'w') as out: out.write(f"finetuning_mode\t{self.finetuning_mode.name}\n") @@ -171,9 +172,9 @@ def write_model_config(self, out_file): if self.scheduler_config is not None: out.write(f"scheduler\t{self.scheduler_config.type.name}\n") if self.scheduler_config.training_epochs is not None: - out.write(f"scheduler_training_steps\t{self.scheduler_config.training_epochs}\n") + out.write(f"scheduler_training_epochs\t{self.scheduler_config.training_epochs}\n") if self.scheduler_config.warmup_epochs is not None: - out.write(f"scheduler_warmup_steps\t{self.scheduler_config.warmup_epochs}\n") + out.write(f"scheduler_warmup_epochs\t{self.scheduler_config.warmup_epochs}\n") @classmethod def from_file(cls, input_file): diff --git a/mewtwo/writers/write_model_settings.py b/mewtwo/writers/write_model_settings.py index 7e09ebd..5afd7c7 100644 --- a/mewtwo/writers/write_model_settings.py +++ b/mewtwo/writers/write_model_settings.py @@ -19,13 +19,13 @@ def get_hyperoptimization_configs(): batch_sizes = [4, 8] learning_rates = [1e-5, 5e-5, 1e-4, 5e-4] - hidden_layer_dropouts = [0.1] + hidden_layer_dropouts = [0.1, 0.2] for batch_size in batch_sizes: for finetuning_type in FinetuningType: if finetuning_type.name == "ADAPTER": lora_rank_options = [4] - lora_dropout_options = [0.1] + lora_dropout_options = [0.05] else: lora_rank_options = [None] lora_dropout_options = [None] @@ -89,7 +89,7 @@ def main(): model_configs = get_hyperoptimization_configs() for i, model_config in enumerate(model_configs): - out_file = os.path.join(args.o, f"config_{i + 1:03}") + out_file = os.path.join(args.o, f"model_{i + 1:03}.config") model_config.write_model_config(out_file) From 7e1e2ac224935bd155be255ad842ad3e09fd0f8a Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Tue, 3 Jun 2025 10:47:17 +0200 Subject: [PATCH 37/41] Refactor finetuning code (untested) --- .../transformer/finetune_bert.py | 19 ++++++-- .../hyperparameter_optimization.py | 46 +++++++++++++++++-- mewtwo/writers/write_model_settings.py | 6 +-- 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/mewtwo/machine_learning/transformer/finetune_bert.py b/mewtwo/machine_learning/transformer/finetune_bert.py index 3e86d08..9da001f 100644 --- a/mewtwo/machine_learning/transformer/finetune_bert.py +++ b/mewtwo/machine_learning/transformer/finetune_bert.py @@ -4,6 +4,8 @@ from mewtwo.machine_learning.transformer.model import load_model from mewtwo.machine_learning.transformer.config.config_types import SchedulerType +from scipy.stats import pearsonr, spearmanr + def parse_arguments() -> argparse.Namespace: parser = argparse.ArgumentParser() @@ -32,7 +34,9 @@ def parse_arguments() -> argparse.Namespace: return args -def finetune(model, summary, epochs, out_dir): +def finetune(model, summary, epochs, out_dir, header=False): + if header: + summary.write("epoch\taverage_train_loss\taverage_eval_loss\tpearsonr\tspearmanr\n") config_file = os.path.join(out_dir, "model.config") current_epoch = model.config.epochs @@ -46,10 +50,14 @@ def finetune(model, summary, epochs, out_dir): avg_train_loss = model.train_model() avg_loss, all_preds, all_labels = model.evaluate_model() + pearson = pearsonr(all_labels, all_preds) + spearman = spearmanr(all_labels, all_preds) + print(f"Epoch {current_epoch}\t- Train loss:\t{avg_train_loss:.4f}") print(f" \t- Eval loss:\t{avg_loss:.4f}") - summary.write(f"{current_epoch}\t{avg_train_loss:.5f}\t{avg_loss:.5f}\n") + print(f" \t- PearsonR:\t{pearson:.4f}") + print(f" \t- SpearmanR:\t{spearman:.4f}") with open(out_file, 'w') as out: out.write("actual\tpredicted\n") @@ -57,6 +65,8 @@ def finetune(model, summary, epochs, out_dir): label = all_labels[j] out.write(f"{label}\t{prediction}\n") + summary.write(f"{current_epoch}\t{avg_train_loss:.5f}\t{avg_loss:.5f}\t{pearson:.5f}\t{spearman:.5f}\n") + model.update_epoch(current_epoch) if model.scheduler is not None and model.config.scheduler_config.type == SchedulerType.REDUCE_ON_PLATEAU: @@ -76,10 +86,11 @@ def main(): if args.m is not None: summary = open(summary_file, 'a') + write_header = False model = load_model(args.i, args.v, model_checkpoint=args.m) elif args.c is not None: summary = open(summary_file, 'w') - summary.write("epoch\taverage_train_loss\taverage_eval_loss\n") + write_header = True model = load_model(args.i, args.v, config_file=args.c) else: raise ValueError("Model or config file must be given") @@ -90,7 +101,7 @@ def main(): avg_loss, all_preds, all_labels = model.evaluate_model() print(f"Epoch {current_epoch}\t- Eval loss:\t{avg_loss:.4f}") - finetune(model, summary, args.e, args.o) + finetune(model, summary, args.e, args.o, header=write_header) summary.close() diff --git a/mewtwo/machine_learning/transformer/hyperparameter_optimization.py b/mewtwo/machine_learning/transformer/hyperparameter_optimization.py index f142120..79ca8b7 100644 --- a/mewtwo/machine_learning/transformer/hyperparameter_optimization.py +++ b/mewtwo/machine_learning/transformer/hyperparameter_optimization.py @@ -1,9 +1,9 @@ import os from argparse import ArgumentParser +from shutil import copy from mewtwo.data_processing.iterate_over_dir import iterate_over_dir from mewtwo.machine_learning.transformer.model import load_model -from mewtwo.machine_learning.transformer.config.config_types import SchedulerType from mewtwo.machine_learning.transformer.finetune_bert import finetune @@ -14,6 +14,7 @@ def parse_arguments(): parser.add_argument('-v', type=str, required=True, help="Path to validation data") parser.add_argument('-o', type=str, required=True, help="Output directory") parser.add_argument('-e', type=int, default=20, help="Training epochs") + parser.add_argument('-f', action="store_true", help="If given, save failed contigs to folder named 'failed'") args = parser.parse_args() return args @@ -21,9 +22,44 @@ def parse_arguments(): def main(): args = parse_arguments() + if not os.path.exists(args.o): + os.mkdir(args.o) + + failed_configs = [] + for file_name, file_path in iterate_over_dir(args.c, extension='.config'): - model = load_model(args.i, args.v, config_file=file_path) - summary_file = os.path.join(args.o, "summary.txt") - with open(summary_file, 'w') as summary: - finetune(model, summary, args.e, args.o) + out_dir = os.path.join(args.o, file_name) + if not os.path.exists(out_dir): + os.mkdir(out_dir) + + try: + model = load_model(args.i, args.v, config_file=file_path) + summary_file = os.path.join(out_dir, "summary.txt") + + with open(summary_file, 'w') as summary: + finetune(model, summary, args.e, out_dir, header=True) + + except Exception as e: + f"Could not train model with config {file_name}: {e}" + failed_configs.append((file_path, file_name)) + + if args.f and failed_configs: + failed_dir = os.path.join(args.o, "failed") + if not os.path.exists(failed_dir): + os.mkdir(failed_dir) + + for file_path, file_name in failed_configs: + + failed_path = os.path.join(failed_dir, f"{file_name}.config") + + if os.path.exists(failed_path): + os.remove(failed_path) + + copy(file_path, failed_path) + +if __name__ == "__main__": + main() + + + diff --git a/mewtwo/writers/write_model_settings.py b/mewtwo/writers/write_model_settings.py index 5afd7c7..59f504c 100644 --- a/mewtwo/writers/write_model_settings.py +++ b/mewtwo/writers/write_model_settings.py @@ -18,7 +18,7 @@ def get_hyperoptimization_configs(): model_configs = [] batch_sizes = [4, 8] - learning_rates = [1e-5, 5e-5, 1e-4, 5e-4] + learning_rates = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3] hidden_layer_dropouts = [0.1, 0.2] for batch_size in batch_sizes: @@ -45,8 +45,8 @@ def get_hyperoptimization_configs(): for scheduler_option in [s for s in SchedulerType] + [None]: if scheduler_option == SchedulerType.COS_ANNEAL_WARMUP: - training_epochs = 20 - warmup_epochs_options = [10] + training_epochs = 10 + warmup_epochs_options = [2] else: training_epochs = None warmup_epochs_options = [None] From 324073cc472fd6e57a3afae658261e285b490124 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 12 Jun 2025 09:10:19 +0200 Subject: [PATCH 38/41] Add script to find best performing model from a folder --- .../analysis/get_performance_transformer.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 mewtwo/machine_learning/analysis/get_performance_transformer.py diff --git a/mewtwo/machine_learning/analysis/get_performance_transformer.py b/mewtwo/machine_learning/analysis/get_performance_transformer.py new file mode 100644 index 0000000..fcaff54 --- /dev/null +++ b/mewtwo/machine_learning/analysis/get_performance_transformer.py @@ -0,0 +1,22 @@ +import os +from sys import argv + +from mewtwo.data_processing.iterate_over_dir import iterate_over_dir +from mewtwo.parsers.tabular import Tabular + + +def sort_models_by_performance(config_dir): + models_and_performances = [] + for model_name, folder_path in iterate_over_dir(config_dir, get_dirs=True): + summary_file = os.path.join(folder_path, "summary.txt") + summary_data = Tabular(summary_file, [0]) + best_performance = max(summary_data.get_column("pearsonr")) + models_and_performances.append((model_name, float(best_performance))) + + models_and_performances.sort(key=lambda x: x[1], reverse=True) + for model, performance in models_and_performances: + print(f"{model}\t{performance:.4f}\n") + + +if __name__ == "__main__": + sort_models_by_performance(argv[1]) From cd72a89214576535c9c8664b741f1bc5a6176782 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Thu, 12 Jun 2025 09:12:14 +0200 Subject: [PATCH 39/41] Add custom reduce on plateau scheduler with warmup and early stopping --- .../transformer/config/config_types.py | 16 +- .../transformer/finetune_bert.py | 59 ++++++- .../hyperparameter_optimization.py | 4 +- mewtwo/machine_learning/transformer/model.py | 22 ++- .../transformer/schedulers.py | 62 +++++++- mewtwo/parsers/parse_model_config.py | 74 ++++++++- mewtwo/writers/write_model_settings.py | 144 +++++++++++++----- 7 files changed, 314 insertions(+), 67 deletions(-) diff --git a/mewtwo/machine_learning/transformer/config/config_types.py b/mewtwo/machine_learning/transformer/config/config_types.py index 90de47a..fed9d49 100644 --- a/mewtwo/machine_learning/transformer/config/config_types.py +++ b/mewtwo/machine_learning/transformer/config/config_types.py @@ -1,6 +1,15 @@ from enum import Enum, Flag +class EarlyStoppingMetricType(Flag): + EVAL_LOSS = 1 + SPEARMAN_R = 2 + PEARSON_R = 4 + + MAX_METRICS = PEARSON_R | SPEARMAN_R + MIN_METRICS = EVAL_LOSS + + class LossFunctionType(Flag): MSE = 1 PEARSON = 2 @@ -30,9 +39,14 @@ def from_string_description(string_description) -> "FinetuningType": return FinetuningType[string_description.upper()] -class SchedulerType(Enum): +class SchedulerType(Flag): REDUCE_ON_PLATEAU = 1 COS_ANNEAL_WARMUP = 2 # Cosine annealing with warmup + REDUCE_ON_PLATEAU_WARMUP = 4 + WARMUP_ONLY = 8 + + WARMUP_SCHEDULERS = COS_ANNEAL_WARMUP | REDUCE_ON_PLATEAU_WARMUP | WARMUP_ONLY + REDUCE_ON_PLATEAU_SCHEDULERS = REDUCE_ON_PLATEAU | REDUCE_ON_PLATEAU_WARMUP @staticmethod def from_string_description(string_description) -> "SchedulerType": diff --git a/mewtwo/machine_learning/transformer/finetune_bert.py b/mewtwo/machine_learning/transformer/finetune_bert.py index 9da001f..74d9dbd 100644 --- a/mewtwo/machine_learning/transformer/finetune_bert.py +++ b/mewtwo/machine_learning/transformer/finetune_bert.py @@ -2,7 +2,7 @@ import os from mewtwo.machine_learning.transformer.model import load_model -from mewtwo.machine_learning.transformer.config.config_types import SchedulerType +from mewtwo.machine_learning.transformer.config.config_types import SchedulerType, EarlyStoppingMetricType from scipy.stats import pearsonr, spearmanr @@ -34,15 +34,49 @@ def parse_arguments() -> argparse.Namespace: return args +def metric_has_improved(old_metric, new_metric, metric_type): + if metric_type in EarlyStoppingMetricType.MAX_METRICS: + if new_metric > old_metric: + return True + else: + return False + elif metric_type in EarlyStoppingMetricType.MIN_METRICS: + if new_metric < old_metric: + return True + else: + return False + else: + raise ValueError(f"Unrecognised early stopping metric: {metric_type.name}") + + +def get_metric(eval_loss, pearson, spearman, metric_type): + if metric_type == EarlyStoppingMetricType.PEARSON_R: + return pearson + elif metric_type == EarlyStoppingMetricType.SPEARMAN_R: + return spearman + elif metric_type == EarlyStoppingMetricType.EVAL_LOSS: + return eval_loss + else: + raise ValueError(f"Unrecognised early stopping metric: {metric_type.name}") + + def finetune(model, summary, epochs, out_dir, header=False): if header: summary.write("epoch\taverage_train_loss\taverage_eval_loss\tpearsonr\tspearmanr\n") config_file = os.path.join(out_dir, "model.config") - current_epoch = model.config.epochs + starting_epoch = model.config.epochs + epochs_without_improvement = 0 + + best_model_path = os.path.join(out_dir, "best_checkpoint.pt") + if model.config.early_stopping_config and \ + model.config.early_stopping_config.metric in EarlyStoppingMetricType.MAX_METRICS: + best_metric = -1.1 + else: + best_metric = 1.1 for i in range(epochs): - current_epoch += i + 1 + current_epoch = starting_epoch + i + 1 print(f"LR at epoch {current_epoch}: {model.optimizer.param_groups[0]['lr']}") out_file = os.path.join(out_dir, f"epoch_{current_epoch:03d}.txt") @@ -50,8 +84,8 @@ def finetune(model, summary, epochs, out_dir, header=False): avg_train_loss = model.train_model() avg_loss, all_preds, all_labels = model.evaluate_model() - pearson = pearsonr(all_labels, all_preds) - spearman = spearmanr(all_labels, all_preds) + pearson = pearsonr(all_labels, all_preds).statistic + spearman = spearmanr(all_labels, all_preds).statistic print(f"Epoch {current_epoch}\t- Train loss:\t{avg_train_loss:.4f}") print(f" \t- Eval loss:\t{avg_loss:.4f}") @@ -69,9 +103,22 @@ def finetune(model, summary, epochs, out_dir, header=False): model.update_epoch(current_epoch) - if model.scheduler is not None and model.config.scheduler_config.type == SchedulerType.REDUCE_ON_PLATEAU: + if model.scheduler is not None and \ + model.config.scheduler_config.type in SchedulerType.REDUCE_ON_PLATEAU_SCHEDULERS: model.scheduler.step(avg_loss) + if model.config.early_stopping_config is not None: + metric = get_metric(avg_loss, pearson, spearman, model.config.early_stopping_config.metric) + if metric_has_improved(best_metric, metric, model.config.early_stopping_config.metric): + best_metric = metric + epochs_without_improvement = 0 + model.save_model_checkpoint(best_model_path) + else: + epochs_without_improvement += 1 + if epochs_without_improvement >= model.config.early_stopping_config.patience: + print(f"Early stopping at epoch {current_epoch}") + break + model.config.write_model_config(config_file) diff --git a/mewtwo/machine_learning/transformer/hyperparameter_optimization.py b/mewtwo/machine_learning/transformer/hyperparameter_optimization.py index 79ca8b7..347463e 100644 --- a/mewtwo/machine_learning/transformer/hyperparameter_optimization.py +++ b/mewtwo/machine_learning/transformer/hyperparameter_optimization.py @@ -58,8 +58,6 @@ def main(): copy(file_path, failed_path) + if __name__ == "__main__": main() - - - diff --git a/mewtwo/machine_learning/transformer/model.py b/mewtwo/machine_learning/transformer/model.py index f5454ef..d949e6b 100644 --- a/mewtwo/machine_learning/transformer/model.py +++ b/mewtwo/machine_learning/transformer/model.py @@ -18,6 +18,7 @@ CombinedMSEPearsonLoss, CombinedMSESpearmanLoss from mewtwo.machine_learning.transformer.config.config_types import FinetuningType, SchedulerType, LossFunctionType from mewtwo.parsers.parse_model_config import ModelConfig +from mewtwo.machine_learning.transformer.schedulers import WarmupReduceOnPlateau @dataclass @@ -27,7 +28,7 @@ class Model: eval_dataloader: DataLoader tokenizer: AutoTokenizer optimizer: optim.AdamW - scheduler: Union[torch.optim.lr_scheduler.LambdaLR, ReduceLROnPlateau] + scheduler: Union[torch.optim.lr_scheduler.LambdaLR, ReduceLROnPlateau, WarmupReduceOnPlateau] loss_function: Union[nn.MSELoss, WeightedMSELoss, CombinedMSEPearsonLoss, CombinedMSESpearmanLoss] config: ModelConfig @@ -37,8 +38,7 @@ def save_model_checkpoint(self, out_file): "optimizer_state_dict": self.optimizer.state_dict(), - "scheduler_state_dict": self.scheduler.state_dict()} - , out_file) + "scheduler_state_dict": self.scheduler.state_dict()}, out_file) def update_epoch(self, epoch): self.config.epochs = epoch @@ -65,7 +65,7 @@ def train_model(self, device='cpu'): loss.backward() self.optimizer.step() if self.scheduler is not None: - if self.config.scheduler_config.type == SchedulerType.COS_ANNEAL_WARMUP: + if self.config.scheduler_config.type in SchedulerType.WARMUP_SCHEDULERS: self.scheduler.step() total_loss += loss.item() @@ -132,10 +132,12 @@ def prepare_data(input_file: str, tokenizer: AutoTokenizer, shuffle: bool, batch def load_model(input_training_data, input_validation_data, config_file=None, model_checkpoint=None): checkpoint = None + load_from_checkpoint = False if model_checkpoint is not None: if config_file: print("Warning: Model checkpoint and config file have been given. Config file is ignored.") + load_from_checkpoint = True checkpoint = torch.load(model_checkpoint) model_config = checkpoint["config"] @@ -169,6 +171,18 @@ def load_model(input_training_data, input_validation_data, config_file=None, mod elif model_config.scheduler_config.type == SchedulerType.REDUCE_ON_PLATEAU: scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, min_lr=5e-6) + elif model_config.scheduler_config.type == SchedulerType.REDUCE_ON_PLATEAU_WARMUP: + warmup_steps = model_config.scheduler_config.warmup_epochs * len(train_dataloader) + scheduler = WarmupReduceOnPlateau( + optimizer, + warmup_steps=warmup_steps, + plateau_scheduler_kwargs={ + "mode": "min", + "patience": model_config.scheduler_config.plateau_patience, + "factor": model_config.scheduler_config.factor, + "min_lr": 5e-6 + }, load_from_checkpoint=load_from_checkpoint + ) else: raise ValueError(f"Unknown scheduler type: {model_config.scheduler_config.type.name}") diff --git a/mewtwo/machine_learning/transformer/schedulers.py b/mewtwo/machine_learning/transformer/schedulers.py index 7a32e9f..eea3464 100644 --- a/mewtwo/machine_learning/transformer/schedulers.py +++ b/mewtwo/machine_learning/transformer/schedulers.py @@ -1,6 +1,60 @@ -from enum import Enum +from torch.optim.lr_scheduler import ReduceLROnPlateau -class SchedulerType(Enum): - REDUCE_ON_PLATEAU = 1 - COSINE_ANNEALING_WARM_START = 2 \ No newline at end of file +class WarmupReduceOnPlateau: + def __init__(self, optimizer, warmup_steps, plateau_scheduler_kwargs, load_from_checkpoint=False): + self.optimizer = optimizer + self.warmup_steps = warmup_steps + self.current_step = 0 + self.base_lrs = [group['lr'] for group in optimizer.param_groups][:] + + if not load_from_checkpoint: + for group in optimizer.param_groups: + group['lr'] = 0.0 + + self.plateau_scheduler = ReduceLROnPlateau(optimizer, **plateau_scheduler_kwargs) + self.in_plateau_phase = False + + def step(self, metrics=None): + + if self.current_step <= self.warmup_steps: + if metrics is None: + # Warmup step + self.current_step += 1 + warmup_factor = min(self.current_step / float(self.warmup_steps), 1.0) + for i, param_group in enumerate(self.optimizer.param_groups): + param_group['lr'] = self.base_lrs[i] * warmup_factor + else: + # In warmup, but metrics passed — skip + return + else: + if not self.in_plateau_phase: + self.in_plateau_phase = True + print("Warmup complete. Switching to ReduceLROnPlateau.") + + if metrics is not None: + # Plateau step + self.current_step += 1 + self.plateau_scheduler.step(metrics) + else: + # Plateau phase, but no metric — skip + return + + def get_last_lr(self): + return [group['lr'] for group in self.optimizer.param_groups] + + def state_dict(self): + return { + 'current_step': self.current_step, + 'warmup_steps': self.warmup_steps, + 'base_lrs': self.base_lrs, + 'in_plateau_phase': self.in_plateau_phase, + 'plateau_scheduler': self.plateau_scheduler.state_dict() + } + + def load_state_dict(self, state_dict): + self.current_step = state_dict['current_step'] + self.warmup_steps = state_dict['warmup_steps'] + self.base_lrs = state_dict['base_lrs'] + self.in_plateau_phase = state_dict['in_plateau_phase'] + self.plateau_scheduler.load_state_dict(state_dict['plateau_scheduler']) diff --git a/mewtwo/parsers/parse_model_config.py b/mewtwo/parsers/parse_model_config.py index 4f79c51..deea2bf 100644 --- a/mewtwo/parsers/parse_model_config.py +++ b/mewtwo/parsers/parse_model_config.py @@ -3,7 +3,8 @@ from math import isclose -from mewtwo.machine_learning.transformer.config.config_types import LossFunctionType, FinetuningType, SchedulerType +from mewtwo.machine_learning.transformer.config.config_types import EarlyStoppingMetricType, LossFunctionType, \ + FinetuningType, SchedulerType @dataclass @@ -87,13 +88,20 @@ class SchedulerConfig: type: SchedulerType training_epochs: Optional[int] = None warmup_epochs: Optional[int] = None + plateau_patience: Optional[int] = None + factor: Optional[float] = None def __post_init__(self): - if self.type == SchedulerType.COS_ANNEAL_WARMUP: - assert self.training_epochs is not None + if self.type in SchedulerType.WARMUP_SCHEDULERS: assert self.warmup_epochs is not None - if self.warmup_epochs > self.training_epochs: - raise ValueError("Number of warmup steps is greater than the number of training steps.") + + if self.type == SchedulerType.COS_ANNEAL_WARMUP: + assert self.training_epochs is not None + if self.warmup_epochs > self.training_epochs: + raise ValueError("Number of warmup steps is greater than the number of training steps.") + if self.type in SchedulerType.REDUCE_ON_PLATEAU_SCHEDULERS: + assert self.plateau_patience is not None + assert self.factor is not None def __eq__(self, other): if type(self) == type(other) and \ @@ -109,6 +117,8 @@ def from_file(cls, input_file) -> Union["SchedulerConfig", None]: scheduler_type = None training_epochs = None warmup_epochs = None + plateau_patience = None + factor = None with open(input_file, 'r') as model_config: for line in model_config: @@ -120,11 +130,49 @@ def from_file(cls, input_file) -> Union["SchedulerConfig", None]: training_epochs = int(value) if field == "scheduler_warmup_epochs": warmup_epochs = int(value) + if field == "plateau_patience": + plateau_patience = int(value) + if field == "factor": + factor = float(value) if scheduler_type is None: return None else: - return SchedulerConfig(scheduler_type, training_epochs, warmup_epochs) + return SchedulerConfig(scheduler_type, training_epochs, warmup_epochs, plateau_patience, factor) + + +@dataclass +class EarlyStoppingConfig: + metric: EarlyStoppingMetricType + patience: int + + def __post_init__(self): + assert self.metric is not None + assert self.patience is not None + + def __eq__(self, other): + if type(self) == type(other) and self.metric == other.metric and self.patience == other.patience: + return True + return False + + @classmethod + def from_file(cls, input_file) -> Union["EarlyStoppingConfig", None]: + metric = None + patience = None + + with open(input_file, 'r') as model_config: + for line in model_config: + line = line.strip() + field, value = line.split('\t') + if field == "early_stopping_patience": + metric = EarlyStoppingMetricType[value] + if field == "early_stopping_patience": + patience = int(value) + + if metric is None or patience is None: + return None + else: + return EarlyStoppingConfig(metric, patience) @dataclass @@ -135,6 +183,7 @@ class ModelConfig: loss_function_config: LossFunctionConfig epochs: int batch_size: int + early_stopping_config: Optional[EarlyStoppingConfig] = None adapter_config: Optional[AdapterConfig] = None scheduler_config: Optional[SchedulerConfig] = None @@ -144,7 +193,8 @@ def __eq__(self, other): isclose(self.hidden_layer_dropout, other.hidden_layer_dropout, rel_tol=0.01) and \ self.loss_function_config == other.loss_function_config and \ self.adapter_config == other.adapter_config and \ - self.scheduler_config == other.scheduler_config: + self.scheduler_config == other.scheduler_config and \ + self.early_stopping_config == other.early_stopping_config: return True else: @@ -175,6 +225,14 @@ def write_model_config(self, out_file): out.write(f"scheduler_training_epochs\t{self.scheduler_config.training_epochs}\n") if self.scheduler_config.warmup_epochs is not None: out.write(f"scheduler_warmup_epochs\t{self.scheduler_config.warmup_epochs}\n") + if self.scheduler_config.plateau_patience is not None: + out.write(f"plateau_patience\t{self.scheduler_config.plateau_patience}\n") + if self.scheduler_config.factor is not None: + out.write(f"factor\t{self.scheduler_config.factor}\n") + + if self.early_stopping_config is not None: + out.write(f"early_stopping_patience\t{self.early_stopping_config.patience}\n") + out.write(f"early_stopping_metric\t{self.early_stopping_config.metric.name}\n") @classmethod def from_file(cls, input_file): @@ -182,6 +240,7 @@ def from_file(cls, input_file): adapter_config = AdapterConfig.from_file(input_file) scheduler_config = SchedulerConfig.from_file(input_file) loss_function_config = LossFunctionConfig.from_file(input_file) + early_stopping_config = EarlyStoppingConfig.from_file(input_file) field_to_value = {} @@ -197,5 +256,6 @@ def from_file(cls, input_file): loss_function_config, int(field_to_value["training_epochs"]), int(field_to_value["batch_size"]), + early_stopping_config, adapter_config, scheduler_config) diff --git a/mewtwo/writers/write_model_settings.py b/mewtwo/writers/write_model_settings.py index 59f504c..afbcf24 100644 --- a/mewtwo/writers/write_model_settings.py +++ b/mewtwo/writers/write_model_settings.py @@ -2,33 +2,57 @@ from argparse import ArgumentParser import os -from mewtwo.parsers.parse_model_config import ModelConfig, AdapterConfig, SchedulerConfig, LossFunctionConfig +from mewtwo.parsers.parse_model_config import ModelConfig, AdapterConfig, SchedulerConfig, LossFunctionConfig, EarlyStoppingConfig -from mewtwo.machine_learning.transformer.config.config_types import FinetuningType, LossFunctionType, SchedulerType +from mewtwo.machine_learning.transformer.config.config_types import FinetuningType, LossFunctionType, SchedulerType, \ + EarlyStoppingMetricType def parse_arguments(): parser = ArgumentParser(description="Write config files for hyperparameter optimization") + parser.add_argument('--batch_sizes', type=int, nargs='*', default=[4, 8]) + parser.add_argument('--learning_rates', type=float, nargs='*', default=[1e-5, 5e-5, 1e-4, 5e-4, 1e-3]) + parser.add_argument('--finetuning_types', type=str, nargs='*', default=["LINEAR_HEAD", "ADAPTER"]) + parser.add_argument('--dropout', type=float, nargs='*', default=[0.1, 0.2, 0.3]) + parser.add_argument('--lora_rank', type=int, nargs='*', default=[4, 8]) + parser.add_argument('--lora_dropout', type=float, nargs='*', default=[0.0, 0.05, 0.1]) + parser.add_argument('--training_epochs', type=int, default=30) + parser.add_argument('--warmup_epochs', type=int, nargs='*', default=[3, 6, 10]) + parser.add_argument('--loss_functions', type=str, nargs='*', default=['MSE', 'MSE_PEARSON', 'MSE_SPEARMAN', + 'WEIGHTED_MSE', 'WEIGHTED_MSE_PEARSON', + "WEIGHTED_MSE_SPEARMAN", 'PEARSON', "SPEARMAN"]) + parser.add_argument('--loss_function_alpha', type=float, nargs='*', default=[0.25, 0.5, 0.75]) + parser.add_argument('--early_stopping_metrics', type=str, nargs='*', default=["PEARSON_R", "SPEARMAN_R", "EVAL_LOSS"]) + parser.add_argument('--early_stopping_patiences', type=int, nargs='*', default=[2, 3]) + parser.add_argument('--use_early_stopping', action="store_true") + parser.add_argument('--scheduler_types', type=str, nargs='*', default=["COS_ANNEAL_WARMUP", "REDUCE_ON_PLATEAU", + "REDUCE_ON_PLATEAU_WARMUP", "WARMUP_ONlY"]) + parser.add_argument('--plateau_patiences', type=int, nargs='*', default=[2, 3]) + parser.add_argument('--plateau_factors', type=float, nargs='*', default=[0.5]) parser.add_argument('-o', type=str, required=True, help="Output directory") args = parser.parse_args() return args -def get_hyperoptimization_configs(): +def get_hyperoptimization_configs(batch_sizes, learning_rates, + finetuning_types, + hidden_layer_dropouts, lora_rank, + lora_dropout, training_epochs, + warmup_epochs_options, loss_functions, loss_function_alpha, + scheduler_types, use_early_stopping, + early_stopping_metrics, early_stopping_patiences, + plateau_patiences, plateau_factors): model_configs = [] - batch_sizes = [4, 8] - learning_rates = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3] - hidden_layer_dropouts = [0.1, 0.2] - for batch_size in batch_sizes: - for finetuning_type in FinetuningType: - if finetuning_type.name == "ADAPTER": - lora_rank_options = [4] - lora_dropout_options = [0.05] - else: + for finetuning_type_str in finetuning_types: + finetuning_type = FinetuningType.from_string_description(finetuning_type_str) + if finetuning_type.name != "ADAPTER": lora_rank_options = [None] lora_dropout_options = [None] + else: + lora_rank_options = lora_rank[:] + lora_dropout_options = lora_dropout[:] for lora_r in lora_rank_options: if lora_r is not None: @@ -36,48 +60,71 @@ def get_hyperoptimization_configs(): else: lora_alpha = None - for lora_dropout in lora_dropout_options: + for lora_dropout_option in lora_dropout_options: if lora_r: - adapter_config = AdapterConfig(lora_r, lora_alpha, lora_dropout) + adapter_config = AdapterConfig(lora_r, lora_alpha, lora_dropout_option) else: adapter_config = None - for scheduler_option in [s for s in SchedulerType] + [None]: - if scheduler_option == SchedulerType.COS_ANNEAL_WARMUP: - - training_epochs = 10 - warmup_epochs_options = [2] + for scheduler_config_string in scheduler_types: + scheduler_type = SchedulerType[scheduler_config_string] + if scheduler_type not in SchedulerType.WARMUP_SCHEDULERS: + warmup_epochs = [None] else: - training_epochs = None - warmup_epochs_options = [None] - - if scheduler_option == SchedulerType.COS_ANNEAL_WARMUP: + warmup_epochs = warmup_epochs_options[:] - for warmup_epochs_option in warmup_epochs_options: - if scheduler_option is None: - scheduler_config = None - else: - scheduler_config = SchedulerConfig(scheduler_option, training_epochs, - warmup_epochs_option) - - for loss_function in LossFunctionType: - if loss_function not in LossFunctionType.USES_SPEARMAN and loss_function not in LossFunctionType.WEIGHTED: + if scheduler_type not in SchedulerType.REDUCE_ON_PLATEAU_SCHEDULERS: + plateau_patience_options = [None] + plateau_factor_options = [None] + else: + plateau_patience_options = plateau_patiences[:] + plateau_factor_options = plateau_factors[:] + + for warmup_epochs_option in warmup_epochs: + for plateau_patience in plateau_patience_options: + for plateau_factor in plateau_factor_options: + scheduler_config = SchedulerConfig(scheduler_type, training_epochs, + warmup_epochs_option, plateau_patience, + plateau_factor) + + for loss_function_str in loss_functions: + loss_function = LossFunctionType.from_string_description(loss_function_str) if loss_function not in LossFunctionType.NEEDS_ALPHA: loss_function_alpha_options = [None] else: - loss_function_alpha_options = [0.25, 0.5, 0.75] + loss_function_alpha_options = loss_function_alpha[:] for alpha_option in loss_function_alpha_options: loss_function_config = LossFunctionConfig(loss_function, alpha_option) for learning_rate in learning_rates: for hidden_layer_dropout in hidden_layer_dropouts: - model_config = ModelConfig(finetuning_type, learning_rate, - hidden_layer_dropout, - loss_function_config, 0, batch_size, - adapter_config, - scheduler_config) - model_configs.append(model_config) + early_stopping_config = None + + if not use_early_stopping: + early_stopping_metric_options = [None] + early_stopping_patience_options = [None] + + else: + early_stopping_metric_options = early_stopping_metrics[:] + early_stopping_patience_options = early_stopping_patiences[:] + + for early_stopping_metric in early_stopping_metric_options: + for early_stopping_patience in early_stopping_patience_options: + if early_stopping_metric is not None and \ + early_stopping_patience is not None: + early_stopping_config = EarlyStoppingConfig( + EarlyStoppingMetricType[early_stopping_metric], + early_stopping_patience) + model_config = ModelConfig(finetuning_type, learning_rate, + hidden_layer_dropout, + loss_function_config, 0, + batch_size, + early_stopping_config, + adapter_config, + scheduler_config) + + model_configs.append(model_config) return model_configs @@ -87,7 +134,22 @@ def main(): if not os.path.exists(args.o): os.mkdir(args.o) - model_configs = get_hyperoptimization_configs() + if args.use_early_stopping: + early_stopping_metric_options = args.early_stopping_metrics + early_stopping_patience_options = args.early_stopping_patiences + else: + early_stopping_metric_options = [None] + early_stopping_patience_options = [None] + + model_configs = get_hyperoptimization_configs(args.batch_sizes, args.learning_rates, args.finetuning_types, + args.dropout, args.lora_rank, + args.lora_dropout, args.training_epochs, + args.warmup_epochs, args.loss_functions, + args.loss_function_alpha, args.scheduler_types, + args.use_early_stopping, + early_stopping_metric_options, early_stopping_patience_options, + args.plateau_patiences, args.plateau_factors) + for i, model_config in enumerate(model_configs): out_file = os.path.join(args.o, f"model_{i + 1:03}.config") model_config.write_model_config(out_file) @@ -96,5 +158,3 @@ def main(): if __name__ == "__main__": main() - - From dc05457b01e5ba5d9ecb3c5c43372182617db057 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Mon, 16 Jun 2025 13:54:17 +0200 Subject: [PATCH 40/41] Add option to add second hidden layer in finetuning head --- mewtwo/data_processing/compute_te.py | 0 .../transformer/dnabert_regressor.py | 18 +++-- mewtwo/machine_learning/transformer/model.py | 19 +++--- mewtwo/parsers/parse_data_chen.py | 0 mewtwo/parsers/parse_model_config.py | 41 ++++++++++-- mewtwo/writers/write_model_settings.py | 66 +++++++++++-------- 6 files changed, 98 insertions(+), 46 deletions(-) create mode 100644 mewtwo/data_processing/compute_te.py create mode 100644 mewtwo/parsers/parse_data_chen.py diff --git a/mewtwo/data_processing/compute_te.py b/mewtwo/data_processing/compute_te.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/machine_learning/transformer/dnabert_regressor.py b/mewtwo/machine_learning/transformer/dnabert_regressor.py index e4df108..ef13575 100644 --- a/mewtwo/machine_learning/transformer/dnabert_regressor.py +++ b/mewtwo/machine_learning/transformer/dnabert_regressor.py @@ -1,10 +1,12 @@ +from typing import Optional + import torch.nn as nn from peft import get_peft_model, LoraConfig, TaskType class DNABERTRegressor(nn.Module): def __init__(self, base_model, dropout: float = 0.2, use_adapters=False, lora_alpha: int = 16, lora_r: int = 8, - lora_dropout: float = 0.1): + lora_dropout: float = 0.1, second_layer_dim: Optional[int] = None): super().__init__() self.base = base_model @@ -17,10 +19,17 @@ def __init__(self, base_model, dropout: float = 0.2, use_adapters=False, lora_al target_modules=["attention.self.Wqkv"]) self.base = get_peft_model(self.base, peft_config) - self.dropout = nn.Dropout(dropout) + hidden_size = self.base.base_model.config.hidden_size + + if second_layer_dim is None: - self.regressor = nn.Sequential(self.dropout, nn.Linear(self.base.base_model.config.hidden_size, 1), - nn.Sigmoid()) + self.regressor = nn.Sequential(nn.Dropout(dropout), nn.Linear(hidden_size, 1), + nn.Sigmoid()) + + else: + self.regressor = nn.Sequential(nn.Dropout(dropout), nn.Linear(hidden_size, second_layer_dim), + nn.ReLU(), nn.Dropout(dropout), nn.Linear(second_layer_dim, 1), + nn.Sigmoid()) def forward(self, input_ids, attention_mask=None): # Pass through the model @@ -32,4 +41,3 @@ def forward(self, input_ids, attention_mask=None): cls_token = last_hidden_state[:, 0] # CLS token for regression predictions = self.regressor(cls_token).squeeze(-1) return predictions - diff --git a/mewtwo/machine_learning/transformer/model.py b/mewtwo/machine_learning/transformer/model.py index d949e6b..d2aa6f6 100644 --- a/mewtwo/machine_learning/transformer/model.py +++ b/mewtwo/machine_learning/transformer/model.py @@ -101,7 +101,7 @@ def evaluate_model(self, device="cpu"): return avg_loss, all_preds, all_labels -def initialise(finetuning_mode: FinetuningType, dropout, adapter_config): +def initialise(finetuning_mode: FinetuningType, model_config): tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True) base_model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M") @@ -110,12 +110,16 @@ def initialise(finetuning_mode: FinetuningType, dropout, adapter_config): param.requires_grad = False if finetuning_mode == FinetuningType.ADAPTER: - assert adapter_config is not None - model = DNABERTRegressor(base_model, dropout=dropout, use_adapters=True, lora_r=adapter_config.rank, - lora_alpha=adapter_config.alpha, - lora_dropout=adapter_config.dropout) + assert model_config.adapter_config is not None + + model = DNABERTRegressor(base_model, dropout=model_config.hidden_layer_config.dropout, use_adapters=True, + lora_r=model_config.adapter_config.rank, + lora_alpha=model_config.adapter_config.alpha, + lora_dropout=model_config.adapter_config.dropout, + second_layer_dim=model_config.hidden_layer_config.second_layer_dim) else: - model = DNABERTRegressor(base_model, dropout=dropout, use_adapters=False) + model = DNABERTRegressor(base_model, dropout=model_config.hidden_layer_config.dropout, use_adapters=False, + second_layer_dim=model_config.hidden_layer_config.second_layer_dim) return model, tokenizer @@ -147,8 +151,7 @@ def load_model(input_training_data, input_validation_data, config_file=None, mod else: raise ValueError("Config file or model checkpoint must be given") - model, tokenizer = initialise(model_config.finetuning_mode, model_config.hidden_layer_dropout, - model_config.adapter_config) + model, tokenizer = initialise(model_config.finetuning_mode, model_config) train_dataloader = prepare_data(input_training_data, tokenizer, True, batch_size=model_config.batch_size) eval_dataloader = prepare_data(input_validation_data, tokenizer, True, batch_size=model_config.batch_size) diff --git a/mewtwo/parsers/parse_data_chen.py b/mewtwo/parsers/parse_data_chen.py new file mode 100644 index 0000000..e69de29 diff --git a/mewtwo/parsers/parse_model_config.py b/mewtwo/parsers/parse_model_config.py index deea2bf..7c51ccf 100644 --- a/mewtwo/parsers/parse_model_config.py +++ b/mewtwo/parsers/parse_model_config.py @@ -175,11 +175,41 @@ def from_file(cls, input_file) -> Union["EarlyStoppingConfig", None]: return EarlyStoppingConfig(metric, patience) +@dataclass +class HiddenLayerConfig: + dropout: float + second_layer_dim: Optional[int] + + def __eq__(self, other): + if type(self) == type(other) and isclose(self.dropout, other.dropout, rel_tol=0.01) and \ + self.second_layer_dim == other.second_layer_dim: + return True + return False + + @classmethod + def from_file(cls, input_file) -> Union["HiddenLayerConfig", None]: + dropout = None + second_layer_dim = None + + with open(input_file, 'r') as model_config: + for line in model_config: + line = line.strip() + field, value = line.split('\t') + if field == "hidden_layer_dropout": + dropout = float(value) + if field == "second_layer_dim": + second_layer_dim = int(value) + + if dropout is None: + raise ValueError("Hidden layer config must specify dropout") + else: + return HiddenLayerConfig(dropout, second_layer_dim) + @dataclass class ModelConfig: finetuning_mode: FinetuningType learning_rate: float - hidden_layer_dropout: float + hidden_layer_config: HiddenLayerConfig loss_function_config: LossFunctionConfig epochs: int batch_size: int @@ -190,7 +220,7 @@ class ModelConfig: def __eq__(self, other): if self.finetuning_mode == other.finetuning_mode and \ isclose(self.learning_rate, other.learning_rate, rel_tol=0.01) and \ - isclose(self.hidden_layer_dropout, other.hidden_layer_dropout, rel_tol=0.01) and \ + self.hidden_layer_config == other.hidden_layer_config and \ self.loss_function_config == other.loss_function_config and \ self.adapter_config == other.adapter_config and \ self.scheduler_config == other.scheduler_config and \ @@ -206,7 +236,9 @@ def write_model_config(self, out_file): with open(out_file, 'w') as out: out.write(f"finetuning_mode\t{self.finetuning_mode.name}\n") out.write(f"learning_rate\t{self.learning_rate:.10f}\n") - out.write(f"hidden_layer_dropout\t{self.hidden_layer_dropout:.2f}\n") + out.write(f"hidden_layer_dropout\t{self.hidden_layer_config.dropout:.2f}\n") + if self.hidden_layer_config.second_layer_dim: + out.write(f"second_layer_dim\t{self.hidden_layer_config.second_layer_dim}\n") out.write(f"loss_function\t{self.loss_function_config.type.name}\n") out.write(f"training_epochs\t{self.epochs}\n") out.write(f"batch_size\t{self.batch_size}\n") @@ -241,6 +273,7 @@ def from_file(cls, input_file): scheduler_config = SchedulerConfig.from_file(input_file) loss_function_config = LossFunctionConfig.from_file(input_file) early_stopping_config = EarlyStoppingConfig.from_file(input_file) + hidden_layer_config = HiddenLayerConfig.from_file(input_file) field_to_value = {} @@ -252,7 +285,7 @@ def from_file(cls, input_file): return cls(FinetuningType.from_string_description(field_to_value["finetuning_mode"]), float(field_to_value["learning_rate"]), - float(field_to_value["hidden_layer_dropout"]), + hidden_layer_config, loss_function_config, int(field_to_value["training_epochs"]), int(field_to_value["batch_size"]), diff --git a/mewtwo/writers/write_model_settings.py b/mewtwo/writers/write_model_settings.py index afbcf24..02d01e0 100644 --- a/mewtwo/writers/write_model_settings.py +++ b/mewtwo/writers/write_model_settings.py @@ -2,7 +2,8 @@ from argparse import ArgumentParser import os -from mewtwo.parsers.parse_model_config import ModelConfig, AdapterConfig, SchedulerConfig, LossFunctionConfig, EarlyStoppingConfig +from mewtwo.parsers.parse_model_config import ModelConfig, AdapterConfig, SchedulerConfig, LossFunctionConfig, \ + EarlyStoppingConfig, HiddenLayerConfig from mewtwo.machine_learning.transformer.config.config_types import FinetuningType, LossFunctionType, SchedulerType, \ EarlyStoppingMetricType @@ -29,6 +30,7 @@ def parse_arguments(): "REDUCE_ON_PLATEAU_WARMUP", "WARMUP_ONlY"]) parser.add_argument('--plateau_patiences', type=int, nargs='*', default=[2, 3]) parser.add_argument('--plateau_factors', type=float, nargs='*', default=[0.5]) + parser.add_argument('--second_layer_dim', type=int, nargs='*', default=[]) parser.add_argument('-o', type=str, required=True, help="Output directory") args = parser.parse_args() return args @@ -41,7 +43,7 @@ def get_hyperoptimization_configs(batch_sizes, learning_rates, warmup_epochs_options, loss_functions, loss_function_alpha, scheduler_types, use_early_stopping, early_stopping_metrics, early_stopping_patiences, - plateau_patiences, plateau_factors): + plateau_patiences, plateau_factors, second_layer_dims): model_configs = [] for batch_size in batch_sizes: @@ -99,32 +101,38 @@ def get_hyperoptimization_configs(batch_sizes, learning_rates, for learning_rate in learning_rates: for hidden_layer_dropout in hidden_layer_dropouts: - early_stopping_config = None - - if not use_early_stopping: - early_stopping_metric_options = [None] - early_stopping_patience_options = [None] - - else: - early_stopping_metric_options = early_stopping_metrics[:] - early_stopping_patience_options = early_stopping_patiences[:] - - for early_stopping_metric in early_stopping_metric_options: - for early_stopping_patience in early_stopping_patience_options: - if early_stopping_metric is not None and \ - early_stopping_patience is not None: - early_stopping_config = EarlyStoppingConfig( - EarlyStoppingMetricType[early_stopping_metric], - early_stopping_patience) - model_config = ModelConfig(finetuning_type, learning_rate, - hidden_layer_dropout, - loss_function_config, 0, - batch_size, - early_stopping_config, - adapter_config, - scheduler_config) - - model_configs.append(model_config) + if not second_layer_dims: + second_layer_dims = [None] + + for second_layer_dim in second_layer_dims: + hidden_layer_config = HiddenLayerConfig(hidden_layer_dropout, + second_layer_dim) + early_stopping_config = None + + if not use_early_stopping: + early_stopping_metric_options = [None] + early_stopping_patience_options = [None] + + else: + early_stopping_metric_options = early_stopping_metrics[:] + early_stopping_patience_options = early_stopping_patiences[:] + + for early_stopping_metric in early_stopping_metric_options: + for early_stopping_patience in early_stopping_patience_options: + if early_stopping_metric is not None and \ + early_stopping_patience is not None: + early_stopping_config = EarlyStoppingConfig( + EarlyStoppingMetricType[early_stopping_metric], + early_stopping_patience) + model_config = ModelConfig(finetuning_type, learning_rate, + hidden_layer_config, + loss_function_config, 0, + batch_size, + early_stopping_config, + adapter_config, + scheduler_config) + + model_configs.append(model_config) return model_configs @@ -148,7 +156,7 @@ def main(): args.loss_function_alpha, args.scheduler_types, args.use_early_stopping, early_stopping_metric_options, early_stopping_patience_options, - args.plateau_patiences, args.plateau_factors) + args.plateau_patiences, args.plateau_factors, args.second_layer_dim) for i, model_config in enumerate(model_configs): out_file = os.path.join(args.o, f"model_{i + 1:03}.config") From adda653ab04186eb469f989cf8958c67d2bdb0a0 Mon Sep 17 00:00:00 2001 From: "Terlouw, Barbara" Date: Tue, 15 Jul 2025 11:47:22 +0200 Subject: [PATCH 41/41] Improve modularity to train on data from different sources --- data/sequence_data/chen/test.txt | 146 +++ data/sequence_data/chen/train.txt | 290 ++++++ data/sequence_data/chen/validation.txt | 146 +++ data/sequence_data/choe/test.txt | 424 +++++++++ data/sequence_data/choe/train.txt | 848 ++++++++++++++++++ data/sequence_data/choe/validation.txt | 424 +++++++++ mewtwo/data_processing/compute_te.py | 14 + mewtwo/embeddings/feature_labels.py | 201 +++-- mewtwo/embeddings/sequence.py | 4 +- mewtwo/embeddings/terminator/a_tract.py | 10 +- .../embeddings/terminator/draw_terminator.py | 251 ++++++ mewtwo/embeddings/terminator/hairpin.py | 14 +- mewtwo/embeddings/terminator/terminator.py | 14 +- .../terminator/test/test_a_tract.py | 2 +- .../terminator/test/test_hairpin.py | 20 +- .../terminator/test/test_u_tract.py | 34 +- mewtwo/embeddings/terminator/u_tract.py | 25 +- mewtwo/embeddings/test/test_feature_labels.py | 182 ++-- .../data_preparation/binning.py | 2 - .../data_preparation/train_test_split.py | 10 +- .../feature_inference/infer_features_rf.py | 93 ++ .../random_forest/rf_from_data.py | 94 ++ .../random_forest/train_random_forest.py | 48 +- mewtwo/parsers/parse_data_chen.py | 65 ++ mewtwo/parsers/parse_feature_file.py | 6 +- mewtwo/parsers/parse_termite_output.py | 47 +- mewtwo/writers/write_model_settings.py | 2 +- requirements.txt | 13 + 28 files changed, 3177 insertions(+), 252 deletions(-) create mode 100644 data/sequence_data/chen/test.txt create mode 100644 data/sequence_data/chen/train.txt create mode 100644 data/sequence_data/chen/validation.txt create mode 100644 data/sequence_data/choe/test.txt create mode 100644 data/sequence_data/choe/train.txt create mode 100644 data/sequence_data/choe/validation.txt create mode 100644 mewtwo/embeddings/terminator/draw_terminator.py create mode 100644 mewtwo/machine_learning/random_forest/rf_from_data.py create mode 100644 requirements.txt diff --git a/data/sequence_data/chen/test.txt b/data/sequence_data/chen/test.txt new file mode 100644 index 0000000..6d6c457 --- /dev/null +++ b/data/sequence_data/chen/test.txt @@ -0,0 +1,146 @@ +GACGAACAATAAGGCCTCCCTAACGGGGGGCCTTTTTTATTGATAACAAAA 0.9943801281330785 +CCGGCTCATTGCAGCGAAATAATCCTCTCTTTATCTGCTATACCTGGT 0.7752808988764045 +ttcctgacttAAGCGGCGCTGGTTATCCATcggagccatc 0.8427672955974843 +accaggtataGCAGATAAAGAGAGGATTATTTCGCTGCaatgagccgg 0.6389891696750902 +TAGCGTGCTAACCACGCACGCTATTTTTTGTA 0.5305164319248826 +tacaaaaaaaGCCTCCACTGGGAGGCtttcaggcgc 0.4818652849740932 +CCTGGTAAGACGCCGCAGCGTCGCATCAGGTTTGTTGAG 0.0 +tggaaattaaTGCGCTGGCGGCAATGGCAcagcacagaa 0.0 +ttttagctatAAAAAAACCCGCCGAAGCGGGTTTTTTcgaaaattgt 0.5614035087719298 +TCGGTTACATGTTCGCATGTAACCGATTATCAAAA 0.0 +CGACGATGCCATTCGTGGCATCGTCGTTAAAATAA 0.0 +CTCGGTACCAAAGACGAACAATAAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCC 0.9960885551122585 +GGCGTGAGATTGGAATACAATTTCGCGCCTTTTGTT 0.0990990990990992 +CTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCC 0.9973830895245074 +GTTGCCATTTGCCCTCCGCTGCGGCGGGGGGCTTTTAACCGGG 0.8516320474777448 +caaccatccgAAACCGCTCTCATCCATTCGATGAGAGCGGTTTttttaattac 0.8861047835990888 +tggggagactAAGGCAGCCAGATGGCTGCCTTttttacaggt 0.8525073746312685 +AAAACTCCAGGCCGGGTACGGTGTTTTACGCCGCATCCGGCATTACAAAAT 0.0 +CCTGGTAAGACGCAGATGCGTCTTATCAGGTTTTTTTTT 0.0825688073394496 +ctctttgacgGGCCAATAGCGATATTGGCCATTTTTTTagcgcaacat 0.896049896049896 +TCGGTTACATGTTCGCATGTAACCGATTTTCTCTG 0.8938428874734607 +caatccatgtAAAAAAAGGGCCCTGAAATTCAGGACCCTTTCtggcatcagc 0.45945945945945954 +TCGGTTACCGCTTCGGCGGTAACCGATTAAAATAA 0.0 +CGACGATATTCGTATCGTCGTTTTTTGGG 0.4117647058823529 +TCGGTTACCGCTTCGGCGGTAACCGATTTATTGTC 0.5983935742971889 +CTCGGTACCAAATTCCAGAAAAGAGACGCTGAAAAGCGTCTTTTTTTTTTTTGGTCC 0.9946638207043756 +CGACGATATTCGTATCGTCGTTTTTTTTT 0.31506849315068497 +ggggaaataaACGGCCCATCCATGAGGAATGGGCCGTgaaaggagat 0.8531571218795888 +ataacagaaaACTCCCCCGCGAGAAGCGGGGGAGTcgctggttaa 0.07407407407407418 +CGACGATGCCATTCGTGGCATCGTCGTTTTGTTGG 0.3939393939393939 +tagcaacaaaAAAGCCGACTCACTTGCAGTCGGCTTTctcattttaa 0.8522895125553914 +acaagaaaaaAGGCACGTCATCTGACGTGCCttttttattt 0.9921334172435494 +cagccactgcTCTGACCACAAGTAATTGTTCAGAttgataaaac 0.0 +CCCCGATTTATCGGGGTTTTTTGTTATCTGACTACAGAATAACTGGGCTTTAGGCCCTTTTTTT 0.8593530239099859 +TGATCATCAAGGCTTCCTTCGGGAAGCCTTTCTACGTTA 0.8275862068965517 +GCCGGATCGGCGCACTGATCCGGCTTTTGCAAC 0.0 +AAGACCCCCGCACCGAAAGGTCCGGGGGTTTTTTTT 0.9752413963852439 +AAAGTCAAAATGCCCGATCGAGGATCGGGCATTTTTGTAGC 0.765807962529274 +tgcgttatttTCGGCACCTTTTATGTAGCGAAGGTGCCGGaatatattct 0.3197278911564626 +agttttaacgAAGGGGTGGTTTCACCCCTTttgtctttct 0.7762863534675615 +GCCGCCAGTTCCGCTGGCGGCATTTT 0.19999999999999996 +CGACGATGTTCGCATCGTCGTTAAAATAA 0.0 +cgataaaaaaAGCCTGCCAGATGGCAGGCTatttaataac 0.24812030075187974 +CCCGCTTCGGCGGGTTTTTTTTT 0.9582985821517932 +AAAAAAAAAAAACACCCTAACGGGTGTTTTTTTTTTTTTGGTCTCCC 0.9964526427811281 +CCTGGTAAGACGCGAACAGCGTCGCATCAGGTTTTGCAAC 0.0 +TTTAATATGACACCGGACTCCGTTCCTCGATGGGGTCCGGTTGTTTTATTCAC 0.0 +GGGCGGTCGAACAGATCGCCCTTGTTGTAT 0.0 +AGCAGGAAAGAGTAAGGCTGAACCTTCATGTTCAACCTTACTCTCATTTAC 0.3197278911564626 +tattgattatAAAGGGCTTTAATTTTTGGCCCTTTtatttttggt 0.8347107438016529 +CCCGCATGTTCGCATGCGGGTTTTTTTTT 0.7252747252747254 +CTCGGTACCAAATTCCAGAAAAGAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCC 0.9958317702471761 +CGACGATATTCGTATCGTCGTTTATTGTC 0.4949494949494949 +gtttctcgcgCAGGCGCTGAAAATAGCGCCTGtttttatttc 0.43181818181818177 +GATCCAGCCCATTCGTGGGCTGGATCTTAAAATAA 0.2063492063492064 +TTCTGTGCTGTGCCATTGCCGCCAGCGCATTAATTTCCA 0.15966386554621848 +TCGGTTACATGTTCGCATGTAACCGATTTTTTTTT 0.5515695067264574 +AACGAGAAAAGCCAACCTGCGGGTTGGCTTTTTTATGCA 0.9355670103092784 +GGGCGGTCAGATGATCGCCCTTTTTTTTT 0.9666332999666333 +tcggtcggtcCCCTCGCCCCTCTGGGGAGAGGGttagggtgag 0.33333333333333337 +ACGGCCCTGAACAAGGGCCGTTTGTTGTAT 0.0 +CCCGCATGTTCGCATGCGGGTTTATTGTC 0.5726495726495726 +aaaaatatgaATATATTCCGGCGCTTAATGCCACGCCGGAACATATcgaaatgatg 0.8425196850393701 +CCAATTATTGAACACCCGAAAGGGTGTTTTTTTGTTTCTGGTCTCCC 0.9936520027931187 +GACGAACAATAACACCCTAACGGGTGTTTTTTTGTTTCTGGTCTCCC 0.983878768337901 +GCCGGAGCGGCGCACTGCTCCGGCTTTTGCAAC 0.19999999999999996 +TTCCAGAAAAGAGGCCTCCCAAATCGGGGGGCCTTTTTTATTGATAACAAAA 0.9931544359255202 +ACGGCCCTGAACAAGGGCCGTTTTTGCAAC 0.6168582375478927 +CTCGGTACCAAATTCCAGAAAAGAGACGCTGAAAAGCGTCTTTTTTTATAGCGGTCC 0.9789517996211324 +gcgtaaaaaaGCACCTTTTTAGGTGCttttttgtgg 0.8 +TTCCGCTGAAGGCGTAATTGTTTAAATAACATTACGCCGCCTGGCCTT 0.7890295358649789 +taacgtagaaAGGCTTCCCGAAGGAAGCCttgatgatca 0.0 +gaacacatttGTCGGATGCGGCGCGAGCGCCTTATCCGACctacggttcg 0.6515679442508711 +CGACGTTCGCGTCGTTTTTTGGG 0.22480620155038766 +CAGATTGCTGACAACGTGCGCGTTGTTCATGCCGGA 0.6402877697841727 +CCTGGTAAGACGCCGCAGCGTCTTATCAGGTTTTTTGTA 0.2063492063492064 +GCCCGGACCAGGCCGCAGGGGGGAAACTCTGCGGCCTTTTTCGTTCTTACT 0.9178981937602627 +aataagcaatAACGGTACGACAGCTGTGTCGTGCCGTttgttttttc 0.3464052287581699 +atcaaaaaggAGCCGCCTGAGGGCGGCTtctttttgtg 0.9139414802065404 +TCTAACTAAAAAGGCCTCCCAAATCGGGGGGCCTTTTTTCTTTTCAACAAAA 0.9814436815735758 +CGACGATGCCATTCGTGGCATCGTCGTTTATTGTC 0.5689655172413792 +ttgaagataaAAAACCCTCTGTAGTAACAGAGGGTTTTgttcattcat 0.8373983739837398 +CCCGCTTCGGCGGGTTATCAAAA 0.0 +GATCCAGCTTCGGCTGGATCTTTTCTCTG 0.8990918264379415 +GACGAACAATAAGGCCTCCCTTTAGGGGGGGCCTTTTTTATTGATAACAAAA 0.9635302698760029 +tccggcaattAAAAAAGCGGCTAACCACGCCGCTTTTTTtacgtctgca 0.9916114419931213 +CTCGGTACCAAATTCCAGAAAAGAGGGGAGCGGGAAACCGCTCCCCTTTTTTCGTTTTGGTCC 0.9921550168667137 +TCGGTTACCGCTTCGGCGGTAACCGATTTTTTGGG 0.6309963099630996 +CGAACCGTAGGTCGGATAAGGCGCTCGCGCCGCATCCGACAAATGTGTTC 0.0825688073394496 +TAGCGTGACCGGAGATTCGGTCACGCTATTTTTTTTT 0.4285714285714286 +CGCCCGCGAACAGCGGGCGTTTTGCAAC 0.4117647058823529 +TCGGTTACCGCTTCGGCGGTAACCGATTTTTTTTT 0.4974874371859297 +GATCCAGCCCATTCGTGGGCTGGATCTTTATTGTC 0.6677740863787376 +cgcaaaaaaaAGCCAGCCTGTTTCCAGACTGGCttttgtgctt 0.7382198952879582 +GGCTCAAAGACCCGCTGCGGCGGGTTTTTTTGTCT 0.896049896049896 +GTAACAACGGAAACCGGCCATTGCGCCGGTTTTTTTTGGCCT 0.9847071417647958 +tccggcatgaACAACGCGCACGTTGTcagcaatctg 0.8427672955974843 +ttagtgcccaGGGTTCCCTCTCACCCTAACCCTCTCCCCGGTGGGGCGAGGGGACTgaccgagcgc 0.8058252427184466 +CGATTGAGCCTTCCAGTCCTTCGGGACTGGAATTTTTTTGTT 0.4444444444444444 +CGACGATACCATTCGTGGTATCGTCGTTTTCTCTG 0.8287671232876712 +AGGCCTCCCCGCAGGGGGGCCTTTTTTTGTA 0.5121951219512195 +CGCCCGCGAACAGCGGGCGTTGTTGTAT 0.0 +GACGAACAATAAGGCCTCCCAAATCGGGGGGCCTTTTTTCTTTTCAACAAAA 0.9736217356897916 +tacttcttacTCGCCCATCTGCAACGGATGGGCGAatttataccc 0.6563573883161512 +CCCGCATGCCATTCGTGGCATGCGGGTTAAAATAA 0.0 +CGACGATGCCATTCGTGGCATCGTCGTTTTTTGGG 0.6212121212121212 +agaaacagcaaacaatccaaaacgccgcgttcagcggcgttttttctgcttttct 0.9596122778675282 +ACTATTTTCTAAAGGCGCTTCGGCGCCTTTTTAGTCAGAT 0.7929606625258799 +aacggtttatTAGTCTGGAGACGGCAGACTAtcctcttccc 0.9445676274944568 +ATGGGAGGCGTTTCGTCGTGTGAAACAGAATGCGAAGACGAACAATAAAGGCCTCCCAAATCGGGGGGCCTTTTTT 0.9532273152478952 +cgcaaataacCAGGAGATAAAACCGACCACGGCACCAGGCAGTGACCATGTGGTTTCTTCAtcctcagtaa 0.9559082892416226 +aagtcaaaagcctccggtcggaggcttttgacttt 0.8868778280542986 +ttcctgatgtAATGCCGGATGACCTTCGTGTCATCCGGCATTtttcttttca 0.9546485260770975 +CCAATTATTGAACACCCTAACGGGTGTTTTTTCTTTTCTGGTCTCCC 0.985505145673286 +aggccaaaaaAAACCGGCGCAATGGCCGGTTTccgttgttac 0.07407407407407418 +CTCGGTACCAAATTCCAGAAAAGAGACGCTGAAAAGCGTCTTTTTATTTTTCGGTCC 0.9420289855072463 +catgactaaaAACAGCAGCAGTAAAACAGACCCTACTGCTGTTaaaacaagcg 0.04761904761904767 +TACGAATAAACGGCTCAGAAATGAGCCGTTTATTTTTTC 0.9090909090909091 +CCAATTATTGAACACCCTAACGGGTGTTTTTATTTTTCTGGTCTCCC 0.9836947660198924 +aagatgaacaAAACTAAAGCGCCACAAGGGCGCTTTAGTTTgttttccggt 0.7607655502392344 +CTCGGTACCAAAAAAAAAAAAAAAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCC 0.9967691910054277 +TAATCGGATGCAGGCAGGGGAAGTGTCTGTTTACCCTGCCTGGTCTGATACG 0.4923857868020305 +ctgatgaaaaGGTGCCGGATGATGTGAATCATCCGGCACtggattatta 0.4974874371859297 +CGACGTTCGCGTCGTTATCAAAA 0.0 +CTAAAGCGCCGAACAGGCGCTTTAGTTGTTGTAT 0.14529914529914523 +cataaaaaaaGGGCCTAAAGCCCagttattctg 0.2063492063492064 +CCCGCTTCGGCGGGTTTTTTGGG 0.5726495726495726 +GATCCAGCCCATTCGTGGGCTGGATCTTTTGTTGG 0.4382022471910112 +AACTCCGCTGTTGCCCTGTTTCAGGGCAATTTTGCAACC 0.6047430830039525 +TTTTCGAAAAAAGGCCTCCCAAATCGGGGGGCCTTTTTTATTGATAACAAAA 0.9967320261437909 +acgcgtacaaCCGCGTGGGGAGACGACGCGGatttttaact 0.0 +TAGCGTGACCGGCGCATCGGTCACGCTATTTGTTGAG 0.0 +AGGCCTCCCAGATGGGGGGCCTTTTTTTTTT 0.9242424242424242 +ACGGCCCTAGATAGGGCCGTTTTTTTTTT 0.9651567944250871 +CGACGATACCATTCGTGGTATCGTCGTTAAAATAA 0.0 +CCCGCTTCGGCGGGTTTTCTCTG 0.8986828774062816 +GACGAACAATAAGGCCGCAAATCGCGGCCTTTTTTATTGATAACAAAA 0.9921911603935655 +cataaaaaaaCCCGCTTGCGCGGGctttttcaca 0.16666666666666663 +gtattcgcgcACCCCGGTCTAGCCGGGGTCATTTTTTagtggctttt 0.8366013071895425 +TAGCGTGACCGGCGCATCGGTCACGCTATTTTGCAAC 0.0 +aacgcatgagAAAGCCCCCGGAAGATCACCTTCCGGGGGCTTTtttattgcgc 0.9939246658566221 +TTCCAGAAAAGACACCCTAACGGGTGTTTTTTCGTTTTTGGTCTCCC 0.99153403318659 +ATCTCTCTACGCCCTCACCCGTACAGGGTGAGGGCAATAATCTTT 0.7429305912596401 +CCCGCACTTAACCCGCTTCGGCGGGTTTTTGTTTTT 0.9511957052220595 +CCTGGTAAGACGCTAACCACGCGTCTTATCAGGTTGTTGTAT 0.0 +GCCGGAGCGGTAACCACCTGCTCCGGCTTTTTTGTA 0.5305164319248826 diff --git a/data/sequence_data/chen/train.txt b/data/sequence_data/chen/train.txt new file mode 100644 index 0000000..04f8c5d --- /dev/null +++ b/data/sequence_data/chen/train.txt @@ -0,0 +1,290 @@ +aaagattattGCCCTCACCCTGTACGGGTGAGGGCgtagagagat 0.0 +CGACGATGTTCGCATCGTCGTTTTGTTGG 0.17355371900826444 +TAAAGAAACTCGCCGGATGAAAAGTCATCCGGCGTCATATTACT 0.0 +CTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCC 0.9933844932521831 +GACGAACAATAAGCGGCAAATCGCCGCTTTTTTATTGATAACAAAA 0.9484801648634724 +aggggtaaaaCCGTCCAGGAGCAGGGaaaagggcag 0.8680738786279684 +ctaagcgttgTCCCCAGTGGGGAtgtgacgaag 0.3006993006993006 +gaaacgaaaaGAGGGAGACTAGCTCCCTCTttcaactggc 0.5652173913043478 +TGCCGGGATAACCACTCCCGGCATTTTTTGTA 0.4623655913978495 +cagtaatgctGGCGCGCCCCCTCGCGCCtgaaaattac 0.3939393939393939 +cagtgaaaaaTGGCGCCCATCGGCGCCAtttttttatg 0.9732477260567148 +CCTGGTAAGACGCCGCAGCGTCGCATCAGGTTTTGCAAC 0.019607843137254943 +GTTATTAAATAGCCTGCCATCTGGCAGGCTTTTTTTATCG 0.8887652947719689 +caacaatgacAAGCGGTGGAGATCTTCTCTGCCGCTTtttttttcat 0.9688473520249221 +ggctggaaaaCCTGGCTCGTATGCGTAAGCAGGacattatttt 0.4285714285714286 +ccataaaaaaGCGCCATTCAGCGCCTTTTTAtcatcccctt 0.7916666666666666 +agaataaattCAACCGCCCGTCAGGGCGGTTGtcatatggag 0.8864926220204313 +caagcactgcAAAAAACAGCCGGACGGTTTTCACCTCCGGCTATTTTTTtaattgtgat 0.8666666666666667 +TAGCGTGACCGGTAACCACTCGGTCACGCTATTTTTTGTA 0.13793103448275856 +ACGGCCCTTAACCACAGGGCCGTTTTTTTGTA 0.7942386831275721 +acatttaataAAAAAAGGGCGGTCGCAAGATCGCCCTTTTTTacgtatgaca 0.9778858911985847 +acgagccaatAAAAATACCGGCGTTATGCCGGTATTTTTttacgaaaga 0.9085923217550275 +TAAGAAAAGCGAGAGTTACAGCTCTCACTTATTTGTT 0.15966386554621848 +GACGAACAATAACACCCTAACGGGTGTTTTTTATTGATTGGTCTCCC 0.9289267945984364 +aaacacgtagGCCTGATAAGCGAAGCGCATCAGGCagttttgcgt 0.32432432432432434 +TTCCAGAAAAGACACCCTAACGGGTGTTTTTTTGTTTCTGGTCTCCC 0.9912334531428071 +AGCCCCTCAATCGAGGGGCTTTTTTTTGC 0.9787685774946921 +CGACGTTCGCGTCGTTTTGTTGG 0.13043478260869557 +GCCGGAGCGGGAACACTGCTCCGGCTTTTGCAAC 0.0 +taacaatcagAAATGCCGGGAATAAATCCCGGCATTTtcataatcag 0.6453900709219857 +CGACGATGTTCGCATCGTCGTTTATTGTC 0.6124031007751938 +TAATAATCCAGTGCCGGATGATTCACATCATCCGGCACCTTTTCATCAG 0.019607843137254943 +AAAAGGGGCCTGAGTGGCCCCTTTTTTCA 0.8932764140875133 +gaggtgatttAAATTCATCCCCGGCGGCAAGCCGGGGAGATTTcattacggca 0.6563573883161512 +CCAATTATTGAACACCCAAATCGGGTGTTTTTTTGTTTCTGGTCTCCC 0.9892680832796737 +ttacccgcaaAGTGCGCTGCTGCTTAGCAGCGCACTgcttggtggg 0.5652173913043478 +CTGATTATGAAAATGCCGGGATTTATTCCCGGCATTTCTGATTGTTA 0.6309963099630996 +aagaattaaaAATGTTACGGGTAACTGTTATCGGTAACATTgtccagccgg 0.5652173913043478 +AAGGACACCAGAGCCTGCCAATGGCAGGCTCAGACTGATGA 0.7668997668997669 +TCGGTTACATGTTCGCATGTAACCGATTTTTTGGG 0.4219653179190751 +GGGCGGTCTAACCACGATCGCCCTTTTTTGTA 0.5283018867924529 +AATTCCATCGGGTCCGAATTTTCGGACCTTTTCTCCGC 0.9552572706935123 +CTCACCCTAACCCTCTCCCCAGAGGGGCGAGGGGACCGACCGA 0.8552821997105644 +TACCATGTCCTTATTGACCCCGTATATTACGGGGTCGTTTTTGTGCGGAAT 0.504950495049505 +AGGCCTCCCCGCAGGGGGGCCTTTTGTTGAG 0.46808510638297873 +cgaataatccGATTACGGCTACGCTTCTAATGTTCCCCTTGAATGGAGTCGAAGAATGCGTAATCccacgctgtt 0.2592592592592593 +ttaacccctaCCCCACGCGTACAACCGCGTGGGGagacgacgcg 0.19999999999999996 +aaaagaggtgGCCAGGGGGATCACCTGGCAgcatgctgcc 0.7899159663865546 +GGCAATAACTCTTTTCGCCGAGCAGGATGCTCGCGAAAAGAAACTGTGATT 0.3197278911564626 +CGCCCGCTAACCACGCGGGCGTTTTTTGTA 0.6453900709219857 +ATTTGTACCGAAAACCCCGGGGCGTGCTCCGGGGTTTTTTCTTATCAA 0.9253174010455564 +CCCTCTTGATAACCCAAGAGGGCATTTTTTA 0.0990990990990992 +GCCGGAGCGGCGCACTGCTCCGGCTTTGTTGAG 0.04761904761904767 +taacaaaaaaCCCCGATAAATCGGGGctttatataa 0.0 +TGCCGGGACGCATCCCGGCATTTTGCAAC 0.15254237288135586 +ACCGGAAAACAAACTAAAGCGCCCTTGTGGCGCTTTAGTTTTGTTCATCTT 0.9460625674217907 +CTGAGTGAACTCCCATGAGCATAGATAACTATGTGAATGGGATGAGCGAAGG 0.3670886075949368 +aaggttgaaaAATAAAAACGGCGCTAAAAAGCGCCGTTTTTTTTgacggtggta 0.988537368179734 +gcctttcaaaAGTAAGCAACGTCTGCTTACTgcccctctac 0.8826291079812206 +ccgtattaccCCGCGCCGGGAATGCGCGGccgccaattt 0.5260663507109005 +tttaaagagcAAACCCCTCAAACGAGGGGTTTtttgttgttt 0.9204455051710422 +ACGGCCCTCGCAAGGGCCGTTTTTTTGTA 0.8141263940520446 +GATCCAGCTTCGGCTGGATCTTTATTGTC 0.6226415094339622 +GGGCGGTCCGCAGATCGCCCTTTTTTGTA 0.5283018867924529 +GCTGATGCCAGAAAGGGTCCTGAATTTCAGGGCCCTTTTTTTACATGGATTG 0.9803188348750246 +CGACGTTCGCGTCGTTTATTGTC 0.4252873563218391 +AGGCCTCCCTAACCACGGGGGGCCTTTGTTGTAT 0.23076923076923084 +CCTGGTAAGACGCTAACCACGCGTCTTATCAGGTTTTTTGTA 0.0 +TCGGTTACATGTTCGCATGTAACCGATTTTGTTGG 0.3548387096774194 +CGACGTTCGCGTCGTTTTCTCTG 0.7578692493946732 +CTCGGTACCAAATCTAACTAAAAAGACGCTGAAAAGCGTCTTTTTTCTTTTCGGTCC 0.9891821722198183 +CTAAAGCGCCCGCAGGCGCTTTAGTTTGTTGAG 0.23664122137404586 +GTTTAACGGCTCTGGCGGAGCTCCCAGGCTCCGCCAGATTTATTTACT 0.0 +tgcctcaattAGCGGCTCATGTAGCCGCTttttctgcgc 0.9681630054122891 +GCCAGTTGAAAGAGGGAGCTAGTCTCCCTCTTTTCGTTTC 0.9185667752442996 +CCAATTATTGAAGGGGAGCGGGAAACCGCTCCCCTTTTTTTGTTTCTGGTCTCCC 0.9954863461972466 +agaaatcatccttagcgaaagctaaggattttttttatctg 0.9383857054836722 +GGGCGGTCCGCAGATCGCCCTTTTGCAAC 0.576271186440678 +atgaccttttGCACCGCTTTGCGGTGCTttcctggaag 0.7890295358649789 +GACGAACAATAACACCCAAATCGGGTGTTTTTTATTGATAACAAAA 0.84779299847793 +TACGTAAAAACCCGCTTCGGCGGGTTTTTACTTT 0.9655765920826161 +ctatcctcttCCCGGTCCCCTATGCCGGGttttttttat 0.84472049689441 +taaggttgaaAAATAAAAACGGCGCTAAAAAGCGCCGTTTTTTTTgacggtggta 0.9897351673167727 +gggtaaaaaaATGCCTGACTGCTTTGTGCGATCAGGCATtctcgaatta 0.0 +ccgtttaacaCGTTCTGGATGAAATCCATATCGcgatagcgca 0.8007968127490039 +GCCGGATCGGTAACCACCTGATCCGGCTTTTTTGTA 0.4565217391304348 +CTAAAGCGCCTAACCACGGCGCTTTAGTTTTTTGTA 0.8266897746967071 +TAGCGTGCCGCAGCACGCTATTTTGCAAC 0.25373134328358216 +CCCGCATGTTCGCATGCGGGTTTTCTCTG 0.8941798941798942 +TCGGTTACATGTTCGCATGTAACCGATTTCAACAA 0.0 +CGACGATACCATTCGTGGTATCGTCGTTTCAACAA 0.0 +CTCGGTACCAAACCAATTATTGAAGACGCTGAAAAGCGTCTTTTTTTGTTTCGGTCC 0.9913815392570887 +tttaaaagatTATCGGGAGAGTTACCTCCCGATAtaaaaggaag 0.7807017543859649 +GACGAACAATAAGGCCTCCCGAAAGGGGGGCCTTTTTTATTGATAACAAAA 0.993993993993994 +GACGAACAATAAGGCCTCCCAAATCGGGGGGCCTTTTTTCGTTTTAACAAAA 0.9602701628923321 +GCCGGATCGGTAACCACCTGATCCGGCTTGTTGTAT 0.0 +CTCGGTACCAAAGAAAAATAAAAAGACGCTGAAAAGCGTCTTTTTATTTTTCGGTCC 0.9894957983193278 +GATCTAACTAAAAAGGCCGCTCTGCGGCCTTTTTTCTTTTCACT 0.9852005327808199 +TGCCGGGACGCATCCCGGCATTTGTTGAG 0.04761904761904767 +cgtgttcctgAACGCCCGCATATGCGGGCGTTttgctttttg 0.9594155844155844 +CCCGCTTCGGCGGGTTAAAATAA 0.0 +AGCCTCCCAGTGGAGGCTTTTTTTGT 0.7389033942558747 +aagcacgaaaAAGGGAGCGATGAATTATCGCTCCCTTgtcttataac 0.5475113122171946 +CTAAAGCGCCCGCAGGCGCTTTAGTTTTTTGTA 0.7916666666666666 +tgcggagggaGTGGAAACGCCTCAGCCGGAACTGACCGAGGCGGTATAACttaacgcagt 0.8579545454545454 +CGACGTTCGCGTCGTTAAAATAA 0.5305164319248826 +cacagaatctAAGATCCCTGCCATTTGGCGGGGATTTTtttatttgtt 0.8675496688741722 +CCTGGTAAGACGCGAACAGCGTCGCATCAGGTTGTTGTAT 0.0 +gaagttcaagGGCGCATCTACTGATGTGCCttttttattt 0.797979797979798 +AATAAATGGATGCCCTGCGTAAGCGGGGCATTTTTCTTCCT 0.9623918766453554 +CTCGGTACCAAAGACGAACAATAAGACGCTGAAAAGCGTCTTTTTTATTGATGGTCC 0.9777133942500558 +CGACGATGCCATTCGTGGCATCGTCGTTTTCTCTG 0.8685939553219448 +GATCCAGCCCATTCGTGGGCTGGATCTTTCAACAA 0.0 +CTCGGTACCAAACCAATTATTGAAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCC 0.9954096855634611 +CGACGATATTCGTATCGTCGTTTTCTCTG 0.7005988023952096 +GGGCGGTCGAACAGATCGCCCTTTTGCAAC 0.23076923076923084 +AGGCCTCCCTAACCACGGGGGGCCTTTTTTTGTA 0.7787610619469026 +CGACGATATTCGTATCGTCGTTTCAACAA 0.0 +CCCGCATGCCATTCGTGGCATGCGGGTTTTGTTGG 0.4219653179190751 +GACGAACAATAAGGCCTCCCAAATCGGGGGGCCTTTTTTTGTTTCAACAAAA 0.9871134020618557 +tgcaaaaaaaTCGGGCAGCGTCGTGCTGCCCGtgtgcatact 0.47916666666666663 +tctgaatgcgTGCCCATTCCTGACGGAATGGGCAtttctgcgca 0.9390615478366849 +tgtaagagatAAAAAAGGCCGGAGCATGCTCCGGCCttcgttttca 0.9450247388675096 +AGGCCTCCCCGCAGGGGGGCCTTTTTGCAAC 0.2907801418439716 +CGCCCGCCGCAGCGGGCGTTTTGCAAC 0.5867768595041323 +CTAAAGCGCCTAACCACGGCGCTTTAGTTGTTGTAT 0.0 +ctaatcttttCAACGCCTGGCACTGCCGGGCGTTGTTCTTTTTaacttcaggc 0.9325691166554282 +gaacttgtagGCCTGATAAGCGCAGCGTATCAGGCaatttttata 0.4252873563218391 +CCAATTATTGAACACCCTTTAGGGGTGTTTTTTTGTTTCTGGTCTCCC 0.9875992063492064 +CCCGCATGCCATTCGTGGCATGCGGGTTATCAAAA 0.0 +ccttcgctcaTCCCATTCACATAGTTATCTATGCTCATGGGAgttcactcag 0.7872340425531915 +CCCTCTTGGGTTATCAAGAGGGTCATTATATTT 0.0 +TGCCGGGAGAACATCCCGGCATTGTTGTAT 0.0 +ccgccttcacAAATGCCGCCACTCAAACAGAGCGGCATTTttcttccccg 0.9683544303797469 +TAGCGTGACCGGTAACCACTCGGTCACGCTATTGTTGTAT 0.0 +tcgaaaagaaCCCGGTAGCGTCACAACTACCGGGcgaacaaact 0.11504424778761058 +ATCTCCTTTCACGGCCCATTCCTCATGGATGGGCCGTTTATTTCCCC 0.9515269025690741 +TCGGTTACATGTTCGCATGTAACCGATTAAAATAA 0.0 +tagggaaaaaTGCCTGATAGCGCTTCGCTTATCAGGCctacccgcgc 0.44134078212290506 +TGAACAACATCGCGCTTATCGTTAAGGTAAGCGCGTATTTTTTTTACCCGCCAG 0.9264705882352942 +tgtagttgtaAGGCCGTGCTTCCGAAAGGAATGCGCGGCTTattttcgttt 0.8542274052478134 +agagaatataaaaagccagattattaatccggcttttttattattt 0.24812030075187974 +tccctgtcttCCCCCACATGCTGTGGGGGttttttttat 0.8727735368956743 +GATCCAGCCCATTCGTGGGCTGGATCTTTTTTTTT 0.9518304431599229 +cagataaaaaaaatccttagctttcgctaaggatgatttct 0.9909608605260779 +CTGAAATATCCAGCGGATCAAGAAAATTCGTTGGATATTTTTT 0.4565217391304348 +TGCCGGGATAACCACTCCCGGCATTGTTGTAT 0.029126213592232997 +ACAATTTTCGAAAAAACCCGCTTCGGCGGGTTTTTTTATAGCTAAAA 0.9883422709256237 +CCTGGTAAGACGCAGATGCGTCGCATCAGGTTTTTTTTT 0.0 +tgattatgcgAGCAGACTCGCACTCCTGCCAGTCTGCTGcaaaagaaag 0.15254237288135586 +TAGCGTAAAAGCAAAACACAAATCTATCCATGCAAGCATTCACCGCCGGTTTACTGGCGGTTTTTTTTCGCCGTCATA 0.9529411764705882 +aagttatgcgAAAATGCCGGTCTTGTTACCGGCATTTTttatggagaa 0.9065420560747663 +GCCGGAGCGGCGCACTGCTCCGGCTTTTTTGTA 0.3670886075949368 +taaagtatgtATCCCCAAAATAATTCGAGTCATTGCATCTGTGGCTAGAAGTATGAAGGGAttaaccataa 0.0 +atcaaataatGCCTGATAGCACATATCAGGCgttgtcctca 0.9251497005988024 +acgcgtaaatAAAAAAGGCGCTAGTGAAAGCGCCCTTTTTTgtcattatgc 0.3548387096774194 +GGGAAGAGGATAGTCTGCCGTCTCCAGACTAATAAACCGTT 0.8535871156661786 +gctcataagtAAAAAACGGCACCTGGTGCCGTTTTTTtgtctgaaac 0.8043052837573386 +TACCACCGTCAAAAAAAACGGCGCTTTTTAGCGCCGTTTTTATTTTTCAACCTT 0.9782135076252724 +gataaatcagAAACATAAAGGCGCTTTCGGGTGCCTTTATTATTTccagtgaaac 0.7925311203319503 +gcggagaaaaGGTCCGAAAATTCGGACCcgatggaatt 0.05660377358490576 +TAGCGTGACCGGGAACATCGGTCACGCTATTTTGCAAC 0.0 +GCATAATGACAAAAAAGGGCGCTTTCACTAGCGCCTTTTTTATTTACGCGT 0.9396863691194209 +TAGCGTGCTAACCACGCACGCTATTGTTGTAT 0.0 +TCAAACGAAACCAGGCTATACTCAAGCCTGGTTTTTTGATGG 0.3975903614457831 +CTTATTCCATAACAAAGCCGGGTAATTCCCGGCTTTGTTGTATCTGAAC 0.9643366619115549 +CTCGGTACCAAATTCCAGAAAAGAGACGCTAAATCAGCGTCTTTTTTCGTTTTGGTCC 0.9932768589485007 +caaattaagcAGAAGGCCATCCTGACGGATGGCCTTTTtgcgtttcta 0.9685534591194969 +TTTTCGAAAAAAGGCCTCCCAAATCGGGGGGCCTTTTTTTATAGCAACAAAA 0.9918982419184963 +CCCGCATGTTCGCATGCGGGTTTCAACAA 0.0 +CGACGATGTTCGCATCGTCGTTTCAACAA 0.0 +CCGGCTGGACAATGTTACCGATAACAGTTACCCGTAACATTTTTAATTCTT 0.0 +GCCGGATCGGGAACACTGATCCGGCTTTTGCAAC 0.41860465116279066 +CGACGATACCATTCGTGGTATCGTCGTTTTGTTGG 0.3902439024390244 +CCAATTATTGAACACCCTAACGGGTGTTTTTTTTTTTTTGGTCTCCC 0.9912095639943741 +CCTGGTAAGACGCGAACAGCGTCTTATCAGGTTGTTGTAT 0.0 +CACAATGATGCCCGGTTGCTTTTCACAACCGGGCATTTTTTTAAC 0.7076023391812866 +CTCGGTACCAAATTCCAGAAAAGAGACGCTGAAAAGCGTCTTTTTTATTGATGGTCC 0.9732405672999732 +CGACGATACCATTCGTGGTATCGTCGTTTTTTTTT 0.40828402366863903 +TAGCGTGCGAACAGCACGCTATTGTTGTAT 0.0 +taagacgtggCGCATCAGGCATCGTGCACCGAATGCCGGATGCGgcgtgaacgc 0.7867803837953091 +TGCTCGTACCAGGCCCCTGCAATTTCAACAGGGGCCTTTTTTTATCC 0.9591670069416088 +TCGGTTACCGCTTCGGCGGTAACCGATTATCAAAA 0.0 +CCTGGTAAGACGCGAACAGCGTCTTATCAGGTTTTGCAAC 0.1869918699186992 +ataacaaaaaACCCCGCCGGAGCGAGGttttttgtta 0.5454545454545454 +CGACGATGTTCGCATCGTCGTTATCAAAA 0.0 +ATGAATGAACAAAACCCTCTGTTACTACAGAGGGTTTTTTATCTTCAA 0.822380106571936 +ACTGTTATTCGGCGCTCCACGGAGCGCCTTTTTTTCT 0.7792494481236203 +agtaagaacgAAAAAGGCCGCAGAGTTTCCCCCCTGCGGCCtggtccgggc 0.4871794871794871 +AGGCCTCCCGAACAGGGGGGCCTTTGTTGTAT 0.0 +CCCGCATGTTCGCATGCGGGTTTTTTGGG 0.4923857868020305 +gctgttcagaTCACTGGTGCGCGCGGAGTCGCCGCCAGTGAgcaaacgctg 0.8175182481751825 +atttttttaaCGGGGGAGGTTCCCCCGtcagatcatt 0.05660377358490576 +CCTGGTAAGACGCTAACCACGCGTCGCATCAGGTTTTTTGTA 0.0 +CGACGATATTCGTATCGTCGTTATCAAAA 0.0 +CCTGGTAAGACGCCGCAGCGTCTTATCAGGTTTGTTGAG 0.0 +GATCCAGCCCATTCGTGGGCTGGATCTTTTTTGGG 0.6805111821086263 +caatgaaaaaAGGGCCCGCAGGCCCtttgttcgat 0.5555555555555556 +TCTAACTAAAAACACCCTAACGGGTGTTTTTTCTTTTCTGGTCTCCC 0.9904131914485668 +TATAAAAATTGCCTGATACGCTGCGCTTATCAGGCCTACAAGTTC 0.24812030075187974 +CGACGATGTTCGCATCGTCGTTTTTTTTT 0.6062992125984252 +aaagtaaaaaCCCGCCGAAGCGGGtttttacgta 0.8015873015873016 +TAATCGCCTCGCCCCGGTATCGTGCCGGGGCTTTTTCACTT 0.5815899581589958 +AAAAAATCACAGGGCAGGGAAACCTGCCCTTGTTTCAGCG 0.4736842105263158 +TAGCGTGCAGATGCACGCTATTTTTTTTT 0.6951219512195121 +AGTCAGAAAGCCGCCGACATGCTCGGCGGTTTTTCTGAA 0.504950495049505 +ACGGCCCTCGCAAGGGCCGTTTTGTTGAG 0.574468085106383 +aggaaagtaaGTGCCGGATATGAAATCCGGCACctgtcagact 0.0 +CGCCCGCAGATGCGGGCGTTTTTTTTT 0.8798076923076923 +CAGAATAACTGGGCTTTAGGCCCTTTTTTTATG 0.21875 +TAGCGTGACCGGCGCATCGGTCACGCTATTTTTTGTA 0.0825688073394496 +GATCCAGCCCATTCGTGGGCTGGATCTTATCAAAA 0.0 +agttaaccaaAAAGGGGGGATTTTATCTCCCCTTTaatttttcct 0.9845368795422916 +GAAACGAACCGGGAGCGCTGTGAATACAGTGCTCCCTTTTTTTATT 0.9015748031496063 +GCCGGATCGGCGCACTGATCCGGCTTTGTTGAG 0.0 +aacaaaaaaaTTCCAGTCCCGAAGGACTGGAAggctcaatcg 0.4871794871794871 +AAGCACAAAAGCCAGTCTGGAAACAGGCTGGCTTTTTTTTGCG 0.9687695190505934 +CCCAATGACTACTTCCATGCTCAACGCAAAACTACTACCAACTGCGCCATCCGCCGCAGTGGTCGTCGTGCGTGTGGTGGTGGTCGTCGGCAATGCGCCGTAGGGACTGGAACAACACACGATTCCAAAACCCCGCCGGCGCAAACCGGGCGGGGTTTTTCGTTTAAG 0.5348837209302326 +AGGCCTCCCGAACAGGGGGGCCTTTTTGCAAC 0.7252747252747254 +aacagtaactTTTCCGGCTTCCCGTTCGTCAGTACCTCGGGAAGCCGCCAAccaggataaa 0.0 +ACGGCCCTCGCAAGGGCCGTTTTTGCAAC 0.6677740863787376 +CGACGTTCGCGTCGTTTCAACAA 0.0 +GTAAAAATACAGGGCTGGAATCATCCGGCCCTTTTTTCTGAT 0.8204667863554758 +GCACTGATATAACGGGCCTGATGGCCCGTTTTAGTGTTTG 0.8702983138780804 +GAAAAATAAAAACACCCTAACGGGTGTTTTTATTTTTCTGGTCTCCC 0.9895287958115183 +ctctgcacaaACGCCACCTTTTCGGTGGCGttttttatcg 0.9487967229902714 +CTCGGTACCAAATTTTCGAAAAAAGACGCTGAAAAGCGTCTTTTTTTATAGCGGTCC 0.9873112549168888 +agtgaaaagaAAAAAGGCCGCAGAGCGGCCTTTTtagttagatc 0.8454404945904173 +ggaccaaaacGAAAAAAGACGCTTTTCAGCGTCTCTTTTCTGGAATTtggtaccgag 0.7050147492625369 +TAGCGTGACCGGGAACATCGGTCACGCTATTGTTGTAT 0.0 +AAAAGGATTCGCGGCTCTGCTCTTCAGAGCTGCTTTTATGATA 0.9686028257456829 +GACGAACAATAAGGGGAGCGGGAAACCGCTCCCCTTTTTTATTGATAACAAAA 0.9884951679705476 +actccaaaaaAAAACCTGAAAAAAACGGCCTGACGTGAATCAAGCAATTTTTTTCAGGTTTTgcccgcttag 0.24242424242424243 +CCAATTATTGAAGGCCTCCCAAATCGGGGGGCCTTTTTTATTGATAACAAAA 0.9955849889624724 +gacgaacaaTAAGGCCTCCCAAATCGGGGGGCCTTTTTTATTgaTaacaaaa 0.9958937297252906 +CCTGGTAAGACGCCGCAGCGTCTTATCAGGTTTTGCAAC 0.0 +aaataataaaaaagccggattaataatctggctttttatattctct 0.938347718865598 +tccggcaaaaaaacgggcaaggtgtcaccaccctgccctttttctttaaaaccgaaaagattacttcgcgtt 0.6941896024464832 +TCTAACTAAAAAGGCCTCCCAAATCGGGGGGCCTTTTTTATTGATAACAAAA 0.9964098513678467 +ccaggcatcaaataaaacgaaaggctcagtcgaaagactgggcctttcgttttatctgttgtttgtcggtgaacgctctc 0.9880339834868972 +ATGCTAAATTGCCCGATGCGCTGCGCTTATCGGGCCTTCATGGTT 0.0 +AGTATGCACACGGGCAGCACGACGCTGCCCGATTTTTTTGCA 0.8531571218795888 +CCCGCATGCCATTCGTGGCATGCGGGTTTTTTGGG 0.5652173913043478 +ACTGATTTTTAAGGCGACTGATGAGTCGCCTTTTTTTTGTCT 0.9818774918448714 +GTGAAGTAATACAAAACAGGCCCAGGCGGCCTGTTTTGTCTTTTTAATG 0.9521988527724665 +CCAATTATTGAACACCCTAACGGGTGTTTTTTTATAGCTGGTCTCCC 0.9794661190965093 +CCCGCATGCCATTCGTGGCATGCGGGTTTATTGTC 0.6363636363636364 +ttgataagaaAAAACCCCGGAGCACGCCCCGGGGTTTTcggtacaaat 0.5780590717299579 +gctacaaaaaTGCCCGATCCTCGATCGGGCAttttgacttt 0.6763754045307443 +cagccctaatCAGCGTTGCAGGATAAAGCACCGCTCactcttcaac 0.6212121212121212 +ATATTCTGAAATATCCAGCGGATCAAGAAAATTCGTTGGATAT 0.3055555555555556 +tgcttgattaAAAAGGCGCTACTCGGCATGGGGAAGCGCCTTTTttataggtgt 0.9272197962154294 +TGCCGGGACGCATCCCGGCATTTTTTGTA 0.4708994708994708 +TCGGTTACCGCTTCGGCGGTAACCGATTTTCTCTG 0.8939554612937434 +CTCGGTACCAAATCTAACTAAAAAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCC 0.9959148658033417 +AGTTTGTTCGCCCGGTAGTTGTGACGCTACCGGGTTCTTTTCGA 0.8080614203454894 +GCCGGATCGGGAACACTGATCCGGCTTGTTGTAT 0.0 +aacaaataagTGAGAGCTGTAACTCTCgcttttctta 0.0 +CGACGTTCGCGTCGTTTTTTTTT 0.0 +AAGCAGAGAACCCTGGATGAGAGTCCGGGGTTTTTGTTTTT 0.8475609756097561 +gtgtaagtttAGCCGGATAACGCGCCAGATCCGGCTtacatctctg 0.4897959183673469 +CGCAGATAGCAAAAAAGCGCCTTTAGGGCGCTTTTTTACATTGGTGG 0.9851146174456683 +GATCCAGCTTCGGCTGGATCTTTTTTGGG 0.5780590717299579 +ccgattatttACGCAAATTTGCGTgccaaaattt 0.0 +GATCCAGCCCATTCGTGGGCTGGATCTTTTCTCTG 0.8699609882964889 +TTCCAGAAAAGAGGCCTCCCAAATCGGGGGGCCTTTTTTCGTTTTAACAAAA 0.9733262203254202 +CCAATTATTGAACACCCTTCGGGGTGTTTTTTTGTTTCTGGTCTCCC 0.9942062572421785 +TTCAGCCAAAAAACTTAAGACCGCCGGTCTTGTCCACTACCTTGCAGTAATGCGGTGGACAGGATCGGCGGTTTTCTTTTCTCTTCTCAA 0.9973571541836249 +AAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTT 0.9841042759497695 +agtaagcgcgAATATGCCTGATGGTGCAACACCATCAGGCATATTaaattatgct 0.5951417004048583 +CCTGTTGAGTAATAGTCAAAAGCCTCCGGTCGGAGGCTTTTGACTTTCTGCTTAC 0.98113919275745 +aaagaaagaaTTAACGCATCGCATCCAGACTGTTCTGAATGCGACGATAATTAAggtgctttaa 0.019607843137254943 +GATCCAGCTTCGGCTGGATCTTTTGTTGG 0.6563573883161512 +aaagtcaaaagcctccgaccggaggcttttgactt 0.8454404945904173 +AGTCAAAAGCCTCCGACCGGAGGCTTTTGACTATTACT 0.7668997668997669 +CCCGCATGCCATTCGTGGCATGCGGGTTTTCTCTG 0.9052132701421801 +CGACGATGTTCGCATCGTCGTTTTTTGGG 0.4285714285714286 +AAAAAAAAAAAAGGCCTCCCAAATCGGGGGGCCTTTTTTTTTTTTAACAAAA 0.9936515997968512 +GCCGGATCGGCGCACTGATCCGGCTTTTTTGTA 0.15254237288135586 +TTTTCGAAAAAACACCCTAACGGGTGTTTTTTTATAGCTGGTCTCCC 0.9882849109653233 +TTATATAAAGCCCCGATTTATCGGGGTTTTTTGTTA 0.9261992619926199 +atgcaggaaaGCAGGCTGGAGCTACCCAGCCTGCagtgaaatta 0.893048128342246 +CCAATTATTGAAGGCCGCTAACGCGGCCTTTTTTTGTTTCTGGTCTCCC 0.9956827699348099 +agacaaaaaaACCCGCCGCAGCGGGtctttgagcc 0.9363057324840764 +GGGCGGTCTAACCACGATCGCCCTTGTTGTAT 0.1071428571428572 +aaaaacaaaaACCCCGGACTCTCATCCAGGGttctctgctt 0.8745294855708908 +attttgtaatGCCGGATGCGGCGTAAAACACCGTACCCGGCctggagtttt 0.3377483443708609 +agtaaataaaTCTGGCGGAGCCTGGGAGCTCCGCCAGAgccgttaaac 0.504950495049505 +CTCGGTACCAAAAAAAAAAAAAAAGACGCTGAAAAGCGTCTTTTTTTTTTTTGGTCC 0.9963758924364875 +AAAGCCCCCGGAAGATCACCTTCCGGGGGCTTT 0.7245179063360881 diff --git a/data/sequence_data/chen/validation.txt b/data/sequence_data/chen/validation.txt new file mode 100644 index 0000000..0a3a5e5 --- /dev/null +++ b/data/sequence_data/chen/validation.txt @@ -0,0 +1,146 @@ +GATCCAGCTTCGGCTGGATCTTTTTTTTT 0.7807017543859649 +aaaatcaaaggatcttcttgagatccttttttt 0.5614035087719298 +gaaaaaataaACGGCTCATTTCTGAGCCGtttattcgta 0.646643109540636 +CGGCCCGCCTATGGCGGGCCGTTTTGTAT 0.8839907192575406 +CTGCCCTTTTCCCTGCTCCTGGACGGTTTTACCCCT 0.5454545454545454 +gggtcaaagaGGCATGATGCGACGCTTGTTCCTGCGCTTTGTTCATGCCGgatgcggcta 0.6855345911949686 +TTTTCGAAAAAACACCCTAACGGGTGTTTTTTTGTTTCTGGTCTCCC 0.9959158668572595 +TGTGAAAAAGCCCGCGCAAGCGGGTTTTTTTATG 0.9790224459827984 +CCCGCATGTTCGCATGCGGGTTATCAAAA 0.180327868852459 +CTCGGTACCAAATTCCAGAAAAGAGACGCTTTCGAGCGTCTTTTTTCGTTTTGGTCC 0.9961766392659147 +agaaaaaaagGCGCTCCGTGGAGCGCCgaataacagt 0.23076923076923084 +gaaccacctcCTTAGCCTGTGTAATCTCCCTTACACGGGCTTATTTTTTacgcgtaata 0.7995991983967936 +CTCGGTACCAAATTCCAGAAAAGAGACGCTTTTAGAGCGTCTTTTTTCGTTTTGGTCC 0.9953831948291783 +GCCGGAGCGGAGATCTGCTCCGGCTTTTTTTTT 0.6666666666666667 +CCCGCTTCGGCGGGTTTTGTTGG 0.33333333333333337 +gttaaaaaaaTGCCCGGTTGTGAAAAGCAACCGGGcatcattgtg 0.8076923076923077 +CCCGCATGTTCGCATGCGGGTTAAAATAA 0.0 +CCTGGTAAGACGCCGCAGCGTCGCATCAGGTTTTTTGTA 0.09090909090909094 +CTCGGTACCAAATTCCAGAAAAGAGGCCGCGAAAGCGGCCTTTTTTCGTTTTGGTCC 0.9963092821553792 +CCAATTATTGAAGCGGCTAACGCCGCTTTTTTTGTTTCTGGTCTCCC 0.985169805724455 +tgatgtaagtAAAAGAGCCGGCCCAACGGCCGGCTTTTTTctggtaatct 0.31506849315068497 +GGTATTCATTGCCTGATGCGACGCTTACGCGTCTTATCATGCCTACGGGAACCTGA 0.8694516971279374 +caggcatcaaATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATctgttgtttg 0.9833277759253084 +CGACGATACCATTCGTGGTATCGTCGTTATCAAAA 0.0 +CCCGCATGTTCGCATGCGGGTTTTGTTGG 0.3197278911564626 +CTCGGTACCAAATTCCAGAAAAGAGACGCTGAAAAGCGTCTTTTTTTGTTTCGGTCC 0.9861495844875346 +CCTGTCCTGATCGTTTCCTGAACGATAAATTGTGA 0.7050147492625369 +AGTAAAACCCATGCCGGATGCGCCAGCATCCGGCATAATACCGATTAC 0.07407407407407418 +GACGAACAATAAGGCCTCCCAAATCGGGGGGCCTTTTTTTTTTTTAACAAAA 0.9768946395563771 +aaaaacaaaaACCCGCCGAAGCGGGTtaagtgcggg 0.5024875621890547 +aatcgcgttaCGGGGAGGAAGTTTTTTCAGATACTCCCggaacgcctg 0.5951417004048583 +ACGGCCCTTAACCACAGGGCCGTTTGTTGTAT 0.0 +TCTAACTAAAAACACCCTAACGGGTGTTTTTTTGTTTCTGGTCTCCC 0.9938886512253254 +CCAATTATTGAAGGCCTCCCTAACGGGGGGCCTTTTTTTGTTTCTGGTCTCCC 0.9959446855103613 +CTAAAGCGCCAGATGGCGCTTTAGTTTTTTTTT 0.8194945848375451 +GGGCGGTCCGCAGATCGCCCTTTGTTGAG 0.0 +cgtctgcgtaTGGAACGTGGTAACGGTTCTActgaagattt 0.8296422487223168 +gtaagcagaaAGTCAAAAGCCTCCGACCGGAGGCTTTTGACTattactcaac 0.6491228070175439 +ctctggtagtCCTGGTAAGACGCGAACAGCGTCGCATCAGGcatattgcca 0.9527410207939508 +ggaaacacagAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTcgaccaaagg 0.9968 +CGACGATATTCGTATCGTCGTTTTGTTGG 0.1071428571428572 +cttgaaaaatAAAGTATTAGCGTTCTGCGTTAAGACTTTtttcatgggt 0.1869918699186992 +aaccatgaagGCCCGATAAGCGCAGCGCATCGGGCaatttagcat 0.4444444444444444 +CTAAGCGGGCAAAACCTGAAAAAAATTGCTTGATTCACGTCAGGCCGTTTTTTTCAGGTTTTTTTTTGGAGT 0.8181818181818181 +CGACGATATTCGTATCGTCGTTAAAATAA 0.0 +gctccgccgtCACGGTTGCAGGAAAGCTAAGGGACTTAGCCTGCGGCGGTTTTGTttggcttcag 0.4565217391304348 +GACGAACAATAAGACGCTAAATCAGCGTCTTTTTTATTGATAACAAAA 0.9494438827098078 +aattattggcAAAAGGCAACCACAGGCTGCCTTTTtctttgactc 0.019607843137254943 +TAGCGTGCCGCAGCACGCTATTTTTTGTA 0.5884773662551441 +agcgtcaaaaGGCCGGATTTTCCGGCCttttttatta 0.9447208402432283 +aagtcgcaccAAAGGGGAGCGGGAAACCGCTCCCCTTTtatatttagc 0.9307958477508651 +aagtgaaaaaGCCCCGGCACGATACCGGGGCgaggcgatta 0.0 +CTCGGTACCAAATTCCAGAAAAGAGACGCTGAAAAGCGTCTTTTTTCTTTTCGGTCC 0.9848369977255497 +TTAAAGCACCTTAATTATCGTCGCATTCAGAACAGTCTGGATGCGATGCGTTAATTCTTTCTTT 0.5475113122171946 +gcttgctgaggatcctaaagccccgaattttttataaattcggggctttttt 0.9791796793670623 +AAGAACGAGTAAAAGGTCGGTTTAACCGGCCTTTTTATTTTGTGA 0.9734888653234358 +cagaaatcatccttagcgaaagctaaggattttttttatctg 0.9291784702549575 +CTAAAGCGCCGAACAGGCGCTTTAGTTTTGCAAC 0.6428571428571428 +AAAAAAAAAAAAGGCCTCCCAAATCGGGGGGCCTTTTTTATTGATAACAAAA 0.9957755998648192 +AGGCGACTGACGAAACCTCGCTCCGGCGGGGTTTTTTGTTATCTGCA 0.9836601307189542 +ccaattattgAACACCCTAACGGGTGTTTTTTTGTTTctggtctccc 0.9922396399192922 +TGTAATTAGATTTGACTGAAATCGTACAGTAAAAAGCGTACAATAAAGGCTCCACGAAAGTGGGGCCTTTTTTAGCGCGA 0.9164578111946533 +GTGAATTGTAGCTGACCTGGGACTTGTACCCGGGTCGGTATTTTTTTGCTTCTGGTCCCGG 0.8171846435100548 +ccatcaaaaaACCAGGCTTGAGTATAGCCTGGtttcgtttga 0.7076023391812866 +gtcagcttgaAAAAAGGGGCCACTCAGGCCCCCTTTTctgaaactcg 0.8654104979811574 +CGACGATGCCATTCGTGGCATCGTCGTTTCAACAA 0.0 +TGCCGGGAAGATTCCCGGCATTTTTTTTT 0.8813760379596678 +tatcataaaaGCAGCTCTGAAGAGCAGAGCCGCgaatcctttt 0.7340425531914894 +cgctgaaacaAGGGCAGGTTTCCCTGCCCTgtgatttttt 0.14529914529914523 +CTAAAGCGCCCGCAGGCGCTTTAGTTTTGCAAC 0.5024875621890547 +taatcattctTAGCGTGACCGGGAAGTCGGTCACGCTAcctcttctga 0.965552876334826 +gtgaataaaaCAACCGGACCCCATCGAGGAACGGAGTCCGGTGtcatattaaa 0.019607843137254943 +GTCAGTCGTCAGACGCCGGTTAATCCGGCGTTTTTTTTGACGCCCAC 0.9342537804076265 +CGCCCGCTAACCACGCGGGCGTTGTTGTAT 0.0825688073394496 +TAGCGTGCCGCAGCACGCTATTTGTTGAG 0.19354838709677413 +CTCGGTACCAAATTTTCGAAAAAAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCC 0.9971714657464502 +CGAGTTTCAGAAAAGGGGGCCTGAGTGGCCCCTTTTTTCAAGCTGAC 0.9139414802065404 +AATGATCTGACGGGGGAACCTCCCCCGTTAAAAAAAT 0.21875 +CGCCCGCCGCAGCGGGCGTTTGTTGAG 0.37106918238993714 +CTCGGTACCAAATTCCAGAAAAGAGACGCTTAACAGCGTCTTTTTTCGTTTTGGTCC 0.995767374925929 +aaagcaaaaaCCCGCCGAAGCGGGtttttacgta 0.7058823529411764 +CGACGATGCCATTCGTGGCATCGTCGTTATCAAAA 0.0 +CGACGATACCATTCGTGGTATCGTCGTTTATTGTC 0.54337899543379 +CCCGCTTCGGCGGGTTTCAACAA 0.0 +tgcataaaaaAGCCAACCCGCAGGTTGGCttttctcgtt 0.9206978588421887 +GATCCAGCTTCGGCTGGATCTTATCAAAA 0.0 +caaacaatccAAAACGCCGCGTTCAGCGGCGTTTTttctgctttt 0.9511957052220595 +TCGGTTACCGCTTCGGCGGTAACCGATTTCAACAA 0.0 +aataaaaaaaGGGAGCACTGTATTCACAGCGCTCCCggttcgtttc 0.23664122137404586 +GTCAGTTTCACCTGTTTTACGTAAAAACCCGCTTCGGCGGGTTTTTACTTTTGG 0.993255092405234 +CTTTCTTTTGCAGCAGACTGGCAGGAGTGCGAGTCTGCTCGCATAATCA 0.4505494505494506 +TGCCGGGAGAACATCCCGGCATTTTGCAAC 0.28057553956834524 +aggaagaaaaATGCCCCGCTTACGCAGGGCAtccatttatt 0.6688741721854304 +GCGCGGGTAGGCCTGATAAGCGAAGCGCTATCAGGCATTTTTCCCTA 0.6996996996996997 +tcgcgctaaaAAAGGCCCCACTTTCGTGGAGCCTTTATTGTACGCTTTTTACTGTACGATTTCAGTCAAAtctaattaca 0.4623655913978495 +TAGCGTGCGAACAGCACGCTATTTTGCAAC 0.0 +AATAATTTCCGCGTCATGCTTCACGCCGCAGATGCGTTGGCTGCG 0.1869918699186992 +TCGGTTACATGTTCGCATGTAACCGATTTATTGTC 0.5555555555555556 +ATGCTTTAGTAAGGGCGCGACGTTTGCGCCCTTTGTAGGCCGG 0.48453608247422675 +tattttttggGCCGGCATGATGCCGGCttttttttat 0.8937300743889479 +caaaataacaAAACCCACCTTAAGGTGGGTTTcgccagagaa 0.3055555555555556 +tcatcagtctGAGCCTGCCATTGGCAGGCTCtggtgtcctt 0.646643109540636 +CGACGATGCCATTCGTGGCATCGTCGTTTTTTTTT 0.4219653179190751 +GCCGGAGCGGGAACACTGCTCCGGCTTGTTGTAT 0.0 +ACCTGTAAAAAAGGCAGCCATCTGGCTGCCTTAGTCTCCCCA 0.9759961593855017 +GATCCAGCTTCGGCTGGATCTTTCAACAA 0.0 +gttatcggtgCAGAGCCCGGGCGAACCGGGCTTTGttttgggtgt 0.3006993006993006 +AAAAAAAAAAAACACCCTAACGGGTGTTTTTTTGTTTCTGGTCTCCC 0.9964464660104474 +ACGGCTTCCCACGTCAGACCAAAACGCGCCAGGTATTTGCGTAGCCGATCCGCGTCATTGACGCTGGCTTTGCCCTGGCGCGAAACGTCAAAAAGCTG 0.6168582375478927 +ttcagaaaaaCCGCCGAGCATGTCGGCGGctttctgact 0.4350282485875706 +CCTGGTAAGACGCTAACCACGCGTCGCATCAGGTTGTTGTAT 0.0 +gttatgagtcAGGAAAAAAGGCGACAGAGTAATCTGTCGCCTTTTTTCTTtgcttgcttt 0.9901039089559623 +gattttatgaAAAACGGCCTGCGGGCCGTTTTgttttgtctg 0.7706422018348624 +CCCGCATGCCATTCGTGGCATGCGGGTTTCAACAA 0.0 +GCCGGAGCGGTAACCACCTGCTCCGGCTTGTTGTAT 0.0 +GTTATAAGACAAGGGAGCGATAATTCATCGCTCCCTTTTTCGTGCTT 0.9686323713927227 +GACGAACAATAAGGCCTCCCAAATCGGGGGGCCTTTTTATTTTTCAACAAAA 0.978401727861771 +TGAAAACGAAGGCCGGAGCATGCTCCGGCCTTTTTTATCTCTTACA 0.9724366041896362 +cctctacctgCTTCGGCCGATAAAGCCGACGataatactcc 0.9191592562651576 +CGCCCGCCGCAGCGGGCGTTTTTTGTA 0.6108949416342413 +ccaataaaaaTGGGGCCTTTCAGCCCCAtcaaacaatg 0.0 +GCCGGATCGGAGATCTGATCCGGCTTTTTTTTT 0.5 +CCAATTATTGAAGACGCTTAACAGCGTCTTTTTTTGTTTCTGGTCTCCC 0.9940539897728624 +CCAATTATTGAACACCCTAACGGGTGTTTTTTCGTTTTTGGTCTCCC 0.9886000911992704 +GATGGCTCCGATGGATAACCAGCGCCGCTTAAGTCAGGAA 0.9487179487179487 +cgtatcagacCAGGCAGGGTAAACAGACACTTCCCCTGCCTGcatccgatta 0.13043478260869557 +CCCGCATGCCATTCGTGGCATGCGGGTTTTTTTTT 0.41860465116279066 +CGACGATACCATTCGTGGTATCGTCGTTTTTTGGG 0.4923857868020305 +GAAAAATAAAAAGGCCTCCCAAATCGGGGGGCCTTTTTATTTTTCAACAAAA 0.9877345762296087 +CCAATTATTGAAGGCCTCCCAAATCGGGGGGCCTTTTTTTGTTTCAACAAAA 0.9828090080797662 +ACTGCGTTAAGTTATACCGCCTCGGTCAGTTCCGGCTGAGGCGTTTCCACTCCCTCCGCA 0.6062992125984252 +aaaagcaaaaGGGCCGCAGATGCGACCCttgtgtatca 0.0 +tccatacaaaACGGCCCGCCATAGGCGGGCCGgatttacatt 0.7732426303854876 +gtaatcggtaTTATGCCGGATGCTGGCGCATCCGGCATgggttttact 0.05660377358490576 +ccgacgtaaaAAGACGGTAAGTATCGCTTTCAGTCTTatgaatatcg 0.44134078212290506 +agaaattgagATTCATCCACTACTTGCATGGATGAGTaatgattaat 0.0 +GATCCAGCTTCGGCTGGATCTTAAAATAA 0.0 +GGCAGCATGCTGCCAGGTGATCCCCCTGGCCACCTCTTTT 0.966688874083944 +GACGAACAATAAGGCCTCCCTTCGGGGGGGCCTTTTTTATTGATAACAAAA 0.9907123618463824 +CGACGATGTTCGCATCGTCGTTTTCTCTG 0.8500749625187406 +tataaacaaaAAAACCCGACTTCACCAGTATTCTCTGGTTATGTCAGGTTTTgcctgcgaat 0.4350282485875706 +ATCGAACAAAGGGCCTGCGGGCCCTTTTTTCATTG 0.9604273842500989 +taacgtaaaaACCCGCTTCGGCGGGtttttttatg 0.9880597014925373 +CATTGTTTGATGGGGCTGAAAGGCCCCATTTTTATTGG 0.9211977935382191 +tcaggttcccGTAGGCATGATAAGACGCGTAAGCGTCGCATCAGGCaatgaatacc 0.3464052287581699 +AGCCCGCCTAATGAGCGGGCTTTTTTTT 0.8522895125553914 diff --git a/data/sequence_data/choe/test.txt b/data/sequence_data/choe/test.txt new file mode 100644 index 0000000..4c6b97e --- /dev/null +++ b/data/sequence_data/choe/test.txt @@ -0,0 +1,424 @@ +AGAGAAAACACATACGCGTAAATAGAAGCGCGTATGCACACGGAGGGAAAAAAGCCCGGGAGAGTCAATCTCATGTGAGACGACTGTCCGGGGTTTTTTTGTTTTCGGAGA 0.973 +TTCTGCTAATGTGAAATCAAATGTTATAAATCAAGCGGGTGCGGGTAAATTAAATTAAGAAAGTGAAAAACACAAAGGGTGCTAACCTTTGTGTTTTTTAATTAATTAAAA 0.977 +AGCTGGATTTTTGGCTCGTCAAATGGAACAGTAGGTCTTATTCCTTTCTTCGAAAATAGGTATGTAAAGAACAGCTCTCCTTGGGACGCTGTTCTTTTTCATGCGTGCCGA 0.9790000000000001 +AAGTCTTTTTCGGAGGATTAACAGGTATTTTACTGACATTGGTACTCGCTTACTTTTTCATGTAAAAAAAACGCTCCTGCTTTCGCTGGAGCGTTTTATTCTTTCACCGTT 0.9359999999999999 +GCTTTCCTCCTCTGCCGCTTCAGGCGTGCCGCAGGTGGAGAATCTGGAAAACAAAGGCCAGTAATTCGACATCAACCCTGCCCTTCAGGGGTGGGGTTACCTTGCTTATCC 0.9059999999999999 +AAACATAAGACCTGCTTCGTCTTACTTTACCATTTTTCAAATTATATTTGAAAGCAATAACCATCGTTAACCCGGACGTCATACAGTTCGGGTTTCGATTTTTCCATAAAA 0.91 +GGAAATGCTGGCAGCTTACGCGGGTTAATCCCTGTTACGTTTGAATGTAAGCGCATAATCAATCGCGGCCGCGTTAGTACTCACGACGTGGCTGCGTTTTTTCACGGTTAA 0.963 +AGCCTGCGCATCAAAGCAACCGTCTGCCGCTTTCAAAAGTATCAACTTGGCTGTAAGGATATGAAAAACCTTAGTCCGAAATCCGGACTAAGGTTTTTTTATACAATCGCC 0.9990000000000001 +CAATGAGCGCTTCGAGCACTACACGGACGAAAAAATCAAATATTTAGAGCAGATTTTTGAAAAAACGACAAAGCAGCACTGATTACAGTGCTGCTTTTTTTATCCCTGTTT 0.9990000000000001 +TATGGGATATGAATTCATGGGTGCTGATGCGTCCGGATATCATTTGTTTGAGAAAGAGATTACGGCAGAATAAGCGTCCTTTGCAGGGGGCTTTTTTGTCTTTTTCTAAAA 0.8290000000000001 +AAATGAACGTGTTGAAGGTCTTCGGGATGCCACTCGCGCCGATTATGACAGGTTATTCGAGCAAATGAACACGCAGGAGAATTAATTTCTCCTGCTTTTTTCATATGAATT 0.975 +AGCGGAGCGTGCGGCAGCTGAGATTGAAGCGTTTATGAAAAAATCCGCTGTCCCGCAGTAAAAACATAAAAACCGAAATCAATTTGATTTCGGTTTTCTGTTTAAATATTT 0.6409999999999999 +CCTCTCATTGCCGATCCGAAAGAGCTTCGCAGGCAACGAGCAGAAAAATCAGCGAATAAATAACAGACAGAAAAAAGGTGCTCATTGAGCACCTTTTTTAACGTCTTTGAG 0.612 +TTACACTTTGTTTAAAAGCTCTGTGAAGAGTGGAATATTTTTACGTGAATGCAGAAGGTGTGGCATGAAAAAGAGCATTTAGTGTAATGCTCTTTTTGTTTAAAACACTTC 0.752 +GTTGGACGAGCTTGAACATCCCGAACACAAAAATGAACGCTTCACAGTGGTAAGCGAAGCGGAATAATAAAAAACCCGGAGCCAGCTCCGGGTTTTTCTCATTAATAAGTG 1.0 +ATCGAGCACACGGATGCCATTCACGTTTCTTATCCAACCTTCTTCGAGCATTTAAATAAGCTTTCGAAAAAATCCTGAAGTTTTACTTCAGGATTTTTTATGCAGATCATA 0.9620000000000001 +ATTGGCGCCAGCGCAGGCGTTTCTATCGTAAAAGGAGAAAATGCATCAGGCGGAGACAACTATTAAGAGCTGACGGAAGGCAGACTGCTTTCCGTTTTCTGAGGAAACAGT 0.424 +CCTCTATTTACAGCTGGGATGAAGCTGATCGTATTTTAGCTGATCTGGAAACATTATGTAAAGAACACGAGTAGAGGGGAGACCCTCTCTCTTAGTGTTCATCTGTGACGG 0.727 +AGCCGAAGCTGGATGAATAAAACCGAAACTGGATAGATAACTACATGGCTTTTGCCTCATTAATTGAACGCCAGCGCATTCGCCTGCTGCTGGCGTTATTATTCGGTGCCT 0.332 +CCAGCCTGGCAGAACTGCTGAACGCAGGTCTGGGCGGTTCTGATAACGAGTAATCGTTAATCCGCAAATAACGTAAAAACCCGCTTCGGCGGGTTTTTTTATGGGGGGAGT 0.992 +ATTTTTATGAAAAAGCGGTCAGCATGAGAGAGATGATTCAAAGGAACGATTGTTTATATGAAGTATGAGACAAAACCATCTGCTGGCAGATGGTTTTTTTTTATGCAAAAA 0.997 +TATTATCCCACCGTGCGGGGCTGCACTTGGACGAGGAAAGGCTTAGAGATCAAGCCTTAACGAACTAAGACCCCCGCACCGAAAGGTCCGGGGGTTTTTTTTGACCTTAAA 0.7120000000000001 +ATCACAGTTACTACTTCATCGCCTCTTTTATAGAGGATCACCTGCGCTTCCATGCGCAGTATTTACTGAAGTGAAAGTCCGCCCGGTTCGCCGGGCATCTTCTCATCAGAA 0.736 +AGAAACTGGTACTTAAAAGGCATTGAATATAAAGATGGATGGTATGTAGGGAAGTACCAAGCAAATTATTGATTTATCGAGGGGGTGTTCCCCCCTCTTTCATTTTTAGGA 0.951 +TGGATTTTACCTACCGCCACCCCATCTTCCGGTTAGAAATCAAAATTCCGTTTCAGAAATAAAAAAACCTTGAAAAGCCTGGCTTTTCAAGGTTTTTTCCATTATGAGGAT 0.953 +GAAATTCCTTTACTGGCAGATAATAAAATACCTATCCGTGATGAATTGCTTTTGGTCAAGGAGTTACAATAAAAGGCGCTCCTCTAAGGAAGCGCCTTTTGATCATGCGAT 0.32299999999999995 +AACAAAAGCTACAGTTAAAAGATGAGATGCTCAAAATCCTGCAGCAGGAGAGCGTCAAAGAGGTGTAAACTTTCCTAAGCCGCCAGCCAGGCGGCTTTTTTAACAACTGCA 0.738 +TGGCCAGAAACCCGCGCGGCACTGGCGAACATGGTGCAGGCGGCGCAGAACTTGCGTCGGGGGTAAAATCCAAACCGGGTGGTAATACCACCCGGTCTTTTCTCATTACAG 0.82 +AAATTGTCGGTGAAGGCAATCCAAAATTAAAAGACGTTTATGAGGAAATGGAGAAGCATGGGTATACAGAAAACCGCTGACTTGATCAGCGGTTCTTCTCTATCATTAACC 0.5710000000000001 +GCACGTCATAAGGCTAACCTGACTGCACAGATCAACAAACTGGCTTAATCGCCAATTTGCTGAAGCTTTGTGAAAAAGCCCGCGCAAGCGGGTTTTTTTATGCCTGCTGCT 0.985 +AACGTTTTCACACCCTTTCCGGCGGTAAACCGCAGGTTGAAGGTGCTGAAGACTACACCGATTCTGACGATTAATAAAAAGGGCGAAATGCCCTTTTTTTATGTCAGTAGT 0.8759999999999999 +GCAGTCGAGTCGCTTCACGATGCATTTGAGCTTTCAAAACACCCTTCAGCTGTGTAATGACAATCAAAAAGGCGGGACTATGCAGTCACCGCCTTTTTGATTACACTGGGT 0.927 +TCCGGCCTACAGTTCAATGATAGTTCAACAGATTTCGAATATTCTGAAGCAAACTTGAACTTATCATCAGGCGAAGGCCTCTCCTCGCGAGAGGCTTTTTTATTTGATGGG 0.991 +GCCGCGGTCTTTCAGAATATCCTTGAGAAAAAATACGCTCATTATGTTTAAACAAACGTTTCATTAAAAAAAGCCGTTCCCCATGGGACGGCTTTTTTTCTATTGATTCAG 1.0 +ACTGAAATTAAAGTTGCTGCAGATCGGGTAGGGGCATAGCTTAGATAATTGGAAAAGAGGAAAAAAGCTTAATCTTTTTTCGAAGGTTAAGCTTTTTCTTTTATTTATAAA 0.818 +GCAAAGGCTATCAAGCGATGGCTGAAGCCGTTCACAAGCTAGGCTATAAAGAATTATCAGTTTCATAGAAAACGCGCCTCATGACAGGCGCGTTTTTTATGTGTGATCGAA 0.9890000000000001 +TGGACAAAATTGACGCATTAAATAAAGATGAGCGTGTCGGTCCAAATCCTGATGAGCTTCTGTTTTAATCAAAAAACTCCCCGTTATGGGGAGTTTTTTTAGTGCGTTTTT 1.0 +GAAATCACTAACGGTGCCAAAGGCCCTTCTGCTGCAAACGTAATCGCTCTGTAAGATACGTCAGCAAGAATTCAAAACCCGCTTAATCAGCGGGTTTTTTTTGGTCTTTAG 0.998 +GCCTGGTCTTTACTGTGTTGCTGGCAATCCCGTTGGGAATTTATTTCCTTGGCGGCGAAGAGCAGTAAGTAAAAAATAGGCCCGATAACTCGGGCCTTGTCAGTTATTGAA 0.792 +CGGCCAATTTAAATCACTCAACAAAGAAGATGTGCTGTCAATTTTGAAAGCATCACTATAAAAAAAGGGGAAATAGCCGTTTGGCTGCTTCCCTTTTTCTTTTTTGTCAAC 0.92 +AATGCGATGGATATTGCTGAAATCCGCAACAACGCGTCTCCGGCAGATCCGCAAGTTCATTAATGAGTATGTCCGGCAGAGAGGGCTCTGCCGGATTTTACGACTAATACA 0.7879999999999999 +CGGCAACACATCATGCCTCTGCCGCTCCTGCGCGTCCGCAACCTGTTGAGTAATAATTAGCTCAAAGTAATCAAGCCCTGGTAACTGCCAGGGCTTTTTTATTTCATCTTT 0.9590000000000001 +CGGCAGACATTGAATTGTCCTTTTCAAAAGCTTATATACCTGCCAATGAAAAACAATTACGTGCATAAAAAAGCGCAGTCGTGAAGACTGCGCTTTTCTGTTACTGTTACG 0.9790000000000001 +TAAAGTCAGAGCCGTTCACCTGGCTGATTTGCTATTAGAAGCAGCCGGGCACAAAACATCATGAGAAAGCCCAAAACAGACATTGTTTTGGGCTTTTGTGCGTTATTCCAT 0.948 +GTCAATCCAGTTGCTGAATTATGCTGTGGCAAGGCGGACAGCCGCAGTGAAAGCATCTCCGCCTGTACACTAAAACAAAGCCGCCTTGGCTTTGTTTTTTTATTTTCTCCT 0.888 +CAGATCATAGCTCACACCATGATGGATGCAGCAGTGACGGAGGTTTCAGCGATTCCGGCTGTGGCGGCGGAGGGGGCGGCGATTAAACGCCGCCTTTTTTTATTTCATTGA 0.8640000000000001 +AGACGCACCGTGGCAAAATATCGCGATCAAATGAATATTCCGTCATCGGCAGCGAGAAAGCGATATAAATAAAATCCTCCCTAGACGGGAGGATTTTTTTAAGGAATCATC 0.905 +GAGATCGAAAAAAATAAGAGCTTTGAAGAACTCCTAAATGAAAGCCAAATGGATTGGCATCAATATAAATAAACCAGCACCCGTACAGGTGCTGGTTTTTCTGCTATGAGC 0.917 +CATGTTTGCGATGCCAGCTCTTAACGTTCCGGCTGTACCGGCAATTCCGCAGCAGTAATTCACGGCCAAAACGCCGGAGAATTTTCACTCCGGCGTTTTCATCTGAAAACA 0.345 +AGGTGTCCAGACTTGAAAAAGCGGCTATCAAACAAATGAATAAAAATATTCATCAATAATGAAAAGCCTTTAAAACGATGTTGTTTTAAAGGCTTTTCTATTGATTATGGA 1.0 +TTTTTTGATGAAGAGGAATCAGGCAAGGTTACAGCTGTTTGGCGCCTTCATACAGAACAATAATAAAAAACCCGCTCATACTATATGAGCGGGTTTCGTTTGTTTTTTATC 1.0 +TGAATTTGATGAAGCAAAAAGCATAGAGGCTGTTGAAGGCATGAAAAAAATGAAAACAGCAGTCTGAAAAAAGAACGGCCCATCCATGGGCCGTTTTTTTAATTGTTTAAG 0.9890000000000001 +AATATACGGAAAGCAGACACCGTCTCGACAAATTCCGATTTGTTGATTTTGCGCGGCGGGGGAAATGAGAGAGGCGCCTGCCTGCGGCAGCGCGCTTTTTTGTTTGGTATC 0.8290000000000001 +CCCGCGTCCACTGCAATTTATTATGGCTTCCATTATTTTAGTCAGCGGTTTAAAATTGATTTAATGAAGAAGGCTATCCGCTTGAAGGATAGCCTTTTCTTATTATAACGT 0.9390000000000001 +TAAGCGTGATTTATTAAAATCACTACTTGAAGCGAAAGGGATGAAGGTAAAACTCGCAGGCGGTTAACATAAAAAGCCACGGATATATCCGTGGCTTTCGAATATTTTACT 0.7170000000000001 +CAGTTGCCGGAAAATAATTGTGAATATAGATCGTGTATGCATCTAGCTTACATACACTTTATTTCTTCACAGAAAAGCCCCTTTCTAAGGGGCTTTTCATATTTCAAGAGC 0.9890000000000001 +CACGGCTATACGGTGCTGGATATTCAGCAAGACGGGCCGACCATTCGTTATTTAATTCAGAAGTAATCCCTCATTCCCGCTGTGACCCACAGCGGGAACATCGTTACCTTT 0.8109999999999999 +GCCCATGGAAAACCAACCACGCTCGTTGTGCTTTCTTCTTCTAGCTGCCATCCCGCTTTTTGATAACAAAAAACCCGCAGCTATTCTGCGGGTTTTCCTCACTTATTTTTG 0.877 +CGGATTGAAACAAGTGAAGGAAAGGGCTTTAAGGCTGATATTGAAATCGAATTGTAATGGATTTATAACGGAAACGACTTGGCACAGGCCAAGTCTTTTTTATAAAATGGA 0.7 +AGCTTGATATGGATGATTACAGAGCAGCAGAATTAGTCGAAGCATTCATCAAAGAATAAAACACCAAAAGGAAATAGCCATGACGGCTATTTCCTTTTTTATTTGAAAATG 0.997 +TCAGCAGCGGGTTGCGCTCCCTTAATGTGCCTCGCCATATAAATTGAATGGTGCAGGGAGCGCGCAGGGGGCGGCCAATCGCCGCCGCCCCCTGCTGTCCCGGCCTTCGGG 0.44299999999999995 +GCCATACTATTCGCGACGTTATTCTCTTCCCGGCGATGCGCCCACAGAAATAAATTTCACTTTAATGAACGAAGCAGTCAGGCGACTGCTTCGTTCATCGTTCCGCAAATC 0.9440000000000001 +ATCTGGGATCGCGCGGCAGGTCGTCAGGTGACCTCCAGCGCTCACTAAATCACTGAACATTTGTTTTAACCACGGGGCTGCGATGCCCCGTGGTTTTTTATTGTGTTGATG 0.655 +TGGCTAAAGAATGTAATAGTACAGATAAAGAAGTCTTTAAAAAAATTTAAACGGTAACGATTCATAACCTTGATTGGAAAAAATGCCTTTCAAGGTTTTTAATTACAATAA 0.958 +AAGTATGTTGAAGAGATTGAACAGAGGTGTAAACAAAACATTTTGGCGATTGAAATCCAGATGAAAATCAAATAGGAGAGGCGAATGCCTCTCCTCTATTTGTCATCTCAT 0.968 +AGTGATGAAGAACACAACCGTCAAAGGGAAACAGCTGAAGGTGAATAAAGCGAATAAATGATGAATGACCTGCTCCCAGTTAAAGGGGCAGGTCATTTTGCTGCTGGCTGA 0.997 +TATCGCTATGATGACTACGAGCTGAGGGATTTAGAAGAAGTAAAGCATGATGAAAGAGTCACAGTCGAAGGGAAGGTTCATTCAGAGCCTTCTCTTACCTATTACGGAAAA 0.39899999999999997 +ATTCAGCGCAGTGAGCTGGAAAAGCAGGCGATGGAAACCGTCATTAACGCACTGGTAAAATGACCATTCAGGATGAGGCGGTAAGCTTCATCCTGATATTCTTGCTTCAGT 0.89 +AAGCGTGACAACATTTGTTGTAAATCGTTAAGAGCACAAAGACACACATAAATAATAGAGTGTCTGAATTGGCGATTTTCCGATTGGGAAATCGCTTTTTTATTTTGTTGC 0.977 +AAACCAATTTACTGTTCTGCGCCGCGGGAAGAAAAAATACTTCCTTGTGACGTATAAATAAGAAAAAGATCCTTTGCCACTGAAGGCAAAGGATCTTTTTGTTTACCGCAT 0.992 +TCTAAAAAATAGATATGATGCTTGTTTTATCGCTTCAAAATAAACGATGACAACTAGTAAATGCCGCCTTGACCTCTTGTTTAATCAGGCGGCTTTACTTTTATACATAGA 0.961 +CGAATTGTCGGCACATCGAAGTTTGTCTTTTTCCCGTTTAACGAAATGCGTCAAACAAAATAAAAACGCCTTGCTGGCCTAGGACAGCAGGCGTTTTTATTTTGAAAAAGA 0.958 +TCCTCGTGACAATCAAAAATTTACTGGAAGATCCTGAACAGCTTTTATTAGAAGGATAATAAAAAAGGGTACATCACGATAAAGTGATGTACCCTTTTTGATGCATATTTA 0.998 +GGAAATGGAAGATAACGGAGAGCTTGAGAAATTGAAGAAAAAATGGTTCACTGGCGAGAAGTAAAAAAAAGCGGCTCAACTTTTTCGTTGAGCCTTTTTATTGAATAGGAA 0.529 +TTATCCGATTGTGCAAACCATTTTAAGGCGTTCAAGCTTTATGGAAGCTGCTAAATAATTGTCACACAAAACCTCTTCCGCTTCCGGGAGAGGTTTTTTTGAACAGAGCGC 0.9420000000000001 +CATTCAACAAGCACTTACGAATGCAGAATATTGGCATGGTGTTATGGATGTAATTATTAAGAAAGCAAACAAAGTCGCTCAATAACTGAGTGGCTTTTTTCTTTGTCCTCT 0.998 +CCTTATTTGTCTTTGGATGGTTTACCGTGATGACCGTAATAAACAGCGGGTATCCAACCGCACATTAAAACCTGTATCCGATCGGATACAGGTTTATTTTTATGGCTGTAA 0.988 +GAAAAAAATGCCCAAACTGTCAGCAAAGCATTATATGCGATACGTTCTGAACAGCAAAGAACAAAGTAAAAAACGACCTTTACGAAGAGGTCGTTTTTGATTTTTTAATCA 0.988 +TAAAGCACAGCAAATCAAAGAAGACGCGATTGCGCGTGCGAAAGAAGCAGGAAAAACATTCTAATGAAAAAGCCTCCCCTTATTTGGGGAGGCTTTTGTTTTAGTGCTCCA 0.998 +CCACTATGGAAGTGTTCGAAAAGAGGATTGTCTTTTCTGTCTGTCATTCTTAAATAAAAAATAGGGAAAAGGCTTCCGCATAAGACGGGAGCCTTTTTCAAAGGTACTGAA 0.865 +CTAAAGGCCGCGCTCGTCTGACCGTTTCTAAGTAATAAAGCTAACCCCTGAGTGGTTAAGCTCGCATTTCCCAGGGAGTTACGCTTGTTAACTCCCAGTCAATTCACATTC 0.518 +GGTACTTATATCAAGGTACTAAACAACAAATTCCTAATTGAATTAGAAAATCTAAAATCTCATTAATCACAAAAAGAACCCTTTTTGAGGGTTCTTTTTTTATTTCAAATA 1.0 +ATTTCCAGAAGGGCTTTGACGCTATGCGTTCGGGCCAGTCCGGGAAAGTTATTCTGAGCTGGGATTAACACGAACAAGGGCTGGTATTCCAGCCCTTTTATCTGAGGATAA 0.9570000000000001 +GACAGCAAACAAAATGCTGAAAAAAATGGAGAATGCCATGGGTCTTGGAAGAAAAAGACGCTAATCAAAAAACCGCTCTTTGCAAAGAGCGGTTTTTTTCAGTTGACCTTT 0.991 +TCCCTGTCCTCGGCATCCTTGCCTTCCTATTGCCAAGTGACCAGAAGCTGAGAGAATGGCATAGCTGAAAAAACCCCTGCCAGGCGGCAGGGGGTTTTTTAATCCAGCTGG 0.988 +CACTGAGATTTACAACCTAAAAGGCGGCTTTAAAAAATGGGGCGGAAAGATTAAAGCGAAGAAATAAAGCAAACAGCTGTCTGGTAGACAGCTGTTTTTATTTATCTTTCT 0.981 +TCTTATCAAGCGCTCAAAAAGATGAAATCATCAATGTTAACAAAAAGAAACAAAAAAACCGCTGACTCGGCTCCAAATGAGCAAAGTCCAGCGGTTTTTTAATGCACCTTT 0.884 +CTTCATAAGGCATCTATCTCAATTGAAAATGACAAAACGGGTGTAACATACCGCATAGCAGTGCCAAAATAGACTGTAGATGTTTTGCAGTCTATTTTTTTATGTGAAAGA 0.9840000000000001 +TATTAATCATCGTTGGGACATCTTTCTTTGGTGGCTATTAAGAACACACCTTACCTCATATCCATCAGACGCCATAAGAAATCTTTATGGCGTCTGTTTTTTATTAGATTT 0.619 +CTGCTGACGGTATCTACGGGCCTGCGACGAAAGAGAAGATCGCAGCGCAATTAAGCTGATCAAAGACCATAAAAATCCCGGAGCCGCTCCGGGATTTATTTTTTCTTCTTC 0.765 +ATATTGTGACAGTTTGGGTTGACGGCGTAGATGTACAAAAAGGCAGAGTATCGCTGTCTATGGTAAAATAAAAGCACTGCTTACGAGCAGTGTTTTTTCCTATAGAAATAC 0.9079999999999999 +CACAATCAATGATGATGTGAATGCTCCTCTTCTGACTGATTGCTGGCAGCATCGGCGTCTCCGACTTGGACTTTTAAGGTCGGCATGCTATGCTGTTTTTTTGCTGTGACA 0.9159999999999999 +TCGCTGAGTGTGAATACCGCTGCCGCTGGCGGTAACGCCAGCTTAATGACTATAGGTTAAACAGGTAGCCGGAGGATGTTACAACCTCCTCCGGCATCTTTAACGTGGTCG 0.746 +GGCAGCTGTTAACGTAACAGCTATCTGATCGAATCCACTGATCTGAAGTGTGAATACGCTTCAATCTCGCTATAAAGCCTCGTCGAATGCGAGGCTTTTTACTATGCTTTA 0.9640000000000001 +AAGATTTAGCCAGATCATGTGATGCCGCATTTGCAGAAATTGTATAGATGCAGATATAGGGGAACGAAAAAGCTTAGTCATATCGATTAAGCTTTTTCCGCATCCCTTTCA 0.726 +ACAAGCAGAACACCTGATGATTTGGAAGCATTTAACCGCGAATCTTTAAACTTGTTAAAATAACAAGAAGGCACAGACTGTTCGGGTCTGTGCCTTTTTTAAAATTAAGAC 0.968 +TTTGGCACCGGATGCAGAAAAATCTGCCGGGTGATTTAGCCATTCAGGGGCGATAATCCCTTCCGTTTAAAGAGCAAACCCCTCAAACGAGGGGTTTTTTGTTGTTTTTAC 0.961 +GACTGCGATTCAGTCTGCACAGCAGCAGTAATCTCGTCATCATCCGCAGCTTTGCGCTGCGGATATCTGAACCGGAAATAATCACTATTTCCGGTTTTTTATTCTCTTAAT 0.986 +AAGCAGCATTCTGGGCAAAATAAAATCAAATAGCCTACGCAATGTAGGCTTAATGATTAGTCTGAGTTATATTACGGGGCGTTTTTTTAATGCCCCGCTTTACATATATTT 0.9590000000000001 +AAAATATAAAAGTAAAATTTAGAGATGATATAAAAACATTCGGTGCAAAGCATGTTGCAGCGATGAACAAAAGAGCAGTCGTTTATGCTGCTCTTTTCTATTCGTATGTTC 0.47100000000000003 +TTTACGAAGGGCTGTCTATCCTTTCAAGAGCATTAGAGAATGCGAAAAAAGAATAAATCAGTAAGTCTGTCATCCGCAAACTGCGGGAGCAGGCTTTTTTTATTTGACATT 1.0 +CATTATCTGCTATGTTACTGGCCATTTTTGCTTATTATACTAGCTAACTTTAATACAAAACTGCATGAACATAAGTTAAAATATGCTCATGCAGTTTTTTTAGTTTATCTA 0.894 +GCTGCACGCAAGTTTCGTGAGGAAATTCAAACGTATAAAACAGGAAGATAATCGTTCATTTTCCTTCAAAGGGCTGTTCTGTAAAAGGACAGCTTTTTTGCTGTCCATCAA 0.99 +CTCTTGAAAGCAAAAGAAACAGCTACATCTTAAAACCCTCTGTTAAGAGGGGACAGCTTGTCGGCAAGTCCATCCTTGGGCTTAGCAGGCAAGCTTTTTCTTTACGGCATA 0.9520000000000001 +ATGGAGAAAAAATGTCTTTTTAAGAGGATTAGAAAGTCTCCCTGTTTCACTTTCAAAATAACATTCAAAACGCCCCCTTTTAAAAGAGGGGGCGTTTGTGTTTTCTCAGCG 0.9520000000000001 +GAAGAGCCTCAACAATTATTAGAAACATTAGCTGACAATTTACCTGAACAGGAATAACCTGCACATCCAAGCCGAAGGAGGAGATATCCTCCTTCTTGTTTACCTTTTTAT 0.8640000000000001 +ATACAAGCTTGACCAGGATCAATGGTTCCGCATTTTTACAGCCGTACAGGGTTACGATATTCCATAAAAAGCATCAGTTTACCAGCTGATGCTTTTTCAATATTGTCACAC 0.92 +CCTTTCTGAAGATGTAATGAAGACGGTGGATGAAATTGTTGAGTCAACAGCAAAATAATTCAAGTATGAGCTGGGTCATTGTAAATGATCCGGCTTTTTCTTTTTCTCATC 1.0 +CCTTTTCAACGTCTAATTAATTTATTGTCTCATGAGGTTTCGCTAAATCTCATGAAGAGCGCAAAAGTATATAAAATCCACGCAATTGCGTGGATTTTTCATTTCACCGAG 0.987 +CGCAATTGGTATTCAGACAGCTTTTGACAAGTTATAAGTTCTAGTAAATACTAGATAAGGGGATTAATGACATAAAATAGGTTGTTAATTCTCTTATTTTTGTTCAGCTAA 0.511 +TAAATCCGATAACCTGAAAACCGAGATGGATGAGTACATCCGTAACCACTAATCCGTAAGACGTTGGGGAGACTAAGGCAGCCAGATGGCTGCCTTTTTTACAGGTGTTAT 0.9790000000000001 +TGGAGCAGGAAGCATCGCGTAAAAACGCATTTGCAACTGTCGGCGCTTTTCCAGTATGTTGCTAAAGATTTTATGAAAAACGGCCTGCGGGCCGTTTTGTTTTGTCTGGAT 0.8220000000000001 +CCGCTGGCAAAGTATCCATTCAATAACCGAAAGAAGGACCAGCATGAACGGACAAACTCCGGCTCGGCACTACTACAAGAAGCTTGTGCCGAGCCTTATTCTCATTTTGAA 0.527 +TCTGAGATGACGACGAAGGAAATGCTTGAAAATTTGATTTCTCATATTAAAAATGGATATGCAACATAAAAATCCCGCACGCCTTTAGCGGGATTTTTTTATGCGAGAGAA 0.958 +GGTATTTTGCCTGAAGATGAGGATGATAAAGAATTTGTAATTAGTTACCTCGGTCTATAATTAAAAAATGGCTCAGCCCTACTCTGCTGAGCTATTTTTTATAAACCAATC 0.9740000000000001 +AAAAACAAAACGGGCAAAACAGCAGTAGAGCTCGTAGAATCCCTGTTAGGGAAGAAGCTGCTGTAATAAGAAAACCCCCGCACCCGCGGGGGTTTCAGCGTGTCGACAAAC 0.7909999999999999 +GCCGCGTTGTGACTCTGGAAGATAGCCGCAAAGCCCTGGTCGGGAATTTAAAATAATCCTCTATTTTAAGACGGCATAATACTTTTTTATGCCGTTTAATTCTTCGTTTTG 0.8190000000000001 +CGCTTTCGTCCCATTCGGAAATTTTTTGTACGACCGCGGATTAAGGAACTATAAATAAGGAAGCGCAAAAAGGCTGGATCTTTATGATCCAGCCTTTCTTATTAAAACCAC 1.0 +TTAAAGTATTTGAGATATTTTTCGTAAACATTGAGTAAACTGGTGATACAGAAATAGGACATGGAAAATCCCGAGCCGAAACAAGACTCGGGGTTTTTCATTTTTAGGGGG 0.66 +TAGTTATCTATGCTCCTGGGGCTTCACTCACTTGCCGCCTTCCTGCAACGCGAATCATTTAGCGGAAAAATCCTGGGGCTGCCAACTGGCGGCCCTTTTACAAAGAAGGAT 0.425 +GCACTGTGTTCGTGGCAACAACTATTGCAAATGCACCTACACACCAAATTGAAGTTGCACAACGAGGAATGATTTAACCGCCGTCCATCGGCGGTTTTTTCGTCCCCTCTT 0.9470000000000001 +CCACGGCGGGTACGCTCCGTCAGTCCATCGAGAATCGCTAACGCCTTGATCAACCAATAAACCCCGTAAATTGTCCGGCATTTCGTGAGGAATGCCGACATTTATGCTCGC 0.915 +CGCTGAAGACCTTGAAAAGGCATTTCCGGGACTGCTTGAAGCCGTGCTGAATGTAGACCAGGACCGCTGGTGAATCAATCCCCTGTAACGGGGATTTTTTTATGTCCGTAA 0.976 +TGAACTGGAGCTGAATGAGGATACTGTTTTGATTACAGAGTTGTTTGTGCTTGTTAAAGATTAAAAAAGCCTGCGGGGCAAGAACCCCGCAGGCTTTTCATTAACCAGAAA 0.997 +ACGTGTATTTGCACTGGAAAACCGTTCCTTTTATAAACCGACATTTCATAAATAACTTAAAAAGGACTGATGCTGACATATTCAGCTCAGTCCTTTTTGATGCGTCTTTCC 0.98 +TCAGGACGTTCATCTGAAAAGGTTGTTAATACTGTATTTAAAGCTGAATAATTTAGGTTATTTTATAAAATGTATAGCTGTCGATTTTTCGGCAGCTTTTTAGATACTTGT 0.8340000000000001 +TTGTGTTTATCATCATGATCATCCAGCACCTTTCCTACGTCATCTCTTTCAAGAAAAACGAAAAAAACTCCGGCGCGCTGTAATCGGCCGGAGTTTTTTCTTACCAGCTTG 0.856 +CCGTCTGGTATGCAGGTTGTGACTGCAAAATAGTGACCTCGCGCAAAATGCACTAATAAAAACAGGGCTGGCAGGCTAATTCGGGCTTGCCAGCCTTTTTTTGTCTCGCTA 0.519 +ACTCACTTCGTACGCTACAGCGAAATGAAGTTCCCGCAGAACGATAAGAAATAAGTTTCCTCCCTTCCTTGCTAAGCCCTCTCAACCGAGAGGGCTTTTTCAATTCCATTT 0.973 +CAGGCTACGCATCCGCACCGACAGATTACGGGCTCTACCTGTTTATGACAAAAACTGAAAAAGCATAAAAAAGCGGCTCCCTAAATGGAGCCGCTTTTTCGGGAAATAGTC 1.0 +ATATAGACAGGCTTATCAGCATGCTTGATGACCTGGAAGCCAAATACGAACGTTTTAAAAAAGATTGGGAATAAAACCGCGTGCCCGCCGCGGTTTTTTTATTGGCATAAA 0.9520000000000001 +GCAGTAGGTCGGAGTAGACGCGCCAGCGTCGCATCCGACGTTACGCGCCAATGCGGCCTCCGGCACTAACGCAAAAGGGAACCTGATGGTTCCCTTTTTCACATCATTGAC 0.988 +CAGGAGTGGAGTATTATTCTGTCGGACGGAAGGCGGTTGTTCCGTCAGGTGCAATTCAATAGATGAAAAAGAAAGGCTGGTTGCGCACCAGTCTTTTTTTCATCTATTGCG 0.972 +GGAACAGCAGCGCGACGTTATCGCTCGTACCTTTACTGAAAGCCTGTAAGGCTTAAGTAGCAAGTTACTAAGCGGAAGAGGGCGACCTCTTCCGCTTTTTTTCGTTAATAG 0.983 +CTCTTATCATCATTAACGAAAATGTTTGAACAAATTCACGGCGGCACCCATACAGTACATTGGCTTACGTAAAAGAACACGAATACCGTGTTCTTTTTCACATAAAAAAAC 0.9259999999999999 +GGTCATCCAAGATAATGATTTAACGCTTGAAGAGTACTTTGAAAAAGTGAAGAAAATTTAAAATGAAAAGCCCGCTTACACTAGATAAGCGGGCTTCATTGTGTCTGGTGA 1.0 +GCATCTCTCCGGCAACTTTGACGAAATATATGAAATATATTGATGATATTCTAGAATAGTAAATTTTAAAGAGTTGATTTATTTAATCGACTCTTTTTTTGTTAGATAGAT 0.96 +TGTCGGGATACTTGGCAAGAAGTGTGAAAATGCCGGATAAGAGGAACGAAAAAGGATTGAATTGATCAGTAAAAAAGGACCGTTAGCGGGTCCTTTTTTATTTCCAATGGG 0.9159999999999999 +GTCTATGAAATCAGCAATACGATCAACATCCGCACAGGCGAAGAAGGACCTGAAGCACTTTAATATCGGTACGAGATTCGGACACTCCGGATCTCTTTTTTTGTGCACAGA 1.0 +CCCTTTCCTCTATGGCGGAAGAGCTTCGGGATATGACCAAACGATTTAAAATTGAATAATAAGCCTTAACACCCAAGCTTGTTGCGCTTGGGTGTTTTTTTTAGTTTTTTT 0.998 +GTATGGTAAGCTCATTATGGACGGTAATGCCGTAGAAACGATACAAACTCATTTTTCATAAGAATGCAACGGAAAACTGCTGATTTCAGCAGTTTTTTTTCGTTAAAAAAC 1.0 +GCGTAATTCTGGTACGCCTGGCAGATATTTTGCCTGCCGGGCGAACAGTGTGATACATTGCTGTGTCGGGTAAGCCATTACGCTATCCGACACAGTGTTAAATCCTCGCTT 0.466 +CCCAATTAATCTACGCTGTGAACTGCTGATTCCGATCCGTCGTTAACGCTGCAGCTCATCTAATGCAGGGGCGTCCAGATGCGAGATGTCCCCTGCGGTTTCAACGACCCA 0.335 +GTGATGTCTTGTTTACATTATGTTCTATTCGTCATTAACCTCTTGAGCGACAGAGAGGAGTGGGGCATTAACGTTTATGACGGGGAGAGTCCCCGTCGGTTGACATTTACT 0.698 +AAAAGAATCATTCAGCCCAAGTGAAATGCAGGCGATTGCTCAAGAATTAGCCTCGCTTCGTAAATAATGGAGACTGGCCGGAAACTCGGCGAGTCTTTTTTATCATTTCCC 0.607 +GATTTATGGCAAGCCGGAAGCGCAACAGGCATCTGGTGAATAATTAACCATTCCCATACAATTAGTTAACCAAAAAGGGGGGATTTTATCTCCCCTTTAATTTTTCCTCTA 0.912 +ACCTGAAACAGACGTACTGACTGAGCAGGATCAAAATCCGTTTTTAAATGGGTTTTCATTGTAAAAAAAGAGACAAACCGATAAGGTTTGTCTCTTTCTGCATTTAATGGC 0.997 +TAATCGGCTTGGTAATCTATTTCCTGTATTCAAGAAAGCATTCAAAATTAAATCAATAACCTTTTGATAAAGAGAGCGGCCATACAGGCCGCCTCTTTTCTGTTCTTGGCA 0.981 +GTCGAATCAGGCAAAGCCGGTCAAGAAGACATCTGATGAGTAAGATTATCGCGACTTTGTATGCGGTAATGGACAAGCGCCCCCTGCGGGCGCTTTCCTTCGTGATGGCGC 0.374 +GAAAATGGACGATCCGTCTGGTCGCGCGGTTGGTGTTCACCAGTCGGCGTATAAGTAAGAAATGCGAAACAGCCCCGGTGAAATCCGGGGCTGTTTCAGTTATTGAGGTGC 0.56 +TTACGATAGCGGTCAACCTTGTGTTTGCGGGGATCGCTTTATGGTACCATAAGAAATCAAAAAGTAAGCAGCAAGGGTCTTAATCCCTGCTGCTTATTAATGTGTGGATCG 0.9390000000000001 +TGATGTCCGTTGATTTTATGGTACCGGACTCGCATACGCTGCTGGCTGCTGTGTGGTAACAAAACCTCTCTATTAAAAAGGTGCTACGGCACCTTTTTTCTTAGCATTAGA 0.971 +GAAGAAACTTGGTAAACAAGAACCTTCTAAAAACGTTGGCACCTTTTAGCCACGCGAAAAAATTTAGAATGATAATCTTTTTATAAGATTATCATTTTTATTTATTCTATT 0.937 +TCATTGCGACCTTTGGAGAAGGAAACTGGTGGCCGTTCGGCAGGAAGAAAGGAAAAGAGTAATCGAGAGAAGAGTGATGAATGGTCATCACTCTTTTTTCATGACTAAACG 0.9570000000000001 +ATCCTGAAGGAACAATATGACGGTGTCATTTGGCTGGAGCATATCACGCCCTCTGTCATTATCAAGTGAGCGGTGTCCCCTGTGGTAAACAGGGGATTTTTACATATCGCA 0.873 +CATGAAACCGAGCTTGTTCCGGGGGAGGGGAGTCGTTCTGCGGAAGAAGTGTGAAAGGGTATCATTAACATCAGGCCGGATCACATCCGGCCTGGTGTGCTAATGCCTGCA 0.531 +ACGTTGGCAAACATGGCGGAAGAACTTCAGGACATTACGAAGAAATTTAAAATAGAATCATAATGAAACGAGAAAGCGGCATATCTGCTGCTTTCTTTTTTTGTTAAAAAG 1.0 +TGAATTGGTTCCGGGTAAAGTGATTCGCCTGGAAGTTAATGAAGACCGGATTGTCGCCGTCCAGTAAATGATAAAACGAGCCCTTCGGGGCTCGTTTTTGTCTATAAGTTA 0.436 +AAGGAGCGGTGCACCACCACAAGTATTTCTGCAAACAGTTCATCAACGCGCTCGAATGGCTATACGGAAAAAACCGCTCAACCCTATGAGCGGTTTTTTTATGCCTTCTGC 0.97 +ATTCTGTTCCTGGTGAGTTTGTTCATGGGCCGAAAACGACCCTAGATTTCAGACCATTAAGAAAATTTTATCCAAAGCCAGTCCAGCGGACTGGCTTTTGCGGTTTTAGCG 0.946 +GATGCTTCTAATTGGGGCGTCAAAACTGTAAGTGTTAAAGTCTTAAACTAATAATATAGATTGTGATAAAAGACATAAGCTTTTGGCTTGTGTCTTTTTTTGTTCTGCTTA 0.9990000000000001 +TGAACCCATCGCAGCGCAGCGTATCGCTATCAGCGAACGTCCCGCGTTAAATAGCTAACTAGCTATTCTCTTTGCCCCGCCATTCTGGTGGGGCTTTTTTTGTCTTACTTA 1.0 +GCAGCTCGTGCTAACCAGCGTCTGGACAACATGGCTACTAAATACCGCAAGTAATAGTACCTGTGAAGTGAAAAATGGCGCACATTGTGCGCCATTTTTTTTGTCTGCCGT 0.996 +GTGCACGTTCACCACACTATATTGTGATGAATGACAAGAAACACTAACCATGTCTGAATGGTTATAAGACAAGGGAGCGATAATTCATCGCTCCCTTTTTCGTGCTTGCTG 0.976 +ATTACAAAGCACTTGTGCAGGAAATATTAAATGAACCGATCCCGGAAGATATTGATGTAAAGGATGAGGATTAGGACACAATTTGAAATTGTGTCTTTTTTTGTATATTTT 1.0 +TCTCCCTCGCCCCTCCGGGGAGAGGGCCGGGGTGAGGGGAAAAGGCCGCACTGCTCACCAATTTTCTGCTATCCTGCCCCCGCACTAAGGGGGCAGTATGCTAAACATCGT 0.433 +TCAATGGGAAGAAGGCCGGACCATCATTACATACAAGCTGACTTCTTTGCATTCCGTCAATTAAAACAAGCCAAGAGCATAATTGCTCTTGGCTTTTGTTTTTTATACGAG 0.992 +AAAGGGCTGGAAAATCTCTCTGGCGATCTGTACGAGAAGATAACTAAAGCACTGGCTTGATAAATAACCGAATGGCGGCAATAGCGCCGCCATTCGGGGAATTTACCCCTG 0.7929999999999999 +GAAATTTTATTGAGACATATCGCAAGGAAGTTATTGCGTGATGATATATGTGAACGGCCGGTATATGGAGGAGAAAGATGCAGTTCTTTCTCCTTTTGACCATGGGTTTTT 0.371 +ATTGGCCATCATTCCAAGTAACGGCGAACGTTATCTAAGCACGCCGCTGTACCAATTCGATTAAAAAAAGCCAAAACTCCCGGTTCGCCGGGAGTTTTTTTATATTTCGTG 0.997 +GTACGTCACGGCTGTGGATACAGATAAAAAAGAAAGCAAGGAAGCAAAAGTTGTTGTTGAAAAGTAACCAAAAAGCGGTGCTCGATGCACCGCTTTTTTATTTGCGCCCCC 0.997 +ATCAGCAAGCCTGAGATCAGTGATTTGATTAAAAAAGTGGATACGTATGTTATTGAATAAATAAAAACAGCCGTTGCCAGAAAGAGGCACGGCTGTTTTTATTTTAAAAGT 0.978 +GCACGGCATACCGATGACCGGCAAAAACTTTCGAAGTGACTTAACTGAATTGGCAAATCGGTTATCGTCTATTTGACACCCGCACCACGCGGGTGTTTTTTATTGTTTTCC 0.973 +AACCCGCCACAGTAGTTCCTTCTCACCCGGAGGCAAGCACCTCCGGGGCCTTCCTGATACATAAAAAAACGCCTGCTCTTATTACGGAGCAGGCGTTAAAACAGGTCTGTA 0.9079999999999999 +CTGGTGAGGTTTATTTCAGAAACGCATATTTTAAATAAGGCTACGTCTGAAAATAAAAGACCTCAAGGTATATGGGGAGAAAAGCCCCATATACCTTTCTTACTGATGGAG 0.738 +CGACTGGAAACGGATGTTTATGTGCCGTTGGCGTGATGCCTGACTCCAGCTTCCCTTATAAATTCCTGCTTCAAACTGGCGCTACTAATGCGCCACTTTCTTATATGGTCA 0.857 +TTCGGTATTTTACCCTATGGCTTCCGACTTTGGCTGAAACGGAAAAAATAACCGCAAATAAACGAATAAGGTCCTTCAAAAAATGGAGGACCTTATTGATATTCTTCTAAT 0.775 +TCTTGGTGAAGTATTCAACATCTTTTCTGGTGTTGGTAAAAAAGACCAGCCCGGACAAAATCATTGATTCCCTGAATGCCCGCTTAGTCGGGCATTTTCTTTTTCTCAACT 0.8909999999999999 +AGAAGGCCAATATTATCAAATGTATGAACTGCAAAAGGGACAGAAACATTCCATTGCATAACGCTCAAAAACCCAAAACAATCGTGTTTTGGGTTTTTGGTTTATGTAGTC 0.955 +TCAGGCGGTCTTTTATCCATTTGTTTTTAAGAATTCCATTTATCCGCAATAAGTTTATACAGCAAGCTTTCTAAAATCCCGTGAAAAACGGGATTTTTTGCTTTCAGTAGG 1.0 +TAAGTATTCTGATGGGAATTCAAGCCAAAATTTATTGAATTTCATCGAAAATTTAATAAGCTAAATGATGACACTTGTTCAAAACAGAACAAGTGTTCTTTTTTCTATTGA 0.998 +ACGGAGCGCAGAAATCAAAGAACAGTTTGCCGGTGCCGCCTTAGAGAGACATTCTTAGATGTAAGGACAAAAACAGAAGGCACAGTGCCTTCTGTTTTTTATTTTTCCCGC 0.945 +GAATTTTTTATGTCGCCAAAATCCAAAAGAGCGCAGGACTTTTTAGAGAAAATATTATAGAATATAAAAAAGAAGAAGGCTGGCCGCCTCTTCTTTTTTATTGAAGAAACA 0.932 +ACTTTTCTCTACTCATCCTGATTTAGATGAGCGAATCAGACGGCTGGAAGCTAAATAATACAAACACATTGTTCCTCTGAGAGAATTCTCAGAGGTTTTTTATTTTATCCA 0.99 +ACAGGATATGAAAAACGCGACTTTAAAAGCGATATCAAGCGTTAATGGTTGATTGCTAAGTTGTAAATATTTTAACCCGCCGTTCATATGGCGGGTTGATTTTTATATGCC 0.996 +CGAAAGCAAAGTCGTAATTAAGACTGGTGATAACACACGTCTTACGTTCGACCGCCGCGCGATCAGAGAAGTTTCTGCTGCTGAATAACAGCAGGAGCTATACATAGCAAA 0.706 +AGGATGCATCAGCCATGATTGAGAAAAAAGACTAGTTTTCGAATATTATTTGAAACCCCTCAGCCTGCTCTAGTATAATAGGGTGGTTGAGGGGTGAATTTATGTTAGCGT 0.61 +CATTCGGCGTTGGAATTAATCACATATGAAACAGCCCATAGATCTTAGACGATAGGGGGCTATGCGTGAAAACAGAAGTTCACAGCATAGCTCCTTTTTGTATGGGCGCTT 0.9890000000000001 +TCTTAATCATGGTTTTAGCAGAAATCATGCCTATGAACAAAAATGCAAGCCAGCATTAAAAAGCCAAAAACCCAAAACATTTATTGTTTTGGGTTTTTTAGTGGTCACGGC 0.996 +TGAAAGAGATGAAATCAGCCGATATTACGGTTGATGTTGATATCCTAATAGGAATGAAATGAATGGTAGAGGAAAGCTCCGAAAAAAGGAGCTTTCCTTTTTTAATCGGAA 0.976 +ACACCGGAATCAGTGATTGCCCGTACCCGCAAAGAAAACGACCTTGGCGTGGGTTATTATCAGCCGTAATTCTCAGGCCTGCCGTTCCGGTGGGCCTTGTTTTTTCTCTCT 0.738 +CCCTGAGGGTGCACAGCTGCTTCTCGAAGAAATTCAAGCCGCTTTAAAAGAATAAAATTATGCTAAAAAAGGCGGAGTGATATCATCTCCGCCTTTTTTGCGTGCCAATTT 0.975 +TATCGTCGAACTGCCGCACCGTATAGAGCTTAGAAAGTCAACCAAGTCATAAGAAAAACAAAGAGCAAGCTTCACCTTTATGGTGAATTCTTGCTTTTTTCATGGGGAGAA 0.893 +ATAGATAATTGGAGATATTCACAGGCGAAATGTTTACCCGCCTTGAATATGAGCACGATACCCGCTTCTTCAGGCTCCTAAAGCCAGGGCGGGTTTTCTTTAGTTATGGTG 0.8320000000000001 +TGGATCGCCCGACAGCAGAGTGTGCCGCCGCGCTGGATAAAGCGCCGTTACCGACGCCACTGCCGTAATGAAATCTCCCCGGCTTGCCGCCGGGGATGAATTTAAATCACC 0.8029999999999999 +CAACCAAACGAGTAAAACTCTCTTCAAAGATGCCGATATCACTGATGAAACAGTTTCGTTTAACGGGTAACAGCCTTTTCACAAGAGGCTGTTGCTTTTTTATTGTATAAT 0.858 +ATTATCGGTTCAGTGTTCATCACGGTTTTATGCATTGCATTGGTCGCTTCTTTTTCAAAATAAAAACGCCTGCTGTCCTAGGCCAGCAAGGCGTTTTTATTTTGTTTGACG 0.97 +CGTCTGTGCTGGCACTCAACATTCAGCGCGGCGACAGCACCATCTACCTGTTAATGCAGTAATCTCCCTCAACCCCTTCCTGAAAACGGGAAGGGGTTCTCCTTACAATCT 0.8909999999999999 +GCTCATCATAAGAGTGATGACGAATGTTTTATATTAGTTGATGTAAAATAAAATGATCCATGAGACATAACAATGATTTGTTACATCTCATGGATTTTTTTGTTTCAGCGT 0.99 +TATTGCTGTTGTCGTAGTAGGAGTGGTAACGGTTCTGGTCATCCCTGTGTTCGTGCAGCTGATCGGAGGATAAGCCAAGGCTGAATGCCTTGGCTTTTTTTATTTTATGGT 0.9570000000000001 +GCGGACTGACTCAGGATATCAGAAAAAATGACTCAGAAAACAAAGAAGACAAACAAATGTAGGATAAATCGTTTGGGCCGATGAAAAATCGGCTCTTTATTTTGATTTGTT 0.9179999999999999 +ACAAGGCGAATCTGTCTCATTTAAACGGCTATGAAACGAAGCTGACTGGCAAGGTGATATTGGAAACGTAAAAAATCCCTCTTGATAGAGGGATTTTTTCATTCGGAAAAC 0.941 +AGAGATCATACCAACCTATGAGATTCGTAAGCTTGAAGAGCTATATCACATTTTGAATATCGAAAATACCGTCAGCTGCTAATCAGGCTGACGGTATTTCTTTCATAAGAA 0.986 +CATTATCAGTCATCATCAATGGAGAGGTAAAAGATTCATTGGATTTCCAAGTTTGCTGAGTGAATAAAAAATCCCTTCTTATCCAAGAAGGGATTTTGCTTATTAGCCTTC 0.899 +TTTGTCGGCAACGACCATATGGTTGAAGATGTCGAACGCTTTATCCGTGAGTTCCCGGACGCGTAATCTCCTGCGCCTCTCCACACGGAGAGGCGTTTTTATTGCTGCGCA 0.853 +TTGACTTTATAAAAGATGTATCTACTCACGTTAAAGACTTAGAGGGATTTACAAGATAAACCTAATTAAAGAAGGCCTAATAGGAGCCTTCTTTAATTTAATATGTAATCG 0.833 +GCTGAGGTCGATGAAGAGGAATCTGCCGATGCTCAGCCTGAAGGTGAAAACGAACAATAATAGCTTAAGGCGTAACCCTCCCGCGGTTACGTCTTTTGTGCTAGAATGAGA 0.917 +AAATTGAAAGCACGCTGCAAGATGAAGCAGATCGTTTTGCACAAGCGTTATCCTCATAATGGAAAAAACCTTGAAAAGCCAGGCTTTTCAAGGTTTTTTTATTTCTGAAAC 1.0 +AAAGCCATGCTGACAGCGGCCAAGGTCCTTGCCGGCGCTGCGATCACCTATCATCAGCTATAAAAAAACAGCCGGAGTGTTTATTCTCCGGCTGTTTCCTTTAATATCCTC 0.9540000000000001 +TCATTGTTATTGGATCAGTTACAGACATCTCCCGTTTCGAAAAAAGAGTTCTTCATACAAAATAGCAGCCAAATAAGCCGTCCTCAAAGGGCGGTTTTTCATATTATCAGA 0.965 +CAGACTACGCTGCAAAACTTAAGCAATCTGGAAAAAGGCGAAACCTGCCCGAACGAACTGGTTTAATCTTGCCGCTCCCCTGCATTCCAGGGGAGCTGATTCAGATAATCC 0.62 +GGACCACCTGACCGGGCAATTGACCGCATTTGGTGAGTTTATTCAGCGAGTTAAGATCTAAATAAAAAACCCGCCAGCAATCATGCATGGCGGGTTTTTAACGCGCTATCG 0.945 +GAAAAAACTCAAAAAAGGTATACAAGTCCAATGCGTGGAATGCGATTATAAGGAAGAACCACAGAAGTAGCGGTGAGCAGGGTCTTGCTCACCTTCTTTGTGTGTTAAAAG 0.695 +AATTTGTAGCCCGGATAAGGCATTCATGTCACATCCGGCATAAACAAAACGTACGTTGTTAATGTTCGAATGCCCGCGAGTTGAACACGCGGGCATTTTTTTAGAGTTCGC 0.956 +GTTTATATTGAATCATAAATCAGCGGAGTATAAACATTGGATATATGTTACAAAGTAGAGTCCTTAATTACAAAAGCAGGCACTGCGCCTGCTTTTGTTTGGATTATTTAC 0.843 +AAGCATTAGAATTCGAAGAAGCTGGAGCTTCTTAAAAAATAACCAAAAAGCAAGGACTGCTGAAAGGGCTGACATAAGCCTTTTGCCGGCGGTCCTTTTTTAATTCTGATT 0.998 +AACTAGTCAAAAAGACCTTATGATTACAAAATATCGCCTGTAAATAACTGAAAAAATATCAAAAAAAAGAACACCCAGAGCAAGCTCGGGGTGTTCTTTTATGAAACGTTT 0.977 +GCGGTAAGGCACCTAAGATTATGCCGATGCTGCAACATCGTGGCTTGAACATATAAACACCACCAATGGCTGGCCTGAGATATCGCCAGGCCAGCCTGGTTATAGCAGTAA 0.627 +GCCGCTTTCTTTTACCGCAGGCTGATGAATATTAAAAAGAAGCTTGCCGAGCAGCGTTTTCAATAGGAAATGGCAGAGAACTACAGGTTCTCTGCTTTTTTTGTGCTGTTT 0.991 +CTATGCCAATGACGAACCAGCCTTCTGTAAACCAAATGTTTGGCCGTCCGGAAGAAGAAAATGAGTGATCGTTCGGAACGATGTAAATCGTTCCTTTTTTTGTTTTCAAGC 0.868 +TGGCTGCCTATAGTCTAATAGCAGAAGGGGCTAAAGACGATGGTCTAGTAGAATATTTATAAGGCAATGCTGGGCTAGTCTCTAAGATTAGCCTAGTTATCATACGAATAA 0.506 +CGGTTTGTTTTTCTACCCGGTGCATATATGGATGAAGCGGGAGTTTGGCCGTTGGAATGACACAGAAAAAAAGAGAGGATGATGAAGCCTCTCTTTTTTATGAGAATGAGT 0.905 +AACAGCGGCACGCTTGAAGAGACCAAAAGGCAGCTTGATGAAATCATGAACAGCTGGGCATAATACAAAAAACACCGTTCATATTGAACGGTGTTTTTTGGCCATTAAAAA 0.9640000000000001 +GCGGCACGCTGGAAGATCCGAACCTGTTTATTCGTCGTATGAACCAGCTGCTGGTTTCCTGATGTAATGCCGGATGACCTTCGTGTCATCCGGCATTTTTCTTTTCATCAT 0.895 +TTCAAATTGAATCAGCAAAAGCTGCAGGAAGTTAAAAAAGAGCTGCAAAACGATTTAGGTGTATAATGAACGGGATGGCGATTACGCCATCCCGTTTTACATATTTCCTTG 0.98 +TTCGCTTCTGCTGATGGTCGTGTTGATCGCTTTAACAAAAAATACGGTCTTAAGTAATAATAGATTTCTCAACAGGCAAGCAGCAGTCTTGCCTGTTTTTTATATTGTCTA 0.996 +GTTTGATTGAACTGGCGCTGGAGCGTCACGCTGCGGATAACGCATTGAAAACCACAATGTAATATTTTAACGCCCGATAATATTCTTATTGGGCGTTTTCAGTTTCTTCAT 0.8440000000000001 +TGCGCTGATTTTAATCATTTACAAAATCCGGCTGTATCAGTCGAAAAAAGGACAGTCTGCAAAATCATAAAAGACTCAGCTGAAAGCTGAGTCTTTTTTGTGTTCTGTATG 0.991 +GACAGCAGAGGCTGATGGCAATGCAGCAGCAAAGGCTGTATGTATTCCCTTATCATTGGCATTAGAAAGAGGGCAGAAAACATTATGTTTTCTGCTTTTTTTTCGGAAAAA 0.9940000000000001 +AGCAATGGCGAAAAAACTGGCGAAAGGCATTGAAGTGAAGCCAGGCGAAATTGTCATTCCATTTACTGATTAATCACGAGGGCGCATTCGCGCCCTTTATTTTTCGTGCAA 0.87 +AATTTAAATACCGCCCGACGGATGATTTTGACGCCCGCTGGCCGCTAATTGCGGCACAGTTGCAGTAATATGACGCCGGATGACTTTTCATCCGGCGAGTTTCTTTAAACG 0.858 +ATTTATTAATCATTCTAGGTTAGAGACTTGATAAAAGTATACTAACATTGTGTAGTAATGTTCAGCTTGTAGAAAAAACAATGTTTTTTCTACAAGATTTTATTTTAAATG 0.596 +TTCTGGGATTTGCTGGAACAGGCGGCGACGCAGCAGGCGGGTGAAACAGTCAGTTTCCGCTAAGATTGCATGCCGGATAAGCCTCGCTTTCCGGCACGTTCATCACGAAAA 0.515 +ATCCCGGAATGTAACTCTCGCACCTACTCATGAATTCCTTGTTTAATTCGATAAACAACATTAGTTCTGATTCCCTTTAGCCGTTTCATAACGGCTTTTTCTTATTTCTAA 0.9790000000000001 +TGGCCAGTCTGACCGCAAAAAAGCAATTTCTGAAATCAGAGGAAGTAAATAAGCAGCTTTAGAAAAGAGCTGGGCATACAATAGCCTAAGCTCTTTTTCCTGTATGCAGAG 0.8370000000000001 +GCGCAGCGTGCGCATGAGATTCTGGAAAGCCGGGCGACGCAAGGTTCCAGCCTGCTGATTCCATAAAAGAAATAGGGCTTCCACCTGGGAAGCCCTTTCTTTTTATAGTTC 0.8959999999999999 +TGAACATTTAGAAACACTTATCACAAGTTTTCAAACGTATCATCCGAAAGACTGCGAAATAGAAACAGAAGAGTAACCCGGTTAAACAGCCGGGTTTTTTTATATACAAAA 0.9990000000000001 +AAGTAATTCCTGAGACAAAGTACAAACAGCAAGAACCCGCTGAGCAGGTTCTTAATCGGAAAGTTGAAAAAGGCGACACATTGGCATGTCGCCTTTTTTATTGCTTATTCC 0.983 +TATAAAGTAATAAATGATACGGAATATCGGAATGCACTCAGAAATTTACAAAAATAGAAACATGGTAAAAACCTTTTCATTGAGCGAAAAGGTTTTTATTTTGCAATCTTT 0.863 +ACATATTCTCCTTATCGCACGGTCAAACAGGCGAGAGAATCTGATCACGCGAAGATTAAGATGTAAGAAAAAGAGCCGGGGGATATCCCGGCTCTTTTTTATGCTTGTGTT 0.9590000000000001 +GCGGAACGCTACCTCGATGGTTTAGCTGACGCAAAATAATTTTACAAATCAGTAACAAAAGTAAAGAAGGCGACACCATGCGACTATGGGTCGCCTTTATTTTTTCCCCGT 0.9940000000000001 +TGAAAAACGGAAAAGTGTATCAGGAAAATGGATGGACATACGAGAAAAAATAGCATGAAAACGGCTCGCCTGCAGCAGACTGCGGAGCGAGCCGTTTTTTTATGTCTAGTT 1.0 +GATTATGATTACACAAAACATGACGAAGATTCTTCCGAAACGTCAAAGGATAACGAAGATAAATAAGAAAAAGCGGAGAGGTTGCCCTCTCCGCTTTTTTATTTGACAGCA 0.9990000000000001 +ACAGGGTGAATCCCATTAAAAAAACCGCTGCTTCAAACAGATGAGGCAGCGGTTTCTCAAAACCTTCATCGGCGCAATGCATTAGCGCCGATGATGATCAATGCAGCTGGT 0.6990000000000001 +GCTGGAGAAACTGCGCGAACTGTTCCTGCGCGTTGCGGATATTTCGCTGTTGCAATAATAACGCCGTTATTAAATAGCCTGCCATCTGGCAGGCTTTTTTTATCGCTAAAT 0.995 +CCGCAATGATTGGGCTGTTTAGCTGAAGCTCGTTGTTTATGCCGGATGCGGAGAAAGTCGTAGGCCTGATAAGACGCGACCAGCGCCGCATCAGGCATCATGCTCAGATGC 0.09 +CACGTACGAACGGACTGATATGGACGGCGTCTTCCATACGAATTGGTCTGAATAACCTGTATTAAAAACACGGTCAGTTTCAACTGAACCGTGTTTTTTTCTTCTATCCCA 0.991 +GCTACAGTAGCTTTAGTTGAGAAAAACGAAGAAGATGAGAATGAAGAAGAACAAGAAGAAGTGTGAAAAAAAGCGCAGCTGAAATAGCTGCGCTTTTTTGTGTCATAACCC 0.9990000000000001 +TGATAAAAAGTTTGGCGTGAATATCTTTGTTTCTGAAACAGATACAAATCCATTTACATTTCTATAAAAAAACTGAGGCTTTTACAGCCTCAGTTTTTTGCTTTTACTTTT 0.995 +TGCGACAGCAGGATGTAACATTGACTTACAGAAAAAAGAAGCCAGTGAAAGATGTCTAACACAAAAATCTCCTATTCAAACAATGAGTGGGAGATTTTTATTTTGATCACG 0.99 +CAAACACGGAATGTTTTGGAAGTCTCTCAGCACTTATTCTGATGAATATCAATCATGTGAAAAATAGCCGCAGGCGTGCATATGCTTGAGGCTGTTTTTTTATGCCATAAC 0.9640000000000001 +CTTGAATACATCACAAATATAAAGTATCCTGTATATGTATCATAGTTGGTGTATCAACCACTATAGGCCAGAGAGTTCCCTCACATACTCTCTGGCTTCTTTTATTAACCT 0.89 +GGGTTACCTGACATATATGCTGATTTATCTCGGCATGAAACAAGACCGGGATGCAAATCCTGTGTAATCGCGCTGTTCCCTTGGGGAACGGCGTGTTTTTCTATACTGGGC 0.948 +AATTTTTCTCTAATGAAGACCTTGACCGGATTTTAGAGCTTTTGTCTGAACGAGAATCATAAATGAAAAAACCATCTTTCAAACGAAGATGGTTTTTATTTTATATGCGCC 0.777 +CGTACCGGCATTTGTTTCTGGCAAGGCACTGAAAGACGCAGTTAAGTAAGATTGCGTGGCAGTGAACAGTTTTAACGAAGGGGTGGTTTCACCCCTTTTGTCTTTCTGGCG 0.903 +ATTTATGGGAAAAGCCAGAGTGAAAACAACAGCCGAAATGACAAAGAAAGCAAATTAACATAAAAAAAGCAGTACATGCTGGGCATGTACTGCTTTTTTCTATTACACTCC 0.9890000000000001 +GAAGCAAATGGAGCAGAAAGCCCGGCATATGAGATGAATTAACGTCCGGCCCCCTATAAAAACGAAAAGAAGACAGCCTTTTACCGGCTGTCTTCTCTTGCAATTTCATCC 0.9840000000000001 +AAAGAACATGGAACATGACGGCACCGATAATAAAGCGCCGTGCACGTTTAAAAAAATGATGATGAAACGGTGCGGAGCCGGCTTTCCGCCCCGTTTTTTATGATAGAAAGC 0.23 +CGAAATCCTGCAAACGCAGGGGCTGAATATCGAAGCGCTGTTCCGCGAGTAAGTAAGCATCTGTCAGAAAGGCCAGTCTCAAGCGAGGCTGGCCTTTTCTGTGCACAATAA 0.529 +TGGATACGCTGAAGAAAGATATCGCCCTGGGCGAAGAGTTCGTTAATAAGTAATTGATTAGCGGATAATAAAAAACCGGAGCACAGACTCCGGTTTTTTGTTTTGAGCGCA 0.9890000000000001 +ATTATTATCATAAATGCGATTGTCCTTCGCTTTTCTAAAACTAGATGGATCAAAATGGGCCTTTAATAAAGGGAGCTACTTCACAATGAAGTAGCTTTTTATGTGCACCTA 0.975 +ATGCGCAGGAGGAGCTGCTGAAAGAAATCAGAGACCTGCTGAAGCAGCAAGCAAAGTCGCCGGAATAAAAAAGATGCCGTTAGAAACGGCGTCTTTTTTTATCTCAATTTT 0.966 +GCAGAGAGAGCTTGATTATGAAAAAAGCAAGCGGATTTCTGCGGGAGCTTATTGAGAAAATCAATCTTTTAAAAAAGGGTGGGAATCCACTCTTTTTCTTTTTCAATCGCT 0.971 +TCATGGACTATGAGAGCCAATACAAGCACTTATTTACCATTCAAAATTTGAACTAAAACATCAATCTTATATCCTTTTAACGAAGGATATAAGATTTTTTTGTATATAAAA 1.0 +AAATCTGTCACTATTGCGCCTCTAACAGATTCATCGTGCTGTACCCTACATACAGCCGAACTATAAAAAGAAAGGGCTTCCCAGGTGGAAGCCCTATTTCTTTTATGGAAT 0.907 +AGGGCAAAAATTGTTTCATGAACATCTAATTTTTCATTCAGCTGTCCGGCAAAAAGCGCAAGGAGCTTATCAAAGGTTACAAATCCTTGCGCTTTTTGTGTTGCCAGCTCT 0.7829999999999999 +GAGGAAGAATTAATAAACTTCGGGAGACAATTCGCAAAGAAAAGCGGGTGCGCTGTCTGATCACTCACTGGGAACTGCTAAAACGGCTGTTCCTTTTTTTCTTGTCTTTGT 0.8540000000000001 +AGAAAAAACGCGGCTTCCAGAATGCTCCGAAACAGAAACGGAAAAAGAAGAAATAGCAGCTCAACCAGCAAATAGGGCCGCAGCGCTCTATTTGCTTTCTTTCGGAAACAG 0.602 +TGATTGATGCATTAGAAGATTTGGAAGATGTTCAGCAGGTATATCATAACGTTGATTTAGGTGAGTAAGGAGTGAGCAGGCTGTTATGGCCTGCTTTTTTTGTCCCGGAAA 0.9470000000000001 +AATTGTGAAAGAACTAACAAATGTCCCTGATATTGTGTCTGTGAAGCTCATTGATCTGCCATAATTAAAAAAACTCAAGCTATATAGCTTGAGTTTTTTTATTGTTCTATC 0.956 +CTTCAGACCCGGGCAGAAGTTAAAAAGCCGGGTCGAAAACGCTTCGCCCAAAGACGAGTAATCTGATCTAACTAAAAAGGCCGCTCTGCGGCCTTTTTTCTTTTCACTGTC 0.986 +GTGGGAAAAAACAGCCGGAGGGTGCACAAAACGAACAAAAAAACAGCACCCGTGAACGGGCGTAATCTGAAAGACTCTGCTTAAAAGCAGAGTCTTTTTGTGTTTGAAGCG 1.0 +CTTTACGTACGAACCGAAAGTATTGCGTCACTTCACCGCAAAACTTAAAGAAGTTTAATTAATACTACAGAGTGGCTATAAGGATGTTAGCCACTCTCTTACCCTACATCC 0.7170000000000001 +GAAGCCAATGTGCCTGCCGACGGGACTGTGCCCGAAATTACGGGCGAAATGAAAAGGAGAGGAATTCCCTAGGGCTGTCGAGAAACGTGACAGCCTTCATTTTCCCTTATA 0.77 +GGCGAGGGTGAAAACGGTGAGACTTGCCTGCCTTTTATCACGGCGTATCCGGCACTTTCGTCAGTAATCTGAATTTGCCCGGCACGTAGCCGGGCAATTTTGCAATTAACT 0.937 +ACCGGATTTTGAACCCGCTGTAATAAGAAAAACAGAGCGTCAGCGCCCTGTTTCAGATTATTGACAAAATCCTAAAACGATATTCGTTTTAGGATTTTGTGATTTTCAGCG 0.9670000000000001 +CTTGAGAAGCTCAATGAAACAAAATCTTTGCAAGTGTTCAGAAATCAATATAAAACGGTGAAAGAATAAAACGCTCCAGCGAAAGCAGGAGCGTTTTTTTTACATGAAAAA 0.983 +CGATCTCACCGAACACCAGCGCGGCGAACACTCCTACTACTAAGTTATCCCCTCTTTTGTAGATTGCCCGATGCGACGCTAAAGCGTCATTTCGGGCCTACGTTGAACATG 0.9059999999999999 +ATTGTTGCCATTATCGGTTTAATTGGCGCTTTCTTTATGAAACGCGTTAAAGTAGATCATTAATATGAAAAGCCCCTGACTAGTGTTCAGGGGCTTTTTCATGTTTACTTG 0.9540000000000001 +GCATTAGTTATTTCAACACTAGCTTCATATCATTATTACAAAAAATTAAACGGTTCACGTTGAAAAAGTGTCTTGCGGAGCAATCCGCAAGACACTCAATTATAATGGATT 0.8690000000000001 +TTCGGCTTGCTTCCGTCTATCAAGGCTTCGAAGCTTCAGCCTGTAGACGCGCTTCGCTATGAATAAAAAGAACGCATCCAATTGGAATGCGTTCTTTTTTTAGCTGGGAAG 0.9940000000000001 +GCTTTCATTTACGATCTCCGCGCTATTAAAGCAATCAGAATCTGCCGGCCAGCTGCTGCGGGAAAAATAAAAAACAGAAGGCACTGTGCCTTCTGTTTTTGTCCTTACATC 0.85 +TTTGGCGCGGTAATCGGCACTCATTTGGGTGAAGGTGCGTTAGGAATTTGCTGGTGTTTTAAATAAGGCCAAATCTCCGTTTTTAGAGCGGAGATTTTTTTATATTCTTAT 0.9840000000000001 +GCAAATGAGCTATGACGAGTTCCTTGAAAATGCTGATGTGCAGAAAAAATTGACTGAACTATACGCCGAATAAAAAAGCAGAGATTTCTCTGCTTTTTTTGATACCTAAAT 0.792 +CAACCCTGTGCCATGAATTTACGGCCTTTTGAGGCTGTCTGGTGGTTACAGAAGTAAATCTTCCCTAAGCCCCGGTAATGCCGGTCATTCCGGGGTTTTGTCTGTCAGTAT 0.784 +CCCGCTAAACCAGTATGTTGGTCCTACGAAGCATTCTGCACTGCGCCTGAATCAACAATCATAAAAAAAGGGGCTGCCTTACAGAGAGGCAGCCCTTTTTAAATCACCGAA 0.941 +TGATATTTTGACTGAGACTGTTGAGCTGACATTTAATGATGACGCCGCAAGCTCAGTGGCATTTTAACGGGCAGTGAACCTTTTGTTTACTGCTTTTTGTTTTGCCCTTTT 0.961 +TGAGCGAGAAAGCGGCTGCCAACCCAGTCACACAGCAGCAACCCGATATCCTGCCTTAATGCTGATCAAATCCTAAACGGCCTGCCGTTTAGGATTTTGTTATTTTCTTCT 0.9890000000000001 +AATGAAACACGGTATACACCAATGTATGATGTATTCCCAAAACTAAATCTAAACTTGAAGCGAAAATAAGGTCCCTTCTCTTTTTAGAGGGGGACCTTATTTTATTCATGC 0.9309999999999999 +AAAGGTAAAGCTGACGGAAGTTTAATTAATAAGCTTGTGAGCAGTCAACTGTCTTAAATGGCAAAGAAAAGGACATCTTTCTAAGAGAGATGTCTTTTTTTATACATAAAA 0.9990000000000001 +AAGTAAACATTCTGACAAAAACAGAAGAAAAGAAAGACGATACGAAAGAGTAATTCGCTTTCTTTCTAAAAAAACTGCCGGCTGACGCTGGCAGTTTTTTTATGTAAATGA 0.99 +CTTGTTTGATGAAAAGCAGCAATAGATGATCAAAACGAAAAATAGACAAATGAAAAGGCGTAACATATTTTATACAGCTCATATGTTATGCCTTTTTCTTCATTTCCGCCA 0.6709999999999999 +AAGCTGGCGCTGAAGTTGAAGTTAAATAAGCCAACCCTTCCGGTTGCAGCCTGAGAAATCAGGCTGATGGCTGGTGACTTTTTAGTCACCAGCCTTTTTGCGCTGTAAGGC 0.774 +TGGGTGAAACCTTAATCAAGGTGCTGAAAGAGAAGAACTTGTTCGAAACTTGTAAAACGCATTAATAAAAAAGGGACAGCCGTCAAGGCTGTTCCTGCTTTTTCTAACAAA 0.929 +CCAAGACGGCGTACTGGTTGCCTCGACCGTTCAGGAAGGGGTGATGCGTAATCACAATTAAAAAAATAGCCGGAGGTGAAAACCGTCCGGCTGTTTTTTGCAGTGCTTGTT 0.617 +CCTGTCTCTGAACCCGGACACCGTGGTGCAAACCTGGTTAAGCCTGGCTGAACTGAAGAAATAAAATAAATCCCCGGCGGCGTTTAGTCGCCGGGGTTATGTGATCCCCGA 0.899 +GTATAATCTGATCATGAAAGACGGATCGAAAATACCGGTCAGCCGGACATATGCGAAGGAATTGAAAAAGCTGCTCCATATTTGATGGGCGGCTTTTTGCATTTCAGCCAT 0.963 +CGACATTCAAGAATCTGCCATTTTCTTTTCAAAATCAACATCAAGTTGTCAAACTTTGAAATAAAGTTTTAAAGAACCCTGACTAGTTCAGGGTTTTTTTTTATGGGTAAA 0.988 +GAAAATACTGCTCGCCGCAGCGGGAATCTTTTTGTTAATTGGTCTTATATATCTGTTCATATTATGAGAAAAAGAAAGATTTACCTTTCTTTTTCTTTTTTTTGCCGTCCA 0.9390000000000001 +GGTCGTATCTATCGATCAGTCGTTACCAAAAGGCACAAGAATTAAATAATGATAAACAAAAGGTGTTTCACGTGTAACAATTCGTCGAACACCTTTTGTGTTTCGACAAGA 0.851 +AGTTGTCGCCAAAGTTTTCGATGTCAATGAGCCACTTAGTCAAATCAATCAGGCAAAACTCGCCTGACAGAATTTAATCAAGGGCGGTTAGCGCCCTTTTCATCCCTGTCT 0.941 +GTGCGGAAAAAAATGCTGGAGGGGCAGCTTAATGCTCACTTTCGGTACAAAAACCGTTAAATAATAAAAGTCCGGCTCGCAGTTGAGACGGACTTTTTACGTTTATAATGA 0.971 +ATTAAAACGGATTATTATCAATACTGGTCAAAAGTAAGACCGCAGCGCAAGAAAACAGAAAACGCATAAAAAGAAGACGCTCTGCATGCGTCTTCTTTCATCTTTCAGCCC 0.748 +ATGGAATTTGAGAAAAACCTCGTCCGGATGGCTGAGCGGAATGCAAAGGTATTTGGGTGAACGTTCTTCGCATGCCGGCTGTATATGCCGGCATGCATTTTTTATCTCAAG 0.958 +CAATCGCCGGATTATTTGATGAATTTACACAAACTGTACAGCGTGTATTCCCTAACGCTGAATAAATGAAAGAAGCCGCACAATTTGTGCGGCTTTTTCTTTGCGTTAATG 0.986 +TCTGTATAATGGTGTATGTTGGTCTTTGACTGCGATGAAGTGAGAGGTTGCTGACACACCCGGCCGCTTTGCCATGGCAAGGTGTTCAGGTTTTTCTCACGGAGAACTGTC 0.05 +TCGAAAAAACGCGTTGAATTCGTGCTGGCTTAATGCACCTTTTTACGCGATTTTTTTACTGGCTGGATGTTTTTTACTTCACTTTCTATCCAGCCGTCTTCCAGACGTGTG 0.45399999999999996 +ATGTCATATTTGAGCCTGAGTTGATCATTCGCCAGTCCGCACGAAAACTGAATGAATAAAAAAAGCAATGTATGGGTCTCCCCGCTACATTGCTTTTTTTATAGCTGTTAT 0.95 +AAAATGCTTGAAAAACGTGTGAACCGCGTTCTGGCTGAAACGGAATAGGTATCTGACTAAATAGTATTTCCTAAAAACCTGACCTCATCAGGTTTTTTATTTGTGCAAAAA 0.9490000000000001 +GTTTGAACAGTTTGGTTTTAACGTCATGAAAAGAAAAACAGCACTGTTCCATTATGGCGTCATTCAAACAAGCCCGTGATTCATATCACGGGCTTTCTGTTAATCGGTGAC 0.795 +GAGAGCAAAGAAAAAGCCAGCGGGGAAGCTGGATGGAAAGAAACAAAGTCGGTTTTCACTAAAAGAAAGCACGGGTGTTTGAAAAACCCGTGCTTTTTTGTTGCGGTTAGC 0.987 +GTTTTGGCGTATGGAGCGCCTGGCGTCTGGTTAAAACGACCCTCAAGCAGCAACAGCTTCGCGGTTAACTTCCCTCTGGCCGGAGCCATTCCGGCCTTATCCCTCAAATTT 0.9520000000000001 +TGCAGGACCGCAGCTATCGTGATGCGCTGCTCTCTATGAAAAATAAGAAAAAGTAATGAAAAAGGCAAATCCGTTTACTCATGCGGGTTTGCCTTTTTTGCTGTTTCCATC 1.0 +GAAATTGATGACCTGCTGAGATCGACAGCGTTATATGCTCAGGCCATTTACGAGCTTGCAAAATAAAAGGACCGGCTTCTGCTGAAGCCAGTCCTTTTTTTAAATAAATGA 0.9420000000000001 +TTCCAGAGTGCACAAAAGCACAAGGCGGAAGCAGCGCATTAATCTGATTGTTAGGTAGGTTGGTCAAGTCCGTAATCTCGAAAGAGGTTACGGACTTTTTGTTTATGGGGT 0.965 +TGATGAAGAGGATGGAAAAGATATTGAGGATCAAATCGCTGTCATTTTGGCAAATCAATAATAGCGGGAGACCTGTTTTCCAAGAGAAAACAGGTTTTTTTATGTCTGTAA 0.99 +CTATCGGAAAGCTGGAGAGCGTCCTTTATGAAAAAAAAGGACAGGCTGCTGCCGGTTGAATAGAAAAAAGCCGCGCATATCAACGTGCGCGGCTTTGCCATATTTAAGATT 0.848 +TTACGCTTTCGGCAAGTATTCAGGACCAAACTTAAATGTTTGGGCCACCACAGGCTCTTATTAAATAGAAAAAAGGCTGTCCGTACGGACAGCCTTTTCTAATTTATTTCG 0.98 +TGTGAAAAATGGTGAGCCGATCCAAACGGGAGATCTGCTCCTTGAAATTGAAAAAGCATAAAAAACAAAGAGTGTATATCAATGATATACACTCTTGTTTTTTATATAGAA 0.987 +GGCTATACCGAACCGTTATTACACGCCTGGCGTTTACGCGAAAAAGAAAGTCATTAATAGCGTCAAAAAAGCCCCGGCAAGTTTTATTGTCGGGGCTATGATAGTAACTGC 0.6970000000000001 +TCTGCAAGAGTATGCATAATATCTTATTGTACATGCTGGAACTTGCCGGAAACAAATAAAAAAGACTTGCCCGCTTTTGACAAACGGCAAGTCTTTTTTATTACTTCTGAT 0.993 +CAATGGATGAGCAAGGTAAATCCCTCGACGATTTCCTGATCAAGCAATAATCTTTTGTAGATTGCACTTGCTTAAAATCCCGCCAGCGGCGGGATTTTTTATTGTCCGGTT 0.966 +AACGTAATACAAGAGGAAATCGCTGCAGACAGACTGACTGTCTGCAAGGTTGTTCCTAAATCATAAACAAAACATCCCTCCAGTGTGAGGGGTGTTTTTCTGCGGAAAGAA 0.897 +CAGATTGATATTTTTGATATAAAGAGAGAATCTGTTGTTTAATGACTTTAATAAAGAAGAGTCACTTTATAAGCCCCTCTCAATGAGAAGGGCTTATTTTATTTAATTTGA 0.9520000000000001 +CAGTTTAAAAAAGATGAAAGCATTTATGATATTCAAAAGGTGCTTGCCGCAGTTTGTATGTAAGGCGAAAAGACGGAAGCTTCACAGCTTCCGTCTTTTCATTTACAGCTT 0.772 +TGGCACTCTGCAAGCCCTTTCCGAAAAATGGTTTGGTGCTGATGTGACCAAATAATCAGCATAATGACAAAAAAGGGCGCTTTCACTAGCGCCTTTTTTATTTACGCGTTT 0.912 +ATCACGCTTCCGATTTCAGCGACACTTGGTGCCATCGCCTACTTTATTTTAAATATGATATTTTAATGTCACACCGCTTCTGTCAAAGGAGCGGTTTTTTTGATGGAACAG 0.997 +TAGGTTTGCAAAGGATATTGCGGAAGAGGTGTATTATAGCCTTAAGTAGGACCTTTTGCTTTAGAAGAACACAATGAGGTTTAGTCCTCATTGTGTCTAGTTAAAATTGGA 0.9179999999999999 +GGGCTGGCTTGAACGAGCATTAAGCCCGATTAAAAAGAAAAAAGGAAATTTTATTATAGCTGATAGAAAAACCACCTCTATTTAAATACAACAGAGGTGGTTTTTTTAAAT 0.8540000000000001 +AAGCTTAATGCGAAAAATCGGACAGAAGCCGCCAGCATCGCTGAGGAAAAGGGCTGGATCTAACCATATACTGCACTCCTGGGAGATCGGGAGTGTTTTTGATATGCCACA 0.785 +ATACCGTTATTTCTATATCGAAGGCCCTGACGGTGAGTGGATTGAGTTCTTTCAGCGGTAAAAATAAAGAACGTACATTTTGTGATGTACGTTCTTTTTTATCTATACCAA 0.9390000000000001 +GATGTGCACCGAAATGTTAAAAAGAGTAGGATTAAACGCAAAACAGTTTCATAATAAGTAAAAAAAGGAAGCCTGATATGTCACAAGGCTTCCTTTTTTGCTATTTTTCAT 0.987 +AAAGATTATGTGGCGAAGCGTGACGCTGTTCAAAAACGAATGGCTGAGCTGAAGGGTTAATGAAAAGCCGGGGCGGTTTCTAGAGAAACCGCCTTGTCTTATAAAAGAGGG 0.49200000000000005 +ATTTACAACGGTAATGGCGGAAGCTTATGAGCTTGGCAAGTCTGACGCTAAATCTGAAACAGAATAAGAAGAAGCCACTTTTTTGAAAGTGGCTTTTCACATGATTTTCTT 0.904 +CTTCGTGGCGGCATGGGTGAAAGTGATGAACCTCGACCGTTTCGACCTGCTGTAATCTGACCCCGTTCAGCGGCTGCTTGCTGGCAGTCGCTGAACGTTCTTTACCAGCGT 0.84 +GATGGCTTACATCGAGCTGGTTGATCGTTCAGAGAAAGCAGAAGCTGCTGCAGAGTAATCTGAAGCAACGTAAAAAAACCCGCCCCGGCGGGTTTTTTTATACCCGTAGTA 0.982 +GAAAGATTTTTATGACTTTACCATTGATGATTTCAAGTTAATCAACTATAAACATGGGGACAAGCTTTTATTTGAGGTAGCGGTTTAATGCTGCCTTTTTATTGTGCAGTG 0.8640000000000001 +AGTGAACGGCTATATCAAAGAAGGGCTGAATGGCGGAGGCCTCAATACAAATTAATATCTTCAAAAAACAACCTGCTGTCTCCGTTACAGTGGGTTTTTTCGTCTGAGAAA 0.965 +AGCAGATTTCATGCACCGAAACAGTCTATTCAGGATATACATATTCGCAAAGAACAGTAAGAAAAAAAGCTCAACACTCGCATTGTGTTGAGCTTTTTATATGTTACTGCT 0.99 +CTCCGGCGACCATCTTTATGCCGACAATACCGGGATTATTCTTTCAGAAGATCCGCTGGATATTGAATGATAAGAAAGGCACCGCAAGGTGCCTTTTTTCTGCGTTACCTG 0.659 +AAATTGTCCGGAAAAAAGAATAACATCTGGATAAAACACATGAGATTTTGACTAATAAGATGCTTAAAGCCTGAGATGAAATTTTCATCCAGGCTTTTTTAAACAATGATC 0.9159999999999999 +AAAAAATGACGGGTCTTGAAAGCTATGATGTCAAGATTAATTTAATCTGATGTATTCAATAATAAAATTTATCCATAAACCTCGTTTTTACGGGGTTTTGTTATATTTGAA 0.927 +GCGATTGATGCACTGCCGAGTTTGGATGAGATTGGAGTTTCTTTGTAAATCATTGAAGATGTTTCTTGAAAGTCTTTGGACTTTTGGGGGCATCTTTTTTTATTGCTTACA 0.997 +ATGCTAATGAAACCCCATTTAATATCCAAGCAGCGTGTCTTGTGCTAATTTTAAGTAATTGATAAATAAAGCCGATTTAAAATTAAATCGGCTTATTGCTTTATTCGAGTT 0.889 +CTCTCAGGTACTGAAAGCGGAAATTACGCTGGATTACCAGTTGAAATCGTAAAGCATTGCCGGATGACGCGTCAGGCGCGTGAATGCCTGATGCGTTGTTAGCATCTCTAC 0.441 +ATAAAACAGACACATCTAATAAAACACAAACAAACTCATCATCCATTGAAAAAACAAATTAAACAAAAAAGCCGTCACCTTTGGGGTGATGGCTTTTTTGGTACACAATAA 0.987 +GTTGTCGACATGCTTAATGGCGCAGTACCGAAAGCGCCGTTCGATAGCTGGCTGAACGAATCTCTTTAATCTTACCGGGGCGCATCTTGTGCCCCGTTTTCTCCTCTGCGA 0.594 +AAGAACCGCAAGAGACACAGCCAGGCGCTTGCAAATGCGCTCGGAACTGATAAGCCTTGGTGAACGTTAAACCCAAAACATAAGTTGTTTTGGGTTTTTTTGTACTTTTCA 0.937 +GGCTTTCCATTCGCGCCTGGCTGCCAGTGCCGGTAACGCGGGCGCAGGGCACGACAAAAGAAGGGTAAATAAACGGGAGGCGAAGGTGCCTCCCGTTTTGCTTTCTATAAG 0.733 +GTTCTTGCAGGAACATTAGCCAATCTGCTCAGCGCAGCCATTGCCGGCATGTTTATATAAACGATAAAAGCTCCTTGCGTCAGTGCAAGGAGCTTTTATGTATGAAAAACG 0.991 +CCAAGGATGGAGATATAATCAGGCTTCTGGAATTTGAATTTGAATTTATTGATTAATCAATGTTGGCTGAAAAGAGAGCCTGGTGCCAGGCTCTCTATTTTAAAGGGGGGA 0.39399999999999996 +TAGGTTTGATTTGGACACTGATTAAAAAAGAACTGTCACAATATGAAAAGTGGCGTTTGCAATAAATATAAAAAGCCCCGTTCGATGAACGGGGCTATCTCACGCTTATTG 0.8220000000000001 +CGCCGCCGACGGACCAACCGGAAGAAATTCTCGAAGAACTTGGCGAAATCGAGAAACCGAATAAGTAATTTCTCATCAGGCGGCTCTGCCGCCTGATTGTTAACCACCGCT 0.845 +ATCGAAAAAACAATTTCGTGAATTTTTAGATTATCAAAAATGGAGAAAATCCCAAAAAGAGGAGTAGTGCCTGAGCAGAGGCACTAACTCCTCTTTTGTCAATAACCATCG 0.88 +TGGTGTCTTGGCTGATTGCTGTATTGATCGTTGCTTTGAATGTGTTTTTAATTGTAGATACGTTTCGATAAAAAAACCGGCTTCTAAAGCCGGTTTTTATTTTTCCGCAAA 0.98 +CCGGCCTACACGGTGATGATGTGGTAGGCCGGAGCAGGTGAGTCGCTCTCCAACGTGAAGTTTGTCAGCTATCTGTAGCCCATCTCTGCATGGGCTTTTTTCTCCGTCAAT 0.953 +TGTGAAAGGGCGTTCGCAAGCCGTTGTTGAGCTTCTTAGAATGGGTGAGCTAGAGCTTTAATCCTTGCCGGTATTCCTTCTTTTGGAAGGAGCCGGTTATTGCTGTTTGTT 0.6779999999999999 +TTTATGGGTATTATCAATTCGTCTGGAAAAAGAAACAAATCACACGTCTCATCCAATAGCCTCCGAATCACACCGTTCATAAGTTTGAACGGTGTTTTTTTATTTGATAGG 1.0 +ATGATCTCACGATTCGTTCATCCTATACGGGATGGATATTCAGCGAAGAAAAACTTCAGAAAAAATTAAAGCTCGGCTCTATATAGAGCCGGGCTTTTTACGTCTTATATA 0.9790000000000001 +CTGAATATCTCTGTCGCAACCGGCGTTTTGCTTGGTGAATGGTGGCGTCAGAATAAAGCCTGATAAATCAGCCGGGTGGCAACTCTGCCATCCGGCATTTCCCCGCAAATT 0.897 +ACCGCTGCGCTGTGGAAAGAAATGCGCGAGATTTATTACGGGCGCAATATGCCAGCTGTTTAACGGCTCTGGCGGAGCTCCCAGGCTCCGCCAGATTTATTTACTTCTGCT 0.9129999999999999 +CAAACACCGGGCGGAGAAATGCTCGTGAAAATTGTGAAAATTTCATAATATCTGTTTAGCAGCGAAACACCTCGTCCACAATGTGGGTGAGGTGTTTTTTTATGAAGATAT 0.992 +GCGTGCAGGGAATGCGAAGAGGACCTGGCGGAGAAAGCCAGCAAACTCTTTATCTCGTGGAATAAAAAAAGAGGCTTGGATGAATCCAAGCCTCTTTTCTATTAAGCTTCT 0.982 +AAAACGAAAACGTTGAACGCAACACTGACCAAGCAGACAGAAAGCAGTTCAAGCTAAGAAAAAACAAAAAGCTGAACCCGATTAAGGTTCAGCTTTTTTGTTACCCTAAAT 0.924 +CGTACGATTCGTGCGGCGATGGTTACTGTAGCGAAAGCAAAAGCTTAATTTCTGCTTTCGTAATAATTCACGGCCCTGCATGCGAATGCCGGGCCGTTTTCGTTACTCCGC 0.97 +GCGAGTACCAGGACATCATCCGCGAAGTCAAAGACGCCAACGCCTGATACACTTAAGGCAAATTAACTCCTCTTCGAACCCCCGCTTGTCGGGGGTTTTTAGCATTGATGG 0.741 +GTGGCGTTTATTATTTCTTGGATTATGATCGTTGTCGCAGCCGTCAAATATCTATAAACAAAAAAGCCAATCCTCATCATATGAGTGATTGGCTTTTTTCTTATCTTGGCG 1.0 +TTTGGGATTGTATATAATGCAAGTGTGCAGCTGACGAAATACAGAAGTTATCTTTTTAATTCTTGATATGCAGACAGCCTTTACAGAGGCTGTCTTTTTTTGTGCAGAGCA 0.991 +TCTGGAATGTAGATAAGAGTGTAAAAAGGTTAATGGAGATAACAAGCAATTATGTTGCTGAGAATACAGCTGGTGAGACTTCTAATTAGAAGTCTTTTCTTATTTTAAGGA 0.9009999999999999 +GTGATTTATTTGAAGTTGTCCCGTTATTAACTGAGGAATTCAAACAGCTGAATATACACTCATAAAGGAAATCCCGGACTTTAAAAGTCCGGGTTTTTTCATATTTATAAA 1.0 +GACACCCCTTGGCGAGAAGCACTCATCGGCCTCGCGCACATCGCTGTTCAACGCGATCGTTAATCCCCTCCCCTCATCCCGCGCAGCGCGCGGGATGGTTCCAGTAAGTTC 0.528 +CACCGCATTGTTTGCCGATCTCTTCACTGAGAAAGAATTGCAACAGTAATGCCAGCTTGTTAAAAATGCGTAAAAAAGCACCTTTTTAGGTGCTTTTTTGTGGCCTGCTTC 0.87 +CTACCGCCATGGCGGCATCCTTCAAATGGTGCTTCGTGAAAAAATGAAGCAGTCCTGATGAATCAATAGGAAGAGAAGGCATTTCGCTTTCTCTTCTTTTTATGACACAAT 0.857 +CGAGGAAAATGTGCAATAACCAACTCATTTCCCGGGCAATTCCGCCGGTTCCGAATGATACGAACAACTGAGACTGAGCCGCAAATGGTTCAGTCTTTTTACATGGCAGCC 0.865 +TAGTTTTTTTAAATGGAGCTCAACAATTTATTCAAAAGTATAAAAGGGTTAGTGTTTGAACATAAAAAAGTACCTTCTTACAATAGAAGGTACTTTTTTGTATCTATAATT 1.0 +CTTGTTGGCTCTGGCCCGAGGGGAATCGAATCCATTGTAGCGACGCAGAAGGCGGCGCTGGCAAACTGATTTTTAAGGCGACTGATGAGTCGCCTTTTTTTTGTCTGTAGA 0.821 +CGAAGTGCCGAAAGAGCTGCGTGTCGATATTCGTGAAGTCGATTAAGTCATTAGCAGCCTAAGTTATGCGAAAATGCCGGTCTTGTTACCGGCATTTTTTATGGAGAAAAC 0.392 +TCCAAGTATCAACAGATAAAGGGCTGTTCAGAAAACTTCATAAAATGAACAACCGCTAGGAAGGAAGGGCTGCCGGAAGTGATATTCGGCAGCCTTTTTCTTTGCATCAGC 0.917 +ATTACATGTCCTATACGAAGTGGAATACGATTGGGAATCCGGCGATCAACCAATAAATCAGATAAAAAAACGCTTTTGAGATGATCAAAAGCGTTTTTTGTTTGTCTCAGC 1.0 +GGAATCCGAGGCCGGCTATATCACGCTGCATGTCCAGCGTCTTCAAGATCTCTCGGAATAACCGCTTTGACTTGCAGGGAGTGATCTCTGGAAGTTTTTTTATTGATCAGG 0.797 +GGCATCGGATCGGGACACGGGCCGACAAACCATTTCGCTTTTAAACGAAACAGCTTAAATACAAGCCGATGAGATCACCAGCTGATGGTGATCTCTTTTGCGTTTGCTACG 0.966 +TTGGATCTGTTGCAATAGCTATCAGTATAATGAAATTTATCTGTGATCGTCTAAAAAAGAAACCTAACGGTAAAGAGTGTTAATTCCTCTTTGCCTTTTTTATGCTAAATA 0.9890000000000001 +CCTATTGTTTATGATGTTTTAACCTCTTATTTTTCAAAGGAAACCCGCTAACATCTTTCACCCATCTGCTTCAATCGAGCCCGCGAAAAGCGGGCTTTCTGAGTTATCGTT 0.9940000000000001 +CACCTTTCTTGTACGAAAAAACAAAGCGCCGTCCAATGATCCTGCCGATCATTATGGAGGTTTAAAAAAAGAGGCACTCCCTAAGGGAGTGCCTCTTTTTATTGCAGTGAT 1.0 +AAAACTGCGGTTAAAAGGGACAAAACAATTTGGACAAGCGGTTCAGCTCCATTATGTAAGAGCATAAGAAAAAGGATAGACAGATGTCTATCCTTTTTTGTATGTAAGTTA 0.996 +CGTACTGGTAAGGCTGCTCGTATCAAAGAGCGTCTTAACTAAGATTTCGCTTAAGCGACATCCTGTTAAGAAGGGCTGGCCAATTGGCTGGCCCTTTTTTATCTGTTTGCT 0.99 +GTGCTTTATTCTTTAAATTATTTTAGAGCGGATAAATTCAGATTTCATGTTTTGCGTAAACGATTCTAAAAACGGACGCCTGTATGTGTCTGTTTTTTTATTTGCACTGAT 0.948 +CCGGATTTCTCTGTAGATGATAGCGAAGGCGTAGCAGAAACTAACGAAGATTTTTAATCGTCTTGTTTGATACACAAGGGTCGCATCTGCGGCCCTTTTGCTTTTTTAAGT 0.934 +AGGATGCATCACAATTTGTTGAATTTGCACGTTCTTGTAGGCCGGATAAGGCGCTTACGCCGCATCCGGCGCTGCCCCTCTCCTCACGGAGAGGGTTTGGGTGAGGGAAAA 0.9520000000000001 +TTCTTGAGCTGAACGTGACCTTACCGGAAGCAGAAGAGCTGAAATAAGCCCGCTGTAAAAGCGCATTTATCTTCAAGGCAGAGTTATCTCTGCCTTGAGTTTTTCATCCCT 0.932 +TGAGTCTGCTTCTGAGCAAGAGAATAAAGAAGTCGTTTCTGACTTAAATGCATAAACTTACTTGAAAATCAGTATGTCACTCTGGCATACTGGTTTTTTATTTTTTGTGTA 0.997 +TCACTCCGGATTACGATCGGAACAAAAGAGCAAAATGAAGAAATACTTGCCATTTTAGCTGAAATTTTATAAGAGGTGATCAAATATCACCTCTTTAGTCTATAAACCAGG 0.41600000000000004 +GCTGGTATAATCCCGGCCCCGCGAGAGTACAAACAGTTGTAACTGAATAATTGCTTGTTTTTAAAGAAAAAGAAACAGCGGCTGGTCCGCTGTTTCTGCATTCTTACGGTA 0.737 +AGCAATAACTTCTGATGCCTTATTTAATGATGAAAGAAAATGTACACTTTCTTAATTATCTTATGAGTGAAGACTGCGACACTTAGTCAGCAGTCTATTTTATGGTTCTGG 0.87 +AAAGCAGCCGACAGCATTCTTGAAGCTGTAAAACAAAAAACTCAATCAGCAAATGCATAGTGTAAAAGCCTGTTCTTATCGAAAGAACAGGCTTTTTGCCATTTCATAAGA 1.0 +GAAAGCGGAAGACCAACAAGAAAAAGAGAAGGCGGCAAAAGCCGTTGAATCACCTTCCCATTAATAGAGCACCCCGCGGGTATCAATCCGCGGGGTTTTTTATTCAATCAT 0.99 +TTATATCAACGGCAGAGCCTACCGGAAATTTCAAAAAATGTCCTGGATGAAAAATTCAAAGAAAAAAGCTCAAATCATTTGCTGATCTGAGCTTTTTTTTCGATGGCGACA 0.8959999999999999 +CTACAGCATGACTGAACATAAACGCCAGGTTCCGGCGTCTATGTTTGATGACTGGTGGAAATAAGTTGAAGAAAGCCCGCTCTCGGAGCGGGCTTTTGTCGTGTACAGAAG 0.982 +GTCAGCGGAATTGCGGTATCTTTATTAAGTGCTGCGATTGTCGGCCTGTTTGTGTGGTAGAAACATTGAGCCATATCCCTTTTCGGATATGGCTCTTTTCATTATTGATAA 0.975 +AATTTACGGCAATTCAAGCGCGTTCAAAGCGATCAAACAAGCGCTACGTTTAGCGAAAAACTGATTGGAAAACAGCCTGGGGATTCTCAGGCTGTTTCTTTATTGACAGAA 0.975 +TGATATTACGTTTGTTCACGTCAGTCTGCCCGATCTTGCATTAGAACAAGTAGATATTTCCACAAAAATCGGCGAACTTTCAAGCAGTTCGCCGATTTTTATCAATGCAAT 0.871 +CCGGGTTCATTTTATACAGAGACAGAAAACGCAATCAAGTCAGCCCGCGATTCAGAAAACGATAGCTGAAAAAAAGCCGGAGAATGCTCCGGCTTTTTTTGTTGCATTATT 0.97 +TATAATTGGCTCAGCCGTTTGATTGAGTTTTTTGAGAGCGAAGACATTTTTAAATATGTGCCGAAGCCGTAAAAAGTTCCCAAATTCAATTCTGGGAACTTTTTTAAGTCC 0.848 +CTAGGGAAAGTGATTGGCGTGTTCAGAACCGTCCATTAATCTATTGATCTATCAGATGCATAAAATAACCCCCTCTTCCTTTACGGAGAGGGGGTTTGTTGTTCAAAAAAA 0.9940000000000001 +CAATCCGGGCCTTCTCGGTTCCGGTGCACATTTACGCCAGACCTTAGGTCACATTCTGTAAATCCTTCCTTTTATATCGGGAGGTAACTCTCCCGATAATCTTTTAAATCA 0.88 +TTTGCTAAGACCGATAAAGATTCGCTTTTTTGGGGCGAACAGACCATCGAACGTAAAAACGTTTAACCAGCGACTCCCCCGCTTCTCGCGGGGGAGTTTTCTGTTATTTCA 0.948 +TAACGTATCGACGTTCAAGAGTGGAGGAAAGAACATGGTGCTTCGATATACAGCTCTGGGCGATTCCTTGACGACAGGGAGAGGCTCCGGGCTGTTTTCACCCGGCTTCGT 0.48200000000000004 +TTCTCTGAAAGCTGACGCCTTCGCAGTAATCGTTAAAGCTGCAGAAGCGGCTAAACAGGCTTAAGTCTGACAGGTGCCGGATTTCATATCCGGCACTTACTTTCCTTAACT 0.93 +GCGGAATTAAAGCCGCAAACTACCTGCTTGATGATATTGAAACCTACTTCTTAAAAACGAAATAAAAAAAGACTCCGTCTTATTAGACGGAGTCTTTTTTGCTTTTGCCAA 0.997 +TGACTGAACTAGGCTATTTGTTGCTGAAGGATTTTATAACAATTGCCATAAGTTTGCCTCTGTAGAAAAACGCAGCGTAGTCGATACGCCGCGTTTTTTCTTTTTCCCTTC 0.996 +ATATTGACGGTAAGCGCCGAGGTGAATCCCTGTCTATTGAGGAATTCGCAGCGCTGTCTAACGGATTGTATAAAGCCCTTTTCTAAAAGGGCTTTTTGTTTTGCGCACCAC 0.9640000000000001 +AGAAGTGACGGTGCACGTCCGCCGCATCTGGGCTGCGGTGGGTGGTCGTCGCGACGAGCGATAACAGGCAAGGCCGGGGCGTCGTCTCCGGCCTGCTACCCTTTCCGTCAG 0.8109999999999999 +TCGCCTCTTATTAAAAGACGGCACTGAGGTAAGCCAAGATAAAACATCTGCATAAAGATAAGCACAAACCTCCTGAGTGGTAACACTCAGGAGGTTTTTTTGCATGCAACT 0.998 +ATTGTCGCAGTGCTGGTGGTGATTTCCCAGCCGTTTAAAGCGAAAAAACGTGATCTGTTCGGTCGCTAATCCATTCGGCGCTCCTGCGGGAGCGCTTTTTTCCTGCCGCTA 0.8740000000000001 +AACTCAAAGGAGTCATGTTGGACATTGAATGTAAATGAAAAAGACAATCTCCTGAAGAGAAACATGAACTAAGAAGCGGGCTTAAAAAGCCCACTTTTTTTCTAACAAACA 0.903 +GGATTTCGCAAAGGTTTCAGTTGCAGGAAATAACGTTACGATCTCTGTTAATAAATAAACGAAGCAGCAAAAAGCCGCCAGCATTGTCTGGCGGTTTTCCTTGGAGCATAC 0.8190000000000001 +CATATTATCAACTGCTTCCCGCCGGCTGGAGGAAAATCAATCGTCTGATTGTACAAACGACATAATGAAAAGAAAACTGCGGAAACCCGCAGTTTTCTGCAGTTTTAAGGC 0.98 +CGTGAATGAACTTGGGTTTTCGCCACAAATCAATACGGCTGAACTTCTGAACGAGGAGGAGAACGGAGCATAATCATTTTCTAAGATTATGCTCTTTTTCTTTTGTTATCG 0.953 +TTCATCCAATCATGTACGGGCTGTTTGTCGTATTTATCCTGTACTTTATCTTCTTAAAATAAGGAATGAAAAACCAGCTGCACTGGCAGCTGGTTTTTTTGTTGCAAAAGA 0.996 +ACGACTCTTGCGCAGATGGCAGAAGAGCTTCGCGATCTGACGAAACAATTTAAAATCGAGTAAAAACCGAAAAACAGCGCTATCAAAGCGCTGTTTTTTTATATGGTAAAC 0.9990000000000001 +AGTAATACTATGGTATAATGGTTACAAGTTAATAAGAACGGTCCTGAGATGAGGAGAGACCACAGCACCAAAGTGTAAGCATGCACTTTGGCTGTTGTGGTCTCTTTTTCT 0.755 +GAGCTCACATCATATATTCAGCAAAAAGATGAGCAGCGCCAAAATATCTTTTTGCATTAATTTCCAAAAGACTGCCTGATCGAATCGGGCAGTCTTTTTCAACGAGATTCG 0.977 +GGTCGATGTGGTCTTCCAGTTAGAACCTGTGGATCAACAACCCGCTAAAACACCTGCAGCACAATAATATTTGTAGCCCGACGTATTCGTCGGGCTAATGCCTTTACCCGA 0.8240000000000001 +AAAGAATACTTCGAAGAGATGTAATGCTTCCTTGAAGCATCAAACAGATTTTGTTAAAAGCAGTCGATACATAAAACCGCAGCTCAGCTGCGGTTTTTTTGTGGTTAAAAC 1.0 diff --git a/data/sequence_data/choe/train.txt b/data/sequence_data/choe/train.txt new file mode 100644 index 0000000..21bf20e --- /dev/null +++ b/data/sequence_data/choe/train.txt @@ -0,0 +1,848 @@ +TTAAAAGGCGTTGTTTCCCGTAAAAAACAAGTCGTTCCTGTCTTAACAGACGCAATGGCTGAATAAGCAAAAAGCATCCCGCGTCGGGATGCTTTTTCTTATTCACCCAGC 0.986 +ATGCCTGCAACTGCTGAATAATCTTTCAGCCAAAAAACTTAAGACCGCCGGTCTTGTCCACTACCTTGCAGTAATGCGGTGGACAGGATCGGCGGTTTTCTTTTCTCTTCT 0.995 +CCTTGTGGATGATAAAGGAAATGACTTCTTCCAGCAGATACAACTCACACAATGCACCCGCTGTGTGAAATAAACAGAGCCGCCCTTCGGGGCGGTTTTTTTACATGGCAC 0.915 +ATTACTTAATTCTTCATATGCAAGCCAAAACAACGTCCGAAGCTTTTTGATTTGAGAGAAATAAAATCGTGCGATTCTTTAGCGGATCGCACGATTTCAACTTTTTGATCA 0.866 +AAAATGTATACGACATCGAGAAAAGATGACCTGACCTATATTCTCGGACAGCTGTCCTAAAAAACCAGAAAAAGAAGCTGGACATCCGGCTTCTTTTTTTTGCGGTTGACT 0.998 +GAAGCTGTCTGATAATGATACGCAAATGGAAAAGGCAATTGAAACACTGAAAAAAGAAATGTAAAAAAAACCATACGCGGCTGCCGCGTATGGTTTTTTTATGCGCCCTTT 0.997 +ACCTAAAAAAGTATATGTATCTGCTCGAGCTTTGAAATCCGGTAAAGTTGAGCGTGTATAACAAAATGAACGCCTGCCCCGATATGTGGCAGGCGTTTTTATGTGGTGAAA 0.9990000000000001 +TTTATCCCGCAAAGCGGTGCTCCGTGAAGAGAAAGAAAAAGAAGAACAACAATCTTAAATGAAAACATAAAAGGAGCCTGGGAGACCCGGCTTCTTTATTTTGAAGCTTTT 0.966 +CAAGGAAAACACCCACATGCTGTTTGGTGACGCCAAAGCCAGCGTGGATGCAATCCTGAAAGCTCTGTAACCCTGACGGCCTCTGCTGAGGCCGTCACTCTTTATTGAGAT 0.92 +TCTCCCGCGTTTTCCGTCGGCAGTTTGATCGCACTCCCAGCGATTATCGCCACCGCCTGTAATTTTATTGCCCGCGCGTTAACTCCCGCGCGGGTAATTGCTCCATCCATT 0.615 +AGCGACCTGAAAGTCGGTCTGTTCCAGGATACGTCTGCCTTCTGATTATAATCAGACAGAGTATCAAAAGGCGAAACCTCCGCAATGCGGAGGTTTCTTTTTAAAGAGACA 0.9520000000000001 +CAAGTCAACAGCCTGATTAAAGAAGGGCTGAACGGCGGGGGCCAGAATACGAATTAATGAAAAACAAAACCTTGAAGAATGCTATTCTTCAAGGTTATTCTGCTTTCAGCA 0.8759999999999999 +TTGAAAGGGATATCGAGTAATCGACGGACGATAGATAATTCCTGAGACAATAGTGTAAAAAACGAGCCGCTGGGGGTGTTTTAAACCCCAGCGGCTTTTTTTTAGTAAAAA 0.978 +CCATCGGCACAGGTGTCATGCTTGATAATATGGACAAGCTTCCTAAACGCAAACTAAGCTGATTTCACAAACCTCATTCTGAAAAAGAATGAGGTTTTTTTATGAAAAAGC 0.993 +GGCGGTATGGGCGGCATGGGTGGCATGGGCGGCATGATGTAATTGCCCTGCACCTCGCAGAAATAAACAAACCCCCGGGCAGAAATGTCTGGGGGTTTTTCTTTTGGTCAT 0.986 +AATCAAAACAGTCAAACTAAAGGACAATTCATTTTGGGAAAAAGTGAAGCGTACTTTTTTATAGAAAATAAAAGGACAGGCTGCTGCCTGTCCTTTTATGATTTCTCATAT 0.986 +CGTATTGATAAAGAAGGTGTGTTCCATACCGAATGGCTGGATTAATCTGATTTAACCAACAATAAAATTGAGGCCCGGCGTATATTGCACCGGGCTTTTTTTTGCCAAATA 0.993 +TGGTTTATATAATGACTCGGGCTTAAGCGGTTCTCTTCCCCATTGAGGGCAAGGCTAGACGGGACTTACCGAAAGAAACCATCAATGATGGTTTCTTTTTTGTTCATAAAT 0.945 +GGAACTTGATCTGATTCAGAAGATGGGCCTGCAAAATTATCTGCAAGCACAGATCCGCGAAGAAGGTTGAGCCGTGTCGGGAGGGAGAACCCTCCCGGTTAATAAGAGATC 0.718 +GTTGATCGGCACATGAACCACAACGCCGGATGCGATGCTGCGCATCCGGCATCGCATCATTTCTTCTTCAATTGCGTCACCACCAACTGGTGGCGCGAATTATAAAACTTA 0.8270000000000001 +CGCGAATTCCTGGCGCAGGCCAAAGAGATTGTGCCGCAGCTACGGTTTGATTAATTAACATCCATTCGCAGCCGGTACGCAGTCAGTACCGGCTTTTTTTATTTGGTGCGG 0.7709999999999999 +GCAGATGCGCGAGCAGATTCAGAACATCGAATAACTCGTAACGCCAATTCTTACTTTTCCGCCTTCACAAATGCCGCCACTCAAACAGAGCGGCATTTTTCTTCCCCGCAA 0.929 +AGCCTCTATTCAAAGGCTTTATCGGAGCGTCTGTCGAAGCTGCAAATCAGAAGTAATAAAAAAGACTTGCCGTTTGTCAAAAGCGGGCAAGTCTTTTTTATTTGTTTCCGG 1.0 +TGCTGAAGAGCTTGAGCAGCTGGCCAATGAGCTTAATGAGAAGATGGGTCAGTTTACGATTTAAAGAGAAAAAAGAGGCTGGACTCCAGCCTCTTTTTCTATTCTATGCAG 0.9059999999999999 +TTTGGCTTGAATGTGTGGGACGCGACTGCTGTCTTTCAGAATGTAACGAAGGAGTCTTAATTTTAAAAAAGGCACAGACCCGAACAGTCTGTGCCTTCTTGTTATTTTAAC 0.982 +GATGGGGAAGAAAGAACAGCGATTAAATTAATCGTGGAGCCTCGCTAAAAACAAATAAAGCATTCTCAACCTGTTTGCGTAATGCAAACAGGTTGTTTTTCATTTATTGTG 0.9790000000000001 +CTGGACTACAGCAGTTACTGCAACCCAATAAAGAGTAAAAGCCCGAGCGATTATCGGGCTTTACTCTACGCAGTCGCGGCTTAGTCCGCGACGTGCGGATTCACACAGTTC 0.8540000000000001 +TTTGTGAAAACTTCGCCTGCCAGCAACCGACAACAAATATTGAAGAAGCGATCCACACATTAATAAGCAGCAGGGATTAAGACCCTTGCTGCTTACTTTTTGATTTCTTAT 0.996 +CAGCGTACACGACGCCAACATTGGCATGGAAGGCATTACACCATTCGATTTTATGAAAGAAGAACAGCAATAAAATGCTCCCGTGCAAGCGGGAGTTTTTTTTCATTTGCA 0.9890000000000001 +CGCTTCTTCTGGTAATAGAAGCGATTCTCATCGCTATTTACTACAGCCCGTTTGATTTATAAAAAGCAGAAAAGCGTTTAACGCTCTTCTGCTTTTTTTGCGAGTTTAATG 0.98 +CAAACACCGTTTGCGGGATCGGACTGACGTCATCCGTGAACGTGAAAAAGAAATAGAACGCAGAAAAAAAAGCAGAGACTGATCACAGCCTCTGCTTAATTATTGTATGAA 0.8909999999999999 +AGGATATTAAAGACACGCTTACAGCGGTCGAGGATGCATTTAAACATCTAAAAAACTAGAATCAAATACCTTGAACAGGAGCTTTGAGCTCCTGTTTTTTTATGGAATCAA 0.9940000000000001 +ACCTAAACGTGAAGATTTAGGATTTTGAAGCGGAAACATCAGATGATGAGGCACGATATCTAGGAGGAGTAAGATTTGTTCTAATCTTACTCCTTCTTCTTATTTTGAAGC 0.963 +CATCATTCACCACTCGACATATGAGATGGAAAAGATGTTCAAACGCGTGATGTAGCGGGGGTGGTAGGCATTGATCCATTCTCTGAATGGATCAATGCTTTTTTTGTACTC 0.951 +GAAGCCAATTGCTCAGGCGCTGGCAGAAGGTAAATCTCTCGACGATTTCCTGATCTAATTACTGAGGCCGGATGTCGCTGCTAGGGCGCGTCCGGCGTCGCTTCTCAAGCT 0.762 +TGGAGAGAAAAAAGGCCTGCATGACATCATTTGCAAAACACGTGTCGTGTATAAATAATAGAAAAAAGTCTAGACGCCAATAGGCATCTAGACTTTTGTTTTCTTTGCAAT 0.976 +CAAAAAGCAATGACGAAAATGGTGTGGCCTACTTTATAGACCAGCTTCTTTCGGGGCAATATGCATAAAGAAAAGACTCCAGAGATTGGAGTCTTTTTCTTTTATTGCACC 0.9790000000000001 +TATGTCGGTTCCACCGGAATCCTGACAGATAAAAAAGCCGGAATTATCATCCGGCATTTTACTGTTTAAGCGAAAGCGTGAGTGCCCCTCACGCTTTATTTTTGATTTCCT 0.871 +AAAGCCTCTTACGTCACGGGAAGTTTTATCGATTTGGCGGGCGGGAAATAAAACAGGGAAGTTGTCTGACCGGATGCAACAAGTATTGCATCCGGTACTTCATCGACTTAA 0.631 +GCCGGATTACTTGGCTCTGCATATGCAAGCAAAAACAATCAGTGAAATGTTCTAAGCGGGACTAAATGGGCATCCTCCCTGCGGGGGTGTCCATTTCATCCATATACTATA 0.536 +AAAAAGCAGCCAGATCCAGAGACTCTCCCCCCAACGCTGTAAACATTCGTCTTTAATGCACTGTAAATTATGAATTAATCCTCTAATTTATAGTGCTTTCTATTACCTTAT 0.983 +TGCTTCAGATTATGTAAATGGACATATTTTAGCAGTTGATGGCGGCTGGCTATCCCGCTGACAAATAAAAAAGCTGACGAACGATCGTCAGCTTTTTTCCATTACATCATG 0.988 +GGGATTGCACTGACCATTGCTTGGAAAGCGCCTGCACATCTTAAAGCCAGCACGTGATAAGAAGCGCATTCTTTGTGTACTGCAAAGAATGCGCTTCTTCTTTATACTGAT 0.945 +ACGATTTTGTAGGCCGGATAAGGCGTTTACGCCGCATCCGGCATAAACAAAGCGCACTTTGTCAATAATCTGAAGCCCCGGCCATGTGCCGGGGTTTCTTTTTGACTAATT 0.958 +TACACTTGAGTCTACTGACCAAGACATCGCAAATCAAATCCGCGGATAACAGTATGAAAGGGAAGGAGCGCCTGATTCAAAAGAAGACGGCGCTCTTTCTATGTAAGGGTG 0.93 +CTCGAATACCAGCCAGAAGCTGCTGGCGACTACCTGCTTTAATCCCATCTGCACAAGCGATAACTCGCTTTGCCCAGAGGATGCACTCGCATCCTCTTTACGCAAAATCAG 0.897 +TGCAGGAGATTTCAGGAAGCGCGCAGCATCTTGCGAAAATGGCCGAAAAAGCGCTTGAGGAAGAGTAAGAGACCGGGGACAAACATCCCCGGTCTTTTTCTTATCCTGCAT 0.983 +AGCGGGCGATTACGACCGCATTACTAAGCTGGCGCGTGAAGCTGTAGAAGGCGCTAAGCTGTAAAAGTCAAAATGCCCGATCGAGGATCGGGCATTTTTGTAGCGTTTTTT 0.927 +CGATATTTTATCTTTATTAATCTACTTTGGCATTGCCACTGCATTCATTCACTCATTATAAACGTAAAAAGTCCGTCTCAACTGCGAGCCGGACTTTTATTATTTAACGGT 0.9520000000000001 +GTCGTCGTGCGTGTGGTGGTGGTCGTCGGCAATGCGCCGTAGGGACTGGAACAACACACGATTCCAAAACCCCGCCGGCGCAAACCGGGCGGGGTTTTTCGTTTAAGCACC 0.728 +ACCCTTGAGCAACGTTACCGTCACGGGATTTCCAACCTCAAGCTGAAACACGCCAACCACAAATAAAAATGCCATGCCGGATGCAACACATCCGGCAACTTCACACTTACT 0.779 +AATATTAATATGCAAAAAACCGGCGCAATTGTTGATTTCTTCTCACCAGGCGCGCTGAAATAAGAAAAAGGTAATTCCGCCATCGCGGAGTTACCTGTCCCCTTACCGTGA 0.662 +GCGGAAGCGGGAAAAAATATAAAAATTGCTGCGGCCGTACTGAATAGTTCGCCCCGGCAAGTTTACTGACCGCGGCGCCTGCAGGCGCCTGCGGATCTTTTAATGAGGTGA 0.434 +AAATTAAAAGAAAAAGGCTATTCCTTCGGAGTGTTAGAGGACACAATGGTTCCTGTTCATGAATAAAAAAGTGAGGCGCATAAAGCGGCTCACTTTTTCATGCCCAAACTT 0.909 +TAAATCCTAACCAGACTGCCAGTCCTTCGGGGCTGGCGCTGTATCTCCTCTGTTCTCCTCAGGAAAATCATAGTAGCATCGCGCCTGTGATTTTCCTTTTAAGTCGGTTTT 0.395 +CTGTTCGTTTCGTTGACGGCAGACACAGCTTGAAAAAGAAACCAAATCTTGTAATCCGTCCATAACAAACAGAAAAAGCGGGTGTGAGCCCGCTTTAACTTTTGCAGAAAA 0.787 +AATCTGCAAATTCAATAAATTGCGATGTGTTGTGTAGGCCTGAGAAGCGCCGCGCATCAGGCAACCCAGAAGAAAGCCGGATGATCATCCGGCTTTTTTACATCATCAGAA 0.945 +GAAGTACCAGAAGTTCTGCGTCCGTATATGAACGGACTGGAATATATTGGCTAATACCCAATTTTTCTGAATCTAAAAAGCGCCTGCGGGCGCTTTTTTTGTCTCCCTTTG 0.998 +AAGCGGAAGTTGCTGAAGAAACTGCACCAGAAACAGAAACAACAACTGCGTAACCTATTCAAAAGGTGATAAGAGGGACTGCCTTTTATCACCTTTTTTCAAGAAAAATGT 0.8240000000000001 +AAACCCATCATTGGCTCACAGAGAGAGGGGTCAATACATCAAGCTATAGTGTGCGATAAATGATAAAAGCCCAAAACCATATGTGGTTTGGGCTTTTTGTATTATGATTTT 0.985 +GTAGCTTCCTGGACGATTAATCGGTAGGCCGGATCAGGCGTTACGCCGCACCCGGCACTAGGCCCTCTGCACAAACGCCACCTTTTCGGTGGCGTTTTTTATCGCCCACGC 0.9890000000000001 +AGTTGAAGTGCCGCAAGAAGCATTTATGGCAGTTCTGAAAATGGACGACAGTCCGAAAAAACAATAGAAGCCGCCGCAGTCTTGTACTGCGGCTTCTTTCTGTAAAGAAGG 0.858 +AAATATCTGTAATAAGAAATAGCCCTCGCCGCTTCCCTCTACAGGAATGGCGAAGGGCTGTCGGTTTCGACATGGTTGGCCATCGTATGATGGCCTTTTTTGTGCTTATCG 0.993 +CGGATGCGGCGTAAACGCTTTGTCCGGCCAACAAAAGCACGTAAATTCAATATATTGCTGAAATAACGTAGGCCTGATAAGCGTAGCGCATCAGGCAATTTTACGTTTTAA 0.866 +CTTTATCATCTTCCTGTTGCTTGCTTTCGGAATCTTCATGAAGCGGGATATTGCAAACTAATAAAAAAAGCTGTTTTGCTTATGCAAAACAGCTTTTTTGTCAAAATAAAA 0.927 +CCGCATCATTGAAGCCGCGATGGAAAGCAGTAAAGAGAAACGAACCATTATGCTGGAGCACTAAAACAAAAGCCTCCCCAAATAAGGGGAGGCTTTTTCATTAGAATGTTT 0.9840000000000001 +TGAAGAGTTCGGCTTCACTGTTGATAACGTTGTTGCGAAAGCAAAAGAACTGCTGTAATTAGCATTTCGGGTAAAAAGGTCGCTTCGGCGACCTTTTTTATTACCTTGATA 1.0 +ATCCTGTCATTGAAGGTTTATTTGGCAAAAAACAAGACGCTAATCCAAATCAAAAGGGCGCATAAAAAAACCATACGCGGCAGCCGCGTATGGTTTTTTTTACATTTCTTT 0.985 +ATAAAATGCAGCGGATCCACGAAGTGAAAAGCAGAATTGAAGAAGAGCTTGAGGAAGAATAAAGAAAAACTGGCCTTTCATTCGCGAAGGGCCAGTTTTCTTTATGTATAT 0.963 +TATCACGATCAGCAAGGTAACGCGATGCAAATTAACCGTGATGATGTTTCGCAAATTATTGAACGTTAACAAATAAGGTCAGCATCCGGCTGGCCTTAAGATTTTTCTCTT 0.828 +AATTGAGCGCGCTCTTCTTGAAGAAGTGCCGGGTGTCGTTGAAGTAGAACAAGTCTTTTAATAAAGCAGCCAGGCTGATATTTGATCAGCCTGGCTTTTTTATTAGAGATG 0.995 +CTTCGCCGGCTTTACATATTACTCAATTTTTATTATGGTGCCCGGCGGCGGAGGCCATTAAATGCAGAAAGAGACAAACCTTATCGGTTTGTCTCTTTTTTTACAATGAAA 0.993 +TCAAAAATCGGTTGAAGACGAAGATATTATTCGTCTGGTCGCCGAACGATAATCCTTCCTCAATCTTGAGTAACCCCGTCGTTGAAGGGTTACTCAATTCATCCTGATTAT 0.69 +AGGTAATCGTGGACCTCAAGCTTCTAATGTTGTAAAACTCTAAACTCAATACATGATGATGAGATGACAAATAGAGGAGAGGCATTCGCCTCTCCTATTTGATTTTCATCT 0.986 +GAGCTCGATTATCCAGGTCATATTAAGGTGACAGTAATCAGAGAGACTCGAGCCGTAGAGTATGCAAAATAAAGTGATGCGCTAAGCATCACTTTATTTTTTTGACGGCAA 0.9670000000000001 +AGGTATTTTTATCGGCGCACTGTTAAGCATGCGCAAATCGTAGTGCAAAAATGATAATAAATACGCGTCTTTGACCCCGAAGCCTGTCTTCGGGGTTTCTTTTTGCCTGGT 0.971 +GCGTTGTGATTCCGAAAGTGCTTCGTCCTTATATGGGGAATAGAGAAGTAATGAAACCGTAAATTATGGAAAGGCGTGCCTGACAAGGTGCGCCTTTTTGCTTATGTAAAC 0.997 +CACAATGGCTGGGAGGCTGGCGAGAAGCCATGGCGGACAGGGTAGTAATGGCCTGATTCTGTCTCTTTAAAAAGAAACCTCCGCATTGCGGAGGTTTCGCCTTTTGATACT 0.9570000000000001 +TCATTCGCGCGGGCATTATTAAAGCAACAACTGACGGTACGAATTTTCATGATTGAGAAAGAAAGACGAGAGACAGGGAGTAGCTTCCTGTCTCTTTTTTCTATAATTTGC 0.9990000000000001 +AAGCTCCTGGGATTCATTCACTTGCCGCCTTCCTGTAAACCGAATTATATAGAGTAAAATATTTGATTATCCTTTGCGCGGCATGATGTCGCGCTTTTTTTATGCGTCATT 0.998 +ATTTGGTTCTTGATCGGGCTTGTCTTTATCGTGATCGGAGTCATGATCAATTGGGGGCCGTTTTAACGATTGCTGCCCGCCGGCTTGTACGGCGGGCTTTTGAGTTATTCA 0.978 +GTTAAAGGGACCAGAAGAAGGTTTTGGATTTTCTAAAGCTTTACATAACCATTTCTTTAATTAAAGATAAGGACCAGCCACATAATAAGCTGGTCTTTCTTTATTTTCCCT 0.925 +CATTCAAAAAAGCGACATCGGCCCTTCAGGATTTGAAGAGAAATTAAATACTCATTTTTTGTAACGTGAAAAGCTTGCAGGTTATCTGCAAGCTTTTTCTTATGAACAAGT 0.971 +TCTCAGTTGGCAGAAGAACTTCAAACCGTCATTAATCGTTTTAAATATTAAACAGGCTACTTACGTGCAACCCCCATCTTATTCGGTGGGGGTTGGCTACTTTTACTGTGG 0.927 +AGACCTGTGCTGAATACGACACTGATCCACAGGCTGCGAAGTAATCAATTCGTCTGAAAGCTTGAGTAACAACGGAAACCGGCCATTGCGCCGGTTTTTTTTGGCCTGAGT 0.993 +GTGTAAATGGGCTGTTTACTGATTTCCCTGATAAGGCAGTAAAATTTCTTAATAAAGAGTAATGTTACTGCAGGATGGCGTTCATGCGCCATCCTGTCTCATATCAAATTT 0.728 +CAAGGCATCAGTTGGAGAGCAGCCTGTTGCTGAAGCTGAAATCGGCATCTATATTGAAAAATGAAAAAAACCCTTCACAACATTTTGTGAGGGGTTCTATTTTGTGTCGTA 0.983 +CTACTGATTCCATCAAAGTGGCGGATGAGCCATTTGAAGGCGACCCGCTTAAATTTGAATAGAAGTAACTAAAAAAAGCTCCCGAACGGGAGCTTTTTTATTTAATGGATA 0.997 +ATGTCGGTCAAAAATGGATTGGTGTTCAGCATATTTACGAAAACAACAACACTCAATTTTAACAAAAAAAGTCCGCTGATGTTATTCAGCGGACTTTTTCAATCTTTCTTG 0.951 +AAGCTTGAAGTGAAAAATCGAATTGAAGCCATTACCCGGTCAAAAGAAAAAGGCTGGTTTAAATAAAAAAGGATCTTGGCATCTGCCAGGATCCTTTTTGTTAACCTGAAA 0.972 +GTGAAGCGAGAAACATACGAAGATATTGTAAAACTTGATCTGCCATTTAAAACGGGTGTAAAGCAATAAAAGAAAGCGCCGATTTTTCGGCGCTTTTCTTATTTGAATATA 0.963 +ATCTGGAAGTTCAAGACGAAAAAAGAAACATATCGTCCGGCACCAGACCAGTTTGATTTTCGCCATGTCTCCAATCCTTAAAAAGGGTTGGAGGCTTTTTTTATGAAAGAC 0.998 +CGGAGTTTGGTGCGGGAACAGTTTTTACATTAACCTTTCCAATACGGAATCAATTTGAACATGTCATAAGCGTGTGACGAAAATGTCACATGCTTTTCTTTTTTGTTCGCC 0.961 +ATGCGGTAACCTTATTCAAATTGTACAGAAGTAACCGATTCAGTCGCGTCCTTTTTGTATGCAGAAGAAAAACCGCTGGCCGAGGGGCCGCGGTTTTTTACTCTTCTTCTG 0.857 +TTAATGCGTTTCTGGCTTTCACAGGATAAAGAAGAATTGCTAAAAGTATTTGGCTAAAATAGAAAGGACCTTTTTGCGCTTAAGCAAAAAGGTCTTTTTTGTGACGTCTCA 1.0 +GAAATTTCACTTCTTAAAAAGCTTAATAAGGAATACCAATTAAATTAAACAATCTTGTTGGTAACAAACATCCTCTTACTTATTGTAAGAGGATGTTTGTTTAATTCTTAC 0.997 +GACACCTGAAGGGTTAGGCCTACGCAGATTTGACAACGAGCAAGCCCTTTGATGGGCAGCTTTTGTACAGAGCCAGGGTGCTACCAACACCCTGGTCTTTTTATTTTATGC 0.996 +TGTAACAAAACCGGTGGACGAGGATGGAATTGCATACGCTGTCAAAGAATTGGGCCTTTTAAAATAAAGAAACGGCTGGCATTGTGCCAGCCGTTTTCTCTTTTAGCGCTC 0.977 +TTTGATAACCACGACCTTCTCGTCAGCTATTCAGAAGACGCTGATGAACCGGTTTTTGAATACCAATAAAAAAGAACGCACTTCATGTGCGTTCTTTTTTTACAAATGAAA 0.996 +AAAACAAATTGCGGGTACGTCAAAGTTTGTTTTCTACCCGTTTAACGAAATGCGCAAAACAAATTAGGATCAAGCAGCTTCCCATTGGGGCTGCTTTTTTTATATCTTTTT 0.99 +CAATATCAATACCCTGGAAATGATGCCGGTTACCCAAAGCTATGCCGGACTGAATGTCCACCGTCAGTAATTTTTATACCCGGCGTAACTGCCGGGTTATTGCTTGTCACA 0.6759999999999999 +CGGCGACGATACCTTGACACGTCTAACCCTGGCGTTAGATGTCATGGGAGGGGATTTTGGCCCTTCCGTGACAGTGCCTGCAGCATTGCAGGCACTGAATTCTAATTCGCA 0.85 +TATTGAAAACACAATCCAGGCAGCTGAGAAAGTATTTGCTGAGATCAGCCGCAGATAAGAGTGAAAACCGGTATCAAGGACTCCTTGTGCCGGTTTTTTCGTGCTCTCCCA 0.998 +TCAAAATATTGAAAAACATAACTGGATGCTAAAAGCTTACCTAGGTTAAGTTCAAAAATAGAACGGCCGGATGTCTTAAAAAAGACGTCCGGCTTTTCTTTTTGTATCTGA 0.995 +TCATCGTTGGCTTCTTCGTCATCGGTATCGCTGGTTACGCTTGCGGCCTGCTGGGACTGTAAGACTGTTGTACACTACCGGGGCCTTTTGGCCCCGTTTTTTTATCTGGAG 0.805 +TATAAGAATAAAGTATATGACCGGCTTGGACGAAACAGACCTTCTTCCAGCTGATTTGTCTTGTCTGCAGGATGGCGATTGATAAAAGCCATCCTTTTTATGTTCCAAGTT 0.9229999999999999 +AACTCATTAGGATTCGGCGGACACAACGCAACATTAATCTTTAAAAAATATCAATCATAATCAAGCCAAACCGGCTGCCCTTAAAGGGTAGCCGGTTTTTTTTGTGTGCAT 0.998 +TTGGCGGATGAGATCAAGTCCTTTGCGAATATGAATGCCCTTATTGTAAAATAAAGAGAGAAAAGGCGGTTGCTGTGTGATCGGTAATCGTCTTTTCCCCATTTATAAGTT 0.9259999999999999 +AGAGCTCGTTAAAACGGCAGAAATGGTTGTTGCCATTATTGAAGAGGCGGCAAAATAAAAAAAGAAAACCCGAGTGCGATTTCCCGCATCGGGTTTTTTTAGCTCCATTTC 0.9129999999999999 +CCCAGGGCTGAAAAACGCTGAAACATGGGAAGATGAGAAACAAGAAGCATAATATAAATATAAAACAAAGCTGCATTCAATAGTTGAATGCAGCTTTTTCATTATTGGAAA 0.997 +GATTTTGCGCAAGACAAAAAAAGCGATTTTGATTTGACGCCATTCAGGCTTTCCCGCTTCCAATAATCATCATTACGGCCTCCAGAATGAGGCCGTTTTTACAGGAGTATT 0.8859999999999999 +CGAGTGAGAGACCCTCGTCCGAAAACAAGATCTGCTGTTGAAGATGTTGTGAAGTGGGTTTAAAAGAAGAAAGCTGCTCGCATAGCGAGCAGCTCTTTTTTATGCCTGATT 0.99 +AGATACAACGGCTCGGTATCAGAGTTTGTACCGCCGGAAGTAGAGTTGGCGCTTCAGCAAAAATTCAGACAAGGATGAGGGTTTTGACTCATCCTTTTTTTGTTCCGTTTG 0.927 +AGAACGTTGTAAGGATAGGCAACTTCAAAAGGTTTAACTTTTCACATTAAAGAGAGCATACTAAAAAGTATCGGAGCTGGATAAAACCAGCTCCGTTTTTTATCTTTAATT 0.986 +GGATATCGCCAGTGAGCATGGCTGGATTGAGGTTGTTGAGTCCATGACCTTATCTGTCTGATTCCGCCACTCTGTGCCGGTTTACTGCACCGGCACATTCCCCCTCCGGCA 0.8690000000000001 +GAGAGTGATTGCCACAGGGAAGGGCCATGTAGCTGACAACATTATAAAAGAAGCGAAAAAAGCAGGGGTCCCGATTCAAGAAGATCGGACCCTTGTCGAATTAATGCGCCA 0.5770000000000001 +AACGCGTTTCTTTGGCTTAAAAAGGGAATGTGGGTTACACGAAGGGTAAATATTGCCGGACAAATTAAACGCCTCTGTCAGAAATGATGGGGGCGTTTTTATTTTGCGAGC 0.973 +AACGGAAAGTGACATTTGTGAAAGAAAATAAAAAATGGAAAGTCAATCAATTTGACGCAGTTATATAAGAAAAACGCCCGGTATAAGCGGGCGTTTTTTATTTGGTTTGCA 1.0 +GTGTGGGCGCGGCCGTTGGGCTGGTACTAGGACTGTTGCTGGCACGCCGTTAAAACCTCCCTACACCGGGGTACTGCATTTTTTTTGTAGTACCCCGTATAATGTGAGGCT 0.8320000000000001 +CACAAAAGATATTCATAAAGCGGCGTTTGTTCTGCCGAAATTTGTGAGTGACCTGATTAAATAATGAAGGTATGGCGCAGGAGCGAATCTTGCGCTTTTTTACAGGAAAGC 0.5579999999999999 +AATCATCATTGCTGTTTTACAGCTGATCCTTCTGTTCTTATAACACAAGGAAACGTACTTAAGGTGCGTCCGGTGAACCAGTCGGACGCACCTTTAATAACTATAAATAAG 0.983 +GAAGAAGATGATGCCCGTAAAAAAGATCTGAAATCAATCCAAAAGAAAACAGAACATCATAATCAGTAAAAAGGCGGTCTCGAAGGAGGCCGTCTTTTTTGATACGCTGAC 0.987 +TTGTATCAGCGGTTAAACCGCATATTTTTCACATGGATGAAAGAAAAAAACATACTGTAAACCTGCATGGCACACGTCAAAAATTTGGCGTGTGTTTTTCTGTGGATGGGG 1.0 +TGAACGCCATCAGGCCAAATGATTTTTACCGTTGCATCATGTCGCCCAATATGATGCTTGCTCGTACCAGGCCCCTGCAATTTCAACAGGGGCCTTTTTTTATCCCTGAAC 1.0 +CTGGCACTGCCGAAAGAGAAGTTGCAGGAACTGCACATCTAGTCAGTAAACTATCTTCACAATTAATTGAAAAAAGCCAGTTCAAATGAACTGGCTTAGTTGTACTTAGTG 0.889 +AGAAGGTATCGAGCCGTACCTTGAAACAAAATATTTGTCCATCGGTTTATAAAAGAATGCACGCTCCTGAGAGCTGCCGGATTTTCCGGCAGCTCTTTTTGTGTTCCGGCG 1.0 +AATTACGCTGAACAATGCGCTTCAGCTGATCGGCGTATCGGCTCCGGAAAAAATGTAATCACGATCAAAAGGACAAAGTCTTCGGGCTTTGTCCTTTTTTTATGAGAAAAA 0.9940000000000001 +GACCTTCATTTTGATTATGGCTATTAATGAAATTGGCACATATAAACAATAATAGAACATGATACAATAAAGACAATCAAATCGCTGGTTGTCTTCTTTTTTTCTGAAAAG 0.892 +TAAGCGCGCCTTCAAACAACTGGATAAAACTTTCCTTGATGATTTGTAATTGACCCCTGATTACGCATATAAAAGAAGGCATACTCAGCCTTCTTTTTTTTCACCGTAGTA 0.39899999999999997 +CGGCGGCATATTTGTTACGGTCATAAAGAAAAAACAGCCGGATGCGTATCCGGCTGCAGAAGAAAAGACGCTATAGCCTGTCCGGATCGGACGGGTTTTTTAGTCTTTAAA 0.9 +AATCTTACTTGCGGTTGCTTGTATTTTGATTATGGGGAAAGAAACGAAACAAACTGAGCTAGAATAGGAAAAGCACCTCTTAAAAGAGGTGCTTTCAGCGTGTCGACAAAC 0.467 +AATTGATGAAAAATACTTAACTCAATTTGCCGAAACGCTAGAAAAAGCGTTAGCGCAGTAATATAAAAGCCCAAAACTGATATCGTTTTGGGCTTTTTTTATTTTATTGAG 0.9940000000000001 +GACGAGCCAGAAGAAGAATAATTTTACTTCTTCATGCCAAAAGGGAGCTATCTCCCTTGTTTGAATTGAAAAGTCCAGGCTGCAAAGTCTGGGCTTTTGTCGTATTAGGGC 0.897 +CCTATGTCGGTTACAGAAAGTACAAGGCGGAAAAAAACAAGCTGAAAAAAAGGGACTACCCTAAATAAGGATGAGGGGCGGCGGTCAGCCGCCTTTTTTCTATTACATGTC 0.907 +CCTTGTGCTGGGCAAACCGAAAAGCTGGTCACCGGCTGAAACCTATTCTTCATCTTCTTAATCAAAAAGCCTCCCGACATCATGTCAGGAGGCTTTTATGCTAATGGTAGC 0.965 +CAGCCATCGGCTCGCATGCCGGTCCAGGTACATTGGCGATATTCTTTTTAAGAAAATGAGCAAACAAAAACAGTCAGACTCTGTGTCCTGACTGTTTTTGTTTGCTCATAG 1.0 +AGAAAATGAACGCATCATTGATGAGGTCAAAAAGGTCTTGCCTGTTGAACCGCACATGATGTAATGGAAAAAAGCTGACGATCGTTCGTCAGCTTTTTTATTTGTCAGCGG 0.99 +CTGGTGCTGTTTACACAGGTGATGAAATAGAAGTTCATTCCTATCAGCGAAAAGTCAAACGAAAAGCAGAAAGGGTCTGATTAAATCAGGCCCTTTCTATCTTTTCTTTAA 0.946 +CAAAGTCGTCATTTCATGGATTGTATCCATCGGCTTTGTTGGCATGCCTGAATTCATGTCTAGCAAGACCTTTGCCTTATGTCGGCAAAGGTCTTTTTTGCATGGTCCGGC 0.9590000000000001 +TAAAAGAATACAACAAATTCGTCAACCTGGAACGGCGTTTTGATGACGCAGGCGTTCCGCCCGGCGCATAAGCCCCAAGAGCTGCTCTTGGGGTTTTTTTCATTCGAAAGA 0.995 +TTATATGGATATTGCGACAGAAACGTGGAACTGCGAGTCTTGGACGACGAAAGTGCTTTAATTTATAAAAAAACCCGCCTGCTAAAGGGCGGGTTTTTTACTGCCATTTTT 1.0 +GTAGCGCACCGTCATGGGGTGTCGGGGGTCGGAGGTTCAAATCCTCTCGTGCCGACCAAAAATCCCAAGAAAAAACCAACCCTTACGGTTGGTTTTTTTATATCTGCAATT 0.943 +GCAACCGGACTGCTTCCTGCGTCCTGGCGAAGAGTATTCCAGCCTGACGGAATATCAGTTTATTGCTGAGTAATGTTACGCCCTCGTCATGAGGGCTTTATCTCATATTGT 0.841 +TTCACGATCTGTTACATAACCCCGATATCCGGTACGAACCAAACTGGACGGTTCTGAAGGCATAATCAAAGCCCAAAACATCATGTTTTGGGCTTGTCTCTTTACTCGCGT 0.775 +CTGCGCCAGGTGCAGTTGAAAAAATCAAAGCTGACATATCCGCTTAGATAATCACGACCATGAAAAAGAGCCCGCAGTGTAATGAGCAGGCTCTTTTTTTATTACAAAACG 0.9520000000000001 +AGCACTCGAAGTACAATATAGACAAGGTATTTTTTAAATAAGACGTCAGTCTGCGGATTGAACAGCGAAGCCAGAGCTTAAGTCAAGTTCTGGCTTTTGATTTGGGCATTT 0.995 +CAAGTGATGGACCTGCTGCAAAAGTTGCTCAAATAATGGCAATAACATAAATTCTGCAAAAATAATTTTCCCCTATCGTTATCTATGGGATAGGGGAATTAAGTCTCTGAG 0.853 +AACCCATCACATCTCGTTAACAGAGGATAAAAAAGATCATGCCATTCCATTAGTCTTCAAAGAATAGCAGTTTCCTTGATTTTAAGGGAACTGCCTTTTTTATTTATTGCA 0.9420000000000001 +CTTTATTATGAAGGATTCCGTTCTTCCGATCAGCTTGAAAAAACTGAAAGAATCTGGATGGATTGAATAAAAAACCCGGCACATGTGCCGGGTTTTTTATGTCCTCCCCTT 0.992 +TTATCGGCTATGTGCTGACCTTATTCATCGGACCATCCAAACAGCTTAATGAAATAAGTTAATGTCACAGAACGCCTGCGTTATTGCGCAGGCGTTTTGTAATAAAAAAAG 0.96 +GGAATTTAAAGCCTGGCTTGAAGAAGAGATGGGCATTAAGAAGTAACATCCGTATCGGTGTTATCCACGAAACGGCGTTGAGCAATCGACGCCGTTTTTTTATAGCTTATT 0.95 +GTGAAACAAATTTTAGAATCGATTGACGAGAAGACATCTGTTTACTAAATAAGCAAAAAGAGCATTTTTTGAAGTTTTGTTTCAAAAAATGCTCTTTTTCTATGCTTTATT 0.996 +GCAACGTATTCGTTGTCGTGATGGGTCTGATTGCAATCTCCGCAATCTTCTACTCTCTGTTCAGCTAAGTCCTTTCGCGCCGCTTTCGGGCGGCGCTTCCTCCGTTTTAAC 0.35 +ACCAAGATGACCTGGCAGAATCGGACCAGCAATAGCTGACGTACACATCAGCCCGTAATCTTTTTTGATTGCCATCACCTTAACGGGTGAGGGCGTTGTTGTTCGTTAATA 0.677 +TGGTGAGTCAAGACTGGTATCCACTGATAACAGCCTTTATCATTTCGCCTATCAAGTAAGAAGATAAAAAAGAAAAAGGCCCCAAGATGTTGGGGCCTTTTTCTTAATCGT 0.951 +TATCGCGTGCTTTCTGAAAAAGAATTCCTCGATGCCGGCATTCCGCACTTACAAATGATGAAGGACTGAACGCATCCGGATCTTTTGAGATCCGGATTTTTACTGCATATA 0.903 +AGTACGAGGAGATTGAGGTTGAGATCCACAATGGGAAACAGCCTCTGTATTCGTATATAGTTTCAGCAGAATAGAAGGGCAATTTGCCCTTCTATTCTTATGTCTGCTTTT 0.882 +GTAATGCACATATTGCAAGGAAATGTAGGCCTGATAAGCGTAGCGCATCAGGCAATTTTGCTTTTATCATCACTCTGAAACCGGGGAAACCCGGTTTTTTTATCCTCATTT 0.866 +GTTCTGAACTTGCTCCGGAAATTAACGAAGCGCTTATCGTTGAGTTCTACTCTCGTTAATCGTTTTAAAAACCCCTGCCGCTATGCGGTCGGGGTTTTTTTATCGGCTTTC 0.966 +CGAAGATAACGCATCTGCGGCGCGAAATATGAAGGGGGAGAGCCCTTATAGACCAGGTAGTACACGTTTGGTTAGGGGGCCTGCATATGGCCCCCTTTTTCACTTTTATAT 0.17 +GGTACCGAGCGGCACTACTAACACCGCATCTGTTCACGTCCTGTAATATTGCTTTTGTGAATTAATTTGTATATCGAAGCGCCCTGATGGGCGCTTTTTTTATTTAATCGA 0.9990000000000001 +TTGGTGTACCGATGAAAACGGAAACCCTGTCAGAGATTGTGATCATACCCAAACGTCTGCGTAAGATAGAAAAAGAGACTGCGGCAAGCAGTCTCTTTTTGATAAGATTAC 0.946 +GTAGTGCTGGCAGTCTTGTTTTTCGTGTTAACGATTGCGCTTGCTTATATCCTATAGGGCAATGTTTGTATAAGGTCTGATGTGAAGTCAGGCCTTTTTCACGTTTCTGGA 0.96 +AAGTGAAATGGCATACCTCACATTCACAAATTACAGAAGCCGCTATTGAGCAGCATTTGTCATAGAAAAAAGCGTCCAGATATCATCTGGCGCTTTTTTTTGTAAAAAACC 0.9890000000000001 +TGTTGAAGTAACAACTATTTGCTTGCCGGTTATTTTGCTTCCGGCAAGCAAATGAATTACACAATGCAAGAGGGTTATTTGTTCAGCAAATGGCCCATTTTCTCGGCTTTG 0.675 +CGCATCAGGATCTGCAAGATGAAGAAAAAGAACAGGATGAAAAGCCTGTCGTCAGCTGACTATAAAAAAATCATTTCTGGGTTCAGAAATGATTTTTTATTGTGTTACACT 0.797 +AACGACCGGACGCAAGCCGTTGTGGTCGCCATTAAAAATGGCTGGGTAGAAATGAGATAGTATAATAGGAGACTTGCCTTTTACTAGGCAGGTCTTTTTTTAGGCTGCCGT 0.885 +TGCTGATCCAGGCAAAAGACGGAACCGTGTACTCTATTACACGTTAATCGTCTCTGTCGTTCACTTTGAAAACGGCTCCTGGACAGGGGCCGTTTTCCTGTTTTTAACAAC 0.511 +AACTGATCAATTAATAAATGATTTTATAAAAACTCACTGATGAAAAACATTTTGGCACAGTAAAACGGCAGGCGCCTTTCCAATAGGAAGGGCGCTTTAATTCGATTTCCT 0.948 +CGGCAGTTTAAAGATATAAACGTCTTTATCGATGAATTAAAAGACTTAATAAAGAAAGAGCGTTAAAAGAGCTAAGAGGATTCTTCTAGCTCTTTTCTCATGTGCAAAATT 0.9390000000000001 +CCGTAAAAATAAATTTTAAAATTGAATAAGTCCAAGACGGAAAGCCTGCGGACACTGATCGTTTTACAGAGAAATTTGTGCTTCGATCGGTGTCCGTTTTTTTTCGCAACT 0.983 +GAAAAAATTCGTATCCATATCGAAGAACGCCGTTATATGGGGCGTGCTGACTAACTTCAGCCGCATGCAGAAAAGGGATAGCTCAGGCTGTCCCTTTTTTAATTTATTATA 0.973 +AAGAAGAATATCGAGTCGAAATGAAAATACGGATCGCTGAATAATAAAGCATAGAAAAAGAGCATTTTTTGAAACAAAACTTCAAAAAATGCTCTTTTTGCTTATTTAGTA 0.991 +AAGCGCAGACCACCGCAGAAAACATCATTAATACGCTGGTTATTCAGTAAGTTTTAAGATGATGAGGCGGCCTCAGGGACGTGTTCCGGAGGCCGTTTTTTTAATCGCCAC 0.765 +CTGGATGCGTTGGTGGCAGATGAAGATCTGAGCCGTTTTATCCTATAATCGCGTTCAATCATTTTCATCATTGTTTGATGGGGCTGAAAGGCCCCATTTTTATTGGCGCGT 0.8370000000000001 +ATTGTATATAACGTTTTTTCTAAAGCTAAAAAAATAGAACAAGAAGCTGACCTCGCTGAACAAAAAACAGCAGTCTAACTCCGCCGCGGCGGAGTTTTTTTTGCATATAAA 0.987 +TTTTCCTACATGGCGATCAGGAATGCGGACAAGCAGGATGCGTAAACAAAGGCGCCCAATATGTATAAAAGGCCGATGCTGTCAGCACCGGCCTTTCCATTATGAGTCTGT 0.915 +CAATAAATCAAGTAATTAACCTTTAATATATCTCTTACTGCTATTTTTTTCCGGGATTGTTATTTCCACGATGCAAATGACAATACCCGGAAAATCCTTCTGAACTCTTCA 0.846 +TATTCAGCTCTGGCAGGTCGTGTTCTCACGCGGTGTTGAAAACGGCCTTCGAGTGGCTCGCTAAAGGCTATTCTATCGCCCCCTCTCCGGGGGCGATTTCAGATCAGGCTT 0.65 +GCTTCCCGTACCCGCAAATGGATGTGAACTTTAAGCGGGTGAAAGAAGACAAAGCTGCGTAATCAACGCTGAGGCCAGATAATACTCTCTGGCCTCTCTCTTATTAGTTTT 0.953 +CCTATAGGAGGAACATGCAGGATTGAAGCTATTTCGGGTACATCACGAATATGGATTGATAAACATTAATCAGCTTGTAAATTTTTTTACAAGCTTTTTTAGCGCAATCGG 0.961 +TATTTTGCCGGCTCAAATGCCCGATTTTACGATAAAAGGAACTTGTACCCTTATCGTGTCTCATATTTAAACTTAATCTCACTTCGAGGTTAAGTTTTTTTATTAGAAAAT 0.958 +CGTGCAAAACGTACGTGATGTGATTCCGTTCCCACGTACTCCGCGTAACGCCAGCTTCTAATTTAGCGTCTTCAAGAGCCAGCGCCCGCTGGCTCTTTTTTTATCTTTTCT 0.9540000000000001 +ACGAAGGAGGAAATGCGGCACGCGATTGATGAAATCGCGGCGAAAACGATCCACAAGTAAACATGAAAAAGCCCCTGAACACTAGTCAGGGGCTTTTCATATTAATGATCT 0.986 +TATTGTGCCCGTCGAAGTAGAAGCGGCAGTCGCATAGCTCTTCTGTCTCATTGTGTACATCCTAAAGGCAAAATGCCAGCCCGATCGGCTGGCATTTTTATCTCAAAAATT 0.905 +ATTGGCCAAAGCAGAAAGGTCCGACCTAATTAAAGAAAGGGAAGCAAGTGTTCATCTGTAAAGGGTTTTAAAACGCCATGCCTCGTGCATGGCGTTTTTTTGTGCCAATGG 0.9990000000000001 +GACTATACGCCTGATGGTCGTTCCCCATCGGGCCTGAAAACCGCAATACGCTGGGTAACAATCTTCGAGGGTAGCAGTTAACGCTGCTACCCTCTTTTTTCTGGAGTAGAT 0.986 +TGGTTGCCGTTATTGGTGCGATTGTCGTGCTATTTATCTACAGGAAGATTAAAAGTTAACGCTTAAATTGCACAAAGGCTGCACACAGGCAGCCTTTGCTATTTTTTAGAG 0.978 +GGGGCCTAACCGCTTAATTATCCTACCGAGGTGTATATTATCACAGCTATTACGTTACGTATGCTTGTATATACAGCCTCCATGTCTCATGGAGGCTTTTTATATGGAATC 0.17 +ATGACGCTTATGAAATGGCGCTTGACCGTGTGAAGGAGACGGAAACAGAGTCTTCTTCCAAATAAAAAACCGCCCTGCCGTCTGGCAAGGGCGGTTTCTTTATGAAACAAA 0.9840000000000001 +GCAGACGATTTAGAACACGATTTCGAACAAGCGCTGAAAGAAATTGCTCCAGTTTCGGTTCGTTAAAAAAGAAGCCGGGGATAGTCCGGCTTCTTTTATTATTGTTCGTCA 0.8440000000000001 +CGATTTTCATTTCTTACAAAAATGTAAGGCTGCGTTAAGAAAACAACATAGCCCGCTTTGTTTCCCCTTGATAACATGGATTTATGTCAAGGGGATTTTTATGTTGAACGT 0.9490000000000001 +TGTAAAAGAGCAAAGGTATATGCTCAAAAAGCAAAAAGCGAAAAAGAAGCACAGAGGTAAATAATCCAAACAAAAGCAGGCGCAGTGCCTGCTTTTGTAATTAAGGACTCT 0.915 +AAAGGCGCACAGCAAAGTGCTGAAAGCGTTAGAAGACACATATCAAGCTGTTTTAAGAGGCTGATAAAAAAGCTTGCAGAGAAATCTGCAAGCTTTTTTCTATGAACGTTT 0.998 +AATACTGCCGCGATTCTTGTTTATGAAGCGCTGCGCCAGCAAAATTACCGTGATTTAAAATAGTGAAAAACCCGCTCATCGATGATGGGCGGGTTTTTTTGCGGATGTTGA 0.9990000000000001 +CTTGTTTGCTAACCGTGGCGCGGACGTTGCGCTGATTGGCACACCTGACGGTGTCAAAACCATTGTGAAATGATCTGACGGGGGAACCTCCCCCGTTAAAAAAATTCTCTT 0.912 +ATCAATGGATTGCACAGCTAGCGTAGGCCTGATAAGCGTAGCGCATCAGGCAGATTTGCGTTTGTCATCAGTTTAATGCCCGGAGAGATCCGGGCATATTTCCGAATACGG 0.606 +ACGCTACCTTGAGGTGGTAGTGCCCAATAGGGCTTACGGGTTCAAGTCCCGTCCTCGGTACCAAATTCCAGAAAAGAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCCT 0.975 +GGATATTACAGTTGACGCTTCCAGAGGCGCAGTCTATCAAGGCCGTGCGAGCGTTCTTTAATTACAGGTGAAAATGGAAGGGGAATCCCTTCCTTTTCTCTTTATCATGCC 0.5379999999999999 +AGCGCTCTACAGCGCGAAACGGAGCGGGAAAAACCGCTTGATGATTCATGACTCAATAAAATAAAAAAAGCCCAAAACGATATCAGTTTTGGGCTTTTATATTACTGCGCT 0.9890000000000001 +TATCCTTTATGGTTATTTTACCGGTAACATGATCTTGCGCAGATTGTAGAACAATTTTTACACTTTCAGGCCTCGTGCGGATTCACCCACGAGGCTTTTTTTATTACACTG 0.986 +ACAAAACAAGGGATTCATGATTACATCGCGGATACCACCGTGGTGCATGAAAAATTATACCGCAAATAACCGATAACCCGGACTTCCGTCCGGGTTTATTTTTTTAGGAAT 0.993 +TTGCCGCACAAATTTGGCTTGCATGGAAAGGAAGAGAACTGAAAAAAGATTCGGCGCAATAAAAAATATCAGGCTGTCCTCTGCAGGGCAGCCTGTTTTTTGAAGTTTCAT 0.9440000000000001 +TGCGTTTATCAGCAAGCAGTAATCACCATAACGATTTATCAAAAAACATGTTAAAATAGAAATAATAGAAGGTACCTCATTGCCTGAGGTACCTTCACTTATGATGTTTTT 0.9440000000000001 +AATGCACGCACAGATCAAATAATAGCGTGTCATGGCAGATATTTTTCATCCGCTAATTTGATCGAATAACTAATACGGTTCTCTGATGAGGACCGTTTTTTTTTGCCCATT 0.9420000000000001 +GGATTCACTCACTTGCCGCCTTGATGCAACCTGAATGATTTTGTGTATATTACCCTCGGCAATTTCTTCTTCTGCGGCTCGATGAATTTGGGCCGCTGCTTAATTTACGGA 0.385 +ATTGATTTTGCAAGACTTTGTGGGTCGGATAAGGCGTTCACGCCGCATCCGGCAAAAACAACGAGCCAATAAAAATACCGGCGTTATGCCGGTATTTTTTTACGAAAGAGG 0.84 +AAACAACTGGCTAGAAGCCGTTCAATGGATGAACAACAACAAGCATCACTTAATACGCGTCAATAACTGAAAAAGGAGCCCAAACCGGGCCCCTTTTTCATATATACGCAT 0.852 +ATGAGGCGGGAGACGCATTTCAGTTTTAGACTCCCATAAATCCGCCAAAAAAAACGAATCATGTGAAAAGCCGCTAAGCCCTTGTTTAGCGGCTTTGTTTCTGCTGATAGC 0.975 +AAGATAGACGTGCATTAGCAGAATAAAGACGTTTGTCAGTTTTTATTTTAAAGATCTGCATGAAAAAACTGTACTCATTTGAAGCGAGTACAGTTTTTTATTTATTTACGT 0.9690000000000001 +CACATATTAAAGGACTTTTCAGCCTGGCTTACTAATCCTTTTAATGAATCTGGCATTGCCAAAAGCATTCCTTTCCGAGTGCTTTCTGGCAATGTCTTTTTTTATGTTTAT 0.998 +TAATTCCGGGTGCTGCCGGCAAGGATGTATTTGAAAAAGCGATTTCTGACGAGCAGAAGAAAAAGTAAGGAAAAAGCTCTCGATAAAGAGAGCTTTTTTTATTCCTGTTCC 0.868 +GCGATAATATGGCGAAACAGACACAAATCAATTTGTTATCATCAATGGCGATTTAATCCTGACCCGCCGCGGCAGCCTGTATTCGCGGGTTGCCGCCGTTTTCAGTTTTTT 0.455 +GCATTTGATATGAATAAAGGCCATTTCTTTGACAGTGAGACAGAAGTGAGAATCCGATAAGATCAAAAAAACCGGACATGGAGACATGTCCGGTTTTTTGCTATTGAAAAT 0.968 +ACCTGGTTTTGCGCACGTTATGCCTGGTATTGTCAACAGATGATGCAGGCAAGAGAGCTGGAGTTAGAGCACTGATATAACGGGCCTGATGGCCCGTTTTAGTGTTTGTAT 0.685 +CGCGGCAGAGAGCGCCGATCGATTGTTGCTACGCTGGTGGCAGCGGGGTCAAAAGCCATTGATGTAAAAGACAAACGCGAGGCTAAGACCTCGCGTTTTGCTTTAATCAAC 0.655 +AAAGACAATGATATGAAAGTTCAGATTGAAAAATCCCAGCAAGTGTAAATGTTTTGTATCACAGTAAGAAGACCTTCTTATTAAAAGAAGGTCTTCTGCTATTCTATTCAG 0.912 +ACAGTTGATGGCGAAGTCGTTTGCGAAGTAGAACTGACTTTTGCCCTCGGAGAATAATTGAAAAAAGGCAGATGGCCAGCGGCACATTTGTCTTTTTTTCTTTACAAATAT 0.7290000000000001 +CAGCTGATTTTACGGATACCGAGCCATCAGCACAATCAGAACGCCTCTTTTTCAAGGCGCAGAAATCAAAAACCATCGTTTCCTAAAACGATGGTTTTTTAAAATGCTTTT 0.991 +TAAACAGGCGCAAGCGAAAGTACTATAAAGTAAATTGCGAACACAAAGGGAGCTGAAGCTAGAAAGCCATTATGCGCTTTTTAGCTTATGCTCCTTTTATTTTTATAAAGA 0.9890000000000001 +CGGAGGCGACCTGCTGAAAGGGGAAACCGGTTCAAAGGTGAAGCATTCAAATAACAGCTGTTCTCTCTAAACACGGTGCCTTTACAGGCCCGTGTTTTTTTATCATTTGTG 1.0 +ATACGCATCTTGGTGTCGGAGTAGACCGGAAATATTACACCCAAAACTTTATTAAGCGATGGTAATGAAAGCCGCGGACGAATGGTTCCCGGCTTTTTCTATTTCCATACA 0.9490000000000001 +CTATGAGTATGTTCCTGCCGAGAAAGCTTTACAGGATGAATCATCTGTGGCGGGTGCTTAATTCACAATAAGCTTGCAGAAAGATTTCTGCAGGCTTTTTTATTTTTTAAA 0.9690000000000001 +TGCGTGGAAGATACACGTGAGACACAAAAACGACATCCTTAAGGCACTAGTGTGAAAAGCGATTCTTGTTGGGCATGCGTCCAAAAGAGTCGCTTTTTTATTTTCAAAGAA 0.922 +CAGCGTCACCCGTACAAACCGAAGCTGCAGCACATCGACTTCGTTCGCGCTTAATTGCTGAATAAGTTGTAAAAAACCCCGCTCCGGCGGGGTTTTTTGTATCTGCAGATT 0.98 +GTGTATGTGCGGGAAATGGGCCACACTGAGCTTCCTTTCTATCAGCAGCGCATGTAGCGTAAACAAGAAAAAAGATACCTGTTAAGAGGTATCTTTTTTTTGCGCCTTTAG 0.9990000000000001 +ATTTCTCAATTAATTCGTTTAATAGTTGAATAATGATCAGCAATACGTGCTATAATAGATCATAATATTACTTGCCAAAACCCGTTCAGCGGGTTTTTTATTGTGGCTTAA 0.813 +AATCATCGGCGTACCGATGGGGAAATAAAACAGTCAAAATTAAAATCTTAAATTAGTATATACTTATGTATTCAGAGGGTTTTGCGCCCTCTGTTTTTTTCGTTATAATAG 0.971 +ATGATCGTCTGGGGTTATCAAAGAAAACACCAGACTATGAAGAAGTAAAAGGAGAAGAGACAGAGTTAGAGGAGTAAACCGTGTGTTTACTCCTCTTTTAGTTTTATTGAC 0.987 +GTCACTATTGCCGAAGTCCGCATGGAAACCCGCAGCATTATTGCGCGCCCGGTCGCGTAATCTCCTTTCACGGCCCATTCCTCATGGATGGGCCGTTTATTTCCCCGCTCT 0.899 +CACGGTGGCGAAGGCTGCTGTGGCGGTAAAGGCAACGGCGGTTGCGGTTGCCACTAATACCGAAAAAGTGACAAAAAAGCGGGGAATCCCCGCTTTTTTTACGCCTCAATA 0.951 +TGACAGAAGAAGAAGTAAACGCAAGACTGGCAGCGATTGAAACAGAAGATCACGCTGCTGCTGAATAAAAAAAGAACACCTCGTATTGAGGTGTTCTTTTTTTCTATATGC 0.9890000000000001 +AGGTTAAAGATGTTGTTCTGCCAATGTTATGCCGCTGCACCCTCAACTTACGTTATCCCAACTTGTGACTGTTATTCGGCGCTCCACGGAGCGCCTTTTTTTCTTTCGTCT 0.996 +CAACGCCTGCACGCGTCTCCACCACGCTGGGTAACTGATGATTCGGTTGTACCCGGAACAACTCCGCGCGCAGCTCAATGAAGGGCTGCGCGCGGCGTATCTTTTACTTGG 0.501 +AGCCTGCGATAAGCAGACAGTGGATGCGATTAAAGAAAATATGTAGGTTTCATAAAAGCTTGTTGAACGAGCAGTAAGCCCGGATCGACAAGCTTTTTTCTTATTCTGTTT 0.9229999999999999 +TCACAATGGAAGCTGCTGAAGTGGCAATCGGAAGTCCGATTCACATCGTAAAATAATTTTCATATCAAAAACAGCCCCGCTTTGAGCGAGGGCTGTTTTTTTATTTTGACA 0.9990000000000001 +CCCTGTCCGCCTGGCTGACCTTGAGACGCATGTTTTGCGAACGTTTTATCAGAAAAATCACGTGTAACACTTGGGGCTCTAAGTCATCAAGTGTTTTTTTTGTATGTGTGT 0.919 +CGGCTGTAATAGAGCTGGTGCCGTTTTTAGATAAACAGAAGGCAAAAGAAACCCACCATTCAATATAAAAGGATCAGCACTGTCAATGCTGATCCTTTTTAAATTTGAGTT 0.972 +TTGCGCTGCTGGAACTGATCCGCTTTCTGCTTCAGCTTCTGAACTGATAGCGGAAACGTAATTAAGGGCTAAGAGCACACTACTCTTAGCCCTTTAACATTTAACGCATTG 0.997 +CCCCGGAAGAGGTTGCCGATTACATTGTTGAGACATTAAAAATTGGCTGGGATCTTTATCAGCCGATGTAAAAAGCCGTGCGCAGCGCACGGCTTTTTTTATCGTTTTATC 0.93 +ACGTCTGAGCTTAAGCAGCACGATCAGCGAACTGTAAGAAAATTTCATACAGTGAAACAAACGCGGTCATTCTCACATTCAGCTGAGTTTGACCGTTTCTTTTACATATTG 0.96 +AGATAAGGGATTAGAAGGAAAAGAATTTGATGAGTCTTATAATAAAAAGAAAATGTCGAAATAAAATAAGCTGACCGTTTCGTGCGGTCAGCTTATTTTTAATGATTTTCT 0.972 +CCGCGGTGTTTAATATGTTCAAACACCTGGCCCATGTGCTTTCAGAAAAATATCATCTGGTTGATTAAAGCAAAACGCGAGGTCTTAGCCTCGCGTTTGTCTTTTACATCA 0.5660000000000001 +CGTTCTATGCTACTTCTTATTATTGATTGCTCGTTTCCGTTATGAATCCAGGCAATCATAAGGACTCAAGACCAAAGCCTTAGGCGGCTTTGGTCTTTTTTATGTCTTGTT 0.9840000000000001 +AGGCGGCCGTACCGTTGGCGCGGGCGTTGTTGCTAAAGTTCTGGGCTAATTACACGTTAATTAGTTTTGAATTGAAAAGGGCGCTTCGGCGCCCTTTTTGCATTTGTTGAC 0.998 +TGAAACCATTTCACTTGATGAGTTTGTGAAAAAGGCAGTAGCTGAAGCGAAAAAATAAAATAAAAAAGCATGATCTCATTGAAGAGATCATGCTTTTTTTATTTCTCTAAT 1.0 +CACGCCGGGTTCTCAGCCGCAGCTAATAAACCGTTTGGATGCAATCCCTTGGCTATCCTGACGAGTTAACTGAAAGCACTGCTTAGGCAGTGCTTTTTTGTTTTCATTCAT 0.92 +GTAAAATTGAATACGGAAGAAGAAGAATTTGAAGAATATATTCAATTAACGATTCGCATACCAAAATAGAAGCTCTCCTGAAAAGCAGGAGAGCTTTTTATATTTTTAACA 0.991 +TTCTTCATCTTTTTAATGCTTGGGATTGCTGACGGCCTTTTCAGCGATACGTACAGCACTTATTAATGAGAAAAACCCGGAGCTGGCTCCGGGTTTTTTATTATTCCGCTT 0.9890000000000001 +TGAACGCCAAGTACCGCGAAACCTCTCGCGGCGGCCTGGCAATGAAGATCGTTGCCTGCGATTAATCGCTCTCCAAAGGCCTCGTTTTGCGAGGCCTCTTCCCGATTTCTC 0.754 +CTATCTGATCAGCACTGAAGTGAGCATGTACCATGTACGTGCTGTCAAACAAGGGACATACAATCAATAAAAAGCTTGCGCCTTGCTGCAAGCTTTTTTCTAAGAGTCATG 0.988 +CGTCCAAAGGACAAGCCAACCCGCCGATGGTCAACAAAATTCTGCTTGAAGAAATTAAAAAACGCTAATAAAAAAGCAGCCCTTAGAGGCTGCTTTTTTTATGGTCAAATT 1.0 +TCCGGCTGAAATGTTTTGCTCAGTTGAAGAAGCTGAAGCCGCCGGTTATCGGGCGCCAAAACGATAAACAAAAAGCTCCAGAATGTCTGGAGCTTTTTCTGTTTCACAACG 0.972 +ATCAGAGCTTGAAGATACAGAAGAGCTAGAAGAGGTGCTAGAAGAGATCCTAGACCAATACGCTGAGTAAGTATAAAAAGGAGCCGAGGCTCCTTTTCTTTAATAAAAAAT 0.835 +AGTCAGAATGCGATGCAGATGAGCGAATCGCGGCTGAACCTGGCGAGAGGGCGACTGCGTTAAGTTATACCGCCTCGGTCAGTTCCGGCTGAGGCGTTTCCACTCCCTCCG 0.745 +ATGTATGGACATGGAAGAAGTGATGGAGCGGATTGATGAGTTGATTCGCCAAAAAAACAAATAAGAAAAAGAACTCCCGTACCTTGTACGGGAGTTCTTGATTTAAAACTT 0.9359999999999999 +AAAACGGCGAAGCAACCTTCTTTTGCAACGGCGGCAGCGTCAGCGTGTGGGTTATCGAAGAGGTGATTTAAATTCATCCCCGGCGGCAAGCCGGGGAGATTTCATTACGGC 0.465 +AAGCGAAAGCGAAACTGATTGAACAGCAGGCTGTTATCGCCGCGCTGTAATCAAAACACAGTGATGAAAACGAAGGCCGGAGCATGCTCCGGCCTTTTTTATCTCTTACAG 0.96 +TCACTGTATATCCTGCCGGTTATTTGTATTGCTGCAGGTTATGTGTTCTTTTCTCTGCTTGGGTTTATTTAATCAAAAGGCCGCACGGAGCGGCCATTATCCTCAACCCAT 0.344 +TGTAGAAGATGGCGAATTTGTCGTAAAAACGACTGCTAAAACGAATTAATATAGAAGACGGAAATGAGGCATACAGCATGTAAGTGTATGCCTCACTTTCATATCAAGAAA 0.902 +CGGGAAAGATATTTTCCTGCTAAGCCCGGATGCCGGTGCTAAACCGGGTCATCAGGTGAAATAATCCCCCTTCAAGGCGCTGCATCGACAGCGCCTTTTCTTTATAAATTC 0.9570000000000001 +AGAGGTGTGCGGCATTCAGTCCCATGAACCGTTGCACGCCATCATTTTAAAGAAACTTAATCAATAACCGACCACCCGTGACACAATGTCACGGGCTTTTTTTACTATCTC 0.941 +CGTTCGTGAAACGATTCCGTTCCCAAGACTGTTAAACCGTCTGTATCCGTAATACATTCAAATGAAACAAAAAAGAGTCTCCTGCAAGGGAGACTTTTTACAGTAAAGGGG 0.9540000000000001 +TTTATCGCACCAGCAATCTACCGCTCTATTCATAAAGCAACCGGCTTTAACAGAAATCACATATCATCTACGTAAAAACCGTTCTATTGGACGGTTTTTTTATCATAACGA 0.852 +AAGCGTTAATGGTCACTATTGCCGGGATTGCGGCAGGTATGCGTAATACCGGCTAATCTTCCTCTTCTGCAAACCCTCGTGCTTTTGCGCGAGGGTTTTCTGAAATACTTC 0.863 +GACTTTATCAACTTGTTCATCAACCTGCTCCGCTTCTTCGGCATTTTGAGCAGTGATGATTAATGAAAAAGCGTCTGTCATTGGATAGGCGCTTTTTGCCATACATCAAAA 0.9129999999999999 +CGTATCTGCGTAATAAAGACGTAACTCGTTACCGTGAGTTAATTAACAAACTAGGCTTACGTCGATAATCGTAAAAAGCGGGAGGATTCCCGCTTTTTTATCGTATTAAAG 0.963 +TAAAAGGGTAAAGTCTATCCTAAATACTTTCATGAAGTTGTTAAGCCAGCCCTCTTGATGGGCCAACATTTATACTGACCCGGCGGCAACCGGGTCTCTTTTATTTTATGC 0.993 +AATTTAATCAAAAACACTTTTTTGAGGCTTGTGGGTGAATAACCGGTGAAGGTCTGAATCACAGAAATACCAAGGGCGTCAAAACCCTTGGTATTTTTTCTGGTTATTAGA 0.975 +TCTCTTCGGCGGTAATTGGATACAGCTGGTTACGCGTTCGTTTGCGTCCATCAGTTTGACAGGACGGTTTACCGGGGAGCCATAAACGGCTCCCTTTTCATTGTTATCAGG 0.555 +CGTTTTACCTTACTGCGTCGCGGTAAAAAGAATTACTGTCTGATTTGCTGGAAATAATGCATTAAGTGGAAAGGGGGAGTGAGAAATCACTCCCCCTGGTTTTTATACAGG 0.327 +ACCTGTAAGCCTTCCGCCTGATTCACCAGAAAAACAGAAGCTGCGGAAGAATGCCTGATGATAAAGGAGGTCTTCTAATATACTAGAAGGCTTCCTTTTTATTGTTGGAGG 0.917 +AAACAGCTGACGTGACGCTTTCTCAAGAGGATATTTCATTTATTGATAAGCTGTTCGCATAAGAAGAAAACAGCCTTCTCCAATGGAGAAGGCTGTTTTTTTGTGCGATAA 1.0 +GTCTGCCCGTACTCAAAAGCAACCTCCGGAAATATTGATGTGACACTTGAAGTTGCTGAATAAATAAAAAGAGGATGCCTGTACAAGGCATCCTCAATTTTTTTGATGAAG 0.986 +TACGGAATCAAAACGTGGAGGGCGCGGAGCAAGAAGAGGATAACTTGCTGCTTTCTATAAAATAAATGAAGCATCCGTTCATCCCGACGGATGCTTTTTTATTATCCTCAT 0.976 +CATATCGCCGCATTTATCATTTACGGCCTTCCGTATCTTCGCAGAAAGCGAAGCAGCTGAAGACAGCAAAAAACCCTCTTCCACTGAAGAGGGTTTTTTGTATTATTCGTT 0.9620000000000001 +CATGGCATTAAAGTCCGTCTTGAAGATGGCCGCATCGGCCGCGACCAGCAGATCGTTTCTACATAAAAAAAAGCTGCCCCTGAACAGGGCAGCTTTTTCATTTATTTTTTC 0.9590000000000001 +CAGTTTTAGACAAATACACACTGCGCGACCTCGTCAAAAACAAAGAAGATATCATGAAGCTTTTAAAAATGAAGGAATAGATTAAGATTCCTTCTTTTTTTATGCCCTTTT 0.985 +CTTCGGATTTTTCGCCTTCTTGTTTGTCATGGCAACGGTTATTCACCTAGTTGGTTCATAACAGCGACTGCCCGGGCTGCTGATTTCTCGGCAGTCTTTTTTAGTTCGTCT 0.978 +TGGCAAGAACACTGGCTACACTTGCGGAGCGATTTACATTGGAAGAAGATAAAAACGAGTAATGAAAAATCCCTGCTGTATGATGCGGCAGGGATTTTTTCTTCCGAAATC 0.968 +ATGGGGAACCTTTTTGGGATTGGCGCTACTATACCGCAGAAAATTATTATAGAACAAGATAGTGTGATGTATAAGACAGTCAGTTCTGGCTGTCTTTTTCTGTTATTCAAG 0.991 +AGGAACTGAGACAGAATTTGAAATTTCCGCAAAAGATGGACGCATTATCAAACAGGAGATAGATGACTAATCAAAACCCCCGCTGCAGCGGGGGTTTTTCATGATAAGAAA 0.981 +GATCATATTGATAAGGAAAATCCTTTCAAAGAACCTGAAAATGCAATCGCAATTGAGCGCTAAAAGAGAAAACGGCTGGCACAATGCCAGCCGTTTCTTTATTTTAAAAGG 0.9690000000000001 +TTCAGCCATAACACCGGTTATTTTCACAAATACCTCTGAAGAAGATCTAAAGCACATTCAAATGAAGTAAAAAAGTCCCCCCTGCTGCGGGGGACTTTTTCGCCAACATAA 0.986 +TTTGTTAGCGTAGGGCTTCAGTGATATAGTCTGCGCCATCTGATCGTAAGTAGTTGGCTTTATAAGGTCAGATATGCCGTGGTTTTACACGGCTTTTTTTTGTATAGGCTT 0.321 +TGTAATGGATGGTGACATTGATACGTTTATTGATGCCTACCTGCGTTCTAAGCTTTCATAAGCTGAAAAACACACCTGATGCATACGGGTGTGTTTTTTTATTTCTTATAT 0.997 +TGTGGTTCTCGAAGCGATGAGCTAACGCGTCATCTCGCTCAAAAATCCAGATTTATAAAAGAAAAAATGACTGGCCAGCATCGCAACATGCTGGCCTTTTTGGCAAGCAGG 0.535 +GAGCGGATGGGTGCGATGATTGTTGAGCTGGCCAAGTATTATGAAGAAAAGGAATAACAAGTACAAAAAGCCGCTTTTCCCTCGGAAAGCGGCTTTTGTCTGTCATTCATT 0.9279999999999999 +GAGATAAATTCTCAGCTGCTTAATGGACAGTCAGAAGCTGCTGCCGGAAAAGAATAAAATATAGAAGAAAACCTTGCGATAGTTGTCGCAAGGTTTTTTGCTTTTAATAAC 0.998 +GTGTTTGGGATTGGCAAACGCACGCAGGCCAATTTGACAAAAAGCGAACAAGCAGCTGAATAACGATAAAAAAGAGACATTCACGGATGTCTCTTTTTTTATTTTTCGCGT 0.976 +ACCCAGGCTGCAGTATTTTCCAAAACGGTTAAGCAAATTAAACAAGCCTACCGTCAGTAATATAAAACCGGCTGATAGCGTGCCTTTCAGCCGGTTTTTGCATCTGGCACG 0.84 +GAAGTAATGAACCAAGTGAAAAAATAATGGTGAAAAGATTTCGCGGCTAAGTCGCGTCTTTTCAGGCAGTATAGGGGACACATTTTGTGTTCCCTATTTTTAAAACTATAT 0.857 +CCTGACGGAAAACGATCGTCGCCGCGCCAACCGCGCACCGAAAGACGATTAATTTCGTTTTTTAGTCATAAAAAACCTGCCAGCGATGGCAGGTTTTTTTTTGCCTGTCGT 1.0 +CGTTTGAAAGTGTGCAAAGAGCGATGGAAAATGAACTGGCAAGCAAGTCATTAAAAGATGTTATGAATCATCTCTTTTAATCAAAAGGGATGACTTCTCACCGCTAACATG 0.884 +GCACCAGTCGTTCAGCAAACATCCGCACGCACTACCACCAGTAACGGTCATAACCCTTTCCGTAACTGATGACGACGACGGGGCTTCGGCCCCGTCTGAACGTAAGGCAAC 0.8079999999999999 +GTTCTCATCTTCATGGGAGCAGGAGATATCCAAAAATATATGAGAGCCTACGAAAACGTCATGGCATAATAAAAAGCAGTGATCTCACTGCTTTTTATTTATCTGATGGGA 0.9470000000000001 +AAAGGGAGAAATTGAAAAAACAAAAATTGGCCCGATGACAGCGGAACAACTGAAGGAATGGACGGAAGAATAGCTGAGAGCATAGACTCTCAGCTTTTTTCATATAGAGAA 1.0 +TTTTATCTGAACATATAAGCTAATTCGTTAAACAGCCTTATTTTATGTCGTCAGGCTCTACATGAACATGAACATGAGTGACATCGTGTTCATGTTTTATTTTGTCTTCTA 0.996 +CCGTACTATCAAAGAGTACGCCGATCATATCTGGCATATCGATCCGGTGAGATTGTAAGTTCACCAATAAATAGAACGGGGCCAAAGGGTCCCGTTTTTTTCCGCCATCAT 0.841 +GCTGTCGGAACGCTGGTTTACTTCCTATATTCGAGAAAACACAGCTTATTAAACAAATAATCTCTTTTCAGCCGGCGGTGCCTCACCCGCCGGCTTTTTCCTTTTTTTATG 0.978 +TCTGAAGATGAGGGAAGAGCTGGCTCATTACGAAGCGTCCGGATCAAGCATGGCGCCAATCAGATAACGTTTACTCTCCCTTTTTCAGGGAGAGTTTTTTTATGTTTGCTT 0.983 +ACAAAAGACAAAGAAAAAGAAATCATGGAAGTTTAATGAAAAACTATGTACAATAGATAATAGTGAAAAGACCCTCTCATGTTTACAGGGGGTTTTTTTGTTAATACTGTT 0.818 +CCCGGGTACTCTGGGTGTTGCCCTGGCCGTTTGCTTCGGCTTCGTGCTGGGTAGCTTCATGCTGTAATGATCAATCGCGGGGCGTTCACGCCCCGCTTTCTTTCCCGCCGA 0.9359999999999999 +AGCTTAAAGCTAAACTTGAAGAAGTTGGCGCTTCTGTAGAAGTTAAGTAATCTTCACTTACCTGTAGGGGAAGCTCGCTTTTATGAGGCGAGCTTTTTCTTTGCCATTCTT 0.986 +TGACGATCAAGTAGAATGGGGCATTAAACAAGGGTATATCGATCCCGAAGACGCACCATAAAATAAAAAAAGCCAAGGCATTCAGCCTTGGCTTATCCTCCGATCAGCTGC 0.856 +CTGTCGGTCCAGACCGCTCACAAACAAATGTCCTTCGCAGTGTGTACCGTGCGAACTAAATAGAATATGTCTGCAAGCCCCTATTTAAGGGGCTTGTTTTTGTTTGAAAGC 1.0 +AAACACTAAAACATGCAAGCCAAGAGCTTATGAACACAATTGCAAAATTCACATTGTAATCCAATTACATCCCCAAACATCATTTGTTTTGGGGATTTTTTATTTTATAAG 0.9840000000000001 +CTAAATCCTGGACGCTGGTTCGCGTTGTAGAGAAAGCGGTTCTGTAATACAGTACACTCTCTCAATACGAATAAACGGCTCAGAAATGAGCCGTTTATTTTTTCTACCCAT 0.691 +TGGTAGCGAAGAAGGGTTTGATCAAAAGTACAATTTTTATTACTTTCAGCAGCAATCTAAAGCTTAATTTTAAAGAGGGATATACAGTCCCTCTTTAAATAAAAAAAGAAA 0.92 +TGATCATCCCGACTGATGAAGAAGTCATGATTGCGCGCGACGTTGTTCGTTTAGCAAAATAAATCGCATGAAAGCACATTCTCTTGAATGTGCTTTTTTGTTGATGCATGA 0.929 +GTCCTCTCCATTTTGACATTCCATATTCAGGCAACCGCACATAAAATGACAGCAGACATTCTATAGTCTGCGCCACCCCGGCTCAGAGGCCGGGGTTTTATTTTTCTCCAC 0.96 +TGACAGACGATATGGAAAAACCGACTCACGCCATTGACTCTCTGACTGAATGGATTCCATACATTTGAAAAAAGGGCGCCCTAAAAGGGTGCCCTTATTCTGTATGCCGCG 0.9840000000000001 +AGGGGACAATCGTCATTTACGCGTTAAGTGTACTCTTTTATTTTATAGATTTTCTTCAACACAACCGGAAGGCTGGAAAAATGGCCTTCTGGTTGCTTTCTATTGTCTGGA 0.662 +TTATCTGATGAATATCACCTCTCTATTGAAACAATAAAAAAAATTGTCTACTCTAAATAAACAAAACGGAAGCACTGATAAAAAATAATCAGTGCTTTTTATATGGCGAGT 0.972 +CAAAAGCACGAAAGCGACCAATTCCAGGTGAATCTAAATAAAGAGGAAAAGGAACGGGCTTAAAAGAGAGACTTCGTCTGACAGAGGCGGAGTCTTTTTCATGGTAAAATA 0.924 +AGGGCGACGTCGTCCTTTTGCAGCCGGATCAGCCTGTCCCAAACGGCACAAAAATCGGATAACGCAAAAAAAGACGTTTGCCTAAGGCAAACGTCTTTGCGTATTGAGGCG 0.593 +TGAGTCCGGTCTGTATACACGCGGACCTGTTTCACTTCTCGTTGCGTTTGACGGGGCATAATAGAATTCCAAAGGTCTCTCCCATGCGGGTGAGACTCTTTTTACATATAT 0.9279999999999999 +GATTCACCATCTAACATCCAGGTAGCTGAAAGGCCAGTCGGAGGTTAAAAAAGGTTGATTAATTAATTTAGCCCTACTCAAACATTTGAGTGGGCTTTTATTTTATGATTT 0.973 +AACATATTCAATGGGTGAAGGCGAAAATCGTCATCTCGTTATTTCCCATAAAAGATAGCATAAAACCGAAGTCCGATAAAAATTGGATTTCGGTTTTTTTGTATCCGAAAA 0.9990000000000001 +TGGAGTGGGCTGCGAACCAATATTTTGGCACAGCCCCCTTCTATCGAAAACGAAAGGCATTTGAAACCCCATCGTCCGGAAAATAGGCGATGGGGCTTTTTAGCGGACGAT 0.997 +GATCCGGAAGAGCTGGCGACGAAGCCGGTAAAGTTGCCGGAGAAAGTGCGGAAAGTTGAAGCCGCGTAAATGAATAACGGAGCCGAAAGGCTCCGTTTCTTTATCCGCTAA 0.956 +CCATCACCGATTATCAGCAGATTGGCGATGCAGCGTGGAGCATCACCGTTCAAAAAGCCTGACTCAACATTGTCCTCCGCCGTACCAGCGGCGGAGGCTTCAAATTAGACA 0.8170000000000001 +GGAGAAAGCGGAACAGTTGGATGACTAAAACTTGGCTTGGTAAGCCAAGCTCAGAATGTTGACAAAATCCTAAAACAGTTTTCGTTTTAGGATTTTGTCATCTTTTCAGCG 0.682 +AGTCCCCTCGCCCCCTCGGGGAGAGGGTTAGGGTGAGGGGAACAGGCCCGCACAAGCAAACTTATCAGCAATCTCAGGCCGGATATTCATTCGGCCTTTTACAAAAACCTC 0.9159999999999999 +GCTGTATGAAATCAAAAAGTAATCTGCTTTATGCCTGATGCGACGCTTGAGCGTCGCATCCAACAATGACAAGCGGTGGAGATCTTCTCTGCCGCTTTTTTTTTCATCAAT 0.973 +AAGGCGCTGTCGCAGACGAATTCGGATGGACGACCGAAGTCGCAGCGCTGACTGAAAGCAAGTAAGAAAAAAGCCGGCCCATTACAGGCCGGCTTTTTTTACGCTTCAATT 1.0 +ACATCAATGTTTTAACCGTCTTAAAAAACTAGACAAAGCGTGAATAAAAAAAGAGAAGGTCTTTCATCAGTTTACTAAACTGTTGGGAGACCTTTTCTCCATATTAGCGGT 0.9840000000000001 +TGAGTGCTTTTGACTACCTGATTCGCACCAAAACTGCATAAGAAGAAGTAAGATTCACCTGCAATTGCTTAGCCGCCGGGGTCAAACCTGGCGGCTTTTTTATGGCATTAA 0.995 +CGCCGAAGGCTGATGCAAAGCCGGAAGCTGATGCGAAAGCCGCAGATTCTGCTAAAAAATAAGCATTAAGAACCGCCGCCTGACCAGGCGGCGGTTTTTTTATTACAGGCC 0.992 +CCGGTCTTCACAACCCGCCGATGGATGACGCCGTGATACACAAGCTGGCAGCTGAAGCGTTTGAGTCATAAAAAAAAGAGCCTTACCGGCTCTTTTTATGTATTGTTTTGC 0.885 +GGAAAAACGCAAACCTGATTTCGGACAGTTCCCTCGTTTTCCGTGATCAGCAATATCTAGTAAACCAACAGCTTGAGACTTTGCGGTCCAAGCTGTTTTCTTTTCAATACA 0.925 +TGATTGACACCCCTGCTCCTCCCGTTATCTATCCATTCACTATTATCCGGATCTTCGCATCATCCCCTCAAAATTGACCAGTCGACCAGACTGGTTCTTTCTTTTTGCCTA 0.972 +TGCTGCGAAACAGCTCAATACAGAGATTCCGGCCCCTGTTCAATACGAAAGAGGCTGGTAATGCCCATAAGAGATATCCTGTAGAGGATATCTCTTTTTTTATTTTTAGAG 0.992 +ATAGAAGAAATGAGAAAAGACCATTATGCCGTCGCAGGCAAACGCCTGAGCGATATGGAATAATCTCTATAAAGCCGGCGCTTCGCGCACCGGCTTTTATTATCCCTATTG 0.958 +CATGAGCTTCGCCAATACATTAATGATGTCGAAGCCGCCATGCTTGAAGGCGGCGAATAAATGCAGAAAAAAAGCTGGCGTTAGAGCGCCAGCTTTTTTTATATTAAGGGC 1.0 +ATGAATTAACGCAGAGCTTTTTCGAACGTTGTCAAGGTCAGCAGGATCAAGACATTTCATACAATAATTAAGCAGAGGCTGTGATCAGTCTCTGCTTTTTTTTCTGCGTTC 0.991 +TGGACTTCATCAACCTGTTCCTGATGTTGTTGCGGATCTTCGGCAACCGCCGTTAATTCTTTTTCACCATCACTATCCCGGCCATGTGTCGGGATTTTTTTCTTACTAAAA 0.968 +AAGCTGATGAACGTTATTTATGACAGGATCGACCGCAAGCATCAAAAAAATCTGCAAGAGCAAAACAATACATAAAAAGAGCCGGTAAGGCTCTTTTTTTTATGACTCAAA 0.9740000000000001 +ACGCTATCAAAAGCATCTGGCAAAAAGAAAGTGGAAATGTCTTATATTGGCGGTTAATTTGAAGTCAAAAGCTTATCGGCACTGTCCGGTAAGCTTTTTTATTAGAATCCG 0.972 +ATAAACGGCCCCTTCTGGGCAATGCCGATCAGTTAAGGATTAGTTGACCGATCCTTAAACTGAGGCACTATAACGGCTTCCACAACAGGGAGCCGTTTTCTTATGCCACTT 0.985 +GTCATACTCTGACTCAGAGTCAGAACGACGCGCTGGTAGCTGTCTTCCAGGCTGCGTTTAGCAAGTAAAAATACAGGGCTGGAATCATCCGGCCCTTTTTTCTGATATGAT 0.94 +TTTGTCCCGGATGTTGATCAGCTACCGGATGTTTTGTATCCGCTGCTGAAAAAACTGCTTCATAAAAGCATAGGATAGCCCTTAATCCTATGCTTTTTGGCGTTTGTTTTT 0.961 +TGCGAGGATTTGCGCCGCAACTCCCTGAAACGCTTCCAGGAACTGACGAAGCGCCTTCTCAGGCGAGTTAAAATTAAGGCGGCAGCAATGCCGCCTTTTCTTTTTCCGAAA 0.9670000000000001 +GTTCGCGGTAAGATGGATGGCGGCTGGACTGGCATTGTAAATGAACGGCTTCGTAAGGCTGTCGAAGAATAAAAAAGAGAGAGGTATCTCTCTTTTTTGTTTTTTTCCAAG 0.898 +TCTTCTCCATTTAATGAGGACTTAAAGAATTTAGGGGACAAGCTAGACTTAAAAATTAACTTAGAGCATTAAAAACAAGCCCTATTTTGGGGCTTTTCTTTAGAAAATAAA 0.9420000000000001 +TGAAGTCAAACCGGGACATTTTGTGATGTGCACGGAAGCCGAATTTAAAGCTTTTTCATGATTCATCAATCCTTCAAGAGATTTCTCTTGAAGGATTTTTTTGCGTCTTCA 0.998 +GTATGAGCATCTGAAATTTGTCCGTGACAAAATTCTAAAAGGAATGGATGAAAAATAATGGTGAGCAACCGCAGTTGAAACGTAAGAGCTGCGGTTTTTTTAAACCAATTT 0.9570000000000001 +AACGAGTACGGTAATTTTGTAATCAAATAAGCAGTGAGGAAAAGAAACCGCTGCAGATGAACTCAATCTAAAAGATACCAATGATGTTGGTATCTTTTTATTTGCAGTAAA 0.995 +GAAAGTCCAGAGAGGCTTGGAAGGGTTATGAAGAGAAGGAAGCTTCAATGCTGCCCTCTATTTAACCATACCCCGAGTCTATCTTAGACCGGGGTTTTTTTTCAGCCTTAA 0.852 +TCATGGGTGAAACGCCTGCTGAGCTTGTTGAGAATATATATACCTTTATTGAAGAAAATCCAATGTTTTAAAGAAAAGGGCCTGGACAGGCCCTTTTTTATTTTGCCGTAC 0.998 +GTGTGAAGAAGAGAAATAAGGAAAGCGGCACCTGAAAAAGAGCCTTTGTTTCCATGGTGAGTCATGTAAGTACGGTTCACACCCGTTTACATGACTTTTTATATTTGTCAG 0.615 +GGTTGTATGACCTGATGGTGACATAAATACGTCATCGACAGATGAACGTGTAATATAAAGAAAAGCAGGGCACGCGCCACCCTAACACGGTGGCGCATTTTATGGACATCC 0.8109999999999999 +CATATGTTAAAAGATTGGACATTCACATTAGTTGAAAAACATCAGGGTGAATAAAACAAATCTGAGGAGAGCGAGATTCATTGTCACGCTCTCCTCTGTTTAGCTGTATGG 0.917 +GAAGATGTAATCTTAGTCCGATTAAGTGAACCACATGCATAAACCTGCTTGTCTGCCGCAGATTGGCAAATGCCGGAAGTCTCGATTCTTCCGGCACATTTCATTTAGAAA 0.887 +AGGCACACGCCAATATTGGCGAAAAGCAGAAACCTATCCCCCGAAAATTCAGCTCGGCGAAGGATTGTAACAACAGAGGCTCAAGAAAGAGCCTCTTCTTTACTACTGCAC 0.691 +AATAAAGATGAGCCGATTAACAACGCGCTGGCAGAACAGCTGAAAAAACTCGGTTTGGATAAATAATAAAAAAACCTGTGCCTATCGCACAGGTTTTTTATTGGACTTAGG 0.977 +TTACAAATGCAATTAGCTAAATACTCCGTACAATAGAATCTCTTCTTAAGGGCCTATCTAGGACGAACCATACCAACAACAATAGTTGGTATGGTTTTTGCTGCGATGAAG 0.9179999999999999 +ACAGCTTCAGGCTGCTTTTCAAGAAAGCATTCAACAGGCTGCTAGCATTTCATAAATGAACGAAGAAAAGAGCCCGCTTTTTACAGCGGGGCTCTTTTTATTTTTTAACAA 0.971 +AATTGAGGTGAACTTCAATCAAGAATATATTGAGCGGTTGATTGATGAAATAACTAATTTAATAAAAAAACATATTTAAGCTTCTTTTGAGGAGCTTTTTTATTTTCAAAA 0.794 +TGGTATTTTTATGGTGGTTTGGTTTGCGTAAAGCGAAAAAAGCCCGTTCCGTTAATATGATGCAAGTATAGAAGCCGCACAGCAACCGTGCGGCTTTTTTGTGTGCAGTTT 0.993 +GACACGCTGAGAGAAGGCCTAGACCGTTTAGAATTATTTGTATTAAAAAAACGTGAAGCAATGCAGACGATAAACAACGGCGTTTAAGCCGTTGTTTTTATTTTTTTGGAA 0.992 +TCCGCTGCATAATCCCCAGGATTATGTCTACAGCAAAGAGTTGTATATCAAGGCACATTGATGCCTTTATCGCCGTCGGAGCACGTCCGGCGGCGCTATTCACGTTGATCT 0.672 +ATTTGGTTCATTCTTTGTCGGCAATCACCTTTCGCTTGATACACTTCCTCAATTCTTATATGTATAATATCCCCTCCCGCCCTATCCGGCGGGAGTTTTTCAATTCTCCTT 0.9990000000000001 +TGGATTGCCTGATGTGACGCTGGCGCGTCTTATCAGGTCAACGTGAACAGCCCTTCCACCTGAAATTAATTTTAAAAAGGGGCTAACAGCCCCTTTTTTGTCAGGAGAATT 0.654 +CGCGTAGAGATCGAAGTTAAAGGTATCAAAGACGTTGTAACTCAGCCGCAGGCTTAAGTTCTCGTCTGGTAGAAAAACCCCGCTGCTGCGGGGTTTTTTTTGCCTTTAGTA 0.995 +ATGGACCCGAAAGCCCGCGTACAGCCTCTTCACTCAACAAGTCTTTTTGAAAATAAATAATAAAAAAAGGAGCTTGTGTCTGCGACACAGCTCCTTTTTTATTTGAAGATA 0.997 +AAGTCCATCATGCGGTAGCAGCAGCTGTCGAGCATGCAGCTCTTACAGCAGTGAAATAAATGTAGCTAAACAGCACCTGTCGTAGCAGGTGCTGTTTTTTTCTTATCATTG 0.987 +CGCTTAGCGAGGTAAATCAGGAGAAGTTTAACAATATCAAGAAAGCACTAAGCGAAGCGAAATAACTAAACCTTCATGCGGCGGATTTTTCCGCCGCCTTATTGAGCGAGA 0.828 +GCAGTCCGCTTTCCTCTGCGATCAATGGCGCGGCACTTCGGGCGGACGGCGGTTTGGTCCGCAGTGTCTTTTAAACCTCTCCCAATAGGAGAGGTTTTTTTATATATTGTT 0.9640000000000001 +ACCACGGATGCGTCAAAACATATGAAACGCTGATCGAAGGAGAACATGTTTATCTCGTATATTGATCAAAAGACCGGATCACTTTTCTAGTGGTCCTTCATTTTTTACAGT 0.514 +TTATGCCGCGAAAAAGATCACAAACTTTGCCATATCATCTGGCGGGAATAAATAATCGCTTTTAGCGTGAAAAAGGGCTGACAACTGTCAGCCCTTATTGTTTTTAAAGCG 0.9179999999999999 +CGGCAGAAATAAGCTCGAAGATTACGACATCTTAACATTGATGAAATACTAAGGATAACCGCATAACGAAAAAGCACTCCATGTCAGGGTGCTTTTTTCCTATTGTTTTGC 0.9520000000000001 +CGGGATTACAGGTGAAAATCTTCACGTTGATTCTGGTTTCCATATCACTGCCCGCTAAGCAGGATCTATATCAAGCAATCCCGAATAGGGATTGCTTTTTTATTTCTTCTG 0.996 +GTGTAACGCTCTGAAATTACAGCCAGACGAAGACGGGATTGGCGCAACGCTCCAGCCTGCGTAAAAATACAGGCGCAACCCGTGAAGGTTGCGCCTTGTCTTTATTAGCGG 0.614 +TGCAAAAAAATCATCCATCCGGCTGGTCAGCAACTGTAGTTGTTAATGTGACAGAGCCATTGCCCATGATAGTGTCCATTAAAAGGATGGACACTATTTCCCCGGAACCTG 0.385 +CGCTGCGTAAAAAATTGAAAAAATACGGCATGAACTAATTCAGGTTAGCTAAATGCTTGATTAAAAAGGCGCTACTCGGCATGGGGAAGCGCCTTTTTTATAGGTGTCACA 0.9440000000000001 +TGCCTCGTTAAATTTTTTCAGGCCGGATATGGCGTTAACCGCCTGTCCGGCAAACCTGCCTGCAAGTCGAGCCTTGTCTGATTTATCAGCGAGGCTTTTTGCTTATTTATG 0.574 +GCTGGCGTTTCTGCTAATGCGGAAGCACTCGACTTTCATGTGACAGAAAGAGGAATGACGTAAGAACAAGCCCCTTCTCATTAGCGAGAAGGGGTTTTTCTTTTCAAAAAA 0.996 +AACCTGTTATGGCAACATGGTTTGAAGAAGAAATATATTGGACGTATCAATGGAATGATTAATCAAAAAGGTATAAGGCCTTAAGCCTTATACCTTTTTTAATTTTAAGTC 1.0 +GAAGCGGTGGCAGGGTACTATAAGTCATCATTAGAAAGATTAAGGTCATGGGAGGCTTGAACATATAAAGAAAAAGGACAACAGCTGTTGTCCTTTTCAATGCCGCTTGAA 0.732 +CGCTGCAAAATCGTCGCCGGTCAGACAGTGAGTTTTGCAGGTCACAGCGTACAGGTTGTTGCCTGATCCCTTTCCCGGCCTGTATCAACAGGCCGGTTCTCTTTTCAGACT 0.8740000000000001 +CCTGCAAAAGCCTCTTACCGATTATTCGGTGTCATGGAAAAGCTGTTAAAACGCAAAAAGTAAGCAATAAGCCATCAAGCGCAGCTTGGTGGCTTTTGTGCTCTTGTTTTG 0.946 +TTATCCGTTGTCAAAAATTAAAACATGTTAAACATACAATGCGTTAGGCTATTGAGAAGCTCTCGATAGTCTTAAAGGGGGCTTAACATGCCCCTTTTTTCATTTCAATTT 0.79 +TCCGCTTCTTTTTTTACTTTAGCACCTGAATCCAGGAGGCTAGCGCGTGAGAAGAGAAACGGAAAACAGCGCCTGAAAGCCTCCCAGTGGAGGCTTTTTTTGTATGCGCGT 0.462 +AAGCCCCGAAAATGTAATTAACCTTTTTATAAAAAGTTACGCTATGACAACATATAAGAAGATGTTTTGATCAATCTTTTTTCAAAACATGTTCTTTTTTTGTTCGTTTAC 0.988 +GTCCTTCAGATGACGCATCATAAAAGTAAAAAAACAGCCGCTCTTACTGAGTCGGTGAAATAAAAAAGCAACCCCGTGCAAAAAGCCGGGGTTGTTTTTTGTTACTTGCAA 0.982 +TATTTTGGAATTAGATAATAATAAAGTTGCTAAATGAGGTTGGTTTGTTTTATATTGACACTTCAAATGTTGGGAGAAGTTCTGCGAGACTTCTCTTTACTTTTTAAAAGT 0.496 +GAATGATGGCTAAGATTAAGTCATTTTTCGGAGTAAGATCTTAATGTGATAGAATCAAAGAGAAGAATCTGACAAAGCATATGCTGTGTCAGGTTTTTTTTGTTTTTGCCT 0.9940000000000001 +TTCGACAATACCGTCGCCATTCACCCAACGGCGGCAGAAGAGTTCGTGACAATGCGTTAAATGTTAAAGGGCTAAGAGTAGTGTGCTCTTAGCCCTTAATTACGTTTCCGC 0.527 +AAAGACACCGATTTAAGCGAGGTTCAGGCGGTGCTGACAAAGCATTTGTGCAGCTGAATATTTGATGAGCCAGCCGTTCAAAAAATGGTTAGGCTTTTTTTATATATAAAA 0.826 +CCTTATTTCCACATTTATTCAAAAAACTGCAAGACGGACAATTTAACACCTTACATATGAAATTCGCTTTCTCAAAAGCTGCCGGCCGGCAGCTTTTTTACATAAAAAACA 0.9940000000000001 +AAGAGCTATGGGACCGGAGATACCGGGTGGCTTGAAGCATTAAAGCGTCAAACGGGTGTGAATTATTAGGTAAGCTGTTCATGTAGGACAGCTTATTTTTTATGAGAATCT 0.981 +CAGCAAAACGAAGACGGCAGCATCGAGTTGCTGGAACGTTGCCCGCCAAGAATGGGAAATCTGTAAAAACAACAAAAAACCCCTCGTTTGAGGGGTTTGCTCTTTAAACGG 0.711 +TCGCGGGCAGTACATTTTAGAGGGTGAAATCGAATCTGCAGAATCAGCAAACGAATAAATATAAAGAAAAACTGCTTGGCATCGCTCAAGCAGTTTTTTTATTGAGGTTTT 0.996 +CACTGATGCTGAACTTGCCACGTTAATCGAGCTCTTCAGCAAACTTGATAAAAAACGGTGAAAAAAAGCACATACCGATATATAGGTATGTGCTTTTCGCAGTGCTTCACT 0.895 +ATTAAAGAGATCATTCAGCATATTAAGGAATTAAATGAAGCGCATCGAAACAAACAATAGCCAAAAAGCAGTCTGTTCATTATAGCAGACTGCTTTTTTGTGATTCAGCCG 0.955 +TAGGCCGCCGTTTGCCAGGTTTCTTTAGGAAAGGCAGACATGCAGGCTGATGAACTAGATATGATGAAACTGAGGGCTCGCATGCTGCGGGCTTTCTTTTATAAAGGAGAG 0.723 +AGGTTCTGCAGCCTCTTCATATAAGGGATAAAGTAAAAGAAACCATTCATCACATGCAAAAAATATATGAAACCTGACACACTGCTGTCAGGTTTCTTTGTCTATAATCTT 0.9990000000000001 +ATATGATCGAAGAAACACTAAACACATTCTTAGCGGACGAAGACGAAGAGTAAAACATCGCGCCAAACCAAAGCCGGAGCTATACTGCTCCGGCTTTTTTCTGCTTTATGC 0.993 +GCGCTGGCAGACAAAGAAGCAGAACTGATGCAGTTCTGATTTCTTGAACGACAAAAACGCCGCTCAGTAGATCCTTGCGGATCGGCTGGCGGCGTTTTGCTTTTTATTCTG 0.77 +ACCACTTTCAACGAGCTGATGAACCAGCAGGCGTAATTTACGCAGCATAACGCGCTAAATTCGCACAAAGGCCCGTCACCGCCAGGTGGTGGGCTTTTTTTTGTCATGAAT 0.978 +CTGCGTAAGCATGGCGCGAAGGCATATTACGGGCAGTAATGACTGTATAAAACCACAGCCAATCAAACGAAACCAGGCTATACTCAAGCCTGGTTTTTTGATGGATTTTCA 0.7859999999999999 +AATGGCGAGATCACAAGTTTTCACGTGATTTTAGAAGATGATAAAATCAAAGTGCGAGCAAAATAACAATCAGCGGTTTCCTTTTAGGAAGCCGCTTTTTTTTATACTTTC 0.996 +AAAAGGTGAGGAAAATGGGTGAGGCCGCTTATCAAAAAGCAAAAAAAGGTGTGTATATAAAAATCTTAAAGCAGTAATTGCCGCCTGACAGCGGCTTTTTTTATGTTTGGC 0.96 +GTGAAAAAGAGAGCCGATAAAGTGGTTCTGACTGTGAAATGGGATATGAAAGACCCAGTGTAATCAAAAAGGCGGTGACTGCATAGTCCCGCCTTTTTGATTGTCATTACA 0.81 +TTTTGCTGACAGTGCTGCTCGGTCTGCATTTGCCAATTGGTTTTCAATTGTTTTAATAGAAAAACCTATGAACCCGGCTCTTTGATAGAGCTGGTTTTTTTTATATTTATC 0.988 +AACTATTGGGCCGTAAAGTTCGGACCAATCACGTCAATCGCATCGGTACAGATGCAATCCACACCCCAGCGCAGCAACTCTGCTGCGCGCTGGGGTTTATTGACGGTATAA 0.6679999999999999 +CCATCACTGGTTTGGTTAAAAATTAAACACTTGAAAGTGTAATTTCCGTCCCCATATACTAAGCATCAGTAAAAAACTCCCGCCTTCTGGCGGGAGTTGCTATTTAATTAC 0.47600000000000003 +CGATCGCCAGCTGCCGGGAAGTGGCTGGCCAGCTTTTATTCATTAAAAACAGCAGCTCCTGCACGCCTTAAAAAAACCGGTTCAGCACCGGTTTTTTTGTTATTTGATAAA 0.9540000000000001 +CTATGACACGTGTTTGTGCACTTCCGCATCCTGCTATGAGAGCTATTTAACATTTGAGAATAGGGAGTTGAGCGTATTTGCTTATACTCCTTATTTTCTCTTAAGGGGGAT 0.899 +TGTGATCGTACATGACGTACGCGATGTGTTTGGCGAAGGGTTTGTGGATATATCTAAATAAAGAAAAATCCCTCTGTACTTGAAACAGAGGGATTTTTTCATTTAGAACGG 0.997 +AGATAATATGCTATTACGAAATGAAGCAATTAATGCTTATAGAAATTCTATTTAAATAATGAAAACAATGCCCAGCTTAAATAAAGAGCTGGGCATTTTACTTATTGGGGT 0.998 +CCGGCCTACAAAACCGAAACGTATGTAGGCCTGATAAGACGCGTCAGCGTCGCATCAGGCAGTTGTTGTCGGATAAGGCGTGAAAGCCTTATCCGTCCTGGAATGAGGAAG 0.596 +GGTTAGACTGTTAACAACAACCAGGTTTTCTACTGATATAACTGGTTACATTTAACGCCACGTTCACTCTTTTGCATCAACAAGATAACGTGGCTTTTTTTGGTAAGCAGA 0.795 +AGTTACCTGCCGGATGCGGCGTAAACGCCTTATCCGGCCTACGTTCAGCACGTTACTCCAGGCCGGATAAGACGCGGCAAGCGTCGCATCCGGCAATGTCTGCCCAGTTCT 0.639 +TAAACATTTCCGAGGAATTGCCCAGCAAGGCGGAGATGTTATCTTTCAATACTTAGAACAGCCAAGAAGCTGAAGGTTTCTCATACGTGAGAAGCCTTTTTATCATATATG 0.993 +CCCGTTAACAGACAAAGGAATCGAACAATTCCTGGCAGACTGGAACAAATAATGAAAGGGGCGGCAAACAGCTTTATGCCTGTTTGCCGCGGCCTTTGTATTTCACCGACC 0.5589999999999999 +GCGTGCTGACATACCCATCATTATTCACCGTTCATCAGCAGCTGAAGCAAAATGAATGACAGACAAAAGCCGCTTTCCGAGGGAAAAGCGGCTTTTTGTACTTGTTATTCC 0.927 +GGTTTGGTTTATTTTGCTGATTGTGATTTATAAAGTGAGAAAAGCGAAGCATCAATAAGACTCAAAACTCCTGCCTCAAAAATGAGAGCAGGAGTTTTTTTGATGAAAATG 0.995 +TCCAGGCTCTTTATTATATCGTCACCCGTCATATTTATAAGCGTGCTGTTCTTCAGCGTATGTAATATGAAATCCGGTCTGTTATCAGACCGGATTTTTGATTTATTGAAA 1.0 +ACGGATAAGCAAGAGTAGGCTGGATAAGACGCGCCAGCGTCGCATCCGGCATTTTTTTCTGCGCCAATGCAAAAAGGCCATCCGTCAGGATGGCCTTCTGCTTAATTTGAT 0.8590000000000001 +TATGAAAATGGTAGGATTGTTACTGATAAAGCAGGCAAAACCTAAATTGCAATGAGTGCGGATCATCTCTGTCTGTGCTGATGGTAATTTAGGTTTTTATTTTTTTCAGAG 0.7490000000000001 +GGATCGATTTTATCGGCCATCATGAAAAACGGCACCTCAGCCAAATGAAAGAAATTAAAGAAAAGATAGAAAGGGCCTGATTTAATCAGACCCTTTCTGCTTTTCGTTTGA 0.92 +ACGCTTTCTTCGATGATCCGAAATCGGACCGCGCAAAAGACTTCCTCGCGAAAATCCTGCATTAATCACTCTGGCGCGCATACCCAATTGCGCGCCATCTCACGCTTTATC 0.759 +TCAGTATCGGCGGAATTCCAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGGTGTCAAAAATAATAATAACCGGGCAGGCCATGTCTGCCCGTATTTCGCGTAAGG 0.51 +AACCCTTATTTTGACGGTTTGTCTTCATGCGGAGGCGGCTGCGGTTCAGGCGGAAGCTGCGGATGTAAAGTGTCCTGACGAGACTGGCCGTCTCGTCTTTTACATATAAGA 0.326 +GACAATGCCTCTATGGCTGTATGTAAGCCTTACAGGCGTTATTGTATATTTGATGATTTCACCGTATTATTAAAAGCCATACTCAGTATGGCTTTTTTTACGTGTAAACAT 0.971 +TCTATTCATATTGAAAGTGAAGTGCGAAGAGGCACAGAGGTGACGATCACCCTCCCCGTATCCGCATCATAGCCCCCCTGACCATGTCAGGAGGGTTTTTTATCTTCGTGA 0.993 +CCATATGGACGGAAATCCTCCTGGAGGCGCTGAACACTCAGAGCATTCAGGCCATAACGAATAATACAAAAAACCCTCTTCAGTGGAAGAGGGTTTTTTGCTGTCTTCAGC 1.0 +TTGCAAAAGTGACGGCAGATATGGATCGGCTTCAGGACCATGTTGACGTGAGTTTATCAGTGCTTGATTAACATGTGGTAAAGGATTCTTTGCCCATGTATGTACTTTTTG 0.9159999999999999 +TCATACTTATGAGCGTACTGACAAAGAAGGCATCTTCCATACTGAATGGATGAAGTAATCAATAGAAACCCCCGAAGCTCTTAAGCTTTGGGGGTTTTGTTATTAAGGAGA 0.996 +GGAAGCGGTCATTATCCGCAATGCCGGAGATAAAGAGCCGACTGTTCACATTTATGACTGATAGAAAACCCTTGTGCCATATTGGCACAAGGGTTTTTATGTTACAATTAA 0.997 +GGCAGAACGGTTATCATGGATCTGTGCATAAAAGTGGCACACGCTGTAACAAAATAGGTGCAGAACTGCAAAAATCCTTGTAAATTCAAGGATTTTTGCTTTTTAATTTTA 0.9740000000000001 +CATCCTCTTCGTAAAGGATGACCGTGAAGACGAAGCAGTCGCTTCGGGATCATAAAAGGCAGCCCGGCACCGCAGGATGATGCGATGCCGGGCTGTTTTGTGTTACGCTGA 0.982 +GCCTGAACCGCTATGGCAAGCAATTGTGAACACACAAGGCTGGGTGCCTTATAAAGATTAATGTGCAAAGACTGCCGAAACGATTCGGCAGTCTTTTTTCCCTTTATATAA 1.0 +ACAGCGGCTATCGGCGCACCTTATTTTATTTATTTGCTCTATAAAACAAGAAATTCTTAATCGTTACACCCATTTTCTTAAAAAAGAAAATGGGTTTTTTTGATAATGACT 0.988 +CTGGTAGCCGCCTGGCTTGGCGATGCTCGCCTGATCGACAACAAAATGGTCGAGCTGGCGTAATACTTAACTGGCGCTACGGCTGATGGCGCCAGTTATTAATTTACCCCA 0.745 +AGAGTACGGCAAGCTGATTATTTCAGAGCATGCTCTAGAAACGATAAAAAAACACTTTGCATCATAGCAAACCGATTTCCTTCCGGAAATCGGTTTTTTTTATGCACGAAA 0.9990000000000001 +AAAAATACGGTCTTAAAGGCGCTCGTCGTGCACCTCAGTTCTCAAAACGTTAATTTTACGTTTTCAAAAAGCTCTCGACCTTGGGTTGGGAGCTTTTTTCTTTCAATAAAA 0.945 +CATTTACCCGATTGTTCGAAAAATCATGAAGGACCGCAAGGCGTCTAAACAAAAGAATATGTCCGCATCTAGAACAGCCGGCTGATCCCGGCTGTTTTTTTATAGGTCAGT 1.0 +CTAAACGCATCTTCTTTGCGCGGTAGACGAGTGAGCGGCATCCAGCATTAAGCCAGCACGCAGTCAAACAAAAAACCCGCGCCATTGCGCGGGTTTTTTTATGCCCGAAGC 1.0 +GTTTTTGCTGCTGCTCGTCTGTATCATCACCTTTTTGCAGCATCACGTATTCAGCTGGATGATTCCTTAAAAAAATCCTCCCGTCTAGGGAGGATTTTATTTATATCGCTT 0.818 +ATCGTTGAAGGAAACCGCGGACCACAAGCTGCTAACGTTACTAAAGAAGCGTAAGCATAAATTGATATGAAAAACTGCAGGTGCAAACCTGCAGTTTTTATTTTGACAAAA 0.998 +GATGCGGCGTGAACGCCTTATCTGGCCTGCAAAAAATAGCAATACCGTAATCAGGCATAATCTGCAGATACAAAAAACCCCGCCGGAGCGGGGTTTTTTACAACTTATTCA 0.993 +AGCCGGCTGATGATACGCTTGACATCCCGACATTCTTAAGAAACCGTAATAAACGCGGCTAATGTAAAGGACAAAATCGTTTTCGATTTTGTCCTTTTTTGTTTTTCTCTT 0.9990000000000001 +GTATCGACGATGCCTTTAAACAGCTTGCTTTTCTGTACCGTCTCACCCAAAAAGTAACACAAGCATAAAAAAGAGCCGGGATATCCCCCGGCTCTTTTTCTTACATCTTAA 0.991 +GGTCCACGTTGACGGCGGTTTCAGCATTGCTGCAATGAACGAACTCGAACTGAAATAATCGTTCTGTTGGTAAAGATGGGCGGCGTTCTGCCGCCCGTTATCTCTGTTATA 0.915 +AGGGCTTTGTTGATTTACTGGCTGATCGTCTGAAATTTTACGCTTAACACACGTCATTGATTAAGATCACCACGCGTTACCTGGGTGGTCTTAATCTGTTTATCTCTTTTT 0.871 +CAACAGGCCATTGACGATAATTTCTGGCAAGCCGAATACCGCGACTACCGCCGTACCTCCATCTAAGCCTGCGCCCGGTAGTGAAGGCTACCGGGCTATTTCCCTCTCCCT 0.909 +AGCCAGAGAAAATGCTGGCGGCACTGAAAGTTTTTGTACAACCGATGAAAGCGGCGACGCGCAGTTAATCCCACAGCCGCCAGTTCCGCTGGCGGCATTTTAACTTTCTTT 0.992 +GCACCCGGAGAATACCGTGAGCTGACAGAGGAAGAGCTTCATTTGCTAAATGAGCCGCAAGCATGACAAAAACCCCGCATACCGAATGCGGGGTTTTATTTATAAGCACAC 0.992 +AGCCACCACCGTTTAGAAATTTACGGCGTCTGCCAAGAGTGTTCGAAAAAAGAAAATCATTAAAAATAAGCTGACCGCACGAAACGGTCAGCTTATTTTATTTCGACATTT 0.995 +CCGCCAGTGGCAGGAACGTGGTAATAACCTGACCGTCAGCGCAGATCTGTATAAATAACTATTCAGTTACTCCTATAAGGCGGCTTGATGCCGCCTTTCGCATTTTTAGTC 0.9640000000000001 +GAAGGCCTGCCGATTCAGGACCCTGTGGAATTTACGAACAATATTTGCAAGGTCATGGTGTAAACAGAAAAAGGAATCGTCTCATAAGGAGACGATTCCTTTTTTTATAAT 0.9420000000000001 +TTCACACGATGGCGACGTTTGATGAAGCTTGGGATCAATACGAGTCTGAAGTGCACGGCAGATAATAGCTGTTTCATCCGCAGATCACTGCGGATTTTTTTATGTCCATAC 0.955 +AGCCGCCTGAGAAGCTGATCAAAACTAAGAAATCAAATGAACGAAAAAAGAAAAAATGACTAAAAAGCAGCCCTCTCTTTGCAGAGCGGCTGCTTTTATCCGTTTCCAACC 0.978 +TCCTATCCCTGATTTTGAAATTGCCGGTGCAACAGGTCTATTCGGTCTATGGGGGTAAGGATCTCCCAAAAGGGCATAGTCATTCTACTATGCCCTTTTTAAGTAAATTAC 0.7240000000000001 +AATTCGAACGTTTCGGCCGTGACCGCAAAAAAGTGAGCGTATATCCTGTAGCTCAATAATGATTCAAAAAACTCCGGTCGATGATGACTGGAGTTTTTTTTGCAATATGCG 0.956 +CCGAAAGAGGGAAGTATAATAGAGCGAAATCCTTTATATCAAGTAAAAAGTGTATAGATCATGCCAAGCGTTCCTCAGCGTCAGCGGAGGGCGCTTTTTTTTGCTGGAGAT 0.9940000000000001 +CGTTGATCCGTCAGACTATCATTACGGCAAGGAGAAAGAAATCAAATAAGATGAAAAAGAGCCTTGAGCGGGCGCATTGCCTTCGCTCAAGGCTCTTTTTTTGGTTATACA 0.992 +ATCTAGACTACTAAGCAGTTTTTTTTCATGTTATGATATACATCGGCTAAAAACGACACCCCGAGTAGGACGAGAGATGATTCTCGTTCTGCTCTCTTTTTTGGGCCCTAA 0.598 +GATGAATCACCACAGGCCTGATAAGTCGCGCAGCGTCGCATCAGGCAATGTGCTCCATTGTTAGCAACAAAAAAGCCGACTCACTTGCAGTCGGCTTTCTCATTTTAAACG 0.992 +TATCGAACGCGAGCAGAACTAACGCGACAGTTTTGCCAATAGCGTTTTGCGCGTGATCCCTAACTGACGGGCGGCTTCGGTTTTGTTGCCGCCCGTTTTCTCCAGCGCCGC 0.862 +GAGCAAGGCTTTAAAACAGTCAATGTTGAAGGCGGCATGATGGCCTGGGAAGGCGAAACAAAACCAAAAAACTAGGGGAGAGCGGTCCCCTAGTTTTTATTTGTATTCCGG 0.988 +CACGCCGATTTGCAGGCGAAAGGGATGGAAGTTAAGTAATCCTGTAATAATTACTTACATCTGTCTGATAAAAAACCCGCTTTTTGAAGCGGGTTTTTTGTATCGAACAAG 0.98 +ACAGCCTGCCGGGCGGAGTCGGCAAAGTCATGTCGGTCGGTGAGACGATCGAGCTTTAAACCGTGAAACCATCTCCTGATGAATCAGGAGGTGGTTTTTATTTTTTCAGTT 0.998 +GAGCTGGCGCAGACCATCCGCAAACAACTCCAGGCCCCGCGTCTGCGTAACTAATCCCTGAACAAATCCCCGGAAGGTTTTTGCCTTCCGGGGATTTTTTATTTGGCCAGG 0.993 +GTGGCCCCGCAGAGATGTTATGCCAGCATATGATGCAAATAAATAGCGGCCTATCAGATGCATATATAAAGAGAGAGCTTTTCGGAAGCGCTCTTTTTTTCTGGAAAAAAT 0.934 +TTTATCTCTCGCTGCACCAGTATTAGAAACCGTTTATGCGAATCTGCAAATGTATGAAGCAGAAAAATAAAAAAAGGAGGCGGAAAAGCCTCCTTTTATTTACTTAAAAAG 0.847 +CTAAAGCTGAAATCGTTGCCTCCTTCGAGCGCGCAAAGAATAAATAAGTTCTTCTGGCGTAATAACCCTGAACGCCGGGCTTCGGTTAGTAAGGGTTTTTTTATGCCCGCG 0.987 +TTATAAGCTGCTTGCTCAGCTTGGAGAAACGGCAAGACATGTTTTTCATCTGAGGATATTAAAGGAAACAGCCGGAGAATAAACACTCCGGCTGTTTTTTTATAGCTGATG 0.9420000000000001 +CAAACCGGGACTCTTTTTCAGAAAAGACATTGGGGCACGCGCTTTAAAGGCTGCCCAAAAATAAGTGAGGAAAACCCGCAGAATAGCTGCGGGTTTTTTGTTATCAAAAAG 0.995 +GACGAATCTTCCTATATGACAGGGCAGACCATTCATGTGAATGGCGGCCGTTTTATTTCAACGTAATCAACGAAAAACCAGCTCAAGAGCTGGTTTTTTGTGTGGTGAAAA 1.0 +ACGTAAAGCCTGATTACATCATCAGCAGCCTTCATGATTTGTTTCCTATATTAGAGAAATAAAAAAAGCATGATCTCTTCAATGAGATCATGCTTTTTTATTTTATTTTTT 1.0 +TACTCAGCAAAAGAATAAAGACAAAATTGAAGAAAATGCAGAAAACACAACGTCTTCTGATAACTAAAAAAAAACGCTCACATGATGTGGGCGTTTTTTTTATACAAAAAA 0.9940000000000001 +GAAAAATTGAAAAAAACATCATGAAATTAAATCAAGCAAAAAAACGAAGAATGAAGAGAAAATAAAAAGTGCACAGGCCAAAGGGCAGGTGCATTTTTGTTTTGTATGGAA 1.0 +CGCGCTGATCAAAGGCACCGACGTTGACCAGCCGCGTAACCTGGCAAAATCGGTTACGGTTGAGTAATAAATGGATGCCCTGCGTAAGCGGGGCATTTTTCTTCCTGTTAT 0.8270000000000001 +AAGGCCCGAAAAACCGTAAATTTATCAGCATCGAAGCTGAATAAGTTTTTCGCGTCCCGGTAACGGATGAAAGCCCCGCAACACGTTGCGGGGCTTTTTACATTCGATAAC 0.9790000000000001 +TAACACCGTGCCACCGGAGAAAACCGATGGCACGCAACGCAAAATATTACAATTCTGATAATTCGCCCGTTCTTGCCTGCACGCACGGGCGGTATTCTCACGCATTCAAGT 0.665 +CAAGAAGGCTATGCCCGCAGTTCCATCAACCCGTTCCTGTTTCCGGGTGAAGGAGAGTAATCTCTGACGTTTTCCGGGTGGCGCAGGTCGCCCGGATTACAGCTCATAGGG 0.733 +AGGCGTTAAAGCGCTGACAGACTTCATGGTTGAGTTCGAACGCCGTCACGGTTAATGCCGAAATTTTGCTTAATCCCCACAGCCAGCCTGTGGGGTTTTTATTTCTGTTGT 0.49 +CGTGCAGTATATACAATTGATATCCAGCACAGCTGACAGCCACTTTCATCAAAATAATACAGCGTAGCGCCTTTGAGCTTTCGTTCAAAGGCGCTGCTTTATATCGAGCCG 0.9490000000000001 +TAGATGACTTACTTTAGGTAAGTGGGCAGTTTGTGGGCAGTAAATCATTTGAAAATAAACACAAAAAAATACCAAGGGTGTCAAAACCCTTGGTATCATTGATGCGGCCGA 0.43200000000000005 +CCAAGCGCACCACGTTTATTCGTGTGCAGGAACAAGAAAACCGCATCTTCAACAACCTGTAATCTCCCAACGCTGGCGTGGAGCAACACGCCAGCGTTATCCCATCCCACT 0.904 +TACGTTGGTAAAGGGCGAAATCGGGAAGAGGAAAACGACGCTGAAGCAGAAAGCGAAAAAACTGAAGAATAAATTCAACCGCCCGTCAGGGCGGTTGTCATATGGAGTACA 0.387 +TGACACTGCTTGGGTATATCCCCGGTTTGATTCACGCGTTCTGGGTGCAAACCCGCGATTAAACCTGTGCTCATCCCCGCGGCGTTAACGCGCCGCGTTCCTCTGCTACAC 0.365 +AAGATGCAGACATCATCACGAAGTCCCAGGTAAATCTAAAAAATAGTCCAAAACATGGGTATAAAGATAGAATGCACCGGCTAAAAACACCGGTGTTTTTTTTATATTCTG 1.0 +ATGATATTACAAGTAAGCCGCCTGCGACGATTGAGTGGGAATAAGAATCAATTAATGGAAACCATCTTTTTGGCAATTTTGCCGGGAAGATGGTTTTATTTGTTTATTACG 0.996 +ATATGAAAGAAGTGCCATTGTGTTTCGGGACCGAAACAGCAGCGAATCATAAAAAGTGAAAAGGAAAGCCGGCGTGTTCAAATACATGCCGGCTTTTTTTGTTTTTCACTC 1.0 +GGAAAACAGCTTTCGTGTGAAAGACCCTGTTTCCGGCGCCTATATTACATTTGTGATTTAACTGCAATCCCCTTGCCGAAATAACGGCAGGGGGATTTTTTATTTTTGTCC 1.0 +ACACTCTGTCCTTTACCGTACCCGTTACGATCAACAATCTGATGCCTGGATTATGCGTCTTGCCTGATCCAAAAAGAACCCGTCGGCATGGCGGGTTATTTGTCCTGGTTA 0.439 +CGCCGCACTCGTCTGTACTAATTCCCCGAGAGCGTTGCTCTCCGATCAGACCGAGTTGTAGTTGTAAGGCCGTGCTTCCGAAAGGAATGCGCGGCTTATTTTCGTTTATGA 0.8959999999999999 +ATTTAGTTGTGGGGGTGCATGGACCAGTAAAGGCAGCATATATTCTCGTCTCTGACCGCTGAACTCAGGAAGCCCGGCAGGCATATGCCGGGCTTTTTGCTTGAAAAAACT 0.904 +AGGCTATCAATTGGTGACAGTTTCTCAGCTTGAAGAGGTTAAAAAACAAAGAGAAGCGAAATAAATTAGAAAAGGCTGTCCGTACGGACAGCCTTTTTTCTATTTAATAAG 0.99 +CCGTTCCTCTTACCATGCGGATGAACAAGTAAACGAAGCATCTAAAAAGCGTCAAGCACAAGCATAATGCCAAAACGCCAGATCATCTGGCGTTTTTGTCATTCCTATCGA 0.955 +GTTGGCTAGTAATCTTGAATGTATATAAACGTTCGCGGAGACATGGCTAAATGAGTGAAAGGATCAATCTTGTGAAAGTAATTCAAGGTTGATCCTTTTTTGTAAAATGAA 0.643 +AGCCGGAATCGCTGAAACCTCCGTCACTGCTGCATCCATCATGGTGTGAGCTATGATCTGAATAACCGAAATACCCGCTGTCCGCCTGCTGGCGGCTTTTGTTTTTCGAGG 0.642 +TTTGTATACCTTCAAAAATCAAGCATCTAATGACTTGCCGAATTAATGAGGTGCAAGTAAAAAGGAGTAGCAAGTTGAGCCATCTTGCTGCTCCTTTTTGCATTTTTATAT 0.7140000000000001 +AAAGATAAGCTTGAAGAGTGGATCGAAATGTCCAACCGCAAAGACCTTATCAAAAAATAAGCATGAAAAAAGCACCGGACAGGAATGTTCGGTGCTTTTGATTTCATGCAA 0.99 +TTGAACCAAAACGGACAGCCCATCTCCTTTTGTTTAAAAGCGATTTTGATACAAGGGATAAAACGATAAAAAAAGCCGTGCGCTGCGCACGGCTTTTTACATCGGCTGATA 0.9740000000000001 +TACGCAGAAGCCAGCATTGACGTGGCTAAGCGCAGCACTCCTCCGACTCAGCCTTGCGGTTGCAACTAATGTAAAAGACCGGTTAACGCCGGTCTTTTTTGCGTTTTACCC 0.99 +TATAACTGGACCTACACCGGCGTAGTGAAGAACGGAAGAATTTACTTAACGTCTATTCAATAAAACAACCCCATCACCAGCAGGTGATGGGGTTGTTTTTAAAAGAGCATA 0.98 +TGTTCTCTAACCACATTCCTGATTATCGTAATTTGATGACCAGTTATGACACGTTAACGAAGCAGAAATAAAAGAGAACCGGGGCCGAAACCCCGGTTGTTATGATTAGCT 0.7809999999999999 +GCCAGATCGACGATGAGCAACGTAAAAGCAAAAATAGATTATGAATACGACGTGACAAAGCAGTTTTAACTTATACCGAGCCGGTTATCCGGCTCTTTTTTATTCTCCGTC 0.981 +TCCCGGTCACAGTGAGTTTCATCTGCAAGTTGCCGAACCCACCTCTTATCTGTGCCGCTATCTGTAATTCCTCGCCTTCCCCTTGAACGGGAGGGCATTTTTCTGAAATAT 0.9009999999999999 +GGCATTTCTTTTATGGTCATCGTTTGGGCCGGTACTTTTGCCCTGATGATCTAAAAGCAAACTGTGCAATGAAAAACAGGAGCCATCAGGCTCCTGTTTGCATTTCTGACG 0.961 +ATGACGTACAGATTACAAAAGAATCACCTAACTATACAATTTCATAATAAATTGTTACAAATTAAAAACATTTGACAGGGTCTCTGACTCTGTCTATTTTTTTTATACTGA 0.996 +TCCATCCGCTGACACAGGAGAACATGAGGTTTCATGCTCCACTGCCTCAGGACATGAGCAAACTGATAAAAGGAGAGAATCATTGATTCTCTCCTTTTTTATATGGGGCGA 0.9890000000000001 +GATTGTTCATATCATCGTATGGCTTGTGATCGGCGGATTATGGTGGAAAGTACTAGGAATATGGTAGAAAGAAAAAGGCAGACGCGGTCTGCCTTTTTTTATTTTCACTCC 1.0 +AACCGGCGGGTTGAAGTTCTCATTTTGCCGAGAGGCGCAGCGGAAACAAATGAAAAATAGCAAAAAAGGAAGCCTTGTGACATATCAGGCTTCCTTTTTTTACTTATTATG 0.9890000000000001 +GCCACCATTCACGGTGAAGGTTATCGCTTCTGCGGTGATCTGGAAGATTAATCGGCTTTACCACCGTCAAAAAAAACGGCGCTTTTTAGCGCCGTTTTTATTTTTCAACCT 0.9790000000000001 +CTGAATTTATTACGGTAGCTCGCCCCTACGCCAAAGCAGCTTTTGACTTTGCCGTCGAACACCAAAGTGTAGAACGCTGGCAGGACATGCTGGCGTTTGCCGCCGAGGTAA 0.006 +CTGCCTGAAAGCGGAACGTCCAAAGCCTATACAGTACCGGGCCATGTTCAGTTTCCTGAATAAAAAATACGAAACAGCGGATCTTTTCCCGCTGTTTTTTCAATGTTCTGT 0.986 +AATTTTCTATTCACTCAGATTTCAAAAAACATTATTTGGAATAATGTTTGTCATCTCTTACCCCAAAAACAAGGCTTGTACAAAACAAGCCTTGTTTTTTTATGCATAAAC 1.0 +GGTATTAGAGATGTTCGGAAATACAGATTTGGCACAAGCCCTTCATGAAGCGATTTCTGGGCGCAAACAGTAAAAAAAGAGACATCCTGTGTCTCTTTTTTTATTGGAAAA 0.97 +TATGATGGCTCAATTTGATGAATGGTTTAAAAAAGAAAAGTAAGTGTTTTAAAAGCAGCTTAGACAGCCGTATGAGGTCGATGCCTAAGCTGCTTTTATTTATGCTTTTGA 0.903 +GAGCAGACGGCATGTTTACCGACTTTCCAGAAAAGGCTTCGGCATTGCTGAAAAATGAATAGTTGTTAGAAGGAGGCTGTTTGACGCAGCCTTCTTTTTTCATTCATTCAT 1.0 +GTGACGGGTCAGTTGCCAGACGGTATAGCCGGTGCTTGCACCGGCGACATCCCAGGCCAAATCCTTCCAGCTCCAGCCGCTCCCTTCGGGGCGGCTATCCCAAAGCTCTTT 0.732 +CTCTGTAAAAGGCGGAAAATTTCTTAGAGAATGCCGGAGATGCGGGATGAAAAAAAATATTTAAACAGAAAACCGAAATCAAATTGATTTCGGTTTTTATGTTTTTACTGC 0.992 +ACTTAGATCGCCATGGCTGTCTGGAGTCACTGCCTGATCAAAACCAATTGTCACTGTTCTAATATGGAAAGCAGAATTTCTCAGAAATTCTGCTTCTATGCATACATAAGC 0.579 +CAACATCTTCGTCAGCCTGCTGAGCATTCTGGGCTTCGCTAGCCGCGATTAATCTCACCCGCTAACACTTTCATAGCCTCGCTTTATGCGGGGCTTTGTTTTTTGTTACAC 0.93 +AAGAACTAGAGAATAAACTTGATCAGCTATAAGGCGTTCTGAGAGATTGTCCTGAAACTGTGAAAATAAGAAAAGCACCCTATACAAGGTGCTTTTCTTATTATGCTTCCT 0.927 +GGCGGACTTGAGTCTATGACGCAAATGATTAAAGAAGTGAAAGACGGTTTAGAAAAGTAAAACCAAAAAGAGCCTCCGCTAAATAGCGGGGCTCTTTTTTTGTTAATCAGC 0.996 +AGAAAGTTGACGCGAAAACTCGCGGCAAAGTCGGCATTAACGAACTGCTGCGTACCATCGACTAAAAGTATGCACACGGGCAGCACGACGCTGCCCGATTTTTTTGCAGGG 0.9009999999999999 +TACAATTGGTTGTGGTGGCGGAGCTGTTGCTTGTCAAAACTATCGTCAATTCTGCAGATAAAACATTTGTAGAGGGAATATTTTAAATATTCCCTCATATTTAAAGCGGGG 0.9359999999999999 +TAAAAAGATCAAAGACTTCATCGAGAAAAAATACGACGGAGCTGTGCTTCCTGTATCTGAATAAGACTGAAACCCCGGATGAGAATTCCGGGGTTTTTTTCGCGTATTTTC 0.998 +GCTTTGAAGTTGAATCAGTCGAAGAAATGAAGGCTAGTTTAAGTGCGAAAAGCTGAAAGAATCAATAAAAGCAATCGGTATGATGTCGATTGTTTTTATTTTTGAACAGAA 0.659 +CTACCGGTGGCCGTGTTGACCGCTTCAACAAGCGTTTCAACATCCCGGGCAGCAAATAAGACCGGATTTCCGAAAAAAGCGCCGCACGGCGCTTTTTTTGTGCCTGAATTT 0.687 +TTGGTGAAGACATTTCTGATGGCGGTAACGCGATTTCTGGCGCAGCAACGAAAGCGCAGCAATAAGCAATAACGGTACGACAGCTGTGTCGTGCCGTTTGTTTTTTCTGCG 0.905 +AGCTATTGAGTAGTAGCAACTCACGTTCCCAGTAGTAAACCCTGTTTTCCTTGCCATAGACACCATCCCTGTCTTCCCCCACATGCTGTGGGGGTTTTTTTTATCCTCAAT 0.978 +GGGATCTGGTGACAAAATGGTATCCGTCGTCAGAAGAAATTGAAGGGGGGCTTCCTAGCATTTTTAAATAAAAAACACGCACTGCGGTGCGTGTTTTTCGCTTTGTTTCTA 0.99 +GTCCTGAAACGGATTCAGCATTTAGATGAAGCATACAACAAAGTCAAACGTGGAGAATCAAAATAAACCTTCCGCTCACATGTGAGCAGGAAGGTTTTCCTTCTTTGAGGC 0.741 +CAGGTCGATGCCGATATGTACATCGTATTCGGCAATTAATACATAGCACGATTGATTAAATAACCTTAATAACAATGCCGACGTTATGTCGGCATTTTTTTATCAGATAAA 0.992 +GAAAACGGCAGGTTTCACGGGGGGCATATCGCAGCGGAAATCGCCAAAAGAGAAAACCATTAAAGATGAGGAGCATATGATGCATTCGTATGCTCTTTTTGTTTTTTTAAG 0.968 +ACTTCCGCGAACTGGCGATCTTCAAAAGCGGCGTAACGCTATAAAGTCGAAATGAGTTGTTAATAAAAAACGGCGTTTCATAATGTGAAGCGCCGTTTGCTTTTCCATACC 0.769 +TAAAAAAGAAAATGCGCGAACAAGGCTTGCTTGATTAAAAGCATTGCCCCGCCGGCGACATTGTAATCAAAAAACACCGCAAAGAAAGCGGTGTTTTTCATTTCTTTAAGC 0.983 +ACGAAAAGCCAGCCATCGTCAATCAGCTGATTTTGGAATATCTTAAGAACCAATAAGCTAAAATTTCTCTCCATCCGTCTGTCATAATGGCAGACTTTTTCTGTGCGTTTT 0.9890000000000001 +AACAGCAGTGCTTGATTCAAAAAAGGCTAACAAAATAAAGGGATTTTTACACAATTCTTGACTAAAATAAACCTGGATTTTCGGTAAATCCGGGTCTTTTTTGTACGCAGC 0.826 +TCGCATTCTTTTTTACCTTCCCCTGAATGGGAGGCGTTTCGTCGTGTGAAACAGAATGCGAAGACGAACAATAAGGCCTCCCAAATCGGGGGGCCTTTTTTATTGATAACA 0.935 +TTTGGCCGGCGCCATCGTGTCCGGTATAGCGATCAACATGCTTAGAAAAAGAGGATATCAAATGTTTTAAATGCCGCTTTCTTCTAAAGAAAGCTTTTTTTATGCGTTCAG 0.648 +ATCAGGTGCAACTGCCGCCCTCGGCGCGTTTGATTACCTAATCCGTAACTAATATAAGAAATCCGCTATATTGCCAGATTGGCAGGATAGCGGATTTTTCTTTTTCTACAG 0.9990000000000001 +AGCAGGAGGATCTGCGGGAATTTGTCACGCGGATTCAAAAGCTGAACCTGCCGTTCGGGTCATTACAAGAAACATCCCGCTAAAAAGCGGGATGTTTTTATATGCTTTTCA 0.977 +ACCTTTTTCAAAGCTCCCGTAACGACGTGAATGATTACTTAAATCAAAGCAGAGGCCTTTTGCATTAAAAACCTGCCGTTAACGACCGGCAGGTTTTTTCATTTTCTTAAA 0.9840000000000001 +CAGCTCTACCAAAACAATGTGCAAATGCTTGAGGAAGCGGTCCGAATCGAGAGAACCCAATAAACAAAAAAACTGATTCCGAGTTGGAATCAGTTTTTTATTTATCTTGGC 0.993 +GTATTTCATCAAGAGGTGTTAAAGCACAACGCGTAACATGCCGTTTTTTACCTGCATGCCCTCCTTTGTAATCGTTAATGGGGAGGCATGCAGGATTTTTTTTGCTCAGTA 0.775 +ATCGCTTGAGCGCAAGGTTAATCAACAAGACTAACGTTCCATCTTTTGTTCGCCAAACTTTACGGCCTGTCTCATTCTTACGATTGCGGCAGGCCGTGTTATTATTGTCGT 0.579 +AAATGAAGAATATGACGAAGCGCTTCAATACATAGAGGAACTGAAGCAGATTTGTTAACCATACCAAGCAAAAGAAATGATCCGTTTCTTTTGCTTTTTTTATTGACAAAA 0.996 +AAAGTGCTGGAACGATATCCAAATTGTAAGATTGACTGGGACCAGCCCATTAAACTCGCAAAAAAAGCAGAAGAGCGTTAAACGCTTTTCTGCTTTTTATAAATCAAACGG 0.9490000000000001 +GTTACTTGAATCCGCTTAATTTAGCGGTGATAATCCGCCACAATTTATTGTGACAAATCCAACCCTTCCTCGTCGGGCCTAACGACGCGGAAGGGTTTTTTTATATCGACT 0.779 +CGCCATGTATTTATCATATCGAAAACTTCATAACGAGCTGCAGTCATCTAATATTCAAATGAATTAAAAAACCTTTCCTCTTGTCAGGAAAGGTTTTTTATTTGAGAAGGG 0.995 +GCGCAGGTGACTGAATTTGCACACCGCATCGCCGATGCAGTAAAAGCCGTTTAAAGCGTTAGATAACTGGCTAAAAAGGCGGCGATTGTCGCCTTTTTTCTCAGCGAACAC 0.631 +CGCAGCCCTTGTGATTGCGCTGGTTGTAGGAACTGCATTGAATTATATAAATATCTTCAAATAAAAAAGGAGCTGTGTCGCAGACACAAGCTCCTTTTTTTATTATTTATT 0.9840000000000001 +TAAAAAAGATTGCAGGCTTGTGAAATTTTCCGTCTTGTATGTGCGACAAACGGTTGAATATTATGATGAAACGGTCCTTCCTATAAGCAGGAAGGTTTTTTAATGAAATAT 0.898 +TCTTTCGAAGGTATTTCCTTGAGATCCACTCTCGCTTCAGCTATTGCATCGGACTTAAAATTAAAAAAGGTATAAGGCTTAAGGCCTTATACCTTTTTGATTAATCATTCC 0.9179999999999999 +TGAAGGTTCGTAAACTCATCCGCCGTTTGCGCCGTATCCGCATCGACCGATTAATTAGCCGCATCCTGTAATCACAACCCCGTCCTGTACGGGGTTTGTTTTTTGGAGGCC 0.532 +AGATGGGGCTTGTTGTTCCTCAATATATGAAAAATGTCAACTCAATTGAAGATTTGAAGAAGTAATCAAAAAAGCAGCCTGTGTCAGGCTGCTTTTTTTGCGTTAAGAAGC 0.998 +TCCGAAATATCCTACACTTTCATTACGCAACGCTGAAATCGTTGAAAAATACTATCAAAAATAATGCAAAAGAGCCCGGCGTAAAAGCCGGGCTTTTTTTGTGCCAAGAGT 1.0 +TAACGCTGAAGATGTTTACAATCTCGCATTAATTACAGCGGCGCAAGCACTGTAATAAAATTGAAGACAATGGCAGCTCTCGACCGAGGGCTGCCTTTTTTGCGCTTCTGT 0.998 +AAGTAACCGACTTAACGAAACGCCATGCGAAAGAACTTTTAAAACAAGCGGATCAAGTCAAAACAACTGGGTAAGCTGCGCGAGAAGCGCAGCTTATTTTTTTCGTGCACA 0.985 +CCTCGCGATGCTCGCGGCGCTGGTCAATAGCGGGTATTGGTTCTTCGAAGGGTAAGTTTGGATTTAAGCCGGATGCGGCATCACATGCCGCATCCCTTTTCGCTTACTTCC 0.456 +GTCCAGAGAGGCTGAGAAGGATAACGGATAGACGGGATGCGTGTATAGGCGCGCACCTTGTCCTAAAACCCCTCTATGCTCTGGCAGGAGGGGTTTTTTCTTCTATATGAA 0.6409999999999999 +CTGATTCTGTTATTGATCATTTGTATATGGACGTTTATTCTTGCAAGATTAGGAGTGTAATAGAAAAAAGCAGTACATGCCCAGCATGTACTGCTTTTTTTATGTTAATTT 0.993 +CTGATTACCGGATCTCTTTACTTTATTTCTGACATTCGAAAAAGGTTGAAATAAAAGCATCCGGCTCCGGCAGAATCAAAAAAAGATTCTGCCGTTTTTTTCATGTGTAAA 0.9570000000000001 +CCTATGATTTTTCGTTAGATCCTGATTCACCTGCGCGGCCTGTCATTGAATAATTGCTATAAGCGGCGTTCTCTCGTTCAACCGAGAGGGCGCCGTGTTTTAATGGAAAAA 0.9179999999999999 +TAAATAAAAATGAATATCATATCTGTTATAAAATATCACCAAATCAGAAAAAAGCAGGTCTCCTTTAGTCAGAAAATATACAAAGAGATCTGCTTTTTATGATTCTTACAT 0.9590000000000001 +AAAGATGATTCAACATCCAACTCTCGGACAAACCATTGATGTAAAAGCATAACCTCATACCAGAGGCGGCCAAGAACTCCATCCTCCCGGCCGCCTTTATTGTTCACAAAA 0.8690000000000001 +GTGAAGACCGTGCATTGATCACCCAGGATATGTCTCGGTTTTATCCACATATTTTTGTTGAATAAGCCACGATACCGGATGGCACTCGCCATCCGGTAATTGTTAGCCTAT 0.579 +GTGTTATTTTCGGCATGCTTCCTGCAAATAAAGCCGCAAAGCTTGATCCCATTGAAGCGCTGCGTTATGAGTAGCCTCATGCAACATGCATGAGGTTTTTTTATGTTTAGT 0.997 +CTACGAAGCCATTATCAATAAACTGACGAAAAAACCAACCGATCAAAATTTCTAAGAAAGACAAAAGCCGGCGTCTGAAATCAAGACAGCCGGCTTTAATATTTCTCATAC 0.9570000000000001 +AGACGTGGTCAGACATCCGCTTGTTGCGAAAATTATTGAAGCATACGATAAGCAAAATTAATGCTGAATGACCCGATCGCGCGTTATCGGGTCATTTTTATGATTGCTTAT 0.7879999999999999 +GATGATTTCATCTTCACTCAGCGGCTGGCCATCAACAGAAATGGTAAAGTCGAGGCTGCTCCACGTGGCCAATTCTATGGCCTGAGGAGCAGTCTGCTTTTGTTTCTCGCG 0.518 +GCTTATGCAAAAGGCACTTGGAAACTAAAATTCAGTAATGATGTTATTGTGAAAATCAAATAACAGCTAACAAGGGTGCCTGTTTAGGCACCCTTGTTCTTTAAAAAAGGC 0.7809999999999999 +ATTGGCTGCTTTGTTTGGCTATATGCTGAATCGTTTTGAATTAAAACCAATGTTGCAGAAGGCATAAAAAAACCGCTCATAGGGTTGAGCGGTTTTTTCCGTATAGCCATT 0.956 +CGATGACGAAGTGTATGGAAAAATCAGAAAAACTCAGCAAATCCTGATGACTTTCGCCGGACGTCAGGCCGCCACTTCGGTGCGGTTACGTCCGGCTTTCTTTGCTTTGTA 0.64 +CAAGACAACGGAAAAGAAAGCTGAATGAGTATTCGCATAATCCCGCAAGATGAGCTGGGTTCAAGCGAGAAACGTACGGCGGATATGATTCCGCCGTTATTGTTCCCTCGG 0.39299999999999996 +GATAAAAAATCTACGTTACACCGTGAAACAAAATAACAGCTTTGCTGTTTGAGGTGTAAGAAAAAAGCCAGAGCTTTGAAAAAGGTTCTGGCTTTTTTTCTATTCCGAGGG 0.986 +ACTCGGCAAACCTGGCTTTGAAGAGAAAAAAGCAGAGCTTGAAAAACGTCTGGAAGATTAATGTGAGAAAGCTGACTGGCATTTGCCGGTCGGCTTTTTATAAAATCAGAA 0.852 +ACATGGGTACTGGCAGTGATAAGTCGGATGCGCTCGGCGTGCCCTATTATAATCAACACGCTATGTAGTTTGTTCTGGCCCCGACATCTCGGGGCTTATTAACTTCCCACC 0.9440000000000001 +CCTGACGAGGCGCCGGAAGTGAAAAAACGCACACCGATCCGCGACCGATTCACTGAAATGTAAAAAACCCCTTGTCTCAAACGGAGACAAGGGGTTTTTCATTAACGAGAT 0.9890000000000001 +AAAGCTTTAGAAGAAATTGATGCAGGCCTGCTTTCGTTTGAAAAGGAAGACCGCGAATAGTAGCACAAGTAGCAACCTATATCATGTAGGTTGTTATTTTTTTCCGTGCGG 0.875 +TGATCAAAGAAACTGCCGCTAAATACAAGTCTGAAGAGCCGGACGCGGAATAATCCGTCCTGCAATACTGACGGATGCGAACGGGCATCCGTCAGTCCTTTCTCTATCCCC 0.9540000000000001 +AGGTCGGATAAGGCGTTTACGCCGCATCCGACACGCATTGCCCGATGCCGCAAAGGCATAAAAAGTCGATGGCGTTGAATATTTTTTCAGCGCCATTTTTATTGATGCGCG 0.442 +CTTTGTTTATCAAGGGGTTTAGAGATAGTTATCAACTTTAAAATGGCCGGCGTGCCGAATTTGATTGTAGAAGACGAACTTTTGAAGTTCGTCTTCTTTTATTTTTCAGAG 0.996 +GCGCAAGAAATCGTTGAAAAAGGTTTACAAGAAGCAAAAGATCTTGCTGCAAAATTCTAATAAAGCCAAGACTCCTATCGTTACAAATAGGAGTCTTTTTGTATATACACA 0.988 +CAAAGTGTGCACAGGGCTACTCTGGCCGTTAATGCGCAAATTGTTCAGCCTACATTGATTGACTTTTTAAAGTAAGCGGCTCTTAGGAGTTCGCTTTTTTTATAGTTCAGG 0.9229999999999999 +ACAGCGTGTAGCTGCTCTGACTGATAAATTTCCTTTATATAAAGAATTAGATTATTAAGATCCTAAAACCCGCTTGGGCTTATGCCCGGCGGGTTTTTTGACGATGTTCTT 0.993 +GTTATTACTCGTACCTTCACTCAATCTATGTAATTAGATTTGACTGAAATCGTACAGTAAAAAGCGTACAATAAAGGCTCCACGAAAGTGGGGCCTTTTTTAGCGCGAGAG 0.985 +GGAAAAACAACACCCCCCGCTTGATAAAGAGACAATTGCGGGACTGCTTGAATTGTATGTTGTCAAAGGATTGTCAGCGAACTAGATAATCCTTTCTCTTTGCAAGAAAAT 0.526 +TGAAAAAACAAAAGGCAAAACTGAAATCAAGCAGGAATTACGCCAGCTGTAAGGAAAAGTTTGATAAAAAAGGCGTGGATAATATATCCACGCCTTATTCGTATAGGCAGG 0.81 +TTAATCTCTTTGTGATTATTTGTGTGTATACCATTGTATCTGTAGATACGAATTTACAGTTTGGCTTTTAAAAAAGACGCCTTTTCAGGCGTCTTTTTTCGTTATACCGGT 1.0 +GATAACGGCTGCGAAATTCTGACGCTACGCAAGGATGACACCATCCCGGCGATAATCTCGCACGACGAATAAGATGAAGCCGGCGAATGCCGGCTTTTTTAATGCGATAAT 0.6729999999999999 +AAGTGGTCCGCGTACCTTATGTTGGCGTAGATAAAGACAACCTGGCTGAATTCAGCAAGAAATAAGACTGATTGGGCGCAGTCTATTACTGCGCCCTTATTGTATGGCCAA 0.47 +AGAACCTGAAGGGATAGGCCTACCTAGAACCACATCGAGCAAGCCCTTTGATGGGCAACTTTTTGTACAGAGCCGGGGTGTTGGTAGCACCTCGGTCTTTTTTATTTTATG 0.983 +TCTCTTGCAACCCTGCAGGACGGCCTGCGCCGCATCAAAGCCGCATTATCGTAAATATAGCTGTAAACGCCTTTACGTCTTCATTTGTAAAGGCGTTCTTAATAAAGGAAT 0.973 +TTAAATGAAAAAGGATATATTGTTCCAGGTCTCGGAGATGCGGGTGACCGCATGTTTGGAACAAAATAAAAAATGAAATCCCCAAAAGGGGGTTTCATTTTTTTATCCAGT 0.982 +TTGCGCCCGAGTTTGTCGATATGTGGTATGAACGGTATTTGCGCTATACAAAATAATGGTTACGTAAAAACCTGTTGCGGGGTACAACAGGTTTTTCACATGGGCTCACAC 0.7609999999999999 +AAAAGAAAATACCTCTGTATCTTCGGGTGACACGGTAGCCATCATACCGCCGGTCAGCGGGGGATGAAAACCGGACGGCATTTCAGCCGTCCGGTTTTTCTGTATTTCATT 0.99 +CCGTGGTCTGGAAAAACGTGGCCTGCATAGCCGCGAGAAGAAAAAATCCTGATTCGGTGAGTTTTCGCTAAAATAAGGGCGGTCAGTTGACCGCCTTTTTTCTTTTCGTAG 0.8059999999999999 +GTGGTTGAAGCGCTGATTGTTGAAGATCAGGCAAGCAAGCTTCAGCAGTCGGAAGGTTAAGATGAAGACGATATTCGAAGCCCTGAAATGGGCTTCTTCTTATTTAACCGA 0.871 +CGGTCTGGCTGCTATCCTGACAGTTGTCACGCTGATTGGTGTCGTTACAATCTAACGCATCGCCAATGTAAATCCGGCCCGCCTATGGCGGGCCGTTTTGTATGGAAACCA 0.784 +CCGGGTAACACCATGTCAGAGCAGTTAGGCGACAAGTAAGTTCCCGCATCAGATGACTGTATTTGTACCGAAAACCCCGGGGCGTGCTCCGGGGTTTTTTCTTATCAATTC 0.731 +CGAAAAACAAACATACAGAAAACCTCAACAAGCTTTTACACGATGCGGTCGTACCGCTTGAGCAATAAATCATAACGGGCTGTCTGCAGCCCGTTATTTCTTTTTACGTGC 0.738 +GGGACAAAGCACAACGGCTGAGGTCACTGATGAAATCTGTTCGCGCTTAAGAAAGCTCTGATGAATCAGGCCGGTGGCAGATGGCTGCCCCGGCCTGTCCATTTCCTTACG 0.634 +AGAGCGGACGAAGATGATTTATGATTTTATTTTGCGTGAGCTCGGGGAACGGTATTAGAAAAATAGGAAGGAGCTGACCGAACAGGGCAGCTCCTTTCATAAAGCTATGCC 0.522 +AAAGCGGGATTGAGAAAGGCTTCGGGGATCTGGACTGGGCCGCACTGATTAAATGTATAAAATAAAGAAAAGCCTCTCCGTTTAAGGAGAGGTCTTTCTCTTTTACAAAGA 0.963 +AAGGGGAATGGCTATAAAAAAGCGACAGTTGAGCCGCTCAAAGATGATGACTTTTATTTTGAAGATGAAAAAGAGCAGATATAATATCTGCTCTTTTCGGCTTGAAGGTTC 0.72 +GGACTTTTTAGCAAAGAATCAGTAAAACAAGCACAGCAAGCAAATTAAAAGTCATAAAGTTGGGAAAGAAGGATGTTACTTATGTAGTATCCTTCTTTATTATTCCTTGAA 0.96 +TTTTCTTTTTATATTCCGACAAAACGGTAAAATCGAGTCTGAATTTGCCGAAGAATCTTGTTCCATAAGAAACACCCGCTGACTGAGCGGGTGTTTTTTTAATAGCCAACA 0.9890000000000001 +ATCGTGACCAACCCGAACGCGAAAAGCACCTGCGGTTGCGGTTCTTCCTTTAGTATCTAATCTGTTGTTCATGTGCCGGATAAACTATCCGGCACATCTTCTCTCTTCAAC 0.87 +TAAAGACGGTAACCGCGACAACGCCGTGGCTGACGGCCCGTATGTTGGGGTTAACGCCAGTTTCTGATCCCTTCCGGCGCGGCATTCTGTCGCGCCATTCAGTTTCTTACT 0.8740000000000001 +AAAAATCGGCAAATATGACATATCAGATGCAATTCCGGTGCCAGCTGCTCCAAATCCGATTTCCTTAAACTAAAAAGAGCCCTTTAAGGCTCTTTTTTAGTTGCTATTCAT 0.977 +GGCGATGCTGGGCTCATTTAGAAAAGATTAATGCCCAGGGGTTCAAAGCCCTCTTTCACCACATAAAAACGCCTGCCACATATCGGGGCAGGCGTTCATTTTGTTATACAC 0.878 +AACCTTGAAGATGGCGTGGCGTTTGCTATTGAGAAGTATGTGCTGAATTAATCTGTGGGCGGGCAAACAGCATAAACGCGTTTGCCCGCTTACTGATTATTGGTTTACTGC 0.441 +TATGGCTGCAAAAGCTGCAAGATCAAAGACGTTATGTGAAAGACGTTTGGACAGGAATGTAAAATATAAAATCCCGCCAATCTGATTGGCGGGATTGCTTTGCATATGAGA 0.9540000000000001 +AAGCGATCGCGCAGCTGCGCGTTATCGAGTTGACCAAAAAAGCGATGTAACACCGGCTTGAAAAGCACAAAAGCCAGTCTGGAAACAGGCTGGCTTTTTTTTGCGCGTGTG 0.998 +CCGCCAGCCTGTTCAGCTTCGCCAGTTTGTTCGTTAAGCAGCAATAATTACCCCGGTTGTCACCCGGATCATAGTCACTTGATGTGACTATGATCCGATTAATACTCTCTC 0.774 +CTCTTCGGGATTTTTATTCCGAAGTTTGTTGAGTTTACTTGACAGATTGCAGGTTTCGTCCTGTAATCAAGCACTAAGGGCGGCTACGGCCGCCCTTAATCAATGCAGCAA 0.764 +GTACATAGCAATTCATACCAAGAAAAAGTAACAGATGGGATCGAAAGCGGATTAGAGAAATACTTCCAATAAAAAAAGCTGCCTTTTGGCAGCTTTTTTTATTTTGAATCC 0.9990000000000001 +CTTAAAGCTTGTTAAAGCGATCGTCGCTCAGTACGAATAAATATGAATCAATGCAGGACCAAAAGACAGCTGTGTCTGATATCACACAGCTGTCTTTTTTTATGCCCAAAA 1.0 +AAACGGCTTGCTGAATCGGTGCAGACCGGAGAAATTTTTAAATTTATACCGGTCGAAACAAAGTAAAGCAGAAAACGCCTGGGAAACTAGGCGTTTTTTTGATGTAACTAC 1.0 +GTTTGGAAAACGCATGCTGATGATTATAAACCGGAGGATCTGCAGAACATCTCTTCCTCTATCGCAAAACGCCTGGCCTCATCATAAGATGAGGTTATTTTATTTTGTGAA 0.971 +TTCTGGCCGCGTATTTCAGCCACAAATGGGATGACTAATGAACGGAGATAATCCCTCACCTAACCGGCCCCTTGTTACAGTTGTGTACAAGGGGCCTGATTTTTATGACGG 0.841 +TTTTGCGACGAGAATTACGTGACGAAGCGCGCGGCAAAGTGGACAATAAAGCCTGAGCGTTAAGTCAGTCGTCAGACGCCGGTTAATCCGGCGTTTTTTTTGACGCCCACT 0.39399999999999996 +GGAAGAATTGCTGTTTCATTACGGGTATCGAAAAGAACAACCGGATTCATTTACAGGCAAATAAAAAAGAAGCTTTGCACATTGTGCGAAGCTTCTTTTTGTTTTACTTTT 0.995 +TTCGGCAACTTCCAAAAACTGAACAAAGATGACGTGCTTGCCATCCTTCGCGCGTCTCTATAAACCCAAAGGGCGGAATCCAGTCATTCCGCCCTTTCTTCTTGACTTGAT 0.968 +ACAGCTTAAATAGCAGGACCGGGGATCAATCGGGGAAAGTGTGAATAACTTTTCGGAAGTCATACACAGTCTGTCCACATGTGGATAGGCTGTGTTTCCTGTCTTTTTCAC 0.812 +CGGGATCCAGCTGCCGGAGGAAGAGATTCGGGCTGCGTTTGCGGGGTTGAGGAAGTAAGGGGTTATGGGCCGGATAAGGCGCAGCCGCATCCGGCCTGATATTTCACTCAA 0.5579999999999999 +GTCGCCTATATCGCTGATCCGGATGGACATTATATCGAAATTTGCAGTCCGATGGAATAACGCACAAAAGCCCAAAACAATGTCTGTTTTGGGCTTTCTCATGATGTTTTG 0.7879999999999999 +AGAGGCATCAAGTCTAGTGAAACAAGCGCCAAAGATGTGCTTAAAAAAGTGTATAATAAATAGGACAAATGAGAAGCGAGGGGAATCCTTGCTTCTTCGTTTTTGCAACCA 0.858 +AAAGCTATAAAACCTATTTTCTTGCCAACCTGCTTGAATCAAAGAAAAAACAATCTTAAATATGGCAAAGCCGCGCACGTTGATATGCGCGGCTTTTTTCTATTCAACCGG 1.0 +AGATGACTATAACGGAATCAGAAACTTTGTGATTAAACGAAAAGAGGATTCTTATTTATAAAAATAAGCAGCCGGACAGGCAGAGTTCCGGCTGTTTTTTTATTTCTTGAT 0.992 +CAGCCGATGCTGGAAGAACCGGAAGCCTTTGATCAAAGCTTCAGGAAATGGCTTGATCAATAAAAAACAGCCCGCAGATCAACATCCGCGGGCTGTTTCTGATTATAAGAC 0.7 +GCAGATGCCTCGCGACTATTCTGAAGGCGCATCCGGCCTGCTGCGTACTGGCGCGAAGCGCGACTAATTTATTTTTCGGGCGCAGCCATTGCGCCCTCCTCTTCTCTCCCT 0.924 +GGGTGGAAATCGGTACTCTTGTGGAACTGGCGCAATGGACGCTGTCAGCCGATAAAGTGCTCACATTTTAATGTTTGAGGGGGCGTCTGACGCCCCACTATGCGGGAAGTT 0.8370000000000001 +AAATAACCCAACCGGCAGAAAACGCCCCGCTGAAAAGTAATTCATAACCATCAGTCCTCAATGACGATTAAACACCATTGCCTGCGCAATGGTGTTTTTGTTTTTATCTGC 0.9329999999999999 +CCGAAAATATGCTCGCAGCGATCCAAGGGCAAACACCGAAAAATCTTACAAGAGAATTTCAATAAATCAAAAATCCGGTCTGATAACAGACCGGATTTCATATTACATACG 1.0 +AATCAGCAGCCTGAACAACAGCTACTGGAAGCCTCGTTACCTCGGTGCGAAAAGATTCTAATTTTTAGAGAAAACCCGTTCATTGGAACGGGTTTTTTTCATTAGACGGAG 1.0 +AAAACGATTCGGAAAATCATCGCGGTGCCTGGGAAGCTTGTCAATATTGTGGCAAACTAAGCCTAGAAAAAATCCCCTTTGCCAAAAGGGGATTTTTTTTCATCAGTCTCA 0.987 +GTGTTGACCAATATTGATAACAAACCTGCGCTGAGTGGTAACTAATAATAAAGTTGCTCTCAAAGACGTTAAAAAAGGTGCTCAATGAGCACCTTTTTTCTGTCTGTTATT 0.9129999999999999 +TACCTTCGCTTCACAGCCGAGAAAACCGAAAAAGAAAGCACAGTCAAATCTGACTGCGTAGCATCTGCAGGAGAATGGGTGAAAGCCTGTTCTCCTTTATTTGTGCTACCA 0.963 +ATGATAAAGCGTTTGTTCGGGCGCTGCGCAAAGCGTTCGACCTTCCGCATATTAAAAAAGCCAGTTAATATCATCAGGGAGCTAATCGGCTCCCTTTTTTTACCTTTAATT 0.578 +GCTCTTGAAGGAAGGCAGAAGGACAAAACAGTCCTGACAGGTGCCGCTTTTATAATAAACTGATGTAATGAAGCAGAGACGGTATAAGTCTCTGTTTTTTTATCGTGCGCA 0.987 +ATGCGAAAACGGGAGAAATTGTTGATGTCATTGATACAATCGCCTCTTAACACCGGACAAGCAAAAAATAGAAGCCGCTGCCTATGCAGCGGCTCTTTTTTATTATATCGC 0.948 +TTGGTATTGTGATTCTTGCCTTTGTTCCCGAGGAAACCAGCACAGATGTCAGCCAGCAGCAAAATGACCTGCCCCTTTAACTGGGAGCAGGTCATTCATCATTTATTCGCT 0.87 +TTATTCAGCTTGATGAGCTGATTACCGAAATGAAAAAGCATTTTTATAAATATCATGCGATGAATCGATAAAACCTCCTGTTAAAACGGGAGGTTTTTTATTGTGTACCAA 0.993 +GGCGAATGGCCGACTGGTCTGCGCTTAAGTCGCATTGGCGGCATCCATTAATAGCCATCTTCGTTCACGTTGTCGCCGTTATGGCGACAACGTGAATTATTTATGAGATAA 0.738 +TTTTGAGATTGAAATGCTGAAAGAAAAACAATTTGATGATACATTGCTTGAAAAGATTTTACAGCAATAAAAAATGGGGGATAACACCCCCATTTTTTAGCTTGTAAATTG 0.91 +ATTAGTGAATATTATCGGAGTCTGGGAGCGAGTTGGCCTGACTCCGGCAAACGGCCTTGCCAAAGAGGGCGGAGCGAGCTTCATATCTGTCCTCGTTTTTTTCTGTGTCAA 0.322 +GAGAATTTTTGTCACAACTGAAGGCTGGGTTCCTATTTATGAAAAAGACAGCGTGAATCAATAAGCGTGAGATAGCCCCGTTCATCGAACGGGGCTTTTTATATTTATTGC 0.982 +CTTTATGAAGCAAAAGCCGCGTACGTATTTAAATGACTACGTGAAAGAAAAGGGCTTTAATAAAAACTAAAGCAAACCTCTCCGTCTGGAGGGGTTTTTATGTTTCACGTG 0.9520000000000001 +TAATTTATAGCTGAAAAAGGACCTGACTTGGTTCTTTCGAATAGAAGCGCTATAATGAAAGCGGACAAGGGAAGGGGACGGACTCCCTTTCCCTTTTTCCATGAAGACCGG 0.69 +GCTTCCGGTTGAGACATTTTTACAAAATACGTATCGGTTTTACTTTCATCAGGTGATGAGAGAATCCAGCTAAAGCTTCTTTCAACAGAGAAGCTTTTTTTTGTTGGCAGG 1.0 +GCCTTGGACTAGGGATAAAATGTCCAGATACGCAAAAATTGCGCTTGGCATTCAATAAAACAAAAAAGGACAGCCTGACATGTGAGGCTGTCCTTTTTTACAGCTGAAAGG 0.917 +CGTTATGAAAAATATAAAGCAATTGGCAAAGTCTCGGTTGAAGATCAATATATCGGGGTAAACTAAATAAACGTGAAGCCTTTGGCTCACGTTTATTTTTTGTGACATTCC 0.988 +TAAAAGCCAGCCAGCCGTCGAACATCGCGTCGCAGGCTGAAGAAACGCCACCGCCACATTATTGAGGCGTAAAAAAAGCGGGGATTCCCCGCTTTTTTGTCACTTTTTCGG 0.6809999999999999 +TGCCAGGCGGGTACGCCAGGCGAAAACCGCTTTGGACCAGATCCAAAACTGGAACCTTAAAACAAATCTGCTCGCCGGATGTCTGACTCATCCGGCTTTTATCATTATGTT 0.943 +TGACAGAGGAGCAATTTAACGAAATGGTTAAGCCGGAAGACATGGTAAAACCAAAGGCGTAATAGGAAGAACGGCTGCTTTTTAAGCAGCCGTTCTTCTAAATATGTTGCT 0.973 +GATTACGCTCATCGCAAAACTTGAGCATAATATCATTGAGTTACAGGCCAAAGGGTGAAATGAAGGGGGCGTGTGGTCAGTAACCTGACCACACGCAAACTTATTTAGCGC 0.779 +CGTTGAGCAAGGTGCTCGTGGAGCTCAAGCTGCTAACGTTCAAAAAGCTTAATCTTCAATCGTTTATACAAACAGGCTCTTTATATAGGGCCTGTTTTTTTATGTCTATAT 0.998 +CATTCGAACCGTAATTCGTTACTGATTTCCCGCAGGTTTACCCCGTCCACTTCAGGCGGGGGTTCTTAATACTCTCCCCGGGCAGCCGTCCGGGGGATTAACCCTGAGATA 0.5529999999999999 +CTTAATTCGCACGCCGAATGATGAAACGATTGTACAGCTTATCCTTCCTGTCAGAACCTATTAATCCGATACACTGCTGCCGACCCGTCGGCAGCTTTTCTATTCGGTATC 0.8320000000000001 +AATGACAGCAGCACAAGCTCAAACAACAGTACAACGAATTCAACAACTGATTCATCCTATTAAAAAATGCCCGGTCCTTTTAGAGGATCGGGCATTTTTGCGCAGAAAAAA 0.995 +GCTTGTCAGGAGGCAGGGGAAGGCGAATGTGACAGCTGAGGGTACGGATATTGAACAAATACAATAAAAAATGTAAAAAGGCCTATGCGGCCTTTTTTTGTTTTAGGTCAA 1.0 +TGACGTTGGCGCAGCCAGCAGCATTTATGAAGAGTATTATCAGAAGCTGAAAGCAGGACGTTAATTCAGCAAGCGGGCGGTGGTATTACCGCCCGTTTTTTAGTTAGCAAG 0.863 +TTTACCAGGATACAGTGCTTCCTCACGTAAAAGAAGCAGTATTAACCAAATAAAACTTGAAAAATGACATAAAGGCAGCGCAGTTCGGCTGCCTTTCTCTTTCTGCCCTCG 0.33899999999999997 +TGTTCCGCGCGCAGTTTATCCGCTCCCAGACCGGCTTTGGCGTTGAGCAAGGCCGCGCGCATTAATCTCTCCCCTTCCCCGGTCGTCTGACCGGGGAATACTCTTCCTCTC 0.953 +TGCGTAAAGCACGTCGTCGTCCGCAGTTCTCCAAACGTTAATTGGCTTCTGCTCCGGCAGAAAACAATTTTCGAAAAAACCCGCTTCGGCGGGTTTTTTTATAGCTAAAAT 0.9940000000000001 +AGACAATTTGTCAGCGTTTGAAACGATTCATTATATTCATGCGGTAAAAAAATAATTTTCAAATCATCAGAAGCCTCTCAATGTTTGAGAGGCTTTTTATTGAACCCGCAT 0.941 +ATTGAAAAATATGGTTGAAGCCGCAAAACAGGCAAGAGCACAGCAGACACAGCTAGTATAATTTGAAAAAACCATCTGCATTTGGCAGATGGTTTTTTTCTATAATACAGC 0.9890000000000001 +AAACACGCTCCAATAATGAACATGGCGTTGCACATGCGATTCACGAGCTTGTTTTGGCTAAATAATAAAAAATCCAGCCTTCTAAAGGCTGGATTTTTTCGTTTTATTTGA 0.998 +TTCGTGCCGGCGAGTTTAAAGTAAAATCAGACATGAGTTTCAAAAAGATCGTAAAAACGTTAACACGATAAAAAAACAGGCTGACTCAGCCTGTTTTTTTCATCCAAACTT 0.996 +GTGAGATTTCTAAATGGGCGACTGAATGGATTGACCCTTCCTTTCTTGACTAATTGCTAAAATGGTTCTGATCTCAAAGACGAGAAAAGAACCGTTTTTTATTTTCCAGGT 0.9570000000000001 +GGTGGATTTAAGGAATCGACAGATCGAAATACGACGTATATCGATCATTCCCCTTACAAACTTAGTGATCAGAAGAAAGCCCTTAGCTAGGGCTTTTTCTTGCTTTACGGA 0.7140000000000001 +AGGAAGTGGAGTAAAAATTACGGATGGCAGAGTATCGCCATCCGAATTCACTTAATCGTTCTGTGCCGTCTGCCCCGCCGCCGCCATTTGGGCGGCTTTTTGTTTTTTATA 0.961 +TCCTGAAAAGTATGTTCAATAACTATTGCAAGACGTTCAAAGACGTACCGCCAGGCAATATGTTCCGATAACAAAAAACCTGCTCCGGCAGGTTTTTTTGTGTCCTGATGA 0.45899999999999996 +ACTGCAAAAAGAATTGCTGCTGCTTTAGATTGTAATGTCGAAGACCTCTATGAATTCAAGGTTGAAAGGCATTGAAACGACTAAGACTTTGGTCTTGGTCTCCTCCTTAGT 0.861 +CCAATATGAACAACTCTTACTGGAAACAGCGCTATCTCGGTGCAAAACGATATTTCTAAAAACAGAAACTGTGCGGCCTTAACGGCTGTACAGTTTTTTATTAGAGCCTAA 0.973 +AAATTGATCAAGGGGCATTTCATGATCAATCCAGAGTGATTGAAGCTGAAGAAGCGTAACATATGAAAACGTGTAATCCAAGAGGATTACACGTTTTTTTAACGGCCAGGC 0.98 +TCACAAGCTGTCACAGACCAGTTCAACGACTACCAAGATACAATGTCGCAATAAAGATCGAAAGGAGGAGGAAGGGGCTGTCCCTTTTTCCTCTTTTCTCCTTTTAAGCGG 0.614 +CCCTGATTGCTTTTATCCTGATTATCATTTTTATGTGCCTGCTGAAAGGCGAAAAATCTGAATAACAAAAAAAGCGCCCGCGTATGCCGGGCGCTTTTTTAATTTCGCAAA 0.978 +AAACTGCTGAAATGTTCAGAGTTTGGTGACGCGATCATCGAAAACATGTAATGCCGTAGTTTGTTAAATTTATTAACGGGAGCGTAACGCTCCCGTTGTTTTTTGTTAGGC 0.99 +TTTTGCTATTTTGAATAAATGACACTGTGCTGTTACAGTAAAAGTGTTCGTTGAATACATCCCTTCTCGTTTGGACCGGCTGCTGACCCAGCCGGTTTTTTTATGTAAATC 0.985 +AATTCCTTTATGAAAAAACGAAACGTAAACCGATGATCATCCCAATTATTATGGAAGTATAATGACTGACTAAAGACCGGAGCTGCTCCGGTCTTTATTTTTTGTCCTTCA 0.993 +CGCTTTAGTAAGTTATCATAACTGCCACTGGTCCATCCACAAACGCCACTGAACGCAAGCTAGCTACAGACACGCTCATCACTATGACGTGTCTGTATATTAATAAGCTAA 0.903 +TGTAATGCTTTTCGGTTATGATTAATAAAGTAATTGACCCCCTTTTTATAATGAATACACTTTTTTGGCACGGATTCTCCAGTTATCCGTGTCCTTTTTTTGTCCATAAAT 0.98 +AACAGCCATACCATCCGCGACGTTATTCTGTTCCCGGCGATGCGTCCGGTAAAATAAGCATTACGTTATGCTCACAACCCCGGCAAATGTCGGGGTTTTTTTATTTAAGCT 0.993 +GCCAGCAGCCAGCTTACCATAAACCGGTCTTTAAAAAACTTGGCGAGAGTTACTTTTAAAATAAAAACAGCCGTGCCTCTTTCTGGCAACGGCTGTTTTTATTTATTCAAT 0.975 +TAACGAGATGGGGATCGTGCTGGAAGATGGCCCGCAAGGGACCACCTGGCGTCGTAAGTAATTGCGCTATTGCCGGATGCGAGTTTTCGCATCCGGTTATCGTCTGCGCCA 0.7020000000000001 +CTCTGAGGAAGCCTCCTACATCACGGGACAGCAAATTTCCGCTGACGGCGGCAGCATGAAAAGCATATAAAAACATCCCGCTTTTTAGCGGGATGTTTCTTGTAATGACCC 0.851 +CAATTGTTTACCGAAGACTATGCATTAGCACTTATCAGCCCTTCCGGCAATATGCCGTCTTAAAAAGGAAAGCCTGCCCCATAATGGAGCAGGCATTTTTTAATCCCTTTC 0.9620000000000001 +GATCAGTTGGCTGAAACTGCTCAATACCACGGTATCAACTCTTTCTACAACTTAAACAAGTAATAAAAAGACCGCCGGGATTTTCTCTCGGCGGCTTTTTTATGCTTTCAG 0.9990000000000001 +AGCAGTATGCCATTTTTGTAATGAAAAGTACTTATTTACAAAAGAAGAGCTGGAAGGGCTTCGTGACCAAACTACCCGCTAAGCTCTTTAGCGGGTTTTTAATTTGAGAAA 0.7659999999999999 +CTGTTTGAACTACTTCGTTTTGTGATGATTTTCTTAAACGGTCGTGATGAGAATTAACGTTAAGGAGGTGGCCCCTTACCATTTATGGTAAGGGCTTTTTTTGCTTATTTT 0.9640000000000001 +TTTTGCGGCGAAAAAATAATCATTTGCCGCCTGCTGCAATGAGGCGTATAGGCCGCATATCAGCTTAAAAAATGAACCATCGCCAACGGCGGTGGTTTTTTTGTGATCAAT 0.997 +TTGTGCTACATCGCCATGCATATTCAGCGGTTCTACCAACGGTCAGTCGCACGCTGAGACAAACAAAAAACGCTTTTGATCATCTCAAAAGCGTTTTTTTATCTGATTTAT 0.856 +TTAATAATTTCGTAAATGCCGCGTGCAAACGTGGCATCCGTGCGCCAGAATTTGTACAATGCAGCGCCCCCGGACGAGCAGCCGCTCACCGGGGCGTTTCTTTTTTCAACC 0.746 +TGAGGAGTAAGAGACCCGGCGGGGGAGAAATCCCTCGCCACCTCTGATGTGGCAGGCATCCTCAACGCACCCGCACTTAACCCGCTTCGGCGGGTTTTTGTTTTTATTTTC 0.927 +GTGTGGATGGATCAGGCCGAAAAGGCAACTGGTGATTTTTATTGATTGCTGAGTTTTGAATACATTGAAAACCGCCAGCGGAAACGCTGGCGGTTTTTTGTATCAGGAAGA 0.9259999999999999 +ATGTTTTAGATGTTCTTCAGCAAATGATCCATCATACACAGCATGATTGAAAAAGACATCCTTGCTTAACTCGCTGTTAGCGGGCAAGGATGTCTTTTTCTTTATTGTATT 0.956 +CACTTTACGATGCCAATAAAAACAAAGACCGCGATAAAGATCTTATCGCCTGTGAACGGTAAATAAGAAAAAGCAATCTGGTGTCCAGATTGCTTTTTTTATGCTTAGGCC 0.9390000000000001 +ATTATCCCACAACTGCTGGATGAACAGAGAGTACAGGCTGTTTAACTGATACGGTGAGGCGCAACATTCCAGCAGCGGTAACGACGTACCGCTGCTTTTTTTTGCCCCAAT 0.847 +AATCTGGATAACCCTGATTTTGCGAAAGATATCGAAACAACTGAAGGAAGCCCTGAATAAAAGAAAAAAAGAAAGCCCCTTTTAGCAGGGCTTTCTTTTTATTTGGCTCTT 1.0 +CAGAATATGAAGAAAAATTCTCTGCACTCGGCCAGCCGATCTATCGGGCTGAGGTTGAATGGAGAACGTAAAGACAGACTCTGACAGGAGTCTGTTTTTTTTATGGGCCGT 0.9940000000000001 +TTTGCGCCGCAAAAAAACGTCCAGCTTGAACTCGAATAGATCAAAAAATGCCATGTCAAGACATGAGGAAAGGCTGTCGGGGGTTCCCGGCGGCCATTTTTAACATGAATC 0.726 +AAAGATCTTTCTAAAGAGGAAATGGTGACAGTAGCGAAAAGCATGCAGGGACAATCATCGAAATAACCGCCAAAGGCCAAACATGATTTGGCCTTTTTTTCGTTAGACATC 0.927 +GCCTTTGAACTGAAACAAGTTCAAGCTTTTGTACAAGTGATAAAGGATATGCTGGATCAGGAAAATCCATTTTAAAGACAGCGAGGTGCTGTCTTTTTTTTATTTATCTGT 0.9940000000000001 +TCTTTTAAAGAAGATAGAGAGAGATTACGTGTGAATACTAAAAAAAAATAAAGAATAACTAATGTTTTTTAACGTCCAGTTTTAATTGCTGGGCGTTTTACTTATTGAGAT 0.987 +GATATCCTGAAGGAATCAGAAATGATGACCGCCAAACAAAAAGCCAAAGTGCTATCGTAATGGCGTACTTGAGAGCATACGAAAATCGTGTGCTCTTTTTATTTATATTCA 0.96 +AAGCGCACCCTGTTGATAACCGGCGTAACGGATTTTTTGGAAATTCGGCGTGACGGCTATGAACGGAAAAAAGCTGGCTCTTTTAGAGCCAGCTTTTTTAGTCATGATTAT 0.988 +CCTCCCATCAACGGTTTCTGTCCCTTACCCGGTTTGTGTAAAGTTTTTACATCAAGTATAAACAAATGGAGTGGCTCTAGTAAAAAGGAGCCACTCTTTTTATTTATCTTA 0.998 +ACGTTTTTGATGAGGTATTTGAAAGAACCCTGAGAAAATATGAACTGCTTACAGAACAGGTTGGTAAACAAACATGAATCCTTGAAAGAGGATTCTTTTTTTATCACTGAA 0.98 +GTTAAGAAAGACAAAGAAATCAAAGACAAAACCGTAATCAAGGTTAAGTTTAAAAATCCTGATTAAAAAGAAAGCCGTCGTTATGCAGGCTTTCTTTTTTTATGCCTTCAG 0.995 +ATTCAACTGGCAGATGAGCCCGTGTAATCTGAAGATGGCAAAAGCTATCTGATGAGTTCCAAATTATGCCCAATGAGGTAGATTCATTGGGCATAATTGCTCATCAGGCAG 0.6509999999999999 +TGGTGAATTGAACAAAATCATTATGAAGCCTTTAGAAGAGCTTTACGGCAGTGTTGAAGGCTAATAAGCAAAATCCCTTCTTGGATAAGAAGGGATTTTTTATTCACTCAG 0.99 +GGCAGGTTATAAAGCCTTCAGGCATTACAATCCCCTTCACGCCAAAGAAGCTGAAAGAGATCACGCGTAAGGGGCGCCTGAACGTCCCTTAGCTTCTTTCCATCAGCTGAA 0.355 +GCTTGACCATCTTCTCATACAATATCAAAAGCAGCGCCTTCGAGCAGTAGCGGGTGATGAATGATTGCAAAATAAAAAACCCGTATAAGGGTTTTTTATTTTATAACCATG 0.892 +GTATCCGTAAGTTTGCTATTGACCAGGAAAAACTGGAAAAAATGATCGGCGATCTGCTGTAATCATTCTTAGCGTGACCGGGAAGTCGGTCACGCTACCTCTTCTGAAGCC 0.9009999999999999 +CATTGATAACCGCTCACTGTTTATGATTGGCGGACTCGGACTGACATTGAAAAAAGGCAAGTTCAAGTTTTTCTAAAAACCCGCGGAAGCGGGTTTTCTTTATTAAAAAAC 0.9 +TCGGTGCTGTCGTGATGGCAATTATCCTCGGTATCGTCAAAAAACCTGTTACAGAAAAATAAGAAAAAAAGCGTCCTTGGTTCGAAAGGACGCTTTTTCTTATCCGGCCGG 0.972 +CTGTTCTGAATGATCAAAACAAAATTGAACAAATGCTGGGGAACGAACCAGCCCGTACAATTTTGTAGAAGAAAAGCCCCCTTTATCGGGGGTTTTCTTTTAAGATTTTGA 0.6940000000000001 +ACAGGACTCGTGAATGAAAAATTTAAGCCGTTTACAAAGTTTTATAAACGGCTGGTGAAAAGGAATTAAAAACTCCGGCTTATGTGCCGGAGTTTTTTTCTGCGCAAAAAT 0.993 +AAAACAGGTATCGAAGTGCCCGTTTTCTTATTTAAAGAACATGTTCAACAAAATGAAATAAATCAAGGACTGGCAGGGCGATCTTTATGACCCTGCTTTTTTTGATAGATC 0.871 +TATTCGTAGCGTTGACGTAGGCACCTGGATTGCCGGTGTTGGTTACCGCTTCTAATCACTTTGGTGATATAAAAAATCCGCCTCTCGGGGCGGATTTTTGTTTTTAAGGTT 0.98 +TTAGAGCATAAGGCTTCTTAAACATGAGCTTATTAAGTGGTCATTAAATCAAACGTCTTTTATTTATTAGTTTGCGCTGATAAATAGGAGGCGTTTTGTTTTGGGGACATT 0.958 +GCTGTTTACGGCAGTGACTTGGCCGTTTATTTCTGAATATATTACGGCGTCGTTGATTCCGCTTTAATGAAAAAACCGGCTAATCCTAGCCGGTTTTTTTATGTCACACTT 0.9940000000000001 +TTGACGGAATTACGGCAGACGGGAAAAGAGAGCCTATCTTCAGAAACGGAAATTGGGCTTTTTAAGTAAATAAAAGGAGGCTTTTCCGCCTCCTTTTTTTATTTTTCTGCT 0.975 +AGATTGACGTTTCACGGCATTTGCCACCGCTGTAACGGAAAAGAAACTGAATAGACGGTGCCGAGCGCGAACCTTTTCTCATGGGGAAAGGGTTTTTTGCTGTTTCTATGT 0.975 +GTTTTTGTTCAGCAGTATGCTGATACAGTGAAATATCTGTCCGAGAAAAAATAATTCAATGTAAATTACATAAAGCCCGTGAATATTCACGGGCTTTTTTTATTATTTAAT 1.0 +CCGCTTATCGGAAAGCACAGAAAAGCAGTGCTGCCGATCGGGACGTGTCTTCTGGGCAATAATAAAAAATCCATTTCTCATGCGAGAAATGGATTTTCTTTATTTTTGGTT 0.996 +AATGACAAAATCAGACATGGCCGACCAGATGAAGAACATAAACATGAACAATAAGCTCAGCGCCGAAAGAGGCAGATCAACAACATCTGCCTCTTTCAAGTGCCTTTATAA 0.8859999999999999 +AGCAGAAAGTGGAAAAAGAAGGGGCGGATCAGCCGTCAATTCTGCCGTAATCAAGTTCCCGTTGATGTTGAAAATCCCGGTCAGAAGATCGGGATTTTTTTTGATGTATGG 0.9440000000000001 +GGCATACGTACCGAAAAACAGTGCAGGAGTTTTTAGACAACATGAATGATTCAACAGAATAAAAAAGCGAAAGGCCTCTTCGGCTCTTTCGCTTTTTTATGTTACAGAACG 0.9670000000000001 +AAGTTCGCAAAGCAATTGATTTTCTTGTAGAAAACCGTTAATATGGTGCATAATAATAGTACTAATAAAACAGGGCGCGAATCATTCGTGCCTTGTTTTGTACAATTATGA 0.787 +TCTTATCATGCCTACATAGCACTGCCACGTATGTTTACACCGCATCCGGCATAAAAACACGCGCACTTTGCTACGGCTTCCCTATCGGGAGGCCGTTTTTTTGCCTTTCAC 0.995 +TATATGACGAGCAAGAGAGGCTGATCTGTATATCCAGATGCACGCTGGCTGTCATCAAGAAATAAAAAAACAGCCGGAACTCTGCCTGTCCGGCTGCTTATTTTTATAAAT 0.8340000000000001 +CTGGACATGGTACAAGATGAAGTCTGCGGGGAAAAGTGTACTCGATGCATTCAATCCGAACGGAGAAGAGTAAAGCGCGTTAGCCGCTTTGCTCTTTTTTTGCGGGCTGAG 0.971 +GCGAAAAAGCACCGTACAACGGTCGTAAAGAGATCAAAGGCCAGGCATAAGACTGACTTTATCTGATTTAAAAATGCCAGCCTCCGGGCTGGCATTTTTTATGGTTGAGTT 0.97 +TCGTCAACATGGTGGCGCTGGCCGTGGTAGAAGCGCAAACCCAACCGCTGTAATTTTTTTTAACTCTCACGCTTACCCTGAATATTCAGGGTAAGCAGTTTAGCTGCAATA 0.9890000000000001 +GCATACCGTCAGGCACATGAGCGTAAAGCCAGCTGAAAACCGCTGCTTAATTTACTGCCTTAATCAAGAAACCGAAGTTGTAGCAGGCTTCGGTTTTTATTTTTCCCTGCT 0.9229999999999999 +TGAACGCCGCCAACGGCGATTCGAATACAAAGCTGCCGCTGTCTGTACGAACCCCTGCCCATTGGAAATAAAAAAGGACCCGCTAACGGTCCTTTTTTACTGATCAATTCA 0.915 +TATTCTCACTAATTTTTATCAAGCTTAAGCCATGGATTGAGCAGCAGCGAAGTGCACATATCCATTAAATAAAAAAGCTCCCGTTCGGGAGCTTTTTTTAGTTACTTCTAT 0.9690000000000001 +AAGCTTAATCAGGAGCTGGAATCTCTTTTGTCCGAATGGGAAGAACTATCCACAGAAGAAGATTAAAAAAGTCAAGCACCTCTTTTTAGAGGTGCTTTTTATCCACAGAAC 0.955 +AAGTATCAGTCAAAGATATTGCGGATGCGATAGAAGATCAGGGCTATGACGTAGCCAAGTGATTCAAGGTATCGCGCCTTTAGGGAGGCGCGATATTTCTTTCATTTTTAT 0.539 +TTCCTGTGGTTTATCGAGTCTAACATCGAATAAATCCATCGCTGATGGTGCAGAACTTTAGTACCCGATAAAAGCGGCTTCCTGACAGGAGGCCGTTTTGTTTTGCAGCCC 0.992 +GCTTGGATACCCATTAAGCAAAGGTAAAGTAGCGGGGCATTACAGCTATTAATGTACATATATAATCTGGAAAGCAAGAAGTCACATTCTTGCTTTTTCTATTGGTGGAGA 0.92 +TCCTTCCCTTAACAAACTCAGCCAGTCACATGCATTAATCTTGATGTTAATGATATATAGGTGCACATAAAAAGCTACTTCATTGTGAAGTAGCTCCCTTTATTAAAGGCC 0.863 +TGGAAGTGATGCGCTACACCCAGCAGCATGAGAGCGATGAATTGATTTTGCCGCCGCTGGCGGAAGCATAAAAAAATGGCGCCGATGGGCGCCATTTTTCACTGCGGCAAG 0.927 +AGAAATTTACTTGACCATTCAGGAAGAAAATAACCGTGCAGCAGCGTTATCCAGCGATGTGATCTCCGCATTATCCTCACAAAAAAAGTGAGGATTTTTTTATTTTTGTAT 0.97 +TCCGCTACATGAAGACGCCGACTTCCGCAACGCGTTAATATAACGACTGCGGTACAGGTCAATAAAGCCACCGCATCCTCAGGGATGTCGGTGGTTTTCTTTTTCTATAAG 0.861 +CATATATGCAGGAAGAACTAAAAGCACTGGTCTGAGTTAAATTTATATCAGCATAAATGGGTCAGGACGCTTTTAATCACATATAAAAAGCGTCCTTTTTCTCACAATCAA 0.867 +GCGGATTTTTTGCCGGAATTTGGGGAAGCATTGTTGATACGGTAACCGGCTGGTTTTAATCAATTGAAAGAGCTCTGATGGATGTTAGGGCTCTTTCGCGTAATATACTGA 0.488 +CGATGACGCCGGCTAACTATATTGGTCGAGCTATCACGATGGTTGATGAGCTGAAATAAACCTCGTATCAGTGCCGGATGGCGATGCTGTCCGGCCTGCTTATTAAGATTA 0.9309999999999999 +GTATTTACAAAAACGCTAAAAGAGCATAATAAAGCAAAAGAAAAATACATTTCCTCAAAAAATGAGAAATAAAAGGAGAGCAAAAAGCTCTCCTTTTGTTCGCTTTCCGCA 0.9670000000000001 +TTGGTCCGTACCAGAATGTGCCGTCAAAACCGGTAGTTATCCTTTCCGCTAAAGTCCTGCCGTAATGATTTCTCGCGCGGGCAATCTTGCCCGCGCTTCTGCTCTCCCGGC 0.669 +GGGCATTTGTCGGCGGAAATCCGATGAGGATCATCTATACGAAGGAAGAAATGCAAGAAAGATTGAAAAAGTCCGCTGAATAACATCAGCGGACTTTTTTTGTTAAAATTG 0.9890000000000001 +TCCTTTTCTTATCGAATAATAGAAAGTCCAGGAATTAACTGAATCAGTAAAATAGAGCTCCATTTCTGTGTTATTCGTCCGGCATCAGCTGGGCTTTTTTTATAAAGGAGG 0.392 +AATCATGACTGTGAGCATTTGAAGCATGTGCTTGAAGAAGGTAAAGAGAATTGATTTGTATGATCCGGTACCAAGGGAGAGCAGAACCCTTGGTATTTTTTTATGTATTCA 0.915 +ATTCACCTGAGCTCATTGAATCCCATTAAGTACCACTGCCATATCGCTTTATATATCACCTTCGCTTAGCTAATATGTTCTAAGTAGGAGGTGATATTTTGTTTGTATCGC 0.9520000000000001 +AACCAGTATATTCAACAGGGGGCTATTATGCGAATTGAAGTCACCATAGCGAAAACTTCTCCATTGCCAGCTGGGGCTATTGACGCCCTGGCTGGCGAACTTTCCCGCCGT 0.631 +TTTAGAAGACACGGAACCTGATGAAGATGGTTTTCTTACGCTGATTCTTCGCAACTGATCTCATCGAAACACAAAAGGAGGGCAAAAAGCCCTCCCGTTTTCATCTTCATT 0.657 +AGCAGCAGTCGATCGCAATGTACCAAAGCGTGAAGTGTACGATGCTTACCATATTAAGCAATAAAAAAACGTTCTTGTGTCATAACAAGAACGTTTTTTGTACAAGAAATG 0.892 +TGCTGTTTTGGTTTACCTTTGTTATCGGTATTCTGTTCATATTTATCAAGCTTTATAATAGCATAATGGCTAGCTTACCTGTGTAAGCTAGCCATTTCAGCGTTTGGTGGT 0.6970000000000001 +AAAAAATGGCGAAGAAGCTTGGTGTTGAGCTGAGTGAAGCTGTGTATGCAGGCGGAAAACTGAAGACGAAATAAAAAGAAGGGCGAAGAGCCCTTTTTTTGTTGTAAAATA 0.97 +GCCGCAACATCGGTGAAAGACGCCAACTTCGTCGAAGAAGTTGAAGAAGAGTAGTCCTTTATATTGAGTGTATCGCCAACGCGCCTTCGGGCGCGTTTTTTGTTGACAGCG 0.898 +TTCGCAAAATGGCTGAAGCTTCGCGTTTTAGAGGCTGGATATAAGTTGATGATTTGCATAAAAATAAAAAATCTCCTATGATAAAATAGGAGATTTTTTTATTCGGAAATA 0.95 +AGATAGAATGTTATTGCTTGACAAACCTGTCCTTTTATTAGAAGTTAATACTAATAAAACTTTTTATTGATCATTCAAAAAACTTTTTTGGATGATTCTTTAATACATATT 0.01 +GTGAAGTGGATCGATATTTGATGGGGAGTTCAGTTCTTCCGTAAATAAAGGGGATTATCATATAAGAGAAACCGGTCTGGCTGCCAGCCGGTTTCTTTTTTTATTCCATTA 0.91 +ATTTAAACAACAGGGGAAAATCGATGGCCATTATGTAGGTGTCATGGCAATGGAAAAAATACTTGCAATGTAAACAGGCCTCTAAAGAGACCTGTTTTTTAATATCCAGAT 0.982 +GATACCTGGATTGAACAGCCGTGGGATATGGGCGTCGTATTCGTCCCGCCAATCTCCGGTCGCTAATCTTTTCAACGCCTGGCACTGCCGGGCGTTGTTCTTTTTAACTTC 0.649 +TTTCATGAACACACCTTTATATAAAGCATTAATTCAACATGCGAGAAGAAATTCTCATTCATTTCATGTTCCGGGACATCACAATGGAGATGTCTTTTTTGATGACGCTAA 0.402 +CCACGCCAAACGCGGTAGTCACTAAAGGGCATCGCTTTGATATCTATGCAGGTACGCCAGTGTAAAACTAAAACAGGGTAGCGATAACGCTACCCTGCTTTCTGCAAGAAT 0.635 +CGACTGCAAAACGCTGGCTGAAATTCGCGAAAAACAGATGGCTGGCTAATTACAGCCGTTCCATCACGTTTACCACAGGCGGGAAATGCTCCCGCCTTATTTTTTGTTCAA 0.892 +AATGAAAATCTGACCGGATGTAACGGTTGATAAGAAAATTATAACGGCAGTGAAAATTCGCAGCAAAAGTAATTTGTTGCGAATCTTCCTGCCGTTGTTTTATATAAACCA 0.765 +GCAAGCTGAAGAGACACAAGAAGAACTCGAATTTGAAGAATAAAAATAAAATAAGTTTCAAATGATACAAAAGGCTGAGTGAAAAACTCAGCTTTTTTGTATTTTGCAAAA 0.9990000000000001 +TACCTTGTGCGCATTGCCGGATGCGATGCTGGCGCATCTTATCCGGCCTACGGGTGCCGAACGTAGGTCGGATAAGGCGTTCACGCCGCATCCGACATTAATGGCACGTTT 0.52 +TAAACCTGGTAAAGAACTGCGCGATCGCGCCAATATTTACGGTTAAGTTTTTTACTCAAACTTGAACGAGAGAAAAGCACCTGTCGGGTGCTTTTTTCATTTCTCTAATCT 0.99 +AGAAAGAAAAGCCGTTCAAGCGTTATACCAGGAATTTTTCGCGGGCGTGCTGATCTCTTAATCGTACATAAATAGCGGGCGGCAGCGCCCGCTATTTTTTTATATCACCTT 0.7809999999999999 +TTACACACATTTGCTGTGTAAAACGAGGGGTTTTCCGCAGGCAGGAGAGCAAAATGCCACGCTCTGTTCGTTGATAAAAGGCCGCATAGCGGCCTTTTTCCTTTCTTTACA 0.983 +ATCGCACTCGATCTGGCGAAGCTTGAAAAAGCCAGACCCGCGGAACAACCCGCTCCCGTCAAGTAATATCAATCAGGCACAAGAAATTGTGCCTGATTTTTTAACAGCGAC 0.628 +GTGCTGTGAAATGGCATTTGGAAGACCGTGTCATCGTTCATGAAAATAAAACAATCGTCTTTAACTAGACTGCAAGAGGCCCGCGCAATGCGGGCTATTTTTGATGACAAA 0.976 +GTAACTTCATTGCCGGTCGATCGTCATTACGATGTGTGAAAAAACACATCCGGTCACCGGGCAACCCGAAAGGAATACGCAGACGTATTCCTTTTTTGTTGTAAGTGAGAC 0.748 +AAAACGGGCGAAGCCAATTTTTGTCCGAATTGCGGCCAGAAGCTTGTATAAGAATCATACAGTGAAAAGTCCGGAGTGATCAGCACTCGGGACTTTTTTATTTAGGAGAAT 0.975 +CGAACCGCCAAAGGCTTGATGCGCGATATGTCCTCCTGACCCATCTCACGTTACAATCCGTGGTTATGTTAAACGCCCTTCTCCGTGTGAGAGGGCCTTGATCAGCCAGGT 0.342 +TTCGTTGATTACGGCGGTACGACAGGTTATATGAAAACAAAGGATTTACACATGACAAAATAACTGAGAGGGATTCCGCATAAATGCGGAATCCCTTTTATTATGAATTGA 0.9990000000000001 +TGGGCGAAGCTCCGCCATGTGGTTGAGATCGCTAATGATGTTTGGAAATAAGCAAAGAAAAAAACACACTTGATTCCCTAGCGGAGCAAGTGTGTTTTTTATGTCATTGTG 0.995 +AGGCAAAAGCGGCGAAGGTCTGTATTTTATCGACAAAGAACTCTCTACCCTCGACACACAAAACTAATGCTCTCCGGCGGCAGGTTTACTGCCGCCGTTTCAAATCAGTGA 0.46799999999999997 +CGGCATTCAATTTGACACAAAAGCCCAAGAAGTCCAAATCGACGAAGAAATGTAAATGGCTTAACACGAAACCAAGGGGAGGGCGGCCCTTTGGTTTTTTTACACGTTACC 0.772 +ATATCCGATGGATGATCAGATGTATTTATTTATAGACTTGCCTTATTCTTATTTATATGAGCAGGACGGACTGATTTAACGGCTGAAAGGCCGTTTTTTTTATGAAATGAA 0.965 +ACACCTTCTGAACCACGTCCCACCGTGCTGGTGTTTGACTCCGGCGTCGGTGGGTTGTCGGTCTATGACGAGATCCGGCATCTCTTACCGGATCTCCATTACATTTATGCT 0.33299999999999996 +GATCAAAAAAGCGGAGAAACTCGATAAACCAAGAAGTCCAAGAGTGAAAAAAGCATATTAACAAAAAAGATCTTTCCGCGCCTGCGGAAAGATCTTTTTATTTGCGATATA 1.0 +CGTACCGTTGGCGCGGGCGTTGTAGCAAAAGTTCTGAGCTAATTGCCGATAACATTTGACGCAATGCGCACTAAAAGGGCATCATTTGATGCCCTTTTTGCACGCTTTCGT 0.907 +TGGTCGGGTTTAACGTTCATTTCCACTCTCTGGCAAGCGCCTCGATTACTGCGATGTTTAGTTAATCACTCTGCCAGATGGCGCAATGCCATCTGGTATCACTTAAAGGTA 0.9520000000000001 +GCATCAGGTTTATACTTATCCGGCACCATAAGATTAACGCCGGTTAAAACCGGCGTTAAATATTAATGACGGCGCTGCCAGATCCACAGCGCCGTTATTGCCAGCGCAAAC 0.58 +TGAGCAAAGGCCGCATTGTCTTCCGTAGTCGCTGATTGTTTTACCGCCTGATGGGCGAAGAGAAAGAACGAGTAAAAGGTCGGTTTAACCGGCCTTTTTATTTTGTGATAT 0.965 +TACGGGAATCGGATCGAAACACTGATTCACGAATTACATTCAGAGAAACCATCTGAATAAAAAAAGAGAGGATAGGCGTATATCGTCTGTCCTCTTTCTTCGTTTATAAGA 0.955 +CTGAAGCGCCGCGTGAAGGCGTAGTGGTACCCCCGGTACCGGATCAGGAACCTGAGGCCTGATAACTGATAAGGGCAGGGCCACTGGCTCTGCCCTTTTGCTATTCTCACC 0.517 +GTTTGTTCCTATATGTTTAATTGGAAGCTGCCAAACCGTTCTTTTCGCTTAGATAATTAATAGCAAAAAAACCGATTTCGAAGTGAAATCGGTTTTTTTCTGCATAATATA 1.0 +AAAATGCGGACACTTTCACTGCAAAAATCAGACCAGACAAAAGCGGCAAATGAATAAGCGGAACGGGGAAGGATTTGCGGTCAAGTCCTTCCCTTCCGCACGTATCAATTC 0.852 +CGTACAAATAAACCTGTATGCTTGACATATGACTTGATTAAAAATGAAAAAGAGCTAGTTACTGTATGAAAAAATCCCGGTTTGAAGCCGGGATTTTTTTTATTCGGCTTT 1.0 +ATTACCAATATGAACACGAGCACATATTGGAAGGATAAATACGCAGGAAGTATACGGGTGCAATAAAAGAAAAAGACTCCAATCTCTGGAGTCTTTTCTTTATGCATATTG 0.991 +CTATTTCACAGACCCGATTTCATTAGAAGGACAACTGGAGTTTATTACGGAAAGCTTGACTAAATAAACAAAAGAGCCGCCTGCCCGCGGCTCTTTTGCTTATTTATAAGG 0.9229999999999999 +CTTCCAAACCATAAAGACGTTCTTGAAACAGCAAGAAAAGTGCTTGAATTTTAATCAAACTGCATAATCGAGAGGGAAGATGAACGTTTTCCCTCTATTATATATCTGTTT 0.006 +TTCTTGATGAAGCGATTAAAAACCGCTCCGGGATGCCGGTTCGTCTGAATTAATCTTCAAAACTTAAAGCAAAAGGCGGACTCATAATCCGCCTTTTTTATTTGCCAGACC 0.941 +TATCTGGACTTGCTGAGTCTGCAAAACTATATTTAAAAACAGTTCTGCAAGCGGATGAGAAGTAAAGAAACGCGGCAGGAGCCCTCCTGCCGCTTGTTTTTCACCCTGCGT 0.875 +TATCCTCTTGATCATTGTGGGAACTGCATTTGTAGGAGGCTACTAAATAAACGTAATCTCCATGAAGGCGGCACCGCAAGACTATGTCTTGCGGTGTTTTTTTGAAAAGAA 0.9179999999999999 +AGACCTTCAAAAAAGTATGGGAAAATCCGAACGGCATCATCAACAGCTTTTTTGTGATTAACTAATAAAAACCACTCGGCATGAGCTGAGTGGCTTTTTTAATGGATGAGG 0.9179999999999999 +AACAACTGGAAAATCTGGTAAACCATTATCTGTTCGACAAATAACGGCTAACTGTGCAGTCCGTTGGCCCGGTTATCGGTAGCGATACCGGGCATTTTTTTAAGGAACGAT 0.813 +CCGGTTGATGGCTGAACACACAGCAGAAAGCGATGAAACGTATGAAGCGTGATCGCAGCATGGAAAAGGGGATCATGATTTGTATGCTCTCCTTTTCCCTTTTCATACCCT 0.42 +TTCGGGGCTCTTATTAAAGAGAAAAGCCAAGTCAAAATCCTTGTTAGACCTAACTAATTTGAAACCAAAAAGAATCCGCACTCGGGTGCGGATTCTTTTTTAAATTATCCA 0.9990000000000001 +GGTGCCTGTGCCGGATGCGGCGGGGACGCCTTATCCGGCCTACATAGTGCATAAATTCTTATAATGAAGACGGACAACCCACTAAGTTGTCCGTCTTTTTTATTTCATTTA 0.968 +AAGGGCTTATAAGATAGTGATTTGCATTCATAAAGTCCTGCAATATGCTTCTAGCCGGGCACTTCAAAAAATAGAGTCCCTCTTATGGACTCTATTTTTCTTGGACAAAAC 0.929 +ACATCCTGCACTATGCGTTTGCCGCCTGTTGTTTTTTAGCGGCCGCTGTCTCTTTATTGTTTACATAAAAAACCCAAACGGCGACGTTTGGGTTTTTTGGTCTATCCTTAT 0.956 +ACCGGGAAAAAGGACAAGCAGCTCCTGCTCGAACTGTTCCTTTTACAGCTGTTAAAAAGAAATGAAAAAAACGATCCCCATTATTGAGGATCGTTTTTATATTATGCAGAA 0.919 +CTTCCGTGCAGGTAAAGCACTGAAAGACGCGGTAAACTAAGCGTTGTCCCCAGTGGGGATGTGACGAAGTTCAAGGGCGCATCTACTGATGTGCCTTTTTTATTTGTATTC 0.9640000000000001 +GGCTACGGGTGAAGCGGCAACTATCAGCGACGGCCTGGCGCGCGTTAATCAGGCATTTTAATTCTTTTCTCAGCCGGATGAGCCATGTCATCCGGCTTTTACCCCACCGCA 0.917 +GAGCGTATTCAGAGCTGGTGCGAGCAAATCCTCAACGAAATGGCAGAGCATTACGCCTGATCTCACTGACGGCTTAGCGCATATGCTTTGCCGTCATCTTTATCTTTGCGT 0.45 +AGAAAAAGAATATCAGGGAGATCTTGTCCCGGCTTTTCTGCCTTTGAAACGTCTTTCAGATTGGAAGAATGAATTCGAACATTGAATTCATTCTTTTTTTTCTGAAAAACA 0.95 +CCGGTTACTCCAACAAAGTTCTGGACCTGATCGCTCACATCTCCAAATAAGTTGAGATGACACTGTGATCTAAAAAGAGCGACTTCGGTCGCTCTTTTTTTTACCTGATAA 0.895 +TCCAATCTCAGCAATTCGTATTGGAAACAGCGGTACTTGGGCGCAAGAAGTTATTTTTAAAAATGAGTATGACAAAAAGCCTTTTATAGGCTTTTTTGTTTATACCAGACA 0.789 +AAGGCAATGATGTTACACCAGAAAAGCTAAAAAGAGAACAAAGAAATAACAAACTTCACTAATATAAGAGGAATACGGCAATATCGTATTCCTCTTTTGCATATACTATAA 0.9309999999999999 +TCCGCATCGCCGCGGAAGCGAAGGCGCCGAAGCTGATTCATACGTTAATTACAGAATAAAAAATAAAGCACATCCCATGCTGAGCGGGGTGTGCTTTTTTAATTATAGGAT 0.995 +TGTTCTTTTGCCTCGTCCTCTTATTATTCGCGGGATCCTTTCTCGTATCTTTATGTTTCGCATCATAAAAAATCCGCTATCACAGATAGCGGATTTTTTTATTGATGATAG 0.9890000000000001 +GAAGTCGATAAAGACTGGAATTCTGTTGAAATTGACGTCAAACAGATCCGCAAAGTAAATCCGTAATTGTTACCGCTCCCGGGACGCGTTCCCGGGAATAATTTCGCAGGG 0.623 +CCGCATCCGCCATTAACGGCACGGCACAGCGTGTGGAAGGCGGCATTATCCGTTCTCTGTAGTATAAAAAAACGCATCCGTGTTTCGGATGCGTTTTTTTTATACGTCTCC 1.0 +CGCTGCTGCTGGAAGATGAAATCAAGCAATTTGAATCACAAAGAAGCTCTTGGCTTCAAGCCTGAAAAAGCGGCCGGTATTTGTCCGGCGGCTTTTTTTGCCTGGTGAAAC 0.958 +ATCGCTCGTCTGGAGAAAGCATTCCAGGAACTGAACGCGATCGACGTTCTGTAAGATATTCCTTTCTGCTTATCTCAAGGCCCGCTCTGCGGGTCTTTTTTTCGCCAAAAG 0.938 +GAAGCGCTTGAATACGGCCTGATTGACAAAATTTTGACTCACACAGAAGACAAAAAGTAATAACACAACCTGCAAGAGCTGCGTCTCTTGCAGGTTTTTTTCATTTCAAGG 0.9990000000000001 +TGGGAGCAGCCGGACAGGGAGCCGGACGAGATCGATACATTTTCCAGTCAGAAAAACAAAGAAATTCAGCATAAAGCGGGGAAGATATCTCCGCTTTTTTCTTTGAATATT 0.946 +ATATACCCACGTTGAGAACGCCGGTGGCCTGAAAGACATCGCAATGCCGAAGGTCAAAGGTTAAAAGAATTCACCATGAGCGGCGCTTATGCGCCGTTTTTTTTTCTGTCA 0.9890000000000001 +ACATTCTTGCTGAGGAACACTACGCAAAACTTGAAGCTGAAAAAGCAAAATTAAAAGAAGAAACACAAAAAGCTTGACGGGAACCCGTCAAGCTTTTTTGTGTTAGAAAAA 1.0 +TTCCTGTTTCTGAGCTAAGGAGTGTCTTAGATATTTTGCATAAAAACACACGGGGATAGAACAATAAAAAAACTCAAGCTATATAGCTTGAGTTTTTTTAATTATGGCAGA 1.0 +GGTAGCGGCTGGTCCGAAACTGTAATGATTTGAAGCTGGAGAATATCTATCCAGTATCTTATAGAAAGCAAAACGGGAGGCACCTTCGCCTCCCGTTTATTTACCCTTCTT 0.772 +AAGGACAGTTTGTCCGTGTGGTTGAGGATGAAATAACTGAACACATGCGTTTGTCTTCTTAACCTAAAAAATCGGTGCATTAAAATGTACCGATTTTTTTATTTAGCCGGG 0.9840000000000001 +AGAGGAGATGAACAGATTCAAGGAGATTATCCTTAAAAATACCGAAACAAAAAAACTCAAATTTAAAAAGGATCAGCATTGACAGTGCTGATCCTTTTATATTGAATGGTG 0.9890000000000001 +GAAACAGTTATCGGTAAAATGCGTGAATTTGGTTCTTCAAACCAAGCTTAATTCAATTGGAACTTTTTATAGCCGCCTGACAGCTTGACAGGCGGTTTTCCGTCTATCTTT 0.5529999999999999 +TTATGGCACATGCCCTGATTGTACAGCGGAAAACCAAGAAAACACTACTGCGTAAAATGCGTATATATGAAAAAGGGGCCCGGTTTGGGCTCCTTTTTCAGTTATTGACGC 0.9690000000000001 +TGCCAATCGTATATGAAGTATTGGCGAAGTTCCGCAAGAAAAAACCGGGAACGGAAGAAGAGTAAAAAACAAAAAGCCTCAGCTCTGCTGAGGCTTTCAGCTTGTTGAAAA 0.807 +TCTGACTTTCAAAAGAAGACTAACAAAGCCTTTTTCGGTCAAAGAAGGCTGATATAATCATGCGGCAAGGGCTTTTCTATTTTTAGAAAAGCTCTTTTTGTATCAGAGAAA 1.0 +AATGATACGCTTTCCTTTATACGAGAAAAACTGGTTTGATACATTTGTAGACTTTAATAAGAAACGAAAGGCCAACTGCGACTTCAGTTGGCCTTTCCTATTTATAATAAA 0.992 +CGGAACCACCTACAAAAAACTTGTAACGCTATACGACCGATTTCGATTTGAAAATTAATGGGACACCCTATAATTGATACTCCACAAAGAGTATCTTTTTTATTGAAAGGA 0.945 +AAAAAATAAAAAGAAAAACAGAGCGCTTTTTTAATAGGTCCACTGCTTTAAAAACAGCTCTGTCCGTCTTTTACCTAGCTCAATCAGGGCTAGTTTTTTTTGTTGTGTTAA 0.922 +TGATCATTAACCATACACCGTACTTGTCAGGGCTTTTACAAGAGCTGTGGACACAGTACGGGGCATAAAAAAACTTCTCTTGTCCGGAGAAGTTTTTTTCAAGTATGATGG 0.738 +TAAATAAAGGATTAAAAGAAAAAGCAAAAGAGTTTGTCGATACGGGAAGCAATTTATATCAATAAAAAAAATGAAGATGGAGAATGCTCCATCTTCATTTTTTTAGAAAAA 0.873 +GAACGCCAAATACCGCGAAACCTCACGCGGTGGTCTGGCAATCAAAGTCCAGTGTGACTAATACTTCTTACTCGCCCATCTGCAACGGATGGGCGAATTTATACCCGCTTT 0.7979999999999999 +AAGCCGATGGGGTGATTTTCCAGACTGCGGTTTAAAAAGTCAGCGCACGCGCTGCGCATAAACGACACAATGCCCGGTGAATGAGATTCCCGGGCATTTTTTTATTTCTAA 0.987 +GGCTCATTTCAGAAGCATATCAAAGCGGTTATGTCGATATGCCTGATCAGGAGTCTTCATCTCCTGCAGAATAAGCTGTCTGTACAGGCAGCTTTTTTTGTATGCCAAAGT 0.9890000000000001 +GTTTATAAAGATAAAGCTGCACAAGCTATTCATGACGGCACAGTATCTTATTACAGATAATCGAAAGAGACAAATCTAATCACAGATTTGTCTCTTTTTTATATGAAATGA 0.9990000000000001 +CATCTTTAAGCCGATCATCATGTGTAATTGATCGGGAATTTGGGTGGAACCACGGATGATCAACACATTCGTCCCTTTTAGAGGGATGGGTGTGTTTTTTTATTTGAATTG 0.852 diff --git a/data/sequence_data/choe/validation.txt b/data/sequence_data/choe/validation.txt new file mode 100644 index 0000000..fe4914e --- /dev/null +++ b/data/sequence_data/choe/validation.txt @@ -0,0 +1,424 @@ +GTTCTGCTTTATACGCATGCCCAGGCGGCTGCAAAAAAGGAGAATATATTGCCCAGCGTTTGCTGGAATCATAAAAGAAGCCTTTACAGGCTTCTTTCAGCGTGTCGACAA 0.602 +AGCCTTCTTTGCAGCTGCGATGAAAAAAGAGAATTTTCTGAGCGGCTTACAGGAACTATTGAAACATGAGTAAAGATACGGGCAGCGCCAGCTGTCTTTTTTTATGTGTTT 0.9570000000000001 +GTCTGGGGAGGATCTCAAACAGTTAAAGTCCGCTCTATATACATTACTGGAAACACTTCATCAAAAAAATTGAGGATGCCTTGTACAGGCATCCTCTTTTTATTTATTCAG 0.9840000000000001 +TTAAAAACAAAACTTTCAAATAAATGGATGATAAAAATGAAAATAAGATCACTACTGGCGAATTGCTATTTGTATTTTTTATCAGCAATCGCCTTTTTTCTGCAATGGGTG 0.48 +GTTTTCCCGGGCGAATGCCTATTTGGAGAAAATGGGAGAGGCGCCGATTGATTGGTGTATTAAGGATTTGTAAAAAGCGCAGGTGATCTGCGCTTTTTTATTTGAGTATTT 0.904 +GAAATGTGTGCTGATTCCTGGAGATGGCAGTCTTCTAATGTGAATGGGTATAAGAGTGCGGAATAAGAATGGAGGCCTTCTCAATTGAGAAGGCCTTTTTTAAAGAACAAG 0.963 +TTTACGGCGACCGTAACCTGTTCTGCTCCTGCGTACCGATTAGCGAATACCAGTAATTCACTGATTCGACTATTTTCTAAAGGCGCTTCGGCGCCTTTTTAGTCAGATGAC 0.782 +TGCCCGTGCGGTTCTGGTAAAAAATACAAGCAGTGCCATGGCCGCCTGCAATAAAAGCTAACTGTTGAAGTAAAAGGCGCAGGATTCTGCGCCTTTTTTATAGGTTTAAGA 0.805 +GCCGATGGCTGTAGTGGCAAAGCTGCAGGAAGCCTTTGAAGAATATTGTGAAGAAGTATAAGACGAACAACCCGGATGCTCAAAGCAGCCCGGGTTTTTTTGTGCATAAAT 0.997 +TTGTATCAATAAATGTACGGTTGGTGAAAAGTTGAAACAATACGGAGATGTCTTAACCAATCACTAAAAATAAAAACCGCAGAAGCTGCGGTTTTTATTCTTTGAATTCAT 0.872 +AGGTTCAGACGACACCGTTGCTGTGGGTATCGTTTACCAGTTCTAATAGCACACCTCTTTGTTAAATGCCGAAAAAACAGGACTTTGGTCCTGTTTTTTTTATACCTTCCA 0.971 +AAATTCAATATATTGCAATCTCCGTGTAGGCCTGATAAGCGTAGCGCATCAGGCAATTTTTCGTTTATGATCATCAAGGCTTCCTTCGGGAAGCCTTTCTACGTTATCGCG 0.985 +CATTTCAACGACGCGGTCGATATTCTGATCCCCGATTATCAGCGTTATCTCGCGGACCGTCTGAAGTAATATTGCACAGGTGGCAAACGCCACCTGTTTCTTACGGTTTTC 0.649 +CGGCGTACTCGATAAAGCAGCACCTGTGATTCGCCCTAACAAAAAGTGATATCCAGGCCGGTAGATTCACGGAAGACCGTTCCATGATCGCCGGCCTTTTCTTTTTTACCT 0.5710000000000001 +TTACTTACTTTACAATGTTAAGCCGGAAGGAAGAAACATCAGAAGAACAGCCGGCTTCATAAAGAAAAAGAACTTGTTTCCTTGGAAACAAGTTCTTTTTTTGTTACATCC 0.9990000000000001 +AACGGAGACAAAATTCTATATTTCAGAATAAAAGCCTCTCATTGCAAGAGGCCAAATACGAAGAAAGAAGCCCAAAACTGTAAACGTTTAGGGCTTTTTTGTCTGTTTTAA 0.992 +TGCGTTTGATTCAAGCCAACCCGGCATTAAGTAAGCAGTTGATGGAATAGACTTTTATCCACTTTATTGCTGTTTACGGTCCTGATGACAGGACCGTTTTCCAACCGATTA 0.94 +TACGAAAAGGGAAAACTGTATATCCCAGAAGACGCATTTGATACAGCAATCTACACTTGTTCATAAGAAAAAGCTTGCAGATAACCTGCAAGCTTTTCACGTTACAAAAAA 0.982 +TACCGCGAAACATCACGCGGAGGACTGGCCATTAAAGTGGTCTGCGGCTGACATTGCTGTCAGCTTTCATCCTTGGGTGCACAAATGTGCACCTTTTTTTTGTGATCTGCC 1.0 +ATCTTAGATGAAAAAGGTGTTATCCAAGACATACATGTTGGCACCATGACAAAAAAAGAAATGGAACAAAAACTGGATCTTGATTAGATTCAGTTTTTTTTATACTCAAAT 1.0 +GCGCCTTTTTGTTTTCACCTGCAAACGGGGTCATAACTAAAAAGCTGAGAACACTCAACATGCAAAAAGAAAGAGCTGGCTGATGCAGCTCTTTCTTTTAATTATAAATCA 0.9470000000000001 +CGGTAGGCTGGATAAGGCGTTTACGCCACATCCGGCAATACAACTGCCGCAATGTTTACATCGAACTCCCCTCAGGCCACACTTTTCAGGTGGCCTTTTTTAATAATGCCA 0.941 +AAAAATTATGGAAGAAGCAAGAAAGCAAATGGGGATTGTATACCCTGCTGATCAAGCTTAAAAAAATCCGCCCGCGTGCAAATGCCGCGGCGGATTTTTTATTAGACAATC 0.961 +CTTCTCAAGCTATGCTTGCTCAAGCAAACCAACAGCCGCAAAACGTACTTCAATTATTACGTTAATTTTAAAAAAGACCTTGGCGTTGCCAGGGTCTTTTAATTTAAATTT 0.993 +CGATTGGTTTGGCCATGAAGTCGGGCATGTGGAATGTATCTCACTGCAAGAACCGATTAAAAGATAGTTTTCCCCGCAACGGTATGTTGCGGGTTTTTTTCTGTTTTGCCG 0.998 +ACTCAGAGCAAACGGCAAACACAGCCAGCAAAATCATCCGTGAAATGCTGCTTGGGTTTGTGAAATAAGTAAACATCCAGACGATGTCTGGATGTTTTTTTATTCCCAGTC 0.993 +TATGGTGTCTCTTTAGAAGACGTCATGAAGGCCCACCAAGAAAAACTGACAAAACGATTTGAGCATGCATAAAAAAGCGGCCGCGATGGCCGCTTTTTTTAACCTGTGATT 0.948 +GCGTATCAACAAAGCGCTGGATTTTATTGCTGAACGCGAAAATCAGCAGTAATTAGCGTGTAAAAAGATAAACGGCAGGAGATAATATCCTGCCGTTTTTTATTTATGCTG 0.9209999999999999 +GTATGACATCAGCGGCAAGCCGCCAGCTACCATTGAGTGGGAATGATTTGACCCTGCACTATGAATGAACAAAACCCTCTGTTACTACAGAGGGTTTTTTATCTTCAAGAA 0.9790000000000001 +GAAGCCGTAAAAGGTGACGTGCTGGAGATGAATATCCGTATTCTGCAGCCAGGGATTTAATCCTGCCTTGTTTGCCCGGCCATCCTGACCGGGCAATGTTCTTTCCTTTAA 0.895 +ATTGAATAAGAAGTAACCCGCCTTGTAAGAGGGAGGGTTACACAGGTAATCCGAAAAAAGCTATTTAAACAAGTTCTCTTTTTTTAGAACTTGTTTTTGTTTTTTTCCTTT 0.976 +ATATCCAATCGCATTGGAGCGATATGAGGCCAAACAGCCTAAAGACTGTTGAAAATGTGTAATCTTATCAAAAAGAGACTGCTTGCCGCAGTCTCTTTTTCTATCTTACGC 0.996 +ACAGAATGAGACTTCAGCTTCAATCTTTGCAAAATAATATGATCCTGCAGCACCCTCTTGATGTGTAGGAAAACGGACCCTTTTAAAAGGGTCCGTTTTTTTGTAAAAAAA 1.0 +ACTTCGAGAGGCACTTGAATCCTATATTGATACCTTTTTCCCTAACGCAAAAATTGAATATTTCACTTAAACGGGCTGTTGTGATCAACAGCTCGCTTTTTTATAAAAAAC 1.0 +GTCGTCGTCGTTTCGGTGGTGATGCGTAATCATCGCTGAACAGCGAACACAATCTGTAAAATAATATATACAGCCCCGATTTTTACCATCGGGGCTTTTTTTCTGTCTTTT 1.0 +CCCGCACAGCTTCATCTTTTAGAGATTAAAAAAACCGTGATTAACGGCCAGATTGTATATGAGAAATCATAAAAGGACAGGCAGCAGCCTGTCCTTTTATTTTCTATAAAA 0.9940000000000001 +CGGCGATCTACCTTTTGATTACAGTTGCATATTTGGTCAAAGGACGCAAGCATCAGTAAATAAGAAACCCTCTTGCCGCATCCGGCAAGAGGGTTTTTATTATTGTGCAGC 0.997 +GCGTTAAAAGATATTTTGCAAAATCGCATTCAAGGAAGAGTGATTGTGAAGCTTTAACAGGATCAGCTTGCAGAGAATGTTATTTTTCTGCAAGCTTTTTTGTGGACAGGA 0.7170000000000001 +GAGGGGAAAGGCGATTAATACCATGACACGGGCCGTGAAACCGCGCAGGTTTGCAATCAGGCCCATCATCTACGCCAGCGTTCTTAGCGCTGGCGTATTGTTGTGCGCCTT 0.485 +ATTACCAGCTTTAGCCACCCGGAAATCGGTACGGTGGTGGTAAGCGAATCCTGATTGACTTTCGCCGGATGCGACGTTTGATGCGTCTTATCCGGCTTTCACTTAACTAAG 0.759 +GACCATGGAACGCGTGAAAGGCATTCTTGAGCAGTACGGTCATAAGGTAATGTTCTAATTTCCCCGTAAAGCGGCAACTTTATTGAGTTGCCGCTTTTTTATTCCGCGCTT 0.9640000000000001 +GTCTCACGCTTGTCATTGCAGGCGTGGTCGTGCTAAATCAATCGAAAGCCCACGCTGAAGATAAAAAACAGACGGCCTGTGAGTGACCGTCTGTTTTCTTTATTCTCCACC 0.91 +AAAGAATGGATGAAACAAAAGCGGCGAGACACCCAGTGCTCAGATACGAGGCGGAAGAAAGATAAATAAAAACAGCTGTCTACCAGACAGCTGTTTGCTTTATTTCTTCGC 0.769 +CCAAGAAAAGTTCGCTCTTTCCAATTAAGAGAAGCACAGCGTTTGGCAAACTAATAGATCGTATCATCAAAAGAAGGCTGAGTCATCAGCCTTCTTTTATTTTTCAACCTG 0.995 +CGGAAGAAGAGATCATGCGCGTTGTAGAAGAAGCACATAAGACGTTAAAAGAAAAAAAAGCAGATGTACCGGAATAATAGGGCGCTATTATTCCTTTTTTCTGCATTCATG 0.909 +TCATGCACTGTTAAACCTATGCTGGTAGGATGTAGTAGAACCCCTTCATCCAAGGAGCCAATTTTGACGGCGGGAACCTATTTGTGTTCCCGTCCTTTTTTGTGTCTTCTC 0.995 +AGCGTCAAACGACATTTCATGATATGATAGAAGTGGCTTTACATTCCGTATCACAATAAAATATATCAAGAGGCGTGCTGGGTGCCGGCAGCCTCTTCTTTATGCATGCGG 0.7859999999999999 +GCGGTTTGTACATATAGTGTATTAATTGGTTTTGTATTAAAAGGAATCTCTAATAAATCTGTTTGATGAATCTGGAACTTGTAAAAGTTGCAGATTTTCTTTTATAGAAGC 0.39299999999999996 +ATCTGATCGGAAGAAAGGAAATTGAAGCGAATTATCACTATCCGAAGGAAAGAGTGATTATCGGATAGGCAAAACACCGCATATTTTGCGGTGTTTTTTGATTACCAAAAA 0.953 +TACCTTCGACTTCACCAACTCAGGTGAATCCGTCTTCGGCAGTTCCAGCTCCTACGCAATATTAAGCGACGTTAACCCTCTCCGCCCGGAGAGGGTTAGTAGTTCCAGCGA 0.8240000000000001 +TTCCCGCCCGCGCTGACGAGCCTGCTCTTACAAACGAGATACATGAACTGATGCTTTTTCATATTTATGACTAGCTTAGCCTAAACGGCTAAGCTTTTTTTATTTCTCAAG 0.9990000000000001 +TTAGCTATGATAGATAAGGATTAACCGCAGTTCAGGCTGCACCCATACGAATGTACCCACATGCAAAAAAACATCTGCCTAAACGGCAGATGTTTTTTAGGCTCGGAGCGC 0.9520000000000001 +TTCGAGACGAATACGGAAGATAAGCGTCAGGGGTAAGGGTTGGTGTTCGTCGCAGCAAGCCATCCAGGCCGGATAAGGCGTTCACGCCGCATCCGGCAATCGTGCACAATG 0.366 +CAGTGTAGACGGGGCGAATGCCATTTTGGACGAGCTTCAAAAAGAGATGCCGGCTGCTAAGAAATAAAAGAAAAGACAGGCAAACGCCTGTCTTTTTCTTATTTGATAAAG 0.987 +GTACTGGTTTACTAAGAGAATTGCATGAGCAGTAACTTCAGACATCAACTATTGAGTCTGTCGTTACTGGTTGGTATAGCGGCCCCCTGGGCCGCTTTTGCTCAGGCACCA 0.508 +AACAAACCGAAAGCCATGCGGTGAAAATCGTGGTGGAAGCGGCGCGTCGTCTGCTGTAATTCTCTTCTCCTGTCTGAAGGCCGACGCGTTCGGCCTTTTGTATTTTTGCGT 0.784 +ACAGAATGCGTTCGTTTGAAATCATCGCTGACGTTTGGAAAAACAGATAACAGCTATAAAAAAAGCAATGTAGCGGGGAGACCCATACATTGCTTTTTTTATTCATTCAGT 0.9690000000000001 +CTCTGTCGCCCTGGCGTTTAATAATTTACCGCGACCACACCGCGTTATGTTGGGGTCGCTCACCGTTCTTACTCTGGCCGTCGCTGTCTGGCGGCCTTATGTTTATCACCG 0.49700000000000005 +TTAAAGCAATTGAATAGCCTAAATCTCGATACTGAAAACTTTGAAATCAAGGGATTTAATGGTTATTAAGCAGATCCAAGCCCCCATATGGGGGCGTTCTTATTATTGTAC 0.865 +GATCAGATTTGAAGTGAGTGCTGCCAGAGGGTGAAATGAAAGAAATAATTGACGACTTACGATTGAAAAAGAGGTTAAAATGAATAACCTCTTTTTCTTATATAAAGATAT 0.895 +GGATACATTATTCCGGGCCTCGGCGATGCCGGTGACAAAATCTTTGGTACGAAATAAAGAATAAAAATAATTAAAGCCGACTTTAAGAGTCGGCTTTTTTTTGAGTAAAGC 0.743 +TGCAGGCGCTCTATCTGCGCGAGCAGGGCTTTAACAATGTGAAGGTATATCGCCCGTAATTTGTGGTTTTTACGTCGCATCTGGTCAGATGCGACGTTTGCCGCATCCGAC 0.569 +TGGAAATGGCTTTCCTCAGCGCTGAGGAAAAACGCGCACTGCGAGAAAAAGTCGCCGCGAAGTAACAAAATGGATGGTGCAAATGCACCATCCATTTTTCATGCAAGGCAC 0.8009999999999999 +CATCTTAAACCGGCGCCCATGTGGCTGCATAATACAGTTCCCCCAATGTCGAAATGTTAGGTTCTCTTCTGGGCTGAGGAATATATCCTCAGCCCGTATTTGATCAGATGT 0.841 +ACCGCGCCGAAACTTCATAGAAGCGAATGCGAGATACGTTAAAAATCTTGACATCTAATCATAAAAAGCCTTATTTCCAATAAGAAATAAGGCTTTTTTCTGAACAAGATC 0.927 +TGACCTCTACGGCTCAACTGATGCCGCTAAAGTTGAAGCAGCCTGGAATGCTGTTGGATTGTAATATTAGGAAAAGCCTGAGATCCCTCAGGCTTTTATTGTTACATATCT 0.865 +CTGAATACGGAAAACGCGTGGCAGAAGGCCTTGGATTGCCGATTAAAAAAGATTCTTAATGGAGAAATGCAAAAACCCGTTGTAGTCAGCGGGTTTTTTTATATTTAGTTA 0.995 +CAATTCGACAGATGCGGAACGTCAACAGTTGCATCAATTTCAAAATCAGCTGGACAAACTGCAATAAAAAAAGGCTATGGCGACTCGCCATAGCCTCTTATTTGACTTCAA 0.958 +CATTCCAGGCGAGATTTCCTTCCGCGCGGTCTAACGCCTATTAACCATTCTGGTTATTCTTCATACACCGCAGAGATGTTACATTGATGCGGTGTATTAGTTTTTGCCGCA 0.9420000000000001 +AACGGCAAAGAAATGACGAAAAAAATTAAACTGGATCAAAAAGAAGAGAAAACTTCGTAAGACATAATGCCTCAGGCCGTAAAAGCGGTCTGAGGCTTTTTATTAGATAAA 0.986 +AAATCCATTGACACATAAAGTTATTAGTATTATTATTTATTTAATTAAATTAAACAGAGAAAAGGAAGACGTTTGGCTCTTTTGAGCTAAGCGTCTTTTGTAGTTTTAAGG 0.995 +AACGCGGATTTGAGCAAGCCTCTCCCTCCACAGTAACTCTCGCTAAGTAAGTTTGATGGCCCCTCGAATAGTTCAATTTTTTTGAACAGAGGGGTCAATTTTCACCCTCTA 0.509 +TCTTATCTATGAGCACGACAGAAGAAGTTGTCGCGTTCGTAAAAGAAACATTCAAGTAATGTACAAAAACCAGACGGCCTCCGGCCTGTCTGGTTTTTTTCATAAGTAAGG 0.993 +GTGGCCGGATGCGGCGTGAACGCCTTTTCCGGTCTACGCATTAGACAGTTCTTCGTTTGTCATCAGCCTCAGAGCATTCAGGTAACTGAATGCTCTTTTTTATGCATTACA 0.995 +TGATGAAATGATGGATAAGGTTGATGAAGCCTTACAAATCAGTTTGGCACTCATTGATTTTTAGACATATTTGCAGGTTGCTCAAATAGAGCAACTTTTTTTGTTTTCAAA 0.9940000000000001 +AGTTCAGGGAGTTTGGACGTTTATCAAAAAAGAAGCTCAGCGCAAAAAAAGAAGCCGATAACATGAAAAGCAGTTTTCCCTAGGGAAAACTGCTTTTTTTATAGAAACAAT 1.0 +CCAGCGTGTCGATGATCTTAGCTGCGATTATGACGCTTTGCGCGGCGCCGTTTTTGCTTTCATTTATGTAAAAAACTGTCCTCTCGAGGGCAGTTTTTTTTATGTGCTTCC 0.988 +AAGAGGGTTGGACACATGGCGGTGAAAGACCGGTACATGTCAAAAATAAAACAGGAAACCAAGCCTTGGATCAAACGATTCGGGAGGCTTGGTTATTTTGCATTCGGCGGC 0.784 +CGCCCGTTGTGGATTCGAATAAAAGAATCCGTCTCGCGGCTGTTGTCGCCGATCTTATAAGATGCGTAAACCCCCGGCCTTTACGGCCGGGGGTTTTCCTGATGGTCACAG 0.953 +CCAGGCGATAAGCTGACGTTGTTTGTGAAAAACAACAACATGCCAGATTCCTGACAAACCAGATAATAAAAAGGCACCGATTCCCCCGGTGCCTTTTTTATTTATGCCGCT 0.988 +AAAAAACGTTGATTCAAAATTCGACGGATTAACGATATTTGTCTGATTAATAATCAGATCGGATTAATGTTGGTGTGTTTATAACACCAACATTAATTTTCCTGGGGATAT 0.884 +TCCCCAAAACAGACTCAGAACCGAAGAGCATCACAATAGACATTTAGAATAGATATGTTCTGCATGAAAAAAGCTGCCGTTTTGAACGGCAGCTTTTCTCTTCGTGCTTTC 0.9440000000000001 +GATCAAAGAGGAAATTGCTGGTTTATAATTTAGGATCCTGATATCATTGGTTCAATATCTAATTAAAGTACTTCTTATTGAAATAAGAAGTACTTTTTGATGCTTACATTG 0.903 +ATTTCAGTTCTTCCTGTTTTGGTCAGCAGGGCGCTGCGTTTTGCGCTTCACCCTGACGGACCGCATCTATCAATGGGCTGAGACATACTCAGCCTTGCCTTTAAAAAAATA 0.562 +GCTTCAAACCAAAAGAAGCGCTTCAAGAGCTTGTAAACAAACATCTTTAATTTTCCGCTGCTTACATGCCAGAGCGATTCCGATTGAGGGATCGCTTTTTTTATTCGCCAA 0.9940000000000001 +CAATTCCGGTGCGCGTATTGTCTGCGGAGCCCTTTTGGGAAATAACGAGAAGCAGTAATATCATCCAGGCCCTTTTGTATTGAGCATACAGGGGCTTTTTTAGTATACTCG 0.875 +AATTGGAATTCGAGTCAAAGCTTGATAAGGCAAAAGAATACGATTATAAAATCGGCATATAGCTATAAAAAAATCCCCGCAGGCATCTGCGGGGTCCTTCTATTCCTTAAT 0.723 +AGGATCAATGGTATCGGCAGGCGAAAAGGCTGCTCGTAAAGGCGAATCAAAGCTCTTTACAATAAAAGCGAAATAAGCAGGGCGTATGCCTTGCTTTTTTTATTCAACTGT 0.9690000000000001 +AAGCCAAAGCGTTGCGGAAACTAAGACATCCTAGCAGAAGTAAACGTTTGAAAGATTTCCTTGAATAAGATGGAACGGGTCTTGAAGATCCGTTCTTCTTTTTTTAAAAAG 0.985 +TTAAATGTGGTGATGGATCGCTGCCCGGCTATTGAGATCCCTCGCCTGGGCCTGGCCAAATAAAAAATCCCCGGAAGGCAAAAACCTTCCGGGGATTTGTTCAGGGATTAG 0.728 +CCAGCTGGAAAAGAACGCGATGAAAAAATTGCGTGCTGCCATTGAAGCGTAATTTCCGCTATTAAGCAGAGAACCCTGGATGAGAGTCCGGGGTTTTTGTTTTTTGGGCCT 0.985 +TTTGTTGAATTTTTGACTGAGGAAGACAGACGCAAAAAAATGTATTAAAGAAGATCTGCACCCGGATTGTTGAGATTTTCCAGACGATCCGGGTGTGTTTTTTTGCATGCA 0.9390000000000001 +GATATGTTCCGCACGCAAGTACATCCTTGGGAACGCGAACAGTATATGTCTCAGTATTAATATCTCAATCCCTTGGCACTAAAAGTGTCAGGGGATTTTTTATGTTAATAG 0.996 +ACATTCGTCGGGCTGAGCTTGCGTTACAGCGGGCTTTGAACAGATTGGATGTAGCAGGGAAATAAGAAAAAATCCTTCTCTTTATGAGAAGGATTTTTTTATGAACGCACA 0.982 +GCAACCCCGGGTGCTGCTAATGCGGATGCGACCTTCAAGGTTCAGTATCAATAACCTACCCAGGTTCAGGGACGTCATTACGGGCAGGGATGCCCACCCTTGTGCGATAAA 0.8809999999999999 +GTGACATTTAAGGGCAGGGCGGAAAATCTCAAGCAGGTTCAGGCTGTTCAGCAGTTTGAAGTCAAATAAGAGGCTATGGCGAGTCGCCATAGCCTTTTTTTATTGCAGTTT 0.993 +GGGTGAAGAAAGAAAGGGAAAAGGACGGGCAGGAGGAGCCGCCGTATATTCAGCAGACGATATCATAAAAGGTACAGCGAAATATGCTGTACCTTTTCGTTACATTTGAAC 0.915 +GCAGGAAAACCTTGCTGCGACTACTAGTATGGAAAACGTGCAGCTTGATGGTGTAAACAATGCAGCACAACAAATGGGGCCCACGGGAGCCCCATTTTTTAAAGCAGGTGA 0.685 +CCCAAACAAATACATTTTTCGGATTCCTGCTCTCGATCGGATTATTGATCAGCTATTTCCGATAATAAAAAAGACCGCTCGTTTCATGCGGTCTTTTTTTGTTACAATCGA 0.821 +ATCGTCTAACTAAAAAGAACATTTCCCAGTCATAAAATAGTTTTCCTAATAAGACCTGGATTTCGGTAAAATAAACAATTCCGATTTCCGGGTCTTTTTCGTGCGCAGCTC 0.995 +ATTGCCAAAAGAGCTATCCACGCCTACGCAGAGCTCGAAAAAAAACACAGTAAAAAATAAAAAACAGGGTGCACAACTAAAAGATTGTGTGCCCTTTCTTTTATTCAAAAA 0.965 +AGAGCTTGAGGCTGCGATGGACCGCTGGACAGAATTGTCCCTCATGATCGAAGAACTGGAAAGCTAAAAAGCGTGGCCGCAGCAGGCCGCGCTTTTTTTCACATAATGGAC 0.885 +AATCCGCAGCATGGAAAAACCGTCCGATCACGCCCCCGTCTGGGCGACCTTCCGCCGCTAATTTAGCAGCTCTCCTGGCTCAAACTGGGTCAGGAGAATTAACCTTGAGAA 0.7759999999999999 +GCCCGTTCCACCTTTATCAGCAGTGTAAAAGGCAAGGGGTAATTACGCCCCACAGTGCTGATTTTGCAACAACTGGTGCGTCTCCTGGCGCACCTTTTTTTATGCTTCCTT 0.9990000000000001 +AAAATTTATTTCTGAAACAACCGCAAAATAACGTATTGTGAATTGTCAAATGGAAGGATATACTTTTATTATCAAAAGCTGACCCGGCGTCAGCTTTTTTATATGGACATA 0.955 +CAAAGTAACTGACTGGAATACCATTAAAAAACCTGATGATCCGTATGTTTACAGAGGATTTTAAGTGAAGACGAAACCAGTACAAGTACTGGTTTCTTTTTTATTTCTTTT 0.9590000000000001 +ACTCCTTCGGCTTCGGTGGCACTAATGGTTCTTTGATCTTTAAAAAGATCTAAGTTGTCATTTTCCACCCTTATAAAAGGTCCGCTTGCGGGCCTTTTTTCTTAGCTTTTA 0.9740000000000001 +GGCGAAAAAACCTCACCGTACGATGCGCCCGGCCCGTAAAAATTAAAGTGTTAGAACCTCCTTTCAAATCATACATATGAGATGAAAGGGGGTTCTTTTTGTATGGGGCAA 0.986 +TATCAACGACCCGTCGATGGATTAAAGCTAATTGAAAGCGGCGACTGGTTAGACAGGGATAAGTAACCATATGCATACGCCACCTTCGGGTGGCGTTGTTTTTTGCGAGAC 0.8029999999999999 +AAGCCGCCATTTACTCGGTCAGATATGGTGTATCTGTCTTTTGACATATGACGTTTTGCATATACGCTTCTCTGAAAAGGCCTTTTACAGGCCTTTTTTTCATGCCCTATG 1.0 +GTCGCGAGACTCCCGAAGGATGCGTTAGTCGAGATCGAAGTTATTGCACTGGTGAAATAATAAGAAAAGTGATTCTGGGAGAGCCGGGATCACTTTTTTATTTACCTTATG 0.9740000000000001 +AAACAAAGTGTTTAACGGCGGAGCTGTTAAAGGCTGGTAATTTTTAGGCTGTTTCCACAATTTGTAATATCATTCAGGACGGGCGCTTGCCCGTCTTGTCATTTTTACAAG 0.6579999999999999 +TTGACCTTCCATGGGAGCGCACAGACAAAGCGGAGCAGCTGCGTAAAGAAGCGTTAGGAGAATAATTTTATAGCCGCTTACTGGTTAAGCGGCTTTCCCTTTTTTATCGTT 0.838 +CGAGCTTTCCACACAGGAGCTCGAAGAATTGTTTACATTAAGCGCTACAGCGCAATAATCATGACTAAAAAAGCTGGCTCTAAAAGAGCCAGCTTTTTTCCGTTCATAGCC 1.0 +AGAAGCTGAGAAAATCACCACCGTTCAGGCTGCCATTGATTACATCAACGGCCACCAGGCGTAAGTGAACATCTCCAGGCGGTCGTTCGACCGCCTGAGTTTTATCTTTTT 0.807 +TCGCGGCGGAAAAGAAAAATCTGTTGATATCAAGCTGTCCTCCGCAGACCAATTAGGCAGTTAATAAAAGCAGTCTGGCATCGTTGCCAGGCTGTTTTGATATGCAAAAAA 0.983 +AAGATCAGAACCTGATCTCCGCACACGGTAAAACCATCGTCGTTTACGGCACTCGTTAATCCCGTCGGAGTGGCGCGTTACCTGGTAGCGCGCCATTTTGTTTCCCCCGAT 0.314 +ACAAATGTCGTGTACCGCAGAAATGACGGGAAATATGGCTTAATTGAACCGACTGAATAATGAAGAGAAGCCTTCCGTGATGTCCGCGGAAGGTTTTTGTTTTTCTTATTT 0.9570000000000001 +AATATGCACTTTTTGTTCAGTCGATAGAAACATTTTTGAGGAAATCTGCAAACAATTTAAAAATCAAATAAACAGGCTGCCGATGGGATCGGCAGTTTTTTCTGTGAAAAG 0.647 +AAGATCCATCAACTTGGACAACTGAACATAATATACCGCTCAAAAAAATACTTCCAAAAGTAAAAGCAAAAAACTGAGGCTGTAAAAGCCTCAGTTTTTTTATAGAAATGT 0.9740000000000001 +GGCAATCCAGAGAGGTTGCAAAGAGGTGCACAACAAAGGCCCAAAAAGTATTCGGCAGGTCTTTGTATGCCTCTTTGCGTAAAAAAGCAAAGAGGTTTTTTTATACAGTCA 0.7020000000000001 +CCGATAAAGCCATTCTGGATAGCGAAGACGACGCCGAACTGGCACACCATTAATTCTTATGCTGGCAACGGTCCGTTTTGTATAGGGGCCGTTGCCTTACTTTTTAAATGT 0.807 +AGGGTATTGCCCAGCAGAACAGCTTTAAACACACCTGATAACATAACGTTGTAAAAACCGAATGCCCAGCCTTTAAAAAAACAGCTGGGCATTCGGTTGCTTATTAATCGC 0.843 +AGGCGACACACTTGTTGTCAATACATCAGACGGTTCTTACGTTTCAAGAGCATAGAAAGAAAAAAAGAAGTCTGTTCCCAAATGGGAGCAGGCTTTTTTTGTGCCCAAAAA 1.0 +TGAGAAATTTTTAATTCGCTCTGAAACTGATGGCGTAGAAGCGAAGAAATAATACCCTTTATACCATGTCCTTATTGACCCCGTATATTACGGGGTCGTTTTTGTGCGGAA 0.7490000000000001 +TCAGCTCCGACAGAGGATGAGCTGATTGACAAAGTAAAAAAGAAAATAAAAAAGTAAAAACATATTTTTTCTTAGCCTCGGCCCTCAGGTCAGGCTTTTTTGCTTCACAAG 0.7809999999999999 +GGAATGAACCCGGCGAGAAAAGCTACCAAAACCAACGTGCTGACAGCGTTGAGAAGAGAATTATAATGCGAACGAGCCGGCTGAGACAGCCGGCTTTTTCTATAGCGCATG 0.9440000000000001 +CAGCGATCAAACGCGCACGTCAAATGGCTTTACTTCCATACGTAAGCGGTGAGTAAGCTGATATGTAAAGAGCAAGGACCTTCGGGTTCTTGCTCTTTTTTATAGGGGGGA 0.9990000000000001 +GCTTTCTGAAAGCCTTCAAAAAGAGTCTCTGTCAGTGGCGGGAAGTTTAGCTTAAACAGGAAAGTGAAAGACGGGTGCTGTATGCTGCTCGTCTTTTTTATTGTTTTTTAA 0.997 +AACCAGTATCAAATGTTCAGCCGACTGACGATATGGTCCCGGTTGAATAACTGATTTAACTCTGCTGAAAGACTGCAAAAACAGTCTTTCAGCAGATATATTTATGAAAAA 0.596 +CTGATGAAAGCGCCCTTTCCGATATTACAAAAGAGCTGAAGGAAAGCCTCGAAAAGTAAAACAAAAAGAAGCTTCGCACAATGTGCAAAGCTTCTTTTTTATTTGCCTGTA 1.0 +CTCGAGCGTGATCCGCCACGCCAATCTGCCGGTGCTGGTTGTGCGTTAATTGCCAGAATAAGTATCCCGCCCTGCCCGGAGCCATCCCGGCGGGGCTTTTTAATGCCATAA 0.8859999999999999 +GCGGCGCGAACGCCTTATCCGGCCTACAAAACCAGCAATTTCAATACGTTGTAAAACTGTAGGCCTGATAAGACGCGGTAAGCGTCGCATCAGGCATTGAGCACCGAATGC 0.332 +GCCTGATGATAAGTTCAAGTTTGCTTCAGAATATTCGAAATCTGTTGAACTATCATTGAACTGTAGGCCGGATGTGGCGTTTTCGCCGCATCCGGCAACGTACTTACTCTA 0.583 +GCATTTTTCAAAAAATATGGATAAAATTTTTTCAACGATGGGCTTTGTATACCCGACGTTAAGAAAAAGTAGAAAACCCGCTGATGAGCGGGTTTTGTGCTTTAAATGGGG 0.7809999999999999 +AACTTCATCCAGAAAGTCATTGATTTCTTCGTATCTATCTGGAATGCGATTGTATCTATATTCAAATAAGAAAAGCGCCGAAAAATCGGCGCTTTCTTTTATTGCTTTACA 0.945 +GTCACTGTGCCGAAGGCGATTGCCGCGAAGATGAACATGCGCACGAAGGCAAATAAGCCAGCCTGAACGAGAAAAGCCAACCTGCGGGTTGGCTTTTTTATGCAAGGGAAA 0.914 +ACGAATACGGCTTCTCAGTTCCGAACGTAGTGAATCGAGTTAAGGCATTAATCAATAAGTAAGCTTTTGAAAGAGGATGAGTCAAATCATCCTCTTTTTCTTGTTTATCCG 0.996 +CACCGAAAGAATTATCCGAATTTTCTTTCTCAAAAGTGTGGATTGATGTTTGTAAAGACTGGGAATAAAAAAACATCCAGACATCGTCTGGATGTTTACTTATTTCACAAA 0.8540000000000001 +CATTACCGACCTGCCGAACCCGAAAGCGAAGAAGAAGTAATTTTTCGTTTGCCGGAACATCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCAATT 0.985 +GTGGATGAAATCACCACCGAGCCGGATTACGAAGCAGCTCTGGCTGTACTGAAAGCATAATTTAATATGCCTGATGGTGTTGCACCATCAGGCATATTCGCGCTTACTCTT 0.9109999999999999 +CAACATGGTCAAAGCCGTAAATGTCATCGTTGAGATTGCAAAGCAGTTTGAAGCGCAAGCATAACGCCAAAAGCCAGTCCAAAAAAGGACTGGCTTTTTTGTGTGAAAATG 1.0 +GTCAAAAGATTGGTTTAAAAAATAAAAAATAAATTCAAATGATGTAAAGAGGCTTCAAAGCCTTGCTGTACTTGAAAACAGGCTGTGAGGCCTGTTTTTTTATTAATCCTA 0.981 +TGGGATGACCGCAATTCTGAAAGTTGACTTGCCTGCATCATGTGTGACTGAGTATTGGTGTAAAATCACCCGCCAGCAGATTATACCTGCTGGTTTTTTTTATTCTCGCCG 0.917 +AATAACCTATTATGAGAAGAGTTCTGTAAGATTTGATAAGAATAAGCATGCAAGGACAAATGCATCTTCCTTTGAAACAAGATGCATTTGTCCTTTTGTCTGTTTCGAGTA 0.9079999999999999 +GTTAGATCTTCTTGAAGAAGCCATTGAAAGAATCAAGCGTTTTGTAGAAAAACATAGCTAACAGATCAAAAAGCGGCTGACAGAAAAGCCGCTTTTTATCTGCATTGACCC 0.775 +GCGCTGCCATGTTTTTCTGCGTGCTCTTCTATTGTGTAAAGGTGCCTTTCCTTTCGTAACAGTAACAGAAAAGCGCAGTCTTCACGACTGCGCTTTTTTATGCACGTAATT 1.0 +TACAAAACTCCATTTCGCCAATACGCAAAAAGCGCAGCGCCTTGCTGATAAAAAACGTTCATAGAAAAAAGCTTGCAGATTTCTCTGCAAGCTTTTTTATCAGCCTCTTAA 0.996 +AAGACCTGTGTTTCAGTACAACGTCAACAACAGTAACATTAACGGAATAAAGCAATATCTATAAGAAAACGCACTGCTTGCTTTGAGCAGTGCGTTTTTCTGTCATATCTC 1.0 +CTGGGGACACTTTCTTTTATTTTAAGGCGTGAACATTTGAAATCCGGCCCTCTCTATAGTATCCTTTACTTCAGATGAAGGATACTAGAGGGGGCTTTTTTTATGTCAATG 0.539 +GGAAGAAGTTAATAAATATTACGCAGAAGAACTAGGTCTAAAAGAGACGACCATTTAAATATGAAATCCATTTGACTATTTTGGTTAAATGGATTTTTCTTTTATCTCAAA 0.993 +TTGTGACAAAAAGCAAGGCACTGATTCAGGACATAGCCAACATGTTAAAAACAGTTCAAATGTAACGAAAAGGTACAGCATATTTCGCTGTACCTTTTATGATATCGTCTG 0.9329999999999999 +AAAGATATGACATCTGAATTGAACCGTTTGCTTTTGAATACGGTTGAAGAAGTGTGGAAGGATGGAGGAAGCTGATGTTTTTTGTCGCTTCCTTTTCTCCTTTATTCGACA 0.847 +CTGTAAACGATCTTACTGCATTCAATCAGCTTGCTGATGCTGCTAAAGCTCAATTAAACAAGTAATCATATAGAGCCGCTCTCCAGCAGAGCGGCTTTTTCTATATAAAGG 0.963 +TAAGACAGCAAAACCAATCTGCTGAACAAAACAAACAACAAAACAGCTAATCACTGAAACAGAAAAAAGCACTTCATCTTCGGGTGGAAGTGCTTTTTTCTGTTTGAAAAA 1.0 +CTGTGAAGCTGCATCCTGAAGTGCAGGCTGTGTTAAAGGTACATGTAAAAGAAGAAGCTTAATAGAAAAGAGGCTTGGATTCATCCAAGCCTCTTTTTTTATTCCACGAGA 0.998 +GGAGTCTGAAGCTCTCGCCAAAGATGCGGAACAGCTTAAATATTTATTAAAACAGCAACATATTTAGAAGAACGGCTGCTTAAAAAGCAGCCGTTCTTCCTATTACGCCTT 0.963 +CGCCGCTTTAGCCGCAAGAATGAAGATATCGAGCACAGCCATACTGTCGATCATCATTGATACAACGTGTAATCACTAAGGCCGCGTAAGCGGCCTTTTTTATGCATAACC 1.0 +GGGAAGCGAAATCCTGGTGTCTGCGGGAAATTCCCAAACTTTTTAACGGAAAATAATATTTATTATTGTTAATAAAAAGGGGCGAAATATCGCCCCAATTCTAAGATTAAT 0.6990000000000001 +AAGCGTTCAAACGATCGCCGTTCTTCAGGTGACAGACGCCAGAAAAAATCTTACTAATTTGATCGATTCAGAGCCCAAAACATGATGTTTTGGGCTTTTTTATGTTCGTGA 0.995 +GGCTGTGTACTATGGAAAAGGGTTTCATAAAAAAAATGCGGCTGCTGCCAGCGAACGAAATATCAGCTGACAAAAAGAGCTCCCAGTGGGCTCTTTTTGTGTGTGCTCATA 0.935 +AAAAAGGCCGCTATGCGGTCGGGCAGGGACCAATGTGGGTGGTGGTTAACGCACACTAACCGCTGATTTACCCGGCGCAGTCTCTCCTGCGCCGGTGTATTAACCTATCTC 0.861 +AATACAAAATCAAAAGCGCAGAAAGCTGCGGCGATTGGTATAAACGAGAAAGCTGCACAATAATAAAAACCCTCTTGCCGGATGCGGCAAGAGGGTTTCTTATTTACTGAT 0.991 +TAGATTCCTCTGCCTGTCTTGCCAAAAGCACTGTCAGCACATATCTTGCTTATCAAAGCTAGCTTCAGACAAGGACTGTCTTCAAACAGTCCTTGTTTTTTTATGTTCCTA 0.9990000000000001 +GAACTTTAAAAAGTTCGCACATACCGATGCCATCGCCCAGGCAGGCGGCCCTCTCGTATAAAAAACAAAAGCCAAGAGCAATTATGCTCTTGGCTTGTTTTAATTGACGGA 0.998 +AAAATGGGCAAACCTTGGACAAGTACTCCTGCAAAAGTAGATTTACCTAATAGCTAATTTTAAAATCACTTTGTCTTTATAAAGGACAAAGTGATTTTTTGTTATACAGAA 1.0 +GCATGGTGGCACTAACATCCACTATCTGGAGAAAAAACTCGGTCTTCAGGAAAAATAAGACTGCTAAAGCGTCAAAAGGCCGGATTTTCCGGCCTTTTTTATTACTGGGGA 0.9390000000000001 +TGCCGGCACGCTGCTTTCTTTACTAGAAAAAGGGACGGAAAACGGCCGCATTTATGATATTAAAGAGTTTTTGTAGAGCAGATATTTTCTGCTCTTTTTTTTATACAAAAG 0.9940000000000001 +GGGAAAAAACCGACAAAGCGCAGCTGCTGCGCGATGCTGCCGGTCTGAAGTAATCTTTCTTCACCTGCGTTCAAAGGCCAGCCTCGCGCTGGCCTTTTTCTTTTGGATAGG 0.993 +AGCTCATGTGACATTGGCGCGTACACGGCCGAAACTAATCGGCGGAGTGCGGGCGGTTTATGAGTGCGAATGAAAGGACTGCATAGCCAGTCTTTTCTTTTATTTTAAGAT 0.5660000000000001 +GTTTTACTTCGCGGTCAGTAATACGCCGCACTCCATATGGTGCGTGTAGGGGAACTGATCAAACAGAGCCAGACGTTCGACCTTGTGCGTCTGGCTTAATGTTTCCAGATT 0.34 +CGGGCCTTTGATCTGACAGAAGGAGATACGCCGATTGCCGAAGCATTGAAGAAACTATGACAGTACTGACACTCAGGGCTTTTTGCTCTTGAGTGTTTTTTTCTGTTTCTC 0.992 +CACGCCTTCTGTCCAGCTTCCGAGCAGCAAAGAAAACGGTGCCGATTCAAAAAAAGAAAAAAGCAGCAAATAAGCCGCCTGTTGAAGAGGCGGCTTTTTGTTACTTCTTTT 0.992 +TAAAGACAGGATTGATAAGCTCGAAGCAGCACGTAAAGCTGAAGATAAAATCTAATAACCAGAAAAAATACCAAGGGTTTTGACGCCCTTGGTATTTCTGTGATTCAGACC 0.848 +TTGTCGGTACCTCTGATTCATATCTTTAAATGGCTCGGATCATTAGCCCTGCACCTCATCCATTAAAAAAGCCACTCAGCTCATGCCGAGTGGTTTTTATTAGTTAATCAC 0.9690000000000001 +ACTCATCCTGACGATGAACCTCCGCAGCCACCACGCGGTGGTCGACCGGCATTACGCGTTGTGAAGTAATACAAAACAGGCCCAGGCGGCCTGTTTTGTCTTTTTAATGAC 0.8759999999999999 +ACGCAGGACGCGAAGCTGAAAAGCTGATCGCCAGCAACAAAAGACACAACGATTGATAAATATTCAAACAGTATCTCTAATTAAAGAGATACTGTTTTTTATTTTTGGGCT 0.998 +ATTATTCCCGAACTTGCGCATGTGGTCAGAGAAATCAGAAAAGACAAGTAAATGAATAAATACTATGAAACTTCACCTGAAATTTAGGTGAAGTTTTTTTTATAAAAGGCT 0.981 +CTTGGACGAAAAAGATAAAGATGCGTTTCTTACGCTCTCTAAAGAATATGCAGCGCTCAAGCAGAGCGAATAAGCCAATCCTTCAAGGATTGGCTTTTTATTATCCGTTCA 0.98 +GTCGTACATTTTCCCTGCGAAAAGGTGCGGAAAAGCGCGGTAAATAAGGAAAGAGAATTGACTCCGGAGTGTACAATTATTACAATCCGGCCTCTTTAATCACCCATGGCT 0.05 +ACCGTCTTTGTCACGCCTGATGCTGATACTGTCATCAAGCTCAAAAAAAGCTGAGCCGCATAAAGAAAAGCCAGAGCATTTGTATGCTCTGGCTTATTTCTTTCTTCTCAT 0.978 +CGAAAAGCAAATCGAGCCATAAAACAAGCGGAAGCGCCAACGGACAAAACGGCTACCATTAGCATAAAAGCCTCCTGACATGATGTCGGGAGGCTTTTTGATTAAGAAGAT 1.0 +GATGCGGCGTAAACGCCTTATCCGGCCTACGATTCCCATTATTTCAACAAATTACATTAAAGTAGGCCAGATAAGACGCGTCAGCGTCGCATCTGGCATTTGCACTGAATG 0.65 +GAATGAGTTGAGTTAGAGAATAGGGTAGCAGAGAATGAGTTTAGTTGAGCTGAGACATTATGTTTATTCTACCCAAAAGAAGTCTTTCTTTTGGGTTTATTTGTTATATAG 0.392 +AAGAATTGGAGGATGTTCTGTTACACTAGATGGCAAGGAATTGATCTGTTTCGAGCACCTTGTATGTAAGAGGAGGGGAAAGCATCGGCCCCTCCTTTTTTTGTATGCCTA 0.922 +ACATCTGCCGGAATTGACACAACGTCGATCTTCTCCCGGCCTTCCTATGTCACCACATATTTCGCTGTCGAATAGCTGCCGGCATGTCCGGCAGCTTATTTTATTGGGAGG 0.973 +TTAAACCTGACCAGTATCAGACACCCGTTGAACTGGATGACGAAGAAGACGATTAACGACTCTTCAATAACTGACAAGGCCCGAGTTATCGGGCCTATTTTTTACTTACTG 0.9009999999999999 +TGTTCAGGAAGCATTTCAAGAACAGCCGAAAAAGAAGGCAAGATCTGTATAATGCAGGTTAAAAATGCGCTTTTTTTCTTAGAAAAAAGCGCATTTTTACATTCAATATTT 0.93 +TCAGGTTATGCGTCTGCAAACAGAGATGGCTTAACCAAAGTGCTATGCAGTAAAAAGTGCTATGCAGTAATAAGACGGCTCCTGATTCAGGAGCCGTTGATGTTTCTGGGG 0.956 +TCATTACTGGACACTGCTGACTGAACTGCTGAAAGAAATTCCGGCGAAGTAATTATTTGATTTGCTGCCGGATGGCGTTTAATCGCCTTCCGGCAGTTTCATCCTTCATTA 0.8490000000000001 +GCAAGAATAAAAAAGATATATAATCATCGAATTGTTATTTTTTCAGATTGTCAGTGAAATAATTATACAAAAAGAGAAGGCTTGTGGGCCTTCTTTTTTTGTTGTTGCATC 0.875 +AGAATCTTCAAAGAAAACGAACGGCTGATTGACGAGTATAAACGAAAAAAAGCATGATCACACTAAAAGGACAAGGGAAAACAGCTCTTGTCCTTATTCCTTTTCTTAAAG 0.888 +CTCTCATTCACATTAAAGAAACCTCAGCGCCCTGCCGAACAGCAGCCGGCACGCTGATTAACAAAAAAAGAGCCCCGCTATTTAGCGGAGGCTCTTTTTGGTTTTACTTTT 1.0 +AATTAATAAATTGATACTAAATCGTTGATCATGCCTTCCGCTACTTAATAAGCTGTTGGGAAAGCAATCCCCTGATCTAGAGTGGTTCAGGGGATTTTTGTAATTTAAACG 0.971 +GCCGCGCCGTTTCCCATGTGGCCGCTTCCGTTGATTCGCTTGTCATCCTGACAGAAGAATAACCATCAAAAACCGGTCTGCCATACGGCCGGTTTTTTTGCGTTCATTATG 0.992 +AATGCCCCTTCTATTCGCGATGTGCTGTTATTCCCGCAAATGAGACAACGCTAATAAAAAAGAGCGGTATCCTCCATAGGGAAAGGATGCCGCTCTTTTTAAATCCCTTAG 0.982 +TTGCACATTTTTGTAGGCCGGATAAGGCGTTTACGCCGCATCCGGCAACATAAAGCGCAATTTGTCAGCAACGTGCTTCCCCGCCACCGGCGGGGTTTTTTTCTGCCTGGA 0.997 +TGGGAATCGGCAGTTTTTATGCTGAAACGCAGCTGGTGATTGATCCCCGTTAATCGATAGATTTGTAGTCAGAGACTGAGATGTTTTCAGTCTCTTTTTTTGTGGATTCAA 0.8640000000000001 +CCTGCGCCTGGCACAGGAACGTTATCCGGACGTTCAGTTCCACCAGACCCGCGAGCATTAATTCTTGCCTCCAGGGCGCGGTAGCCGCTGCGCCCTGTCAATTTCCCTTCC 0.813 +GCACCTGATGAACATCGCATCGATCAAGCCTGCCAAGCCCTCGAAGAAGCCGGCATTTTAATTAAAGACTAAAAAAAGCCCTCTAGTGGGCTTTTTTTATAATTGGTAAGA 0.914 +TTCAAATATGAAGATTGTGCCTGTTGATACCATGCAGGATGCGATTGATTATTTGAATAAGCTGAAAGCGAAAAGCACCTGATTCTCAGGTGCTTTTTCTATACATTTAAA 0.993 +TGATGCGTGACGAATGCTTTGCCATCCAGTACCATAGCGCCCTTTCCATTCCTGGACCTGAATAACACCACTACCTCATAAGCACGGTAGCGGGTGGTTATTGCCTGCAAT 0.441 +ACAAGAGCGGCTACCGTTATCACCCGGGCGAGCCAAGAACCTTCTTGCTCACAGCCAATATGCATTTCTGATTCAGATGTGGGGCGCAGGCCCCACTTTTTGGAGAAATTG 0.461 +GAAGAGATATAGAGAATGGCACGCCGATTACAGCATTTGATTTTTCAGCACTGGTGTAGATAGAAAAAGCCAATGGGATTCCAGTCCCATTGGCTTTTATAATGTATCACA 0.9690000000000001 +CGCATGGAAGAGGTCTGATTACGCTTCATCTTCCGCGTAATTTCTTTACTGTGTAGGCCAACAGGTAACGCAGAAAAAAGGCACCTTGCGGTGCCTTTCTTATCATTCAAT 0.9229999999999999 +GCAGTAACGATTGCCATGCAAAAAGGTATTCTGACAATAGACAACTAAACTGCATATTTGAAAATTGCCCAAACGTACATGCCCGAATGTACGTTTTTTTCATTTCATTGT 0.985 +GATAACAGTTCAATAATGAGAATAGAAGGAAAGAAGATAATTCTAACTTGTTGATGATTTAATTCTGAAAGCTAGGTGCCCTATTGGTACCTGGCTTTTTAAATAGCAACT 0.982 +AACGAAAGGCGAAACAAAAAATAGTGCTCCCGCCTGAGCTTGTAGTCAGACATTCTACATCACCGCTCAACACATGAGCCCGCTAATGAGCGGGTTTTTTCATTATGACAG 0.963 +TTTTAAAGGAATAAAAGCGCAAAAATCACTCGGATGATTCGCATGAGCTCTTCCTTCATTGGTCAACCTAATGAAAAAGGAGGAGCTTATGCGTTTTTTCTTATGTTCGAT 0.991 +ACGCTGAATTTGAAGAAGTCAAAGACAAAAAATAATCGCCCTATAAACGGGTAATTATACTGACACGGGCGAAGGGGAATTTCCTCTCCGCCCGTGCATTCATCTAGGGGC 0.7070000000000001 +CGTCGGTGCGCTAACTAAACACGTACAAGCACTCGACCTTTCAATGCGTTTTCGCTAATAGTTGACAGATTTATCCGCTCCATCGCGGGCGGATAATCTCCCCTTCCCCAA 0.843 +GTACGCCAGAAACCGTCGCGGAGTGCGAAGCATCACACACGGCACGCTTCCTTAAGCCGATGCTGTAATCGTTAAGGCCGCTTTCTGAGCGGCCTTTTCCTTTCAGAGTTG 0.825 +GTTTATTCGGGATCCGCGGAACCTGATCAGGCTAATACCTGCGAAGGGAACAAGAGTTAATCTGCTATCGCATCGCCCCTGCGGCGATCGTCTCTTGCTTCATCCGTCGTC 0.40299999999999997 +ATGATCTTTGGTATTTTGTAAAATCAAAAGAAAATCGCTGGATTCTTGGAAGTGGTCATTAATTAGAGGGGTACAAAGGAGAGTCTAACTCTCCTTTACCATTTGTTGGGG 0.892 +CAATGAAAAAACAAAGCTGCAAAAACGGCTGAACGCCGTCAAACCGAAAAAGTAACCAAAAACCTTTAAGATTTGCATTCCAAGTCTTAAAGGTTTTTTTCATTCTAAGAA 0.992 +AAACTGTCCGGTAGCGTAACCGTGGGTGAAACCCCGGTTATCCGCATCAAGAAGTAATTCTTGCCGCAGTGAAAAATGGCGCCCATCGGCGCCATTTTTTTATGCTTCCGC 0.99 +ACCTAATCGACCTTGACGCCATCCGCCAAAAACGCCGGATGCATCTTTCCTAATTCTATATTTCAAACGAAAAGGCTGCTGAATCACGCAGCCTTTTAAAATTGTTTTGAT 0.951 +AACAGTATGACCCACAGCGGGTACGTGCAGCGGCTTACTGGCACGCAAAGTAACTGATATATAAAAAGAAAAAGGCTGACGATTTCTCGTCAGCCTTTGCTATATCTGGTG 0.578 +TGGGGATTGACTTTTCGAATATTACTATTAAAACCAACCTTGCCCAAGCATTAAACTATCATCAATAAAAAAATCCGCTATCTGTGATAGCGGATTTTTTATGATGCGAAA 0.986 +AGTACACGTATTATAAAAGAGATGAAGAAGCCATTGGGAAACTTGCTGACTTTCTTAAAACAGAGATATAAAAATAAACATCAAAAGATGTTTATTTTTACACCATACATA 0.9990000000000001 +GATAAGCTGATTGACGGCGATTTTGTTGCTGAAGTGAACAGGCTTTATCAGATGAGAAGAAAGAAATAAGCCAGAGCATACAAATGCTCTGGCTTTTCTTTATGCGGCTCA 0.986 +AAAATTAGATAGAAACAATGAAGATAAAAATAGATTTTTTGGTTTTAGATTTTGACCAAATAATATTAAACTAGGGAGTTCTTATAGAACTTCCTTTTAACCTGTTTTAAA 0.937 +ATATCTCCAAAAAGCTGAAAGCTATGGGCGAAATGAAAAACGGCGAAGCGAAGTAATTCCCGTTTTATTCAATGAGGGTTGCCCGGCAACCCTCATTGCTCATTGATTCTT 0.867 +CTGGACTGGCCTGATTTCTACCGGGATCACCTACAAATTCTGATAATGCATCTTTCCAGGGCATTTTGTGCATAAAAACAGGGCGTCACGCCCTGTTTTGCATTACGGTGG 0.951 +ATTTGAAGTAATTAAGGTGGAATACCTGTAAGAATTACCCAATACTCAAGATGTTGATGTATTGTAAAGAAAGGAAAAAGGCCGCTATGCGGCCTTTTATCAACGAACAGA 0.38299999999999995 +CGGTATTTTATACAGGTGAAGCCAGTCCGTTTGTAACCGCTGTGCCGCCTCACTTATATCAGATTGCTGAATGAAGAGAAGGGAGTCCTTCTCTTTTTTCTATTTACATGA 0.988 +TAGATCGTCATGATAAAGAGGCATTTGAACAATTATCGCGCCAATTAAATCAGCTCACATAATATGACCAGCCTTGCAGTGAATTGCAAGGCTTGTTGCCGTAAAGGGGGA 0.7040000000000001 +CAATTTGTATTCGCAGTGTATATTTTGCTGATTTGCCACATTGTCGTCCTTTGTTTAGCGCTCTAAATCAAACAGCCAGAATAAAACTGGCTGTTTTCTTTAATTTCAGCT 0.9890000000000001 +GACGGCGCGAGACGAATTGCCCAGCTTCAGGATAATATTAAAAAAGCAGCATTGCAGCAATAATAAAAACCCCGCTTGTGGAACATAAGCGGGGTATTTCAATTACATCAT 0.965 +CGTCATGAAATTGAAGGACACGATATGGAAGAAATCATGAAAAATCTGACTGCTGCCTTCGATGCGCACTGCTAAATGCCCGTTCTCCGGGCATTTTCTTTTTGGGAGTAC 0.9109999999999999 +ACTCGCAATTATAATACGCTTGATTCTGTTCAGTACGAAATAACTGAATAATACGCACAGTAAAATATTCGAAAGCCACGGATATATCCGTGGCTTTTTATGTTAACCGCC 0.9790000000000001 +GTCCTTAACGATCTCAAATAATATGTTTAGCCTTTTGCCACAAAGGAAAAGACATTACTTTTTTAACTTCTACCGTTCATTTTTTGATGAACGGTTCTTTCATAAGACCTC 0.961 +TGTGACAGCTGTCATCGGCTCACTGGCGTTTTTTGGATTAACGAAAGAACGAGTAGGGCAGAATTCATAAAAAAACCTGACATGACGTCAGGTTTTTTGTTTTATTTAGAA 0.987 +GTTATCATCCGTTTCGCTGAAAAACATAACCCATAAAATGCTAGCTGTACCAGGAACCACCTCCTTAGCCTGTGTAATCTCCCTTACACGGGCTTATTTTTTACGCGTAAT 0.9590000000000001 +TGAGCTGGTCAGAGACACCGCCCGCGATATCAGCACGGCGTTGGGACTGAAAGCACATCCATAATGTCTGTCGCATCCCGCTCTGCGGAGCGGGTTTTTTTGACAAAATTT 0.995 +GGCATTGGCTTCGGTGCGCTGTTTGCGCTGGCAATAACGGCGCTGTGGATCTGGCAGCGCCGTCATTAATATTTAACGCCGGTTTTAACCGGCGTTAATCTTATGGTGCCG 0.862 +GATGGAAGTGGAGAGAATGAGTCTGCCACCTAGCGTATGAATCCATGAAAAATCGAGTGTTTTTAACAAAGTGAGGTTTCCTTAAGGGAAATCTCTTTTTTATTTCTCTTT 0.997 +ACCCGGCTTACAAAGCAAAACGCTATACTGATGAAAGCGAACAGTAATCGTACAGGGTAGTACAAATAAAAAAGGCACGTCAGATGACGTGCCTTTTTTCTTGTGAGCAGT 0.536 +GTTGGTTTATTGCGGACCATCAAACCCACTCACAGGAAATGAAAGTCGCTGAGAAAATGATTGGATAAATGAAAAACCCCCGCGGGATGCGGGGGTTCAATTTAACGAAAG 0.35700000000000004 +GCCGTGTTCCGCGTATGCTGAAATCCGGCTTGCTGTGGATGTATAAACACCATAACGGTTAATGATAGAGAAGAACCGCTGATCAAGTCAGCGGTTTTCTGTATACCCATG 0.828 +CTAGCTGCTCAAGGTGAAGAAAACAGCGGAACACAGAGTATATATAAGCTTTGGGTGAAATAAAAAAAGATTGCCTGTACGAAACAGGCAATCTTTTTTTATTATTTTCCG 1.0 +ATTCTGCTTAACAAGCTAAAATCATGCAACTAGCCTGTAAAATTTCCGCTGTATAGTAACAATAAAAAACACCCCTTCACGTCAATGTGAGGGGTTATTTGTGTGTTAACA 0.986 +AAAAATACCGCAGCTCGATACAAATCAAGACTAGCGAAAAAAGTGAACGGACTTTCTGCATAATATAAAAACGATCCTCAATAATGGGGATCGTTTTTTTCATTTCTTTTT 0.9990000000000001 +CTACTTGAAAGAGAATGGAACATGGTCTAAGGCCTATAAGACCATTAAAGAGATTAAAGATTGGATTAAATAATAAGAGCATCCTGCGGGGTGCTTTTTTTGTTCCCTGTA 0.998 +AATTACAGAAGTCATTAAGAAGTTAGACCGAAAAACGGTTGACGAAATTACGTACCAATAAAGATACGGAAGAATCCGCCTATTGAGGTGGATTCTATTTTTTTGTCTGTA 1.0 +AGCGTATCGGAAAAACAGATCTCTTTCTGTCACCATGTTCCTCGTTACACGTCTGATTGGCAAAAGCAAAAAAGACTCCGTCTAATAAGACGGAGTCTTTTTTTATTTCGT 0.991 +TACCTGCGTGAACTTCGCGGAAAAGCGGCTCGTATTAAAGAGATCAGACGATAATGATAACGAACGAAAAGAGCTTGTTACCTGTAACAAGCTCTTTTTTTATACAGAATT 0.996 +CTGTCTCTCACTATTTGACGCACTGGCTGGACTATCCACATCTACCTTATTCCCCCGAATAACGAGATCCCTTCCAGCACCGGGCAATTGCCCGGTTTTTTTTGCGTTGAA 0.998 +AATTACCGTTTTCGATGATTTCTGGAAAAAGTAATAATTTGCACATATTGGATTGTGCGAAAAAGAGTAATTTGTTCACGCCGGATGCGGCGTGAACGCCTTATTCGACCT 0.588 +AAATGTTTGAAGAAAAACTTGATGAAATGGAAGAGCTGCTGATGAAAGTCAAGCAGTCTTAAAAAAGTAAGCCTGTGCGGAAATGACGCACAGGCTTTTTTAAAACCCTTT 0.9540000000000001 +TCCGCCCATTTAGCAGAAGGCCAAGAAGTCCAAGTCGGAGAACTGATAGGTTACGAAGAATAAAAAGAGGAGCTTGCATAAACGCAGCGCCTCTTTTTTTGAAGAAAGGTG 0.81 +GATCCGGACGGTTACAAAATTGAGTTAATCGAAGAGAAAGACGCCGGTCGCGGTCTGGGCAACTAATCTCCTGCCGGGCGTGAACTCATCGCGCCCGCATCTTTACTGCAT 0.6559999999999999 +GATGATCCGGATTCTTATTATTATGAGGATGATCCGGTGTTTGAATAATGATGAGACAGGACAGGGCATGAGTTCCATATAAACGGAACCATGCCTATTTTATTTGACTAT 0.91 +CACTACATCAATAAACAAAAGCAGGTGCGCGAACAAACAGCCGCTTCAAAAGAAGAATAAAACAAAACAAGGAGGGAGGTTCAATGTCCTTCCTTTTTTGTTACCGAAATA 0.833 +AAAAGTGAAAAAGAAGCAAGCTGGATCCAAAAGCAGCAATCTATCACTTTAAATATGCATCAAAAAGGGTACATCACTTTATCGTGATGTACCCTTTTTTATTATCCTTCT 0.978 +CGAAGGTTCGCCGACAGATTAATGGTAAAGCTGAATCATATCCGCATCAAATAAAGCTTCTGTACACGACAAAAGCCCGCTCCGAGAGCGGGCTTTCTTCAACTTATTTCC 0.9740000000000001 +GCGTGACAAAAAAACGACGCTTGAGAGCAAAATAGAAAGTGTCAATAAACATTTACAGCCATAAAAATAAACCTGTATCCGATCGGATACAGGTTTTAATGTGCGGTTGGA 0.993 +CACGTCGGTGGCGATGAGCTGGATAAACTGCTGGCGGGGAAAGATAGCGACAAATAATTCACCAGACAAATCCCAATAACTTAATTATTGGGATTTGTTATATATAACTTT 0.86 +CGCACTGGAATCCGTTCTGCTGGGCGATAAAGAGTAATTGTGTTTCGCTGCAAGGCGATTGCCTTGTGAAGCCGGAGCGGGAGACTGCTCCGGCTTTTTAGTATCTATTCA 0.986 +GGATCATCGATGTAGGTCGGATAAGGCACTCGCCGCATCCGGCAAGATAAATCGCACGTTGTCAGCAACTGTAACGCAGAAGGTTATCCTTCTGCGTTTTTGTTTAATTAG 0.987 +GGGCTTGTAGATGATAACGAAGCCGCTGTTATTGCCAAATGGCTGTCAGAAAAAAAGTAAGATAAAAACATCCTTGCCGAGTGCTGGCAAGGATGTTTTTATGCCTTGACA 0.992 +GCGTGGGTCGTCTGTATTGTTGTCATTTCCTTTTTGATGCTGTTTTTCATTCAACAGAAAAAAACGCACTAAAAAAACTCCCCATAACGGGGAGTTTTTTGATTAAAACAG 1.0 +ACGGAGGCATTCCTCCTTTCCAAGAAACGCAAAACTATGTAAAAAAAATAACTTCAGTTTATTACGCGTAACCAGAGGTGCTCTTTCCCTCTGGTTTTTTTGCGAAGTGAA 0.99 +GCAACACAACCGGGATTACGGCGCACCACAAGCTGCCGCTGCCCGAGCCTGTGATGCTGAACAGTGCAACCTTGAGCGAGAAATAAGCTCAAGGTTTTTTTATATTGTGAT 0.98 +TGCGGTGAAACGTCACAGTCAGGTGAGCGGCGGTCGTCGTTCTGGCGGACGTAATAATAACGGTTAATCAAAGTATGCCGGACGTCATATCCGGCATTTTTACAGATTAAT 0.5820000000000001 +AGATCGCAGCGAAAGCAGCGGCGGTTGCAAACCAGGGTAAAGCGAAGTAAACGTCATTCGTTTAAAATGAGAAAGCCGACTGCAAGTGAGTCGGCTTTTTTGTTGCTAACA 0.888 +ATATGCCTTGGATAATTACACAGAAGTCAAAAGTGTATGGGTAAACCTTGAAGACTAACATGTATGTTTAAAAAACTGCTGCCTGCCAGCAGTTTTTTTCTTTCTGTAATG 0.9840000000000001 +CGATTCCTGAATTCGAAAAATGGCTGAATGAACTGAAGCCAATGGTGAAAGTCAACGCTTAATTGAACAATCCAAAAGGCCGCGCCTGCGGCCTTTTTTTATGCTTTCTCG 0.996 +AGAAGCGCTTTTAAAAATGATGAAATAAAATAAATTGGCCCCGGCTATTACTATGTCCCCTCTTACAAGCATACATTGTGATATGTAAGGGGGGATTTTCTTGCCAAGATA 0.7979999999999999 +CCTGATACGCTGCGCTTATCAGGCCTACAAGTTCAGCGATCTACATTAGCCGCATCCGGCATGAACAAAGCGCAGGAACAAGCGTCGCATCATGCCTCTTTGACCCACAGC 0.405 +AAACGATTCAAGGACATCACTGAGTCAGTCCTTTATACATTGCACGCAGTAAAGGATGAAATTGCAAGAGAAGACAGCCGGTAAAAGGCTGTCTTCTTTTCGTTTTTATAG 0.961 +AAGAAGGGAAAGGCACAAAGTTCATCATCCGCCTTCCTTTAACGGCAAAACAGCAATAGCATACAGGGCGGCGCATCAACTGATGCATCCGCCTTTTTTGCACACCGCTTC 0.738 +CGCAGTATTCGACAAAGTAGCGTTCACCGCTCTGGTTGAAAAAGCGAAAGCAGCTCTGGCATAAGCCAGTTGAAAGAGGGAGCTAGTCTCCCTCTTTTCGTTTCAACGCCA 0.915 +CCTTGATCAGACACAAACGAAAAAAGGTAAGTGACTGTCCCTATTGAATCTGGCTGATAGATGAATTGAGAAATGACCCCGGCGTTTCGCCGGGGTTTTGTGCTGCATAAG 0.9990000000000001 +GAATGCAGGCAAGCCTGAAGAGTGCTAAGCACTAAAAACGGATAGCAACTATTACTGACGCTTTTCTTAGCGATAAATCATCGTTAAGGGAGGCGTTTTTATTATGGGGAA 0.9990000000000001 +AGATTGAAGGCTACGATCCGCATCCGGGCATTAAAGCGCCGGTGGCTATCTAATTACGAAACATCCTGCCAGAGCCGACGCCAGTGTGCGTCGGTTTTTTTACCCTCCGTT 0.8859999999999999 +GCGCGCAGAATACTTCCGCGCAACAGGACAGCGAAGAAACCGAATAAGGTTTCGGGCTGTTTTTTTACACGGGGAGCCAGCGATCCTGCGTTCCCCGCTGATCTATTTAGA 0.47 +CGAGAAACAGTTAATTAAGACTGGTGACATCGTCGAATATCTGGTGAAACAGATTAAAGGCTGATGCCAGAAAGGGTCCTGAATTTCAGGGCCCTTTTTTTACATGGATTG 0.7909999999999999 +TGAGCTTTTGCCTTCTTCCGCCAATGTTTTCGCCGCAGAAGCGCACGAAACAAAAAAAGCACCACATGTAAAAAAGCTGCCTTTGCGGGCAGCTTTTTTTATTTTTTTCTA 1.0 +GTGCCGGCTGAAATGTCAGATTTGCACGCCGACACTTATATTATTGATGCTGTTTCTGTGTAATGAAAAACCCCTGAGAGAATAACTCTCAGGGACTCTTTATAAACTTTC 0.8540000000000001 +CGCCACTATGACAAACGTTGCCGCCGTAGCGATAATCGCCGTCATAGCTAACAAATAATGCAGTTTGCTGACCAGTCAGTTTGCGCTGACTGGTCAATCTCGTACTTATAA 0.764 +TATCCCTGCTTTTCAAACTGAAAAACTTGCTTTCTTTAAGCAGCATCTTAAAGGCTGATAAATGTGAAAAGCCGCCGCATATCATCAGGCGGTTTTTTTCTGCAAACTGCC 0.344 +TGGCACAACTGAAATTAGATCCTCGTATTTTCATCAAAACAGCGGGATTATGGTCAACAAATCCTTAAAAATGAAAACCTGTCTTTCGACAGGTTTTTTTATTTGAATGAA 0.9890000000000001 +CTGGTCATACGCGTAACGTTGCCTGATTTTTCCGCATAACTCACTATCCTTCTGTCATATCATTAAATTTTTCTGATTTATTGATCTGGCAGAAGGTTCATCACTGTTTCA 0.77 +GGTTCAAGTCATTATTGGCCCGAATGTGGAATTTGCAGCTGAGGAATTGAGGGCTGCTGTCAAATAAAAAAGCGGAGAGGGCAACCTCTCCGCTTTTTCTTATTTATCTTC 0.998 +TCGGCATTCCCGACTGCATTGATCAAGGAAAAGCTGCCGTGTCTGACGCGCTTACCTATTTATTCAGCTAAAACCTCCGCTTTATCGCGGAGGTTTTTTTGATGTGCAATG 0.985 +GTATGAGTTCAAAAAACGGTCGTCTAGTTTTAGCACGCCGTCGCCGCAAAGGCAGAAAAGTATTATCAGCTTAGGCCACTGAATAATGTCAGTGGTCTTTTTTCACGTTAT 0.818 +GATTCATTACGAAACATTAACAAAAGCATTGATCAACTCGATAAATACAATTATGCAATGAAAATTTCGTGAAAAAGACTTGGAAACAAGTCTTTTTTTTCGTTCTACCGA 1.0 +AAATTGGCGATGCGCTGGAACAGCAATAATCCTGAAGCCGCCTCGCAAAAAAATGCGTTGCAGCTGTAACAAGAAAAGGTCGCTTTCGCGGCCTTTTCCATTTCTGAACAT 0.8540000000000001 +AGTGGCTGAAAACATTAAGGAAAGCATAAAAGAGCTTGTCGAAGAACTCAGCATGTTCGCAAAAGCAGGAAATCCCGGCGTCTAAACGCCGGGATTTGTTTATGTGAGAAG 0.647 +CTGAGCAAATCATCAGCGAAATCCAAAACCAGCTTCAAAACCTTAAATAATCATTTCTTGTACAAAAAACGTTCTTGTTATGACACAAGAACGTTTTTTTATTGCTTAATA 1.0 +GGGCATCAGTCTTAATTAGTGCCACTGCTTTTCTTTGATGTCCCCATTTTGTGGAGCCCATCAACCCCGCCATTTCGGTTCAAGGTTGATGGGTTTTTTGTTATCTAAAAC 0.982 +GGCCGAAGCGATTCGTCGTATCAGCAACGAAGAATCGATCTCTGCCATGTTCGAACACTAATCGAACCCGGCTCAAAGACCCGCTGCGGCGGGTTTTTTTGTCTGTAATAT 0.852 +CTCGAGGTATTAACGAATGTATTTCCAAGCCTCCTGTCTAACATGAAGCTTGGAAACAATTGATGCGATAACCCTGTCCAACAAAACAGGGTTATCGTTATGTTATAGAGA 0.7809999999999999 +ACACGTCCTGTTCTGGAGGGGAGTTTCAGACACTCCGTTTCTTGCCTGAAAGTCGATCCGCTTTAAAAACAATAAGGGCTGACAGTTGTCAGCCCTTTTTCACGCTAAAAG 0.888 +CAGCGCACTCGGAACGGCAATGTACTTACATCAGAGGAGAAAACAGAATAGAGCGAATCAGGCATAAAAAAGAGCTGCTCGCTATGCGAGCAGCTTTCTTCTTTTAAACCC 0.968 +CAGCTTTCTGGAATGTTGTAAACTGGGACGAAGTGGCTCGTCTATACAGCGAAGCAAAATAATGGCACAAACAAGGTCCTCATTATGGGACCTTGTTTTTTTATGTCTTCT 0.9990000000000001 +CAAGAGCTTCCGTAATCCGGTTCCGCAGTACGAAGACGTAGCATAATCATCTTTTCTAATAATAAACCGGAGTCGAATGAACATTTGGCTCCGGTTTTTTTATTCTTTTGT 0.9670000000000001 +TGGATGATATGGCAGAGTGGGTGTCAAAAATTAAATAAAAGAACTATTTTTCTCTTTATACATTCACAGGCCGCCGAGATGTTCTCGACAGCCTTTTATTTTTGACTGCTG 0.799 +TGCTTCGGTTTGCGACAACCGGCTTGTGGGGACTTTATCGCATGATTGCGTTTTAATGTGACAGTTTGATAGGGGCTTGGCAGCAGCCAAGCCTCTTTACATAGAAGGGAA 0.74 +GCGGCTGACATGCTTCGCCGCCACCAAATTAAAGGAATTGAGCGTATGCTTGAAAAGCTGAACGCTAGAGAGATCGTTTAGATCCTCTCTAGCGTTTTGTATTGCTTTTTT 0.975 +ACTTCCTCGATTATTGCCGCCGACTGTTTAAAGTCAGCCGTTACCAGCAGGTAAAAGCCTGACTCTTCTTTTCGCGACTGGCATCGCCAGTCGCGTCTTTAACTGGCTTCG 0.608 +CGGGGATTGTGCTGGGGCGTAATGCTCTACGCAATTACAACGCGAAGTAATCTTTTCGGTTTTAAAGAAAAAGGGCAGGGTGGTGACACCTTGCCCGTTTTTTTGCCGGAT 0.856 +TCAGCAAACAGGTGGATATCATCACCGACATGCTGATCAAACGGGCCGAAGAAAATAAAAAATAAAAAAACGGCACAGTCATGACGCTGTGCCGTTTTTTATGATTCACTT 0.986 +TAAACGATTTGCCTGAATGGCTGCGTTAAAAATTTCTCCTCTGTTGTTTATTTGATACCCATCACACTTTCATCTCCCGGTTTTTTCGCCGGGAGATTTTCCTCATTTGAA 0.941 +CAGGCAATGAAAGCTATTCCGGGTACCATTCGCGCCCGTCTGCTGTACTAATTCCCCTTCTCTGAAAATCAACGGGCAGGTCACTGACTTGCCCGTTTTTTTATCCCTTCT 0.976 +GCAAAACAGCCAACCGCTCAAGATTTTTGTATCCCCTGCTACAGAAAGCAAGACAGCCCTTGCATCATTAACAGCCCGGCAGATTACTGCCGGGTTTTTACTGTAAAAAAA 0.963 +GAGCTTCACCATTGAAGGGCTGGCGGTTGGGGTTATTCGCAACGGCGACTGGCTGTAACATATCTCTGAGACCGCGATGCCGCCTGGCGTCGCGGTTTGTTTTTCATCTCT 0.474 +TCGGTTTATTTAAGGGAGAAAATAAAGGGAAACAGCTGATCAAGGTCAGTGATCCGAGCTGAATGAAGATGAAAACGGGAGGGCTTTTTGCCCTCCTTTTGTGTTTCGATG 0.9740000000000001 +CACTTTTGCCTGCGAAGCTGAAATGCTGATTTTCCAGCTGGGTCTGCGTTAATTCCTCTCGGGTACGTACGCCCCTGTCAGTTGCTGGCAGGGGCGTTTTCTTTTCTCTTC 0.968 +GCCTTGGCGGCGGGACAATTGATGATGTGTACAAAGACGGAACAAAATTGTGGTACGTATAAAAGATGAAAAACCTCAACACTGAGTTGGGGTTTTTTAATGAAAGGAAGA 0.764 +CTGAACGACGAAAACTCGATTGAAAATATTGTAGATCAAATTGCTTCAGTTATTCATGATAATCAAAAAAAGTGAATCTCAGTCGAGATTCACTTTTTCTTTAAAATAAAA 0.978 +ATTCTGATTTAGAAGCGGCTGCTGCAAAAACCCTAAAATTGATTCATCAATAATCCATTATAATTGAGTGTCTTGCGGATTGCTCCGCAAGACACTTTTTCAACGTGAACC 0.9940000000000001 +AACATCAGCATTTCTTTTAAGATGTTCGCCTTTTCATTTAAAATCAAAAAGAACCAAAAATAAAGAAAATCCATTTCTCGCATGAGAAATGGATTTTTTATTATTGCCCAG 0.9790000000000001 +GAAAAACGGTATATGCCTGAAGAGCTGAGCCATCAGCCTAAAGAAGTCATCTTTGTGTAACAGACAACAAGGCTTTCGAGTAGTCGAAAGCCTTGTTGTTATCCTTTTATT 0.621 +TCATAGTCGAACTATGTAGTATGTATTTTGGGCTCTTCAAAAACGAAGAGAGAACATAGTACGTTATGCTCCCTTTCAAGAAATTGAAAGGGAGCTTTCTTATTTTCACCC 0.924 +ATACAGGACCTCTTCCCCTCAGCAAGAGGAAGCGCTGCCAATCTCAGTATTGCAAGAAAAAGAATAAAGCGTAACAGGAGGCTGATGATCAGCCTCTTTTTGTTTGCAGCA 0.973 +AGAAAAAATGGCAGCTGCCTGCCGCAGAGGCTTTTGCGGAGCTTGTTTTACATTCGATTCATGAGCAAAAGGAGTGAGTGTCAGGCTCACTCCTTTTCTATTTAAATTGCG 0.871 +ATGGTCAACTTCCTGTTCGAGGGTAAAGAGGTGCATATCGAGGGCTATACGCCGGAAGATAAAAAATAAAAACAGTGCCGGAGCACGCCTCCGGCAACTTGCATAAAAACA 0.695 +GTAAAACAAGATGATTCAGTATTAATCATGCAGCAACACAAGGATAATTAAATATGCGTTATAAATAAGGGAGCGGTAAGGCAATTATCGTTCCTTATTTTTCGATACAAA 0.9570000000000001 +TTGATAAGAAAAAAGGCTTCTCCGCCTTGGTTTTTGTCAACGAAGCAGAAGAATAAAAATGTCTTAATCAAACCTTACTCCGCGCGGGTAAGGTTTTTTTAATGGTTTCTC 0.995 +GGCGGAAAACCGTCTTCATGTGCAAAAAGCACTGCTGAAGGCCATTTTATATAAAGGGGAATCATCAAAAAACTGCTGAGCCAAACTCAGCAGTTTTTTTGATGGCAACAT 0.971 +AAACGGAAGAAGTCGCAGAACGGATTTTTGCGATCTTGAATCAGCATCCGATCATTCAACCGTGATCAAAAGCGGACAGCTTCGGCTGTTCCGCTTTTTTTGTGTTGAATG 0.9840000000000001 +CTTTGCTGGCGAAAGATAAGAAGTTACATCACCTTACGTCTAAACATCGCGTACGCCGCTGACCCTGTCGTGGCCGTTATCACCAGTAGCGGCCACAAACTATTCCACACA 0.61 +CTGATCCTCTTTTCTTCAGAAACGAAAAAGGGAAAAGACGAAGCTTGGGGAGCGATCAAAAAAATGATAAACCGGTAGAGGATCCTCTACCGGTTTTATTTATCGTATAGA 0.9840000000000001 +CCTCATGGCAGGGGCAGATGCATTGTCCAGGAAGCGGCTTTCACATCACATGGATAAATAACGCAGCGAAGGAGCTTTCAATTGAGAAAGCTCCTTGCTGTTTTTTCGAAA 0.8290000000000001 +GGCCACAACCGCTGATTAGCATTGTGCGTTTGGCAATGCCGCTGATGTAATCTGTTACTTTGTAAGTCAGAAAGCCGCCGACATGCTCGGCGGTTTTTCTGAAAAAGAAGG 0.626 +GACCTACAGGAATACGGACATGCACCGATTCGATATGACGAAGATGAACAACACTTTTTAAATGTATAGAAAAAGCACCTGAGAATCAGGTGCTTTTCGCTTTCAGCTTAT 0.986 +TCCGTTATCCTGTCGGTCTTCCATTTGCTGATTCAAAAAGGCATTTTAGAGCCGCTTAGAAAAAAATAAAAAAAGCTGCCCGCAAAGGCAGCTTTTTTACATGTGGTGCTT 0.98 +AATTGTCCTATGTTTATTATTTATGACAGCACCCCGTCTGCTTAAAAGTGACAAAGCGTAAAAGTAACAATCCCCCAGCGAATTCCGCTGGGGGATTTTTCGTTATCTGTT 0.9940000000000001 +CCTGTACTATCCGCACATCCGTGAAATGCTTGAAAAACAAGGATTTAAAGTTGAAAAATAAGCTCTTATTCGGCCTGTCGGATTTTCCGGCAGGCCTTTCATTTACCCGGT 0.992 +GAGGATGGTTCGGAGTGGGCAAAGCGCTCCCTTCATAATCTGGACATTTTAGAAAAAGAATATTTTAACCGGCCATAACCCAGTCTGACAAAAGATTGGGTTTTTTACGTC 0.84 +GCGCAAGCTGAAAAAGTAGCGAAAATCATCGCCGCAAACAGCAGCAATACATAACAGAAACCTGAAACACAAAACGGCAGCCCTTGAGCTGCCGTTTTTTTATTCTGTCAG 0.9990000000000001 +AACGCGGAGGATAACGAAATCATCGCGGAACAGGAAAAAAGACAAAATCAATCTCATATCGAATCATAAAAAATCCGGCGACTGCACGCCGGATTTTTTTGATATCACCCT 0.992 +GCTCATTCGATATCTATCTCCTGTTGCCATCATCATCGTGTTTTTGCATGTGATAGGTATTTTTTAACACACGAAGCCCCCGAGGAAAGGGGGCTTTTTTTAATAGAGCTT 0.963 +CCATCCTCATCGGCCTTGCCGGACTGCCTGTTTATTACGGGATGAAAAAACGAAAAGCAAGCTGATAAAACGGTTCCCTTGTTTAGGGAACCGTTTTTTGGTGAAACAGAA 0.9940000000000001 +TGAGTGAGCTTGAGAAAGAATACAAGTCCCTTCAGAAAATGAAAAAAGAGATCTGACAATAGATATAAAGAGGTGAGTCTGCTGAAAAGCGGGCTTTTTTGTTTTGATTTT 0.97 +TGTCTGCGGCTGCCATTATTTTATCTGTAGCAGCACTTGTAAAGAAAAAGCGTGCCTAATAAAGAAAGCGATCCAGATGTCATGTCTGGGTCGCTTTTTATGTTTAACCGC 0.9990000000000001 +GCAACAAACCACCCGAACTATGAAAAGCCAATTGAGGCTGCTAAGGCATTAGTAAAGTAAGCAGGGAAAAAAGCTCCAGGCTGCCGCCGGAGCTTTTTTCATGGCAAGGAG 0.812 +CTGAAGTCATCATTCGCGATATTCGCACTGAAAAGTTCGGCAGGCTGTCATTTGAACAGCCGACGATGTAAAGGCGCGCAGAAATGCGGGTCTTTATTTTTTCTTGGCTGA 0.912 +TTTGTCGGATTAATTGAAGAAGTTGAAAAACAAGTGTGGATGCTTTCCTCTTATTTAGGGTAACAAAAAAGCTGAACCTTAATCGGGTTCAGCTTTTTGTTTTTTCTTAGC 0.996 +TGTTGTTCGCGGCGTTGACCGGCAGCCTGGAAAACCATCCGGATGGCTTTAATTTTAAATAATTAGCGGATAAAGAAACGGAGCCTTTCGGCTCCGTTATTCATTTACGCG 0.965 +TTTGCAGAAGAAAAATGGCGGATAAAAGACTATAATATCAACAGCCACTTATCCGGCTTTATCAAATAAGAAAAAGACAGGCGTTTGCCTGTCTTTTCTTTTATTTCTTAG 0.968 +AATTCACCTGAGCTCATTGAATCCCATTAAGTACCACTGCCATATCGCTTTATATACCACCTCCTCTTAGCTAATATGTTCTAAGTAGGAGGTGGTATTTTGTTTGTATCT 0.9990000000000001 +AACGATTCGTTATCAGTGCAGGAAAATGCCTGTTAGCGTAAAAGCAAAACACAAATCTATCCATGCAAGCATTCACCGCCGGTTTACTGGCGGTTTTTTTTCGCCGTCATA 0.998 +CAAATGGCCGGAAAAGCGGAAAAAAGCAACACCGATTAAATGCTCTGGATAAGGATTATCCAATTCTAAAAAAAAAGCCCGGACGACTGTTCGGGCTTGTCTTTTTATATG 0.9440000000000001 +TAATCGCGGGAACATATGTCACTTATGCGACACCAGGTGAAGAGGAACTTAATGCTGACGAAGAGATTAGAAAGAGCGAATAATGGTTCGCTCTTTTTATTTTTATGCATA 0.992 +GGAAAATCCGAACGCACAAGCGATTCAGGATGATTTAAAAGATATTGATTCTGAATAAGAAACAAATAAAAAAGGTGAGTCGCAAAACTCACCTTTTTTGTTGTTATCCCT 0.956 +ACATCATTGACCAACCTGTACGAACCAGGGGCCGTCAGCGAGTTCTATATAGAAATAACGGAAATAGACAAAAACGCCGATTCATGATCGGTGTTTTTTATGTAAAAAAGC 0.895 +CCGACGGTTTACTACCCGCCAGCAGCAAACATATTCACGAAAACGCGGAATAAACACCGTCGTCAGAAATGCAAACAGGAGCCTGATGGCTCCTGTTTTTCATTGCACAGT 0.9620000000000001 +CGTTGCAGACGAAGATGAGAAAGAACATTTGGCACAGCTTCTGGTCAAATTGGTTGATAAACAATAAAACATGAGCCGGGCGATATGCGCCCGGCTTTCTTGATATTGTCA 0.7979999999999999 +GCATAAGCTAAAAGAAGAAGGATACGAAAGCTATCTGCATCTCTTTAATAAATAAACTGTTAAAAACAGGCCTGCCTAATGGATAGACAGGCTTGTTTTTATGCCTTTTTC 0.9670000000000001 +GAAAGCAGAAACTGTTATGTCCCAATCGAAAACAGAACAAAACGCAACCATTTCTGGTTAATGAAAAGCCTGCGGGGTTCTTGCCCCGCAGGCTTTTTTAATCTTTAACAA 1.0 +TGTATTAACATACGGGTTTATCAATCAGCAAAATGACCCGCCATTTATTGTCGCCATCGAAAAAAAAGCTCAGATCAGCAAATGATTTGAGCTTTTTTCTTTGAATTTTTC 0.909 +TAAGCCGATTCTGTTCAAATGGCTGAGAGGAATGTCGAAGAATAAGCGGATCATCATCAAATAAAAAACGGCAGCCATGAAAAAACGGCTGCCGTTTTATTTTTGCTGAAC 0.8140000000000001 +GGACCAGCTGATCGTCCTGCTCAAGAAAGCAGGCATACACAGCCAGCATTTGCTGTTTCGTTAACTATTCTACACTCTCTATTTCAGGAGAGTGTTTTTTTGTTTCCATAT 0.9620000000000001 +TGGGGAACCTTTTTGGGATTGGCGCTACTATACCGCAGAAAATTATTATAGAACAAGATAGTGTGTGATGTATAAGACAGTCATTTTGGCTGTCTTTTTCTGTTATTCAAT 0.982 +GGAGAAATATATCAGGACATCGCAAAAAAAATAGATGCAAAAATGTCAGTGCAGGTATAAAAGGTGAACCGGGATTCCAGATCCCGGCTTTCCCTTTATTCACTTTCTTTT 0.706 +GCAGATTGAGAAAGAAGCGGTTGAGATTGTTAGCGAAGTTCTGAAAAACGCGTAAACACATTTAATAAAAAAAGGGCGGTCGCAAGATCGCCCTTTTTTACGTATGACAAA 0.818 +AAAGGACGGAGCAGTTGCTCTGACCCAATATCCAAAGGATTTGATCATACTGTAATAGAAATAAAAAAGGACTTTGTTCTCGACATCGTCCTTTTTTATCTGCATTAAATA 0.9470000000000001 +ACCCGAAACTGCAGGGTATTGCCCAACAGAACAGCTTTAAACATACCTGATAACGTTTAACGGTAACGGTGTCCCGAAACGAACCCGTTTCGGGACAATTTCCAAAGTCTG 0.16 +TATACGGTAAATGACAAACAGGAACCTGTTTTTTTAGAAAATGATATGAAAAAGGAATGAACCTTTCTCCCTTGCATACAAATAGGGAGAAAGGTTTTTTTATATTAATAG 0.981 +AATCTCCAAGTCTACGAGGACCTTACTGATTTCGGCTGCTGGTTCATCCCCGTCACAGATGAACACTAAGAGAGAGGGTCTCCCCTCTACTCGTGTTCTTTACATAATGTT 0.41700000000000004 +TGAAGATTTAAAAAATCGTCGTTATCAGCGCCTGATGAGCTACGGTTACGCGTAATTCGCAAAAGTTCTGAAAAAGGGTCACTTCGGTGGCCCTTTTTTATCGCCACGGTT 0.934 +ACCGTTCACGTTGATATGCTGATTGTTCCGCTGCGCGACGAAGAAGAATAATCTTCCCTCTACGACGTGTTCCTGAACGCCCGCATATGCGGGCGTTTTGCTTTTTGGCGC 0.9990000000000001 +CTTTTATACACATTGGAGCGCGAATTGTCTGCTTTGCATGACGCGTTTACAGCTAAATAAATGATAAAACTCAAACTTATTAACAGTTTGGGTTTTTTTATAACCGCTATT 0.966 +TCATCCGCTCGTCATCGTTTTCGCCATTCTGTTTTTCATCCAGCTGTTTATTTTATAAGGATAGACCAAAAAACCCAAACGTCGCCGTTTGGGTTTTTTATGTAAACAATA 0.991 +GAATTCGCGAAAAACATGTCTAATGGCGATGTCATGAAAGAAGTTCGTGTAGAAGGCTAATCGGCAGGCAAAAGGCTTCCTCTTAGAGAGGAAGCTTTTTTTATTGGCCAA 0.997 +ATGCACGATCGAGTAGGCGGGATAAGGTGTTTACGCCGCATCCGGCATGGAAAACGCGTACTTTGTTATCAATCTGGGGCCAGCAAATGCTGGCCTGATTTTTTCTTGAGG 0.68 +GATTCGAACAGCTGTATTTGCAGCTGGCGTACAGTCTTCTAAGAAAGTTCAATTTGAAATATTTGACTAAAAGGGAGAGGGCCGGCACCTCCCCTTTTTTTACACGCAAAA 0.973 +TTTAATGAACAAACCGAAAAATATAATAAAGCCCGGGAAGATTTATATGATGCTGCAGGCTATCGTGTGAAAAAGAGCTGACTCGATCAGCTCTTTTTTTGAATGAAATGC 0.937 +CGCCGCTGGATAAACGAATTGAAGCTCTGCGTACGGGTGAATACCTGAAGTAATATGAAGTAAATACTAAAAAGCGCGTCCTGATCGACGCGCTTTTTTTATGCCCTGGAT 0.986 +ACCATTTGCAGGGCTTCTTTTTCAGCAGGCCAATACCTCCGGAACAATTCGAACAATTCATTATTGAACAGCCGTCGCAATAACAGCGCCGGCTTTTTTTCACCCATAAGT 0.8590000000000001 +CGGAACACTTTATATTGGAAGTTATACGAAGCTGATGGCTTTTGGGAAATAACTGAATAGAATAGCAGAAGACCTTCTTTTAATAAGAAGGTCTTCTTACTGTGATACAAA 0.982 +GGCCACACTGATAAAATTGTCATCGCGGATGCCGGACTGCCGGTTCCTGACGGCGTTTTGAAAATTGATCTTTCACTGAAGCCGGGCCTTCCGGCTTTCCAAGATACAGCG 0.561 +CTGTTCGCAATGTACCAGGATAACCAACGGTTAATCGCGGCGCATATGTAATTGCATTTCCTTGAGCCTTATCCGACTTGTCAGTCGGATAAGGCTTTTTACTTTGTCTCA 0.606 +GATGAACTAGAAACGAAAGCTGACAGTAAAGAATCGGCGCAAAAATAGAGTTTAAAGACGGAAGAGTAAACTTATTGGGCACCGCAATGGTGCCCAGTACATTTAGATATT 0.628 +TTTCTATATGATGAGCAAAGAGATTGGCAAGAAGCATTATCTTGAAACAAGAACAGACTCATAATGGAAAGGCCGGTGCTGACAGCATCGGCCTTTTATACATATTGGGCG 1.0 +TCACCGTGTATGCAAAGCATGCGGATCATACAACGGCAAAGACATAAATGTAAAAAGTAACTAATGAGAAAAGCGCAGGGTGAAAGCCCTGCGCTTTTTCTTTTTCTCTTC 0.9990000000000001 +TCGGCGCAGTAATTGGAGTATGGGGAAGTCTGACTTCCATCCGCAAGTTTCTGCGAGTATAAAGTGAAAAAGCCGTTCCGTTTTCGGGACGGCTTTTCATTTGCCTTCTTT 0.996 +ACTCATGGAGAAAAAAGGTCTCGATTCTGAAACTATGTTCACTCACCACGCAAACCTTGAATAGCATCACACAACCGGCCTGAAGATCAGGCCGGTTTTATTTTTTCTAAA 0.975 +GTTGAAGATTTTAACGCTACGCTGCCGAAATCTCGTCTGCCGCAAAACATCACCTTAACCGAAGTGTAATTCTCTCTTGCCGGGCATCTGCCCGGCTATTTCCTCTCAGAT 0.8859999999999999 +CCAATTTTTAGAGCAGGCGGCTGTTCATTCAACAGAGCTTGTGTTAAAGGTGATCAAACGAATTCATTAAAACGGGGAGAACGTTCTCCTCCGTTTTTCTCTAAAAACAAC 0.527 +GGGTATATCGACATTATGAAACAAAACCTCGATGCCCTGAAAGATTCACTATTGGTTAAATCATAAGATAGTGTGCGCCGGAGGGGTGCACACTATTTCTTTGTGAGAAAG 0.963 +TTCACAGCGAAGTATTCGCGAAAGTGATCGTAAACGTAGTAGCTGAATAATTCGTTATTCAACGAGACGTAAAAACGCCGACCATTGGTCGGCGTTTTGCTTTCTATTTTT 0.991 +TTTCGTACTGAAAGGTTGGTGGCGCACTTCCTGAAACGGGCAGTGTATTCACCATGCGTAAAGCAATCAGATACCCAGCCCGCCTAATGAGCGGGCTTTTTTTTGAACAAA 0.9109999999999999 +GATTATATCACGGATACAAATATTGAGGATGGTGTCGCTAAAGCGATTCGCCATTGGGTACTATAAAAAAAGAGAGTCCTAAGATGGACTCTCTTTTTTAGTTTGGCAGGT 1.0 +GAAAAACCGGAGTTAAAACCAGCACCGCCTTTTGTTCATAATACACTTCCGGGACGCGAGTAAAGAGACAAGCCCAAAACATGATGTTTTGGGCTTTGATTATGCCTTCAG 0.975 +CTGGCTGTTGCCGCTAAACAGTCCAGCCTGATGGTTGAGTCACTGGTGCAGAAACTTGCACATGGCTAAGTCACTGTTCAGGGCGCTGGTCGCCCTGTCTTTTCTTGCGCC 0.688 +GCAGATTAAAAACTCACGGGACATTTTCATTTCTTTATATAAAGATTTTGCTTAAGAAGGCAAAGAAAAGAAAACAGTTTTCAAGGCTGTTTTCTTTTTATATCCGCAAAA 0.7829999999999999 +CGGCTGTGATGTGGATAAACCTCGTAACCTTGCGAAGAGTGTTACTGTGGAGTAATAAATGTTTAACCCCTTTGGATAAGATTATCTAAAGGGGTGTTTTTATGTCCAAAA 0.9990000000000001 +AAAATGGGCAAACCGTGGACTAGTACTCCTGCAAAAGTAGATTTACCTAATAGCTAATTTTAAAATCACTTTGTCTTTATCGGGGGACAAAGTGATTCTTTGTTTTATACA 0.961 +AGGCTGAAAAGCTCTCTCATAAAGAGTGACGTGGCGAGCAGGACGCTCCCTCAATATCTTGTTCGATACAAAAAACCCGCTTCAAAAAGCGGGTTTTTTATCAGACAGATG 0.978 +AACGGCTGCAGCTGCTTTCTTGTATTACAGGTACAAGAAACTGACAGGTGCTCAAGCCTAAGCAACAAAAAAACGGACGCCCAAGAGCGTCCGTTTTTTCTTATTTCTCAA 0.9840000000000001 +GGATTATGTGATTGTGAAGAATTCCATTTTTAATGGGGATAAATATTAATCATAAAAAAACGCCAAGTCAGCGGTTCTCCGCTTGAGTTGGCGTTTTCTGCTACTTCTTCA 0.997 +TTCAGTTGAAACGGCAAGAGAGAATGCAAAGAAAGCGTTGGACCAGCTAATTTTAAAATAGAGTTTGAACAGGTCTTGTCATGGGACAAGGCCTGTTTTTTTCTTTCTCCG 0.973 +GACGGAGCACCAATGGCAATTATCGAATTGGTTTAATCACATATTTTTGTGTATCTAAAGAAGGGCGGGACAGTTTCTAACTGGATCTATGCCCTTTTTTTAGATACTGCA 0.732 +CTTGTGGAGCTTGCAAAAGAACGCCACGCTGAAAAACAATTGATTAAGCATACATTCTAATGAAGAAGGACACTGCCGGTAAAAGGGCAGTGTTTTTTCCCGTATATACAA 0.879 +TGTATTGAATTTTCATGATTTTGTAGGCCGGATAAGGCGTTCACGCCGCATCCGGCAAGAAGCTATAAGAAAAGGGCGGAGATCATCTCCGCCCTTTTTATTTCTGCAATC 1.0 +ACTGGCAATGACCAAGACCAATGACGATTTCTTCGAAATGATGAAACGCTCATAAATTTGTCTTATGCCAAAAACGCCACGTGTTTACGTGGCGTTTTGCTTTTATATCTG 0.919 +TCTATTCGTGAAGGCGGACGTACTGTTGGTTCAGGCGTTGTTTCTACAATCACTGAGTAATAGTATGGTTTTAAACGAGACCCCTGTGGGTCTCGTTTTTTGTTTGCTTAT 0.998 +AACGTCTTGTATCTTTTGCTCAACAAAACATGGGCGGAGGACAATTCTAATTTACAATTTCACATAATGGCTTAGGAGTGGGGGTATCCCCACTCTTTTTCATTTTTTACA 0.991 +CGTGACGCTGGCATCAACACTGATAACATCGTAGCTCTGGGTCTGGTTTACCAGTTCTAATCTCGATTGATATCGAACAAAGGGCCTGCGGGCCCTTTTTTCATTGTTTTC 0.971 +GATAGCCCTTTATTTTGTGATTAAGAAACGTACAGCCATATTTAAACGATAGCTAGCGGATATGCATAGGGGTGACTGACGCTCCCCTATGCATATTTTTTTACTTGCGAT 0.826 +GCAGAGGATATTTCTCACAGTGCAAAACGGTTAGCCGAATACATCCAGCAGCCTTTTGTTTAAAATACAGCCCTGTCCAACATACGGCAGGGCTGTATTTGTTTAAAAATC 0.8909999999999999 +GAGCGCAGCTGAATAAATCAGTTGAATCTGTCAAAAATGTCATGAAAGTATTATCCTAATAAAAAGAGAGAAAGGCTTGCTTAATACAGCCTTTCTCTTTTTACTATAAAT 0.9990000000000001 +TATATGGCCAAAAAAGCTGCAGAGAAAAAAGAAGAATAAATCGGACATAATGAATATAAAGACTGAATACCTGCTTTTACGTTTTAAAAGCAGGTTTTTTATACACAAAAA 0.9279999999999999 +TGGAAATGGTCGACGGCACCACGTTGTCACACTCCCTTGCCGGGCGTCATAAGATTCGTTTTTAAGCAAACGAGAGCAGGATCACCTGCTCTCGCTTGAAATTATTCTCCC 0.9 +AGATCTTGCGATTGAATGGATCAGAGAAAATATGGATGGCATTACCGACGTTCTGTAACATAAAAAAGCGAAAGAGCCGAAGAGGCCTTTCGCTTTTTTATTCTGTTGAAT 0.976 +CGGCAAAAAGCTTTCGAAGCACTCTGAATACGGATATTACGGAACCAAGGACAATTTCATGCAAAAATAAATAACGTGCATCGTGCCCGTTATTTATTTTTGCGAAAAAAG 0.845 +AGGCATCGAGAAAGTTGAGACTGACTTTGCAGCAGAAGTTGCTGCGATGTCCAAGCAGTCTTAATTATCAAAAAGGAGCCGCCTGAGGGCGGCTTCTTTTTGTGCCCATCT 0.6559999999999999 +AGACAGCCGATGTGGTCTCTGGAGACGACGCAAAATATGTAGATTGTGACTACGAAAACGGCTGGTGTTAATGTTGGCGGTGAATTTCACTGCCTTCTTTTACAATCTGAA 0.723 +GCCTTGAATAGAGATACAAACTTACAAGCGTTCCGCAATTTCTTCGGAAGCGACAGAGTATAAGAAGAAAAAACCTCTTCCGCATGGGAGAGGTTTTTTTAAACAATAAGG 0.9640000000000001 +CAGTCTGGAGCAAATTACTGTGACCAAACAGCCAAATGGCGATGTTATTATTCAGTGAAACACCTGATCATGATTTAGCCCCGTCTCGTCGGGGCTTTTTTATGGCAGAAT 0.885 +AGCGTACCATTAGAAATATTGATGAACATTCTCTTGATAAAATAGTAGCTATACTAAGAGAGCAAACAAAATAAAAAGCCCGTTTTTGGGCTTTTCTTTTTTTATATGGCT 0.9520000000000001 +ATCATCGGCGGAAATCGCTGATTCAAGTTCTGACTGAAGCTGTTCATATGATATACTGTAAGCAAACGACAAACGGCATCATAGTATGCCGTTTGTTTTGGAATAGACAGA 0.506 +TCAGCCCCGCTTCCGGAGCTGACCGAAAAGAACACATTTCATAATCTGATTCAATGATTAGAAAGTCAAAGTCAGGCATCTCTTGGCTTTGACTTTTTTTCTTGCCCGGGA 0.914 +CTGAACAAGCGGAAGTAACGTGTCATGCGGGCGTCAGGCTGCCGTAATGGCAATTTGCGCCCGGACCAGGCCGCAGGGGGGAAACTCTGCGGCCTTTTTCGTTCTTACTGC 1.0 +TTGAAAGGCTGAGCGTGTTTAAAGACTAAAAAACCCGTCCGATCCGGACAGGCTATAGCGTCTTTTCTTCTGCAGCCGGATACGCATCCGGCTGTTTTTTCTTTATGACCG 0.858 +TATTGTGAGTGAGCGGATGATTGCAAAGTACGTGATTCAAAATGGCAACATCATTTATAGAGGAAAGAGAAAAGGCTCCTGAAACCAGGAGCCTTTTTATTTTTAAAAAAT 1.0 +GGCTAAAACGACTTTATTCCCCTGGTATGTGTATCCACCAGTAGAACCCTTCGTTGCCCGAATGCTGGCAGGAACTGTTGGCAGAACGGCAACATTTTTTTTGTCGTTGAC 0.74 +TAAGCAGACAGCAGAGGGAAAACAGGCCTTCAGAAGGTTCATCGATCAGTTGGGCGCCAAACCAGTCAAAAGCCTCAACCGCTAGGGTTGAGGCTTTTTCGATTTTTATGG 0.9640000000000001 +GAGCAATCCTTACTGGAAAGCAGCATTTAAAGGTGTTGTAAGACGTGTTGTTCAATAATAAATATGACAAGGGCCTTCTATAAACAGAAGGCTCTTTCTTTATTAACATAT 0.9440000000000001 +AAAGGCTGGACAATTCGTCAAAGCGGAAAGAAAAACAAATGCTTGATCAGGCATCATAATCACAATGTCTCCCGCATCGAAGTGAGCGGGAGGCTTTCTTTTTTTAAATAC 0.735 diff --git a/mewtwo/data_processing/compute_te.py b/mewtwo/data_processing/compute_te.py index e69de29..472b09a 100644 --- a/mewtwo/data_processing/compute_te.py +++ b/mewtwo/data_processing/compute_te.py @@ -0,0 +1,14 @@ +def ts_to_te(ts: float) -> float: + """ + Return terminator efficiency from terminator strength + + Parameters + ---------- + ts: float, termination strength, as defined by Chen et al., 2013 + + Returns + ------- + te: float, termination efficiency + """ + + return 1 - 1 / ts diff --git a/mewtwo/embeddings/feature_labels.py b/mewtwo/embeddings/feature_labels.py index 1dd4ef6..bf2ac1b 100644 --- a/mewtwo/embeddings/feature_labels.py +++ b/mewtwo/embeddings/feature_labels.py @@ -1,20 +1,25 @@ from typing import Optional +from enum import Enum, Flag +from dataclasses import dataclass from mewtwo.embeddings.sequence import SeqType from mewtwo.embeddings.bases import Base -from enum import Enum -class FeatureType(Enum): +class FeatureType(Flag): IS_A = 1 IS_C = 2 - IS_G = 3 - IS_U = 4 - IS_PURINE = 5 - IS_PYRIMIDINE = 6 - NR_H_BONDS = 7 - IS_BONDED = 8 - IS_POT = 9 + IS_G = 4 + IS_U = 8 + IS_PURINE = 16 + IS_PYRIMIDINE = 32 + NR_H_BONDS = 64 + IS_BONDED = 128 + IS_POT = 256 + IS_BASE_IDENTITY = IS_A | IS_C | IS_G | IS_U + ONE_HOT_TYPES = IS_BASE_IDENTITY | IS_BONDED | IS_POT + IS_BASE_FEATURE = IS_PURINE | IS_PYRIMIDINE | NR_H_BONDS + BASE_FEATURE_TYPES = IS_BASE_FEATURE | IS_BONDED | IS_POT @staticmethod def from_base(base: Base) -> "FeatureType": @@ -30,143 +35,189 @@ def from_base(base: Base) -> "FeatureType": return base_to_feature_type[base] +class FeatureCategory(Enum): + A_TRACT = 1 + STEM = 2 + LOOP = 3 + U_TRACT = 4 + + +class StemShoulder(Enum): + UPSTREAM = 1 + DOWNSTREAM = 2 + + +@dataclass class FeatureLabel: + feature_type: FeatureType + feature_category: FeatureCategory + base_index: Optional[int] = None + stem_shoulder: Optional[StemShoulder] = None + + def __hash__(self): + return hash((self.feature_type, self.feature_category, self.base_index, self.stem_shoulder)) - def __init__(self, feature_position: int, max_a_tract_length: int, - max_stem_length: int, max_loop_length: int, - max_u_tract_length: int, seq_type: SeqType = SeqType.RNA, - one_hot: bool = False) -> None: + def __repr__(self): - self.seq_type = seq_type - self.one_hot = one_hot + if self.feature_category == FeatureCategory.STEM: + base_description = f"basepair_{self.base_index}" + if self.feature_type != FeatureType.IS_BONDED: + base_description = f"{base_description}_{self.stem_shoulder.name}" + else: + base_description = f"base_{self.base_index}" + + return f"{self.feature_category.name}|{base_description}|{self.feature_type.name}" - self.feature_type: Optional[FeatureType] = None + @classmethod + def from_string(cls, feature_string: str) -> "FeatureLabel": + category, base_information, feature_type = feature_string.split('|') + feature_category = FeatureCategory[category] + base_info_split = base_information.split('_')[1:] + base_index = int(base_info_split[0]) + feature_type = FeatureType[feature_type.strip()] - self.base_index: Optional[int] = None - self.base_identity: Optional[Base] = None - self.base_hydrogen_bond_count: bool = False + shoulder = None + if feature_category == FeatureCategory.STEM and feature_type != FeatureType.IS_BONDED: + assert len(base_info_split) == 2 + shoulder = StemShoulder[base_info_split[1]] - self.check_pot: bool = False + return cls(feature_type, feature_category, base_index, shoulder) - self.base_pair_index: Optional[int] = None - self.stem_shoulder: Optional[str] = None - self.check_pairing: bool = False + @classmethod + def from_feature_position(cls, feature_position: int, max_a_tract_length: int, + max_stem_length: int, max_loop_length: int, + max_u_tract_length: int, seq_type: SeqType = SeqType.RNA, + one_hot: bool = False, utract_has_pot: bool = True) -> "FeatureLabel": - if self.one_hot: + if one_hot: nr_a_tract_features = 4 * max_a_tract_length nr_stem_features = 9 * max_stem_length nr_loop_features = 4 * max_loop_length - # Assumes separate feature for POT - nr_u_tract_features = 5 * max_u_tract_length + if utract_has_pot: + nr_u_tract_features = 5 * max_u_tract_length + else: + nr_u_tract_features = 4 * max_u_tract_length else: nr_a_tract_features = 3 * max_a_tract_length nr_stem_features = 7 * max_stem_length nr_loop_features = 3 * max_loop_length - # Assumes separate feature for POT - nr_u_tract_features = 4 * max_u_tract_length + if utract_has_pot: + nr_u_tract_features = 4 * max_u_tract_length + else: + nr_u_tract_features = 3 * max_u_tract_length if feature_position < nr_a_tract_features: - self.feature_category = "A-tract" - self.relative_feature_position = feature_position - self.set_base_information() + feature_category = FeatureCategory.A_TRACT + relative_feature_position = feature_position + elif feature_position < nr_a_tract_features + nr_stem_features: - self.feature_category = "stem" - self.relative_feature_position = feature_position - nr_a_tract_features - self.set_basepair_information() + feature_category = FeatureCategory.STEM + relative_feature_position = feature_position - nr_a_tract_features + elif feature_position < nr_a_tract_features + nr_stem_features + nr_loop_features: - self.feature_category = "loop" - self.relative_feature_position = feature_position - nr_a_tract_features - nr_stem_features - self.set_base_information() + feature_category = FeatureCategory.LOOP + relative_feature_position = feature_position - nr_a_tract_features - nr_stem_features + elif feature_position < nr_a_tract_features + nr_stem_features + nr_loop_features + nr_u_tract_features: - self.feature_category = "U-tract" - self.relative_feature_position = \ - feature_position - nr_a_tract_features - nr_stem_features - nr_loop_features - self.set_base_information() + feature_category = FeatureCategory.U_TRACT + relative_feature_position = feature_position - nr_a_tract_features - nr_stem_features - nr_loop_features + else: raise IndexError(f"Feature position {feature_position} does not exist.") - def __str__(self): - pass + if feature_category != FeatureCategory.STEM: + feature_type, base_index = cls.get_base_information(one_hot, seq_type, feature_category, + relative_feature_position, utract_has_pot) + shoulder = None + else: + feature_type, base_index, shoulder = cls.get_basepair_information(one_hot, seq_type, + relative_feature_position) + + return cls(feature_type, feature_category, base_index, shoulder) - def set_base_information(self) -> None: + @staticmethod + def get_base_information(one_hot: bool, seq_type: SeqType, feature_category: FeatureCategory, + relative_feature_position: int, + utract_has_pot: bool) -> tuple[FeatureType, Optional[int]]: - if self.one_hot: - if self.seq_type == SeqType.RNA: + if one_hot: + if seq_type == SeqType.RNA: features = [Base.A, Base.C, Base.G, Base.U] - elif self.seq_type == SeqType.DNA: + elif seq_type == SeqType.DNA: features = [Base.A, Base.C, Base.G, Base.T] else: - raise ValueError(f"Unsupported sequence type: {self.seq_type}") + raise ValueError(f"Unsupported sequence type: {seq_type}") else: features = [Base.PURINES, Base.PYRIMIDINES, "hydrogen bonds"] - if self.feature_category == 'U-tract': + if feature_category == FeatureCategory.U_TRACT and utract_has_pot: features.append('POT') feature_nr = len(features) for i in range(feature_nr): - if self.relative_feature_position % feature_nr == i: + if relative_feature_position % feature_nr == i: feature = features[i] if feature == 'POT': - self.check_pot = True - self.feature_type = FeatureType.IS_POT + feature_type = FeatureType.IS_POT elif feature == 'hydrogen bonds': - self.base_hydrogen_bond_count = True - self.feature_type = FeatureType.NR_H_BONDS + feature_type = FeatureType.NR_H_BONDS else: - self.base_identity = feature - self.feature_type = FeatureType.from_base(feature) + feature_type = FeatureType.from_base(feature) - self.base_index = self.relative_feature_position // feature_nr + 1 + base_index = relative_feature_position // feature_nr + 1 break else: - raise ValueError(f"Could not find feature type for feature at {self.relative_feature_position}") + raise ValueError(f"Could not find feature type for feature at {relative_feature_position}") + + return feature_type, base_index - def set_basepair_information(self) -> None: - assert self.feature_category == "stem" - if self.one_hot: + @staticmethod + def get_basepair_information(one_hot: bool, seq_type: SeqType, + relative_feature_position: int) -> tuple[FeatureType, Optional[int], Optional[str]]: + if one_hot: feature_nr = 9 - if self.seq_type == SeqType.RNA: + if seq_type == SeqType.RNA: features = [Base.A, Base.C, Base.G, Base.U, Base.A, Base.C, Base.G, Base.U, 'bonded'] - elif self.seq_type == SeqType.DNA: + elif seq_type == SeqType.DNA: features = [Base.A, Base.C, Base.G, Base.T, Base.A, Base.C, Base.G, Base.T, 'bonded'] else: - raise ValueError(f"Unsupported sequence type: {self.seq_type}") + raise ValueError(f"Unsupported sequence type: {seq_type}") else: feature_nr = 7 features = [Base.PURINES, Base.PYRIMIDINES, 'hydrogen bonds', Base.PURINES, Base.PYRIMIDINES, 'hydrogen bonds', 'bonded'] + stem_shoulder: Optional[StemShoulder] = None + for i in range(feature_nr): - if self.relative_feature_position % feature_nr == i: + if relative_feature_position % feature_nr == i: feature = features[i] - self.base_pair_index = self.relative_feature_position // feature_nr + 1 + base_index = relative_feature_position // feature_nr + 1 if feature != 'bonded': if i < len(features) // 2: - self.stem_shoulder = 'upstream' + stem_shoulder = StemShoulder.UPSTREAM else: - self.stem_shoulder = 'downstream' + stem_shoulder = StemShoulder.DOWNSTREAM if feature != 'hydrogen bonds': - self.base_identity = feature - self.feature_type = FeatureType.from_base(feature) + feature_type = FeatureType.from_base(feature) else: - self.base_hydrogen_bond_count = True - self.feature_type = FeatureType.NR_H_BONDS + feature_type = FeatureType.NR_H_BONDS else: - self.check_pairing = True - self.feature_type = FeatureType.IS_BONDED + feature_type = FeatureType.IS_BONDED break else: - raise ValueError(f"Could not find feature type for feature at {self.relative_feature_position}") + raise ValueError(f"Could not find feature type for feature at {relative_feature_position}") + + return feature_type, base_index, stem_shoulder diff --git a/mewtwo/embeddings/sequence.py b/mewtwo/embeddings/sequence.py index 23be611..2da3868 100644 --- a/mewtwo/embeddings/sequence.py +++ b/mewtwo/embeddings/sequence.py @@ -123,8 +123,6 @@ def get_sequence_type(sequence: str) -> SeqType: except ValueError: pass - seq_types = [] - if is_dna and is_rna: return SeqType.DNA | SeqType.RNA elif is_dna: @@ -132,4 +130,4 @@ def get_sequence_type(sequence: str) -> SeqType: elif is_rna: return SeqType.RNA else: - raise ValueError("Sequence is not DNA or RNA") + raise ValueError(f"Sequence is not DNA or RNA: {sequence}") diff --git a/mewtwo/embeddings/terminator/a_tract.py b/mewtwo/embeddings/terminator/a_tract.py index 3b9f7fb..ce62e26 100644 --- a/mewtwo/embeddings/terminator/a_tract.py +++ b/mewtwo/embeddings/terminator/a_tract.py @@ -1,9 +1,15 @@ -from mewtwo.embeddings.sequence import RNASequence +from mewtwo.embeddings.sequence import RNASequence, DNASequence, get_sequence_type, SeqType, convert_to_rna from mewtwo.embeddings.bases import Base, base_to_vector class ATract: - def __init__(self, sequence: RNASequence) -> None: + def __init__(self, sequence: str) -> None: + + seq_type = get_sequence_type(sequence) + if SeqType.RNA in seq_type: + sequence = RNASequence(sequence) + else: + sequence = convert_to_rna(DNASequence(sequence)) self.sequence = sequence def to_vector(self, a_tract_size: int = 10, one_hot: bool = False) -> list[int]: diff --git a/mewtwo/embeddings/terminator/draw_terminator.py b/mewtwo/embeddings/terminator/draw_terminator.py new file mode 100644 index 0000000..624f16a --- /dev/null +++ b/mewtwo/embeddings/terminator/draw_terminator.py @@ -0,0 +1,251 @@ +from dataclasses import dataclass, field +import os + +from math import pi, sin, cos + +from mewtwo.embeddings.terminator.terminator import get_terminator_part_sizes, Terminator +from mewtwo.parsers.parse_feature_file import parse_feature_file +from mewtwo.machine_learning.feature_inference.infer_features_rf import sum_importance_per_base, \ + normalize_importances, SortedFeatures + +from mewtwo.embeddings.feature_labels import FeatureLabel, FeatureCategory, FeatureType + + +@dataclass +class Point: + x: float + y: float + + def rotate_around_origin(self, angle): + new_x = self.x * cos(angle) - self.y * sin(angle) + new_y = self.x * sin(angle) + self.y * cos(angle) + self.x = new_x + self.y = new_y + + def move(self, x, y): + self.x += x + self.y += y + + +def get_extremes(points: list[Point]) -> tuple[float, float, float, float]: + return min([point.x for point in points]), max([point.x for point in points]), \ + min([point.y for point in points]), max([point.y for point in points]) + + +@dataclass +class TerminatorDrawingOptions: + base_diameter: float = 20 + circle_diameter: float = 15 + color: tuple[int, int, int] = (255, 0, 0) + + def __hash__(self): + return hash((self.base_diameter, self.circle_diameter, self.color)) + + +DEFAULT_OPTIONS = TerminatorDrawingOptions() + + +@dataclass +class BaseDrawing: + centre: Point + radius: float + color: tuple[int, int, int] = (255, 0, 0) + transparency: float = 1.0 + + def get_svg_circle(self): + return f'\n' + + +@dataclass +class Line: + point_1: Point + point_2: Point + + def get_svg_line(self): + return f'' + + +@dataclass +class TerminatorDrawing: + loop_size: int + stem_size: int + a_tract_size: int + u_tract_size: int + options: TerminatorDrawingOptions = DEFAULT_OPTIONS + feature_to_importance: dict[FeatureLabel, float] = field(default_factory=dict) + + @classmethod + def from_feature_file(cls, feature_file: str, features_to_visualise: FeatureType, + options: TerminatorDrawingOptions = DEFAULT_OPTIONS) -> "TerminatorDrawing": + feature_to_importance = parse_feature_file(feature_file) + feature_category_to_size = {FeatureCategory.STEM: 0, + FeatureCategory.LOOP: 0, + FeatureCategory.U_TRACT: 0, + FeatureCategory.A_TRACT: 0} + + for feature_label in feature_to_importance: + if feature_category_to_size[feature_label.feature_category] < feature_label.base_index: + feature_category_to_size[feature_label.feature_category] = feature_label.base_index + + selected_features_to_importance = {} + + for feature, importance in feature_to_importance.items(): + if feature.feature_type in features_to_visualise: + selected_features_to_importance[feature] = importance + + if len(features_to_visualise) > 1: + selected_features_to_importance = sum_importance_per_base(selected_features_to_importance) + + return cls(feature_category_to_size[FeatureCategory.LOOP], + feature_category_to_size[FeatureCategory.STEM], + feature_category_to_size[FeatureCategory.A_TRACT], + feature_category_to_size[FeatureCategory.U_TRACT], options=options, + feature_to_importance=selected_features_to_importance) + + @classmethod + def from_terminators(cls, terminators: list[Terminator], + drawing_options: TerminatorDrawingOptions = DEFAULT_OPTIONS) -> "TerminatorDrawing": + + max_loop, max_stem, max_a, max_u = get_terminator_part_sizes(terminators) + return cls(max_loop, max_stem, max_a, max_u, drawing_options) + + def get_svg(self): + transparencies = [] + + loop_coords = self.get_loop_coords() + min_x, max_x, min_y, max_y = get_extremes(loop_coords) + x_translation = -1 * min(0.0, min_x) + 10 + y_translation = -1 * min(0.0, min_y) + 10 + + for loop_coord in loop_coords: + loop_coord.move(x_translation, y_translation) + + circle_radius = self.options.circle_diameter / 2 + + loop = [] + for loop_coord in loop_coords: + loop.append(BaseDrawing(loop_coord, circle_radius)) + transparencies.append(0.0) + + stem_left_x = loop_coords[0].x + stem_right_x = loop_coords[-1].x + + stem_y = loop_coords[0].y + + left_stem_coords = [] + right_stem_coords = [] + lines = [Line(Point(stem_left_x + circle_radius, stem_y), Point(stem_right_x - circle_radius, stem_y))] + + for i in range(self.stem_size - 1): + stem_y += self.options.base_diameter + + left_stem_coords.append(Point(stem_left_x, stem_y)) + right_stem_coords.append(Point(stem_right_x, stem_y)) + lines.append(Line(Point(stem_left_x + circle_radius, stem_y), Point(stem_right_x - circle_radius, stem_y))) + transparencies.append(0.0) + transparencies.append(0.0) + + left_stem_coords.reverse() + right_stem_coords.reverse() + + tract_y = left_stem_coords[0].y + + a_tract_x = left_stem_coords[0].x + u_tract_x = right_stem_coords[0].x + + a_tract_coords = [] + u_tract_coords = [] + + for i in range(self.a_tract_size): + a_tract_x -= self.options.base_diameter + a_tract_coords.append(Point(a_tract_x, tract_y)) + transparencies.append(0.0) + + a_tract_coords.reverse() + + for i in range(self.u_tract_size): + u_tract_x += self.options.base_diameter + u_tract_coords.append(Point(u_tract_x, tract_y)) + transparencies.append(0.0) + + base_coords = a_tract_coords + left_stem_coords + right_stem_coords + loop_coords + u_tract_coords + + if self.feature_to_importance is not None: + normalize_importances(self.feature_to_importance) + sorted_features = SortedFeatures(self.feature_to_importance) + loop_transparencies = [sorted_features.left_stem_shoulder[-1][1]] + [x[1] for x in sorted_features.loop] + [sorted_features.right_stem_shoulder[-1][1]] + left_shoulder_transparencies = [x[1] for x in sorted_features.left_stem_shoulder[:-1]] + right_shoulder_transparencies = [x[1] for x in sorted_features.right_stem_shoulder[:-1]] + a_tract_transparencies = [x[1] for x in sorted_features.a_tract] + u_tract_transparencies = [x[1] for x in sorted_features.u_tract] + transparencies = a_tract_transparencies + left_shoulder_transparencies + right_shoulder_transparencies + loop_transparencies + u_tract_transparencies + + min_x, max_x, min_y, max_y = get_extremes(base_coords) + viewbox = f"0 0 {int(max_x) + 10} {int(max_y) + 10}" + + header = f'\n' + style = '\n' + body = "" + + for line in lines: + body += line.get_svg_line() + for i, base_coord in enumerate(base_coords): + transparency = transparencies[i] + base_drawing = BaseDrawing(base_coord, circle_radius, transparency=transparency) + + body += base_drawing.get_svg_circle() + + footer = '' + + return header + style + body + footer + + def write_svg(self, out_file): + svg_string = self.get_svg() + with open(out_file, 'w') as out: + out.write(svg_string) + + def get_loop_coords(self): + nr_polygon_sides = self.loop_size + 2 + theta = 2 * pi / nr_polygon_sides + + r = self.options.base_diameter / (2 * sin(pi / nr_polygon_sides)) + + angle = pi / nr_polygon_sides + loop_centres = [] + + while angle < 2 * pi: + coords = Point(cos(angle) * r, sin(angle) * r) + loop_centres.append(coords) + angle += theta + + for base_centre in loop_centres: + base_centre.rotate_around_origin(pi / 2) + + return loop_centres + + +def visualise_feature_importances(feature_file, out_dir): + + if not os.path.exists(out_dir): + os.mkdir(out_dir) + + out_a = os.path.join(out_dir, "is_a.svg") + out_c = os.path.join(out_dir, "is_c.svg") + out_u = os.path.join(out_dir, "is_u.svg") + out_g = os.path.join(out_dir, "is_g.svg") + out_all = os.path.join(out_dir, "all_features.svg") + + terminator_drawing = TerminatorDrawing.from_feature_file(feature_file, FeatureType.ONE_HOT_TYPES) + terminator_drawing.write_svg(out_all) + + terminator_drawing = TerminatorDrawing.from_feature_file(feature_file, FeatureType.IS_A) + terminator_drawing.write_svg(out_a) + + terminator_drawing = TerminatorDrawing.from_feature_file(feature_file, FeatureType.IS_U) + terminator_drawing.write_svg(out_u) + + terminator_drawing = TerminatorDrawing.from_feature_file(feature_file, FeatureType.IS_G) + terminator_drawing.write_svg(out_g) + + terminator_drawing = TerminatorDrawing.from_feature_file(feature_file, FeatureType.IS_C) + terminator_drawing.write_svg(out_c) diff --git a/mewtwo/embeddings/terminator/hairpin.py b/mewtwo/embeddings/terminator/hairpin.py index 64772cf..8897ca5 100644 --- a/mewtwo/embeddings/terminator/hairpin.py +++ b/mewtwo/embeddings/terminator/hairpin.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import Optional from mewtwo.embeddings.terminator.loop import Loop from mewtwo.embeddings.terminator.stem import Stem @@ -12,8 +13,8 @@ class HairpinType(Enum): class Hairpin: - def __init__(self, hairpin_id: str, distance_to_pot: int, - prediction_software: HairpinType): + def __init__(self, hairpin_id: str, + prediction_software: HairpinType, distance_to_pot: Optional[int] = None): self.hairpin_id = hairpin_id self.distance_to_pot = distance_to_pot @@ -72,8 +73,9 @@ def to_vector(self, max_stem_size, max_loop_size): class RNAFoldHairpin(Hairpin): - def __init__(self, hairpin_id, distance_to_pot, free_energy, hairpin_sequence, hairpin_structure): - super().__init__(hairpin_id, distance_to_pot, HairpinType.RNAFOLD) + def __init__(self, hairpin_id: str, free_energy: float, hairpin_sequence: str, hairpin_structure: str, + distance_to_pot: Optional[int] = None): + super().__init__(hairpin_id, HairpinType.RNAFOLD, distance_to_pot) seq_type = get_sequence_type(hairpin_sequence) if SeqType.RNA in seq_type: sequence = RNASequence(hairpin_sequence) @@ -88,8 +90,8 @@ def __init__(self, hairpin_id, distance_to_pot, free_energy, hairpin_sequence, h class TransTermHPHairpin(Hairpin): - def __init__(self, hairpin_id, distance_to_pot, hairpin_score, hairpin): - super().__init__(hairpin_id, distance_to_pot, HairpinType.TRANSTERMHP) + def __init__(self, hairpin_id, hairpin_score, hairpin, distance_to_pot: Optional[int] = None): + super().__init__(hairpin_id, HairpinType.TRANSTERMHP, distance_to_pot) self.set_hairpin_sequence(hairpin) self.set_hairpin_structure(hairpin) self.hairpin_score = hairpin_score diff --git a/mewtwo/embeddings/terminator/terminator.py b/mewtwo/embeddings/terminator/terminator.py index 56c6cae..a5baa15 100644 --- a/mewtwo/embeddings/terminator/terminator.py +++ b/mewtwo/embeddings/terminator/terminator.py @@ -8,9 +8,13 @@ class Terminator: - def __init__(self, start: int, end: int, pot: int, species: str, chromosome: str, strand: str, - sequence: RNASequence, termination_efficiency: Optional[float], - hairpin: Union[RNAFoldHairpin, TransTermHPHairpin], a_tract: ATract, u_tract: UTract): + def __init__(self, + hairpin: Union[RNAFoldHairpin, TransTermHPHairpin], a_tract: ATract, u_tract: UTract, + sequence: RNASequence, + termination_efficiency: Optional[float] = None, + start: Optional[int] = None, end: Optional[int] = None, pot: Optional[int] = None, + species: Optional[str] = None, chromosome: Optional[str] = None, strand: Optional[str] = None, + is_synthetic: bool = False): self.start = start self.end = end self.pot = pot @@ -23,6 +27,7 @@ def __init__(self, start: int, end: int, pot: int, species: str, chromosome: str self.hairpin = hairpin self.a_tract = a_tract self.u_tract = u_tract + self.is_synthetic = is_synthetic def to_vector(self, max_loop_size: int, max_stem_size: int = 10, a_tract_size: int = 10, u_tract_size: int = 10, one_hot: bool = False) -> list[int]: @@ -34,6 +39,9 @@ def to_vector(self, max_loop_size: int, max_stem_size: int = 10, a_tract_size: i vector.extend(self.u_tract.to_vector(u_tract_size=u_tract_size, one_hot=one_hot)) return vector + def get_base_coords(self, max_loop_size, max_stem_size, a_tract_size, u_tract_size): + pass + def get_terminator_part_sizes(terminators: list[Terminator]) -> tuple[int, int, int, int]: loops = [] diff --git a/mewtwo/embeddings/terminator/test/test_a_tract.py b/mewtwo/embeddings/terminator/test/test_a_tract.py index 6f4e606..055b010 100644 --- a/mewtwo/embeddings/terminator/test/test_a_tract.py +++ b/mewtwo/embeddings/terminator/test/test_a_tract.py @@ -6,7 +6,7 @@ class TestATract(unittest.TestCase): def test_to_vector(self): - a_tract_1 = ATract(RNASequence("ACGU")) + a_tract_1 = ATract("ACGU") self.assertEqual(a_tract_1.to_vector(4), [1, 0, 2, 0, 1, 3, diff --git a/mewtwo/embeddings/terminator/test/test_hairpin.py b/mewtwo/embeddings/terminator/test/test_hairpin.py index 962f685..1307a65 100644 --- a/mewtwo/embeddings/terminator/test/test_hairpin.py +++ b/mewtwo/embeddings/terminator/test/test_hairpin.py @@ -5,22 +5,22 @@ class TestHairpin(unittest.TestCase): def test_contains_multiple_hairpins(self): self.assertTrue( - RNAFoldHairpin("example_rnafold_true", 5, -9.8, 'tggggccaaacggtgccggtaagaccacca', - '.((.(((....))).))(((......))).').contains_multiple_hairpins()) + RNAFoldHairpin("example_rnafold_true", -9.8, 'tggggccaaacggtgccggtaagaccacca', + '.((.(((....))).))(((......))).', 5).contains_multiple_hairpins()) self.assertTrue( - RNAFoldHairpin("example_rnafold_true", 5, -9.8, 'ccattaggaaaggattacc', - '((....))...((....))').contains_multiple_hairpins()) + RNAFoldHairpin("example_rnafold_true", -9.8, 'ccattaggaaaggattacc', + '((....))...((....))', 5).contains_multiple_hairpins()) self.assertFalse( - RNAFoldHairpin("example_rnafold_false", 3, -9.1, 'cccacccgaggggtggg', - '(((((((...)))))))').contains_multiple_hairpins()) + RNAFoldHairpin("example_rnafold_false", -9.1, 'cccacccgaggggtggg', + '(((((((...)))))))', 3).contains_multiple_hairpins()) self.assertFalse( - RNAFoldHairpin("example_rnafold_false", 3, -9.1, 'cccatcccgaggggtggg', - '((((.(((...)))))))').contains_multiple_hairpins()) + RNAFoldHairpin("example_rnafold_false", -9.1, 'cccatcccgaggggtggg', + '((((.(((...)))))))', 3).contains_multiple_hairpins()) self.assertFalse( - TransTermHPHairpin("example_transtermhp_false", 5, -9.8, 'a-tg aaa caat').contains_multiple_hairpins()) + TransTermHPHairpin("example_transtermhp_false", -9.8, 'a-tg aaa caat', 5).contains_multiple_hairpins()) self.assertFalse( - TransTermHPHairpin("example_transtermhp_false", 5, -9.8, 'atg aaa cat').contains_multiple_hairpins()) + TransTermHPHairpin("example_transtermhp_false", -9.8, 'atg aaa cat', 5).contains_multiple_hairpins()) if __name__ == '__main__': diff --git a/mewtwo/embeddings/terminator/test/test_u_tract.py b/mewtwo/embeddings/terminator/test/test_u_tract.py index 47dc309..1b9b3d4 100644 --- a/mewtwo/embeddings/terminator/test/test_u_tract.py +++ b/mewtwo/embeddings/terminator/test/test_u_tract.py @@ -6,7 +6,7 @@ class TestUTract(unittest.TestCase): def test_to_vector(self): - u_tract_1 = UTract(RNASequence("ACGU"), 2) + u_tract_1 = UTract("ACGU", 2) self.assertEqual(u_tract_1.to_vector(4), [1, 0, 2, 0, 0, 1, 3, 0, @@ -34,12 +34,40 @@ def test_to_vector(self): 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]) + u_tract_2 = UTract("ACGU", None) + + self.assertEqual(u_tract_2.to_vector(4), [1, 0, 2, + 0, 1, 3, + 1, 0, 3, + 0, 1, 2]) + self.assertEqual(u_tract_2.to_vector(3), [1, 0, 2, + 0, 1, 3, + 1, 0, 3]) + self.assertEqual(u_tract_2.to_vector(5), [1, 0, 2, + 0, 1, 3, + 1, 0, 3, + 0, 1, 2, + 0, 0, 0]) + + self.assertEqual(u_tract_2.to_vector(4, one_hot=True), [1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1]) + self.assertEqual(u_tract_2.to_vector(3, one_hot=True), [1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0]) + self.assertEqual(u_tract_2.to_vector(5, one_hot=True), [1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1, + 0, 0, 0, 0]) + def test_init(self): - u_tract_1 = UTract(RNASequence("ACGU"), 2) + u_tract_1 = UTract("ACGU", 2) self.assertEqual(2, u_tract_1.pot) with self.assertRaises(AssertionError): - UTract(RNASequence("ACGU"), 4) + UTract("ACGU", 4) if __name__ == "__main__": diff --git a/mewtwo/embeddings/terminator/u_tract.py b/mewtwo/embeddings/terminator/u_tract.py index 3c84101..39c7cae 100644 --- a/mewtwo/embeddings/terminator/u_tract.py +++ b/mewtwo/embeddings/terminator/u_tract.py @@ -1,12 +1,21 @@ -from mewtwo.embeddings.sequence import RNASequence + +from typing import Optional + +from mewtwo.embeddings.sequence import RNASequence, SeqType, convert_to_rna, get_sequence_type, DNASequence from mewtwo.embeddings.bases import Base, base_to_vector class UTract: - def __init__(self, sequence: RNASequence, pot: int) -> None: + def __init__(self, sequence: str, pot: Optional[int] = None) -> None: + seq_type = get_sequence_type(sequence) + if SeqType.RNA in seq_type: + sequence = RNASequence(sequence) + else: + sequence = convert_to_rna(DNASequence(sequence)) self.sequence = sequence self.pot = pot - assert self.pot < len(self.sequence) + if self.pot is not None: + assert self.pot < len(self.sequence) def to_vector(self, u_tract_size: int = 10, one_hot: bool = False) -> list[int]: vector = [] @@ -17,9 +26,11 @@ def to_vector(self, u_tract_size: int = 10, one_hot: bool = False) -> list[int]: base = Base.ZERO_PADDING vector.extend(base_to_vector(base, one_hot)) - if i == self.pot: - vector.append(1) - else: - vector.append(0) + if self.pot is not None: + + if i == self.pot: + vector.append(1) + else: + vector.append(0) return vector diff --git a/mewtwo/embeddings/test/test_feature_labels.py b/mewtwo/embeddings/test/test_feature_labels.py index 5ba63ff..69db617 100644 --- a/mewtwo/embeddings/test/test_feature_labels.py +++ b/mewtwo/embeddings/test/test_feature_labels.py @@ -1,6 +1,5 @@ import unittest -from mewtwo.embeddings.feature_labels import FeatureLabel -from mewtwo.embeddings.bases import Base +from mewtwo.embeddings.feature_labels import FeatureLabel, FeatureCategory, FeatureType, StemShoulder class TestFeatureLabel(unittest.TestCase): @@ -9,124 +8,109 @@ def test_set_base_information(self): # A-tract features - feature_a01 = FeatureLabel(0, 10, 1, 1, 1) - feature_a02 = FeatureLabel(1, 10, 1, 1, 1) - feature_a03 = FeatureLabel(2, 10, 1, 1, 1) - feature_a04 = FeatureLabel(3, 10, 1, 1, 1) - feature_a05 = FeatureLabel(4, 10, 1, 1, 1) - feature_a06 = FeatureLabel(5, 10, 1, 1, 1) - - feature_a07 = FeatureLabel(0, 10, 1, 1, 1, one_hot=True) - feature_a08 = FeatureLabel(1, 10, 1, 1, 1, one_hot=True) - feature_a09 = FeatureLabel(2, 10, 1, 1, 1, one_hot=True) - feature_a10 = FeatureLabel(3, 10, 1, 1, 1, one_hot=True) - feature_a11 = FeatureLabel(4, 10, 1, 1, 1, one_hot=True) - feature_a12 = FeatureLabel(5, 10, 1, 1, 1, one_hot=True) - feature_a13 = FeatureLabel(6, 10, 1, 1, 1, one_hot=True) - feature_a14 = FeatureLabel(7, 10, 1, 1, 1, one_hot=True) - - self.assertEqual(feature_a01.base_identity, Base.PURINES) + feature_a01 = FeatureLabel.from_feature_position(0, 10, 1, 1, 1) + feature_a02 = FeatureLabel.from_feature_position(1, 10, 1, 1, 1) + feature_a03 = FeatureLabel.from_feature_position(2, 10, 1, 1, 1) + feature_a04 = FeatureLabel.from_feature_position(3, 10, 1, 1, 1) + feature_a05 = FeatureLabel.from_feature_position(4, 10, 1, 1, 1) + feature_a06 = FeatureLabel.from_feature_position(5, 10, 1, 1, 1) + + feature_a07 = FeatureLabel.from_feature_position(0, 10, 1, 1, 1, one_hot=True) + feature_a08 = FeatureLabel.from_feature_position(1, 10, 1, 1, 1, one_hot=True) + feature_a09 = FeatureLabel.from_feature_position(2, 10, 1, 1, 1, one_hot=True) + feature_a10 = FeatureLabel.from_feature_position(3, 10, 1, 1, 1, one_hot=True) + feature_a11 = FeatureLabel.from_feature_position(4, 10, 1, 1, 1, one_hot=True) + feature_a12 = FeatureLabel.from_feature_position(5, 10, 1, 1, 1, one_hot=True) + feature_a13 = FeatureLabel.from_feature_position(6, 10, 1, 1, 1, one_hot=True) + feature_a14 = FeatureLabel.from_feature_position(7, 10, 1, 1, 1, one_hot=True) + + self.assertEqual(feature_a01.feature_type, FeatureType.IS_PURINE) self.assertEqual(feature_a01.base_index, 1) - self.assertEqual(feature_a01.base_pair_index, None) - self.assertEqual(feature_a01.base_hydrogen_bond_count, False) - self.assertEqual(feature_a01.feature_category, 'A-tract') + self.assertEqual(feature_a01.stem_shoulder, None) + self.assertEqual(feature_a01.feature_category, FeatureCategory.A_TRACT) - self.assertEqual(feature_a02.base_identity, Base.PYRIMIDINES) + self.assertEqual(feature_a02.feature_type, FeatureType.IS_PYRIMIDINE) self.assertEqual(feature_a02.base_index, 1) - self.assertEqual(feature_a02.base_pair_index, None) - self.assertEqual(feature_a02.base_hydrogen_bond_count, False) - self.assertEqual(feature_a03.base_identity, None) self.assertEqual(feature_a03.base_index, 1) - self.assertEqual(feature_a03.base_pair_index, None) - self.assertEqual(feature_a03.base_hydrogen_bond_count, True) + self.assertEqual(feature_a03.feature_type, FeatureType.NR_H_BONDS) - self.assertEqual(feature_a04.base_identity, Base.PURINES) + self.assertEqual(feature_a04.feature_type, FeatureType.IS_PURINE) self.assertEqual(feature_a04.base_index, 2) - self.assertEqual(feature_a04.base_pair_index, None) - self.assertEqual(feature_a04.base_hydrogen_bond_count, False) - self.assertEqual(feature_a05.base_identity, Base.PYRIMIDINES) + self.assertEqual(feature_a05.feature_type, FeatureType.IS_PYRIMIDINE) self.assertEqual(feature_a05.base_index, 2) - self.assertEqual(feature_a05.base_pair_index, None) - self.assertEqual(feature_a05.base_hydrogen_bond_count, False) - self.assertEqual(feature_a06.base_identity, None) self.assertEqual(feature_a06.base_index, 2) - self.assertEqual(feature_a06.base_pair_index, None) - self.assertEqual(feature_a06.base_hydrogen_bond_count, True) + self.assertEqual(feature_a06.feature_type, FeatureType.NR_H_BONDS) - self.assertEqual(feature_a07.base_identity, Base.A) + self.assertEqual(feature_a07.feature_type, FeatureType.IS_A) self.assertEqual(feature_a07.base_index, 1) - self.assertEqual(feature_a07.base_pair_index, None) - self.assertEqual(feature_a07.base_hydrogen_bond_count, False) - self.assertEqual(feature_a08.base_identity, Base.C) - self.assertEqual(feature_a09.base_identity, Base.G) - self.assertEqual(feature_a10.base_identity, Base.U) - self.assertEqual(feature_a11.base_identity, Base.A) - self.assertEqual(feature_a12.base_identity, Base.C) - self.assertEqual(feature_a13.base_identity, Base.G) - self.assertEqual(feature_a14.base_identity, Base.U) + self.assertEqual(feature_a08.feature_type, FeatureType.IS_C) + self.assertEqual(feature_a09.feature_type, FeatureType.IS_G) + self.assertEqual(feature_a10.feature_type, FeatureType.IS_U) + self.assertEqual(feature_a11.feature_type, FeatureType.IS_A) + self.assertEqual(feature_a12.feature_type, FeatureType.IS_C) + self.assertEqual(feature_a13.feature_type, FeatureType.IS_G) + self.assertEqual(feature_a14.feature_type, FeatureType.IS_U) # Loop features def test_set_basepair_information(self): # Stem features - feature_s01 = FeatureLabel(30, 10, 10, 1, 1) - feature_s02 = FeatureLabel(31, 10, 10, 1, 1) - feature_s03 = FeatureLabel(32, 10, 10, 1, 1) - feature_s04 = FeatureLabel(33, 10, 10, 1, 1) - feature_s05 = FeatureLabel(34, 10, 10, 1, 1) - feature_s06 = FeatureLabel(35, 10, 10, 1, 1) - feature_s07 = FeatureLabel(36, 10, 10, 1, 1) - feature_s08 = FeatureLabel(37, 10, 10, 1, 1) - - feature_s09 = FeatureLabel(40, 10, 10, 1, 1, one_hot=True) - feature_s10 = FeatureLabel(41, 10, 10, 1, 1, one_hot=True) - feature_s11 = FeatureLabel(42, 10, 10, 1, 1, one_hot=True) - feature_s12 = FeatureLabel(43, 10, 10, 1, 1, one_hot=True) - feature_s13 = FeatureLabel(44, 10, 10, 1, 1, one_hot=True) - feature_s14 = FeatureLabel(45, 10, 10, 1, 1, one_hot=True) - feature_s15 = FeatureLabel(46, 10, 10, 1, 1, one_hot=True) - feature_s16 = FeatureLabel(47, 10, 10, 1, 1, one_hot=True) - feature_s17 = FeatureLabel(48, 10, 10, 1, 1, one_hot=True) - - self.assertEqual(feature_s01.base_identity, Base.PURINES) - self.assertEqual(feature_s01.base_index, None) - self.assertEqual(feature_s01.base_pair_index, 1) - self.assertEqual(feature_s01.stem_shoulder, 'upstream') - self.assertEqual(feature_s01.base_hydrogen_bond_count, False) - self.assertEqual(feature_s01.feature_category, 'stem') - - self.assertEqual(feature_s02.base_identity, Base.PYRIMIDINES) - self.assertEqual(feature_s03.base_identity, None) - self.assertEqual(feature_s03.base_hydrogen_bond_count, True) - - self.assertEqual(feature_s04.base_identity, Base.PURINES) - self.assertEqual(feature_s04.stem_shoulder, "downstream") - - self.assertEqual(feature_s05.base_identity, Base.PYRIMIDINES) - self.assertEqual(feature_s06.base_hydrogen_bond_count, True) - - self.assertEqual(feature_s07.base_pair_index, 1) + feature_s01 = FeatureLabel.from_feature_position(30, 10, 10, 1, 1) + feature_s02 = FeatureLabel.from_feature_position(31, 10, 10, 1, 1) + feature_s03 = FeatureLabel.from_feature_position(32, 10, 10, 1, 1) + feature_s04 = FeatureLabel.from_feature_position(33, 10, 10, 1, 1) + feature_s05 = FeatureLabel.from_feature_position(34, 10, 10, 1, 1) + feature_s06 = FeatureLabel.from_feature_position(35, 10, 10, 1, 1) + feature_s07 = FeatureLabel.from_feature_position(36, 10, 10, 1, 1) + feature_s08 = FeatureLabel.from_feature_position(37, 10, 10, 1, 1) + + feature_s09 = FeatureLabel.from_feature_position(40, 10, 10, 1, 1, one_hot=True) + feature_s10 = FeatureLabel.from_feature_position(41, 10, 10, 1, 1, one_hot=True) + feature_s11 = FeatureLabel.from_feature_position(42, 10, 10, 1, 1, one_hot=True) + feature_s12 = FeatureLabel.from_feature_position(43, 10, 10, 1, 1, one_hot=True) + feature_s13 = FeatureLabel.from_feature_position(44, 10, 10, 1, 1, one_hot=True) + feature_s14 = FeatureLabel.from_feature_position(45, 10, 10, 1, 1, one_hot=True) + feature_s15 = FeatureLabel.from_feature_position(46, 10, 10, 1, 1, one_hot=True) + feature_s16 = FeatureLabel.from_feature_position(47, 10, 10, 1, 1, one_hot=True) + feature_s17 = FeatureLabel.from_feature_position(48, 10, 10, 1, 1, one_hot=True) + + self.assertEqual(feature_s01.feature_type, FeatureType.IS_PURINE) + self.assertEqual(feature_s01.base_index, 1) + + self.assertEqual(feature_s01.stem_shoulder, StemShoulder.UPSTREAM) + self.assertEqual(feature_s01.feature_category, FeatureCategory.STEM) + + self.assertEqual(feature_s02.feature_type, FeatureType.IS_PYRIMIDINE) + + self.assertEqual(feature_s03.feature_type, FeatureType.NR_H_BONDS) + self.assertEqual(feature_s03.feature_category, FeatureCategory.STEM) + + self.assertEqual(feature_s04.feature_type, FeatureType.IS_PURINE) + self.assertEqual(feature_s04.stem_shoulder, StemShoulder.DOWNSTREAM) + + self.assertEqual(feature_s05.feature_type, FeatureType.IS_PYRIMIDINE) + self.assertEqual(feature_s06.feature_type, FeatureType.NR_H_BONDS) + + self.assertEqual(feature_s07.base_index, 1) self.assertEqual(feature_s07.stem_shoulder, None) - self.assertEqual(feature_s07.check_pairing, True) - - self.assertEqual(feature_s08.base_pair_index, 2) - self.assertEqual(feature_s08.base_identity, Base.PURINES) - - self.assertEqual(feature_s09.base_identity, Base.A) - self.assertEqual(feature_s10.base_identity, Base.C) - self.assertEqual(feature_s11.base_identity, Base.G) - self.assertEqual(feature_s12.base_identity, Base.U) - self.assertEqual(feature_s13.base_identity, Base.A) - self.assertEqual(feature_s14.base_identity, Base.C) - self.assertEqual(feature_s15.base_identity, Base.G) - self.assertEqual(feature_s16.base_identity, Base.U) - self.assertEqual(feature_s17.base_identity, None) - self.assertEqual(feature_s17.check_pairing, True) + + self.assertEqual(feature_s08.base_index, 2) + self.assertEqual(feature_s08.feature_type, FeatureType.IS_PURINE) + + self.assertEqual(feature_s09.feature_type, FeatureType.IS_A) + self.assertEqual(feature_s10.feature_type, FeatureType.IS_C) + self.assertEqual(feature_s11.feature_type, FeatureType.IS_G) + self.assertEqual(feature_s12.feature_type, FeatureType.IS_U) + self.assertEqual(feature_s13.feature_type, FeatureType.IS_A) + self.assertEqual(feature_s14.feature_type, FeatureType.IS_C) + self.assertEqual(feature_s15.feature_type, FeatureType.IS_G) + self.assertEqual(feature_s16.feature_type, FeatureType.IS_U) + self.assertEqual(feature_s17.feature_type, FeatureType.IS_BONDED) if __name__ == '__main__': diff --git a/mewtwo/machine_learning/data_preparation/binning.py b/mewtwo/machine_learning/data_preparation/binning.py index e62661a..64b9573 100644 --- a/mewtwo/machine_learning/data_preparation/binning.py +++ b/mewtwo/machine_learning/data_preparation/binning.py @@ -1,7 +1,6 @@ from math import isclose - def bin_data(y, n_bins: int = 5): bin_ranges = [] range_start = 0.0 @@ -27,7 +26,6 @@ def bin_data(y, n_bins: int = 5): except IndexError: raise ValueError(f"Expected value between 0.0 and 1.0. Got {y_data}") - bins.append(bin_index) return bins \ No newline at end of file diff --git a/mewtwo/machine_learning/data_preparation/train_test_split.py b/mewtwo/machine_learning/data_preparation/train_test_split.py index ba8ba1f..b52a20a 100644 --- a/mewtwo/machine_learning/data_preparation/train_test_split.py +++ b/mewtwo/machine_learning/data_preparation/train_test_split.py @@ -13,15 +13,21 @@ def __init__(self, train: list[Terminator], test: list[Terminator]): self.test = test -def split_data(terminators, test_size: float = 0.5, n_crossval_sets: int = 5): +def split_data(terminators, attribute_for_splitting: str, test_size: float = 0.5, n_crossval_sets: int = 5, + n_bins: int = 5): max_loop, max_stem, max_a, max_u = get_terminator_part_sizes(terminators) x = [] y = [] labels = [] + for terminator in terminators: x.append(terminator.to_vector(max_loop, max_stem, max_a, max_u)) y.append(terminator.te) - labels.append(terminator.species) + attribute = getattr(terminator, attribute_for_splitting) + labels.append(attribute) + + if type(labels[0]) in (int, float): + labels = bin_data(labels, n_bins) sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=250589) sss.get_n_splits(x, labels) diff --git a/mewtwo/machine_learning/feature_inference/infer_features_rf.py b/mewtwo/machine_learning/feature_inference/infer_features_rf.py index 3d90b3c..7d221dd 100644 --- a/mewtwo/machine_learning/feature_inference/infer_features_rf.py +++ b/mewtwo/machine_learning/feature_inference/infer_features_rf.py @@ -4,6 +4,7 @@ from mewtwo.parsers.parse_feature_file import parse_feature_file from mewtwo.data_processing.iterate_over_dir import iterate_over_dir from mewtwo.writers.write_feature_importances import write_feature_importances +from mewtwo.embeddings.feature_labels import FeatureCategory, FeatureLabel, FeatureType, StemShoulder def get_average_features_from_crossvalidation(input_folder, get_stdev=True): @@ -41,3 +42,95 @@ def write_average_importances(input_folder: str, out_file: str) -> None: write_feature_importances(feature_to_average_importance, out_file, feature_to_stdev=feature_to_stdev, sort_by_importance=True) + +class SortedFeatures: + def __init__(self, summed_feature_to_importance): + sorted_features = {StemShoulder.UPSTREAM: [], + StemShoulder.DOWNSTREAM: [], + FeatureCategory.U_TRACT: [], + FeatureCategory.A_TRACT: [], + FeatureCategory.LOOP: []} + for summed_feature, importance in summed_feature_to_importance.items(): + if summed_feature.feature_category != FeatureCategory.STEM: + sorted_features[summed_feature.feature_category].append((summed_feature, importance)) + + else: + sorted_features[summed_feature.stem_shoulder].append((summed_feature, importance)) + + self.a_tract = sorted(sorted_features[FeatureCategory.A_TRACT], key=lambda x: x[0].base_index) + self.u_tract = sorted(sorted_features[FeatureCategory.U_TRACT], key=lambda x: x[0].base_index) + self.loop = sorted(sorted_features[FeatureCategory.LOOP], key=lambda x: x[0].base_index) + self.left_stem_shoulder = sorted(sorted_features[StemShoulder.UPSTREAM], key=lambda x: x[0].base_index) + self.right_stem_shoulder = sorted(sorted_features[StemShoulder.DOWNSTREAM], key=lambda x: x[0].base_index) + + +def sum_importance_per_base(feature_to_importance): + importance_dict = {StemShoulder.UPSTREAM: {}, + StemShoulder.DOWNSTREAM: {}, + FeatureCategory.U_TRACT: {}, + FeatureCategory.A_TRACT: {}, + FeatureCategory.LOOP: {}} + + one_hot = False + + for feature, importance in feature_to_importance.items(): + if feature.feature_type in FeatureType.IS_BASE_IDENTITY: + one_hot = True + + if feature.feature_category != FeatureCategory.STEM: + if feature.base_index not in importance_dict[feature.feature_category]: + importance_dict[feature.feature_category][feature.base_index] = [] + importance_dict[feature.feature_category][feature.base_index].append(importance) + else: + if feature.base_index not in importance_dict[StemShoulder.UPSTREAM]: + importance_dict[StemShoulder.UPSTREAM][feature.base_index] = [] + if feature.base_index not in importance_dict[StemShoulder.DOWNSTREAM]: + importance_dict[StemShoulder.DOWNSTREAM][feature.base_index] = [] + + if feature.stem_shoulder is None: + importance_dict[StemShoulder.UPSTREAM][feature.base_index].append(importance) + importance_dict[StemShoulder.DOWNSTREAM][feature.base_index].append(importance) + elif feature.stem_shoulder == StemShoulder.UPSTREAM: + importance_dict[StemShoulder.UPSTREAM][feature.base_index].append(importance) + elif feature.stem_shoulder == StemShoulder.DOWNSTREAM: + importance_dict[StemShoulder.DOWNSTREAM][feature.base_index].append(importance) + else: + raise ValueError(f"Unknown value for stem shoulder: {feature.stem_shoulder}") + + summed_importances = {} + + for category, base_to_importances in importance_dict.items(): + if type(category) == StemShoulder: + feature_category = FeatureCategory.STEM + stem_shoulder = category + else: + feature_category = category + stem_shoulder = None + + for base, importances in base_to_importances.items(): + + if one_hot: + feature_type = FeatureType.ONE_HOT_TYPES + else: + feature_type = FeatureType.BASE_FEATURE_TYPES + + feature = FeatureLabel(feature_type, feature_category, base, stem_shoulder) + summed_importances[feature] = sum(importances) + + return summed_importances + + +def get_normalized_importances(feature_file): + feature_to_importance = parse_feature_file(feature_file) + normalize_importances(feature_to_importance) + + return feature_to_importance + + +def normalize_importances(feature_to_importance): + max_importance = max(feature_to_importance.values()) + factor = 1.0 / max_importance + + for feature in feature_to_importance: + feature_to_importance[feature] = factor * feature_to_importance[feature] + diff --git a/mewtwo/machine_learning/random_forest/rf_from_data.py b/mewtwo/machine_learning/random_forest/rf_from_data.py new file mode 100644 index 0000000..e4a6065 --- /dev/null +++ b/mewtwo/machine_learning/random_forest/rf_from_data.py @@ -0,0 +1,94 @@ +import os +from argparse import ArgumentParser, Namespace +from enum import Enum + +from joblib import dump + +from mewtwo.parsers.parse_data_chen import get_chen_terminators +from mewtwo.machine_learning.data_preparation.train_test_split import split_data +from mewtwo.machine_learning.random_forest.train_random_forest import train_random_forest, RandomForestMode, \ + FeaturisationMode +from mewtwo.machine_learning.feature_inference.infer_features_rf import write_average_importances +from mewtwo.parsers.parse_termite_output import get_termite_terminators +from mewtwo.embeddings.terminator.draw_terminator import visualise_feature_importances + + +class DataSource(Enum): + TERMITE = 1 + CHEN = 2 + + +def parse_arguments() -> Namespace: + parser = ArgumentParser(description="Train random forest from E. coli terminator data published by Chen et al.") + parser.add_argument("-i", type=str, required=True, help="Path to input file.") + parser.add_argument("-o", type=str, required=True, help="Path to output directory.") + parser.add_argument("-d", type=str, default="CHEN", help="Data source. Must be CHEN or TERMITE") + parser.add_argument("-m", type=str, default="FULL", help="Random forest training mode. Must be one of \ + 'FULL', 'TRAIN', or 'CROSSVALIDATION'.") + parser.add_argument("-a", type=str, default="is_synthetic", help="Attribute for stratified data splitting") + parser.add_argument("-f", type=str, default="ONE_HOT", help="Featurisation mode. Must be one of 'ONE_HOT' or \ + 'PURINE_PYRIMIDINE'.") + parser.add_argument("-s", action="store_true", help="If given, save random forest models.") + parser.add_argument("-n", type=int, default=100, help="Number of trees in RF") + + args = parser.parse_args() + return args + + +def rf_from_data(data_file: str, data_source: DataSource, out_dir: str, attribute: str, save_model: bool = False, + mode: RandomForestMode = RandomForestMode.CROSSVALIDATION, + featurisation_mode: FeaturisationMode = FeaturisationMode.ONE_HOT, n_trees: int = 100) -> None: + if data_source == DataSource.CHEN: + terminators = get_chen_terminators(data_file) + elif data_source == DataSource.TERMITE: + terminators = get_termite_terminators(data_file, species_column=True) + else: + raise ValueError(f"Unknown data source: {data_source.name}") + + train_terminators, test_terminators, crossvalidation_sets = split_data(terminators, + attribute_for_splitting=attribute) + + if featurisation_mode == FeaturisationMode.ONE_HOT: + one_hot = True + else: + one_hot = False + + if RandomForestMode.CROSSVALIDATION in mode: + figure_dir = os.path.join(out_dir, "feature_importance_visualisations_crossval") + + for crossval_nr, crossvalidation_set in crossvalidation_sets.items(): + out_path = os.path.join(out_dir, f"crossvalidation_results_{crossval_nr}") + + rf = train_random_forest(crossvalidation_set.train, crossvalidation_set.test, one_hot=one_hot, + out_dir=out_path, n_trees=n_trees) + if save_model: + model_path = os.path.join(out_dir, f"crossvalidation_model_{crossval_nr}.rf") + dump(rf, model_path) + + averaged_features_dir = os.path.join(out_dir, "average_feature_importances.txt") + + write_average_importances(out_dir, averaged_features_dir) + visualise_feature_importances(averaged_features_dir, figure_dir) + + if RandomForestMode.TRAIN in mode: + figure_dir = os.path.join(out_dir, "feature_importance_visualisations_train") + + rf = train_random_forest(train_terminators, test_terminators, one_hot=one_hot, out_dir=out_dir, n_trees=n_trees) + if save_model: + model_path = os.path.join(out_dir, f"predictor.rf") + dump(rf, model_path) + + visualise_feature_importances(os.path.join(out_dir, "feature_importances.txt"), figure_dir) + + +def main(): + args = parse_arguments() + if not os.path.exists(args.o): + os.mkdir(args.o) + + rf_from_data(args.i, DataSource[args.d], args.o, args.a, args.s, RandomForestMode[args.m], + FeaturisationMode[args.f], args.n) + + +if __name__ == "__main__": + main() diff --git a/mewtwo/machine_learning/random_forest/train_random_forest.py b/mewtwo/machine_learning/random_forest/train_random_forest.py index 24c4e33..0655efa 100644 --- a/mewtwo/machine_learning/random_forest/train_random_forest.py +++ b/mewtwo/machine_learning/random_forest/train_random_forest.py @@ -1,16 +1,31 @@ import os +from enum import Enum, Flag +from typing import Optional from sklearn.ensemble import RandomForestRegressor from scipy.stats import pearsonr, spearmanr -from mewtwo.embeddings.terminator.terminator import get_terminator_part_sizes -from mewtwo.embeddings.feature_labels import FeatureLabel +from mewtwo.embeddings.terminator.terminator import get_terminator_part_sizes, Terminator +from mewtwo.embeddings.feature_labels import FeatureLabel, FeatureCategory -def train_random_forest(train_terminators, test_terminators, out_dir=None, one_hot=True): +class FeaturisationMode(Enum): + ONE_HOT = 1 + PURINE_PYRIMIDINE = 2 + + +class RandomForestMode(Flag): + CROSSVALIDATION = 1 + TRAIN = 2 + FULL = CROSSVALIDATION | TRAIN + + +def train_random_forest(train_terminators: list[Terminator], test_terminators: list[Terminator], + out_dir: Optional[str] = None, one_hot: bool = True, n_trees: int = 100): features_out = os.path.join(out_dir, "feature_importances.txt") performance_out = os.path.join(out_dir, "performance.txt") + datapoints_out = os.path.join(out_dir, "actual_vs_predicted.txt") all_terminators = train_terminators + test_terminators max_loop, max_stem, max_a, max_u = get_terminator_part_sizes(all_terminators) @@ -18,6 +33,7 @@ def train_random_forest(train_terminators, test_terminators, out_dir=None, one_h train_y = [] test_x = [] test_y = [] + for terminator in train_terminators: train_x.append(terminator.to_vector(max_loop, max_stem, max_a, max_u, one_hot=one_hot)) train_y.append(terminator.te) @@ -26,22 +42,27 @@ def train_random_forest(train_terminators, test_terminators, out_dir=None, one_h test_x.append(terminator.to_vector(max_loop, max_stem, max_a, max_u, one_hot=one_hot)) test_y.append(terminator.te) - random_forest = RandomForestRegressor(n_estimators=1000, oob_score=True) + random_forest = RandomForestRegressor(n_estimators=n_trees, oob_score=True) random_forest.fit(train_x, train_y) print("oob", random_forest.oob_score_) importance_and_label = [] + if train_terminators[0].u_tract.pot is not None: + u_tract_pot = True + else: + u_tract_pot = False for i, feature_importance in enumerate(random_forest.feature_importances_): - feature_label = FeatureLabel(i, max_a, max_stem, max_loop, max_u, one_hot=one_hot) + feature_label = FeatureLabel.from_feature_position(i, max_a, max_stem, max_loop, max_u, one_hot=one_hot, + utract_has_pot=u_tract_pot) importance_and_label.append((feature_importance, feature_label)) importance_and_label.sort(key=lambda x: x[0], reverse=True) for importance, label in importance_and_label[:20]: - if label.feature_category == 'stem': - print(label.feature_category, label.feature_type, importance, label.base_pair_index, label.base_identity, label.stem_shoulder) + if label.feature_category == FeatureCategory.STEM: + print(label.feature_category, label.feature_type, importance, label.base_index, label.stem_shoulder) else: - print(label.feature_category, label.feature_type, importance, label.base_index, label.base_identity) + print(label.feature_category, label.feature_type, importance, label.base_index) print("test score", random_forest.score(test_x, test_y)) print("Pearson correlation: ", pearsonr(random_forest.predict(test_x), test_y)) @@ -53,17 +74,18 @@ def train_random_forest(train_terminators, test_terminators, out_dir=None, one_h with open(features_out, 'w') as features: features.write("feature_name\tfeature_importance\n") for importance, label in importance_and_label: - if label.feature_category == 'stem': - index = f"basepair_{label.base_pair_index}_{label.stem_shoulder}" - else: - index = f"basepair_{label.base_index}" - features.write(f"{label.feature_category}|{index}|{label.feature_type.name}\t{importance:.10f}\n") + features.write(f"{label}\t{importance:.10f}\n") with open(performance_out, 'w') as out: out.write("test_score\tpearson\tspearman\n") out.write(f"{random_forest.score(test_x, test_y):.10f}\t{pearsonr(random_forest.predict(test_x), test_y).statistic:.10f}\t{spearmanr(random_forest.predict(test_x), test_y).statistic}") + with open(datapoints_out, 'w') as out: + out.write(f"actual\tpredicted\n") + for i, prediction in enumerate(random_forest.predict(test_x)): + true_label = test_y[i] + out.write(f"{true_label}\t{prediction}\n") return random_forest diff --git a/mewtwo/parsers/parse_data_chen.py b/mewtwo/parsers/parse_data_chen.py index e69de29..a468ce6 100644 --- a/mewtwo/parsers/parse_data_chen.py +++ b/mewtwo/parsers/parse_data_chen.py @@ -0,0 +1,65 @@ +from sys import argv + +from mewtwo.parsers.tabular import Tabular +from mewtwo.data_processing.compute_te import ts_to_te +from mewtwo.embeddings.terminator.hairpin import RNAFoldHairpin +from mewtwo.embeddings.terminator.a_tract import ATract +from mewtwo.embeddings.terminator.u_tract import UTract +from mewtwo.embeddings.terminator.terminator import Terminator + + +def get_chen_data(input_file, type_column): + if not type_column: + chen_data = Tabular(input_file, [0]) + else: + chen_data = Tabular(input_file, [1]) + + return chen_data + + +def chen_to_dnabert_input(input_file, output_file, type_column: bool = True): + with open(output_file, 'w') as out: + chen_data = get_chen_data(input_file, type_column) + for datapoint in chen_data.data: + sequence = chen_data.get_value(datapoint, "Sequence") + ts = float(chen_data.get_value(datapoint, "Average Strength")) + te = max(0.0, ts_to_te(ts)) + if 1.0 >= te >= -0.0000001: + out.write(f"{sequence}\t{te}\n") + + +def get_chen_terminators(input_file, type_column: bool = True): + chen_data = get_chen_data(input_file, type_column) + terminators = [] + for datapoint in chen_data.data: + ts = float(chen_data.get_value(datapoint, "Average Strength")) + te = max(0.0, ts_to_te(ts)) + if 1.0 >= te >= -0.0000001: + terminator_type = chen_data.get_value(datapoint, 'Type') + if terminator_type == 'Synthetic': + is_synthetic = True + else: + is_synthetic = False + + free_energy = chen_data.get_value(datapoint, 'dGH') + hairpin_structure = chen_data.get_value(datapoint, 'Structure') + hairpin_sequence = chen_data.get_value(datapoint, 'Hairpin') + sequence = chen_data.get_value(datapoint, 'Sequence') + a_tract_seq = chen_data.get_value(datapoint, 'A-tract') + u_tract_seq = chen_data.get_value(datapoint, 'U-tract') + terminator_id = chen_data.get_value(datapoint, 'Name') + + hairpin = RNAFoldHairpin(terminator_id, free_energy, hairpin_sequence, hairpin_structure) + if not hairpin.contains_multiple_hairpins(): + + a_tract = ATract(a_tract_seq) + u_tract = UTract(u_tract_seq) + terminator = Terminator(hairpin, a_tract, u_tract, sequence, termination_efficiency=te, + is_synthetic=is_synthetic) + terminators.append(terminator) + + return terminators + + +if __name__ == "__main__": + chen_to_dnabert_input(argv[1], argv[2]) diff --git a/mewtwo/parsers/parse_feature_file.py b/mewtwo/parsers/parse_feature_file.py index 5bbceaa..020ff77 100644 --- a/mewtwo/parsers/parse_feature_file.py +++ b/mewtwo/parsers/parse_feature_file.py @@ -1,7 +1,9 @@ from mewtwo.parsers.tabular import Tabular +from mewtwo.embeddings.feature_labels import FeatureLabel +from mewtwo.embeddings.sequence import SeqType -def parse_feature_file(input_file: str) -> dict[str, float]: +def parse_feature_file(input_file: str) -> dict[FeatureLabel, float]: feature_to_importance = {} feature_data = Tabular(input_file, [0]) @@ -9,6 +11,6 @@ def parse_feature_file(input_file: str) -> dict[str, float]: feature_name = feature_data.get_value(feature, "feature_name") importance = float(feature_data.get_value(feature, "feature_importance")) - feature_to_importance[feature_name] = importance + feature_to_importance[FeatureLabel.from_string(feature_name)] = importance return feature_to_importance diff --git a/mewtwo/parsers/parse_termite_output.py b/mewtwo/parsers/parse_termite_output.py index b0c7328..e453cc8 100644 --- a/mewtwo/parsers/parse_termite_output.py +++ b/mewtwo/parsers/parse_termite_output.py @@ -1,7 +1,7 @@ from mewtwo.parsers.tabular import Tabular from mewtwo.embeddings.terminator.hairpin import RNAFoldHairpin, TransTermHPHairpin from mewtwo.embeddings.terminator.terminator import Terminator -from mewtwo.embeddings.sequence import DNASequence, convert_to_rna, convert_to_dna +from mewtwo.embeddings.sequence import convert_to_dna from mewtwo.embeddings.terminator.a_tract import ATract from mewtwo.embeddings.terminator.u_tract import UTract from mewtwo.machine_learning.random_forest.train_random_forest import train_random_forest @@ -64,6 +64,7 @@ def sort_by_species(terminators: list[Terminator]) -> dict[str, list[Terminator] return species_to_terminators + def get_termite_terminators(input_file: str, prioritise_rnafold: bool = True, species_column: bool = False, te_only: bool = True) -> list[Terminator]: """ @@ -97,13 +98,15 @@ def get_termite_terminators(input_file: str, prioritise_rnafold: bool = True, sp start = int(termite_data.get_value(datapoint, 'start')) end = int(termite_data.get_value(datapoint, 'end')) strand = termite_data.get_value(datapoint, 'strand') - sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, 'sequence'))) + sequence = termite_data.get_value(datapoint, 'sequence') + + print(sequence) te = termite_data.get_value(datapoint, "termination efficiency") if te == '.': te = None elif 0 <= float(te) <= 100: - te = float(te) + te = float(te) / 100 else: print(datapoint, te) te = None @@ -114,35 +117,33 @@ def get_termite_terminators(input_file: str, prioritise_rnafold: bool = True, sp if termite_data.get_value(datapoint, 'rnafold') == '+': hairpin = RNAFoldHairpin(terminator_id, - termite_data.get_value(datapoint, "rnafold POT distance to hairpin"), - termite_data.get_value(datapoint, "rnafold energy"), + float(termite_data.get_value(datapoint, "rnafold energy")), termite_data.get_value(datapoint, "rnafold hairpin"), - termite_data.get_value(datapoint, "rnafold hairpin structure")) + termite_data.get_value(datapoint, "rnafold hairpin structure"), + int(termite_data.get_value(datapoint, "rnafold POT distance to hairpin"))) if not hairpin.contains_multiple_hairpins(): - a_tract_sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, "rnafold a tract"))) - a_tract = ATract(a_tract_sequence) - u_tract_sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, "rnafold u tract"))) + + a_tract = ATract(termite_data.get_value(datapoint, "rnafold a tract")) relative_pot = int(termite_data.get_value(datapoint, "rnafold POT distance to hairpin")) - u_tract = UTract(u_tract_sequence, relative_pot) + u_tract = UTract(termite_data.get_value(datapoint, "rnafold u tract"), relative_pot) - terminator = Terminator(start, end, pot, species, chromosome, strand, sequence, te, hairpin, a_tract, - u_tract) + terminator = Terminator(hairpin, a_tract, + u_tract, sequence, te, start, end, pot, species, chromosome, strand) rnafold_terminators[terminator_id] = terminator if termite_data.get_value(datapoint, 'transtermhp') == '+': hairpin = TransTermHPHairpin(terminator_id, - termite_data.get_value(datapoint, "transtermhp POT distance to hairpin"), termite_data.get_value(datapoint, "transtermhp hairpin score"), - termite_data.get_value(datapoint, "transtermhp hairpin")) + termite_data.get_value(datapoint, "transtermhp hairpin"), + termite_data.get_value(datapoint, "transtermhp POT distance to hairpin")) if not hairpin.contains_multiple_hairpins(): - a_tract_sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, "transtermhp a tract"))) - a_tract = ATract(a_tract_sequence) - u_tract_sequence = convert_to_rna(DNASequence(termite_data.get_value(datapoint, "transtermhp u tract"))) + + a_tract = ATract(termite_data.get_value(datapoint, "transtermhp a tract")) relative_pot = int(termite_data.get_value(datapoint, "transtermhp POT distance to hairpin")) - u_tract = UTract(u_tract_sequence, relative_pot) + u_tract = UTract(termite_data.get_value(datapoint, "transtermhp u tract"), relative_pot) - terminator = Terminator(start, end, pot, species, chromosome, strand, sequence, te, hairpin, a_tract, - u_tract) + terminator = Terminator(hairpin, a_tract, + u_tract, sequence, te, start, end, pot, species, chromosome, strand) transtermhp_terminators[terminator_id] = terminator terminators = [] @@ -181,10 +182,10 @@ def rnafold_hairpins_from_termite(input_file: str, get_rnafold: bool = True, get hairpin_id = '|'.join(datapoint) if termite_data.get_value(datapoint, 'rnafold') == '+': hairpin = RNAFoldHairpin(hairpin_id, - termite_data.get_value(datapoint, "rnafold POT distance to hairpin"), termite_data.get_value(datapoint, "rnafold energy"), termite_data.get_value(datapoint, "rnafold hairpin"), - termite_data.get_value(datapoint, "rnafold hairpin structure")) + termite_data.get_value(datapoint, "rnafold hairpin structure"), + termite_data.get_value(datapoint, "rnafold POT distance to hairpin")) rnafold_hairpins[hairpin_id] = hairpin if termite_data.get_value(datapoint, 'transtermhp') == '+': hairpin = TransTermHPHairpin(hairpin_id, @@ -228,7 +229,7 @@ def rnafold_hairpins_from_termite(input_file: str, get_rnafold: bool = True, get ecoli_terminators.extend(species_terminators) all_terminators = bacillus_terminators + ecoli_terminators - train_terminators, test_terminators, crossvalidation_sets = split_data(all_terminators, test_size=0.1) + train_terminators, test_terminators, crossvalidation_sets = split_data(all_terminators, "species", test_size=0.1) for crossval_nr, crossvalidation_set in crossvalidation_sets.items(): out_dir = os.path.join(argv[2], f"crossvalidation_results_{crossval_nr}") diff --git a/mewtwo/writers/write_model_settings.py b/mewtwo/writers/write_model_settings.py index 02d01e0..2bd87b5 100644 --- a/mewtwo/writers/write_model_settings.py +++ b/mewtwo/writers/write_model_settings.py @@ -27,7 +27,7 @@ def parse_arguments(): parser.add_argument('--early_stopping_patiences', type=int, nargs='*', default=[2, 3]) parser.add_argument('--use_early_stopping', action="store_true") parser.add_argument('--scheduler_types', type=str, nargs='*', default=["COS_ANNEAL_WARMUP", "REDUCE_ON_PLATEAU", - "REDUCE_ON_PLATEAU_WARMUP", "WARMUP_ONlY"]) + "REDUCE_ON_PLATEAU_WARMUP", "WARMUP_ONLY"]) parser.add_argument('--plateau_patiences', type=int, nargs='*', default=[2, 3]) parser.add_argument('--plateau_factors', type=float, nargs='*', default=[0.5]) parser.add_argument('--second_layer_dim', type=int, nargs='*', default=[]) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..59a934f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +torch==2.2.2 +peft==0.10.0 +scikit-learn==1.6.1 +transformers==4.40.1 +mewtwo==0.0.1 +pip==25.0 +zlib==1.2.13 +wheel==0.45.1 +openssl==3.0.16 +setuptools==72.1.0 +einops==0.8.1 +scipy==1.15.2 +joblib==1.4.2 \ No newline at end of file