Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
7b780e3
MHWG panel brainstorming
standage Sep 20, 2023
62caacb
Recovered notebook [skip ci]
standage Oct 17, 2023
3aa0084
MHWG panel notebooks
standage Oct 19, 2023
66a3321
Merge branch 'master' into panel/mhwg
standage Oct 19, 2023
8659983
Merge branch 'master' into panel/mhwg
standage Oct 23, 2023
875d015
Merge branch 'master' into panel/mhwg
standage Oct 25, 2023
b390ac6
Mark II
standage Oct 25, 2023
ba4ce3e
Merge branch 'master' into panel/mhwg
standage Feb 16, 2024
591393e
Filter
standage Sep 19, 2024
60d384a
New design procedure [skip ci]
standage Sep 20, 2024
ef5ee44
Merge branch 'master' into panel/mhwg
standage Sep 25, 2024
ee1bf8b
Moar updates [skip ci]
standage Sep 26, 2024
968d31a
Filter by low complexity [skip ci]
standage Oct 24, 2024
165aea6
Cleanup [skip ci]
standage Nov 14, 2024
ad83663
Clean low complexity [skip ci]
standage Nov 15, 2024
50a6249
Filter by repeats [skip ci]
standage Nov 18, 2024
6e808be
Filter by forensic STRs [skip ci]
standage Nov 18, 2024
9358dfa
Putting it all together [skip ci]
standage Nov 19, 2024
3b44733
Masking workflow complete [skip ci]
standage Nov 20, 2024
6ef8bf8
Design workflow again [skip ci]
standage Nov 20, 2024
9f8f14b
Add whitelist [skip ci]
standage Nov 21, 2024
3ae4a68
Final panel [skip ci]
standage Nov 22, 2024
07830c0
Final panel design
standage Nov 29, 2024
9e2ad30
txt -> tsv
standage Nov 29, 2024
74021f5
Final panel
standage Nov 30, 2024
e38c028
Add RSIDs
standage Nov 30, 2024
34e4923
Reorg
standage Dec 2, 2024
609de6f
Code cleanup
standage Dec 2, 2024
aa0d4fe
code format [skip ci]
standage Dec 3, 2024
ee34280
Ideogram [skip ci]
standage Dec 5, 2024
381bf27
Include scripts for plotting ideogram and Ae scores vs CODIS
standage Apr 22, 2025
0348b22
Update DB growth figure
standage Apr 23, 2025
4de82fe
Document markII
standage Apr 23, 2025
3a1bb4f
Remove repeats table
standage Apr 23, 2025
4ba3d6f
Merge branch 'master' into panel/mhwg
standage Apr 23, 2025
321998f
Remove repeats from dbbuild as well
standage Apr 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,17 @@ This project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased]

### Added
- Panel design notebooks (see #157).

### Fixed
- Debugged a test that counts observed haplotypes (#154).
- Replaced global pooled Ae values with 26-population average as the default Ae reported (#155, #158).
- Replaced deprecated `pkg_resources` module with `importlib.resources` (#156).
- Upgraded versioneer to a Python 3.12+ compatible version (#156).
- Debugged a test that counts observed haplotypes (see #154).
- Replaced global pooled Ae values with 26-population average as the default Ae reported (see #155, #158).
- Replaced deprecated `pkg_resources` module with `importlib.resources` (see #156).
- Upgraded versioneer to a Python 3.12+ compatible version (see #156).

### Removed
- Table flagging microhaps with repetitive content (see #153, #157).


## [0.11] 2023-10-25
Expand All @@ -21,13 +27,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- Merged RSIDs resolved during database build now propagated to the final marker definition (see #149).

### Fixed
- Added manual and automated fixes to ensure frequencies are formatted correcly and matche to the correct marker definition (see #150).
- Added manual and automated fixes to ensure frequencies are formatted correctly and matched to the correct marker definition (see #150).


## [0.10.1] 2023-10-13

### Fixed
- Bug with offsets table (`marker --format=offsets`) when multiple markers are defined for a locus (#144).
- Bug with offsets table (`marker --format=offsets`) when multiple markers are defined for a locus (see #144).


## [0.10] 2023-09-15
Expand Down
2 changes: 1 addition & 1 deletion dbbuild/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ The goal is that—if ever needed, heaven forbid—any reasonably capable bioinfo
The MicroHapDB database can be rebuilt with the following command in the `dbbuild/` directory.

```
./build.py databases/dbSNP/ databases/chains/ | tee build-summary.txt
./build.py databases/dbSNP/ databases/chains/ --exclude Auton2015 | tee build-summary.txt
```

The arguments provided to the build script will depend on the location of the dbSNP files and liftover chain files on the system.
Expand Down
14 changes: 3 additions & 11 deletions dbbuild/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,11 @@
from argparse import ArgumentParser
from lib import SourceIndex
from pathlib import Path
from repeats import main as flag_repeats
import sys


def main(
source_path, dbsnp_path, chain_path, rmsk_path, exclusions=["Auton2015"], check_only=False
):
validate_paths(dbsnp_path, chain_path, rmsk_path)
def main(source_path, dbsnp_path, chain_path, exclusions=["Auton2015"], check_only=False):
validate_paths(dbsnp_path, chain_path)
if check_only:
return
index = SourceIndex(source_path, dbsnp_path, chain_path, exclude=exclusions)
Expand All @@ -35,12 +32,10 @@ def main(
frequencies.to_csv("frequency.csv.gz", index=False, float_format="%.5f", compression="gzip")
index.populations.to_csv("population.csv", index=False)
index.merges.to_csv("merged.csv", index=False)
repeats = flag_repeats(Path(rmsk_path) / "rmsk.txt.gz", "marker.csv", delta=25)
repeats.to_csv("repeats.csv", index=False)
print(index)


def validate_paths(dbsnp_path, rmsk_path, chain_path):
def validate_paths(dbsnp_path, chain_path):
paths = list()
for version in (37, 38):
for extension in ("vcf.gz", "vcf.gz.tbi", "rsidx"):
Expand All @@ -49,7 +44,6 @@ def validate_paths(dbsnp_path, rmsk_path, chain_path):
paths.append(Path(dbsnp_path) / "refsnp-merged.csv.gz")
paths.append(Path(chain_path) / "hg19ToHg38.over.chain.gz")
paths.append(Path(chain_path) / "hg38ToHg19.over.chain.gz")
paths.append(Path(rmsk_path) / "rmsk.txt.gz")
files_present = [p.is_file() for p in paths]
print("-" * 60, "[Auxiliary data file check]\n", "Present Path", sep="\n", file=sys.stderr)
for path, present in zip(paths, files_present):
Expand Down Expand Up @@ -81,7 +75,6 @@ def get_parser():
parser = ArgumentParser(description="MicroHapDB database build procedure")
parser.add_argument("dbsnp_path")
parser.add_argument("chain_path")
parser.add_argument("rmsk_path")
parser.add_argument(
"--sources",
default="sources",
Expand All @@ -107,7 +100,6 @@ def get_parser():
args.sources,
args.dbsnp_path,
args.chain_path,
args.rmsk_path,
exclusions=args.exclude,
check_only=args.check,
)
6 changes: 6 additions & 0 deletions dbbuild/repeats.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ def parse_ucsc_rmsk_track(path):
"id",
]
table = pd.read_csv(path, sep="\t", names=header)
table = table[
(~table.repClass.isin(("SINE", "LINE", "LTR")))
| ((table.repClass == "SINE") & (table.swScore > 929))
| ((table.repClass == "LINE") & (table.swScore > 411))
| ((table.repClass == "LTR") & (table.swScore > 909))
]
return table.groupby("genoName")


Expand Down
2 changes: 1 addition & 1 deletion microhapdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# -------------------------------------------------------------------------------------------------

from . import nomenclature
from .tables import markers, merged, populations, frequencies, repeats, indels, variantmap, hg38
from .tables import markers, merged, populations, frequencies, indels, variantmap, hg38
from .population import Population
from .marker import Marker, Locus
from microhapdb import cli
Expand Down
3 changes: 1 addition & 2 deletions microhapdb/cli/marker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import pandas as pd
import sys
from textwrap import dedent
from warnings import warn


def main(args):
Expand Down Expand Up @@ -96,7 +95,7 @@ def display(
for marker in markers:
loci[marker.locus].markers.append(marker)
table = pd.concat([locus.definition for locus in loci.values()])
table = table.rename(columns={"ChromOffset": f"OffsetHg38"})
table = table.rename(columns={"ChromOffset": "OffsetHg38"})
table.to_csv(sys.stdout, sep="\t", index=False)
else:
raise ValueError(f'unsupported view format "{view_format}"')
Expand Down
Loading