bioforensics · standage · Apr 23, 2025 · Sep 20, 2023 · Oct 17, 2023 · Oct 19, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,11 +4,17 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## [Unreleased]
 
+### Added
+- Panel design notebooks (see #157).
+
 ### Fixed
-- Debugged a test that counts observed haplotypes (#154).
-- Replaced global pooled Ae values with 26-population average as the default Ae reported (#155, #158).
-- Replaced deprecated `pkg_resources` module with `importlib.resources` (#156).
-- Upgraded versioneer to a Python 3.12+ compatible version (#156).
+- Debugged a test that counts observed haplotypes (see #154).
+- Replaced global pooled Ae values with 26-population average as the default Ae reported (see #155, #158).
+- Replaced deprecated `pkg_resources` module with `importlib.resources` (see #156).
+- Upgraded versioneer to a Python 3.12+ compatible version (see #156).
+
+### Removed
+- Table flagging microhaps with repetitive content (see #153, #157).
 
 
 ## [0.11] 2023-10-25
@@ -21,13 +27,13 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 - Merged RSIDs resolved during database build now propagated to the final marker definition (see #149).
 
 ### Fixed
-- Added manual and automated fixes to ensure frequencies are formatted correcly and matche to the correct marker definition (see #150).
+- Added manual and automated fixes to ensure frequencies are formatted correctly and matched to the correct marker definition (see #150).
 
 
 ## [0.10.1] 2023-10-13
 
 ### Fixed
-- Bug with offsets table (`marker --format=offsets`) when multiple markers are defined for a locus (#144).
+- Bug with offsets table (`marker --format=offsets`) when multiple markers are defined for a locus (see #144).
 
 
 ## [0.10] 2023-09-15

diff --git a/dbbuild/README.md b/dbbuild/README.md
@@ -29,7 +29,7 @@ The goal is that—if ever needed, heaven forbid—any reasonbly capable bioinfo
 The MicroHapDB database can be rebuilt with the following command in the `dbbuild/` directory.
 
 ```
-./build.py databases/dbSNP/ databases/chains/ | tee build-summary.txt
+./build.py databases/dbSNP/ databases/chains/ --exclude Auton2015 | tee build-summary.txt
 ```
 
 The arguments provided to the build script will depend on the location of the dbSNP files and liftover chain files on the system.

diff --git a/dbbuild/build.py b/dbbuild/build.py
@@ -15,14 +15,11 @@
 from argparse import ArgumentParser
 from lib import SourceIndex
 from pathlib import Path
-from repeats import main as flag_repeats
 import sys
 
 
-def main(
-    source_path, dbsnp_path, chain_path, rmsk_path, exclusions=["Auton2015"], check_only=False
-):
-    validate_paths(dbsnp_path, chain_path, rmsk_path)
+def main(source_path, dbsnp_path, chain_path, exclusions=["Auton2015"], check_only=False):
+    validate_paths(dbsnp_path, chain_path)
     if check_only:
         return
     index = SourceIndex(source_path, dbsnp_path, chain_path, exclude=exclusions)
@@ -35,12 +32,10 @@ def main(
     frequencies.to_csv("frequency.csv.gz", index=False, float_format="%.5f", compression="gzip")
     index.populations.to_csv("population.csv", index=False)
     index.merges.to_csv("merged.csv", index=False)
-    repeats = flag_repeats(Path(rmsk_path) / "rmsk.txt.gz", "marker.csv", delta=25)
-    repeats.to_csv("repeats.csv", index=False)
     print(index)
 
 
-def validate_paths(dbsnp_path, rmsk_path, chain_path):
+def validate_paths(dbsnp_path, chain_path):
     paths = list()
     for version in (37, 38):
         for extension in ("vcf.gz", "vcf.gz.tbi", "rsidx"):
@@ -49,7 +44,6 @@ def validate_paths(dbsnp_path, rmsk_path, chain_path):
     paths.append(Path(dbsnp_path) / "refsnp-merged.csv.gz")
     paths.append(Path(chain_path) / "hg19ToHg38.over.chain.gz")
     paths.append(Path(chain_path) / "hg38ToHg19.over.chain.gz")
-    paths.append(Path(rmsk_path) / "rmsk.txt.gz")
     files_present = [p.is_file() for p in paths]
     print("-" * 60, "[Auxiliary data file check]\n", "Present  Path", sep="\n", file=sys.stderr)
     for path, present in zip(paths, files_present):
@@ -81,7 +75,6 @@ def get_parser():
     parser = ArgumentParser(description="MicroHapDB database build procedure")
     parser.add_argument("dbsnp_path")
     parser.add_argument("chain_path")
-    parser.add_argument("rmsk_path")
     parser.add_argument(
         "--sources",
         default="sources",
@@ -107,7 +100,6 @@ def get_parser():
         args.sources,
         args.dbsnp_path,
         args.chain_path,
-        args.rmsk_path,
         exclusions=args.exclude,
         check_only=args.check,
     )
diff --git a/dbbuild/repeats.py b/dbbuild/repeats.py
@@ -58,6 +58,12 @@ def parse_ucsc_rmsk_track(path):
         "id",
     ]
     table = pd.read_csv(path, sep="\t", names=header)
+    table = table[
+        (~table.repClass.isin(("SINE", "LINE", "LTR")))
+        | ((table.repClass == "SINE") & (table.swScore > 929))
+        | ((table.repClass == "LINE") & (table.swScore > 411))
+        | ((table.repClass == "LTR")  & (table.swScore > 909))
+    ]
     return table.groupby("genoName")
 
 

diff --git a/microhapdb/__init__.py b/microhapdb/__init__.py
@@ -11,7 +11,7 @@
 # -------------------------------------------------------------------------------------------------
 
 from . import nomenclature
-from .tables import markers, merged, populations, frequencies, repeats, indels, variantmap, hg38
+from .tables import markers, merged, populations, frequencies, indels, variantmap, hg38
 from .population import Population
 from .marker import Marker, Locus
 from microhapdb import cli

diff --git a/microhapdb/cli/marker.py b/microhapdb/cli/marker.py
@@ -17,7 +17,6 @@
 import pandas as pd
 import sys
 from textwrap import dedent
-from warnings import warn
 
 
 def main(args):
@@ -96,7 +95,7 @@ def display(
             for marker in markers:
                 loci[marker.locus].markers.append(marker)
             table = pd.concat([locus.definition for locus in loci.values()])
-            table = table.rename(columns={"ChromOffset": f"OffsetHg38"})
+            table = table.rename(columns={"ChromOffset": "OffsetHg38"})
             table.to_csv(sys.stdout, sep="\t", index=False)
         else:
             raise ValueError(f'unsupported view format "{view_format}"')