diff --git a/bench/varity/large_vcf_lift_bench.clj b/bench/varity/large_vcf_lift_bench.clj new file mode 100644 index 0000000..9c59104 --- /dev/null +++ b/bench/varity/large_vcf_lift_bench.clj @@ -0,0 +1,21 @@ +(ns varity.large-vcf-lift-bench + (:require [libra.bench :refer [defbench is]] + [libra.criterium :as c] + [cljam.io.sequence :as cseq] + [cljam.io.vcf :as vcf] + [varity.chain :as ch] + [varity.vcf-lift :as vcf-lift] + [varity.t-common + :refer [test-ref-seq-file + test-large-vcf-file + test-large-chain-file + prepare-cavia!]])) + +(defbench large-vcf-lift-bench + (prepare-cavia!) + (with-open [seq-rdr (cseq/reader test-ref-seq-file) + vcf-rdr (vcf/reader test-large-vcf-file)] + (let [chidx (ch/index (ch/load-chain test-large-chain-file))] + (is (c/quick-bench + (doall (vcf-lift/liftover-variants + seq-rdr chidx (vcf/read-variants vcf-rdr)))))))) diff --git a/src/varity/chain.clj b/src/varity/chain.clj index 21b4568..860147c 100644 --- a/src/varity/chain.clj +++ b/src/varity/chain.clj @@ -5,9 +5,9 @@ (:require [clojure.java.io :as io] [clojure.string :as string] [cljam.util.chromosome :refer [normalize-chromosome-key]] + [cljam.util.intervals :as intervals] [proton.core :refer [as-long]] - [varity.util :as util]) - (:import [clojure.lang Sorted])) + [varity.util :as util])) (defn- update-multi [m ks f] @@ -81,7 +81,9 @@ (recur (->> (assoc d :t-start curr-t-start :q-start curr-q-start :t-end (+ curr-t-start (:size d)) - :q-end (+ curr-q-start (:size d))) + :q-end (+ curr-q-start (:size d)) + :start curr-t-start + :end (+ curr-t-start (:size d))) (conj! results)) (first r) (next r) @@ -93,11 +95,11 @@ (:q-end header)))))) (defn- index-chain [chain] - (->> chain - cumsum-chain - (map (juxt :t-start identity)) - (into (sorted-map)) - (assoc chain :data))) + (assoc chain + :data + (-> chain + cumsum-chain + (intervals/index-intervals {:structure :nclist})))) (defn index "Creates chain index for search." @@ -109,12 +111,11 @@ (mapv index-chain xs)]))))) ;;; Search - (defn- in-block? - [pos {:keys [^Sorted data header] :as chain}] + [pos {:keys [data header] :as chain}] (when (<= (inc (:t-start header)) pos (:t-end header)) - (when-let [[start m] (first (. data seqFrom pos false))] - (when (<= start pos (dec (+ start (:size m)))) + (when-let [m (first (intervals/find-overlap-intervals data nil pos (inc pos)))] + (when (<= (:t-start m) pos (dec (+ (:t-start m) (:size m)))) (assoc chain :in-block m))))) (def ^:private normalize-chr (memoize normalize-chromosome-key)) @@ -137,6 +138,4 @@ (defn search-overlap-blocks "Calculates a list of blocks that overlap the given interval." [start end blocks-idx] - (->> (subseq blocks-idx <= end) - (map second) - (filter #(> (:t-end %) start)))) + (intervals/find-overlap-intervals blocks-idx nil (inc start) end)) diff --git a/test-resources/large.chain.gz b/test-resources/large.chain.gz new file mode 100644 index 0000000..f8f1674 Binary files /dev/null and b/test-resources/large.chain.gz differ diff --git a/test-resources/large.vcf.gz b/test-resources/large.vcf.gz new file mode 100644 index 0000000..77d3323 Binary files /dev/null and b/test-resources/large.vcf.gz differ diff --git a/test/varity/chain_test.clj b/test/varity/chain_test.clj index 461b3db..966621f 100644 --- a/test/varity/chain_test.clj +++ b/test/varity/chain_test.clj @@ -1,14 +1,18 @@ (ns varity.chain-test (:require [clojure.test :refer :all] + [cljam.util.intervals :as intervals] [varity.chain :as chain])) (def ^:private sample-indexed-block - (sorted-map - 2 {:t-start 2 :t-end 5} - 6 {:t-start 6 :t-end 10} - 8 {:t-start 8 :t-end 12} - 14 {:t-start 14 :t-end 16} - 18 {:t-start 18 :t-end 20})) + (intervals/index-intervals + (map (fn [{:keys [t-start t-end] :as m}] + (assoc m :start t-start :end t-end)) + [{:t-start 2 :t-end 5} + {:t-start 6 :t-end 10} + {:t-start 8 :t-end 12} + {:t-start 14 :t-end 16} + {:t-start 18 :t-end 20}]) + {:structure :nclist})) (def ^:private ^:const sample-indexed-chain {"chr1" [{:header {:t-name "chr1", :t-start 0, :t-end 100 :score 1}} @@ -19,19 +23,17 @@ "chr3" [{:header {:t-name "chr2", :t-start 0, :t-end 300 :score 1}}]}) (deftest search-overlap-blocks-test - (is (= (chain/search-overlap-blocks 7 12 sample-indexed-block) - [{:t-start 6, :t-end 10} {:t-start 8, :t-end 12}])) - (is (= (chain/search-overlap-blocks 15 17 sample-indexed-block) - [{:t-start 14, :t-end 16}])) - (is (= (chain/search-overlap-blocks 6 7 sample-indexed-block) - [{:t-start 6, :t-end 10}])) - (is (= (chain/search-overlap-blocks 4 4 sample-indexed-block) - [{:t-start 2, :t-end 5}])) - (is (= (chain/search-overlap-blocks 5 5 sample-indexed-block) [])) - (is (= (chain/search-overlap-blocks 6 6 sample-indexed-block) - [{:t-start 6, :t-end 10}])) - (is (= (chain/search-overlap-blocks 1 1 sample-indexed-block) [])) - (is (= (chain/search-overlap-blocks 30 31 sample-indexed-block) []))) + (are [start end ans] (= (map #(select-keys % [:t-start :t-end]) + (chain/search-overlap-blocks start end sample-indexed-block)) + ans) + 7 12 [{:t-start 6, :t-end 10} {:t-start 8, :t-end 12}] + 15 17 [{:t-start 14, :t-end 16}] + 6 7 [{:t-start 6, :t-end 10}] + 4 4 [{:t-start 2, :t-end 5}] + 5 5 [] + 6 6 [{:t-start 6, :t-end 10}] + 1 1 [] + 30 31 [])) (deftest search-containing-chains-test (is (= (chain/search-containing-chains "chr1" 10 20 sample-indexed-chain) diff --git a/test/varity/t_common.clj b/test/varity/t_common.clj index 7dcd6c3..9c0c2b0 100644 --- a/test/varity/t_common.clj +++ b/test/varity/t_common.clj @@ -73,3 +73,7 @@ (def test-load-refgene-file "./test-resources/test-refgene.txt") (def test-load-refseq-file "./test-resources/test-refseq.txt") + +(def test-large-chain-file "./test-resources/large.chain.gz") + +(def test-large-vcf-file "./test-resources/large.vcf.gz") diff --git a/test/varity/vcf_lift_test.clj b/test/varity/vcf_lift_test.clj index 110bdd6..178f59b 100644 --- a/test/varity/vcf_lift_test.clj +++ b/test/varity/vcf_lift_test.clj @@ -10,14 +10,14 @@ [{:header {:t-name "chr1", :t-start 0, :t-end 540, :t-size 540, :q-name "chr1", :q-start 0, :q-end 540, :q-size 540, :q-strand :reverse} - :data [{:chr "chr1", :size 540, :dt nil, :dq nil}]}])) + :data [{:size 540, :dt nil, :dq nil}]}])) (def small-forward-chains (ch/index [{:header {:t-name "chr1", :t-start 0, :t-end 540, :t-size 540, :q-name "chr1", :q-start 0, :q-end 540, :q-size 540, :q-strand :forward} - :data [{:chr "chr1", :size 540, :dt nil, :dq nil}]}])) + :data [{:size 540, :dt nil, :dq nil}]}])) (deftest small-new-interval-test (testing "reverse"