Skip to content

Commit 17a2f3d

Browse files
Add skip_ambiguous_windows variants
1 parent 69a1c27 commit 17a2f3d

File tree

4 files changed

+139
-7
lines changed

4 files changed

+139
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
# Changelog
22

33
## 2.0
4+
- **Breaking**: Migrate to `packed-seq` `4.0` with `PaddedIt`.
5+
- **Breaking**: Improve simd versions for short sequences by reusing allocated buffers.
6+
- **Feature**: Use `seq-hash` crate to cleanly support multiple kmer hashers; add `*_with_hasher` function variants.
7+
- **Feature**: Add support for `_skip_ambiguous_windows` variants that omit minimizers for windows containing non-ACTG (such as NYR).
48
- Move `simd-minimizers` crate from subdir into the repo root.
5-
- Rename `simd-minimizers-bench` to just `bench`.
69
- Cleanups (use Rust 2024 edition; drop some public-but-unused functions).
710
- Improve simd versions for short sequences by preventing over-allocating the output vector.
8-
- **Breaking**: Migrate to `packed-seq` `4.0` with `PaddedIt`.
9-
- **Breaking**: Improve simd versions for short sequences by reusing allocated buffers.
10-
- **Feature**: Use `seq-hash` crate to cleanly support multiple kmer hashers;
11-
add `*_with_hasher` function variants.
1211

1312
## 1.4
1413
- Make `NtHash` and `MulHash` seedable.

src/lib.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,11 @@ pub mod private {
178178
use collect::CollectAndDedup;
179179
use collect::collect_and_dedup_into_scalar;
180180
use collect::collect_and_dedup_with_index_into_scalar;
181+
use minimizers::canonical_minimizers_skip_ambiguous_windows;
181182
/// Re-export of the `packed-seq` crate.
182183
pub use packed_seq;
184+
use packed_seq::BitSeq;
185+
use packed_seq::PackedSeq;
183186
/// Re-export of the `seq-hash` crate.
184187
pub use seq_hash;
185188

@@ -316,6 +319,35 @@ impl<'h, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, ()> {
316319
}
317320
}
318321

322+
impl<'h, H: KmerHasher> Builder<'h, true, H, ()> {
323+
pub fn run_skip_ambiguous_windows_once<'s, 'o>(&self, seq: PackedSeq<'s>, ambi: BitSeq<'s>) -> Vec<u32> {
324+
let mut min_pos = vec![];
325+
self.run_skip_ambiguous_windows(seq, ambi, &mut min_pos);
326+
min_pos
327+
}
328+
pub fn run_skip_ambiguous_windows<'s, 'o>(
329+
&self,
330+
seq: PackedSeq<'s>,
331+
ambi: BitSeq<'s>,
332+
min_pos: &'o mut Vec<u32>,
333+
) -> Output<'o, true, PackedSeq<'s>> {
334+
let default_hasher = self.hasher.is_none().then(|| H::new(self.k));
335+
let hasher = self
336+
.hasher
337+
.unwrap_or_else(|| default_hasher.as_ref().unwrap());
338+
339+
CACHE.with_borrow_mut(|cache| {
340+
canonical_minimizers_skip_ambiguous_windows(seq, ambi, hasher, self.w, cache)
341+
.collect_and_dedup_into::<true>(min_pos)
342+
});
343+
Output {
344+
k: self.k,
345+
seq,
346+
min_pos,
347+
}
348+
}
349+
}
350+
319351
/// With-superkmer version
320352
impl<'h, 'o2, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, &'o2 mut Vec<u32>> {
321353
pub fn run_scalar_once<'s, SEQ: Seq<'s>>(self, seq: SEQ) -> Vec<u32> {

src/minimizers.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,3 +165,43 @@ pub fn canonical_minimizers_seq_simd<'s>(
165165
canonical.blend(lmin, rmin)
166166
})
167167
}
168+
169+
/// Use canonical NtHash, and keep both leftmost and rightmost minima.
170+
#[inline(always)]
171+
pub fn canonical_minimizers_skip_ambiguous_windows<'s>(
172+
seq: packed_seq::PackedSeq<'s>,
173+
bitseq: packed_seq::BitSeq<'s>,
174+
hasher: &impl KmerHasher,
175+
w: usize,
176+
cache: &mut Cache,
177+
) -> PaddedIt<impl ChunkIt<u32x8>> {
178+
assert!(hasher.is_canonical());
179+
180+
let k = hasher.k();
181+
let l = k + w - 1;
182+
let mut hash_mapper = hasher.in_out_mapper_simd(seq);
183+
let (c_delay, mut canonical_mapper) = canonical_mapper_simd(l);
184+
185+
let mut padded_it = seq
186+
.par_iter_bp_delayed_2_with_factor(l, hasher.delay(), c_delay, 2)
187+
.zip(bitseq.par_iter_kmer_ambiguity(l, l));
188+
189+
// Process first k-1 characters separately, to initialize hash values.
190+
padded_it.advance_with(k - 1, |((a, rh, rc), _ambi)| {
191+
hash_mapper((a, rh));
192+
canonical_mapper((a, rc));
193+
});
194+
let mut sliding_min_mapper = sliding_lr_min_mapper_simd(w, padded_it.it.len(), cache);
195+
padded_it.advance_with(w - 1, |((a, rh, rc), _ambi)| {
196+
let hash = hash_mapper((a, rh));
197+
canonical_mapper((a, rc));
198+
sliding_min_mapper(hash);
199+
});
200+
201+
padded_it.map(move |((a, rh, rc), ambi)| {
202+
let hash = hash_mapper((a, rh));
203+
let canonical = canonical_mapper((a, rc));
204+
let (lmin, rmin) = sliding_min_mapper(hash);
205+
ambi.blend(u32x8::splat(u32::MAX - 1), canonical.blend(lmin, rmin))
206+
})
207+
}

src/test.rs

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
use super::*;
22
use crate::minimizers::*;
33
use itertools::Itertools;
4-
use packed_seq::{AsciiSeq, AsciiSeqVec, PackedSeq, PackedSeqVec, SeqVec};
5-
use rand::Rng;
4+
use packed_seq::{AsciiSeq, AsciiSeqVec, BitSeqVec, PackedSeq, PackedSeqVec, PaddedIt, SeqVec};
5+
use rand::{Rng, random_range};
66
use seq_hash::{AntiLexHasher, MulHasher, NtHasher};
77
use std::sync::LazyLock;
88

@@ -409,3 +409,64 @@ fn readme_example() {
409409
.values_u64()
410410
.collect();
411411
}
412+
413+
#[test]
414+
fn skip_ambiguous() {
415+
let len = 100;
416+
let mut ascii = AsciiSeqVec::random(len);
417+
// set 1% to N
418+
for _ in 0..len / 100 {
419+
ascii.seq[random_range(0..len)] = b'N';
420+
}
421+
let packed_seq = PackedSeqVec::from_ascii(&ascii.seq);
422+
let ambiguous = BitSeqVec::from_ascii(&ascii.seq);
423+
424+
for k in 1..=64 {
425+
for w in 1..64 {
426+
if (k + w - 1) % 2 == 0 {
427+
continue;
428+
}
429+
if k + w - 1 > 64 {
430+
continue;
431+
}
432+
let hasher = &<NtHasher>::new(k);
433+
434+
let mut poss0 = vec![];
435+
canonical_minimizers_skip_ambiguous_windows(
436+
packed_seq.as_slice(),
437+
ambiguous.as_slice(),
438+
hasher,
439+
w,
440+
&mut Default::default(),
441+
)
442+
.collect_into(&mut poss0);
443+
let mut poss1 = vec![];
444+
canonical_minimizers_skip_ambiguous_windows(
445+
packed_seq.as_slice(),
446+
ambiguous.as_slice(),
447+
hasher,
448+
w,
449+
&mut Default::default(),
450+
)
451+
.collect_and_dedup_into::<false>(&mut poss1);
452+
let mut poss2 = vec![];
453+
canonical_minimizers_skip_ambiguous_windows(
454+
packed_seq.as_slice(),
455+
ambiguous.as_slice(),
456+
hasher,
457+
w,
458+
&mut Default::default(),
459+
)
460+
.collect_and_dedup_into::<true>(&mut poss2);
461+
462+
let poss = canonical_minimizers(k, w)
463+
.run_skip_ambiguous_windows_once(packed_seq.as_slice(), ambiguous.as_slice());
464+
for pos in poss {
465+
// these should be filtered out
466+
assert_ne!(pos, u32::MAX - 1);
467+
// check that kmer at pos does not have ambiguous bases
468+
assert_eq!(ambiguous.read_kmer_u128(k, pos as usize), 0);
469+
}
470+
}
471+
}
472+
}

0 commit comments

Comments
 (0)