SeqAn3 3.4.0-rc.3
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
fm_index_cursor.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <array>
13#include <ranges>
14#include <type_traits>
15
16#include <sdsl/suffix_trees.hpp>
17
24
25namespace seqan3
26{
30{
34 size_t end_position{};
35
44 friend bool operator==(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
45 {
46 return lhs.begin_position == rhs.begin_position && lhs.end_position == rhs.end_position;
47 }
48
54 friend bool operator!=(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
55 {
56 return !(lhs == rhs);
57 }
59};
60
82template <typename index_t>
84{
85public:
90 using index_type = index_t;
92 using size_type = typename index_type::size_type;
94
95private:
100 using node_type = detail::fm_index_cursor_node<index_t>;
102 using sdsl_char_type = typename index_type::sdsl_char_type;
104 using sdsl_index_type = typename index_t::sdsl_index_type;
106 using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
108 using index_alphabet_type = typename index_t::alphabet_type;
114
116 index_type const * index{nullptr};
118 size_type parent_lb{};
120 size_type parent_rb{};
122 node_type node{};
124 sdsl_sigma_type sigma{};
125
126 template <typename _index_t>
127 friend class bi_fm_index_cursor;
128
130 size_type offset() const noexcept
131 {
132 assert(index->index.size() > query_length());
133 return index->index.size() - query_length() - 1; // since the string is reversed during construction
134 }
135
137 bool
138 backward_search(sdsl_index_type const & csa, sdsl_char_type const c, size_type & l, size_type & r) const noexcept
139 {
140 assert(l <= r && r < csa.size());
141
142 size_type _l, _r;
143
144 size_type cc = c;
145 if constexpr (!std::same_as<index_alphabet_type, sdsl::plain_byte_alphabet>)
146 {
147 cc = csa.char2comp[c];
148 if (cc == 0 && c > 0) // [[unlikely]]
149 return false;
150 }
151
152 size_type const c_begin = csa.C[cc];
153 if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
154 {
155 _l = c_begin;
156 _r = csa.C[cc + 1] - 1;
157 // if we use not the plain_byte_alphabet, we could return always return true here
158 }
159 else
160 {
161 _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
162 _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
163 }
164
165 if (_r >= _l)
166 {
167 r = _r;
168 l = _l;
169 assert(r + 1 - l >= 0);
170 return true;
171 }
172 return false;
173 }
174
175public:
180 // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
181 // std::array of iterators.
182 fm_index_cursor() noexcept = default;
183 fm_index_cursor(fm_index_cursor const &) noexcept = default;
184 fm_index_cursor & operator=(fm_index_cursor const &) noexcept = default;
185 fm_index_cursor(fm_index_cursor &&) noexcept = default;
186 fm_index_cursor & operator=(fm_index_cursor &&) noexcept = default;
187 ~fm_index_cursor() = default;
188
190 fm_index_cursor(index_t const & _index) noexcept :
191 index(&_index),
192 node({0, _index.index.size() - 1, 0, 0}),
193 sigma(_index.index.sigma - index_t::text_layout_mode)
194 {
195 assert(_index.index.size() != 0);
196 }
197 //\}
198
211 bool operator==(fm_index_cursor const & rhs) const noexcept
212 {
213 assert(index != nullptr);
214 assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
215
216 // position in the implicit suffix tree is defined by the SA interval and depth.
217 // No need to compare parent intervals
218 return node == rhs.node;
219 }
220
233 bool operator!=(fm_index_cursor const & rhs) const noexcept
234 {
235 assert(index != nullptr);
236
237 return !(*this == rhs);
238 }
239
257 bool extend_right() noexcept
258 {
259 // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
260 // store all cursors at once in a private std::array of cursors
261 assert(index != nullptr);
262
263 sdsl_sigma_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
264 size_type _lb = node.lb, _rb = node.rb;
265 while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
266 {
267 ++c;
268 }
269
270 if (c != sigma)
271 {
272 parent_lb = node.lb;
273 parent_rb = node.rb;
275 node = {_lb, _rb, node.depth + 1, static_cast<sdsl_char_type>(c)};
276 return true;
277 }
278 return false;
279 }
280
294 template <typename char_t>
295 requires std::convertible_to<char_t, index_alphabet_type>
296 bool extend_right(char_t const c) noexcept
297 {
298 assert(index != nullptr);
299 // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
300 // for the indexed text.
301 assert(seqan3::to_rank(static_cast<index_alphabet_type>(c))
302 < ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
303
304 size_type _lb = node.lb, _rb = node.rb;
305
306 sdsl_char_type c_char = seqan3::to_rank(static_cast<index_alphabet_type>(c)) + 1;
307
308 if (backward_search(index->index, c_char, _lb, _rb))
309 {
310 parent_lb = node.lb;
311 parent_rb = node.rb;
312 node = {_lb, _rb, node.depth + 1, c_char};
313 return true;
314 }
315 return false;
316 }
317
319 template <typename char_type>
320 requires detail::is_char_adaptation_v<char_type>
321 bool extend_right(char_type const * cstring) noexcept
322 {
324 }
325
342 template <std::ranges::range seq_t>
343 bool extend_right(seq_t && seq) noexcept
344 {
345 static_assert(std::ranges::forward_range<seq_t>, "The query must model forward_range.");
346 static_assert(std::convertible_to<range_innermost_value_t<seq_t>, index_alphabet_type>,
347 "The alphabet of the sequence must be convertible to the alphabet of the index.");
348
349 assert(index != nullptr); // range must not be empty!
350
351 size_type _lb = node.lb, _rb = node.rb;
352 size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
353
354 sdsl_char_type c{};
355 size_t len{0};
356
357 for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
358 {
359 // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
360 // for the indexed text.
361 assert(seqan3::to_rank(static_cast<index_alphabet_type>(*it))
362 < ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
363
364 c = seqan3::to_rank(static_cast<index_alphabet_type>(*it)) + 1;
365
366 new_parent_lb = _lb;
367 new_parent_rb = _rb;
368 if (!backward_search(index->index, c, _lb, _rb))
369 return false;
370 }
371
372 parent_lb = new_parent_lb;
373 parent_rb = new_parent_rb;
374 node = {_lb, _rb, len + node.depth, c};
375 return true;
376 }
377
404 bool cycle_back() noexcept
405 {
406 assert(index != nullptr && query_length() > 0);
407 // parent_lb > parent_rb --> invalid interval
408 assert(parent_lb <= parent_rb);
409
410 sdsl_sigma_type c = node.last_char + 1;
411 size_type _lb = parent_lb, _rb = parent_rb;
412
413 while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
414 {
415 ++c;
416 }
417
418 if (c != sigma) // Collection has additional sentinel as delimiter
419 {
421 node = {_lb, _rb, node.depth, static_cast<sdsl_char_type>(c)};
422 return true;
423 }
424 return false;
425 }
426
442 size_type last_rank() const noexcept
443 {
444 // parent_lb > parent_rb --> invalid interval
445 assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
446
447 return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
448 }
449
466 {
467 assert(index != nullptr);
468
469 return {node.lb, node.rb + 1};
470 }
471
490 size_type query_length() const noexcept
491 {
492 assert(index != nullptr);
493 assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
494
495 return node.depth;
496 }
497
515 template <std::ranges::range text_t>
516 auto path_label(text_t && text) const noexcept
517 requires (index_t::text_layout_mode == text_layout::single)
518 {
519 static_assert(std::ranges::input_range<text_t>, "The text must model input_range.");
520 static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
521 static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
522 "The alphabet types of the given text and index differ.");
523 assert(index != nullptr);
524
525 size_type const query_begin = offset() - index->index[node.lb];
526 return text | views::slice(query_begin, query_begin + query_length());
527 }
528
530 template <std::ranges::range text_t>
531 auto path_label(text_t && text) const noexcept
532 requires (index_t::text_layout_mode == text_layout::collection)
533 {
534 static_assert(std::ranges::input_range<text_t>, "The text collection must model input_range.");
535 static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
536 static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
537 "The alphabet types of the given text and index differ.");
538 assert(index != nullptr);
539
540 // Position of query in concatenated text.
541 size_type const location = offset() - index->index[node.lb];
542
543 // The rank represents the number of start positions of the individual sequences/texts in the collection
544 // before position `location + 1` and thereby also the number of delimiters.
545 size_type const rank = index->text_begin_rs.rank(location + 1);
546 assert(rank > 0);
547 size_type const text_id = rank - 1;
548
549 // The start location of the `text_id`-th text in the sequence (position of the `rank`-th 1 in the bitvector).
550 size_type const start_location = index->text_begin_ss.select(rank);
551 // Substract lengths of previous sequences.
552 size_type const query_begin = location - start_location;
553
554 // Take subtext, slice query out of it
555 return text[text_id] | views::slice(query_begin, query_begin + query_length());
556 }
557
569 size_type count() const noexcept
570 {
571 assert(index != nullptr);
572
573 return 1 + node.rb - node.lb;
574 }
575
588 requires (index_t::text_layout_mode == text_layout::single)
589 {
590 assert(index != nullptr);
591
592 locate_result_type occ{};
593 occ.reserve(count());
594 for (size_type i = 0; i < count(); ++i)
595 {
596 occ.emplace_back(0, offset() - index->index[node.lb + i]);
597 }
598
599 return occ;
600 }
601
604 requires (index_t::text_layout_mode == text_layout::collection)
605 {
606 assert(index != nullptr);
607
609 occ.reserve(count());
610 for (size_type i = 0; i < count(); ++i)
611 {
612 size_type loc = offset() - index->index[node.lb + i];
613 size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
614 size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
615 occ.emplace_back(sequence_rank - 1, sequence_position);
616 }
617 return occ;
618 }
619
632 auto lazy_locate() const
633 requires (index_t::text_layout_mode == text_layout::single)
634 {
635 assert(index != nullptr);
636
637 return std::views::iota(node.lb, node.lb + count())
638 | std::views::transform(
639 [*this, _offset = offset()](auto sa_pos)
640 {
641 return locate_result_value_type{0u, _offset - index->index[sa_pos]};
642 });
643 }
644
646 auto lazy_locate() const
647 requires (index_t::text_layout_mode == text_layout::collection)
648 {
649 assert(index != nullptr);
650
651 return std::views::iota(node.lb, node.lb + count())
652 | std::views::transform(
653 [*this, _offset = offset()](auto sa_pos)
654 {
655 auto loc = _offset - index->index[sa_pos];
656 size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
657 size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
658 return locate_result_value_type{sequence_rank - 1, sequence_position};
659 });
660 }
661
669 template <cereal_archive archive_t>
670 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
671 {
672 archive(parent_lb);
673 archive(parent_rb);
674 archive(node);
675 archive(sigma);
676 }
678};
679
680} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
Provides alphabet adaptations for standard char types.
The SeqAn FM Index Cursor.
Definition fm_index_cursor.hpp:84
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition fm_index_cursor.hpp:343
auto path_label(text_t &&text) const noexcept
Returns the searched query.
Definition fm_index_cursor.hpp:516
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition fm_index_cursor.hpp:404
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition fm_index_cursor.hpp:233
seqan3::suffix_array_interval suffix_array_interval() const noexcept
Returns the half-open suffix array interval.
Definition fm_index_cursor.hpp:465
locate_result_type locate() const
Locates the occurrences of the searched query in the text.
Definition fm_index_cursor.hpp:587
auto lazy_locate() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition fm_index_cursor.hpp:646
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition fm_index_cursor.hpp:211
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition fm_index_cursor.hpp:569
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition fm_index_cursor.hpp:442
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition fm_index_cursor.hpp:257
bool extend_right(char_type const *cstring) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition fm_index_cursor.hpp:321
locate_result_type locate() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition fm_index_cursor.hpp:603
auto path_label(text_t &&text) const noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition fm_index_cursor.hpp:531
size_type query_length() const noexcept
Returns the length of the searched query. .
Definition fm_index_cursor.hpp:490
index_t index_type
Type of the index.
Definition fm_index_cursor.hpp:90
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition fm_index_cursor.hpp:92
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a std::ranges::view is retu...
Definition fm_index_cursor.hpp:632
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition fm_index_cursor.hpp:296
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T emplace_back(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition alphabet/concept.hpp:152
@ seq
The "sequence", usually a range of nucleotides or amino acids.
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition search/fm_index/concept.hpp:70
@ single
The text is a single range.
Definition search/fm_index/concept.hpp:72
@ collection
The text is a range of ranges.
Definition search/fm_index/concept.hpp:74
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition slice.hpp:175
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
T reserve(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides seqan3::views::slice.
The underlying suffix array interval.
Definition fm_index_cursor.hpp:30
size_t end_position
The exclusive end position of the interval ("right boundary").
Definition fm_index_cursor.hpp:34
size_t begin_position
The begin position of the interval ("left boundary").
Definition fm_index_cursor.hpp:32
friend bool operator==(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for equality.
Definition fm_index_cursor.hpp:44
friend bool operator!=(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for inequality.
Definition fm_index_cursor.hpp:54
Hide me