SDSL 3.0.1
Succinct Data Structure Library
Loading...
Searching...
No Matches
csa_wt.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_CSA_WT
9#define INCLUDED_SDSL_CSA_WT
10
11#include <algorithm> // for std::swap
12#include <cassert>
13#include <cstring> // for strlen
14#include <iomanip>
15#include <iostream>
16#include <iterator>
17
20#include <sdsl/fast_cache.hpp>
21#include <sdsl/iterators.hpp>
23#include <sdsl/util.hpp>
25
26namespace sdsl
27{
28
29template <class t_csa>
30class psi_of_csa_wt; // forward declaration of PSI-array class
31
32template <class t_csa>
33class bwt_of_csa_wt; // forward declaration of BWT-array class
34
37
48template <class t_wt = wt_huff<>, // Wavelet tree type
49 uint32_t t_dens = 32, // Sample density for suffix array (SA) values
50 uint32_t t_inv_dens = 64, // Sample density for inverse suffix array (ISA) values
51 class t_sa_sample_strat = sa_order_sa_sampling<>, // Policy class for the SA sampling.
52 class t_isa_sample_strat = isa_sampling<>, // Policy class for ISA sampling.
53 class t_alphabet_strat = // Policy class for the representation of the alphabet.
54 typename wt_alphabet_trait<t_wt>::type>
55class csa_wt
56{
57 static_assert(std::is_same<typename index_tag<t_wt>::type, wt_tag>::value,
58 "First template argument has to be a wavelet tree type.");
59 static_assert(t_dens > 0, "Second template argument has to be greater then 0.");
60 static_assert(t_inv_dens > 0, "Third template argument has to be greater then 0.");
61 static_assert(std::is_same<typename sampling_tag<t_sa_sample_strat>::type, sa_sampling_tag>::value,
62 "Forth template argument has to be a suffix array sampling strategy.");
63 static_assert(std::is_same<typename sampling_tag<t_isa_sample_strat>::type, isa_sampling_tag>::value,
64 "Fifth template argument has to be a inverse suffix array sampling strategy.");
65 static_assert(is_alphabet<t_alphabet_strat>::value, "Sixth template argument has to be a alphabet strategy.");
66
67 friend class bwt_of_csa_wt<csa_wt>;
68
69 public:
70 enum
71 {
73 isa_sample_dens = t_inv_dens
74 };
75
76 typedef uint64_t value_type;
82 typedef const pointer const_pointer;
85 typedef ptrdiff_t difference_type;
92 typedef t_wt wavelet_tree_type;
93 typedef typename t_sa_sample_strat::template type<csa_wt> sa_sample_type;
94 typedef typename t_isa_sample_strat::template type<csa_wt> isa_sample_type;
95 typedef t_alphabet_strat alphabet_type;
96 typedef typename alphabet_type::char_type char_type; // Note: This is the char type of the CSA not the WT!
97 typedef typename alphabet_type::comp_char_type comp_char_type;
98 typedef typename alphabet_type::string_type string_type;
100
103 typedef typename alphabet_type::alphabet_category alphabet_category;
104
105 private:
106 t_wt m_wavelet_tree; // the wavelet tree
107 sa_sample_type m_sa_sample; // suffix array samples
108 isa_sample_type m_isa_sample; // inverse suffix array samples
109 alphabet_type m_alphabet;
110//#define USE_CSA_CACHE
111#ifdef USE_CSA_CACHE
112 mutable fast_cache csa_cache;
113#endif
114
115 public:
116 const typename alphabet_type::char2comp_type & char2comp = m_alphabet.char2comp;
117 const typename alphabet_type::comp2char_type & comp2char = m_alphabet.comp2char;
118 const typename alphabet_type::C_type & C = m_alphabet.C;
119 const typename alphabet_type::sigma_type & sigma = m_alphabet.sigma;
120 const psi_type psi = psi_type(*this);
121 const lf_type lf = lf_type(*this);
122 const bwt_type bwt = bwt_type(*this);
123 const text_type text = text_type(*this);
125 const bwt_type L = bwt_type(*this);
126 const isa_type isa = isa_type(*this);
127 const sa_sample_type & sa_sample = m_sa_sample;
128 const isa_sample_type & isa_sample = m_isa_sample;
129 const wavelet_tree_type & wavelet_tree = m_wavelet_tree;
130
132 csa_wt() = default;
133
135 csa_wt(const csa_wt & csa)
136 : m_wavelet_tree(csa.m_wavelet_tree)
137 , m_sa_sample(csa.m_sa_sample)
138 , m_isa_sample(csa.m_isa_sample)
139 , m_alphabet(csa.m_alphabet)
140 {
141 m_isa_sample.set_vector(&m_sa_sample);
142 }
143
146 : m_wavelet_tree(std::move(csa.m_wavelet_tree))
147 , m_sa_sample(std::move(csa.m_sa_sample))
148 , m_isa_sample(std::move(csa.m_isa_sample))
149 , m_alphabet(std::move(csa.m_alphabet))
150 {
151 m_isa_sample.set_vector(&m_sa_sample);
152 }
153
155 csa_wt(cache_config & config);
156
158
163 size_type size() const { return m_wavelet_tree.size(); }
164
166
170
172
175 bool empty() const { return m_wavelet_tree.empty(); }
176
178
181 const_iterator begin() const { return const_iterator(this, 0); }
182
184
187 const_iterator end() const { return const_iterator(this, size()); }
188
190
196 inline value_type operator[](size_type i) const;
197
199
202 csa_wt & operator=(const csa_wt & csa)
203 {
204 if (this != &csa)
205 {
206 csa_wt tmp(csa);
207 *this = std::move(tmp);
208 }
209 return *this;
210 }
211
213
217 {
218 if (this != &csa)
219 {
220 m_wavelet_tree = std::move(csa.m_wavelet_tree);
221 m_sa_sample = std::move(csa.m_sa_sample);
222 m_isa_sample = std::move(csa.m_isa_sample);
223 m_isa_sample.set_vector(&m_sa_sample);
224 m_alphabet = std::move(csa.m_alphabet);
225 }
226 return *this;
227 }
228
230 bool operator==(csa_wt const & other) const noexcept
231 {
232 return (m_wavelet_tree == other.m_wavelet_tree) && (m_sa_sample == other.m_sa_sample) &&
233 (m_isa_sample == other.m_isa_sample) && (m_alphabet == other.m_alphabet);
234 }
235
237 bool operator!=(csa_wt const & other) const noexcept { return !(*this == other); }
238
240
243 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const;
244
246
248 void load(std::istream & in);
249
251 template <typename archive_t>
252 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const;
253
255 template <typename archive_t>
256 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar);
257
258 private:
259 // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text.
260 /*
261 * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$.
262 * \param c The symbol to count the occurrences in the prefix.
263 * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT.
264 * \par Time complexity
265 * \f$ \Order{\log |\Sigma|} \f$
266 */
267 size_type rank_bwt(size_type i, const char_type c) const { return m_wavelet_tree.rank(i, c); }
268
269 // Calculates the position of the i-th c in the BWT of the original text.
270 /*
271 * \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$.
272 * \param c Symbol c.
273 * \returns The position of the i-th c in the BWT or size() if c does occur less then i times.
274 * \par Time complexity
275 * \f$ \Order{t_{\Psi}} \f$
276 */
277 size_type select_bwt(size_type i, const char_type c) const
278 {
279 assert(i > 0);
280 char_type cc = char2comp[c];
281 if (cc == 0 and c != 0) // character is not in the text => return size()
282 return size();
283 assert(cc != 255);
284 if (C[cc] + i - 1 < C[cc + 1]) { return m_wavelet_tree.select(i, c); }
285 else
286 return size();
287 }
288};
289
290// == template functions ==
291
292template <class t_wt,
293 uint32_t t_dens,
294 uint32_t t_inv_dens,
295 class t_sa_sample_strat,
296 class t_isa,
297 class t_alphabet_strat>
299{
300 if (!cache_file_exists(key_bwt<alphabet_type::int_width>(), config)) { return; }
301 {
302 auto event = memory_monitor::event("construct csa-alpbabet");
304 cache_file_name(key_bwt<alphabet_type::int_width>(), config));
305 size_type n = bwt_buf.size();
306 m_alphabet = alphabet_type(bwt_buf, n);
307 }
308 {
309 auto event = memory_monitor::event("sample SA");
310 m_sa_sample = sa_sample_type(config);
311 }
312 {
313 auto event = memory_monitor::event("sample ISA");
314 isa_sample_type isa_s(config, &m_sa_sample);
315 util::swap_support(m_isa_sample, isa_s, &m_sa_sample, &m_sa_sample);
316 }
317
318 // if ( config.delete_files ) {
319 // remove_from_cache<int_vector<>>(conf::KEY_SA, config);
320 // }
321 {
322 auto event = memory_monitor::event("construct wavelet tree");
324 cache_file_name(key_bwt<alphabet_type::int_width>(), config));
325 m_wavelet_tree = wavelet_tree_type(bwt_buf.begin(), bwt_buf.end(), config.dir);
326 }
327}
328
329template <class t_wt,
330 uint32_t t_dens,
331 uint32_t t_inv_dens,
332 class t_sa_sample_strat,
333 class t_isa,
334 class t_alphabet_strat>
336 -> value_type
337{
338 size_type off = 0;
339 while (!m_sa_sample.is_sampled(i))
340 {
341 i = lf[i];
342 ++off;
343 }
344 value_type result = m_sa_sample[i];
345 if (result + off < size()) { return result + off; }
346 else
347 {
348 return result + off - size();
349 }
350}
351
352template <class t_wt,
353 uint32_t t_dens,
354 uint32_t t_inv_dens,
355 class t_sa_sample_strat,
356 class t_isa,
357 class t_alphabet_strat>
360 std::string name) const
361 -> size_type
362{
363 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
364 size_type written_bytes = 0;
365 written_bytes += m_wavelet_tree.serialize(out, child, "wavelet_tree");
366 written_bytes += m_sa_sample.serialize(out, child, "sa_samples");
367 written_bytes += m_isa_sample.serialize(out, child, "isa_samples");
368 written_bytes += m_alphabet.serialize(out, child, "alphabet");
369 structure_tree::add_size(child, written_bytes);
370 return written_bytes;
371}
372
373template <class t_wt,
374 uint32_t t_dens,
375 uint32_t t_inv_dens,
376 class t_sa_sample_strat,
377 class t_isa,
378 class t_alphabet_strat>
380{
381 m_wavelet_tree.load(in);
382 m_sa_sample.load(in);
383 m_isa_sample.load(in, &m_sa_sample);
384 m_alphabet.load(in);
385}
386
387template <class t_wt,
388 uint32_t t_dens,
389 uint32_t t_inv_dens,
390 class t_sa_sample_strat,
391 class t_isa,
392 class t_alphabet_strat>
393template <typename archive_t>
395 archive_t & ar) const
396{
397 ar(CEREAL_NVP(m_wavelet_tree));
398 ar(CEREAL_NVP(m_sa_sample));
399 ar(CEREAL_NVP(m_isa_sample));
400 ar(CEREAL_NVP(m_alphabet));
401}
402
403template <class t_wt,
404 uint32_t t_dens,
405 uint32_t t_inv_dens,
406 class t_sa_sample_strat,
407 class t_isa,
408 class t_alphabet_strat>
409template <typename archive_t>
411 archive_t & ar)
412{
413 ar(CEREAL_NVP(m_wavelet_tree));
414 ar(CEREAL_NVP(m_sa_sample));
415 ar(CEREAL_NVP(m_isa_sample));
416 m_isa_sample.set_vector(&m_sa_sample);
417 ar(CEREAL_NVP(m_alphabet));
418}
419
420} // end namespace sdsl
421#endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:31
#define CEREAL_LOAD_FUNCTION_NAME
Definition: cereal.hpp:34
#define CEREAL_SAVE_FUNCTION_NAME
Definition: cereal.hpp:35
A class for the Compressed Suffix Array (CSA) based on a Wavelet Tree (WT) of the Burrow Wheeler Tran...
Definition: csa_wt.hpp:56
bool empty() const
Returns if the data strucutre is empty.
Definition: csa_wt.hpp:175
random_access_const_iterator< csa_wt > const_iterator
Definition: csa_wt.hpp:77
t_isa_sample_strat::template type< csa_wt > isa_sample_type
Definition: csa_wt.hpp:94
csa_wt(csa_wt &&csa)
Move constructor.
Definition: csa_wt.hpp:145
bool operator!=(csa_wt const &other) const noexcept
Inequality operator.
Definition: csa_wt.hpp:237
ptrdiff_t difference_type
Definition: csa_wt.hpp:85
const_iterator begin() const
Returns a const_iterator to the first element.
Definition: csa_wt.hpp:181
void load(std::istream &in)
Load from a stream.
Definition: csa_wt.hpp:379
csa_wt & operator=(csa_wt &&csa)
Assignment Move Operator.
Definition: csa_wt.hpp:216
const psi_type psi
Definition: csa_wt.hpp:120
const first_row_type F
Definition: csa_wt.hpp:124
csa_tag index_category
Definition: csa_wt.hpp:101
const bwt_type bwt
Definition: csa_wt.hpp:122
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition: csa_wt.hpp:394
const alphabet_type::char2comp_type & char2comp
Definition: csa_wt.hpp:116
const value_type const_reference
Definition: csa_wt.hpp:79
traverse_csa_wt< csa_wt, true > psi_type
Definition: csa_wt.hpp:86
static size_type max_size()
Returns the largest size that csa_wt can ever have.
Definition: csa_wt.hpp:169
const text_type text
Definition: csa_wt.hpp:123
const alphabet_type::sigma_type & sigma
Definition: csa_wt.hpp:119
alphabet_type::string_type string_type
Definition: csa_wt.hpp:98
const_iterator iterator
Definition: csa_wt.hpp:78
bool operator==(csa_wt const &other) const noexcept
Equality operator.
Definition: csa_wt.hpp:230
@ isa_sample_dens
Definition: csa_wt.hpp:73
@ sa_sample_dens
Definition: csa_wt.hpp:72
text_of_csa< csa_wt > text_type
Definition: csa_wt.hpp:91
const sa_sample_type & sa_sample
Definition: csa_wt.hpp:127
t_sa_sample_strat::template type< csa_wt > sa_sample_type
Definition: csa_wt.hpp:93
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
Definition: csa_wt.hpp:358
const_reference * pointer
Definition: csa_wt.hpp:81
size_type csa_size_type
Definition: csa_wt.hpp:84
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Serialise (load) via cereal.
Definition: csa_wt.hpp:410
value_type operator[](size_type i) const
[]-operator
Definition: csa_wt.hpp:335
const lf_type lf
Definition: csa_wt.hpp:121
bwt_of_csa_wt< csa_wt > bwt_type
Definition: csa_wt.hpp:88
const_reference reference
Definition: csa_wt.hpp:80
const alphabet_type::C_type & C
Definition: csa_wt.hpp:118
alphabet_type::char_type char_type
Definition: csa_wt.hpp:96
size_type size() const
Number of elements in the .
Definition: csa_wt.hpp:163
const wavelet_tree_type & wavelet_tree
Definition: csa_wt.hpp:129
isa_of_csa_wt< csa_wt > isa_type
Definition: csa_wt.hpp:89
csa_wt()=default
Default constructor.
const isa_type isa
Definition: csa_wt.hpp:126
const isa_sample_type & isa_sample
Definition: csa_wt.hpp:128
alphabet_type::comp_char_type comp_char_type
Definition: csa_wt.hpp:97
alphabet_type::alphabet_category alphabet_category
Definition: csa_wt.hpp:103
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition: csa_wt.hpp:187
csa_wt(const csa_wt &csa)
Copy constructor.
Definition: csa_wt.hpp:135
lf_tag extract_category
Definition: csa_wt.hpp:102
int_vector ::size_type size_type
Definition: csa_wt.hpp:83
first_row_of_csa< csa_wt > first_row_type
Definition: csa_wt.hpp:90
t_alphabet_strat alphabet_type
Definition: csa_wt.hpp:95
traverse_csa_wt< csa_wt, false > lf_type
Definition: csa_wt.hpp:87
csa_wt csa_type
Definition: csa_wt.hpp:99
t_wt wavelet_tree_type
Definition: csa_wt.hpp:92
csa_wt & operator=(const csa_wt &csa)
Assignment Operator.
Definition: csa_wt.hpp:202
const bwt_type L
Definition: csa_wt.hpp:125
const pointer const_pointer
Definition: csa_wt.hpp:82
uint64_t value_type
Definition: csa_wt.hpp:76
const alphabet_type::comp2char_type & comp2char
Definition: csa_wt.hpp:117
uint64_t size() const
Returns the number of elements currently stored.
A generic vector class for integers of width .
Definition: int_vector.hpp:216
static size_type max_size() noexcept
Maximum size of the int_vector.
Definition: int_vector.hpp:542
static mm_event_proxy event(const std::string &name)
Generic iterator for a random access container.
Definition: iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
static void add_size(structure_tree_node *v, uint64_t value)
A wrapper class for the and LF function for (compressed) suffix arrays that are based on a wavelet t...
csa_alphabet_strategy.hpp includes different strategy classes for representing an alphabet of a CSA.
csa_sampling_strategy.hpp includes different strategy classes for suffix array sampling in the CSAs.
iterators.hpp contains an generic iterator for random access containers.
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
bool cache_file_exists(const std::string &key, const cache_config &config)
Checks if the resource specified by the key exists in the cache.
Definition: io.hpp:672
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
Helper class for construction process.
Definition: config.hpp:67
std::string dir
Definition: config.hpp:71
suffix_array_helper.hpp contains some helper classes for CSTs
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wavelet_trees.hpp contains wavelet tree implementations.