SDSL 3.0.1
Succinct Data Structure Library
Loading...
Searching...
No Matches
construct_sa_se.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
4#ifndef SDSL_CONSTRUCT_SA_SE
5#define SDSL_CONSTRUCT_SA_SE
6
7#include <cmath>
8#include <fstream>
9#include <iostream>
10#include <string>
11
12#include <sdsl/int_vector.hpp>
13#include <sdsl/io.hpp>
14#include <sdsl/rank_support.hpp>
16
17namespace sdsl
18{
19
20template <class int_vector_type>
21uint64_t _get_next_lms_position(int_vector_type & text, uint64_t i)
22{
23 if (i >= text.size() - 3) { return text.size() - 1; }
24 // text[i] is S-TYP or L-TYP
25 uint64_t ci = text[i], cip1 = text[i + 1];
26 while (ci <= cip1)
27 {
28 ++i;
29 ci = cip1;
30 cip1 = text[i + 1];
31 }
32 // text[i] is L-TYP or S-TYP
33 uint64_t candidate = i + 1;
34 while (ci >= cip1)
35 {
36 if (ci > cip1)
37 {
38 if (i + 1 == text.size() - 1) { return text.size() - 1; }
39 candidate = i + 1;
40 }
41 ++i;
42 ci = cip1;
43 cip1 = text[i + 1];
44 }
45 return candidate;
46}
47
48inline void _construct_sa_IS(int_vector<> & text,
49 int_vector<> & sa,
50 std::string & filename_sa,
51 size_t n,
52 size_t text_offset,
53 size_t sigma,
54 uint64_t recursion)
55{
56 uint64_t buffersize = 1024 * 1024 / 8;
57
58 size_t name = 0;
59 size_t number_of_lms_strings = 0;
60 std::string filename_c_array = tmp_file(filename_sa, "_c_array" + util::to_string(recursion));
61 // Phase 1
62 {
63 std::vector<uint64_t> bkt(sigma, 0);
64 // Step 1 - Count characters into c array
65 // TODO: better create this in higher recursion-level
66 for (size_t i = 0; i < n; ++i) { ++bkt[text[text_offset + i]]; }
67
68 // Step 1.5 save them into cached_external_array
69 int_vector_buffer<> c_array(filename_c_array, std::ios::out, buffersize, 64);
70 for (size_t c = 0; c < sigma; ++c) { c_array[c] = bkt[c]; }
71
72 // Step 2 Calculate End-Pointer of Buckets
73 bkt[0] = 0;
74 for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + bkt[c]; }
75
76 // Step 3 - Insert S*-positions into correct bucket of SA but not in correct order inside the buckets
77 for (size_t i = n - 2, was_s_typ = 1; i < n; --i)
78 {
79 if (text[text_offset + i] > text[text_offset + i + 1])
80 {
81 if (was_s_typ)
82 {
83 sa[bkt[text[text_offset + i + 1]]--] = i + 1;
84 ++number_of_lms_strings;
85 was_s_typ = 0;
86 }
87 }
88 else if (text[text_offset + i] < text[text_offset + i + 1])
89 {
90 was_s_typ = 1;
91 }
92 }
93
94 // Step 4 - Calculate Begin-Pointer of Buckets
95 bkt[0] = 0;
96 for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c - 1]; }
97
98 // Step 5 - Scan from Left-To-Right to induce L-Types
99 for (size_t i = 0; i < n; ++i)
100 {
101 if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
102 { // faster than if(sa[i]>0 and bkt_beg[text[ sa[i]-1 ]] > i)
103 sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
104 sa[i] = 0;
105 }
106 }
107
108 // Step 6 - Scan from Right-To-Left to induce S-Types
109 bkt[0] = 0;
110 for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
111 c_array.close();
112
113 for (size_t i = n - 1, endpointer = n; i < n; --i)
114 {
115 if (sa[i] > 0)
116 {
117 if (text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
118 { // faster than if(bkt_end[text[ sa[i]-1 ]] < i)
119 sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
120 }
121 else
122 {
123 sa[--endpointer] = sa[i];
124 }
125 sa[i] = 0;
126 }
127 }
128
129 // Step 7 - Determine length of LMS-Strings
130 for (size_t i = n - 2, end = n - 2, was_s_typ = 1; i < n; --i)
131 {
132 if (text[text_offset + i] > text[text_offset + i + 1])
133 {
134 if (was_s_typ)
135 {
136 sa[(i + 1) >> 1] = end - i;
137 end = i + 1;
138 was_s_typ = 0;
139 }
140 }
141 else if (text[text_offset + i] < text[text_offset + i + 1])
142 {
143 was_s_typ = 1;
144 }
145 }
146
147 // Step 8 - Rename
148 for (size_t i = n - number_of_lms_strings + 1, cur_pos = 0, cur_len = 0, last_pos = n - 1, last_len = 1; i < n;
149 ++i)
150 {
151 cur_pos = sa[i];
152 cur_len = sa[(cur_pos >> 1)];
153 if (cur_len == last_len)
154 {
155 size_t l = 0;
156 while (l < cur_len and text[text_offset + cur_pos + l] == text[text_offset + last_pos + l]) { ++l; }
157 if (l >= cur_len) { --name; }
158 }
159 sa[(cur_pos >> 1)] = ++name;
160 last_pos = cur_pos;
161 last_len = cur_len;
162 }
163 }
164
165 // Step 9 - Calculate SA of new string - in most cases recursive
166 if (name + 1 < number_of_lms_strings)
167 {
168 // Move Names to the end
169 for (size_t i = 0, t = n - number_of_lms_strings; i < (n >> 1); ++i)
170 {
171 if (sa[i] > 0)
172 {
173 sa[t++] = sa[i];
174 sa[i] = 0;
175 }
176 }
177 sa[n - 1] = 0;
178
179 // Recursive call
180 std::string filename_sa_rec = tmp_file(filename_sa, "_sa_rec" + util::to_string(recursion + 1));
182 sa,
183 filename_sa_rec,
184 number_of_lms_strings,
185 n - number_of_lms_strings,
186 name + 1,
187 recursion + 1);
188
189 for (size_t i = n - 2, endpointer = n - 1, was_s_typ = 1; i < n; --i)
190 {
191 if (text[text_offset + i] > text[text_offset + i + 1])
192 {
193 if (was_s_typ)
194 {
195 sa[endpointer--] = i + 1;
196 was_s_typ = 0;
197 }
198 }
199 else if (text[text_offset + i] < text[text_offset + i + 1])
200 {
201 was_s_typ = 1;
202 }
203 }
204
205 // Sort S*-positions in correct order into SA
206 for (size_t i = 0; i < number_of_lms_strings; ++i)
207 {
208 size_t pos = sa[i];
209 sa[i] = sa[n - number_of_lms_strings + pos];
210 sa[n - number_of_lms_strings + pos] = 0;
211 }
212 }
213 else
214 {
215 // Move s*-Positions to front
216 sa[0] = n - 1;
217 for (size_t i = 1; i < number_of_lms_strings; ++i)
218 {
219 sa[i] = sa[n - number_of_lms_strings + i];
220 sa[n - number_of_lms_strings + i] = 0;
221 }
222 // Clear lex. names
223 for (size_t i = number_of_lms_strings; i < (n >> 1); ++i) { sa[i] = 0; }
224 }
225
226 // Phase 3
227 {
228 // Step 10 - Count characters into c array
229
230 // Step 11 - Calculate End-Pointer of Buckets
231 int_vector_buffer<> c_array(filename_c_array, std::ios::in, buffersize, 64);
232 std::vector<uint64_t> bkt(sigma, 0);
233 for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
234
235 // Step 12 - Move S*-positions in correct order into SA
236 for (size_t i = number_of_lms_strings - 1; i < n; --i)
237 {
238 size_t pos = sa[i];
239 sa[i] = 0;
240 sa[bkt[text[text_offset + pos]]--] = pos;
241 }
242
243 // Step 13 - Calculate Begin-Pointer of Buckets
244 bkt[0] = 0;
245 for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c - 1]; }
246
247 // Step 14 - Scan from Left-To-Right to induce L-Types
248 for (size_t i = 0; i < n; ++i)
249 {
250 if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
251 { // faster than if(sa[i]>0 and bkt_beg[text[ sa[i]-1 ]] > i)
252 sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
253 }
254 }
255
256 // Step 15 - Scan from Right-To-Left to induce S-Types
257 bkt[0] = 0;
258 for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
259 for (size_t i = n - 1; i < n; --i)
260 {
261 if (sa[i] > 0 and text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
262 {
263 sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
264 }
265 }
266 c_array.close(true);
267 }
268}
269
270template <class int_vector_type>
271void _construct_sa_se(int_vector_type & text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
272{
273 std::string filename_text = tmp_file(filename_sa, "_text_rec" + util::to_string(recursion));
274 store_to_file(text, filename_text);
275 uint64_t n = text.size();
276 uint64_t nsize = bits::hi(n) + 1;
277 uint8_t int_width = bits::hi(n - 1) + 1;
278 uint64_t buffersize = 1024 * 1024 / 8;
279
280 // Step 1 - Scan Text from right to left and count LMS, S and L characters and store lms_positions
281
282 // Define variables
283 size_t first_lms_pos = 0;
284 size_t number_of_lms_strings = 0;
285 size_t bkt_s_last = 0, bkt_s_sum = 0, bound_s = 0, bkt_l_sum = 0;
286 int_vector<> C(sigma, 0, int_width);
287 int_vector<> bkt_lms(sigma, 0, int_width);
288 int_vector<> bkt_s(sigma, 0, int_width);
289 int_vector<> bkt_l(sigma, 0, int_width);
290 std::string filename_lms_pos_b = tmp_file(filename_sa, "_lms_pos_b" + util::to_string(recursion));
291 size_t parts = 10;
292
293 {
294 int_vector_buffer<1> lms_pos_b(filename_lms_pos_b, std::ios::out, buffersize, 1);
295 uint64_t ci = text[n - 1];
296 ++C[ci];
297 bool was_s_typ = 1;
298 for (size_t i = n - 2; i < n; --i)
299 {
300 uint64_t cip1 = ci;
301 ci = text[i];
302 ++C[ci];
303 if (was_s_typ)
304 {
305 ++bkt_s[text[i + 1]];
306 if (ci > cip1)
307 {
308 ++bkt_lms[cip1];
309 lms_pos_b[i + 1] = 1;
310 ++number_of_lms_strings;
311 first_lms_pos = i + 1;
312 was_s_typ = 0;
313 }
314 }
315 else if (ci < cip1)
316 {
317 was_s_typ = 1;
318 }
319 }
320 if (was_s_typ) { ++bkt_s[ci]; }
321 bkt_l[0] = C[0] - bkt_s[0];
322 for (size_t i = 1; i < C.size(); ++i)
323 {
324 bkt_l[i] = C[i] - bkt_s[i];
325 C[i] = C[i] + C[i - 1];
326 }
327 lms_pos_b.close();
328 }
329
330 // Step 2 - Scan Text from right to left and detect LMS-Positions. Sort and write them to disk
331 int_vector_buffer<> right(tmp_file(filename_sa, "_right" + util::to_string(recursion)),
332 std::ios::out,
333 buffersize,
334 nsize);
335 size_t right_pointer = 0;
336 int_vector_buffer<> left(tmp_file(filename_sa, "_left" + util::to_string(recursion)),
337 std::ios::out,
338 buffersize,
339 nsize);
340 size_t left_pointer = 0;
341 {
342 for (size_t i = 0, tmp2 = 0, tmp = 0; i < sigma; ++i)
343 {
344 tmp += bkt_lms[i];
345 bkt_lms[i] = tmp2;
346 tmp2 = tmp;
347 }
348 int_vector_buffer<> lms_positions(tmp_file(filename_sa, "_lms_positions" + util::to_string(recursion)),
349 std::ios::out,
350 buffersize,
351 nsize);
352 for (size_t i = n - 2, was_s_typ = 1, ci = text[n - 1]; i < n; --i)
353 {
354 uint64_t cip1 = ci;
355 ci = text[i];
356 if (ci > cip1)
357 {
358 if (was_s_typ)
359 {
360 lms_positions.push_back(bkt_lms[cip1]);
361 lms_positions.push_back(i + 1);
362 ++bkt_lms[cip1];
363 was_s_typ = 0;
364 }
365 }
366 else if (ci < cip1)
367 {
368 was_s_typ = 1;
369 }
370 }
371 util::clear(text);
372 {
373 // Order lms_positions according to first character
374 int_vector<> lms_strings(number_of_lms_strings, 0, int_width);
375 for (size_t i = 0; i < lms_positions.size();)
376 {
377 size_t idx = lms_positions[i++];
378 size_t val = lms_positions[i++];
379 lms_strings[idx] = val;
380 }
381 lms_positions.close(true);
382 // Store it to file
383 left_pointer = 0;
384 for (size_t i = 0; i < number_of_lms_strings; ++i)
385 {
386 left[left_pointer++] = lms_strings[number_of_lms_strings - i - 1];
387 }
388 }
389 load_from_file(text, filename_text);
390 }
391 left_pointer--;
392
393 // Step 3 - Scan virtual array from left to right
394 {
395 // prepare bkt_l and backup it into bkt_lms
396 for (size_t i = 0, tmp = 0; i < sigma; ++i)
397 {
398 tmp = bkt_l[i];
399 bkt_l[i] = bkt_l_sum;
400 bkt_l_sum += tmp;
401 bkt_lms[i] = bkt_l[i];
402 }
403
404 size_t partsize = bkt_l_sum / parts + 1;
405
406 int_vector<> array(partsize, 0, int_width);
407 std::vector<int_vector_buffer<>> cached_array(parts - 1);
408 for (size_t i = 0; i < cached_array.size(); ++i)
409 {
410 cached_array[i] = int_vector_buffer<>(tmp_file(filename_sa,
411 "_rightbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
412 std::ios::out,
413 buffersize,
414 nsize);
415 }
416
417 for (size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
418 {
419 // begin with array
420 for (; pos < bkt_l[c]; ++pos)
421 {
422 // Load lazy values
423 if (pos - offset >= partsize)
424 {
425 offset += partsize;
426 for (size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
427 {
428 size_t src = cached_array[cur_part][i++];
429 size_t val = cached_array[cur_part][i++];
430 array[src - offset] = val;
431 }
432 cached_array[pos / partsize - 1].reset();
433 }
434
435 size_t idx = array[pos - offset];
436 if (idx == 0) { right[right_pointer++] = idx; }
437 else
438 {
439 size_t symbol = text[idx - 1];
440 if (symbol >= c)
441 {
442 size_t val = idx - 1;
443 size_t src = bkt_l[symbol];
444 bkt_l[symbol] = bkt_l[symbol] + 1;
445 if ((src - offset) / partsize == 0) { array[src - offset] = val; }
446 else
447 {
448 size_t part = src / partsize - 1;
449 cached_array[part].push_back(src);
450 cached_array[part].push_back(val);
451 }
452 }
453 else
454 {
455 right[right_pointer++] = idx;
456 }
457 }
458 }
459 // continue with stack
460 while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
461 {
462 size_t idx = left[left_pointer--];
463 --idx;
464 size_t symbol = text[idx];
465
466 size_t val = idx;
467 size_t src = bkt_l[symbol];
468 bkt_l[symbol] = bkt_l[symbol] + 1;
469 if ((src - offset) / partsize == 0) { array[src - offset] = val; }
470 else
471 {
472 size_t part = src / partsize - 1;
473 cached_array[part].push_back(src);
474 cached_array[part].push_back(val);
475 }
476 }
477 }
478 for (size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(true); }
479
480 // Restore bkt_l from bkt_lms
481 for (size_t i = 0; i < sigma; ++i) { bkt_l[i] = bkt_lms[i]; }
482 }
483 right_pointer--;
484
485 // Step 4 - Scan virtual array from right to left
486 left_pointer = 0;
487 left.reset();
488 {
489 // Prepare bkt_s and backup it into bkt_lms
490 bkt_s_last = 0, bkt_s_sum = 0;
491 for (size_t i = 0; i < sigma; ++i)
492 {
493 bkt_s_sum += bkt_s[i];
494 if (bkt_s[i])
495 {
496 bkt_s[i] = bkt_s_sum;
497 bkt_s_last = bkt_s_sum;
498 }
499 else
500 {
501 bkt_s[i] = bkt_s_sum;
502 }
503 bkt_lms[i] = bkt_s[i];
504 }
505 bound_s = bkt_s_sum;
506
507 // Determine splitting parameters
508 for (size_t i = 0; i < sigma; ++i)
509 {
510 if (bkt_s[i] > bkt_s_sum / 2)
511 {
512 bkt_s_sum = bkt_s[i];
513 break;
514 }
515 }
516
517 size_t partsize = bound_s / parts + 1;
518 int_vector<> array(partsize, 0, int_width);
519 std::vector<int_vector_buffer<>> cached_array(parts - 1);
520 for (size_t i = 0; i < cached_array.size(); ++i)
521 {
522 cached_array[i] = int_vector_buffer<>(tmp_file(filename_sa,
523 "_leftbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
524 std::ios::out,
525 buffersize,
526 nsize);
527 }
528 for (size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
529 {
530 // begin with array
531 for (; pos + 1 > bkt_s[c]; --pos)
532 {
533 while (pos < offset)
534 {
535 // Load buffered values
536 offset -= partsize;
537 for (size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
538 {
539 size_t src = cached_array[cur_part][i++];
540 size_t val = cached_array[cur_part][i++];
541 array[src - offset] = val;
542 }
543 cached_array[offset / partsize].reset();
544 }
545
546 size_t idx = array[pos - offset];
547 if (idx == 0) { idx = n; }
548 --idx;
549 size_t symbol = text[idx];
550 if (symbol <= c)
551 {
552 bkt_s[symbol] = bkt_s[symbol] - 1;
553 size_t val = idx;
554 size_t src = bkt_s[symbol];
555 if (src >= offset) { array[src - offset] = val; }
556 else
557 {
558 size_t part = src / partsize;
559 cached_array[part].push_back(src);
560 cached_array[part].push_back(val);
561 }
562 }
563 else
564 {
565 left[left_pointer++] = array[pos - offset];
566 }
567 }
568
569 // continue with stack
570 while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
571 {
572 size_t idx = right[right_pointer--];
573 if (idx == 0) { idx = n; }
574 --idx;
575 size_t symbol = text[idx];
576 bkt_s[symbol] = bkt_s[symbol] - 1;
577
578 size_t val = idx;
579 size_t src = bkt_s[symbol];
580 if (src >= offset) { array[src - offset] = val; }
581 else
582 {
583 size_t part = src / partsize;
584 cached_array[part].push_back(src);
585 cached_array[part].push_back(val);
586 }
587 }
588 }
589 for (size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(true); }
590 // Restore bkt_s from bkt_lms
591 for (size_t i = 0; i < sigma; ++i) { bkt_s[i] = bkt_lms[i]; }
592 }
593 right.buffersize(0);
594 right.reset();
595 right_pointer = 0;
596 --left_pointer;
597
598 // Step 5 - Detect same lms-Strings, write text to file
599 int_vector<1> same_lms(number_of_lms_strings, false);
600 size_t last_end_pos = first_lms_pos, order = number_of_lms_strings - 1;
601 same_lms[number_of_lms_strings - 1] = true;
602 for (size_t i = number_of_lms_strings - 2, a = 0, b = 0, last_a = left[number_of_lms_strings - 1];
603 i < number_of_lms_strings;
604 --i)
605 {
606 b = last_a;
607 a = left[i];
608 last_a = a;
609
610 size_t end_pos = _get_next_lms_position(text, a);
611 if (end_pos - a == last_end_pos - b)
612 {
613 while (a < end_pos and text[a] == text[b])
614 {
615 ++a;
616 ++b;
617 }
618 if (text[a] == text[b])
619 {
620 same_lms[i] = true;
621 --order;
622 }
623 }
624 last_end_pos = end_pos;
625 }
626 util::clear(text);
627
628 // Step 7 - Create renamed string
629 int_vector<> text_rec;
630 if (recursion == 0) { text_rec.width((bits::hi(order + 1) + 1)); }
631 else
632 {
633 text_rec.width((bits::hi(number_of_lms_strings + 1) + 1));
634 }
635 text_rec.resize(number_of_lms_strings);
636 util::_set_zero_bits(text_rec);
637 {
638 if (recursion == 0 and n / 2 * text_rec.width() > 8 * n)
639 {
640 size_t size_of_part = n / 4 + 3;
641 text_rec.resize(size_of_part);
642 util::_set_zero_bits(text_rec);
643 order = 0;
644 for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
645 {
646 if (!same_lms[i]) { ++order; }
647 if (left[i] / 2 >= size_of_part) { text_rec[(left[i] / 2) - size_of_part] = order; }
648 }
649 std::string filename_text_rec_part2 = tmp_file(filename_sa, "_text_rec_part2" + util::to_string(recursion));
650 size_t pos = 0;
651 for (size_t i = 0; i < size_of_part; ++i)
652 {
653 if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
654 }
655 text_rec.resize(pos);
656 store_to_file(text_rec, filename_text_rec_part2);
657 text_rec.resize(size_of_part);
658 util::_set_zero_bits(text_rec);
659 order = 0;
660 for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
661 {
662 if (!same_lms[i]) { ++order; }
663 if (left[i] / 2 < size_of_part) { text_rec[left[i] / 2] = order; }
664 }
665 pos = 0;
666 for (size_t i = 0; i < size_of_part; ++i)
667 {
668 if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
669 }
670 text_rec.resize(number_of_lms_strings);
671 int_vector_buffer<> buf(filename_text_rec_part2, std::ios::in, 1024 * 1024);
672 for (size_t i = 0; i < buf.size(); ++i) { text_rec[pos++] = buf[i]; }
673 buf.close(true);
674 text_rec[number_of_lms_strings - 1] = 0;
675 }
676 else
677 {
678 text_rec.resize(n / 2 + 1);
679 util::_set_zero_bits(text_rec);
680 order = 0;
681 for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
682 {
683 if (!same_lms[i]) { ++order; }
684 text_rec[left[left_pointer--] / 2] = order;
685 }
686 for (size_t i = 0, pos = 0; i < text_rec.size(); ++i)
687 {
688 if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
689 }
690 text_rec[number_of_lms_strings - 1] = 0;
691 text_rec.resize(number_of_lms_strings);
692 }
693 }
694 util::clear(same_lms);
695 left.buffersize(0);
696 left.reset();
697
698 // Step 8 - Determine complete LMS order (recursivly)
699 int_vector<> isa_rec;
700 std::string filename_sa_rec = tmp_file(filename_sa, "_sa_rec" + util::to_string(recursion + 1));
701 if (text_rec.size() > order + 1)
702 {
703 if (recursion == 0)
704 {
705 memory_monitor::event("begin _construct_sa");
706 _construct_sa_se<int_vector<>>(text_rec, filename_sa_rec, order + 1, recursion + 1);
707 memory_monitor::event("end _construct_sa");
708 }
709 else
710 {
711 text_rec.resize(text_rec.size() * 2);
712 for (size_t i = 0; i < number_of_lms_strings; ++i)
713 {
714 text_rec[number_of_lms_strings + i] = text_rec[i];
715 text_rec[i] = 0;
716 }
717 memory_monitor::event("begin sa_simple");
718 _construct_sa_IS(text_rec,
719 text_rec,
720 filename_sa_rec,
721 number_of_lms_strings,
722 number_of_lms_strings,
723 order + 1,
724 recursion + 1);
725 memory_monitor::event("end sa_simple");
726 // SA' in first half, S' in second half
727 text_rec.resize(number_of_lms_strings);
728 store_to_file(text_rec, filename_sa_rec);
729 }
730 }
731 else
732 {
733 isa_rec = std::move(text_rec);
734 }
735
736 // Step 9 - Initiate left for scan in step 12
737 if (isa_rec.size() > 0)
738 {
739 // isa_rec exists //TODO test if its better to create sa_rec
740 // TODO always enough memory? in memory: isa_rec, lms_pos_b, select_support, tmp_left, leftbuffer
741 // load bit_vector lms_positions and create select support
742 bit_vector lms_pos_b(n);
743 load_from_file(lms_pos_b, filename_lms_pos_b);
744 sdsl::remove(filename_lms_pos_b);
745 select_support_mcl<> lms_select_support; // select_support for bit_vector
746 util::init_support(lms_select_support, &lms_pos_b); // Create select_support
747 // write left
748 int_vector<> tmp_left(number_of_lms_strings, 0, int_width);
749 for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
750 {
751 size_t idx = isa_rec[i];
752 size_t val = lms_select_support.select(i + 1); // TODO test alternative without select support: look for 1
753 // in lms_pos_b (backwards)
754 tmp_left[idx] = val;
755 }
756 util::clear(lms_select_support);
757 util::clear(lms_pos_b);
758 util::clear(isa_rec);
759 // write to left
760 left.buffersize(buffersize);
761 left_pointer = 0;
762 for (; left_pointer < number_of_lms_strings; ++left_pointer)
763 {
764 left[left_pointer] = tmp_left[number_of_lms_strings - left_pointer - 1];
765 }
766 left_pointer--;
767 util::clear(tmp_left);
768 }
769 else
770 {
771 left.buffersize(buffersize);
772 left_pointer = 0;
773 {
774 // load bit_vector lms_positions and create select support
775 bit_vector lms_pos_b(n);
776 load_from_file(lms_pos_b, filename_lms_pos_b);
777 sdsl::remove(filename_lms_pos_b);
778 select_support_mcl<> lms_select_support; // select_support for bit_vector
779 util::init_support(lms_select_support, &lms_pos_b); // create select_support
780 // write to left sa_rec buffered
781 int_vector_buffer<> sa_rec_buf(filename_sa_rec, std::ios::in, buffersize, nsize);
782 for (uint64_t i = 0; i < sa_rec_buf.size(); ++i)
783 {
784 uint64_t pos = lms_select_support.select(sa_rec_buf[i] + 1);
785 left[number_of_lms_strings - 1 - left_pointer++] = pos;
786 }
787 sa_rec_buf.close(true);
788 left_pointer--;
789 }
790 // TODO test sa_rec unbuffered in recursion level 1 -> space still good?
791 }
792
793 // Step 10 - Reload text
794 load_from_file(text, filename_text);
795 sdsl::remove(filename_text);
796
797 // Step 12 - Scan virtual array from left to right second time
798 right.buffersize(buffersize);
799 right_pointer = 0;
800 int_vector_buffer<> cached_sa(filename_sa, std::ios::out, buffersize, nsize);
801 size_t sa_pointer = 0;
802 {
803 size_t partsize = bkt_l_sum / parts + 1;
804 int_vector<> array(partsize, 0, int_width);
805 std::vector<int_vector_buffer<>> cached_array(parts - 1);
806 for (size_t i = 0; i < cached_array.size(); ++i)
807 {
808 cached_array[i] = int_vector_buffer<>(tmp_file(filename_sa,
809 "_rightbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
810 std::ios::out,
811 buffersize,
812 nsize);
813 }
814
815 for (size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
816 {
817 // begin with array
818 for (; pos < bkt_l[c]; ++pos)
819 {
820 // Load lazy values
821 if (pos - offset >= partsize)
822 {
823 offset += partsize;
824 for (size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
825 {
826 size_t src = cached_array[cur_part][i++];
827 size_t val = cached_array[cur_part][i++];
828 array[src - offset] = val;
829 }
830 cached_array[pos / partsize - 1].reset();
831 }
832 size_t idx = array[pos - offset];
833 if (idx == 0)
834 {
835 cached_sa[sa_pointer++] = idx;
836 right[right_pointer++] = idx;
837 }
838 else
839 {
840 size_t symbol = text[idx - 1];
841 cached_sa[sa_pointer++] = idx;
842 if (symbol >= c)
843 {
844 size_t val = idx - 1;
845 size_t src = bkt_l[symbol];
846 bkt_l[symbol] = bkt_l[symbol] + 1;
847 if ((src - offset) / partsize == 0) { array[src - offset] = val; }
848 else
849 {
850 size_t part = src / partsize - 1;
851 cached_array[part].push_back(src);
852 cached_array[part].push_back(val);
853 }
854 }
855 else
856 {
857 right[right_pointer++] = idx;
858 }
859 }
860 }
861 sa_pointer = C[c];
862 // continue with stack
863 while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
864 {
865 size_t idx = left[left_pointer--];
866 if (idx == 0) { idx = n; }
867 --idx;
868 size_t symbol = text[idx];
869 size_t val = idx;
870 size_t src = bkt_l[symbol];
871 bkt_l[symbol] = bkt_l[symbol] + 1;
872 if ((src - offset) / partsize == 0) { array[src - offset] = val; }
873 else
874 {
875 size_t part = src / partsize - 1;
876 cached_array[part].push_back(src);
877 cached_array[part].push_back(val);
878 }
879 }
880 }
881 for (size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(true); }
882 }
883 left.close(true);
884 right_pointer--;
885
886 // Step 13 - Scan virtual array from right to left second time
887 {
888 size_t partsize = bound_s / parts + 1;
889
890 int_vector<> array(partsize, 0, int_width);
891 std::vector<int_vector_buffer<>> cached_array(parts - 1);
892 for (size_t i = 0; i < cached_array.size(); ++i)
893 {
894 cached_array[i] = int_vector_buffer<>(tmp_file(filename_sa,
895 "_leftbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
896 std::ios::out,
897 buffersize,
898 nsize);
899 // cached_array_pointer[i] = 0;
900 }
901 for (size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
902 {
903 // begin with array
904 assert(c < C.size());
905 sa_pointer = C[c] - 1;
906 for (; pos + 1 > bkt_s[c]; --pos)
907 {
908 while (pos < offset)
909 {
910 // Load buffered values
911 offset -= partsize;
912 for (size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
913 {
914 size_t src = cached_array[cur_part][i++];
915 size_t val = cached_array[cur_part][i++];
916 assert((src - offset) < array.size());
917 array[src - offset] = val;
918 }
919 assert((offset / partsize) < parts - 1);
920 cached_array[offset / partsize].reset();
921 }
922
923 assert((pos - offset) < array.size());
924 size_t idx = array[pos - offset];
925 if (idx == 0) { idx = n; }
926 --idx;
927 assert((idx) < text.size());
928 size_t symbol = text[idx];
929 if (symbol <= c)
930 {
931 if (idx == n - 1) { cached_sa[sa_pointer--] = 0; }
932 else
933 {
934 cached_sa[sa_pointer--] = idx + 1;
935 }
936 assert((symbol) < bkt_s.size());
937 bkt_s[symbol] = bkt_s[symbol] - 1;
938 size_t val = idx;
939 size_t src = bkt_s[symbol];
940 if (src >= offset)
941 {
942 assert((src - offset) < array.size());
943 array[src - offset] = val;
944 }
945 else
946 {
947 size_t part = src / partsize;
948 assert(part < parts - 1);
949 cached_array[part].push_back(src);
950 cached_array[part].push_back(val);
951 }
952 }
953 else
954 {
955 if (idx == n - 1) { cached_sa[sa_pointer--] = 0; }
956 else
957 {
958 cached_sa[sa_pointer--] = idx + 1;
959 }
960 }
961 }
962 // continue with stack
963 while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
964 {
965 size_t idx = right[right_pointer--];
966 if (idx == 0) { idx = n; }
967 --idx;
968 size_t symbol = text[idx];
969 assert((symbol) < bkt_s.size());
970 bkt_s[symbol] = bkt_s[symbol] - 1;
971
972 size_t val = idx;
973 size_t src = bkt_s[symbol];
974 if (src >= offset)
975 {
976 assert((src - offset) < array.size());
977 array[src - offset] = val;
978 }
979 else
980 {
981 size_t part = src / partsize;
982 assert((part) < parts - 1);
983 cached_array[part].push_back(src);
984 cached_array[part].push_back(val);
985 }
986 }
987 }
988 for (size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(true); }
989 }
990 right.close(true);
991 cached_sa.close();
992
993 return;
994}
995
996} // namespace sdsl
997
998#endif
uint64_t size() const
Returns the number of elements currently stored.
void reset()
Delete all content and set size to 0.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:595
size_type size() const noexcept
The number of elements in the int_vector.
void push_back(value_type value)
Insert element at the end.
Definition: int_vector.hpp:448
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:521
static mm_event_proxy event(const std::string &name)
A class supporting constant time select queries.
size_type select(size_type i) const
Select function.
int_vector.hpp contains the sdsl::int_vector class.
io.hpp contains some methods for reading/writing sdsl structures.
std::string to_string(const T &t, int w=1)
void _set_zero_bits(t_int_vec &v)
Sets all bits of the int_vector to 0-bits.
Definition: util.hpp:531
Namespace for the succinct data structure library.
void _construct_sa_IS(int_vector<> &text, int_vector<> &sa, std::string &filename_sa, size_t n, size_t text_offset, size_t sigma, uint64_t recursion)
uint64_t _get_next_lms_position(int_vector_type &text, uint64_t i)
void _construct_sa_se(int_vector_type &text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
std::string tmp_file(const cache_config &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
Definition: io.hpp:698
bool load_from_file(T &v, const std::string &file)
Load sdsl-object v from a file.
Definition: io.hpp:901
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
Definition: io.hpp:798
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
rank_support.hpp contains classes that support a sdsl::bit_vector with constant time rank information...
select_support.hpp contains classes that support a sdsl::bit_vector with constant time select informa...
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:651