libstdc++
unicode.h
Go to the documentation of this file.
1 // Unicode utilities -*- C++ -*-
2 
3 // Copyright The GNU Toolchain Authors.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /** @file include/bits/unicode.h
26  * This is an internal header file, included by other library headers.
27  * Do not attempt to use it directly. @headername{format}
28  */
29 
30 #ifndef _GLIBCXX_UNICODE_H
31 #define _GLIBCXX_UNICODE_H 1
32 
33 #if __cplusplus >= 202002L
34 #include <array>
35 #include <bit> // bit_width
36 #include <charconv> // __detail::__from_chars_alnum_to_val_table
37 #include <string_view>
38 #include <cstdint>
39 #include <bits/stl_algo.h>
40 #include <bits/stl_iterator.h>
41 #include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
42 #include <bits/ranges_util.h> // view_interface
43 
44 namespace std _GLIBCXX_VISIBILITY(default)
45 {
46 _GLIBCXX_BEGIN_NAMESPACE_VERSION
47 namespace __unicode
48 {
// A Unicode scalar value: a code point no greater than U+10FFFF that is
// not a high or low surrogate (U+D800..U+DFFF).
constexpr bool
__is_scalar_value(char32_t __c)
{
  if (__c < 0xD800) [[likely]]
    return true;
  return 0xDFFF < __c && __c <= 0x10FFFF;
}

// A code point that can be encoded in a single code unit of type _CharT.
//
// For code unit types whose maximum value fits in 8 bits only the ASCII
// range U+0000..U+007F qualifies.  The upper bound is inclusive: U+007F
// is a one-byte sequence in UTF-8.
//
// For wider types any scalar value representable in _CharT qualifies,
// again including the maximum itself: U+FFFF is a single UTF-16 code unit
// and must not be split into a surrogate pair.
template<typename _CharT>
  constexpr bool
  __is_single_code_unit(char32_t __c)
  {
    if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
      return __c <= 0x7F; // ASCII character
    else
      return __c <= __gnu_cxx::__int_traits<_CharT>::__max
               && __is_scalar_value(__c);
  }
69 
70  // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
71 
// Default error handler for _Utf_iterator: produces U+FFFD, the code
// point that _S_error() substitutes for ill-formed input.
struct _Repl
{
  constexpr char32_t
  operator()() const noexcept
  {
    constexpr char32_t __replacement_character = 0xFFFD;
    return __replacement_character;
  }
};
78 
79  struct _Null_sentinel_t
80  {
81  template<input_iterator _It>
82  requires default_initializable<iter_value_t<_It>>
83  && equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
84  friend constexpr auto
85  operator==(_It __it, _Null_sentinel_t)
86  { return *__it == iter_value_t<_It>{}; }
87  };
88 
89  template<typename _FromFmt, typename _ToFmt,
90  input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
91  typename _ErrorHandler = _Repl>
92  requires convertible_to<iter_value_t<_Iter>, _FromFmt>
93  class _Utf_iterator
94  {
95  static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
96 
97  public:
98  using value_type = _ToFmt;
99  using difference_type = iter_difference_t<_Iter>;
100  using reference = value_type;
101  using iterator_concept
102  = std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
103  bidirectional_iterator_tag>;
104 
105  constexpr _Utf_iterator() = default;
106 
107  constexpr
108  _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
109  requires bidirectional_iterator<_Iter>
110  : _M_first_and_curr{__first, __it}, _M_last(__last)
111  {
112  if (_M_curr() != _M_last)
113  _M_read();
114  else
115  _M_buf = {};
116  }
117 
118  constexpr
119  _Utf_iterator(_Iter __it, _Sent __last)
120  requires (!bidirectional_iterator<_Iter>)
121  : _M_first_and_curr{__it}, _M_last(__last)
122  {
123  if (_M_curr() != _M_last)
124  _M_read();
125  else
126  _M_buf = {};
127  }
128 
129  template<class _Iter2, class _Sent2>
130  requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
131  constexpr
132  _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
133  _ErrorHandler>& __other)
134  : _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
135  _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
136  _M_last(__other._M_last)
137  { }
138 
// The start of the underlying range.  Only available for bidirectional
// iterators, for which the start is stored.
[[nodiscard]]
constexpr _Iter
begin() const requires bidirectional_iterator<_Iter>
{ return _M_first(); }

// The sentinel marking the end of the underlying range.
[[nodiscard]]
constexpr _Sent
end() const { return _M_last; }

// The current position in the underlying range.
[[nodiscard]]
constexpr _Iter
base() const requires forward_iterator<_Iter>
{ return _M_curr(); }

// The count recorded by the last _M_update call: the number of input
// code units that operator++ steps over for the current code point.
[[nodiscard]]
constexpr iter_difference_t<_Iter>
_M_units() const requires forward_iterator<_Iter>
{ return _M_to_increment; }

// The current output code unit, read from the buffer of encoded units.
[[nodiscard]]
constexpr value_type
operator*() const { return _M_buf[_M_buf_index]; }
161 
162  constexpr _Utf_iterator&
163  operator++()
164  {
165  if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last)
166  {
167  if constexpr (forward_iterator<_Iter>)
168  std::advance(_M_curr(), _M_to_increment);
169  if (_M_curr() == _M_last)
170  _M_buf_index = 0;
171  else
172  _M_read();
173  }
174  else if (_M_buf_index + 1 < _M_buf_last)
175  ++_M_buf_index;
176  return *this;
177  }
178 
179  constexpr _Utf_iterator
180  operator++(int)
181  {
182  auto __tmp = *this;
183  ++*this;
184  return __tmp;
185  }
186 
187  constexpr _Utf_iterator&
188  operator--() requires bidirectional_iterator<_Iter>
189  {
190  if (!_M_buf_index && _M_curr() != _M_first())
191  _M_read_reverse();
192  else if (_M_buf_index)
193  --_M_buf_index;
194  return *this;
195  }
196 
197  constexpr _Utf_iterator
198  operator--(int)
199  {
200  auto __tmp = *this;
201  --*this;
202  return __tmp;
203  }
204 
205  [[nodiscard]]
206  friend constexpr bool
207  operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
208  requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
209  {
210  if constexpr (forward_iterator<_Iter>)
211  return __lhs._M_curr() == __rhs._M_curr()
212  && __lhs._M_buf_index == __rhs._M_buf_index;
213  else if (__lhs._M_curr() != __rhs._M_curr())
214  return false;
215  else if (__lhs._M_buf_index == __rhs._M_buf_index
216  && __lhs._M_buf_last == __rhs._M_buf_last)
217  return true;
218  else
219  return __lhs._M_buf_index == __lhs._M_buf_last
220  && __rhs._M_buf_index == __rhs._M_buf_last;
221  }
222 
223  [[nodiscard]]
224  friend constexpr bool
225  operator==(_Utf_iterator __lhs, _Sent __rhs)
226  {
227  if constexpr (forward_iterator<_Iter>)
228  return __lhs._M_curr() == __rhs;
229  else
230  return __lhs._M_curr() == __rhs
231  && __lhs._M_buf_index == __lhs._M_buf_last;
232  }
233 
234  private:
235  constexpr void
236  _M_read()
237  {
238  if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
239  _M_read_utf8();
240  else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
241  _M_read_utf16();
242  else
243  {
244  static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
245  _M_read_utf32();
246  }
247  }
248 
// Called by operator-- when the buffer index is 0 and the current
// position is not the start of the range; presumably decodes the code
// point that precedes the current position.
// NOTE(review): only this declaration (marked TODO) is visible here; no
// definition appears in the code shown -- confirm before using operator--.
constexpr void
_M_read_reverse(); // TODO
251 
252  template<typename>
253  struct _Guard
254  {
255  _Guard(void*, _Iter&) { }
256  };
257 
258  template<typename _It> requires forward_iterator<_It>
259  struct _Guard<_It>
260  {
261  constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
262  _Utf_iterator* _M_this;
263  _It _M_orig;
264  };
265 
// Decode one code point from UTF-8 input and pass it, together with the
// number of code units examined, to _M_update.
//
// The lead byte selects the sequence length; each continuation byte is
// range-checked before it is consumed.  On ill-formed or truncated input
// the result is _S_error() and only the units accepted so far are
// counted, so decoding resumes at the first offending unit.
constexpr void
_M_read_utf8()
{
  // For forward iterators, restores _M_curr() on scope exit.
  _Guard<_Iter> __g{this, _M_curr()};
  char32_t __c{};
  // Range of an ordinary continuation byte.
  const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
  uint8_t __u = *_M_curr()++;
  uint8_t __to_incr = 1;
  // Consume one more code unit and return the new position.
  auto __incr = [&, this] {
    ++__to_incr;
    return ++_M_curr();
  };

  if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
    __c = __u;
  else if (__u < 0xC2) [[unlikely]]
    // 0x80..0xC1 cannot start a sequence.
    __c = _S_error();
  else if (_M_curr() == _M_last) [[unlikely]]
    // A multi-unit lead with nothing following it.
    __c = _S_error();
  else if (__u <= 0xDF) // 0xC2 to 0xDF
    {
      __c = __u & 0x1F;
      __u = *_M_curr();

      if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
        __c = _S_error();
      else
        {
          __c = (__c << 6) | (__u & 0x3F);
          __incr();
        }
    }
  else if (__u <= 0xEF) // 0xE0 to 0xEF
    {
      // The second unit has a narrower range after 0xE0 and after 0xED.
      const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
      const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;

      __c = __u & 0x0F;
      __u = *_M_curr();

      if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
        __c = _S_error();
      else if (__incr() == _M_last) [[unlikely]]
        __c = _S_error();
      else
        {
          __c = (__c << 6) | (__u & 0x3F);
          __u = *_M_curr();

          if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
            __c = _S_error();
          else
            {
              __c = (__c << 6) | (__u & 0x3F);
              __incr();
            }
        }
    }
  else if (__u <= 0xF4) // 0xF0 to 0xF4
    {
      // The second unit has a narrower range after 0xF0 and after 0xF4.
      const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
      const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;

      __c = __u & 0x07;
      __u = *_M_curr();

      if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
        __c = _S_error();
      else if (__incr() == _M_last) [[unlikely]]
        __c = _S_error();
      else
        {
          __c = (__c << 6) | (__u & 0x3F);
          __u = *_M_curr();

          if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
            __c = _S_error();
          else if (__incr() == _M_last) [[unlikely]]
            __c = _S_error();
          else
            {
              __c = (__c << 6) | (__u & 0x3F);
              __u = *_M_curr();

              if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
                __c = _S_error();
              else
                {
                  __c = (__c << 6) | (__u & 0x3F);
                  __incr();
                }
            }
        }
    }
  else [[unlikely]]
    // 0xF5..0xFF are never valid lead bytes.
    __c = _S_error();

  _M_update(__c, __to_incr);
}
365 
366  constexpr void
367  _M_read_utf16()
368  {
369  _Guard<_Iter> __g{this, _M_curr()};
370  char32_t __c{};
371  uint16_t __u = *_M_curr()++;
372  uint8_t __to_incr = 1;
373 
374  if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
375  __c = __u;
376  else if (__u < 0xDC00 && _M_curr() != _M_last)
377  {
378  uint16_t __u2 = *_M_curr();
379  if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
380  __c = _S_error();
381  else
382  {
383  ++_M_curr();
384  __to_incr = 2;
385  uint32_t __x = (__u & 0x3F) << 10 | (__u2 & 0x3FF);
386  uint32_t __w = (__u >> 6) & 0x1F;
387  __c = (__w + 1) << 16 | __x;
388  }
389  }
390  else
391  __c = _S_error();
392 
393  _M_update(__c, __to_incr);
394  }
395 
396  constexpr void
397  _M_read_utf32()
398  {
399  _Guard<_Iter> __g{this, _M_curr()};
400  char32_t __c = *_M_curr()++;
401  if (!__is_scalar_value(__c)) [[unlikely]]
402  __c = _S_error();
403  _M_update(__c, 1);
404  }
405 
406  // Encode the code point __c as one or more code units in _M_buf.
407  constexpr void
408  _M_update(char32_t __c, uint8_t __to_incr)
409  {
410  _M_to_increment = __to_incr;
411  _M_buf_index = 0;
412  if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
413  {
414  _M_buf[0] = __c;
415  _M_buf_last = 1;
416  }
417  else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
418  {
419  if (__is_single_code_unit<_ToFmt>(__c))
420  {
421  _M_buf[0] = __c;
422  _M_buf[1] = 0;
423  _M_buf_last = 1;
424  }
425  else
426  {
427  // From http://www.unicode.org/faq/utf_bom.html#utf16-4
428  const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
429  char16_t __lead = __lead_offset + (__c >> 10);
430  char16_t __trail = 0xDC00 + (__c & 0x3FF);
431  _M_buf[0] = __lead;
432  _M_buf[1] = __trail;
433  _M_buf_last = 2;
434  }
435  }
436  else
437  {
438  static_assert(sizeof(_ToFmt) == 1);
439  int __bits = std::bit_width((uint32_t)__c);
440  if (__bits <= 7) [[likely]]
441  {
442  _M_buf[0] = __c;
443  _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
444  _M_buf_last = 1;
445  }
446  else if (__bits <= 11)
447  {
448  _M_buf[0] = 0xC0 | (__c >> 6);
449  _M_buf[1] = 0x80 | (__c & 0x3F);
450  _M_buf[2] = _M_buf[3] = 0;
451  _M_buf_last = 2;
452  }
453  else if (__bits <= 16)
454  {
455  _M_buf[0] = 0xE0 | (__c >> 12);
456  _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
457  _M_buf[2] = 0x80 | (__c & 0x3F);
458  _M_buf[3] = 0;
459  _M_buf_last = 3;
460  }
461  else
462  {
463  _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
464  _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
465  _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
466  _M_buf[3] = 0x80 | (__c & 0x3F);
467  _M_buf_last = 4;
468  }
469  }
470  }
471 
472  constexpr char32_t
473  _S_error()
474  {
475  char32_t __c = _ErrorHandler()();
476  __glibcxx_assert(__is_scalar_value(__c));
477  return __c;
478  }
479 
// The stored start of the underlying range (bidirectional iterators only).
constexpr _Iter
_M_first() const requires bidirectional_iterator<_Iter>
{ return _M_first_and_curr._M_first; }

// Mutable access to the current position in the underlying range.
constexpr _Iter&
_M_curr() { return _M_first_and_curr._M_curr; }

// A copy of the current position in the underlying range.
constexpr _Iter
_M_curr() const { return _M_first_and_curr._M_curr; }
489 
// Output code units of the current code point: room for 4 bytes' worth
// of _ToFmt units.
array<value_type, 4 / sizeof(_ToFmt)> _M_buf;

// Storage for the underlying iterator(s).  The primary template holds
// only the current position.
template<typename _It>
  struct _First_and_curr
  {
    _First_and_curr() = default;

    constexpr
    _First_and_curr(_It __curr) : _M_curr(__curr) { }

    // Convert from the storage of a convertible iterator type.
    template<convertible_to<_It> _It2>
      constexpr
      _First_and_curr(const _First_and_curr<_It2>& __other)
      : _M_curr(__other._M_curr) { }

    _It _M_curr;
  };

// For bidirectional iterators the start of the range is stored as well;
// operator-- compares the current position against it.
template<typename _It> requires bidirectional_iterator<_It>
  struct _First_and_curr<_It>
  {
    _First_and_curr() = default;

    constexpr
    _First_and_curr(_It __first, _It __curr)
    : _M_first(__first), _M_curr(__curr) { }

    // Convert from the storage of a convertible iterator type.
    template<convertible_to<_It> _It2>
      constexpr
      _First_and_curr(const _First_and_curr<_It2>& __other)
      : _M_first(__other._M_first), _M_curr(__other._M_curr) { }

    _It _M_first;
    _It _M_curr;
  };

_First_and_curr<_Iter> _M_first_and_curr;
527 
// Index into _M_buf of the code unit returned by operator*.
uint8_t _M_buf_index = 0;
// Number of code units in _M_buf that belong to the current code point.
uint8_t _M_buf_last = 0;
// Number of input code units occupied by the current code point.
uint8_t _M_to_increment = 0;

// End of the underlying range.
[[no_unique_address]] _Sent _M_last;

// Every specialization is a friend, so that the converting constructor
// can read the private members of its argument.
template<typename _FromFmt2, typename _ToFmt2,
         input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
         typename _ErrHandler>
  requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
  friend class _Utf_iterator;
539  };
540 
541  template<typename _ToFormat, ranges::input_range _Range>
542  class _Utf_view
543  : public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
544  {
545  using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
546  _ToFormat, ranges::iterator_t<_Range>,
547  ranges::sentinel_t<_Range>>;
548 
549  template<typename _Iter, typename _Sent>
550  constexpr auto
551  _M_begin(_Iter __first, _Sent __last)
552  {
553  if constexpr (bidirectional_iterator<_Iter>)
554  return _Iterator(__first, __first, __last);
555  else
556  return _Iterator(__first, __last);
557  }
558 
559  template<typename _Iter, typename _Sent>
560  constexpr auto
561  _M_end(_Iter __first, _Sent __last)
562  {
563  if constexpr (!is_same_v<_Iter, _Sent>)
564  return __last;
565  else if constexpr (bidirectional_iterator<_Iter>)
566  return _Iterator(__first, __last, __last);
567  else
568  return _Iterator(__last, __last);
569  }
570 
571  _Range _M_base;
572 
573  public:
574  constexpr explicit
575  _Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }
576 
577  constexpr auto begin()
578  { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
579 
580  constexpr auto end()
581  { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
582 
583  constexpr bool empty() const { return ranges::empty(_M_base); }
584  };
585 
// Convenience aliases for transcoding views to UTF-8, UTF-16 and UTF-32.
// The UTF-8 view yields char8_t when that type is available and plain
// char otherwise.
#ifdef __cpp_char8_t
template<typename _View>
  using _Utf8_view = _Utf_view<char8_t, _View>;
#else
template<typename _View>
  using _Utf8_view = _Utf_view<char, _View>;
#endif
template<typename _View>
  using _Utf16_view = _Utf_view<char16_t, _View>;
template<typename _View>
  using _Utf32_view = _Utf_view<char32_t, _View>;
597 
598 inline namespace __v16_0_0
599 {
600 #define _GLIBCXX_GET_UNICODE_DATA 160000
601 #include "unicode-data.h"
602 #ifdef _GLIBCXX_GET_UNICODE_DATA
603 # error "Invalid unicode data"
604 #endif
605 
606  // The field width of a code point.
607  constexpr int
608  __field_width(char32_t __c) noexcept
609  {
610  if (__c < __width_edges[0]) [[likely]]
611  return 1;
612 
613  auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
614  return (__p - __width_edges) % 2 + 1;
615  }
616 
617  // @pre c <= 0x10FFFF
618  constexpr bool
619  __should_escape_category(char32_t __c) noexcept
620  {
621  constexpr uint32_t __mask = 0x01;
622  auto* __end = std::end(__escape_edges);
623  auto* __p = std::lower_bound(__escape_edges, __end,
624  (__c << 1u) + 2);
625  return __p[-1] & __mask;
626  }
627 
628 
629  // @pre c <= 0x10FFFF
630  constexpr _Gcb_property
631  __grapheme_cluster_break_property(char32_t __c) noexcept
632  {
633  constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
634  auto* __end = std::end(__gcb_edges);
635  auto* __p = std::lower_bound(__gcb_edges, __end,
636  (__c << __gcb_shift_bits) | __mask);
637  return _Gcb_property(__p[-1] & __mask);
638  }
639 
640  constexpr bool
641  __is_incb_linker(char32_t __c) noexcept
642  {
643  const auto __end = std::end(__incb_linkers);
644  // Array is small enough that linear search is faster than binary search.
645  return _GLIBCXX_STD_A::find(__incb_linkers, __end, __c) != __end;
646  }
647 
648  // @pre c <= 0x10FFFF
649  constexpr _InCB
650  __incb_property(char32_t __c) noexcept
651  {
652  if ((__c << 2) < __incb_edges[0]) [[likely]]
653  return _InCB(0);
654 
655  constexpr uint32_t __mask = 0x3;
656  auto* __end = std::end(__incb_edges);
657  auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
658  return _InCB(__p[-1] & __mask);
659  }
660 
661  constexpr bool
662  __is_extended_pictographic(char32_t __c)
663  {
664  if (__c < __xpicto_edges[0]) [[likely]]
665  return 0;
666 
667  auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
668  return (__p - __xpicto_edges) % 2;
669  }
670 
671  struct _Grapheme_cluster_iterator_base
672  {
673  char32_t _M_c; // First code point in the cluster.
674  _Gcb_property _M_prop; // GCB property of _M_c.
675  enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
676  _XPicto _M_xpicto_seq_state = _XPicto::_Init;
677  unsigned char _M_RI_count = 0;
678  bool _M_incb_linker_seen = false;
679 
680  constexpr void
681  _M_reset(char32_t __c, _Gcb_property __p)
682  {
683  _M_c = __c;
684  _M_prop = __p;
685  _M_xpicto_seq_state = _XPicto::_Init;
686  _M_RI_count = 0;
687  _M_incb_linker_seen = false;
688  }
689 
690  constexpr void
691  _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
692  {
693  if (_M_xpicto_seq_state == _XPicto::_Failed)
694  return;
695 
696  auto __next_state = _XPicto::_Failed;
697  if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
698  {
699  if (__p == _Gcb_property::_Gcb_ZWJ)
700  {
701  if (_M_xpicto_seq_state == _XPicto::_Matched)
702  __next_state = _XPicto::_Zwj;
703  // We check _M_c here so that we do the lookup at most once,
704  // and only for clusters containing at least one ZWJ.
705  else if (__is_extended_pictographic(_M_c))
706  __next_state = _XPicto::_Zwj;
707  }
708  else if (__p == _Gcb_property::_Gcb_Extend)
709  __next_state = _M_xpicto_seq_state; // no change
710  }
711  else // Zwj
712  {
713  // This assumes that all \p{Extended_Pictographic} emoji have
714  // Grapheme_Cluster_Break=Other.
715  if (__p == _Gcb_property::_Gcb_Other
716  && __is_extended_pictographic(__c))
717  __next_state = _XPicto::_Matched;
718  }
719  _M_xpicto_seq_state = __next_state;
720  }
721 
722  constexpr void
723  _M_update_ri_count(_Gcb_property __p)
724  {
725  if (__p == _Gcb_property::_Gcb_Regional_Indicator)
726  ++_M_RI_count;
727  else
728  _M_RI_count = 0;
729  }
730 
731  constexpr void
732  _M_update_incb_state(char32_t __c, _Gcb_property)
733  {
734  if (__is_incb_linker(__c))
735  _M_incb_linker_seen = true;
736  }
737  };
738 
739  // Split a range into extended grapheme clusters.
740  template<ranges::forward_range _View> requires ranges::view<_View>
741  class _Grapheme_cluster_view
742  : public ranges::view_interface<_Grapheme_cluster_view<_View>>
743  {
744  public:
745 
// Wrap __v in a UTF-32 transcoding view and store a cluster iterator
// positioned at its beginning.
constexpr
_Grapheme_cluster_view(_View __v)
: _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
{ }

// A copy of the stored iterator to the first cluster.
constexpr auto begin() const { return _M_begin; }
// The end of the underlying view, as reported by the stored iterator.
constexpr auto end() const { return _M_begin.end(); }
753 
754  private:
755  struct _Iterator : private _Grapheme_cluster_iterator_base
756  {
757  private:
758  // Iterator over the underlying code points.
759  using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
760 
761  public:
762  // TODO: Change value_type to be subrange<_U32_iterator> instead?
763  // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
764  // That would be the whole cluster, not just the first code point.
765  // Would need to store two iterators and find end of current cluster
766  // on increment, so operator* returns value_type(_M_base, _M_next).
767  using value_type = char32_t;
768  using iterator_concept = forward_iterator_tag;
769  using difference_type = ptrdiff_t;
770 
771  constexpr
772  _Iterator(_U32_iterator __i)
773  : _M_base(__i)
774  {
775  if (__i != __i.end())
776  {
777  _M_c = *__i;
778  _M_prop = __grapheme_cluster_break_property(_M_c);
779  }
780  }
781 
// The first code point of the current extended grapheme cluster.
constexpr value_type
operator*() const
{ return _M_c; }

// Pointer to the stored first code point of the current cluster.
constexpr auto
operator->() const
{ return &_M_c; }
790 
791  // Move to the next extended grapheme cluster.
792  constexpr _Iterator&
793  operator++()
794  {
795  const auto __end = _M_base.end();
796  if (_M_base != __end)
797  {
798  auto __p_prev = _M_prop;
799  auto __it = _M_base;
800  while (++__it != __end)
801  {
802  char32_t __c = *__it;
803  auto __p = __grapheme_cluster_break_property(*__it);
804  _M_update_xpicto_seq_state(__c, __p);
805  _M_update_ri_count(__p);
806  _M_update_incb_state(__c, __p);
807  if (_M_is_break(__p_prev, __p, __it))
808  {
809  // Found a grapheme cluster break
810  _M_reset(__c, __p);
811  break;
812  }
813  __p_prev = __p;
814  }
815  _M_base = __it;
816  }
817  return *this;
818  }
819 
820  constexpr _Iterator
821  operator++(int)
822  {
823  auto __tmp = *this;
824  ++*this;
825  return __tmp;
826  }
827 
// Two cluster iterators are equal when their code point iterators are.
constexpr bool
operator==(const _Iterator& __i) const
{ return _M_base == __i._M_base; }

// This supports iter != iter.end()
constexpr bool
operator==(const ranges::sentinel_t<_View>& __i) const
{ return _M_base == __i; }

// Iterator to the start of the current cluster.
constexpr auto base() const { return _M_base.base(); }

// The end of the underlying view (not the end of the current cluster!)
constexpr auto end() const { return _M_base.end(); }

// Field width of the first code point in the cluster.
constexpr int
width() const noexcept
{ return __field_width(_M_c); }
847 
848  private:
849  _U32_iterator _M_base;
850 
// Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
// http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
// This implements the rules from TR29 revision 43 in Unicode 15.1.0.
// Return true if there is a break between code point with property p1
// and code point with property p2.
// __curr refers to the code point with property p2.  The rules are
// tested in order and the first one that applies decides the result.
// The per-cluster state (_M_xpicto_seq_state, _M_RI_count,
// _M_incb_linker_seen) has already been updated for *__curr by the
// caller.
constexpr bool
_M_is_break(_Gcb_property __p1, _Gcb_property __p2,
            _U32_iterator __curr) const
{
  using enum _Gcb_property;

  if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
    return true; // Break after Control or LF.

  if (__p1 == _Gcb_CR)
    return __p2 != _Gcb_LF; // Do not break between a CR and LF.

  // Rule GB5
  if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
    return true; // Break before Control, CR or LF.

  // Rule GB6
  if (__p1 == _Gcb_L)
    switch (__p2)
      {
      case _Gcb_L:
      case _Gcb_V:
      case _Gcb_LV:
      case _Gcb_LVT:
        return false; // Do not break Hangul syllable sequences.
      default:
        return true;
      }

  // Rule GB7
  if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
    switch (__p2)
      {
      case _Gcb_V:
      case _Gcb_T:
        return false; // Do not break Hangul syllable sequences.
      default:
        return true;
      }

  // Rule GB8
  if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
    return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.

  // Rule GB9
  if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
    return false; // Do not break before extending characters or ZWJ.

  // The following GB9x rules only apply to extended grapheme clusters,
  // which is what the C++ standard uses (not legacy grapheme clusters).

  // Rule GB9a
  if (__p2 == _Gcb_SpacingMark)
    return false; // Do not break before SpacingMarks,
  // Rule GB9b
  if (__p1 == _Gcb_Prepend)
    return false; // or after Prepend characters.

  // Rule GB9c (Unicode 15.1.0)
  // Do not break within certain combinations with
  // Indic_Conjunct_Break (InCB)=Linker.
  // The scan below is only attempted when a linker has been seen in this
  // cluster and both the cluster's first code point and the current one
  // are consonants.
  if (_M_incb_linker_seen
        && __incb_property(_M_c) == _InCB::_Consonant
        && __incb_property(*__curr) == _InCB::_Consonant)
    {
      // Match [_M_base, __curr] against regular expression
      // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
      bool __have_linker = false;
      auto __it = _M_base;
      while (++__it != __curr)
        {
          if (__is_incb_linker(*__it))
            __have_linker = true;
          else
            {
              auto __incb = __incb_property(*__it);
              if (__incb == _InCB::_Consonant)
                __have_linker = false; // a new consonant needs its own linker
              else if (__incb != _InCB::_Extend)
                break; // anything else ends the match
            }
        }
      // Matched only if the scan reached __curr with a linker since the
      // last consonant.
      if (__it == __curr && __have_linker)
        return false;
    }

  // Rule GB11
  // Do not break within emoji modifier sequences
  // or emoji zwj sequences.
  if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
    return false;

  // Rules GB12 and GB13
  // Do not break within emoji flag sequences. That is, do not break
  // between regional indicator (RI) symbols if there is an odd number
  // of RI characters before the break point.
  if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
    return (_M_RI_count & 1) == 0;

  // Rule GB999
  return true; // Otherwise, break everywhere.
}
958  };
959 
960  _Iterator _M_begin;
961  };
962 
963 } // namespace __v16_0_0
964 
965  // Return the field width of a string.
966  template<typename _CharT>
967  constexpr size_t
968  __field_width(basic_string_view<_CharT> __s)
969  {
970  if (__s.empty()) [[unlikely]]
971  return 0;
972  _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
973  auto __it = __gc.begin();
974  const auto __end = __gc.end();
975  size_t __n = __it.width();
976  while (++__it != __end)
977  __n += __it.width();
978  return __n;
979  }
980 
981  // Truncate a string to at most `__max` field width units, and return the
982  // resulting field width.
983  template<typename _CharT>
984  constexpr size_t
985  __truncate(basic_string_view<_CharT>& __s, size_t __max)
986  {
987  if (__s.empty()) [[unlikely]]
988  return 0;
989 
990  _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
991  auto __it = __gc.begin();
992  const auto __end = __gc.end();
993  size_t __n = __it.width();
994  if (__n > __max)
995  {
996  __s = {};
997  return 0;
998  }
999  while (++__it != __end)
1000  {
1001  size_t __n2 = __n + __it.width();
1002  if (__n2 > __max)
1003  {
1004  __s = basic_string_view<_CharT>(__s.begin(), __it.base());
1005  return __n;
1006  }
1007  __n = __n2;
1008  }
1009  return __n;
1010  }
1011 
// Whether literals of character type _CharT use a Unicode encoding.
// Always true for char16_t, char32_t and (when available) char8_t.  For
// other types the answer comes from the compiler's predefined charset
// name macros; if neither GCC's nor Clang's macros are defined the
// result is false.
template<typename _CharT>
  consteval bool
  __literal_encoding_is_unicode()
  {
    if constexpr (is_same_v<_CharT, char16_t>)
      return true;
    else if constexpr (is_same_v<_CharT, char32_t>)
      return true;
#ifdef __cpp_char8_t
    else if constexpr (is_same_v<_CharT, char8_t>)
      return true;
#endif

    const char* __enc = "";

#ifdef __GNUC_EXECUTION_CHARSET_NAME
    // Skip a leading "ISO-10646/" (10 characters), if present.
    auto __remove_iso10646_prefix = [](const char* __s) {
      // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
      if (__s[0] == 'I' || __s[0] == 'i')
        if (__s[1] == 'S' || __s[1] == 's')
          if (__s[2] == 'O' || __s[2] == 'o')
            if (string_view(__s + 3).starts_with("-10646/"))
              return __s + 10;
      return __s;
    };

    // Narrow charset for char, wide charset (if known) for anything else.
    if constexpr (is_same_v<_CharT, char>)
      __enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
    else
      __enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
# endif

    // Accept "UTF" in any case, followed by an optional '-'.
    if ((__enc[0] == 'U' || __enc[0] == 'u')
          && (__enc[1] == 'T' || __enc[1] == 't')
          && (__enc[2] == 'F' || __enc[2] == 'f'))
      {
        __enc += 3;
        if (__enc[0] == '-')
          ++__enc;
        if (__enc[0] == '8')
          // "8" alone or "8//".
          return __enc[1] == '\0' || string_view(__enc + 1) == "//";
        else if constexpr (!is_same_v<_CharT, char>)
          {
            // "16" or "32", with optional "LE"/"BE" and optional "//".
            string_view __s(__enc);
            if (__s.ends_with("//"))
              __s.remove_suffix(2);
            if (__s.ends_with("LE") || __s.ends_with("BE"))
              __s.remove_suffix(2);
            return __s == "16" || __s == "32";
          }
      }
#elif defined __clang_literal_encoding__
    if constexpr (is_same_v<_CharT, char>)
      __enc = __clang_literal_encoding__;
# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
    else
      __enc = __clang_wide_literal_encoding__;
# endif
    // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
    string_view __s(__enc);
    if (__s == "UTF-8")
      return true;
    else if constexpr (!is_same_v<_CharT, char>)
      return __s == "UTF-16" || __s == "UTF-32";
#endif

    return false;
  }
1081 
// Whether ordinary (char) literals use a Unicode encoding, as determined
// by __literal_encoding_is_unicode<char>().
consteval bool
__literal_encoding_is_utf8()
{ return __literal_encoding_is_unicode<char>(); }

// Whether the characters '0', 'A', 'Z', 'a' and 'z' have their ASCII
// values in the ordinary literal encoding.
consteval bool
__literal_encoding_is_extended_ascii()
{
  return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
           && 'a' == 0x61 && 'z' == 0x7a;
}
1092 
1093  // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1094  constexpr bool
1095  __charset_alias_match(string_view __a, string_view __b)
1096  {
1097  // Map alphanumeric chars to their base 64 value, everything else to 127.
1098  auto __map = [](char __c, bool& __num) -> unsigned char {
1099  if (__c == '0') [[unlikely]]
1100  return __num ? 0 : 127;
1101  const auto __v = __detail::__from_chars_alnum_to_val(__c);
1102  __num = __v < 10;
1103  return __v;
1104  };
1105 
1106  auto __ptr_a = __a.begin(), __end_a = __a.end();
1107  auto __ptr_b = __b.begin(), __end_b = __b.end();
1108  bool __num_a = false, __num_b = false;
1109 
1110  while (true)
1111  {
1112  // Find the value of the next alphanumeric character in each string.
1113  unsigned char __val_a{}, __val_b{};
1114  while (__ptr_a != __end_a
1115  && (__val_a = __map(*__ptr_a, __num_a)) == 127)
1116  ++__ptr_a;
1117  while (__ptr_b != __end_b
1118  && (__val_b = __map(*__ptr_b, __num_b)) == 127)
1119  ++__ptr_b;
1120  // Stop when we reach the end of a string, or get a mismatch.
1121  if (__ptr_a == __end_a)
1122  return __ptr_b == __end_b;
1123  else if (__ptr_b == __end_b)
1124  return false;
1125  else if (__val_a != __val_b)
1126  return false; // Found non-matching characters.
1127  ++__ptr_a;
1128  ++__ptr_b;
1129  }
1130  return true;
1131  }
1132 
1133 } // namespace __unicode
1134 
1135 namespace ranges
1136 {
// A transcoding view is a borrowed range exactly when the range it wraps
// is one.
template<typename _To, typename _Range>
  inline constexpr bool
  enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
    = enable_borrowed_range<_Range>;

// Likewise for a grapheme cluster view.
template<typename _Range>
  inline constexpr bool
  enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
    = enable_borrowed_range<_Range>;
1146 } // namespace ranges
1147 
1148 _GLIBCXX_END_NAMESPACE_VERSION
1149 } // namespace std
1150 #endif // C++20
1151 #endif // _GLIBCXX_UNICODE_H
constexpr complex< _Tp > operator*(const complex< _Tp > &__x, const complex< _Tp > &__y)
Return new complex value x times y.
Definition: complex:434
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition: move.h:138
constexpr _Tp && forward(typename std::remove_reference< _Tp >::type &__t) noexcept
Forward an lvalue.
Definition: move.h:72
_Tp * end(valarray< _Tp > &__va) noexcept
Return an iterator pointing to one past the last element of the valarray.
Definition: valarray:1251
_Tp * begin(valarray< _Tp > &__va) noexcept
Return an iterator pointing to the first element of the valarray.
Definition: valarray:1229
ISO C++ entities toplevel namespace is std.
constexpr auto empty(const _Container &__cont) noexcept(noexcept(__cont.empty())) -> decltype(__cont.empty())
Return whether a container is empty.
Definition: range_access.h:294
constexpr void advance(_InputIterator &__i, _Distance __n)
A generalization of pointer arithmetic.
GNU extensions for public use.
__numeric_traits_integer< _Tp > __int_traits
Convenience alias for __numeric_traits<integer-type>.