Unicode.cpp 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. /*
  2. * Copyright 2011-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  #include <folly/Unicode.h>

  #include <stdexcept>
  #include <string>

  #include <folly/Conv.h>
  18. namespace folly {
  19. //////////////////////////////////////////////////////////////////////
  /*
   * Encodes a single code point as a UTF-8 byte sequence.
   *
   * Returns a string of 1 to 4 bytes for cp <= 0x10FFFF. Values above
   * 0x10FFFF cannot be represented and yield an empty string. No check is
   * made for the UTF-16 surrogate range (0xD800-0xDFFF); such values are
   * encoded as ordinary 3-byte sequences.
   *
   * Layout, based on the description at http://en.wikipedia.org/wiki/UTF-8:
   *   cp <= 0x7F      0xxxxxxx
   *   cp <= 0x7FF     110xxxxx 10xxxxxx
   *   cp <= 0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
   *   cp <= 0x10FFFF  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   */
  std::string codePointToUtf8(char32_t cp) {
    // Continuation byte (10xxxxxx) carrying the low six bits of `bits`.
    const auto continuation = [](char32_t bits) {
      return static_cast<char>(0x80 | (0x3f & bits));
    };

    std::string result;

    if (cp <= 0x7f) {
      result.push_back(static_cast<char>(cp));
    } else if (cp <= 0x7FF) {
      result.push_back(static_cast<char>(0xC0 | (cp >> 6)));
      result.push_back(continuation(cp));
    } else if (cp <= 0xFFFF) {
      // The whole OR-expression is cast, matching the other branches, rather
      // than relying on an implicit int -> char narrowing on assignment.
      result.push_back(static_cast<char>(0xE0 | (cp >> 12)));
      result.push_back(continuation(cp >> 6));
      result.push_back(continuation(cp));
    } else if (cp <= 0x10FFFF) {
      result.push_back(static_cast<char>(0xF0 | (cp >> 18)));
      result.push_back(continuation(cp >> 12));
      result.push_back(continuation(cp >> 6));
      result.push_back(continuation(cp));
    }
    // cp > 0x10FFFF: falls through with an empty result.

    return result;
  }
  /*
   * Decodes one UTF-8 encoded code point from the byte range [p, e).
   *
   * On success, p is advanced past the bytes that were consumed and the
   * decoded code point is returned.
   *
   * Errors are: an empty range, a stray continuation byte in lead position,
   * a missing/invalid continuation byte, a truncated sequence, a lead byte
   * announcing more than 4 bytes, an overlong encoding, a UTF-16 surrogate
   * (0xD800-0xDFFF), or a value above 0x10FFFF. On error:
   *   - if skipOnError is true, p is advanced by exactly one byte and
   *     U+FFFD is returned. This also happens when p >= e on entry, so
   *     callers must compare p against e before calling again;
   *   - otherwise std::runtime_error is thrown and p is left unchanged.
   */
  char32_t utf8ToCodePoint(
      const unsigned char*& p,
      const unsigned char* const e,
      bool skipOnError) {
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     * 0xxxxxxx
     * 110xxxxx 10xxxxxx
     * 1110xxxx 10xxxxxx 10xxxxxx
     * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    // Error recovery: step over a single byte and report U+FFFD.
    const auto skip = [&] {
      ++p;
      return U'\ufffd';
    };

    if (p >= e) {
      if (skipOnError) {
        return skip();
      }
      throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
    }

    unsigned char fst = *p;
    if (!(fst & 0x80)) {
      // trivial case: single-byte (ASCII) code point
      return *p++;
    }

    // bitMask[k] keeps the payload bits of a (k + 1)-byte sequence.
    static const uint32_t bitMask[] = {
        (1 << 7) - 1,
        (1 << 11) - 1,
        (1 << 16) - 1,
        (1 << 21) - 1,
    };

    // upper control bits are masked out later
    uint32_t d = fst;

    // A lead byte of a multi-byte sequence must start with the bits 11.
    if ((fst & 0xC0) != 0xC0) {
      if (skipOnError) {
        return skip();
      }
      throw std::runtime_error(
          "folly::utf8ToCodePoint i=0 d=" + std::to_string(d));
    }

    // Each left shift of fst exposes the next length bit of the lead byte in
    // bit 7; the sequence is complete once that bit reads zero.
    fst <<= 1;

    for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
      const unsigned char tmp = p[i];

      // Every trailing byte must have the form 10xxxxxx.
      if ((tmp & 0xC0) != 0x80) {
        if (skipOnError) {
          return skip();
        }
        throw std::runtime_error(
            "folly::utf8ToCodePoint i=" + std::to_string(i) +
            " tmp=" + std::to_string(static_cast<uint32_t>(tmp)));
      }

      d = (d << 6) | (tmp & 0x3F);
      fst <<= 1;

      if (!(fst & 0x80)) {
        d &= bitMask[i];

        // overlong, could have been encoded with i bytes
        if ((d & ~bitMask[i - 1]) == 0) {
          if (skipOnError) {
            return skip();
          }
          throw std::runtime_error(
              "folly::utf8ToCodePoint i=" + std::to_string(i) +
              " d=" + std::to_string(d));
        }

        // Surrogates can only appear in 3-byte sequences. Values above
        // 0x10FFFF can only appear in 4-byte sequences, so the range check
        // must not be restricted to the 3-byte case.
        const bool isSurrogate = (i == 2) && d >= 0xD800 && d <= 0xDFFF;
        if (isSurrogate || d > 0x10FFFF) {
          if (skipOnError) {
            return skip();
          }
          throw std::runtime_error(
              "folly::utf8ToCodePoint i=" + std::to_string(i) +
              " d=" + std::to_string(d));
        }

        p += i + 1;
        return d;
      }
    }

    // Reached when the input ends mid-sequence or when the lead byte asks for
    // more than 4 bytes.
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
  }
  128. //////////////////////////////////////////////////////////////////////
  129. } // namespace folly