UnicodeTest.cpp 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. /*
  2. * Copyright 2018-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <folly/Unicode.h>
  17. #include <initializer_list>
  18. #include <stdexcept>
  19. #include <folly/Range.h>
  20. #include <folly/portability/GTest.h>
  21. using folly::utf8ToCodePoint;
  22. void testValid(std::initializer_list<unsigned char> data, char32_t expected) {
  23. {
  24. const unsigned char* p = data.begin();
  25. const unsigned char* e = data.end();
  26. EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ false), expected)
  27. << folly::StringPiece(
  28. (const char*)data.begin(), (const char*)data.end());
  29. }
  30. {
  31. const unsigned char* p = data.begin();
  32. const unsigned char* e = data.end();
  33. EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ true), expected)
  34. << folly::StringPiece(
  35. (const char*)data.begin(), (const char*)data.end());
  36. }
  37. }
  38. void testInvalid(std::initializer_list<unsigned char> data) {
  39. {
  40. const unsigned char* p = data.begin();
  41. const unsigned char* e = data.end();
  42. EXPECT_THROW(
  43. utf8ToCodePoint(p, e, /* skipOnError */ false), std::runtime_error)
  44. << folly::StringPiece(
  45. (const char*)data.begin(), (const char*)data.end());
  46. }
  47. {
  48. const unsigned char* p = data.begin();
  49. const unsigned char* e = data.end();
  50. EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ true), 0xfffd)
  51. << folly::StringPiece(
  52. (const char*)data.begin(), (const char*)data.end());
  53. }
  54. }
  55. TEST(InvalidUtf8ToCodePoint, rfc3629Overlong) {
  56. // https://tools.ietf.org/html/rfc3629
  57. // Implementations of the decoding algorithm above MUST protect against
  58. // decoding invalid sequences. For instance, a naive implementation may
  59. // decode the overlong UTF-8 sequence C0 80 into the character U+0000 [...]
  60. // Decoding invalid sequences may have security consequences or cause other
  61. // problems.
  62. testInvalid({0xC0, 0x80});
  63. }
  64. TEST(InvalidUtf8ToCodePoint, rfc3629SurrogatePair) {
  65. // https://tools.ietf.org/html/rfc3629
  66. // Implementations of the decoding algorithm above MUST protect against
  67. // decoding invalid sequences. For instance, a naive implementation may
  68. // decode [...] the surrogate pair ED A1 8C ED BE B4 into U+233B4.
  69. // Decoding invalid sequences may have security consequences or cause other
  70. // problems.
  71. testInvalid({0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4});
  72. }
  73. TEST(InvalidUtf8ToCodePoint, MarkusKuhnSingleUTF16Surrogates) {
  74. // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
  75. // 5.1.1 U+D800 = ed a0 80
  76. // 5.1.2 U+DB7F = ed ad bf
  77. // 5.1.3 U+DB80 = ed ae 80
  78. // 5.1.4 U+DBFF = ed af bf
  79. // 5.1.5 U+DC00 = ed b0 80
  80. // 5.1.6 U+DF80 = ed be 80
  81. // 5.1.7 U+DFFF = ed bf bf
  82. testInvalid({0xed, 0xa0, 0x80});
  83. testInvalid({0xed, 0xad, 0xbf});
  84. testInvalid({0xed, 0xae, 0x80});
  85. testInvalid({0xed, 0xaf, 0xbf});
  86. testInvalid({0xed, 0xb0, 0x80});
  87. testInvalid({0xed, 0xbe, 0x80});
  88. testInvalid({0xed, 0xbf, 0xbf});
  89. }
  90. TEST(InvalidUtf8ToCodePoint, MarkusKuhnPairedUTF16Surrogates) {
  91. // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
  92. // 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80
  93. // 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf
  94. // 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80
  95. // 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf
  96. // 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80
  97. // 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf
  98. // 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80
  99. // 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf
  100. testInvalid({0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80});
  101. testInvalid({0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf});
  102. testInvalid({0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80});
  103. testInvalid({0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf});
  104. testInvalid({0xed, 0xae, 0x80, 0xed, 0xb0, 0x80});
  105. testInvalid({0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf});
  106. testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80});
  107. testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf});
  108. }
  109. TEST(ValidUtf8ToCodePoint, FourCloverLeaf) {
  110. testValid({0xF0, 0x9F, 0x8D, 0x80}, 0x1F340); // u8"\U0001F340";
  111. }
  112. TEST(InvalidUtf8ToCodePoint, FourCloverLeafAsSurrogates) {
  113. testInvalid({0xd8, 0x3c, 0xdf, 0x40}); // u8"\U0001F340";
  114. }
  115. TEST(ValidUtf8ToCodePoint, LastCodePoint) {
  116. testValid({0xF4, 0x8F, 0xBF, 0xBF}, 0x10FFFF); // u8"\U0010FFFF";
  117. }