123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154 |
- /*
- * Copyright 2011-present Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include <folly/Unicode.h>
- #include <folly/Conv.h>
- namespace folly {
- //////////////////////////////////////////////////////////////////////
/**
 * Encodes a single Unicode code point as a UTF-8 byte sequence.
 *
 * Byte layout by range (x = payload bits of cp, most significant first):
 *   cp <= 0x7F      0xxxxxxx
 *   cp <= 0x7FF     110xxxxx 10xxxxxx
 *   cp <= 0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
 *   cp <= 0x10FFFF  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Values in the surrogate range (0xD800-0xDFFF) are not rejected here; they
 * are emitted with the 3-byte form like any other value <= 0xFFFF.
 *
 * @param cp  code point to encode.
 * @return    the 1-4 byte encoding, or an empty string when cp > 0x10FFFF.
 */
std::string codePointToUtf8(char32_t cp) {
  std::string result;

  // Based on description from http://en.wikipedia.org/wiki/UTF-8.
  //
  // Each byte is assembled at the full width of cp (marker bits OR-ed with
  // the 6-bit payload) and narrowed to char exactly once via static_cast, so
  // no branch depends on an implicit int -> char conversion.
  if (cp <= 0x7f) {
    result.resize(1);
    result[0] = static_cast<char>(cp);
  } else if (cp <= 0x7FF) {
    result.resize(2);
    result[1] = static_cast<char>(0x80 | (0x3f & cp));
    result[0] = static_cast<char>(0xC0 | (cp >> 6));
  } else if (cp <= 0xFFFF) {
    result.resize(3);
    result[2] = static_cast<char>(0x80 | (0x3f & cp));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[0] = static_cast<char>(0xE0 | (cp >> 12));
  } else if (cp <= 0x10FFFF) {
    result.resize(4);
    result[3] = static_cast<char>(0x80 | (0x3f & cp));
    result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
    result[0] = static_cast<char>(0xF0 | (cp >> 18));
  }
  // cp > 0x10FFFF falls through every branch: result stays empty.

  return result;
}
- char32_t utf8ToCodePoint(
- const unsigned char*& p,
- const unsigned char* const e,
- bool skipOnError) {
- /* The following encodings are valid, except for the 5 and 6 byte
- * combinations:
- * 0xxxxxxx
- * 110xxxxx 10xxxxxx
- * 1110xxxx 10xxxxxx 10xxxxxx
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- */
- const auto skip = [&] {
- ++p;
- return U'\ufffd';
- };
- if (p >= e) {
- if (skipOnError) {
- return skip();
- }
- throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
- }
- unsigned char fst = *p;
- if (!(fst & 0x80)) {
- // trivial case
- return *p++;
- }
- static const uint32_t bitMask[] = {
- (1 << 7) - 1,
- (1 << 11) - 1,
- (1 << 16) - 1,
- (1 << 21) - 1,
- };
- // upper control bits are masked out later
- uint32_t d = fst;
- if ((fst & 0xC0) != 0xC0) {
- if (skipOnError) {
- return skip();
- }
- throw std::runtime_error(
- to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
- }
- fst <<= 1;
- for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
- const unsigned char tmp = p[i];
- if ((tmp & 0xC0) != 0x80) {
- if (skipOnError) {
- return skip();
- }
- throw std::runtime_error(to<std::string>(
- "folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
- }
- d = (d << 6) | (tmp & 0x3F);
- fst <<= 1;
- if (!(fst & 0x80)) {
- d &= bitMask[i];
- // overlong, could have been encoded with i bytes
- if ((d & ~bitMask[i - 1]) == 0) {
- if (skipOnError) {
- return skip();
- }
- throw std::runtime_error(
- to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
- }
- // check for surrogates only needed for 3 bytes
- if (i == 2) {
- if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
- if (skipOnError) {
- return skip();
- }
- throw std::runtime_error(
- to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
- }
- }
- p += i + 1;
- return d;
- }
- }
- if (skipOnError) {
- return skip();
- }
- throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
- }
- //////////////////////////////////////////////////////////////////////
- } // namespace folly
|