json.cpp 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948
  1. /*
  2. * Copyright 2011-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <folly/json.h>
  17. #include <algorithm>
  18. #include <functional>
  19. #include <iterator>
  20. #include <type_traits>
  21. #include <boost/algorithm/string.hpp>
  22. #include <folly/Conv.h>
  23. #include <folly/Portability.h>
  24. #include <folly/Range.h>
  25. #include <folly/String.h>
  26. #include <folly/Unicode.h>
  27. #include <folly/lang/Bits.h>
  28. #include <folly/portability/Constexpr.h>
  29. namespace folly {
  30. //////////////////////////////////////////////////////////////////////
  31. namespace json {
  32. namespace {
  33. struct Printer {
  34. explicit Printer(
  35. std::string& out,
  36. unsigned* indentLevel,
  37. serialization_opts const* opts)
  38. : out_(out), indentLevel_(indentLevel), opts_(*opts) {}
  39. void operator()(dynamic const& v) const {
  40. switch (v.type()) {
  41. case dynamic::DOUBLE:
  42. if (!opts_.allow_nan_inf &&
  43. (std::isnan(v.asDouble()) || std::isinf(v.asDouble()))) {
  44. throw std::runtime_error(
  45. "folly::toJson: JSON object value was a "
  46. "NaN or INF");
  47. }
  48. toAppend(
  49. v.asDouble(), &out_, opts_.double_mode, opts_.double_num_digits);
  50. break;
  51. case dynamic::INT64: {
  52. auto intval = v.asInt();
  53. if (opts_.javascript_safe) {
  54. // Use folly::to to check that this integer can be represented
  55. // as a double without loss of precision.
  56. intval = int64_t(to<double>(intval));
  57. }
  58. toAppend(intval, &out_);
  59. break;
  60. }
  61. case dynamic::BOOL:
  62. out_ += v.asBool() ? "true" : "false";
  63. break;
  64. case dynamic::NULLT:
  65. out_ += "null";
  66. break;
  67. case dynamic::STRING:
  68. escapeString(v.asString(), out_, opts_);
  69. break;
  70. case dynamic::OBJECT:
  71. printObject(v);
  72. break;
  73. case dynamic::ARRAY:
  74. printArray(v);
  75. break;
  76. default:
  77. CHECK(0) << "Bad type " << v.type();
  78. }
  79. }
  80. private:
  81. void printKV(const std::pair<const dynamic, dynamic>& p) const {
  82. if (!opts_.allow_non_string_keys && !p.first.isString()) {
  83. throw std::runtime_error(
  84. "folly::toJson: JSON object key was not a "
  85. "string");
  86. }
  87. (*this)(p.first);
  88. mapColon();
  89. (*this)(p.second);
  90. }
  91. template <typename Iterator>
  92. void printKVPairs(Iterator begin, Iterator end) const {
  93. printKV(*begin);
  94. for (++begin; begin != end; ++begin) {
  95. out_ += ',';
  96. newline();
  97. printKV(*begin);
  98. }
  99. }
  100. void printObject(dynamic const& o) const {
  101. if (o.empty()) {
  102. out_ += "{}";
  103. return;
  104. }
  105. out_ += '{';
  106. indent();
  107. newline();
  108. if (opts_.sort_keys || opts_.sort_keys_by) {
  109. using ref = std::reference_wrapper<decltype(o.items())::value_type const>;
  110. std::vector<ref> refs(o.items().begin(), o.items().end());
  111. using SortByRef = FunctionRef<bool(dynamic const&, dynamic const&)>;
  112. auto const& sort_keys_by = opts_.sort_keys_by
  113. ? SortByRef(opts_.sort_keys_by)
  114. : SortByRef(std::less<dynamic>());
  115. std::sort(refs.begin(), refs.end(), [&](ref a, ref b) {
  116. // Only compare keys. No ordering among identical keys.
  117. return sort_keys_by(a.get().first, b.get().first);
  118. });
  119. printKVPairs(refs.cbegin(), refs.cend());
  120. } else {
  121. printKVPairs(o.items().begin(), o.items().end());
  122. }
  123. outdent();
  124. newline();
  125. out_ += '}';
  126. }
  127. void printArray(dynamic const& a) const {
  128. if (a.empty()) {
  129. out_ += "[]";
  130. return;
  131. }
  132. out_ += '[';
  133. indent();
  134. newline();
  135. (*this)(a[0]);
  136. for (auto& val : range(std::next(a.begin()), a.end())) {
  137. out_ += ',';
  138. newline();
  139. (*this)(val);
  140. }
  141. outdent();
  142. newline();
  143. out_ += ']';
  144. }
  145. private:
  146. void outdent() const {
  147. if (indentLevel_) {
  148. --*indentLevel_;
  149. }
  150. }
  151. void indent() const {
  152. if (indentLevel_) {
  153. ++*indentLevel_;
  154. }
  155. }
  156. void newline() const {
  157. if (indentLevel_) {
  158. out_ += to<std::string>('\n', std::string(*indentLevel_ * 2, ' '));
  159. }
  160. }
  161. void mapColon() const {
  162. out_ += indentLevel_ ? ": " : ":";
  163. }
  164. private:
  165. std::string& out_;
  166. unsigned* const indentLevel_;
  167. serialization_opts const& opts_;
  168. };
  169. //////////////////////////////////////////////////////////////////////
  170. struct FOLLY_EXPORT ParseError : std::runtime_error {
  171. explicit ParseError(
  172. unsigned int line,
  173. std::string const& context,
  174. std::string const& expected)
  175. : std::runtime_error(to<std::string>(
  176. "json parse error on line ",
  177. line,
  178. !context.empty() ? to<std::string>(" near `", context, '\'') : "",
  179. ": ",
  180. expected)) {}
  181. };
  182. // Wraps our input buffer with some helper functions.
  183. struct Input {
  184. explicit Input(StringPiece range, json::serialization_opts const* opts)
  185. : range_(range), opts_(*opts), lineNum_(0) {
  186. storeCurrent();
  187. }
  188. Input(Input const&) = delete;
  189. Input& operator=(Input const&) = delete;
  190. char const* begin() const {
  191. return range_.begin();
  192. }
  193. // Parse ahead for as long as the supplied predicate is satisfied,
  194. // returning a range of what was skipped.
  195. template <class Predicate>
  196. StringPiece skipWhile(const Predicate& p) {
  197. std::size_t skipped = 0;
  198. for (; skipped < range_.size(); ++skipped) {
  199. if (!p(range_[skipped])) {
  200. break;
  201. }
  202. if (range_[skipped] == '\n') {
  203. ++lineNum_;
  204. }
  205. }
  206. auto ret = range_.subpiece(0, skipped);
  207. range_.advance(skipped);
  208. storeCurrent();
  209. return ret;
  210. }
  211. StringPiece skipDigits() {
  212. return skipWhile([](char c) { return c >= '0' && c <= '9'; });
  213. }
  214. StringPiece skipMinusAndDigits() {
  215. bool firstChar = true;
  216. return skipWhile([&firstChar](char c) {
  217. bool result = (c >= '0' && c <= '9') || (firstChar && c == '-');
  218. firstChar = false;
  219. return result;
  220. });
  221. }
  222. void skipWhitespace() {
  223. range_ = folly::skipWhitespace(range_);
  224. storeCurrent();
  225. }
  226. void expect(char c) {
  227. if (**this != c) {
  228. throw ParseError(
  229. lineNum_, context(), to<std::string>("expected '", c, '\''));
  230. }
  231. ++*this;
  232. }
  233. std::size_t size() const {
  234. return range_.size();
  235. }
  236. int operator*() const {
  237. return current_;
  238. }
  239. void operator++() {
  240. range_.pop_front();
  241. storeCurrent();
  242. }
  243. template <class T>
  244. T extract() {
  245. try {
  246. return to<T>(&range_);
  247. } catch (std::exception const& e) {
  248. error(e.what());
  249. }
  250. }
  251. bool consume(StringPiece str) {
  252. if (boost::starts_with(range_, str)) {
  253. range_.advance(str.size());
  254. storeCurrent();
  255. return true;
  256. }
  257. return false;
  258. }
  259. std::string context() const {
  260. return range_.subpiece(0, 16 /* arbitrary */).toString();
  261. }
  262. dynamic error(char const* what) const {
  263. throw ParseError(lineNum_, context(), what);
  264. }
  265. json::serialization_opts const& getOpts() {
  266. return opts_;
  267. }
  268. void incrementRecursionLevel() {
  269. if (currentRecursionLevel_ > opts_.recursion_limit) {
  270. error("recursion limit exceeded");
  271. }
  272. currentRecursionLevel_++;
  273. }
  274. void decrementRecursionLevel() {
  275. currentRecursionLevel_--;
  276. }
  277. private:
  278. void storeCurrent() {
  279. current_ = range_.empty() ? EOF : range_.front();
  280. }
  281. private:
  282. StringPiece range_;
  283. json::serialization_opts const& opts_;
  284. unsigned lineNum_;
  285. int current_;
  286. unsigned int currentRecursionLevel_{0};
  287. };
  288. class RecursionGuard {
  289. public:
  290. explicit RecursionGuard(Input& in) : in_(in) {
  291. in_.incrementRecursionLevel();
  292. }
  293. ~RecursionGuard() {
  294. in_.decrementRecursionLevel();
  295. }
  296. private:
  297. Input& in_;
  298. };
  299. dynamic parseValue(Input& in);
  300. std::string parseString(Input& in);
  301. dynamic parseNumber(Input& in);
  302. dynamic parseObject(Input& in) {
  303. DCHECK_EQ(*in, '{');
  304. ++in;
  305. dynamic ret = dynamic::object;
  306. in.skipWhitespace();
  307. if (*in == '}') {
  308. ++in;
  309. return ret;
  310. }
  311. for (;;) {
  312. if (in.getOpts().allow_trailing_comma && *in == '}') {
  313. break;
  314. }
  315. if (*in == '\"') { // string
  316. auto key = parseString(in);
  317. in.skipWhitespace();
  318. in.expect(':');
  319. in.skipWhitespace();
  320. ret.insert(std::move(key), parseValue(in));
  321. } else if (!in.getOpts().allow_non_string_keys) {
  322. in.error("expected string for object key name");
  323. } else {
  324. auto key = parseValue(in);
  325. in.skipWhitespace();
  326. in.expect(':');
  327. in.skipWhitespace();
  328. ret.insert(std::move(key), parseValue(in));
  329. }
  330. in.skipWhitespace();
  331. if (*in != ',') {
  332. break;
  333. }
  334. ++in;
  335. in.skipWhitespace();
  336. }
  337. in.expect('}');
  338. return ret;
  339. }
  340. dynamic parseArray(Input& in) {
  341. DCHECK_EQ(*in, '[');
  342. ++in;
  343. dynamic ret = dynamic::array;
  344. in.skipWhitespace();
  345. if (*in == ']') {
  346. ++in;
  347. return ret;
  348. }
  349. for (;;) {
  350. if (in.getOpts().allow_trailing_comma && *in == ']') {
  351. break;
  352. }
  353. ret.push_back(parseValue(in));
  354. in.skipWhitespace();
  355. if (*in != ',') {
  356. break;
  357. }
  358. ++in;
  359. in.skipWhitespace();
  360. }
  361. in.expect(']');
  362. return ret;
  363. }
  364. dynamic parseNumber(Input& in) {
  365. bool const negative = (*in == '-');
  366. if (negative && in.consume("-Infinity")) {
  367. if (in.getOpts().parse_numbers_as_strings) {
  368. return "-Infinity";
  369. } else {
  370. return -std::numeric_limits<double>::infinity();
  371. }
  372. }
  373. auto integral = in.skipMinusAndDigits();
  374. if (negative && integral.size() < 2) {
  375. in.error("expected digits after `-'");
  376. }
  377. auto const wasE = *in == 'e' || *in == 'E';
  378. constexpr const char* maxInt = "9223372036854775807";
  379. constexpr const char* minInt = "-9223372036854775808";
  380. constexpr auto maxIntLen = constexpr_strlen(maxInt);
  381. constexpr auto minIntLen = constexpr_strlen(minInt);
  382. if (*in != '.' && !wasE && in.getOpts().parse_numbers_as_strings) {
  383. return integral;
  384. }
  385. if (*in != '.' && !wasE) {
  386. if (LIKELY(!in.getOpts().double_fallback || integral.size() < maxIntLen) ||
  387. (!negative && integral.size() == maxIntLen && integral <= maxInt) ||
  388. (negative && integral.size() == minIntLen && integral <= minInt)) {
  389. auto val = to<int64_t>(integral);
  390. in.skipWhitespace();
  391. return val;
  392. } else {
  393. auto val = to<double>(integral);
  394. in.skipWhitespace();
  395. return val;
  396. }
  397. }
  398. auto end = !wasE ? (++in, in.skipDigits().end()) : in.begin();
  399. if (*in == 'e' || *in == 'E') {
  400. ++in;
  401. if (*in == '+' || *in == '-') {
  402. ++in;
  403. }
  404. auto expPart = in.skipDigits();
  405. end = expPart.end();
  406. }
  407. auto fullNum = range(integral.begin(), end);
  408. if (in.getOpts().parse_numbers_as_strings) {
  409. return fullNum;
  410. }
  411. auto val = to<double>(fullNum);
  412. return val;
  413. }
  414. std::string decodeUnicodeEscape(Input& in) {
  415. auto hexVal = [&](int c) -> uint16_t {
  416. // clang-format off
  417. return uint16_t(
  418. c >= '0' && c <= '9' ? c - '0' :
  419. c >= 'a' && c <= 'f' ? c - 'a' + 10 :
  420. c >= 'A' && c <= 'F' ? c - 'A' + 10 :
  421. (in.error("invalid hex digit"), 0));
  422. // clang-format on
  423. };
  424. auto readHex = [&]() -> uint16_t {
  425. if (in.size() < 4) {
  426. in.error("expected 4 hex digits");
  427. }
  428. uint16_t ret = uint16_t(hexVal(*in) * 4096);
  429. ++in;
  430. ret += hexVal(*in) * 256;
  431. ++in;
  432. ret += hexVal(*in) * 16;
  433. ++in;
  434. ret += hexVal(*in);
  435. ++in;
  436. return ret;
  437. };
  438. /*
  439. * If the value encoded is in the surrogate pair range, we need to
  440. * make sure there is another escape that we can use also.
  441. */
  442. uint32_t codePoint = readHex();
  443. if (codePoint >= 0xd800 && codePoint <= 0xdbff) {
  444. if (!in.consume("\\u")) {
  445. in.error(
  446. "expected another unicode escape for second half of "
  447. "surrogate pair");
  448. }
  449. uint16_t second = readHex();
  450. if (second >= 0xdc00 && second <= 0xdfff) {
  451. codePoint = 0x10000 + ((codePoint & 0x3ff) << 10) + (second & 0x3ff);
  452. } else {
  453. in.error("second character in surrogate pair is invalid");
  454. }
  455. } else if (codePoint >= 0xdc00 && codePoint <= 0xdfff) {
  456. in.error("invalid unicode code point (in range [0xdc00,0xdfff])");
  457. }
  458. return codePointToUtf8(codePoint);
  459. }
  460. std::string parseString(Input& in) {
  461. DCHECK_EQ(*in, '\"');
  462. ++in;
  463. std::string ret;
  464. for (;;) {
  465. auto range = in.skipWhile([](char c) { return c != '\"' && c != '\\'; });
  466. ret.append(range.begin(), range.end());
  467. if (*in == '\"') {
  468. ++in;
  469. break;
  470. }
  471. if (*in == '\\') {
  472. ++in;
  473. switch (*in) {
  474. // clang-format off
  475. case '\"': ret.push_back('\"'); ++in; break;
  476. case '\\': ret.push_back('\\'); ++in; break;
  477. case '/': ret.push_back('/'); ++in; break;
  478. case 'b': ret.push_back('\b'); ++in; break;
  479. case 'f': ret.push_back('\f'); ++in; break;
  480. case 'n': ret.push_back('\n'); ++in; break;
  481. case 'r': ret.push_back('\r'); ++in; break;
  482. case 't': ret.push_back('\t'); ++in; break;
  483. case 'u': ++in; ret += decodeUnicodeEscape(in); break;
  484. // clang-format on
  485. default:
  486. in.error(
  487. to<std::string>("unknown escape ", *in, " in string").c_str());
  488. }
  489. continue;
  490. }
  491. if (*in == EOF) {
  492. in.error("unterminated string");
  493. }
  494. if (!*in) {
  495. /*
  496. * Apparently we're actually supposed to ban all control
  497. * characters from strings. This seems unnecessarily
  498. * restrictive, so we're only banning zero bytes. (Since the
  499. * string is presumed to be UTF-8 encoded it's fine to just
  500. * check this way.)
  501. */
  502. in.error("null byte in string");
  503. }
  504. ret.push_back(char(*in));
  505. ++in;
  506. }
  507. return ret;
  508. }
  509. dynamic parseValue(Input& in) {
  510. RecursionGuard guard(in);
  511. in.skipWhitespace();
  512. // clang-format off
  513. return
  514. *in == '[' ? parseArray(in) :
  515. *in == '{' ? parseObject(in) :
  516. *in == '\"' ? parseString(in) :
  517. (*in == '-' || (*in >= '0' && *in <= '9')) ? parseNumber(in) :
  518. in.consume("true") ? true :
  519. in.consume("false") ? false :
  520. in.consume("null") ? nullptr :
  521. in.consume("Infinity") ?
  522. (in.getOpts().parse_numbers_as_strings ? (dynamic)"Infinity" :
  523. (dynamic)std::numeric_limits<double>::infinity()) :
  524. in.consume("NaN") ?
  525. (in.getOpts().parse_numbers_as_strings ? (dynamic)"NaN" :
  526. (dynamic)std::numeric_limits<double>::quiet_NaN()) :
  527. in.error("expected json value");
  528. // clang-format on
  529. }
  530. } // namespace
  531. //////////////////////////////////////////////////////////////////////
  532. std::array<uint64_t, 2> buildExtraAsciiToEscapeBitmap(StringPiece chars) {
  533. std::array<uint64_t, 2> escapes{{0, 0}};
  534. for (auto b : ByteRange(chars)) {
  535. if (b >= 0x20 && b < 0x80) {
  536. escapes[b / 64] |= uint64_t(1) << (b % 64);
  537. }
  538. }
  539. return escapes;
  540. }
  541. std::string serialize(dynamic const& dyn, serialization_opts const& opts) {
  542. std::string ret;
  543. unsigned indentLevel = 0;
  544. Printer p(ret, opts.pretty_formatting ? &indentLevel : nullptr, &opts);
  545. p(dyn);
  546. return ret;
  547. }
  548. // Fast path to determine the longest prefix that can be left
  549. // unescaped in a string of sizeof(T) bytes packed in an integer of
  550. // type T.
  551. template <bool EnableExtraAsciiEscapes, class T>
  552. size_t firstEscapableInWord(T s, const serialization_opts& opts) {
  553. static_assert(std::is_unsigned<T>::value, "Unsigned integer required");
  554. static constexpr T kOnes = ~T() / 255; // 0x...0101
  555. static constexpr T kMsbs = kOnes * 0x80; // 0x...8080
  556. // Sets the MSB of bytes < b. Precondition: b < 128.
  557. auto isLess = [](T w, uint8_t b) {
  558. // A byte is < b iff subtracting b underflows, so we check that
  559. // the MSB wasn't set before and it's set after the subtraction.
  560. return (w - kOnes * b) & ~w & kMsbs;
  561. };
  562. auto isChar = [&](uint8_t c) {
  563. // A byte is == c iff it is 0 if xored with c.
  564. return isLess(s ^ (kOnes * c), 1);
  565. };
  566. // The following masks have the MSB set for each byte of the word
  567. // that satisfies the corresponding condition.
  568. auto isHigh = s & kMsbs; // >= 128
  569. auto isLow = isLess(s, 0x20); // <= 0x1f
  570. auto needsEscape = isHigh | isLow | isChar('\\') | isChar('"');
  571. if /* constexpr */ (EnableExtraAsciiEscapes) {
  572. // Deal with optional bitmap for unicode escapes. Escapes can optionally be
  573. // set for ascii characters 32 - 127, so the inner loop may run up to 96
  574. // times. However, for the case where 0 or a handful of bits are set,
  575. // looping will be minimal through use of findFirstSet.
  576. for (size_t i = 0; i < opts.extra_ascii_to_escape_bitmap.size(); ++i) {
  577. const auto offset = i * 64;
  578. // Clear first 32 characters if this is the first index, since those are
  579. // always escaped.
  580. auto bitmap = opts.extra_ascii_to_escape_bitmap[i] &
  581. (i == 0 ? uint64_t(-1) << 32 : ~0UL);
  582. while (bitmap) {
  583. auto bit = folly::findFirstSet(bitmap);
  584. needsEscape |= isChar(offset + bit - 1);
  585. bitmap &= bitmap - 1;
  586. }
  587. }
  588. }
  589. if (!needsEscape) {
  590. return sizeof(T);
  591. }
  592. if (folly::kIsLittleEndian) {
  593. return folly::findFirstSet(needsEscape) / 8 - 1;
  594. } else {
  595. return sizeof(T) - folly::findLastSet(needsEscape) / 8;
  596. }
  597. }
  598. // Escape a string so that it is legal to print it in JSON text.
  599. template <bool EnableExtraAsciiEscapes>
  600. void escapeStringImpl(
  601. StringPiece input,
  602. std::string& out,
  603. const serialization_opts& opts) {
  604. auto hexDigit = [](uint8_t c) -> char {
  605. return c < 10 ? c + '0' : c - 10 + 'a';
  606. };
  607. out.push_back('\"');
  608. auto* p = reinterpret_cast<const unsigned char*>(input.begin());
  609. auto* q = reinterpret_cast<const unsigned char*>(input.begin());
  610. auto* e = reinterpret_cast<const unsigned char*>(input.end());
  611. while (p < e) {
  612. // Find the longest prefix that does not need escaping, and copy
  613. // it literally into the output string.
  614. auto firstEsc = p;
  615. while (firstEsc < e) {
  616. auto avail = e - firstEsc;
  617. uint64_t word = 0;
  618. if (avail >= 8) {
  619. word = folly::loadUnaligned<uint64_t>(firstEsc);
  620. } else {
  621. word = folly::partialLoadUnaligned<uint64_t>(firstEsc, avail);
  622. }
  623. auto prefix = firstEscapableInWord<EnableExtraAsciiEscapes>(word, opts);
  624. DCHECK_LE(prefix, avail);
  625. firstEsc += prefix;
  626. if (prefix < 8) {
  627. break;
  628. }
  629. }
  630. if (firstEsc > p) {
  631. out.append(reinterpret_cast<const char*>(p), firstEsc - p);
  632. p = firstEsc;
  633. // We can't be in the middle of a multibyte sequence, so we can reset q.
  634. q = p;
  635. if (p == e) {
  636. break;
  637. }
  638. }
  639. // Handle the next byte that may need escaping.
  640. // Since non-ascii encoding inherently does utf8 validation
  641. // we explicitly validate utf8 only if non-ascii encoding is disabled.
  642. if ((opts.validate_utf8 || opts.skip_invalid_utf8) &&
  643. !opts.encode_non_ascii) {
  644. // To achieve better spatial and temporal coherence
  645. // we do utf8 validation progressively along with the
  646. // string-escaping instead of two separate passes.
  647. // As the encoding progresses, q will stay at or ahead of p.
  648. CHECK_GE(q, p);
  649. // As p catches up with q, move q forward.
  650. if (q == p) {
  651. // calling utf8_decode has the side effect of
  652. // checking that utf8 encodings are valid
  653. char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8);
  654. if (opts.skip_invalid_utf8 && v == U'\ufffd') {
  655. out.append(u8"\ufffd");
  656. p = q;
  657. continue;
  658. }
  659. }
  660. }
  661. auto encodeUnicode = opts.encode_non_ascii && (*p & 0x80);
  662. if /* constexpr */ (EnableExtraAsciiEscapes) {
  663. encodeUnicode = encodeUnicode ||
  664. (*p >= 0x20 && *p < 0x80 &&
  665. (opts.extra_ascii_to_escape_bitmap[*p / 64] &
  666. (uint64_t(1) << (*p % 64))));
  667. }
  668. if (encodeUnicode) {
  669. // note that this if condition captures utf8 chars
  670. // with value > 127, so size > 1 byte (or they are whitelisted for
  671. // Unicode encoding).
  672. // NOTE: char32_t / char16_t are both unsigned.
  673. char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
  674. auto writeHex = [&](char16_t v) {
  675. char buf[] = "\\u\0\0\0\0";
  676. buf[2] = hexDigit((v >> 12) & 0x0f);
  677. buf[3] = hexDigit((v >> 8) & 0x0f);
  678. buf[4] = hexDigit((v >> 4) & 0x0f);
  679. buf[5] = hexDigit(v & 0x0f);
  680. out.append(buf, 6);
  681. };
  682. // From the ECMA-404 The JSON Data Interchange Syntax 2nd Edition Dec 2017
  683. if (cp < 0x10000u) {
  684. // If the code point is in the Basic Multilingual Plane (U+0000 through
  685. // U+FFFF), then it may be represented as a six-character sequence:
  686. // a reverse solidus, followed by the lowercase letter u, followed by
  687. // four hexadecimal digits that encode the code point.
  688. writeHex(static_cast<char16_t>(cp));
  689. } else {
  690. // To escape a code point that is not in the Basic Multilingual Plane,
  691. // the character may be represented as a twelve-character sequence,
  692. // encoding the UTF-16 surrogate pair corresponding to the code point.
  693. writeHex(static_cast<char16_t>(
  694. 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu)));
  695. writeHex(static_cast<char16_t>(0xdc00u + ((cp - 0x10000u) & 0x3ffu)));
  696. }
  697. } else if (*p == '\\' || *p == '\"') {
  698. char buf[] = "\\\0";
  699. buf[1] = char(*p++);
  700. out.append(buf, 2);
  701. } else if (*p <= 0x1f) {
  702. switch (*p) {
  703. // clang-format off
  704. case '\b': out.append("\\b"); p++; break;
  705. case '\f': out.append("\\f"); p++; break;
  706. case '\n': out.append("\\n"); p++; break;
  707. case '\r': out.append("\\r"); p++; break;
  708. case '\t': out.append("\\t"); p++; break;
  709. // clang-format on
  710. default:
  711. // Note that this if condition captures non readable chars
  712. // with value < 32, so size = 1 byte (e.g control chars).
  713. char buf[] = "\\u00\0\0";
  714. buf[4] = hexDigit(uint8_t((*p & 0xf0) >> 4));
  715. buf[5] = hexDigit(uint8_t(*p & 0xf));
  716. out.append(buf, 6);
  717. p++;
  718. }
  719. } else {
  720. out.push_back(char(*p++));
  721. }
  722. }
  723. out.push_back('\"');
  724. }
  725. void escapeString(
  726. StringPiece input,
  727. std::string& out,
  728. const serialization_opts& opts) {
  729. if (FOLLY_UNLIKELY(
  730. opts.extra_ascii_to_escape_bitmap[0] ||
  731. opts.extra_ascii_to_escape_bitmap[1])) {
  732. escapeStringImpl<true>(input, out, opts);
  733. } else {
  734. escapeStringImpl<false>(input, out, opts);
  735. }
  736. }
  737. std::string stripComments(StringPiece jsonC) {
  738. std::string result;
  739. enum class State {
  740. None,
  741. InString,
  742. InlineComment,
  743. LineComment
  744. } state = State::None;
  745. for (size_t i = 0; i < jsonC.size(); ++i) {
  746. auto s = jsonC.subpiece(i);
  747. switch (state) {
  748. case State::None:
  749. if (s.startsWith("/*")) {
  750. state = State::InlineComment;
  751. ++i;
  752. continue;
  753. } else if (s.startsWith("//")) {
  754. state = State::LineComment;
  755. ++i;
  756. continue;
  757. } else if (s[0] == '\"') {
  758. state = State::InString;
  759. }
  760. result.push_back(s[0]);
  761. break;
  762. case State::InString:
  763. if (s[0] == '\\') {
  764. if (UNLIKELY(s.size() == 1)) {
  765. throw std::logic_error("Invalid JSONC: string is not terminated");
  766. }
  767. result.push_back(s[0]);
  768. result.push_back(s[1]);
  769. ++i;
  770. continue;
  771. } else if (s[0] == '\"') {
  772. state = State::None;
  773. }
  774. result.push_back(s[0]);
  775. break;
  776. case State::InlineComment:
  777. if (s.startsWith("*/")) {
  778. state = State::None;
  779. ++i;
  780. }
  781. break;
  782. case State::LineComment:
  783. if (s[0] == '\n') {
  784. // skip the line break. It doesn't matter.
  785. state = State::None;
  786. }
  787. break;
  788. default:
  789. throw std::logic_error("Unknown comment state");
  790. }
  791. }
  792. return result;
  793. }
  794. } // namespace json
  795. //////////////////////////////////////////////////////////////////////
  796. dynamic parseJson(StringPiece range) {
  797. return parseJson(range, json::serialization_opts());
  798. }
  799. dynamic parseJson(StringPiece range, json::serialization_opts const& opts) {
  800. json::Input in(range, &opts);
  801. auto ret = parseValue(in);
  802. in.skipWhitespace();
  803. if (in.size() && *in != '\0') {
  804. in.error("parsing didn't consume all input");
  805. }
  806. return ret;
  807. }
  808. std::string toJson(dynamic const& dyn) {
  809. return json::serialize(dyn, json::serialization_opts());
  810. }
  811. std::string toPrettyJson(dynamic const& dyn) {
  812. json::serialization_opts opts;
  813. opts.pretty_formatting = true;
  814. return json::serialize(dyn, opts);
  815. }
  816. //////////////////////////////////////////////////////////////////////
  817. // dynamic::print_as_pseudo_json() is implemented here for header
  818. // ordering reasons (most of the dynamic implementation is in
  819. // dynamic-inl.h, which we don't want to include json.h).
  820. void dynamic::print_as_pseudo_json(std::ostream& out) const {
  821. json::serialization_opts opts;
  822. opts.allow_non_string_keys = true;
  823. opts.allow_nan_inf = true;
  824. out << json::serialize(*this, opts);
  825. }
  826. void PrintTo(const dynamic& dyn, std::ostream* os) {
  827. json::serialization_opts opts;
  828. opts.allow_nan_inf = true;
  829. opts.allow_non_string_keys = true;
  830. opts.pretty_formatting = true;
  831. opts.sort_keys = true;
  832. *os << json::serialize(dyn, opts);
  833. }
  834. //////////////////////////////////////////////////////////////////////
  835. } // namespace folly