String-inl.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654
  1. /*
  2. * Copyright 2012-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #pragma once
  17. #include <iterator>
  18. #include <stdexcept>
  19. #include <folly/CppAttributes.h>
  20. #ifndef FOLLY_STRING_H_
  21. #error This file may only be included from String.h
  22. #endif
  23. namespace folly {
  24. namespace detail {
  25. // Map from character code to value of one-character escape sequence
  26. // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
  27. // an octal escape sequence, or 'P' if the character is printable and
  28. // should be printed as is.
  29. extern const std::array<char, 256> cEscapeTable;
  30. } // namespace detail
  31. template <class String>
  32. void cEscape(StringPiece str, String& out) {
  33. char esc[4];
  34. esc[0] = '\\';
  35. out.reserve(out.size() + str.size());
  36. auto p = str.begin();
  37. auto last = p; // last regular character
  38. // We advance over runs of regular characters (printable, not double-quote or
  39. // backslash) and copy them in one go; this is faster than calling push_back
  40. // repeatedly.
  41. while (p != str.end()) {
  42. char c = *p;
  43. unsigned char v = static_cast<unsigned char>(c);
  44. char e = detail::cEscapeTable[v];
  45. if (e == 'P') { // printable
  46. ++p;
  47. } else if (e == 'O') { // octal
  48. out.append(&*last, size_t(p - last));
  49. esc[1] = '0' + ((v >> 6) & 7);
  50. esc[2] = '0' + ((v >> 3) & 7);
  51. esc[3] = '0' + (v & 7);
  52. out.append(esc, 4);
  53. ++p;
  54. last = p;
  55. } else { // special 1-character escape
  56. out.append(&*last, size_t(p - last));
  57. esc[1] = e;
  58. out.append(esc, 2);
  59. ++p;
  60. last = p;
  61. }
  62. }
  63. out.append(&*last, size_t(p - last));
  64. }
  65. namespace detail {
  66. // Map from the character code of the character following a backslash to
  67. // the unescaped character if a valid one-character escape sequence
  68. // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
  69. // octal escape sequence, 'X' if this is the first character of a
  70. // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
  71. extern const std::array<char, 256> cUnescapeTable;
  72. // Map from the character code to the hex value, or 16 if invalid hex char.
  73. extern const std::array<unsigned char, 256> hexTable;
  74. } // namespace detail
  75. template <class String>
  76. void cUnescape(StringPiece str, String& out, bool strict) {
  77. out.reserve(out.size() + str.size());
  78. auto p = str.begin();
  79. auto last = p; // last regular character (not part of an escape sequence)
  80. // We advance over runs of regular characters (not backslash) and copy them
  81. // in one go; this is faster than calling push_back repeatedly.
  82. while (p != str.end()) {
  83. char c = *p;
  84. if (c != '\\') { // normal case
  85. ++p;
  86. continue;
  87. }
  88. out.append(&*last, p - last);
  89. ++p;
  90. if (p == str.end()) { // backslash at end of string
  91. if (strict) {
  92. throw std::invalid_argument("incomplete escape sequence");
  93. }
  94. out.push_back('\\');
  95. last = p;
  96. continue;
  97. }
  98. char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
  99. if (e == 'O') { // octal
  100. unsigned char val = 0;
  101. for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
  102. ++i, ++p) {
  103. val <<= 3;
  104. val |= (*p - '0');
  105. }
  106. out.push_back(val);
  107. last = p;
  108. } else if (e == 'X') { // hex
  109. ++p;
  110. if (p == str.end()) { // \x at end of string
  111. if (strict) {
  112. throw std::invalid_argument("incomplete hex escape sequence");
  113. }
  114. out.append("\\x");
  115. last = p;
  116. continue;
  117. }
  118. unsigned char val = 0;
  119. unsigned char h;
  120. for (; (p != str.end() &&
  121. (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
  122. ++p) {
  123. val <<= 4;
  124. val |= h;
  125. }
  126. out.push_back(val);
  127. last = p;
  128. } else if (e == 'I') { // invalid
  129. if (strict) {
  130. throw std::invalid_argument("invalid escape sequence");
  131. }
  132. out.push_back('\\');
  133. out.push_back(*p);
  134. ++p;
  135. last = p;
  136. } else { // standard escape sequence, \' etc
  137. out.push_back(e);
  138. ++p;
  139. last = p;
  140. }
  141. }
  142. out.append(&*last, p - last);
  143. }
  144. namespace detail {
  145. // Map from character code to escape mode:
  146. // 0 = pass through
  147. // 1 = unused
  148. // 2 = pass through in PATH mode
  149. // 3 = space, replace with '+' in QUERY mode
  150. // 4 = percent-encode
  151. extern const std::array<unsigned char, 256> uriEscapeTable;
  152. } // namespace detail
  153. template <class String>
  154. void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
  155. static const char hexValues[] = "0123456789abcdef";
  156. char esc[3];
  157. esc[0] = '%';
  158. // Preallocate assuming that 25% of the input string will be escaped
  159. out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
  160. auto p = str.begin();
  161. auto last = p; // last regular character
  162. // We advance over runs of passthrough characters and copy them in one go;
  163. // this is faster than calling push_back repeatedly.
  164. unsigned char minEncode = static_cast<unsigned char>(mode);
  165. while (p != str.end()) {
  166. char c = *p;
  167. unsigned char v = static_cast<unsigned char>(c);
  168. unsigned char discriminator = detail::uriEscapeTable[v];
  169. if (LIKELY(discriminator <= minEncode)) {
  170. ++p;
  171. } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
  172. out.append(&*last, size_t(p - last));
  173. out.push_back('+');
  174. ++p;
  175. last = p;
  176. } else {
  177. out.append(&*last, size_t(p - last));
  178. esc[1] = hexValues[v >> 4];
  179. esc[2] = hexValues[v & 0x0f];
  180. out.append(esc, 3);
  181. ++p;
  182. last = p;
  183. }
  184. }
  185. out.append(&*last, size_t(p - last));
  186. }
  187. template <class String>
  188. void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
  189. out.reserve(out.size() + str.size());
  190. auto p = str.begin();
  191. auto last = p;
  192. // We advance over runs of passthrough characters and copy them in one go;
  193. // this is faster than calling push_back repeatedly.
  194. while (p != str.end()) {
  195. char c = *p;
  196. switch (c) {
  197. case '%': {
  198. if (UNLIKELY(std::distance(p, str.end()) < 3)) {
  199. throw std::invalid_argument("incomplete percent encode sequence");
  200. }
  201. auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
  202. auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
  203. if (UNLIKELY(h1 == 16 || h2 == 16)) {
  204. throw std::invalid_argument("invalid percent encode sequence");
  205. }
  206. out.append(&*last, size_t(p - last));
  207. out.push_back((h1 << 4) | h2);
  208. p += 3;
  209. last = p;
  210. break;
  211. }
  212. case '+':
  213. if (mode == UriEscapeMode::QUERY) {
  214. out.append(&*last, size_t(p - last));
  215. out.push_back(' ');
  216. ++p;
  217. last = p;
  218. break;
  219. }
  220. // else fallthrough
  221. FOLLY_FALLTHROUGH;
  222. default:
  223. ++p;
  224. break;
  225. }
  226. }
  227. out.append(&*last, size_t(p - last));
  228. }
  229. namespace detail {
  230. /*
  231. * The following functions are type-overloaded helpers for
  232. * internalSplit().
  233. */
  234. inline size_t delimSize(char) {
  235. return 1;
  236. }
  237. inline size_t delimSize(StringPiece s) {
  238. return s.size();
  239. }
  240. inline bool atDelim(const char* s, char c) {
  241. return *s == c;
  242. }
  243. inline bool atDelim(const char* s, StringPiece sp) {
  244. return !std::memcmp(s, sp.start(), sp.size());
  245. }
  246. // These are used to short-circuit internalSplit() in the case of
  247. // 1-character strings.
  248. inline char delimFront(char c) {
  249. // This one exists only for compile-time; it should never be called.
  250. std::abort();
  251. return c;
  252. }
  253. inline char delimFront(StringPiece s) {
  254. assert(!s.empty() && s.start() != nullptr);
  255. return *s.start();
  256. }
  257. /*
  258. * Shared implementation for all the split() overloads.
  259. *
  260. * This uses some external helpers that are overloaded to let this
  261. * algorithm be more performant if the deliminator is a single
  262. * character instead of a whole string.
  263. *
  264. * @param ignoreEmpty iff true, don't copy empty segments to output
  265. */
  266. template <class OutStringT, class DelimT, class OutputIterator>
  267. void internalSplit(
  268. DelimT delim,
  269. StringPiece sp,
  270. OutputIterator out,
  271. bool ignoreEmpty) {
  272. assert(sp.empty() || sp.start() != nullptr);
  273. const char* s = sp.start();
  274. const size_t strSize = sp.size();
  275. const size_t dSize = delimSize(delim);
  276. if (dSize > strSize || dSize == 0) {
  277. if (!ignoreEmpty || strSize > 0) {
  278. *out++ = to<OutStringT>(sp);
  279. }
  280. return;
  281. }
  282. if (std::is_same<DelimT, StringPiece>::value && dSize == 1) {
  283. // Call the char version because it is significantly faster.
  284. return internalSplit<OutStringT>(delimFront(delim), sp, out, ignoreEmpty);
  285. }
  286. size_t tokenStartPos = 0;
  287. size_t tokenSize = 0;
  288. for (size_t i = 0; i <= strSize - dSize; ++i) {
  289. if (atDelim(&s[i], delim)) {
  290. if (!ignoreEmpty || tokenSize > 0) {
  291. *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
  292. }
  293. tokenStartPos = i + dSize;
  294. tokenSize = 0;
  295. i += dSize - 1;
  296. } else {
  297. ++tokenSize;
  298. }
  299. }
  300. tokenSize = strSize - tokenStartPos;
  301. if (!ignoreEmpty || tokenSize > 0) {
  302. *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
  303. }
  304. }
  305. template <class String>
  306. StringPiece prepareDelim(const String& s) {
  307. return StringPiece(s);
  308. }
  309. inline char prepareDelim(char c) {
  310. return c;
  311. }
  312. template <class OutputType>
  313. void toOrIgnore(StringPiece input, OutputType& output) {
  314. output = folly::to<OutputType>(input);
  315. }
  316. inline void toOrIgnore(StringPiece, decltype(std::ignore)&) {}
  317. template <bool exact, class Delim, class OutputType>
  318. bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) {
  319. static_assert(
  320. exact || std::is_same<OutputType, StringPiece>::value ||
  321. IsSomeString<OutputType>::value ||
  322. std::is_same<OutputType, decltype(std::ignore)>::value,
  323. "split<false>() requires that the last argument be a string type "
  324. "or std::ignore");
  325. if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
  326. return false;
  327. }
  328. toOrIgnore(input, output);
  329. return true;
  330. }
  331. template <bool exact, class Delim, class OutputType, class... OutputTypes>
  332. bool splitFixed(
  333. const Delim& delimiter,
  334. StringPiece input,
  335. OutputType& outHead,
  336. OutputTypes&... outTail) {
  337. size_t cut = input.find(delimiter);
  338. if (UNLIKELY(cut == std::string::npos)) {
  339. return false;
  340. }
  341. StringPiece head(input.begin(), input.begin() + cut);
  342. StringPiece tail(
  343. input.begin() + cut + detail::delimSize(delimiter), input.end());
  344. if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
  345. toOrIgnore(head, outHead);
  346. return true;
  347. }
  348. return false;
  349. }
  350. } // namespace detail
  351. //////////////////////////////////////////////////////////////////////
  352. template <class Delim, class String, class OutputType>
  353. void split(
  354. const Delim& delimiter,
  355. const String& input,
  356. std::vector<OutputType>& out,
  357. bool ignoreEmpty) {
  358. detail::internalSplit<OutputType>(
  359. detail::prepareDelim(delimiter),
  360. StringPiece(input),
  361. std::back_inserter(out),
  362. ignoreEmpty);
  363. }
  364. template <class Delim, class String, class OutputType>
  365. void split(
  366. const Delim& delimiter,
  367. const String& input,
  368. fbvector<OutputType>& out,
  369. bool ignoreEmpty) {
  370. detail::internalSplit<OutputType>(
  371. detail::prepareDelim(delimiter),
  372. StringPiece(input),
  373. std::back_inserter(out),
  374. ignoreEmpty);
  375. }
  376. template <
  377. class OutputValueType,
  378. class Delim,
  379. class String,
  380. class OutputIterator>
  381. void splitTo(
  382. const Delim& delimiter,
  383. const String& input,
  384. OutputIterator out,
  385. bool ignoreEmpty) {
  386. detail::internalSplit<OutputValueType>(
  387. detail::prepareDelim(delimiter), StringPiece(input), out, ignoreEmpty);
  388. }
  389. template <bool exact, class Delim, class... OutputTypes>
  390. typename std::enable_if<
  391. StrictConjunction<IsConvertible<OutputTypes>...>::value &&
  392. sizeof...(OutputTypes) >= 1,
  393. bool>::type
  394. split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) {
  395. return detail::splitFixed<exact>(
  396. detail::prepareDelim(delimiter), input, outputs...);
  397. }
  398. namespace detail {
  399. /*
  400. * If a type can have its string size determined cheaply, we can more
  401. * efficiently append it in a loop (see internalJoinAppend). Note that the
  402. * struct need not conform to the std::string api completely (ex. does not need
  403. * to implement append()).
  404. */
  405. template <class T>
  406. struct IsSizableString {
  407. enum {
  408. value = IsSomeString<T>::value || std::is_same<T, StringPiece>::value
  409. };
  410. };
  411. template <class Iterator>
  412. struct IsSizableStringContainerIterator
  413. : IsSizableString<typename std::iterator_traits<Iterator>::value_type> {};
  414. template <class Delim, class Iterator, class String>
  415. void internalJoinAppend(
  416. Delim delimiter,
  417. Iterator begin,
  418. Iterator end,
  419. String& output) {
  420. assert(begin != end);
  421. if (std::is_same<Delim, StringPiece>::value && delimSize(delimiter) == 1) {
  422. internalJoinAppend(delimFront(delimiter), begin, end, output);
  423. return;
  424. }
  425. toAppend(*begin, &output);
  426. while (++begin != end) {
  427. toAppend(delimiter, *begin, &output);
  428. }
  429. }
  430. template <class Delim, class Iterator, class String>
  431. typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
  432. internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
  433. output.clear();
  434. if (begin == end) {
  435. return;
  436. }
  437. const size_t dsize = delimSize(delimiter);
  438. Iterator it = begin;
  439. size_t size = it->size();
  440. while (++it != end) {
  441. size += dsize + it->size();
  442. }
  443. output.reserve(size);
  444. internalJoinAppend(delimiter, begin, end, output);
  445. }
  446. template <class Delim, class Iterator, class String>
  447. typename std::enable_if<
  448. !IsSizableStringContainerIterator<Iterator>::value>::type
  449. internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
  450. output.clear();
  451. if (begin == end) {
  452. return;
  453. }
  454. internalJoinAppend(delimiter, begin, end, output);
  455. }
  456. } // namespace detail
  457. template <class Delim, class Iterator, class String>
  458. void join(
  459. const Delim& delimiter,
  460. Iterator begin,
  461. Iterator end,
  462. String& output) {
  463. detail::internalJoin(detail::prepareDelim(delimiter), begin, end, output);
  464. }
  465. template <class OutputString>
  466. void backslashify(
  467. folly::StringPiece input,
  468. OutputString& output,
  469. bool hex_style) {
  470. static const char hexValues[] = "0123456789abcdef";
  471. output.clear();
  472. output.reserve(3 * input.size());
  473. for (unsigned char c : input) {
  474. // less than space or greater than '~' are considered unprintable
  475. if (c < 0x20 || c > 0x7e || c == '\\') {
  476. bool hex_append = false;
  477. output.push_back('\\');
  478. if (hex_style) {
  479. hex_append = true;
  480. } else {
  481. if (c == '\r') {
  482. output += 'r';
  483. } else if (c == '\n') {
  484. output += 'n';
  485. } else if (c == '\t') {
  486. output += 't';
  487. } else if (c == '\a') {
  488. output += 'a';
  489. } else if (c == '\b') {
  490. output += 'b';
  491. } else if (c == '\0') {
  492. output += '0';
  493. } else if (c == '\\') {
  494. output += '\\';
  495. } else {
  496. hex_append = true;
  497. }
  498. }
  499. if (hex_append) {
  500. output.push_back('x');
  501. output.push_back(hexValues[(c >> 4) & 0xf]);
  502. output.push_back(hexValues[c & 0xf]);
  503. }
  504. } else {
  505. output += c;
  506. }
  507. }
  508. }
  509. template <class String1, class String2>
  510. void humanify(const String1& input, String2& output) {
  511. size_t numUnprintable = 0;
  512. size_t numPrintablePrefix = 0;
  513. for (unsigned char c : input) {
  514. if (c < 0x20 || c > 0x7e || c == '\\') {
  515. ++numUnprintable;
  516. }
  517. if (numUnprintable == 0) {
  518. ++numPrintablePrefix;
  519. }
  520. }
  521. // hexlify doubles a string's size; backslashify can potentially
  522. // explode it by 4x. Now, the printable range of the ascii
  523. // "spectrum" is around 95 out of 256 values, so a "random" binary
  524. // string should be around 60% unprintable. We use a 50% hueristic
  525. // here, so if a string is 60% unprintable, then we just use hex
  526. // output. Otherwise we backslash.
  527. //
  528. // UTF8 is completely ignored; as a result, utf8 characters will
  529. // likely be \x escaped (since most common glyphs fit in two bytes).
  530. // This is a tradeoff of complexity/speed instead of a convenience
  531. // that likely would rarely matter. Moreover, this function is more
  532. // about displaying underlying bytes, not about displaying glyphs
  533. // from languages.
  534. if (numUnprintable == 0) {
  535. output = input;
  536. } else if (5 * numUnprintable >= 3 * input.size()) {
  537. // However! If we have a "meaningful" prefix of printable
  538. // characters, say 20% of the string, we backslashify under the
  539. // assumption viewing the prefix as ascii is worth blowing the
  540. // output size up a bit.
  541. if (5 * numPrintablePrefix >= input.size()) {
  542. backslashify(input, output);
  543. } else {
  544. output = "0x";
  545. hexlify(input, output, true /* append output */);
  546. }
  547. } else {
  548. backslashify(input, output);
  549. }
  550. }
  551. template <class InputString, class OutputString>
  552. bool hexlify(
  553. const InputString& input,
  554. OutputString& output,
  555. bool append_output) {
  556. if (!append_output) {
  557. output.clear();
  558. }
  559. static char hexValues[] = "0123456789abcdef";
  560. auto j = output.size();
  561. output.resize(2 * input.size() + output.size());
  562. for (size_t i = 0; i < input.size(); ++i) {
  563. int ch = input[i];
  564. output[j++] = hexValues[(ch >> 4) & 0xf];
  565. output[j++] = hexValues[ch & 0xf];
  566. }
  567. return true;
  568. }
  569. template <class InputString, class OutputString>
  570. bool unhexlify(const InputString& input, OutputString& output) {
  571. if (input.size() % 2 != 0) {
  572. return false;
  573. }
  574. output.resize(input.size() / 2);
  575. int j = 0;
  576. for (size_t i = 0; i < input.size(); i += 2) {
  577. int highBits = detail::hexTable[static_cast<uint8_t>(input[i])];
  578. int lowBits = detail::hexTable[static_cast<uint8_t>(input[i + 1])];
  579. if ((highBits | lowBits) & 0x10) {
  580. // One of the characters wasn't a hex digit
  581. return false;
  582. }
  583. output[j++] = (highBits << 4) + lowBits;
  584. }
  585. return true;
  586. }
  587. namespace detail {
  588. /**
  589. * Hex-dump at most 16 bytes starting at offset from a memory area of size
  590. * bytes. Return the number of bytes actually dumped.
  591. */
  592. size_t
  593. hexDumpLine(const void* ptr, size_t offset, size_t size, std::string& line);
  594. } // namespace detail
  595. template <class OutIt>
  596. void hexDump(const void* ptr, size_t size, OutIt out) {
  597. size_t offset = 0;
  598. std::string line;
  599. while (offset < size) {
  600. offset += detail::hexDumpLine(ptr, offset, size, line);
  601. *out++ = line;
  602. }
  603. }
  604. } // namespace folly