String-inl.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. /*
  2. * Copyright 2014-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef FOLLY_GEN_STRING_H_
  17. #error This file may only be included from folly/gen/String.h
  18. #endif
  19. #include <folly/Conv.h>
  20. #include <folly/Portability.h>
  21. #include <folly/String.h>
  22. namespace folly {
  23. namespace gen {
  24. namespace detail {
  25. /**
  26. * Finds the first occurrence of delimiter in "in", advances "in" past the
  27. * delimiter. Populates "prefix" with the consumed bytes, including the
  28. * delimiter.
  29. *
  30. * Returns the number of trailing bytes of "prefix" that make up the
  31. * delimiter, or 0 if the delimiter was not found.
  32. */
  33. inline size_t
  34. splitPrefix(StringPiece& in, StringPiece& prefix, char delimiter) {
  35. size_t found = in.find(delimiter);
  36. if (found != StringPiece::npos) {
  37. ++found;
  38. prefix.assign(in.data(), in.data() + found);
  39. in.advance(found);
  40. return 1;
  41. }
  42. prefix.clear();
  43. return 0;
  44. }
  45. /**
  46. * As above, but supports multibyte delimiters.
  47. */
  48. inline size_t
  49. splitPrefix(StringPiece& in, StringPiece& prefix, StringPiece delimiter) {
  50. auto found = in.find(delimiter);
  51. if (found != StringPiece::npos) {
  52. found += delimiter.size();
  53. prefix.assign(in.data(), in.data() + found);
  54. in.advance(found);
  55. return delimiter.size();
  56. }
  57. prefix.clear();
  58. return 0;
  59. }
  60. /**
  61. * As above, but splits by any of the EOL terms: \r, \n, or \r\n.
  62. */
  63. inline size_t splitPrefix(StringPiece& in, StringPiece& prefix, MixedNewlines) {
  64. const auto kCRLF = "\r\n";
  65. const size_t kLenCRLF = 2;
  66. auto p = in.find_first_of(kCRLF);
  67. if (p != std::string::npos) {
  68. const auto in_start = in.data();
  69. size_t delim_len = 1;
  70. in.advance(p);
  71. // Either remove an MS-DOS CR-LF 2-byte newline, or eat 1 byte at a time.
  72. if (in.removePrefix(kCRLF)) {
  73. delim_len = kLenCRLF;
  74. } else {
  75. in.advance(delim_len);
  76. }
  77. prefix.assign(in_start, in.data());
  78. return delim_len;
  79. }
  80. prefix.clear();
  81. return 0;
  82. }
  // Reinterprets a pointer to unsigned bytes as a pointer to char, without
  // changing the address.
  inline const char* ch(const unsigned char* p) {
    const void* const raw = p;
    return static_cast<const char*>(raw);
  }
  86. // Chop s into pieces of at most maxLength, feed them to cb
  87. template <class Callback>
  88. bool consumeFixedSizeChunks(Callback& cb, StringPiece& s, uint64_t maxLength) {
  89. while (!s.empty()) {
  90. auto num_to_add = s.size();
  91. if (maxLength) {
  92. num_to_add = std::min<uint64_t>(num_to_add, maxLength);
  93. }
  94. if (!cb(StringPiece(s.begin(), num_to_add))) {
  95. return false;
  96. }
  97. s.advance(num_to_add);
  98. }
  99. return true;
  100. }
  101. // Consumes all of buffer, plus n chars from s.
  102. template <class Callback>
  103. bool consumeBufferPlus(Callback& cb, IOBuf& buf, StringPiece& s, uint64_t n) {
  104. buf.reserve(0, n);
  105. memcpy(buf.writableTail(), s.data(), n);
  106. buf.append(n);
  107. s.advance(n);
  108. if (!cb(StringPiece(detail::ch(buf.data()), buf.length()))) {
  109. return false;
  110. }
  111. buf.clear();
  112. return true;
  113. }
  114. } // namespace detail
  115. template <class Callback>
  116. bool StreamSplitter<Callback>::flush() {
  117. CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
  118. if (!pieceCb_(StringPiece(detail::ch(buffer_.data()), buffer_.length()))) {
  119. return false;
  120. }
  121. // We are ready to handle another stream now.
  122. buffer_.clear();
  123. return true;
  124. }
  125. template <class Callback>
  126. bool StreamSplitter<Callback>::operator()(StringPiece in) {
  127. StringPiece prefix;
  128. // NB This code assumes a 1-byte delimiter. It's not too hard to support
  129. // multibyte delimiters, just remember that maxLength_ chunks can end up
  130. // falling in the middle of a delimiter.
  131. bool found = detail::splitPrefix(in, prefix, delimiter_);
  132. if (buffer_.length() != 0) {
  133. if (found) {
  134. uint64_t num_to_add = prefix.size();
  135. if (maxLength_) {
  136. CHECK(buffer_.length() < maxLength_);
  137. // Consume as much of prefix as possible without exceeding maxLength_
  138. num_to_add = std::min(maxLength_ - buffer_.length(), num_to_add);
  139. }
  140. // Append part of the prefix to the buffer, and send it to the callback
  141. if (!detail::consumeBufferPlus(pieceCb_, buffer_, prefix, num_to_add)) {
  142. return false;
  143. }
  144. if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
  145. return false;
  146. }
  147. found = detail::splitPrefix(in, prefix, delimiter_);
  148. // Post-conditions:
  149. // - we consumed all of buffer_ and all of the first prefix.
  150. // - found, in, and prefix reflect the second delimiter_ search
  151. } else if (maxLength_ && buffer_.length() + in.size() >= maxLength_) {
  152. // Send all of buffer_, plus a bit of in, to the callback
  153. if (!detail::consumeBufferPlus(
  154. pieceCb_, buffer_, in, maxLength_ - buffer_.length())) {
  155. return false;
  156. }
  157. // Post-conditions:
  158. // - we consumed all of buffer, and the minimal # of bytes from in
  159. // - found is false
  160. } // Otherwise: found is false & we cannot invoke the callback this turn
  161. }
  162. // Post-condition: buffer_ is nonempty only if found is false **and**
  163. // len(buffer + in) < maxLength_.
  164. // Send lines to callback directly from input (no buffer)
  165. while (found) { // Buffer guaranteed to be empty
  166. if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
  167. return false;
  168. }
  169. found = detail::splitPrefix(in, prefix, delimiter_);
  170. }
  171. // No more delimiters left; consume 'in' until it is shorter than maxLength_
  172. if (maxLength_) {
  173. while (in.size() >= maxLength_) { // Buffer is guaranteed to be empty
  174. if (!pieceCb_(StringPiece(in.begin(), maxLength_))) {
  175. return false;
  176. }
  177. in.advance(maxLength_);
  178. }
  179. }
  180. if (!in.empty()) { // Buffer may be nonempty
  181. // Incomplete line left, append to buffer
  182. buffer_.reserve(0, in.size());
  183. memcpy(buffer_.writableTail(), in.data(), in.size());
  184. buffer_.append(in.size());
  185. }
  186. CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
  187. return true;
  188. }
  189. namespace detail {
  190. class StringResplitter : public Operator<StringResplitter> {
  191. char delimiter_;
  192. bool keepDelimiter_;
  193. public:
  194. explicit StringResplitter(char delimiter, bool keepDelimiter = false)
  195. : delimiter_(delimiter), keepDelimiter_(keepDelimiter) {}
  196. template <class Source>
  197. class Generator : public GenImpl<StringPiece, Generator<Source>> {
  198. Source source_;
  199. char delimiter_;
  200. bool keepDelimiter_;
  201. public:
  202. Generator(Source source, char delimiter, bool keepDelimiter)
  203. : source_(std::move(source)),
  204. delimiter_(delimiter),
  205. keepDelimiter_(keepDelimiter) {}
  206. template <class Body>
  207. bool apply(Body&& body) const {
  208. auto splitter =
  209. streamSplitter(this->delimiter_, [this, &body](StringPiece s) {
  210. // The stream ended with a delimiter; our contract is to swallow
  211. // the final empty piece.
  212. if (s.empty()) {
  213. return true;
  214. }
  215. if (s.back() != this->delimiter_) {
  216. return body(s);
  217. }
  218. if (!keepDelimiter_) {
  219. s.pop_back(); // Remove the 1-character delimiter
  220. }
  221. return body(s);
  222. });
  223. if (!source_.apply(splitter)) {
  224. return false;
  225. }
  226. return splitter.flush();
  227. }
  228. static constexpr bool infinite = Source::infinite;
  229. };
  230. template <class Source, class Value, class Gen = Generator<Source>>
  231. Gen compose(GenImpl<Value, Source>&& source) const {
  232. return Gen(std::move(source.self()), delimiter_, keepDelimiter_);
  233. }
  234. template <class Source, class Value, class Gen = Generator<Source>>
  235. Gen compose(const GenImpl<Value, Source>& source) const {
  236. return Gen(source.self(), delimiter_, keepDelimiter_);
  237. }
  238. };
  239. template <class DelimiterType = char>
  240. class SplitStringSource
  241. : public GenImpl<StringPiece, SplitStringSource<DelimiterType>> {
  242. StringPiece source_;
  243. DelimiterType delimiter_;
  244. public:
  245. SplitStringSource(const StringPiece source, DelimiterType delimiter)
  246. : source_(source), delimiter_(std::move(delimiter)) {}
  247. template <class Body>
  248. bool apply(Body&& body) const {
  249. StringPiece rest(source_);
  250. StringPiece prefix;
  251. while (size_t delim_len = splitPrefix(rest, prefix, this->delimiter_)) {
  252. prefix.subtract(delim_len); // Remove the delimiter
  253. if (!body(prefix)) {
  254. return false;
  255. }
  256. }
  257. if (!rest.empty()) {
  258. if (!body(rest)) {
  259. return false;
  260. }
  261. }
  262. return true;
  263. }
  264. };
  265. /**
  266. * Unsplit - For joining tokens from a generator into a string. This is
  267. * the inverse of `split` above.
  268. *
  269. * This type is primarily used through the 'unsplit' function.
  270. */
  271. template <class Delimiter, class Output>
  272. class Unsplit : public Operator<Unsplit<Delimiter, Output>> {
  273. Delimiter delimiter_;
  274. public:
  275. explicit Unsplit(const Delimiter& delimiter) : delimiter_(delimiter) {}
  276. template <class Source, class Value>
  277. Output compose(const GenImpl<Value, Source>& source) const {
  278. Output outputBuffer;
  279. UnsplitBuffer<Delimiter, Output> unsplitter(delimiter_, &outputBuffer);
  280. unsplitter.compose(source);
  281. return outputBuffer;
  282. }
  283. };
  284. /**
  285. * UnsplitBuffer - For joining tokens from a generator into a string,
  286. * and inserting them into a custom buffer.
  287. *
  288. * This type is primarily used through the 'unsplit' function.
  289. */
  290. template <class Delimiter, class OutputBuffer>
  291. class UnsplitBuffer : public Operator<UnsplitBuffer<Delimiter, OutputBuffer>> {
  292. Delimiter delimiter_;
  293. OutputBuffer* outputBuffer_;
  294. public:
  295. UnsplitBuffer(const Delimiter& delimiter, OutputBuffer* outputBuffer)
  296. : delimiter_(delimiter), outputBuffer_(outputBuffer) {
  297. CHECK(outputBuffer);
  298. }
  299. template <class Source, class Value>
  300. void compose(const GenImpl<Value, Source>& source) const {
  301. // If the output buffer is empty, we skip inserting the delimiter for the
  302. // first element.
  303. bool skipDelim = outputBuffer_->empty();
  304. source | [&](Value v) {
  305. if (skipDelim) {
  306. skipDelim = false;
  307. toAppend(std::forward<Value>(v), outputBuffer_);
  308. } else {
  309. toAppend(delimiter_, std::forward<Value>(v), outputBuffer_);
  310. }
  311. };
  312. }
  313. };
  /**
   * Identity helper for static for-like constructs: returns its argument as
   * type Target. The second, unused template parameter gives a pack
   * expansion something to expand over.
   */
  template <class Target, class = void>
  inline auto passthrough(Target target) -> Target {
    return target;
  }
  321. FOLLY_PUSH_WARNING
  322. #ifdef __clang__
  323. // Clang isn't happy with eatField() hack below.
  324. #pragma GCC diagnostic ignored "-Wreturn-stack-address"
  325. #endif // __clang__
  326. /**
  327. * ParseToTuple - For splitting a record and immediatlely converting it to a
  328. * target tuple type. Primary used through the 'eachToTuple' helper, like so:
  329. *
  330. * auto config
  331. * = split("1:a 2:b", ' ')
  332. * | eachToTuple<int, string>()
  333. * | as<vector<tuple<int, string>>>();
  334. *
  335. */
  336. template <class TargetContainer, class Delimiter, class... Targets>
  337. class SplitTo {
  338. Delimiter delimiter_;
  339. public:
  340. explicit SplitTo(Delimiter delimiter) : delimiter_(delimiter) {}
  341. TargetContainer operator()(StringPiece line) const {
  342. int i = 0;
  343. StringPiece fields[sizeof...(Targets)];
  344. // HACK(tjackson): Used for referencing fields[] corresponding to variadic
  345. // template parameters.
  346. auto eatField = [&]() -> StringPiece& { return fields[i++]; };
  347. if (!split(
  348. delimiter_,
  349. line,
  350. detail::passthrough<StringPiece&, Targets>(eatField())...)) {
  351. throw std::runtime_error("field count mismatch");
  352. }
  353. i = 0;
  354. return TargetContainer(To<Targets>()(eatField())...);
  355. }
  356. };
  357. FOLLY_POP_WARNING
  358. } // namespace detail
  359. } // namespace gen
  360. } // namespace folly