123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405 |
- /*
- * Copyright 2014-present Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #ifndef FOLLY_GEN_STRING_H_
- #error This file may only be included from folly/gen/String.h
- #endif
- #include <folly/Conv.h>
- #include <folly/Portability.h>
- #include <folly/String.h>
- namespace folly {
- namespace gen {
- namespace detail {
- /**
- * Finds the first occurrence of delimiter in "in", advances "in" past the
- * delimiter. Populates "prefix" with the consumed bytes, including the
- * delimiter.
- *
- * Returns the number of trailing bytes of "prefix" that make up the
- * delimiter, or 0 if the delimiter was not found.
- */
- inline size_t
- splitPrefix(StringPiece& in, StringPiece& prefix, char delimiter) {
- size_t found = in.find(delimiter);
- if (found != StringPiece::npos) {
- ++found;
- prefix.assign(in.data(), in.data() + found);
- in.advance(found);
- return 1;
- }
- prefix.clear();
- return 0;
- }
- /**
- * As above, but supports multibyte delimiters.
- */
- inline size_t
- splitPrefix(StringPiece& in, StringPiece& prefix, StringPiece delimiter) {
- auto found = in.find(delimiter);
- if (found != StringPiece::npos) {
- found += delimiter.size();
- prefix.assign(in.data(), in.data() + found);
- in.advance(found);
- return delimiter.size();
- }
- prefix.clear();
- return 0;
- }
- /**
- * As above, but splits by any of the EOL terms: \r, \n, or \r\n.
- */
- inline size_t splitPrefix(StringPiece& in, StringPiece& prefix, MixedNewlines) {
- const auto kCRLF = "\r\n";
- const size_t kLenCRLF = 2;
- auto p = in.find_first_of(kCRLF);
- if (p != std::string::npos) {
- const auto in_start = in.data();
- size_t delim_len = 1;
- in.advance(p);
- // Either remove an MS-DOS CR-LF 2-byte newline, or eat 1 byte at a time.
- if (in.removePrefix(kCRLF)) {
- delim_len = kLenCRLF;
- } else {
- in.advance(delim_len);
- }
- prefix.assign(in_start, in.data());
- return delim_len;
- }
- prefix.clear();
- return 0;
- }
- inline const char* ch(const unsigned char* p) {
- return reinterpret_cast<const char*>(p);
- }
- // Chop s into pieces of at most maxLength, feed them to cb
- template <class Callback>
- bool consumeFixedSizeChunks(Callback& cb, StringPiece& s, uint64_t maxLength) {
- while (!s.empty()) {
- auto num_to_add = s.size();
- if (maxLength) {
- num_to_add = std::min<uint64_t>(num_to_add, maxLength);
- }
- if (!cb(StringPiece(s.begin(), num_to_add))) {
- return false;
- }
- s.advance(num_to_add);
- }
- return true;
- }
- // Consumes all of buffer, plus n chars from s.
- template <class Callback>
- bool consumeBufferPlus(Callback& cb, IOBuf& buf, StringPiece& s, uint64_t n) {
- buf.reserve(0, n);
- memcpy(buf.writableTail(), s.data(), n);
- buf.append(n);
- s.advance(n);
- if (!cb(StringPiece(detail::ch(buf.data()), buf.length()))) {
- return false;
- }
- buf.clear();
- return true;
- }
- } // namespace detail
- template <class Callback>
- bool StreamSplitter<Callback>::flush() {
- CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
- if (!pieceCb_(StringPiece(detail::ch(buffer_.data()), buffer_.length()))) {
- return false;
- }
- // We are ready to handle another stream now.
- buffer_.clear();
- return true;
- }
- template <class Callback>
- bool StreamSplitter<Callback>::operator()(StringPiece in) {
- StringPiece prefix;
- // NB This code assumes a 1-byte delimiter. It's not too hard to support
- // multibyte delimiters, just remember that maxLength_ chunks can end up
- // falling in the middle of a delimiter.
- bool found = detail::splitPrefix(in, prefix, delimiter_);
- if (buffer_.length() != 0) {
- if (found) {
- uint64_t num_to_add = prefix.size();
- if (maxLength_) {
- CHECK(buffer_.length() < maxLength_);
- // Consume as much of prefix as possible without exceeding maxLength_
- num_to_add = std::min(maxLength_ - buffer_.length(), num_to_add);
- }
- // Append part of the prefix to the buffer, and send it to the callback
- if (!detail::consumeBufferPlus(pieceCb_, buffer_, prefix, num_to_add)) {
- return false;
- }
- if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
- return false;
- }
- found = detail::splitPrefix(in, prefix, delimiter_);
- // Post-conditions:
- // - we consumed all of buffer_ and all of the first prefix.
- // - found, in, and prefix reflect the second delimiter_ search
- } else if (maxLength_ && buffer_.length() + in.size() >= maxLength_) {
- // Send all of buffer_, plus a bit of in, to the callback
- if (!detail::consumeBufferPlus(
- pieceCb_, buffer_, in, maxLength_ - buffer_.length())) {
- return false;
- }
- // Post-conditions:
- // - we consumed all of buffer, and the minimal # of bytes from in
- // - found is false
- } // Otherwise: found is false & we cannot invoke the callback this turn
- }
- // Post-condition: buffer_ is nonempty only if found is false **and**
- // len(buffer + in) < maxLength_.
- // Send lines to callback directly from input (no buffer)
- while (found) { // Buffer guaranteed to be empty
- if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
- return false;
- }
- found = detail::splitPrefix(in, prefix, delimiter_);
- }
- // No more delimiters left; consume 'in' until it is shorter than maxLength_
- if (maxLength_) {
- while (in.size() >= maxLength_) { // Buffer is guaranteed to be empty
- if (!pieceCb_(StringPiece(in.begin(), maxLength_))) {
- return false;
- }
- in.advance(maxLength_);
- }
- }
- if (!in.empty()) { // Buffer may be nonempty
- // Incomplete line left, append to buffer
- buffer_.reserve(0, in.size());
- memcpy(buffer_.writableTail(), in.data(), in.size());
- buffer_.append(in.size());
- }
- CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
- return true;
- }
- namespace detail {
- class StringResplitter : public Operator<StringResplitter> {
- char delimiter_;
- bool keepDelimiter_;
- public:
- explicit StringResplitter(char delimiter, bool keepDelimiter = false)
- : delimiter_(delimiter), keepDelimiter_(keepDelimiter) {}
- template <class Source>
- class Generator : public GenImpl<StringPiece, Generator<Source>> {
- Source source_;
- char delimiter_;
- bool keepDelimiter_;
- public:
- Generator(Source source, char delimiter, bool keepDelimiter)
- : source_(std::move(source)),
- delimiter_(delimiter),
- keepDelimiter_(keepDelimiter) {}
- template <class Body>
- bool apply(Body&& body) const {
- auto splitter =
- streamSplitter(this->delimiter_, [this, &body](StringPiece s) {
- // The stream ended with a delimiter; our contract is to swallow
- // the final empty piece.
- if (s.empty()) {
- return true;
- }
- if (s.back() != this->delimiter_) {
- return body(s);
- }
- if (!keepDelimiter_) {
- s.pop_back(); // Remove the 1-character delimiter
- }
- return body(s);
- });
- if (!source_.apply(splitter)) {
- return false;
- }
- return splitter.flush();
- }
- static constexpr bool infinite = Source::infinite;
- };
- template <class Source, class Value, class Gen = Generator<Source>>
- Gen compose(GenImpl<Value, Source>&& source) const {
- return Gen(std::move(source.self()), delimiter_, keepDelimiter_);
- }
- template <class Source, class Value, class Gen = Generator<Source>>
- Gen compose(const GenImpl<Value, Source>& source) const {
- return Gen(source.self(), delimiter_, keepDelimiter_);
- }
- };
- template <class DelimiterType = char>
- class SplitStringSource
- : public GenImpl<StringPiece, SplitStringSource<DelimiterType>> {
- StringPiece source_;
- DelimiterType delimiter_;
- public:
- SplitStringSource(const StringPiece source, DelimiterType delimiter)
- : source_(source), delimiter_(std::move(delimiter)) {}
- template <class Body>
- bool apply(Body&& body) const {
- StringPiece rest(source_);
- StringPiece prefix;
- while (size_t delim_len = splitPrefix(rest, prefix, this->delimiter_)) {
- prefix.subtract(delim_len); // Remove the delimiter
- if (!body(prefix)) {
- return false;
- }
- }
- if (!rest.empty()) {
- if (!body(rest)) {
- return false;
- }
- }
- return true;
- }
- };
- /**
- * Unsplit - For joining tokens from a generator into a string. This is
- * the inverse of `split` above.
- *
- * This type is primarily used through the 'unsplit' function.
- */
- template <class Delimiter, class Output>
- class Unsplit : public Operator<Unsplit<Delimiter, Output>> {
- Delimiter delimiter_;
- public:
- explicit Unsplit(const Delimiter& delimiter) : delimiter_(delimiter) {}
- template <class Source, class Value>
- Output compose(const GenImpl<Value, Source>& source) const {
- Output outputBuffer;
- UnsplitBuffer<Delimiter, Output> unsplitter(delimiter_, &outputBuffer);
- unsplitter.compose(source);
- return outputBuffer;
- }
- };
- /**
- * UnsplitBuffer - For joining tokens from a generator into a string,
- * and inserting them into a custom buffer.
- *
- * This type is primarily used through the 'unsplit' function.
- */
- template <class Delimiter, class OutputBuffer>
- class UnsplitBuffer : public Operator<UnsplitBuffer<Delimiter, OutputBuffer>> {
- Delimiter delimiter_;
- OutputBuffer* outputBuffer_;
- public:
- UnsplitBuffer(const Delimiter& delimiter, OutputBuffer* outputBuffer)
- : delimiter_(delimiter), outputBuffer_(outputBuffer) {
- CHECK(outputBuffer);
- }
- template <class Source, class Value>
- void compose(const GenImpl<Value, Source>& source) const {
- // If the output buffer is empty, we skip inserting the delimiter for the
- // first element.
- bool skipDelim = outputBuffer_->empty();
- source | [&](Value v) {
- if (skipDelim) {
- skipDelim = false;
- toAppend(std::forward<Value>(v), outputBuffer_);
- } else {
- toAppend(delimiter_, std::forward<Value>(v), outputBuffer_);
- }
- };
- }
- };
- /**
- * Hack for static for-like constructs
- */
- template <class Target, class = void>
- inline Target passthrough(Target target) {
- return target;
- }
- FOLLY_PUSH_WARNING
- #ifdef __clang__
- // Clang isn't happy with eatField() hack below.
- #pragma GCC diagnostic ignored "-Wreturn-stack-address"
- #endif // __clang__
- /**
- * ParseToTuple - For splitting a record and immediatlely converting it to a
- * target tuple type. Primary used through the 'eachToTuple' helper, like so:
- *
- * auto config
- * = split("1:a 2:b", ' ')
- * | eachToTuple<int, string>()
- * | as<vector<tuple<int, string>>>();
- *
- */
- template <class TargetContainer, class Delimiter, class... Targets>
- class SplitTo {
- Delimiter delimiter_;
- public:
- explicit SplitTo(Delimiter delimiter) : delimiter_(delimiter) {}
- TargetContainer operator()(StringPiece line) const {
- int i = 0;
- StringPiece fields[sizeof...(Targets)];
- // HACK(tjackson): Used for referencing fields[] corresponding to variadic
- // template parameters.
- auto eatField = [&]() -> StringPiece& { return fields[i++]; };
- if (!split(
- delimiter_,
- line,
- detail::passthrough<StringPiece&, Targets>(eatField())...)) {
- throw std::runtime_error("field count mismatch");
- }
- i = 0;
- return TargetContainer(To<Targets>()(eatField())...);
- }
- };
- FOLLY_POP_WARNING
- } // namespace detail
- } // namespace gen
- } // namespace folly
|