RecordIO.h 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. /*
  2. * Copyright 2013-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /**
  17. * RecordIO: self-synchronizing stream of variable length records
  18. *
  19. * RecordIO gives you the ability to write a stream of variable length records
  20. * and read them later even in the face of data corruption -- randomly inserted
  21. * or deleted chunks of the file, or modified data. When reading, you may lose
  22. * corrupted records, but the stream will resynchronize automatically.
  23. */
  24. #pragma once
  25. #define FOLLY_IO_RECORDIO_H_
  26. #include <atomic>
  27. #include <memory>
  28. #include <mutex>
  29. #include <folly/File.h>
  30. #include <folly/Range.h>
  31. #include <folly/io/IOBuf.h>
  32. #include <folly/system/MemoryMapping.h>
  33. namespace folly {
  34. /**
  35. * Class to write a stream of RecordIO records to a file.
  36. *
  37. * RecordIOWriter is thread-safe
  38. */
  39. class RecordIOWriter {
  40. public:
  41. /**
  42. * Create a RecordIOWriter around a file; will append to the end of
  43. * file if it exists.
  44. *
  45. * Each file must have a non-zero file id, which is embedded in all
  46. * record headers. Readers will only return records with the requested
  47. * file id (or, if the reader is created with fileId=0 in the constructor,
  48. * the reader will return all records). File ids are only used to allow
  49. * resynchronization if you store RecordIO records (with headers) inside
  50. * other RecordIO records (for example, if a record consists of a fragment
  51. * from another RecordIO file). If you're not planning to do that,
  52. * the defaults are fine.
  53. */
  54. explicit RecordIOWriter(File file, uint32_t fileId = 1);
  55. /**
  56. * Write a record. We will use at most headerSize() bytes of headroom,
  57. * you might want to arrange that before copying your data into it.
  58. */
  59. void write(std::unique_ptr<IOBuf> buf);
  60. /**
  61. * Return the position in the file where the next byte will be written.
  62. * Conservative, as stuff can be written at any time from another thread.
  63. */
  64. off_t filePos() const {
  65. return filePos_;
  66. }
  67. private:
  68. File file_;
  69. uint32_t fileId_;
  70. std::unique_lock<File> writeLock_;
  71. std::atomic<off_t> filePos_;
  72. };
  73. /**
  74. * Class to read from a RecordIO file. Will skip invalid records.
  75. */
  76. class RecordIOReader {
  77. public:
  78. class Iterator;
  79. /**
  80. * RecordIOReader is iterable, returning pairs of ByteRange (record content)
  81. * and position in file where the record (including header) begins.
  82. * Note that the position includes the header, that is, it can be passed back
  83. * to seek().
  84. */
  85. typedef Iterator iterator;
  86. typedef Iterator const_iterator;
  87. typedef std::pair<ByteRange, off_t> value_type;
  88. typedef value_type& reference;
  89. typedef const value_type& const_reference;
  90. /**
  91. * A record reader with a fileId of 0 will return all records.
  92. * A record reader with a non-zero fileId will only return records where
  93. * the fileId matches.
  94. */
  95. explicit RecordIOReader(File file, uint32_t fileId = 0);
  96. Iterator cbegin() const;
  97. Iterator begin() const;
  98. Iterator cend() const;
  99. Iterator end() const;
  100. /**
  101. * Create an iterator to the first valid record after pos.
  102. */
  103. Iterator seek(off_t pos) const;
  104. private:
  105. MemoryMapping map_;
  106. uint32_t fileId_;
  107. };
  108. namespace recordio_helpers {
  109. // We're exposing the guts of the RecordIO implementation for two reasons:
  110. // 1. It makes unit testing easier, and
  111. // 2. It allows you to build different RecordIO readers / writers that use
  112. // different storage systems underneath (not standard files)
  113. /**
  114. * Header size.
  115. */
  116. constexpr size_t headerSize(); // defined in RecordIO-inl.h
  117. /**
  118. * Write a header in the buffer. We will prepend the header to the front
  119. * of the chain. Do not write the buffer if empty (we don't allow empty
  120. * records). Returns the total length, including header (0 if empty)
  121. * (same as buf->computeChainDataLength(), but likely faster)
  122. *
  123. * The fileId should be unique per stream and allows you to have RecordIO
  124. * headers stored inside the data (for example, have an entire RecordIO
  125. * file stored as a record inside another RecordIO file). The fileId may
  126. * not be 0.
  127. */
  128. size_t prependHeader(std::unique_ptr<IOBuf>& buf, uint32_t fileId = 1);
  129. /**
  130. * Search for the first valid record that begins in searchRange (which must be
  131. * a subrange of wholeRange). Returns the record data (not the header) if
  132. * found, ByteRange() otherwise.
  133. *
  134. * The fileId may be 0, in which case we'll return the first valid record for
  135. * *any* fileId, or non-zero, in which case we'll only look for records with
  136. * the requested fileId.
  137. */
  138. struct RecordInfo {
  139. uint32_t fileId;
  140. ByteRange record;
  141. };
  142. RecordInfo
  143. findRecord(ByteRange searchRange, ByteRange wholeRange, uint32_t fileId);
  144. /**
  145. * Search for the first valid record in range.
  146. */
  147. RecordInfo findRecord(ByteRange range, uint32_t fileId);
  148. /**
  149. * Check if there is a valid record at the beginning of range. Returns the
  150. * record data (not the header) if the record is valid, ByteRange() otherwise.
  151. */
  152. RecordInfo validateRecord(ByteRange range, uint32_t fileId);
  153. } // namespace recordio_helpers
  154. } // namespace folly
  155. #include <folly/io/RecordIO-inl.h>