15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
- //! Contains file writer API, and provides methods to write row groups and columns by
19
- //! using row group writers and column writers respectively.
18
+ //! [`SerializedFileWriter`]: Low-level Parquet writer API
20
19
21
20
use crate :: bloom_filter:: Sbbf ;
22
21
use crate :: format as parquet;
@@ -139,7 +138,10 @@ pub type OnCloseRowGroup<'a, W> = Box<
139
138
// Serialized impl for file & row group writers
140
139
141
140
/// Parquet file writer API.
142
- /// Provides methods to write row groups sequentially.
141
+ ///
142
+ /// This is a low-level API for writing Parquet files directly, and handles
143
+ /// tracking the file structures, including row groups and column chunks,
144
+ /// and writing the file footer.
143
145
///
144
146
/// The main workflow should be as follows:
145
147
/// - Create file writer, this will open a new file and potentially write some metadata.
@@ -221,11 +223,13 @@ impl<W: Write + Send> SerializedFileWriter<W> {
221
223
}
222
224
223
225
/// Creates new row group from this file writer.
224
- /// In case of IO error or Thrift error, returns `Err`.
225
226
///
226
- /// There can be at most 2^15 row groups in a file; and row groups have
227
- /// to be written sequentially. Every time the next row group is requested, the
228
- /// previous row group must be finalised and closed using `RowGroupWriter::close` method.
227
+ /// Note: Parquet files are limited to at most 2^15 row groups, and row groups must
228
+ /// be written sequentially.
229
+ ///
230
+ /// Every time the next row group is requested, the previous row group must
231
+ /// be finalised and closed using the [`SerializedRowGroupWriter::close`]
232
+ /// method or an error will be returned.
229
233
pub fn next_row_group ( & mut self ) -> Result < SerializedRowGroupWriter < ' _ , W > > {
230
234
self . assert_previous_writer_closed ( ) ?;
231
235
let ordinal = self . row_group_index ;
@@ -397,7 +401,7 @@ impl<W: Write + Send> SerializedFileWriter<W> {
397
401
/// Writes the given buf bytes to the internal buffer.
398
402
///
399
403
/// This can be used to write raw data to an in-progress parquet file, for
400
- /// example, custom index structures or other payloads. Other parquet readers
404
+ /// example, custom index structures or other payloads. Other parquet readers
401
405
/// will skip this data when reading the files.
402
406
///
403
407
/// It's safe to use this method to write data to the underlying writer,
@@ -409,7 +413,7 @@ impl<W: Write + Send> SerializedFileWriter<W> {
409
413
/// Returns a mutable reference to the underlying writer.
410
414
///
411
415
/// **Warning**: if you write directly to this writer, you will skip
412
- /// the `TrackedWrite` buffering and byte‐counting layers. That’ll cause
416
+ /// the `TrackedWrite` buffering and byte-counting layers, which can cause
413
417
/// the file footer's recorded offsets and sizes to diverge from reality,
414
418
/// resulting in an unreadable or corrupted Parquet file.
415
419
///
@@ -478,6 +482,7 @@ fn write_bloom_filters<W: Write + Send>(
478
482
}
479
483
480
484
/// Parquet row group writer API.
485
+ ///
481
486
/// Provides methods to access column writers in an iterator-like fashion; order is
482
487
/// guaranteed to match the order of schema leaves (column descriptors).
483
488
///
@@ -645,12 +650,21 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
645
650
} )
646
651
}
647
652
648
- /// Append an encoded column chunk from another source without decoding it
653
+ /// Append an encoded column chunk from `reader` directly to the underlying
654
+ /// writer.
655
+ ///
656
+ /// This method can be used for efficiently concatenating or projecting
657
+ /// parquet data, or encoding parquet data to temporary in-memory buffers.
649
658
///
650
- /// This can be used for efficiently concatenating or projecting parquet data,
651
- /// or encoding parquet data to temporary in-memory buffers
659
+ /// Arguments:
660
+ /// - `reader`: a [`ChunkReader`] containing the encoded column data
661
+ /// - `close`: the [`ColumnCloseResult`] metadata returned from closing
662
+ /// the column writer that wrote the data in `reader`.
652
663
///
653
- /// See [`Self::next_column`] for writing data that isn't already encoded
664
+ /// See Also:
665
+ /// 1. [`get_column_writer`] / [`get_column_writers`] for creating writers
666
+ /// that can encode data.
667
+ /// 2. [`Self::next_column`] for writing data that isn't already encoded
654
668
pub fn append_column < R : ChunkReader > (
655
669
& mut self ,
656
670
reader : & R ,
0 commit comments