Commit f4b55af

Docs: Add more comments to the Parquet writer code
1 parent 1f77ac5 commit f4b55af

3 files changed: +37 -19 lines

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 2 additions & 2 deletions
@@ -908,7 +908,7 @@ impl ArrowRowGroupWriterFactory {
     }
 }
 
-/// Returns the [`ArrowColumnWriter`] for a given schema
+/// Returns [`ArrowColumnWriter`]s for each column in a given schema
 pub fn get_column_writers(
     parquet: &SchemaDescriptor,
     props: &WriterPropertiesPtr,
@@ -1008,7 +1008,7 @@ impl ArrowColumnWriterFactory {
         Ok(Box::<ArrowPageWriter>::default())
     }
 
-    /// Gets the [`ArrowColumnWriter`] for the given `data_type`
+    /// Gets an [`ArrowColumnWriter`] for the given `data_type`
     fn get_arrow_column_writer(
         &self,
         data_type: &ArrowDataType,
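
The updated comment describes `get_column_writers`, which returns one `ArrowColumnWriter` per leaf column of the Arrow schema. Below is a minimal sketch of that workflow (not part of this commit); the schema-conversion step via `ArrowSchemaConverter` and the `compute_leaves` helper are assumptions about the surrounding parquet crate API rather than something shown in this diff.

use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array};
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_writer::{compute_leaves, get_column_writers};
use parquet::arrow::ArrowSchemaConverter;
use parquet::errors::Result;
use parquet::file::properties::WriterProperties;

fn encode_one_column() -> Result<()> {
    // A single non-nullable Int32 leaf column
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)]));
    let props = Arc::new(WriterProperties::default());
    let parquet_schema = ArrowSchemaConverter::new().convert(&schema)?;

    // One ArrowColumnWriter is returned per leaf column in the schema
    let mut writers = get_column_writers(&parquet_schema, &props, &schema)?;
    assert_eq!(writers.len(), 1);

    // Feed Arrow data to the matching writer, then close it to obtain
    // the encoded column chunk
    let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    for leaf in compute_leaves(schema.field(0), &array)? {
        writers[0].write(&leaf)?;
    }
    let _chunk = writers.pop().unwrap().close()?;
    Ok(())
}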

parquet/src/column/writer/mod.rs

Lines changed: 8 additions & 4 deletions
@@ -64,6 +64,8 @@ macro_rules! downcast_writer {
 }
 
 /// Column writer for a Parquet type.
+///
+/// See [`get_column_writer`] to create instances of this type
 pub enum ColumnWriter<'a> {
     /// Column writer for boolean type
     BoolColumnWriter(ColumnWriterImpl<'a, BoolType>),
@@ -96,13 +98,13 @@ impl ColumnWriter<'_> {
         downcast_writer!(self, typed, typed.get_estimated_total_bytes())
     }
 
-    /// Close this [`ColumnWriter`]
+    /// Close this [`ColumnWriter`], returning the metadata for the column chunk.
     pub fn close(self) -> Result<ColumnCloseResult> {
         downcast_writer!(self, typed, typed.close())
     }
 }
 
-/// Gets a specific column writer corresponding to column descriptor `descr`.
+/// Create a specific column writer corresponding to column descriptor `descr`.
 pub fn get_column_writer<'a>(
     descr: ColumnDescPtr,
     props: WriterPropertiesPtr,
@@ -173,7 +175,9 @@ pub fn get_typed_column_writer_mut<'a, 'b: 'a, T: DataType>(
     })
 }
 
-/// Metadata returned by [`GenericColumnWriter::close`]
+/// Metadata for a column chunk of a parquet file.
+///
+/// Note this structure is returned by [`ColumnWriter::close`].
 #[derive(Debug, Clone)]
 pub struct ColumnCloseResult {
     /// The total number of bytes written
@@ -316,7 +320,7 @@ impl<T: Default> ColumnMetrics<T> {
 /// Typed column writer for a primitive column.
 pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a, ColumnValueEncoderImpl<T>>;
 
-/// Generic column writer for a primitive column.
+/// Generic column writer for a primitive Parquet column
 pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
     // Column writer properties
     descr: ColumnDescPtr,
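
The new comments point out that `get_column_writer` creates the `ColumnWriter` variant matching a column's physical type, and that closing it yields the `ColumnCloseResult` metadata for the column chunk. A short sketch (not from this commit) of downcasting a `ColumnWriter` to its typed INT32 form, writing a batch, and closing it:

use parquet::column::writer::{get_typed_column_writer, ColumnCloseResult, ColumnWriter};
use parquet::data_type::Int32Type;
use parquet::errors::Result;

// Takes a ColumnWriter (e.g. produced by get_column_writer for an INT32
// column), writes a batch of values, and returns the column chunk
// metadata produced when the writer is closed.
fn write_i32_column(writer: ColumnWriter<'_>, values: &[i32]) -> Result<ColumnCloseResult> {
    // Panics if the writer's physical type is not INT32
    let mut typed = get_typed_column_writer::<Int32Type>(writer);
    typed.write_batch(values, None, None)?;
    // close() returns the ColumnCloseResult described above
    typed.close()
}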

parquet/src/file/writer.rs

Lines changed: 27 additions & 13 deletions
@@ -15,8 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Contains file writer API, and provides methods to write row groups and columns by
-//! using row group writers and column writers respectively.
+//! [`SerializedFileWriter`]: Low level Parquet writer API
 
 use crate::bloom_filter::Sbbf;
 use crate::format as parquet;
@@ -139,7 +138,10 @@ pub type OnCloseRowGroup<'a, W> = Box<
 // Serialized impl for file & row group writers
 
 /// Parquet file writer API.
-/// Provides methods to write row groups sequentially.
+///
+/// This is a low level API for writing Parquet files directly, and handles
+/// tracking the file structures, including row groups and column chunks,
+/// and writing the file footer.
 ///
 /// The main workflow should be as following:
 /// - Create file writer, this will open a new file and potentially write some metadata.
@@ -221,11 +223,13 @@ impl<W: Write + Send> SerializedFileWriter<W> {
     }
 
     /// Creates new row group from this file writer.
-    /// In case of IO error or Thrift error, returns `Err`.
     ///
-    /// There can be at most 2^15 row groups in a file; and row groups have
-    /// to be written sequentially. Every time the next row group is requested, the
-    /// previous row group must be finalised and closed using `RowGroupWriter::close` method.
+    /// Note: Parquet files are limited to at most 2^15 row groups in a file; and row groups must
+    /// be written sequentially.
+    ///
+    /// Every time the next row group is requested, the previous row group must
+    /// be finalised and closed using the [`SerializedRowGroupWriter::close`]
+    /// method or an error will be returned.
     pub fn next_row_group(&mut self) -> Result<SerializedRowGroupWriter<'_, W>> {
         self.assert_previous_writer_closed()?;
         let ordinal = self.row_group_index;
@@ -397,7 +401,7 @@ impl<W: Write + Send> SerializedFileWriter<W> {
     /// Writes the given buf bytes to the internal buffer.
     ///
     /// This can be used to write raw data to an in-progress parquet file, for
-    /// example, custom index structures or other payloads. Other parquet readers
+    /// example, custom index structures or other payloads. Other parquet readers
     /// will skip this data when reading the files.
     ///
     /// It's safe to use this method to write data to the underlying writer,
@@ -409,7 +413,7 @@ impl<W: Write + Send> SerializedFileWriter<W> {
     /// Returns a mutable reference to the underlying writer.
     ///
     /// **Warning**: if you write directly to this writer, you will skip
-    /// the `TrackedWrite` buffering and byte‐counting layers. That’ll cause
+    /// the `TrackedWrite` buffering and byte‐counting layers, which can cause
     /// the file footer’s recorded offsets and sizes to diverge from reality,
     /// resulting in an unreadable or corrupted Parquet file.
     ///
@@ -478,6 +482,7 @@ fn write_bloom_filters<W: Write + Send>(
 }
 
 /// Parquet row group writer API.
+///
 /// Provides methods to access column writers in an iterator-like fashion, order is
 /// guaranteed to match the order of schema leaves (column descriptors).
 ///
@@ -645,12 +650,21 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
         })
     }
 
-    /// Append an encoded column chunk from another source without decoding it
+    /// Append an encoded column chunk from `reader` directly to the underlying
+    /// writer.
+    ///
+    /// This method can be used for efficiently concatenating or projecting
+    /// parquet data, or encoding parquet data to temporary in-memory buffers.
     ///
-    /// This can be used for efficiently concatenating or projecting parquet data,
-    /// or encoding parquet data to temporary in-memory buffers
+    /// Arguments:
+    /// - `reader`: a [`ChunkReader`] containing the encoded column data
+    /// - `close`: the [`ColumnCloseResult`] metadata returned from closing
+    ///   the column writer that wrote the data in `reader`.
     ///
-    /// See [`Self::next_column`] for writing data that isn't already encoded
+    /// See Also:
+    /// 1. [`get_column_writer`] / [`get_column_writers`] for creating writers
+    ///    that can encode data.
+    /// 2. [`Self::next_column`] for writing data that isn't already encoded
     pub fn append_column<R: ChunkReader>(
         &mut self,
         reader: &R,
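
Taken together, the updated comments describe the low level write path: create a `SerializedFileWriter`, request row groups sequentially (closing each before requesting the next), write each column in schema-leaf order, and finish the file footer. A minimal sketch of that workflow (not part of this commit), using a single INT32 column and an in-memory buffer:

use std::sync::Arc;

use parquet::data_type::Int32Type;
use parquet::errors::Result;
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;
use parquet::schema::parser::parse_message_type;

fn write_two_row_groups() -> Result<()> {
    let schema = Arc::new(parse_message_type("message schema { REQUIRED INT32 x; }")?);
    let props = Arc::new(WriterProperties::default());
    let buffer: Vec<u8> = Vec::new();
    let mut writer = SerializedFileWriter::new(buffer, schema, props)?;

    // Row groups must be written sequentially: each one is closed before
    // the next call to next_row_group()
    for batch in [&[1i32, 2, 3][..], &[4, 5][..]] {
        let mut row_group = writer.next_row_group()?;
        // Columns are visited in schema-leaf order
        while let Some(mut column) = row_group.next_column()? {
            column.typed::<Int32Type>().write_batch(batch, None, None)?;
            column.close()?;
        }
        row_group.close()?;
    }

    // Writing the footer finalizes the file
    writer.close()?;
    Ok(())
}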

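The expanded `append_column` comment spells out its two arguments: a `ChunkReader` holding the already-encoded pages, and the `ColumnCloseResult` produced when that chunk was originally written. A sketch of the call shape (not from this commit; how the encoded buffer and its metadata were produced is out of scope here):

use std::io::Write;

use bytes::Bytes;
use parquet::column::writer::ColumnCloseResult;
use parquet::errors::Result;
use parquet::file::writer::SerializedRowGroupWriter;

// Appends a column chunk that was already encoded elsewhere (for example
// into a temporary in-memory buffer). The bytes are copied into the target
// file without being decoded or re-encoded.
fn append_encoded_chunk<W: Write + Send>(
    row_group: &mut SerializedRowGroupWriter<'_, W>,
    encoded: Bytes,           // raw pages for one column chunk (Bytes implements ChunkReader)
    close: ColumnCloseResult, // metadata returned when that chunk's writer was closed
) -> Result<()> {
    row_group.append_column(&encoded, close)
}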