Skip to content

Commit 7979348

Browse files
authored
Merge pull request #8038 from onflow/alex/suggestions2_stored-chunk-data-pack
Suggested documentation extensions for Chunk Data Pack PR #7983
2 parents 28a3fae + c59b4de commit 7979348

File tree

8 files changed

+101
-33
lines changed

8 files changed

+101
-33
lines changed

engine/execution/state/state.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,12 @@ func (s *state) saveExecutionResults(
413413
return fmt.Errorf("can not retrieve chunk data packs: %w", err)
414414
}
415415

416+
// Within the following `Store` call, the chunk data packs themselves are going to be persisted into their
417+
// dedicated database. However, we have not yet persisted that this execution node is committing to the
418+
// result represented by the chunk data packs. Populating the index from chunk ID to chunk data pack ID
419+
// in the protocol database (signifying the node's slashable commitment to the respective result) is
420+
// done by the functor returned by `Store`. The functor's is invoked as part of the atomic batch update
421+
// of the protocol database below.
416422
storeFunc, err := s.chunkDataPacks.Store(chunks)
417423
if err != nil {
418424
return fmt.Errorf("can not store chunk data packs for block ID: %v: %w", blockID, err)
@@ -427,9 +433,11 @@ func (s *state) saveExecutionResults(
427433
// This design guarantees consistency in two scenarios:
428434
//
429435
// Case 1: If the batch update is interrupted, the mapping has not yet been saved.
430-
// Later, if we attempt to store another execution result that references a
431-
// different chunk data pack but the same chunk ID, there is no conflict,
432-
// because no previous mapping exists.
436+
// Later, if we attempt to store another execution result that references a different
437+
// chunk data pack but the same chunk ID, there is no conflict, because no previous mapping
438+
// exists. By convention, a node should only share information once it has persisted its
439+
// commitment in the database. Therefore, if the database write was interrupted, none of the
440+
// information is stored and no binding commitment to a different result could have been made.
433441
//
434442
// Case 2: If the batch update succeeds, the mapping is saved. Later, if we
435443
// attempt to store another execution result that references a different
@@ -484,7 +492,6 @@ func (s *state) saveExecutionResults(
484492
return nil
485493
})
486494
})
487-
488495
}
489496

490497
func (s *state) UpdateLastExecutedBlock(ctx context.Context, executedID flow.Identifier) error {

model/flow/chunk.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,11 @@ func (ch *Chunk) ID() Identifier {
129129
return MakeID(ch)
130130
}
131131

132+
// ChunkDataPackHeader is a reduced representation of ChunkDataPack. In a nutshell, we substitute
133+
// the larger [ChunkDataPack.Proof] and [ChunkDataPack.Collection] with their collision-resistant hashes.
134+
// Note, ChunkDataPackHeader.ID() is the same as ChunkDataPack.ID().
135+
//
136+
//structwrite:immutable - mutations allowed only within the constructor
132137
type ChunkDataPackHeader struct {
133138
ChunkID Identifier // ID of the chunk this data pack is for
134139
StartState StateCommitment // commitment for starting state
@@ -141,6 +146,8 @@ type ChunkDataPackHeader struct {
141146
ExecutionDataRoot BlockExecutionDataRoot
142147
}
143148

149+
// NewChunkDataPackHeader instantiates an "immutable" ChunkDataPackHeader.
150+
// The `CollectionID` field is set to [flow.ZeroID] for system chunks.
144151
func NewChunkDataPackHeader(ChunkID Identifier, StartState StateCommitment, ProofID Identifier, CollectionID Identifier, ExecutionDataRoot BlockExecutionDataRoot) *ChunkDataPackHeader {
145152
return &ChunkDataPackHeader{
146153
ChunkID: ChunkID,
@@ -197,8 +204,9 @@ type ChunkDataPack struct {
197204
// a trusted ChunkDataPack using NewChunkDataPack constructor.
198205
type UntrustedChunkDataPack ChunkDataPack
199206

200-
// NewChunkDataPack returns an initialized chunk data pack.
201-
// Construction ChunkDataPack allowed only within the constructor.
207+
// FromUntrustedChunkDataPack converts a chunk data pack from an untrusted source
208+
// into its canonical representation. Here, basic structural validation is performed.
209+
// Construction of ChunkDataPacks is ONLY allowed via THIS CONSTRUCTOR.
202210
//
203211
// All errors indicate a valid ChunkDataPack cannot be constructed from the input.
204212
func FromUntrustedChunkDataPack(untrusted UntrustedChunkDataPack) (*ChunkDataPack, error) {
@@ -231,6 +239,8 @@ func FromUntrustedChunkDataPack(untrusted UntrustedChunkDataPack) (*ChunkDataPac
231239
), nil
232240
}
233241

242+
// NewChunkDataPack instantiates an "immutable" ChunkDataPack.
243+
// The `collection` field is set to `nil` for system chunks.
234244
func NewChunkDataPack(chunkID Identifier, startState StateCommitment, proof StorageProof, collection *Collection, executionDataRoot BlockExecutionDataRoot) *ChunkDataPack {
235245
return &ChunkDataPack{
236246
ChunkID: chunkID,

storage/chunk_data_packs.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,20 @@ type ChunkDataPacks interface {
2121
// chunk data pack (or it will get slashed). This mapping from chunk ID to the ID of the chunk data pack that the Execution Node
2222
// actually committed to is stored in the protocol database, in the following phase 2.
2323
// - In the second phase, we populate the index mappings from ChunkID to one "distinguished" chunk data pack ID. This mapping
24-
// is stored in the protocol database. Typically, en Execution Node uses this for indexing its own chunk data packs which it
24+
// is stored in the protocol database. Typically, an Execution Node uses this for indexing its own chunk data packs which it
2525
// publicly committed to.
26-
// - This function can approximately be described as an atomic operation. When it completes successfully, either both databases
27-
// have been updated, or neither. However, this is an approximation only, because interim states exist, where the chunk data
28-
// packs already have been stored in the chunk data pack database, but the index mappings do not yet exist.
26+
//
27+
// ATOMICITY:
28+
// [ChunkDataPacks.Store] executes phase 1 immediately, persisting the chunk data packs in their dedicated database. However,
29+
// the index mappings in phase 2 is deferred to the caller, who must invoke the returned functor to perform phase 2. This
30+
// approach has the following benefits:
31+
// - Our API reflects that we are writing to two different databases here, with the chunk data pack database containing largely
32+
// specialized data subject to pruning. In contrast, the protocol database persists the commitments a node make (subject to
33+
// slashing). The caller receives the ability to persist this commitment in the form of the returned functor. The functor
34+
// may be discarded by the caller without corrupting the state (if anything, we have just stored some additional chunk data
35+
// packs).
36+
// - The serialization and storage of the comparatively large chunk data packs is separated from the protocol database writes.
37+
// - The locking duration of the protocol database is reduced.
2938
//
3039
// The Store method returns:
3140
// - func(lctx lockctx.Proof, rw storage.ReaderBatchWriter) error: Function for populating the index mapping from chunkID

storage/chunk_data_packs_stored.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,21 @@ type StoredChunkDataPacks interface {
2929
BatchRemove(chunkDataPackIDs []flow.Identifier, rw ReaderBatchWriter) error
3030
}
3131

32-
// StoredChunkDataPack is an in-storage representation of chunk data pack.
33-
// Its prime difference is instead of an actual collection, it keeps a collection ID hence relying on maintaining
34-
// the collection on a secondary storage.
32+
// StoredChunkDataPack is an in-storage representation of chunk data pack. Its prime difference is instead of an
33+
// actual collection, it keeps a collection ID hence relying on maintaining the collection on a secondary storage.
34+
// Note, StoredChunkDataPack.ID() is the same as ChunkDataPack.ID()
3535
//
3636
//structwrite:immutable - mutations allowed only within the constructor
3737
type StoredChunkDataPack struct {
3838
ChunkID flow.Identifier
3939
StartState flow.StateCommitment
4040
Proof flow.StorageProof
41-
CollectionID flow.Identifier
41+
CollectionID flow.Identifier // flow.ZeroID for system chunks
4242
ExecutionDataRoot flow.BlockExecutionDataRoot
4343
}
4444

45+
// NewStoredChunkDataPack instantiates an "immutable" [StoredChunkDataPack].
46+
// The `collectionID` field is set to [flow.ZeroID] for system chunks.
4547
func NewStoredChunkDataPack(
4648
chunkID flow.Identifier,
4749
startState flow.StateCommitment,
@@ -58,16 +60,18 @@ func NewStoredChunkDataPack(
5860
}
5961
}
6062

63+
// IsSystemChunk returns true if this chunk data pack is for a system chunk.
6164
func (s *StoredChunkDataPack) IsSystemChunk() bool {
6265
return s.CollectionID == flow.ZeroID
6366
}
6467

68+
// ToStoredChunkDataPack converts the given Chunk Data Pack to its reduced representation.
69+
// (Collections are stored separately and don't need to be included again here).
6570
func ToStoredChunkDataPack(c *flow.ChunkDataPack) *StoredChunkDataPack {
6671
collectionID := flow.ZeroID
6772
if c.Collection != nil {
6873
collectionID = c.Collection.ID()
6974
}
70-
7175
return NewStoredChunkDataPack(
7276
c.ChunkID,
7377
c.StartState,
@@ -77,7 +81,9 @@ func ToStoredChunkDataPack(c *flow.ChunkDataPack) *StoredChunkDataPack {
7781
)
7882
}
7983

80-
func ToStoredChunkDataPacks(cs []*flow.ChunkDataPack) []*StoredChunkDataPack { // ToStoredChunkDataPack converts the given ChunkDataPacks to their reduced representation,
84+
// ToStoredChunkDataPacks converts the given Chunk Data Packs to their reduced representation.
85+
// (Collections are stored separately and don't need to be included again here).
86+
func ToStoredChunkDataPacks(cs []*flow.ChunkDataPack) []*StoredChunkDataPack {
8187
scs := make([]*StoredChunkDataPack, 0, len(cs))
8288
for _, c := range cs {
8389
scs = append(scs, ToStoredChunkDataPack(c))

storage/operation/chunk_data_packs.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ import (
1010
"github.com/onflow/flow-go/storage"
1111
)
1212

13-
// IndexChunkDataPackByChunkID inserts a mapping from chunk ID to stored chunk data pack ID.
14-
// It requires the [storage.LockInsertOwnReceipt] lock to be held by the caller.
13+
// IndexChunkDataPackByChunkID inserts a mapping from chunk ID to stored chunk data pack ID. It requires
14+
// the [storage.LockInsertOwnReceipt] lock to be acquired by the caller and held until the write batch has been committed.
1515
// Returns [storage.ErrDataMismatch] if a different chunk data pack ID already exists for the given chunk ID.
1616
func IndexChunkDataPackByChunkID(lctx lockctx.Proof, rw storage.ReaderBatchWriter, chunkID flow.Identifier, chunkDataPackID flow.Identifier) error {
1717
if !lctx.HoldsLock(storage.LockInsertOwnReceipt) {
@@ -35,7 +35,7 @@ func IndexChunkDataPackByChunkID(lctx lockctx.Proof, rw storage.ReaderBatchWrite
3535
}
3636

3737
// RetrieveChunkDataPackID retrieves the stored chunk data pack ID for a given chunk ID.
38-
// Returns [storage.ErrNotFound] if no mapping exists for the given chunk ID.
38+
// Returns [storage.ErrNotFound] if no chunk data pack has been indexed as result for the given chunk ID.
3939
func RetrieveChunkDataPackID(r storage.Reader, chunkID flow.Identifier, chunkDataPackID *flow.Identifier) error {
4040
return RetrieveByKey(r, MakePrefix(codeIndexChunkDataPackByChunkID, chunkID), chunkDataPackID)
4141
}
@@ -47,14 +47,18 @@ func RemoveChunkDataPackID(w storage.Writer, chunkID flow.Identifier) error {
4747
}
4848

4949
// InsertStoredChunkDataPack inserts a [storage.StoredChunkDataPack] into the database, keyed by its own ID.
50-
// The caller must ensure the chunkDataPackID is the same as c.ID().
50+
//
51+
// CAUTION: The caller must ensure `storeChunkDataPackID` is the same as `c.ID()`, ie. a collision-resistant
52+
// hash of the chunk data pack! This method silently overrides existing data, which is safe only if for the
53+
// same key, we always write the same value.
54+
//
5155
// No error returns expected during normal operations.
5256
func InsertStoredChunkDataPack(rw storage.ReaderBatchWriter, storeChunkDataPackID flow.Identifier, c *storage.StoredChunkDataPack) error {
5357
return UpsertByKey(rw.Writer(), MakePrefix(codeChunkDataPack, storeChunkDataPackID), c)
5458
}
5559

5660
// RetrieveStoredChunkDataPack retrieves a chunk data pack by stored chunk data pack ID.
57-
// It returns [storage.ErrNotFound] if the chunk data pack is not found
61+
// It returns [storage.ErrNotFound] if no chunk data pack with the given ID is known.
5862
func RetrieveStoredChunkDataPack(r storage.Reader, storeChunkDataPackID flow.Identifier, c *storage.StoredChunkDataPack) error {
5963
return RetrieveByKey(r, MakePrefix(codeChunkDataPack, storeChunkDataPackID), c)
6064
}

storage/store/chunk_data_packs.go

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,30 @@ import (
1414
"github.com/onflow/flow-go/storage/operation"
1515
)
1616

17+
// ChunkDataPacks manages storage and retrieval of ChunkDataPacks, primarily serving the use case of EXECUTION NODES persisting
18+
// and indexing chunk data packs for their OWN RESULTS. Essentially, the chunk describes a batch of work to be done, and the
19+
// chunk data pack describes the result of that work. The storage of chunk data packs is segregated across different
20+
// storage components for efficiency and modularity reasons:
21+
// 0. Usually (ignoring the system chunk for a moment), the batch of work is given by the collection referenced in the chunk
22+
// data pack. For any chunk data pack being stored, we assume that the executed collection has *previously* been persisted
23+
// in [storage.Collections]. It is useful to persist the collections individually, so we can individually retrieve them.
24+
// 1. The actual chunk data pack itself is stored in a dedicated storage component `cdpStorage`. Note that for this storage
25+
// component, no atomicity is required, as we are storing chunk data packs by their collision-resistant hashes, so
26+
// different chunk data packs will be stored under different keys.
27+
// Theoretically, nodes could store persist multiple different (disagreeing) chunk data packs for the same
28+
// chunk in this step. However, for efficiency, Execution Nodes only store their own chunk data packs.
29+
// 2. The index mapping from ChunkID to chunkDataPackID is stored in the protocol database for fast retrieval.
30+
// This index is intended to be populated by execution nodes when they commit to a specific result represented by the chunk
31+
// data pack. Here, we require atomicity, as an execution node should not be changing / overwriting which chunk data pack
32+
// it committed to (during normal operations).
33+
//
34+
// Since the executed collections are stored separately (step 0, above), we can just use the collection ID in context of the
35+
// chunk data pack storage (step 1, above). Therefore, we utilize the reduced representation [storage.StoredChunkDataPack]
36+
// internally. While removing redundant data from storage, it takes 3 look-ups to return chunk data pack by chunk ID:
37+
//
38+
// i. a lookup for chunkID -> chunkDataPackID
39+
// ii. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data)
40+
// iii. a lookup for CollectionID -> Collection, then reconstruct the chunk data pack from the collection and the StoredChunkDataPack
1741
type ChunkDataPacks struct {
1842
// the protocol DB is used for storing index mappings from chunk ID to chunk data pack ID
1943
protocolDB storage.DB
@@ -27,11 +51,6 @@ type ChunkDataPacks struct {
2751

2852
// cache chunkID -> chunkDataPackID
2953
chunkIDToChunkDataPackIDCache *Cache[flow.Identifier, flow.Identifier]
30-
31-
// it takes 3 look ups to return chunk data pack by chunk ID:
32-
// 1. a cache lookup for chunkID -> chunkDataPackID
33-
// 2. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data)
34-
// 3. a lookup for CollectionID -> Collection, then restore the chunk data pack with the collection and the StoredChunkDataPack
3554
}
3655

3756
var _ storage.ChunkDataPacks = (*ChunkDataPacks)(nil)
@@ -76,11 +95,20 @@ func NewChunkDataPacks(collector module.CacheMetrics, db storage.DB, stored stor
7695
// chunk data pack (or it will get slashed). This mapping from chunk ID to the ID of the chunk data pack that the Execution Node
7796
// actually committed to is stored in the protocol database, in the following phase 2.
7897
// - In the second phase, we populate the index mappings from ChunkID to one "distinguished" chunk data pack ID. This mapping
79-
// is stored in the protocol database. Typically, en Execution Node uses this for indexing its own chunk data packs which it
98+
// is stored in the protocol database. Typically, an Execution Node uses this for indexing its own chunk data packs which it
8099
// publicly committed to.
81-
// - This function can approximately be described as an atomic operation. When it completes successfully, either both databases
82-
// have been updated, or neither. However, this is an approximation only, because interim states exist, where the chunk data
83-
// packs already have been stored in the chunk data pack database, but the index mappings do not yet exist.
100+
//
101+
// ATOMICITY:
102+
// [ChunkDataPacks.Store] executes phase 1 immediately, persisting the chunk data packs in their dedicated database. However,
103+
// the index mappings in phase 2 is deferred to the caller, who must invoke the returned functor to perform phase 2. This
104+
// approach has the following benefits:
105+
// - Our API reflects that we are writing to two different databases here, with the chunk data pack database containing largely
106+
// specialized data subject to pruning. In contrast, the protocol database persists the commitments a node make (subject to
107+
// slashing). The caller receives the ability to persist this commitment in the form of the returned functor. The functor
108+
// may be discarded by the caller without corrupting the state (if anything, we have just stored some additional chunk data
109+
// packs).
110+
// - The serialization and storage of the comparatively large chunk data packs is separated from the protocol database writes.
111+
// - The locking duration of the protocol database is reduced.
84112
//
85113
// The Store method returns:
86114
// - func(lctx lockctx.Proof, rw storage.ReaderBatchWriter) error: Function for populating the index mapping from chunkID
@@ -133,7 +161,8 @@ func (ch *ChunkDataPacks) Store(cs []*flow.ChunkDataPack) (
133161
return nil
134162
}
135163

136-
// Return the function that completes the storage process
164+
// Returned Functor: when invoked, will add the deferred storage operations to the provided ReaderBatchWriter
165+
// NOTE: until this functor is called, only the chunk data packs are stored by their respective IDs.
137166
return storeChunkDataPacksFunc, nil
138167
}
139168

@@ -242,7 +271,7 @@ func (ch *ChunkDataPacks) ByChunkID(chunkID flow.Identifier) (*flow.ChunkDataPac
242271
return nil, fmt.Errorf("cannot retrieve stored chunk data pack %x for chunk %x: %w", chunkDataPackID, chunkID, err)
243272
}
244273

245-
var collection *flow.Collection
274+
var collection *flow.Collection // nil by default, which only represents system chunk
246275
if schdp.CollectionID != flow.ZeroID {
247276
collection, err = ch.collections.ByID(schdp.CollectionID)
248277
if err != nil {

storage/store/chunk_data_packs_stored.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ import (
1010
"github.com/onflow/flow-go/storage/operation"
1111
)
1212

13+
// StoredChunkDataPacks represents persistent storage for chunk data packs.
14+
// It works with the reduced representation `StoredChunkDataPack` for chunk data packs,
15+
// where instead of the full collection data, only the collection's hash (ID) is contained.
1316
type StoredChunkDataPacks struct {
1417
db storage.DB
1518
byIDCache *Cache[flow.Identifier, *storage.StoredChunkDataPack]

storage/store/chunk_data_packs_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ func TestChunkDataPacks_Store(t *testing.T) {
4141
require.Equal(t, chunkDataPack, stored, "mismatched chunk data pack at index %d", i)
4242
}
4343

44-
// Store again is idemopotent
44+
// Store again is idempotent
4545
storeFunc, err = chunkDataPackStore.Store(chunkDataPacks)
4646
if err != nil {
4747
return err

0 commit comments

Comments
 (0)