@@ -14,6 +14,30 @@ import (
1414	"github.com/onflow/flow-go/storage/operation" 
1515)
1616
17+ // ChunkDataPacks manages storage and retrieval of ChunkDataPacks, primarily serving the use case of EXECUTION NODES persisting 
18+ // and indexing chunk data packs for their OWN RESULTS. Essentially, the chunk describes a batch of work to be done, and the 
19+ // chunk data pack describes the result of that work. The storage of chunk data packs is segregated across different 
20+ // storage components for efficiency and modularity reasons: 
21+ //  0. Usually (ignoring the system chunk for a moment), the batch of work is given by the collection referenced in the chunk 
22+ //     data pack. For any chunk data pack being stored, we assume that the executed collection has *previously* been persisted 
23+ //     in [storage.Collections]. It is useful to persist the collections individually, so we can individually retrieve them. 
24+ //  1. The actual chunk data pack itself is stored in a dedicated storage component `cdpStorage`. Note that for this storage 
25+ //     component, no atomicity is required, as we are storing chunk data packs by their collision-resistant hashes, so 
26+ //     different chunk data packs will be stored under different keys. 
27+ //     Theoretically, nodes could persist multiple different (disagreeing) chunk data packs for the same 
28+ //     chunk in this step. However, for efficiency, Execution Nodes only store their own chunk data packs. 
29+ //  2. The index mapping from ChunkID to chunkDataPackID is stored in the protocol database for fast retrieval. 
30+ //     This index is intended to be populated by execution nodes when they commit to a specific result represented by the chunk 
31+ //     data pack. Here, we require atomicity, as an execution node should not be changing / overwriting which chunk data pack 
32+ //     it committed to (during normal operations). 
33+ // 
34+ // Since the executed collections are stored separately (step 0, above), we can just use the collection ID in context of the 
35+ // chunk data pack storage (step 1, above). Therefore, we utilize the reduced representation [storage.StoredChunkDataPack] 
36+ // internally. While this removes redundant data from storage, it takes 3 look-ups to return a chunk data pack by chunk ID: 
37+ // 
38+ //	i. a lookup for chunkID -> chunkDataPackID 
39+ //	ii. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data) 
40+ //	iii. a lookup for CollectionID -> Collection, then reconstruct the chunk data pack from the collection and the StoredChunkDataPack 
1741type  ChunkDataPacks  struct  {
1842	// the protocol DB is used for storing index mappings from chunk ID to chunk data pack ID 
1943	protocolDB  storage.DB 
@@ -27,11 +51,6 @@ type ChunkDataPacks struct {
2751
2852	// cache chunkID -> chunkDataPackID 
2953	chunkIDToChunkDataPackIDCache  * Cache [flow.Identifier , flow.Identifier ]
30- 
31- 	// it takes 3 look ups to return chunk data pack by chunk ID: 
32- 	// 1. a cache lookup for chunkID -> chunkDataPackID 
33- 	// 2. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data) 
34- 	// 3. a lookup for CollectionID -> Collection, then restore the chunk data pack with the collection and the StoredChunkDataPack 
3554}
3655
3756var  _  storage.ChunkDataPacks  =  (* ChunkDataPacks )(nil )
@@ -76,11 +95,20 @@ func NewChunkDataPacks(collector module.CacheMetrics, db storage.DB, stored stor
7695//     chunk data pack (or it will get slashed). This mapping from chunk ID to the ID of the chunk data pack that the Execution Node 
7796//     actually committed to is stored in the protocol database, in the following phase 2. 
7897//   - In the second phase, we populate the index mappings from ChunkID to one "distinguished" chunk data pack ID. This mapping 
79- //     is stored in the protocol database. Typically, en  Execution Node uses this for indexing its own chunk data packs which it 
98+ //     is stored in the protocol database. Typically, an  Execution Node uses this for indexing its own chunk data packs which it 
8099//     publicly committed to. 
81- //   - This function can approximately be described as an atomic operation. When it completes successfully, either both databases 
82- //     have been updated, or neither. However, this is an approximation only, because interim states exist, where the chunk data 
83- //     packs already have been stored in the chunk data pack database, but the index mappings do not yet exist. 
100+ // 
101+ // ATOMICITY: 
102+ // [ChunkDataPacks.Store] executes phase 1 immediately, persisting the chunk data packs in their dedicated database. However, 
103+ // the population of the index mappings in phase 2 is deferred to the caller, who must invoke the returned functor to perform phase 2. This 
104+ // approach has the following benefits: 
105+ //   - Our API reflects that we are writing to two different databases here, with the chunk data pack database containing largely 
106+ //     specialized data subject to pruning. In contrast, the protocol database persists the commitments a node makes (subject to 
107+ //     slashing). The caller receives the ability to persist this commitment in the form of the returned functor. The functor 
108+ //     may be discarded by the caller without corrupting the state (if anything, we have just stored some additional chunk data 
109+ //     packs). 
110+ //   - The serialization and storage of the comparatively large chunk data packs is separated from the protocol database writes. 
111+ //   - The locking duration of the protocol database is reduced. 
84112// 
85113// The Store method returns: 
86114//   - func(lctx lockctx.Proof, rw storage.ReaderBatchWriter) error: Function for populating the index mapping from chunkID 
@@ -133,7 +161,8 @@ func (ch *ChunkDataPacks) Store(cs []*flow.ChunkDataPack) (
133161		return  nil 
134162	}
135163
136- 	// Return the function that completes the storage process 
164+ 	// Returned Functor: when invoked, will add the deferred storage operations to the provided ReaderBatchWriter 
165+ 	// NOTE: until this functor is called, only the chunk data packs are stored by their respective IDs. 
137166	return  storeChunkDataPacksFunc , nil 
138167}
139168
@@ -242,7 +271,7 @@ func (ch *ChunkDataPacks) ByChunkID(chunkID flow.Identifier) (*flow.ChunkDataPac
242271		return  nil , fmt .Errorf ("cannot retrieve stored chunk data pack %x for chunk %x: %w" , chunkDataPackID , chunkID , err )
243272	}
244273
245- 	var  collection  * flow.Collection 
274+ 	var  collection  * flow.Collection   // nil by default, which only represents system chunk 
246275	if  schdp .CollectionID  !=  flow .ZeroID  {
247276		collection , err  =  ch .collections .ByID (schdp .CollectionID )
248277		if  err  !=  nil  {
0 commit comments