Skip to content

Commit cac5bd5

Browse files
authored
Do not send duplicate files in shards (#1840)
cc @assafvayner — not sure whether this is covered by the Xet spec or not
1 parent 73bb2da commit cac5bd5

File tree

1 file changed

+6
-0
lines changed

1 file changed

+6
-0
lines changed

packages/hub/src/utils/createXorbs.ts

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,7 @@ export async function* createXorbs(
108108
void,
109109
undefined
110110
> {
111+
const alreadyDoneFileSha256s: Set<string> = new Set();
111112
const chunkModule = await import("../vendor/xet-chunk/chunker_wasm");
112113
let xorbId = 0;
113114

@@ -147,6 +148,11 @@ export async function* createXorbs(
147148
const remoteXorbHashes: string[] = [""]; // starts at index 1 (to simplify implem a bit)
148149

149150
for await (const fileSource of fileSources) {
151+
if (alreadyDoneFileSha256s.has(fileSource.sha256)) {
152+
continue;
153+
}
154+
alreadyDoneFileSha256s.add(fileSource.sha256);
155+
150156
const chunker = new chunkModule.Chunker(TARGET_CHUNK_SIZE);
151157
try {
152158
xorb.fileSize[fileSource.path] = fileSource.content.size;

0 commit comments

Comments
 (0)