3636import org .apache .logging .log4j .Logger ;
3737import org .opensearch .Version ;
3838import org .opensearch .cluster .ClusterInfo ;
39+ import org .opensearch .cluster .DiskUsage ;
3940import org .opensearch .cluster .routing .RoutingNode ;
4041import org .opensearch .cluster .routing .ShardRouting ;
4142import org .opensearch .cluster .routing .allocation .DiskThresholdEvaluator ;
4950import org .opensearch .index .store .remote .filecache .FileCacheSettings ;
5051
5152import java .util .List ;
53+ import java .util .Map ;
5254import java .util .stream .Collectors ;
5355import java .util .stream .StreamSupport ;
5456
@@ -111,11 +113,14 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
111113 return Decision .ALWAYS ;
112114 }
113115
114- final Decision decision = earlyTerminate (node , allocation );
116+ ClusterInfo clusterInfo = allocation .clusterInfo ();
117+ Map <String , DiskUsage > usages = clusterInfo .getNodeMostAvailableDiskUsages ();
118+ final Decision decision = earlyTerminate (node , allocation , usages );
115119 if (decision != null ) {
116120 return decision ;
117121 }
118122
123+ DiskUsage usage = usages .get (node .nodeId ());
119124 final long shardSize = DiskThresholdDecider .getExpectedShardSize (
120125 shardRouting ,
121126 0L ,
@@ -125,18 +130,21 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
125130 allocation .routingTable ()
126131 );
127132
128- final long totalAddressableSpace = calculateTotalAddressableSpace (node , allocation );
129- final long currentNodeRemoteShardSize = calculateCurrentNodeRemoteShardSize (node , allocation , false );
130- final long freeSpace = Math .max (totalAddressableSpace - currentNodeRemoteShardSize , 0 );
131- final long freeSpaceAfterAllocation = Math .max (freeSpace - shardSize , 0 );
132- final long freeSpaceLowThreshold = diskThresholdEvaluator .getFreeSpaceLowThreshold (totalAddressableSpace );
133+ final DiskUsage usageAfterShardAssigned = new DiskUsage (
134+ usage .getNodeId (),
135+ usage .getNodeName (),
136+ usage .getPath (),
137+ usage .getTotalBytes (),
138+ Math .max (0 , usage .getFreeBytes () - shardSize )
139+ );
140+ final long freeSpaceLowThreshold = diskThresholdEvaluator .getFreeSpaceLowThreshold (usage .getTotalBytes ());
133141
134142 final ByteSizeValue freeSpaceLowThresholdInByteSize = new ByteSizeValue (freeSpaceLowThreshold );
135- final ByteSizeValue freeSpaceInByteSize = new ByteSizeValue (freeSpace );
136- final ByteSizeValue freeSpaceAfterAllocationInByteSize = new ByteSizeValue (freeSpaceAfterAllocation );
143+ final ByteSizeValue freeSpaceInByteSize = new ByteSizeValue (usage . getFreeBytes () );
144+ final ByteSizeValue freeSpaceAfterAllocationInByteSize = new ByteSizeValue (usageAfterShardAssigned . getFreeBytes () );
137145 final ByteSizeValue shardSizeInByteSize = new ByteSizeValue (shardSize );
138146
139- if (freeSpaceAfterAllocation < freeSpaceLowThreshold ) {
147+ if (diskThresholdEvaluator . isNodeExceedingLowWatermark ( usageAfterShardAssigned ) ) {
140148 logger .warn (
141149 "after allocating [{}] node [{}] would have less than the required threshold of "
142150 + "{} free (currently {} free, estimated shard size is {}), preventing allocation" ,
@@ -180,21 +188,29 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl
180188 return Decision .ALWAYS ;
181189 }
182190
183- final Decision decision = earlyTerminate (node , allocation );
191+ ClusterInfo clusterInfo = allocation .clusterInfo ();
192+ Map <String , DiskUsage > usages = clusterInfo .getNodeMostAvailableDiskUsages ();
193+ final Decision decision = earlyTerminate (node , allocation , usages );
184194 if (decision != null ) {
185195 return decision ;
186196 }
187197
188- final long totalAddressableSpace = calculateTotalAddressableSpace (node , allocation );
189- final long currentNodeRemoteShardSize = calculateCurrentNodeRemoteShardSize (node , allocation , true );
190- final long freeSpace = Math .max (totalAddressableSpace - currentNodeRemoteShardSize , 0 );
198+ final long leavingRemoteShardSize = calculateCurrentNodeLeavingRemoteShardSize (node , allocation );
199+ final DiskUsage usage = usages .get (node .nodeId ());
200+ final DiskUsage usageAfterSubtractingLeavingShard = new DiskUsage (
201+ usage .getNodeId (),
202+ usage .getNodeName (),
203+ usage .getPath (),
204+ usage .getTotalBytes (),
205+ Math .min (usage .getFreeBytes () + leavingRemoteShardSize , usage .getTotalBytes ())
206+ );
191207
192- final long freeSpaceHighThreshold = diskThresholdEvaluator .getFreeSpaceHighThreshold (totalAddressableSpace );
208+ final long freeSpaceHighThreshold = diskThresholdEvaluator .getFreeSpaceHighThreshold (usage . getTotalBytes () );
193209
194210 final ByteSizeValue freeSpaceHighThresholdInByteSize = new ByteSizeValue (freeSpaceHighThreshold );
195- final ByteSizeValue freeSpaceInByteSize = new ByteSizeValue (freeSpace );
211+ final ByteSizeValue freeSpaceInByteSize = new ByteSizeValue (usageAfterSubtractingLeavingShard . getFreeBytes () );
196212
197- if (freeSpace < freeSpaceHighThreshold ) {
213+ if (diskThresholdEvaluator . isNodeExceedingHighWatermark ( usageAfterSubtractingLeavingShard ) ) {
198214 logger .warn (
199215 "less than the required {} of free remote addressable space threshold left ({} free) on node [{}], shard cannot remain" ,
200216 freeSpaceHighThresholdInByteSize ,
@@ -220,18 +236,14 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl
220236 );
221237 }
222238
223- private long calculateCurrentNodeRemoteShardSize (RoutingNode node , RoutingAllocation allocation , boolean subtractLeavingShards ) {
224- final List <ShardRouting > remoteShardsOnNode = StreamSupport .stream (node .spliterator (), false )
225- .filter (
226- shard -> shard .primary ()
227- && REMOTE_CAPABLE .equals (getShardPool (shard , allocation ))
228- && (subtractLeavingShards == false || shard .relocating () == false )
229- )
239+ private long calculateCurrentNodeLeavingRemoteShardSize (RoutingNode node , RoutingAllocation allocation ) {
240+ final List <ShardRouting > leavingRemoteShardsOnNode = StreamSupport .stream (node .spliterator (), false )
241+ .filter (shard -> shard .primary () && REMOTE_CAPABLE .equals (getShardPool (shard , allocation )) && (shard .relocating () == true ))
230242 .collect (Collectors .toList ());
231243
232- var remoteShardSize = 0L ;
233- for (ShardRouting shard : remoteShardsOnNode ) {
234- remoteShardSize += DiskThresholdDecider .getExpectedShardSize (
244+ var leavingRemoteShardSize = 0L ;
245+ for (ShardRouting shard : leavingRemoteShardsOnNode ) {
246+ leavingRemoteShardSize += DiskThresholdDecider .getExpectedShardSize (
235247 shard ,
236248 0L ,
237249 allocation .clusterInfo (),
@@ -241,19 +253,10 @@ private long calculateCurrentNodeRemoteShardSize(RoutingNode node, RoutingAlloca
241253 );
242254 }
243255
244- return remoteShardSize ;
245- }
246-
247- private long calculateTotalAddressableSpace (RoutingNode node , RoutingAllocation allocation ) {
248- ClusterInfo clusterInfo = allocation .clusterInfo ();
249- // TODO: Change the default value to 5 instead of 0
250- final double dataToFileCacheSizeRatio = fileCacheSettings .getRemoteDataRatio ();
251- final AggregateFileCacheStats fileCacheStats = clusterInfo .getNodeFileCacheStats ().getOrDefault (node .nodeId (), null );
252- final long nodeCacheSize = fileCacheStats != null ? fileCacheStats .getTotal ().getBytes () : 0 ;
253- return (long ) dataToFileCacheSizeRatio * nodeCacheSize ;
256+ return leavingRemoteShardSize ;
254257 }
255258
256- private Decision earlyTerminate (RoutingNode node , RoutingAllocation allocation ) {
259+ private Decision earlyTerminate (RoutingNode node , RoutingAllocation allocation , final Map < String , DiskUsage > usages ) {
257260 // Always allow allocation if the decider is disabled
258261 if (diskThresholdSettings .isWarmThresholdEnabled () == false ) {
259262 return allocation .decision (Decision .YES , NAME , "the warm disk threshold decider is disabled" );
@@ -285,9 +288,12 @@ private Decision earlyTerminate(RoutingNode node, RoutingAllocation allocation)
285288 return allocation .decision (Decision .YES , NAME , "File Cache Stat is unavailable" );
286289 }
287290
288- double remoteDataRatio = fileCacheSettings .getRemoteDataRatio ();
289- if (remoteDataRatio == 0 ) {
290- return allocation .decision (Decision .YES , NAME , "Remote data ratio is set to 0, no limit on allocation" );
291+ // Fail open if there are no disk usages available
292+ if (usages .isEmpty () || usages .containsKey (node .nodeId ()) == false ) {
293+ if (logger .isTraceEnabled ()) {
294+ logger .trace ("unable to determine disk usages for disk-aware allocation, allowing allocation" );
295+ }
296+ return allocation .decision (Decision .YES , NAME , "disk usages are unavailable" );
291297 }
292298
293299 return null ;
0 commit comments