|
22 | 22 | import org.apache.iotdb.ainode.rpc.thrift.TLoadModelReq; |
23 | 23 | import org.apache.iotdb.common.rpc.thrift.FunctionType; |
24 | 24 | import org.apache.iotdb.common.rpc.thrift.Model; |
| 25 | +import org.apache.iotdb.common.rpc.thrift.TAINodeLocation; |
25 | 26 | import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; |
26 | 27 | import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; |
27 | 28 | import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; |
|
119 | 120 | import org.apache.iotdb.confignode.rpc.thrift.TDropTriggerReq; |
120 | 121 | import org.apache.iotdb.confignode.rpc.thrift.TExtendRegionReq; |
121 | 122 | import org.apache.iotdb.confignode.rpc.thrift.TFetchTableResp; |
| 123 | +import org.apache.iotdb.confignode.rpc.thrift.TGetAINodeLocationResp; |
122 | 124 | import org.apache.iotdb.confignode.rpc.thrift.TGetAllPipeInfoResp; |
123 | 125 | import org.apache.iotdb.confignode.rpc.thrift.TGetDatabaseReq; |
124 | | -import org.apache.iotdb.confignode.rpc.thrift.TGetModelInfoReq; |
125 | | -import org.apache.iotdb.confignode.rpc.thrift.TGetModelInfoResp; |
126 | 126 | import org.apache.iotdb.confignode.rpc.thrift.TGetPipePluginTableResp; |
127 | 127 | import org.apache.iotdb.confignode.rpc.thrift.TGetRegionIdReq; |
128 | 128 | import org.apache.iotdb.confignode.rpc.thrift.TGetRegionIdResp; |
@@ -379,37 +379,6 @@ public class ClusterConfigTaskExecutor implements IConfigTaskExecutor { |
379 | 379 | // NOTE: AINode location is now maintained globally inside AINodeClient. |
380 | 380 | // We only resolve via ConfigNode when needed, then publish it back to AINodeClient. |
381 | 381 |
|
382 | | - /** Ask ConfigNode for the latest AINode location (precise by modelId when available). */ |
383 | | - private TEndPoint resolveViaConfigNodeAndPublish(String modelIdOrNull) { |
384 | | - try (ConfigNodeClient cn = |
385 | | - ConfigNodeClientManager.getInstance().borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { |
386 | | - TEndPoint ep = null; |
387 | | - if (modelIdOrNull != null && !modelIdOrNull.isEmpty()) { |
388 | | - final TGetModelInfoResp resp = cn.getModelInfo(new TGetModelInfoReq(modelIdOrNull)); |
389 | | - if (resp != null && resp.isSetAiNodeAddress()) { |
390 | | - ep = resp.getAiNodeAddress(); |
391 | | - } |
392 | | - } else { |
393 | | - final org.apache.iotdb.confignode.rpc.thrift.TGetAINodeLocationResp r = |
394 | | - cn.getAINodeLocation( |
395 | | - new org.apache.iotdb.confignode.rpc.thrift.TGetAINodeLocationReq()); |
396 | | - if (r != null |
397 | | - && r.getStatus() != null |
398 | | - && r.getStatus().getCode() |
399 | | - == org.apache.iotdb.rpc.TSStatusCode.SUCCESS_STATUS.getStatusCode() |
400 | | - && r.isSetAiNodeAddress()) { |
401 | | - ep = r.getAiNodeAddress(); |
402 | | - } |
403 | | - } |
404 | | - if (ep != null) { |
405 | | - AINodeClient.updateGlobalAINodeLocation(ep); |
406 | | - } |
407 | | - return ep; |
408 | | - } catch (Exception e) { |
409 | | - return null; |
410 | | - } |
411 | | - } |
412 | | - |
413 | 382 | private static final Logger LOGGER = LoggerFactory.getLogger(ClusterConfigTaskExecutor.class); |
414 | 383 |
|
415 | 384 | private static final IClientManager<ConfigRegionId, ConfigNodeClient> CONFIG_NODE_CLIENT_MANAGER = |
@@ -3646,38 +3615,106 @@ public SettableFuture<ConfigTaskResult> showAIDevices() { |
3646 | 3615 | public SettableFuture<ConfigTaskResult> loadModel( |
3647 | 3616 | String existingModelId, List<String> deviceIdList) { |
3648 | 3617 | final SettableFuture<ConfigTaskResult> future = SettableFuture.create(); |
3649 | | - // 1) Try direct DataNode → AINode with cached endpoint |
| 3618 | + final long t0 = System.currentTimeMillis(); |
| 3619 | + LOGGER.info("[LoadModel] begin: modelId={}, devices={}", existingModelId, deviceIdList); |
3650 | 3620 | TEndPoint ep = AINodeClient.getCurrentEndpoint(); |
| 3621 | + LOGGER.debug("[LoadModel] currentEndpoint(beforeResolve)={}", ep); |
| 3622 | + |
| 3623 | + if (ep == null) { |
| 3624 | + ep = resolveAINodeEndpointOrNullWithLog("[LoadModel] initial-resolve"); |
| 3625 | + LOGGER.debug("[LoadModel] endpoint(after initial resolve)={}", ep); |
| 3626 | + } |
| 3627 | + |
3651 | 3628 | try (final AINodeClient ai = AINodeClientManager.getInstance().borrowClient(ep)) { |
| 3629 | + LOGGER.info("[LoadModel] borrowClient OK: endpoint={}", ep); |
| 3630 | + |
3652 | 3631 | final TLoadModelReq req = new TLoadModelReq(existingModelId, deviceIdList); |
3653 | 3632 | final TSStatus result = ai.loadModel(req); |
| 3633 | + LOGGER.info( |
| 3634 | + "[LoadModel] RPC done: statusCode={}, message={}", result.getCode(), result.getMessage()); |
| 3635 | + |
3654 | 3636 | if (TSStatusCode.SUCCESS_STATUS.getStatusCode() != result.getCode()) { |
3655 | | - future.setException(new IoTDBException(result)); |
| 3637 | + final IoTDBException ex = new IoTDBException(result); |
| 3638 | + LOGGER.warn("[LoadModel] RPC not success: {}", ex.getMessage()); |
| 3639 | + future.setException(ex); |
3656 | 3640 | } else { |
3657 | 3641 | future.set(new ConfigTaskResult(TSStatusCode.SUCCESS_STATUS)); |
3658 | 3642 | } |
3659 | 3643 | } catch (final Exception first) { |
3660 | | - // 2) Fallback: ask ConfigNode for latest AINode location, update cache and retry once |
3661 | | - final TEndPoint refreshed = resolveViaConfigNodeAndPublish(existingModelId); |
3662 | | - if (refreshed == null || (ep != null && refreshed.equals(ep))) { |
3663 | | - future.setException(first); |
3664 | | - return future; |
3665 | | - } |
3666 | | - try (final AINodeClient ai = AINodeClientManager.getInstance().borrowClient(refreshed)) { |
3667 | | - final TLoadModelReq req = new TLoadModelReq(existingModelId, deviceIdList); |
3668 | | - final TSStatus result = ai.loadModel(req); |
3669 | | - if (TSStatusCode.SUCCESS_STATUS.getStatusCode() != result.getCode()) { |
3670 | | - future.setException(new IoTDBException(result)); |
| 3644 | + final org.apache.iotdb.common.rpc.thrift.TAINodeLocation refreshedLocation; |
| 3645 | + try (final ConfigNodeClient cn = |
| 3646 | + CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { |
| 3647 | + final org.apache.iotdb.confignode.rpc.thrift.TGetAINodeLocationResp r = |
| 3648 | + cn.getAINodeLocation(); |
| 3649 | + final boolean hasLoc = (r != null && r.isSetAiNodeLocation()); |
| 3650 | + |
| 3651 | + if (hasLoc) { |
| 3652 | + refreshedLocation = r.getAiNodeLocation(); |
| 3653 | + debugDumpLocation("[LoadModel] refreshed-location", refreshedLocation); |
| 3654 | + AINodeClient.updateGlobalAINodeLocation(refreshedLocation); |
| 3655 | + |
| 3656 | + final TEndPoint epRefreshed = pickEndpointFrom(refreshedLocation); |
3671 | 3657 | } else { |
3672 | | - future.set(new ConfigTaskResult(TSStatusCode.SUCCESS_STATUS)); |
| 3658 | + future.setException(first); |
| 3659 | + return future; |
3673 | 3660 | } |
3674 | | - } catch (final Exception second) { |
3675 | | - future.setException(second); |
| 3661 | + } catch (Exception e2) { |
| 3662 | + future.setException(first); |
| 3663 | + return future; |
3676 | 3664 | } |
| 3665 | + future.setException(first); |
3677 | 3666 | } |
| 3667 | + |
3678 | 3668 | return future; |
3679 | 3669 | } |
3680 | 3670 |
|
| 3671 | + private TEndPoint resolveAINodeEndpointOrNullWithLog(final String tag) { |
| 3672 | + try (final ConfigNodeClient cn = |
| 3673 | + CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { |
| 3674 | + final TGetAINodeLocationResp resp = cn.getAINodeLocation(); |
| 3675 | + final boolean ok = (resp != null && resp.isSetAiNodeLocation()); |
| 3676 | + if (!ok) { |
| 3677 | + return null; |
| 3678 | + } |
| 3679 | + final TAINodeLocation loc = resp.getAiNodeLocation(); |
| 3680 | + debugDumpLocation(tag + " aiNodeLocation", loc); |
| 3681 | + |
| 3682 | + final TEndPoint picked = pickEndpointFrom(loc); |
| 3683 | + AINodeClient.updateGlobalAINodeLocation(loc); |
| 3684 | + |
| 3685 | + return picked; |
| 3686 | + } catch (Exception e) { |
| 3687 | + return null; |
| 3688 | + } |
| 3689 | + } |
| 3690 | + |
| 3691 | + private static TEndPoint pickEndpointFrom(final TAINodeLocation loc) { |
| 3692 | + if (loc == null) return null; |
| 3693 | + try { |
| 3694 | + if (loc.isSetInternalEndPoint() && loc.getInternalEndPoint() != null) { |
| 3695 | + return loc.getInternalEndPoint(); |
| 3696 | + } |
| 3697 | + } catch (Throwable ignore) { |
| 3698 | + } |
| 3699 | + return null; |
| 3700 | + } |
| 3701 | + |
| 3702 | + private static void debugDumpLocation(final String tag, final TAINodeLocation loc) { |
| 3703 | + if (loc == null) { |
| 3704 | + LOGGER.debug("{}: location=null", tag); |
| 3705 | + return; |
| 3706 | + } |
| 3707 | + StringBuilder sb = new StringBuilder(128); |
| 3708 | + sb.append(tag).append(": "); |
| 3709 | + try { |
| 3710 | + sb.append("internal=") |
| 3711 | + .append(loc.isSetInternalEndPoint() ? loc.getInternalEndPoint() : "null") |
| 3712 | + .append("; "); |
| 3713 | + } catch (Throwable ignore) { |
| 3714 | + } |
| 3715 | + LOGGER.debug(sb.toString()); |
| 3716 | + } |
| 3717 | + |
3681 | 3718 | @Override |
3682 | 3719 | public SettableFuture<ConfigTaskResult> unloadModel( |
3683 | 3720 | String existingModelId, List<String> deviceIdList) { |
|
0 commit comments