Skip to content

Commit cd72edc

Browse files
authored
Merge pull request #1159 from mtulio/refact-sg-deletion
refact/lb/sg: isolate security group deletion fragments from EnsureLoadBalancerDeleted
2 parents f4871ba + e454240 commit cd72edc

File tree

1 file changed

+141
-97
lines changed

1 file changed

+141
-97
lines changed

pkg/providers/v1/aws.go

Lines changed: 141 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ import (
4444
v1 "k8s.io/api/core/v1"
4545
"k8s.io/apimachinery/pkg/types"
4646
"k8s.io/apimachinery/pkg/util/sets"
47+
"k8s.io/apimachinery/pkg/util/wait"
4748
"k8s.io/client-go/informers"
4849
informercorev1 "k8s.io/client-go/informers/core/v1"
4950
clientset "k8s.io/client-go/kubernetes"
@@ -2837,6 +2838,139 @@ func (c *Cloud) updateInstanceSecurityGroupsForLoadBalancer(ctx context.Context,
28372838
return nil
28382839
}
28392840

2841+
// deleteSecurityGroupsWithBackoff deletes a list of security group IDs with retries and exponential backoff.
2842+
// The function attempts to delete each security group in the provided list, handling potential dependency violations
2843+
// caused by resources still being associated with the security groups (e.g., load balancers in the process of deletion).
2844+
//
2845+
// Parameters:
2846+
// - `ctx`: The context for the operation.
2847+
// - `svcName`: The name of the service associated with the security groups.
2848+
// - `securityGroupIDs`: A map of security group IDs to be deleted.
2849+
//
2850+
// Behavior:
2851+
// - If the list of security group IDs is empty, the function returns immediately.
2852+
// - The function retries deletion for up to 10 minutes, with an initial backoff of 5 seconds that doubles with each retry.
2853+
// - Dependency violations are logged and ignored, allowing retries until the timeout is reached.
2854+
// - If all security groups are successfully deleted, the function exits.
2855+
// - If the timeout is reached and some security groups remain, an error is returned.
2856+
//
2857+
// Returns:
2858+
// - `error`: An error if any security groups could not be deleted within the timeout period.
2859+
func (c *Cloud) deleteSecurityGroupsWithBackoff(ctx context.Context, svcName string, securityGroupIDs map[string]struct{}) error {
2860+
if len(securityGroupIDs) == 0 {
2861+
return nil
2862+
}
2863+
err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
2864+
for securityGroupID := range securityGroupIDs {
2865+
_, err := c.ec2.DeleteSecurityGroup(ctx, &ec2.DeleteSecurityGroupInput{
2866+
GroupId: &securityGroupID,
2867+
})
2868+
if err == nil {
2869+
delete(securityGroupIDs, securityGroupID)
2870+
continue
2871+
}
2872+
ignore := false
2873+
var ae smithy.APIError
2874+
if errors.As(err, &ae) {
2875+
if ae.ErrorCode() == "DependencyViolation" {
2876+
klog.V(2).Infof("Ignoring DependencyViolation while deleting load-balancer security group (%s), assuming because LB is in process of deleting", securityGroupID)
2877+
ignore = true
2878+
}
2879+
}
2880+
if !ignore {
2881+
return true, fmt.Errorf("error while deleting load balancer security group (%s): %q", securityGroupID, err)
2882+
}
2883+
}
2884+
2885+
if len(securityGroupIDs) == 0 {
2886+
klog.V(2).Info("Deleted all security groups for load balancer: ", svcName)
2887+
return true, nil
2888+
}
2889+
2890+
klog.V(2).Infof("Waiting for load-balancer %q to delete so we can delete security groups: %v", svcName, securityGroupIDs)
2891+
return false, nil
2892+
})
2893+
if err != nil {
2894+
ids := []string{}
2895+
for id := range securityGroupIDs {
2896+
ids = append(ids, id)
2897+
}
2898+
return fmt.Errorf("could not delete security groups %v for Load Balancer %q: %w", strings.Join(ids, ","), svcName, err)
2899+
}
2900+
return nil
2901+
}
2902+
2903+
// buildSecurityGroupsToDelete evaluates all deletion criteria and creates a list of valid security group IDs to be deleted.
2904+
// It returns two maps:
2905+
// - `securityGroupIDs`: A map of security group IDs that are eligible for deletion.
2906+
// - `taggedLBSecurityGroups`: A map of security group IDs that are tagged and associated with the load balancer.
2907+
// The function filters security groups based on the following criteria:
2908+
// - Excludes security groups defined in the Cloud Configuration.
2909+
// - Excludes security groups with no cluster tags.
2910+
// - Excludes security groups annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups` or
2911+
// `service.beta.kubernetes.io/aws-load-balancer-extra-security-groups`.
2912+
//
2913+
// Parameters:
2914+
// - `ctx`: The context for the operation.
2915+
// - `service`: The Kubernetes service object.
2916+
// - `lbSecurityGroups`: A list of security group IDs associated with the load balancer.
2917+
// Returns:
2918+
// - `securityGroupIDs`: A map of security group IDs to be deleted.
2919+
// - `taggedLBSecurityGroups`: A map of tagged security group IDs.
2920+
// - `error`: An error if the operation fails.
2921+
func (c *Cloud) buildSecurityGroupsToDelete(ctx context.Context, service *v1.Service, lbSecurityGroups []string) (map[string]struct{}, map[string]struct{}, error) {
2922+
securityGroupIDs := map[string]struct{}{}
2923+
taggedLBSecurityGroups := map[string]struct{}{}
2924+
2925+
describeRequest := &ec2.DescribeSecurityGroupsInput{}
2926+
describeRequest.Filters = []ec2types.Filter{
2927+
newEc2Filter("group-id", lbSecurityGroups...),
2928+
}
2929+
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
2930+
if err != nil {
2931+
return nil, nil, fmt.Errorf("error querying security groups for ELB: %q", err)
2932+
}
2933+
2934+
annotatedSgSet := map[string]bool{}
2935+
annotatedSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerSecurityGroups])
2936+
annotatedExtraSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerExtraSecurityGroups])
2937+
annotatedSgsList = append(annotatedSgsList, annotatedExtraSgsList...)
2938+
2939+
for _, sg := range annotatedSgsList {
2940+
annotatedSgSet[sg] = true
2941+
}
2942+
2943+
for _, sg := range response {
2944+
sgID := aws.ToString(sg.GroupId)
2945+
2946+
if sgID == c.cfg.Global.ElbSecurityGroup {
2947+
//We don't want to delete a security group that was defined in the Cloud Configuration.
2948+
continue
2949+
}
2950+
if sgID == "" {
2951+
klog.Warningf("Ignoring empty security group in %s", service.Name)
2952+
continue
2953+
}
2954+
2955+
if !c.tagging.hasClusterTag(sg.Tags) {
2956+
klog.Warningf("Ignoring security group with no cluster tag in %s", service.Name)
2957+
continue
2958+
} else {
2959+
taggedLBSecurityGroups[sgID] = struct{}{}
2960+
}
2961+
2962+
// This is an extra protection of deletion of non provisioned Security Group which is annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups`.
2963+
if _, ok := annotatedSgSet[sgID]; ok {
2964+
klog.Warningf("Ignoring security group with annotation `service.beta.kubernetes.io/aws-load-balancer-security-groups` or service.beta.kubernetes.io/aws-load-balancer-extra-security-groups in %s", service.Name)
2965+
continue
2966+
}
2967+
2968+
securityGroupIDs[sgID] = struct{}{}
2969+
}
2970+
2971+
return securityGroupIDs, taggedLBSecurityGroups, nil
2972+
}
2973+
28402974
// EnsureLoadBalancerDeleted implements LoadBalancer.EnsureLoadBalancerDeleted.
28412975
func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName string, service *v1.Service) error {
28422976
if isLBExternal(service.Annotations) {
@@ -2908,57 +3042,13 @@ func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName strin
29083042
// if the load balancer security group is being deleted.
29093043
securityGroupIDs := map[string]struct{}{}
29103044
taggedLBSecurityGroups := map[string]struct{}{}
2911-
{
2912-
// Delete the security group(s) for the load balancer
2913-
// Note that this is annoying: the load balancer disappears from the API immediately, but it is still
2914-
// deleting in the background. We get a DependencyViolation until the load balancer has deleted itself
2915-
2916-
var loadBalancerSGs = lb.SecurityGroups
2917-
2918-
describeRequest := &ec2.DescribeSecurityGroupsInput{}
2919-
describeRequest.Filters = []ec2types.Filter{
2920-
newEc2Filter("group-id", loadBalancerSGs...),
2921-
}
2922-
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
2923-
if err != nil {
2924-
return fmt.Errorf("error querying security groups for ELB: %q", err)
2925-
}
2926-
annotatedSgSet := map[string]bool{}
2927-
annotatedSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerSecurityGroups])
2928-
annotatedExtraSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerExtraSecurityGroups])
2929-
annotatedSgsList = append(annotatedSgsList, annotatedExtraSgsList...)
2930-
2931-
for _, sg := range annotatedSgsList {
2932-
annotatedSgSet[sg] = true
2933-
}
2934-
2935-
for _, sg := range response {
2936-
sgID := aws.ToString(sg.GroupId)
29373045

2938-
if sgID == c.cfg.Global.ElbSecurityGroup {
2939-
//We don't want to delete a security group that was defined in the Cloud Configuration.
2940-
continue
2941-
}
2942-
if sgID == "" {
2943-
klog.Warningf("Ignoring empty security group in %s", service.Name)
2944-
continue
2945-
}
2946-
2947-
if !c.tagging.hasClusterTag(sg.Tags) {
2948-
klog.Warningf("Ignoring security group with no cluster tag in %s", service.Name)
2949-
continue
2950-
} else {
2951-
taggedLBSecurityGroups[sgID] = struct{}{}
2952-
}
2953-
2954-
// This is an extra protection of deletion of non provisioned Security Group which is annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups`.
2955-
if _, ok := annotatedSgSet[sgID]; ok {
2956-
klog.Warningf("Ignoring security group with annotation `service.beta.kubernetes.io/aws-load-balancer-security-groups` or service.beta.kubernetes.io/aws-load-balancer-extra-security-groups in %s", service.Name)
2957-
continue
2958-
}
2959-
2960-
securityGroupIDs[sgID] = struct{}{}
2961-
}
3046+
// Delete the security group(s) for the load balancer
3047+
// Note that this is annoying: the load balancer disappears from the API immediately, but it is still
3048+
// deleting in the background. We get a DependencyViolation until the load balancer has deleted itself
3049+
securityGroupIDs, taggedLBSecurityGroups, err = c.buildSecurityGroupsToDelete(ctx, service, lb.SecurityGroups)
3050+
if err != nil {
3051+
return fmt.Errorf("unable to build security groups to delete: %w", err)
29623052
}
29633053

29643054
{
@@ -2993,53 +3083,7 @@ func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName strin
29933083
}
29943084
}
29953085

2996-
{
2997-
2998-
// Loop through and try to delete them
2999-
timeoutAt := time.Now().Add(time.Second * 600)
3000-
for {
3001-
for securityGroupID := range securityGroupIDs {
3002-
request := &ec2.DeleteSecurityGroupInput{}
3003-
request.GroupId = &securityGroupID
3004-
_, err := c.ec2.DeleteSecurityGroup(ctx, request)
3005-
if err == nil {
3006-
delete(securityGroupIDs, securityGroupID)
3007-
} else {
3008-
ignore := false
3009-
var ae smithy.APIError
3010-
if errors.As(err, &ae) {
3011-
if ae.ErrorCode() == "DependencyViolation" {
3012-
klog.V(2).Infof("Ignoring DependencyViolation while deleting load-balancer security group (%s), assuming because LB is in process of deleting", securityGroupID)
3013-
ignore = true
3014-
}
3015-
}
3016-
if !ignore {
3017-
return fmt.Errorf("error while deleting load balancer security group (%s): %q", securityGroupID, err)
3018-
}
3019-
}
3020-
}
3021-
3022-
if len(securityGroupIDs) == 0 {
3023-
klog.V(2).Info("Deleted all security groups for load balancer: ", service.Name)
3024-
break
3025-
}
3026-
3027-
if time.Now().After(timeoutAt) {
3028-
ids := []string{}
3029-
for id := range securityGroupIDs {
3030-
ids = append(ids, id)
3031-
}
3032-
3033-
return fmt.Errorf("timed out deleting ELB: %s. Could not delete security groups %v", service.Name, strings.Join(ids, ","))
3034-
}
3035-
3036-
klog.V(2).Info("Waiting for load-balancer to delete so we can delete security groups: ", service.Name)
3037-
3038-
time.Sleep(10 * time.Second)
3039-
}
3040-
}
3041-
3042-
return nil
3086+
return c.deleteSecurityGroupsWithBackoff(ctx, service.Name, securityGroupIDs)
30433087
}
30443088

30453089
// UpdateLoadBalancer implements LoadBalancer.UpdateLoadBalancer

0 commit comments

Comments
 (0)