Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 3 additions & 13 deletions pkg/providers/v1/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -2734,19 +2734,9 @@ func (c *Cloud) updateInstanceSecurityGroupsForLoadBalancer(ctx context.Context,
loadBalancerSecurityGroupID := lbSecurityGroupIDs[0]

// Get the actual list of groups that allow ingress from the load-balancer
actualGroups := make(map[*ec2types.SecurityGroup]bool)
{
describeRequest := &ec2.DescribeSecurityGroupsInput{}
describeRequest.Filters = []ec2types.Filter{
newEc2Filter("ip-permission.group-id", loadBalancerSecurityGroupID),
}
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
if err != nil {
return fmt.Errorf("error querying security groups for ELB: %q", err)
}
for _, sg := range response {
actualGroups[&sg] = c.tagging.hasClusterTag(sg.Tags)
}
actualGroups, _, err := c.buildSecurityGroupRuleReferences(ctx, loadBalancerSecurityGroupID)
if err != nil {
return fmt.Errorf("error building security group rule references: %w", err)
}

// Open the firewall from the load balancer to the instance
Expand Down
165 changes: 153 additions & 12 deletions pkg/providers/v1/aws_loadbalancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"time"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ec2"
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"

elb "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing"
Expand Down Expand Up @@ -1082,24 +1083,24 @@ func (c *Cloud) ensureLoadBalancer(ctx context.Context, namespacedName types.Nam

{
// Sync security groups
expected := sets.New[string](securityGroupIDs...)
actual := stringSetFromList(loadBalancer.SecurityGroups)
expected := sets.New(securityGroupIDs...)
actual := sets.New(loadBalancer.SecurityGroups...)

if !expected.Equal(actual) {
// This call just replaces the security groups, unlike e.g. subnets (!)
request := &elb.ApplySecurityGroupsToLoadBalancerInput{}
request.LoadBalancerName = aws.String(loadBalancerName)
if securityGroupIDs == nil {
request.SecurityGroups = nil
} else {
request.SecurityGroups = securityGroupIDs
}
klog.V(2).Info("Applying updated security groups to load balancer")
_, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, request)
if err != nil {
klog.V(2).Infof("Applying updated security groups to load balancer %q", loadBalancerName)
if _, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, &elb.ApplySecurityGroupsToLoadBalancerInput{
LoadBalancerName: aws.String(loadBalancerName),
SecurityGroups: securityGroupIDs,
}); err != nil {
return nil, fmt.Errorf("error applying AWS loadbalancer security groups: %q", err)
}
dirty = true

// Ensure the replaced security groups are removed from AWS when owned by the controller.
if errs := c.removeOwnedSecurityGroups(ctx, loadBalancerName, actual.UnsortedList()); len(errs) > 0 {
return nil, fmt.Errorf("error removing owned security groups: %v", errs)
}
}
}

Expand Down Expand Up @@ -1704,3 +1705,143 @@ func ValidateHealthCheck(s *elbtypes.HealthCheck) error {

return nil
}

// isOwnedSecurityGroup checks if the security group is owned by the controller
// by checking if the security group has the cluster ownership tag
// (kubernetes.io/cluster/<clusterID>=owned).
//
// Parameters:
// - `ctx`: The context for the operation.
// - `securityGroupID`: The ID of the security group to check.
//
// Returns:
// - `bool`: True if the security group is owned by the controller, false otherwise.
// - `error`: An error if the security group cannot be retrieved, is not found,
// or if multiple security groups are found with the same ID (unexpected).
func (c *Cloud) isOwnedSecurityGroup(ctx context.Context, securityGroupID string) (bool, error) {
groups, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
GroupIds: []string{securityGroupID},
})
if err != nil {
return false, fmt.Errorf("error retrieving security group %q: %w", securityGroupID, err)
}
if len(groups) == 0 {
return false, fmt.Errorf("security group %q not found", securityGroupID)
}
if len(groups) != 1 {
// This should not be possible - ids should be unique
return false, fmt.Errorf("[BUG] multiple security groups(%d) found with same id %q", len(groups), securityGroupID)
}
return c.tagging.hasClusterTagOwned(groups[0].Tags)
}

// buildSecurityGroupRuleReferences finds all security groups that have ingress rules
// referencing the specified security group ID, and categorizes them based on cluster tagging.
// This is used to identify dependencies before removing a security group.
//
// Parameters:
// - ctx: The context for the request.
// - sgID: The ID of the security group to find references for.
//
// Returns:
// - map[*ec2types.SecurityGroup]bool: All security groups with ingress rules referencing sgID, mapped to their cluster tag status (true/false).
// - map[*ec2types.SecurityGroup]IPPermissionSet: Only cluster-tagged security groups mapped to their ingress rules that reference sgID.
// - error: An error if the AWS DescribeSecurityGroups API call fails.
func (c *Cloud) buildSecurityGroupRuleReferences(ctx context.Context, sgID string) (map[*ec2types.SecurityGroup]bool, map[*ec2types.SecurityGroup]IPPermissionSet, error) {
groupsHasTags := make(map[*ec2types.SecurityGroup]bool)
groupsLinkedPermissions := make(map[*ec2types.SecurityGroup]IPPermissionSet)
sgsOut, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
Filters: []ec2types.Filter{
newEc2Filter("ip-permission.group-id", sgID),
},
})
if err != nil {
return groupsHasTags, groupsLinkedPermissions, fmt.Errorf("error querying security groups for ELB: %q", err)
}

for _, sg := range sgsOut {
groupsHasTags[&sg] = c.tagging.hasClusterTag(sg.Tags)

groupsLinkedPermissions[&sg] = NewIPPermissionSet()
for _, rule := range sg.IpPermissions {
if rule.UserIdGroupPairs != nil {
for _, pair := range rule.UserIdGroupPairs {
if pair.GroupId != nil && aws.ToString(pair.GroupId) == sgID {
groupsLinkedPermissions[&sg].Insert(rule)
}
}
}
}

}
return groupsHasTags, groupsLinkedPermissions, nil
}

// removeOwnedSecurityGroups removes the CLB owned/managed security groups from AWS.
// It revokes ingress rules that reference the security groups to be removed,
// then deletes the security groups that are owned by the controller.
// This is used when updating load balancer security groups to clean up orphaned ones.
//
// Parameters:
// - `ctx`: The context for the operation.
// - `loadBalancerName`: The name of the load balancer (used for logging and deletion operations).
// - `securityGroups`: The list of security group IDs to process for removal.
//
// Returns:
// - `[]error`: Collection of all errors encountered during the removal process.
func (c *Cloud) removeOwnedSecurityGroups(ctx context.Context, loadBalancerName string, securityGroups []string) []error {
allErrs := []error{}
sgMap := make(map[string]struct{})

// Validate each security group references building a reading list to be deleted.
for _, sg := range securityGroups {
isOwned, err := c.isOwnedSecurityGroup(ctx, sg)
if err != nil {
allErrs = append(allErrs, fmt.Errorf("unable to validate if security group %q is owned by the controller: %w", sg, err))
continue
}

groupsWithClusterTag, groupsLinkedPermissions, err := c.buildSecurityGroupRuleReferences(ctx, sg)
if err != nil {
allErrs = append(allErrs, fmt.Errorf("error building security group rule references for %q: %w", sg, err))
continue
}

// Revoke ingress rules referencing the security group to be deleted
// from cluster-tagged security groups, when the referenced security
// group has no cluster tag, skip the revoke assuming it is user-managed.
for sgTarget, sgPerms := range groupsLinkedPermissions {
if !groupsWithClusterTag[sgTarget] {
klog.Warningf("security group %q has no cluster tag, skipping remove lifecycle after update", sg)
continue
}

klog.Infof("revoking security group ingress references of %q from %q", sg, aws.ToString(sgTarget.GroupId))
if _, err := c.ec2.RevokeSecurityGroupIngress(ctx, &ec2.RevokeSecurityGroupIngressInput{
GroupId: sgTarget.GroupId,
IpPermissions: sgPerms.List(),
}); err != nil {
allErrs = append(allErrs, fmt.Errorf("error revoking security group ingress rules from %q: %w", aws.ToString(sgTarget.GroupId), err))
continue
}
}

// Skip security group removal when the security group is not owned by the controller.
if !isOwned {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not detect this at the top of the loop and then continue if true so that we avoid the other calculations?

Copy link
Contributor Author

@mtulio mtulio Jul 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

my initial thought was to revoke all rules before validating ownership, but I've since reconsidered. That approach indeed require more calculations/calls.

This brings up a new question: could orphan rules, created by our controller, exist in a cluster's SGs (e.g., node SGs)? If so, users won't be able to delete these SGs without manually removing the orphan rules first.

I believe the answer is yes, and the function buildSecurityGroupRuleReferences returns the SGs with tagging boolean, if we'll introduce an extra check, well make it consistent with the updateInstanceSecurityGroupsForLoadBalancer() logic: skip when rules' SG which is not part of the cluster. If user intentionally, manually, referenced a rule to non-cluster's SG the controller will raise an error with dependency violation. Is that an accurate behavior?

Let me revisit with your first suggestion it in the next scan.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reviewed the code, we can't return early to prevent not removing dependencies, for example in the BYO SG scenario (isOwned==false) we still need to remove references to it's SG in the cluster's SG (node's SG), otherwise we'll leak rules.

To address my comment above, I added a new check in the loop checking if the group is part of the cluster, and revoking rules on on those, following the behavior of the referenced function updateInstanceSecurityGroupsForLoadBalancer().

LMK if that's accurate to you.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotcha, that makes sense to me.

klog.Warningf("security group %q is not owned by the controller, skipping remove lifecycle after update", sg)
continue
}

klog.Infof("making loadbalancer owned security group %q ready for deletion", sg)
sgMap[sg] = struct{}{}
}
if len(sgMap) == 0 {
return allErrs
}

if err := c.deleteSecurityGroupsWithBackoff(ctx, loadBalancerName, sgMap); err != nil {
return append(allErrs, fmt.Errorf("error deleting security groups %v: %v", sgMap, err))
}
klog.Infof("loadbalancer owned security groups deleted!")
return nil
}
Loading