Skip to content

Commit 23ba0b3

Browse files
committed
fix: remove CLB security group on update from managed to BYO SG
This change fixes the leak security group of an owned/managed (created by controller) when BYO SG (annotation) is added to the service after it is created (update)
1 parent 12af74b commit 23ba0b3

File tree

3 files changed

+858
-16
lines changed

3 files changed

+858
-16
lines changed

pkg/providers/v1/aws_loadbalancer.go

Lines changed: 183 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"time"
3030

3131
"github.com/aws/aws-sdk-go-v2/aws"
32+
"github.com/aws/aws-sdk-go-v2/service/ec2"
3233
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
3334

3435
elb "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing"
@@ -1082,24 +1083,24 @@ func (c *Cloud) ensureLoadBalancer(ctx context.Context, namespacedName types.Nam
10821083

10831084
{
10841085
// Sync security groups
1085-
expected := sets.New[string](securityGroupIDs...)
1086-
actual := stringSetFromList(loadBalancer.SecurityGroups)
1086+
expected := sets.New(securityGroupIDs...)
1087+
actual := sets.New(loadBalancer.SecurityGroups...)
10871088

10881089
if !expected.Equal(actual) {
10891090
// This call just replaces the security groups, unlike e.g. subnets (!)
1090-
request := &elb.ApplySecurityGroupsToLoadBalancerInput{}
1091-
request.LoadBalancerName = aws.String(loadBalancerName)
1092-
if securityGroupIDs == nil {
1093-
request.SecurityGroups = nil
1094-
} else {
1095-
request.SecurityGroups = securityGroupIDs
1096-
}
1097-
klog.V(2).Info("Applying updated security groups to load balancer")
1098-
_, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, request)
1099-
if err != nil {
1091+
klog.V(2).Infof("Applying updated security groups to load balancer %q", loadBalancerName)
1092+
if _, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, &elb.ApplySecurityGroupsToLoadBalancerInput{
1093+
LoadBalancerName: aws.String(loadBalancerName),
1094+
SecurityGroups: securityGroupIDs,
1095+
}); err != nil {
11001096
return nil, fmt.Errorf("error applying AWS loadbalancer security groups: %q", err)
11011097
}
11021098
dirty = true
1099+
1100+
// Ensure the replaced security groups are removed from AWS when owned by the controller.
1101+
if sgsNotRemoved, errs := c.removeOwnedSecurityGroups(ctx, loadBalancerName, actual.UnsortedList()); len(errs) > 0 {
1102+
return nil, fmt.Errorf("error removing owned security groups %v: %v", sgsNotRemoved, errs)
1103+
}
11031104
}
11041105
}
11051106

@@ -1704,3 +1705,173 @@ func ValidateHealthCheck(s *elbtypes.HealthCheck) error {
17041705

17051706
return nil
17061707
}
1708+
1709+
// isOwnedSecurityGroup checks if the security group is owned by the controller
1710+
// by checking if the security group has the cluster ownership tag
1711+
// (kubernetes.io/cluster/<clusterID>=owned).
1712+
//
1713+
// Parameters:
1714+
// - `ctx`: The context for the operation.
1715+
// - `securityGroupID`: The ID of the security group to check.
1716+
//
1717+
// Returns:
1718+
// - `bool`: True if the security group is owned by the controller, false otherwise.
1719+
// - `error`: An error if the security group cannot be retrieved, is not found,
1720+
// or if multiple security groups are found with the same ID (unexpected).
1721+
//
1722+
// Note: This function calls the AWS DescribeSecurityGroups API to retrieve
1723+
// the security group and check its tags.
1724+
func (c *Cloud) isOwnedSecurityGroup(ctx context.Context, securityGroupID string) (bool, error) {
1725+
groups, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
1726+
GroupIds: []string{securityGroupID},
1727+
})
1728+
if err != nil {
1729+
return false, fmt.Errorf("error retrieving security group %q: %w", securityGroupID, err)
1730+
}
1731+
if len(groups) == 0 {
1732+
return false, fmt.Errorf("security group %q not found", securityGroupID)
1733+
}
1734+
if len(groups) != 1 {
1735+
// This should not be possible - ids should be unique
1736+
return false, fmt.Errorf("multiple security groups found with same id %q", securityGroupID)
1737+
}
1738+
return c.tagging.hasClusterTagOwned(groups[0].Tags), nil
1739+
}
1740+
1741+
// removeOwnedSecurityGroups removes the owned/managed security groups from AWS.
1742+
// It revokes ingress rules that reference the security groups to be removed,
1743+
// then deletes the security groups that are owned by the controller.
1744+
// This is used when updating load balancer security groups to clean up orphaned ones.
1745+
//
1746+
// Parameters:
1747+
// - `ctx`: The context for the operation.
1748+
// - `loadBalancerName`: The name of the load balancer (used for logging and deletion operations).
1749+
// - `securityGroups`: The list of security group IDs to process for removal.
1750+
//
1751+
// Returns:
1752+
// - `[]string`: Security group IDs that could not be removed due to errors.
1753+
// - `[]error`: Collection of all errors encountered during the removal process.
1754+
//
1755+
// Behavior:
1756+
// - For each security group, checks if it is owned by the controller using cluster tags.
1757+
// - Identifies and revokes all ingress rules in other security groups that reference the target SG.
1758+
// - Only deletes security groups that are owned by the controller (have cluster ownership tags).
1759+
// - Non-owned security groups have their references revoked but are not deleted.
1760+
// - Processing continues for all security groups even if some operations fail.
1761+
// - Collects and returns all security groups that could not be fully processed.
1762+
func (c *Cloud) removeOwnedSecurityGroups(ctx context.Context, loadBalancerName string, securityGroups []string) ([]string, []error) {
1763+
groupsNotRemoved := []string{}
1764+
errs := []error{}
1765+
for _, sg := range securityGroups {
1766+
// Ensure security group is owned by the controller
1767+
isOwned, err := c.isOwnedSecurityGroup(ctx, sg)
1768+
if err != nil {
1769+
groupsNotRemoved = append(groupsNotRemoved, sg)
1770+
errs = append(errs, fmt.Errorf("unable to validate if security group %q is owned by the controller: %w", sg, err))
1771+
continue
1772+
}
1773+
1774+
// build a list of security groups referencing the SG to be removed
1775+
_, groupsLinkedPermissions, err := c.buildSecurityGroupRuleReferences(ctx, sg)
1776+
if err != nil {
1777+
groupsNotRemoved = append(groupsNotRemoved, sg)
1778+
errs = append(errs, fmt.Errorf("error building security group rule references for %q: %w", sg, err))
1779+
continue
1780+
}
1781+
1782+
// revoke the rules references to the security group to be removed
1783+
for sgTarget, sgPerms := range groupsLinkedPermissions {
1784+
klog.Infof("revoking security group ingress for %q", aws.ToString(sgTarget.GroupId))
1785+
if _, err := c.ec2.RevokeSecurityGroupIngress(ctx, &ec2.RevokeSecurityGroupIngressInput{
1786+
GroupId: sgTarget.GroupId,
1787+
IpPermissions: sgPerms.List(),
1788+
}); err != nil {
1789+
groupsNotRemoved = append(groupsNotRemoved, sg)
1790+
errs = append(errs, fmt.Errorf("error revoking security group ingress rules from %q: %w", aws.ToString(sgTarget.GroupId), err))
1791+
continue
1792+
}
1793+
}
1794+
1795+
// leave the loop if there are errors while revoking the rules references to the security group to be removed
1796+
if len(errs) > 0 {
1797+
klog.Warningf("one or more errors while revoking security group ingress rules, skipping %q deletion", groupsNotRemoved)
1798+
continue
1799+
}
1800+
1801+
// if the security group is not owned by the controller, skip the SG removal lifecycle
1802+
if !isOwned {
1803+
msg := fmt.Sprintf("security group %q is not owned by the controller, skipping remove lifecycle after update", sg)
1804+
klog.Warningf("%s", msg)
1805+
groupsNotRemoved = append(groupsNotRemoved, sg)
1806+
errs = append(errs, fmt.Errorf("%s", msg))
1807+
continue
1808+
}
1809+
1810+
// delete the owned/managed security group
1811+
klog.Infof("deleting security group %q", sg)
1812+
sgMap := make(map[string]struct{})
1813+
sgMap[sg] = struct{}{}
1814+
if err := c.deleteSecurityGroupsWithBackoff(ctx, loadBalancerName, sgMap); err != nil {
1815+
msg := fmt.Sprintf("error deleting security group %q: %v", sg, err)
1816+
klog.Warningf("%s", msg)
1817+
groupsNotRemoved = append(groupsNotRemoved, sg)
1818+
errs = append(errs, fmt.Errorf("error deleting security group %q: %w", sg, err))
1819+
}
1820+
}
1821+
return groupsNotRemoved, errs
1822+
}
1823+
1824+
// buildSecurityGroupRuleReferences finds all security groups that have ingress rules
1825+
// referencing the specified security group ID, and categorizes them based on cluster tagging.
1826+
// This is used to identify dependencies before removing a security group.
1827+
//
1828+
// Parameters:
1829+
// - ctx: The context for the request.
1830+
// - sgID: The ID of the security group to find references for.
1831+
//
1832+
// Returns:
1833+
// - map[*ec2types.SecurityGroup]bool: All security groups with ingress rules referencing sgID, mapped to their cluster tag status (true/false).
1834+
// - map[*ec2types.SecurityGroup]IPPermissionSet: Only cluster-tagged security groups mapped to their ingress rules that reference sgID.
1835+
// - error: An error if the AWS DescribeSecurityGroups API call fails.
1836+
//
1837+
// Behavior:
1838+
// - Queries AWS for all security groups with ingress rules referencing the target sgID.
1839+
// - For each found security group, records its cluster tag status in the first map.
1840+
// - Only processes ingress rules for cluster-tagged security groups; non-cluster-tagged ones are skipped with a warning.
1841+
// - Collects the specific ingress rules that reference the target security group from cluster-tagged SGs only.
1842+
// - Non-cluster-tagged security groups appear in the first map (with false value) but are excluded from the second map.
1843+
func (c *Cloud) buildSecurityGroupRuleReferences(ctx context.Context, sgID string) (map[*ec2types.SecurityGroup]bool, map[*ec2types.SecurityGroup]IPPermissionSet, error) {
1844+
groupsWithTags := make(map[*ec2types.SecurityGroup]bool)
1845+
groupsLinkedPermissions := make(map[*ec2types.SecurityGroup]IPPermissionSet)
1846+
sgsOut, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
1847+
Filters: []ec2types.Filter{
1848+
newEc2Filter("ip-permission.group-id", sgID),
1849+
},
1850+
})
1851+
if err != nil {
1852+
return groupsWithTags, groupsLinkedPermissions, fmt.Errorf("error querying security groups for ELB: %q", err)
1853+
}
1854+
1855+
// Iterate over the security groups and build the maps.
1856+
for _, sg := range sgsOut {
1857+
// Save cluster tags
1858+
groupsWithTags[&sg] = c.tagging.hasClusterTag(sg.Tags)
1859+
if !c.tagging.hasClusterTag(sg.Tags) {
1860+
klog.Warningf("security group %q is not cluster tagged, skip removing reference rule list to remove %q", aws.ToString(sg.GroupId), sgID)
1861+
continue
1862+
}
1863+
// Save group linked rules
1864+
groupsLinkedPermissions[&sg] = NewIPPermissionSet()
1865+
for _, rule := range sg.IpPermissions {
1866+
if rule.UserIdGroupPairs != nil {
1867+
for _, pair := range rule.UserIdGroupPairs {
1868+
if pair.GroupId != nil && *pair.GroupId == sgID {
1869+
groupsLinkedPermissions[&sg].Insert(rule)
1870+
}
1871+
}
1872+
}
1873+
}
1874+
1875+
}
1876+
return groupsWithTags, groupsLinkedPermissions, nil
1877+
}

0 commit comments

Comments
 (0)