Skip to content

Commit 1907542

Browse files
committed
fix/byosg: remove managed SG on BYOSG scenario for CLB
1 parent 501741e commit 1907542

File tree

3 files changed

+209
-25
lines changed

3 files changed

+209
-25
lines changed

pkg/providers/v1/aws.go

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2734,19 +2734,9 @@ func (c *Cloud) updateInstanceSecurityGroupsForLoadBalancer(ctx context.Context,
27342734
loadBalancerSecurityGroupID := lbSecurityGroupIDs[0]
27352735

27362736
// Get the actual list of groups that allow ingress from the load-balancer
2737-
actualGroups := make(map[*ec2types.SecurityGroup]bool)
2738-
{
2739-
describeRequest := &ec2.DescribeSecurityGroupsInput{}
2740-
describeRequest.Filters = []ec2types.Filter{
2741-
newEc2Filter("ip-permission.group-id", loadBalancerSecurityGroupID),
2742-
}
2743-
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
2744-
if err != nil {
2745-
return fmt.Errorf("error querying security groups for ELB: %q", err)
2746-
}
2747-
for _, sg := range response {
2748-
actualGroups[&sg] = c.tagging.hasClusterTag(sg.Tags)
2749-
}
2737+
actualGroups, _, err := c.buildSecurityGroupRuleReferences(ctx, loadBalancerSecurityGroupID)
2738+
if err != nil {
2739+
return fmt.Errorf("error building security group rule references: %w", err)
27502740
}
27512741

27522742
// Open the firewall from the load balancer to the instance

pkg/providers/v1/aws_loadbalancer.go

Lines changed: 181 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"time"
3030

3131
"github.com/aws/aws-sdk-go-v2/aws"
32+
"github.com/aws/aws-sdk-go-v2/service/ec2"
3233
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
3334

3435
elb "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing"
@@ -1082,24 +1083,24 @@ func (c *Cloud) ensureLoadBalancer(ctx context.Context, namespacedName types.Nam
10821083

10831084
{
10841085
// Sync security groups
1085-
expected := sets.New[string](securityGroupIDs...)
1086-
actual := stringSetFromList(loadBalancer.SecurityGroups)
1086+
expected := sets.New(securityGroupIDs...)
1087+
actual := sets.New(loadBalancer.SecurityGroups...)
10871088

10881089
if !expected.Equal(actual) {
10891090
// This call just replaces the security groups, unlike e.g. subnets (!)
1090-
request := &elb.ApplySecurityGroupsToLoadBalancerInput{}
1091-
request.LoadBalancerName = aws.String(loadBalancerName)
1092-
if securityGroupIDs == nil {
1093-
request.SecurityGroups = nil
1094-
} else {
1095-
request.SecurityGroups = securityGroupIDs
1096-
}
1097-
klog.V(2).Info("Applying updated security groups to load balancer")
1098-
_, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, request)
1099-
if err != nil {
1091+
klog.V(2).Infof("Applying updated security groups to load balancer %q", loadBalancerName)
1092+
if _, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, &elb.ApplySecurityGroupsToLoadBalancerInput{
1093+
LoadBalancerName: aws.String(loadBalancerName),
1094+
SecurityGroups: securityGroupIDs,
1095+
}); err != nil {
11001096
return nil, fmt.Errorf("error applying AWS loadbalancer security groups: %q", err)
11011097
}
11021098
dirty = true
1099+
1100+
// Ensure the replaced security groups are removed from AWS when owned by the controller.
1101+
if sgsNotRemoved, errs := c.removeOwnedSecurityGroups(ctx, loadBalancerName, actual.UnsortedList()); len(errs) > 0 {
1102+
return nil, fmt.Errorf("error removing owned security groups %v: %v", sgsNotRemoved, errs)
1103+
}
11031104
}
11041105
}
11051106

@@ -1704,3 +1705,171 @@ func ValidateHealthCheck(s *elbtypes.HealthCheck) error {
17041705

17051706
return nil
17061707
}
1708+
1709+
// isOwnedSecurityGroup checks if the security group is owned by the controller
1710+
// by checking if the security group has the cluster ownership tag
1711+
// (kubernetes.io/cluster/<clusterID>=owned).
1712+
//
1713+
// Parameters:
1714+
// - `ctx`: The context for the operation.
1715+
// - `securityGroupID`: The ID of the security group to check.
1716+
//
1717+
// Returns:
1718+
// - `bool`: True if the security group is owned by the controller, false otherwise.
1719+
// - `error`: An error if the security group cannot be retrieved, is not found,
1720+
// or if multiple security groups are found with the same ID (unexpected).
1721+
//
1722+
// Note: This function calls the AWS DescribeSecurityGroups API to retrieve
1723+
// the security group and check its tags.
1724+
func (c *Cloud) isOwnedSecurityGroup(ctx context.Context, securityGroupID string) (bool, error) {
1725+
groups, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
1726+
GroupIds: []string{securityGroupID},
1727+
})
1728+
if err != nil {
1729+
return false, fmt.Errorf("error retrieving security group %q: %w", securityGroupID, err)
1730+
}
1731+
if len(groups) == 0 {
1732+
return false, fmt.Errorf("security group %q not found", securityGroupID)
1733+
}
1734+
if len(groups) != 1 {
1735+
// This should not be possible - ids should be unique
1736+
return false, fmt.Errorf("multiple security groups found with same id %q", securityGroupID)
1737+
}
1738+
return c.tagging.hasClusterTagOwned(groups[0].Tags), nil
1739+
}
1740+
1741+
// buildSecurityGroupRuleReferences finds all security groups that have ingress rules
1742+
// referencing the specified security group ID, and categorizes them based on cluster tagging.
1743+
// This is used to identify dependencies before removing a security group.
1744+
//
1745+
// Parameters:
1746+
// - ctx: The context for the request.
1747+
// - sgID: The ID of the security group to find references for.
1748+
//
1749+
// Returns:
1750+
// - map[*ec2types.SecurityGroup]bool: All security groups with ingress rules referencing sgID, mapped to their cluster tag status (true/false).
1751+
// - map[*ec2types.SecurityGroup]IPPermissionSet: Only cluster-tagged security groups mapped to their ingress rules that reference sgID.
1752+
// - error: An error if the AWS DescribeSecurityGroups API call fails.
1753+
//
1754+
// Behavior:
1755+
// - Queries AWS for all security groups with ingress rules referencing the target sgID.
1756+
// - For each found security group, records its cluster tag status in the first map.
1757+
// - Only processes ingress rules for cluster-tagged security groups; non-cluster-tagged ones are skipped with a warning.
1758+
// - Collects the specific ingress rules that reference the target security group from cluster-tagged SGs only.
1759+
// - Non-cluster-tagged security groups appear in the first map (with false value) but are excluded from the second map.
1760+
func (c *Cloud) buildSecurityGroupRuleReferences(ctx context.Context, sgID string) (map[*ec2types.SecurityGroup]bool, map[*ec2types.SecurityGroup]IPPermissionSet, error) {
1761+
groupsHasTags := make(map[*ec2types.SecurityGroup]bool)
1762+
groupsLinkedPermissions := make(map[*ec2types.SecurityGroup]IPPermissionSet)
1763+
sgsOut, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
1764+
Filters: []ec2types.Filter{
1765+
newEc2Filter("ip-permission.group-id", sgID),
1766+
},
1767+
})
1768+
if err != nil {
1769+
return groupsHasTags, groupsLinkedPermissions, fmt.Errorf("error querying security groups for ELB: %q", err)
1770+
}
1771+
1772+
// Iterate over the security groups and build the maps.
1773+
for _, sg := range sgsOut {
1774+
// Save cluster tags
1775+
groupsHasTags[&sg] = c.tagging.hasClusterTag(sg.Tags)
1776+
1777+
// Save group linked rules
1778+
groupsLinkedPermissions[&sg] = NewIPPermissionSet()
1779+
for _, rule := range sg.IpPermissions {
1780+
if rule.UserIdGroupPairs != nil {
1781+
for _, pair := range rule.UserIdGroupPairs {
1782+
if pair.GroupId != nil && *pair.GroupId == sgID {
1783+
groupsLinkedPermissions[&sg].Insert(rule)
1784+
}
1785+
}
1786+
}
1787+
}
1788+
1789+
}
1790+
return groupsHasTags, groupsLinkedPermissions, nil
1791+
}
1792+
1793+
// removeOwnedSecurityGroups removes the owned/managed security groups from AWS.
1794+
// It revokes ingress rules that reference the security groups to be removed,
1795+
// then deletes the security groups that are owned by the controller.
1796+
// This is used when updating load balancer security groups to clean up orphaned ones.
1797+
//
1798+
// Parameters:
1799+
// - `ctx`: The context for the operation.
1800+
// - `loadBalancerName`: The name of the load balancer (used for logging and deletion operations).
1801+
// - `securityGroups`: The list of security group IDs to process for removal.
1802+
//
1803+
// Returns:
1804+
// - `[]string`: Security group IDs that could not be removed due to errors.
1805+
// - `[]error`: Collection of all errors encountered during the removal process.
1806+
//
1807+
// Behavior:
1808+
// - For each security group, checks if it is owned by the controller using cluster tags.
1809+
// - Identifies and revokes all ingress rules in other security groups that reference the target SG.
1810+
// - Only deletes security groups that are owned by the controller (have cluster ownership tags).
1811+
// - Non-owned security groups have their references revoked but are not deleted.
1812+
// - Processing continues for all security groups even if some operations fail.
1813+
// - Collects and returns all security groups that could not be fully processed.
1814+
func (c *Cloud) removeOwnedSecurityGroups(ctx context.Context, loadBalancerName string, securityGroups []string) ([]string, []error) {
1815+
groupsNotRemoved := []string{}
1816+
errs := []error{}
1817+
for _, sg := range securityGroups {
1818+
// Ensure security group is owned by the controller
1819+
isOwned, err := c.isOwnedSecurityGroup(ctx, sg)
1820+
if err != nil {
1821+
groupsNotRemoved = append(groupsNotRemoved, sg)
1822+
errs = append(errs, fmt.Errorf("unable to validate if security group %q is owned by the controller: %w", sg, err))
1823+
continue
1824+
}
1825+
1826+
// build a list of security groups referencing the SG to be removed
1827+
_, groupsLinkedPermissions, err := c.buildSecurityGroupRuleReferences(ctx, sg)
1828+
if err != nil {
1829+
groupsNotRemoved = append(groupsNotRemoved, sg)
1830+
errs = append(errs, fmt.Errorf("error building security group rule references for %q: %w", sg, err))
1831+
continue
1832+
}
1833+
1834+
// revoke the rules references to the security group to be removed
1835+
for sgTarget, sgPerms := range groupsLinkedPermissions {
1836+
klog.Infof("revoking security group ingress for %q", aws.ToString(sgTarget.GroupId))
1837+
if _, err := c.ec2.RevokeSecurityGroupIngress(ctx, &ec2.RevokeSecurityGroupIngressInput{
1838+
GroupId: sgTarget.GroupId,
1839+
IpPermissions: sgPerms.List(),
1840+
}); err != nil {
1841+
groupsNotRemoved = append(groupsNotRemoved, sg)
1842+
errs = append(errs, fmt.Errorf("error revoking security group ingress rules from %q: %w", aws.ToString(sgTarget.GroupId), err))
1843+
continue
1844+
}
1845+
}
1846+
1847+
// leave the loop if there are errors while revoking the rules references to the security group to be removed
1848+
if len(errs) > 0 {
1849+
klog.Warningf("one or more errors while revoking security group ingress rules, skipping %q deletion", groupsNotRemoved)
1850+
continue
1851+
}
1852+
1853+
// if the security group is not owned by the controller, skip the SG removal lifecycle
1854+
if !isOwned {
1855+
msg := fmt.Sprintf("security group %q is not owned by the controller, skipping remove lifecycle after update", sg)
1856+
klog.Warningf("%s", msg)
1857+
groupsNotRemoved = append(groupsNotRemoved, sg)
1858+
errs = append(errs, fmt.Errorf("%s", msg))
1859+
continue
1860+
}
1861+
1862+
// delete the owned/managed security group
1863+
klog.Infof("deleting security group %q", sg)
1864+
sgMap := make(map[string]struct{})
1865+
sgMap[sg] = struct{}{}
1866+
if err := c.deleteSecurityGroupsWithBackoff(ctx, loadBalancerName, sgMap); err != nil {
1867+
msg := fmt.Sprintf("error deleting security group %q: %v", sg, err)
1868+
klog.Warningf("%s", msg)
1869+
groupsNotRemoved = append(groupsNotRemoved, sg)
1870+
errs = append(errs, fmt.Errorf("error deleting security group %q: %w", sg, err))
1871+
}
1872+
klog.Infof("security group %q deleted", sg)
1873+
}
1874+
return groupsNotRemoved, errs
1875+
}

pkg/providers/v1/tags.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,31 @@ func (t *awsTagging) hasClusterTag(tags []ec2types.Tag) bool {
154154
return false
155155
}
156156

157+
// hasClusterTagOwned checks if the resource has the cluster tag with the value "owned"
158+
// This is used to determine if the resource is owned by the controller.
159+
// It checks both legacy and new tags.
160+
func (t *awsTagging) hasClusterTagOwned(tags []ec2types.Tag) bool {
161+
if len(t.ClusterID) == 0 {
162+
return false
163+
}
164+
clusterTagKey := t.clusterTagKey()
165+
for _, tag := range tags {
166+
tagKey := aws.ToString(tag.Key)
167+
tagValue := aws.ToString(tag.Value)
168+
169+
// For legacy tags, the cluster ID is the value, not "owned"
170+
if (tagKey == TagNameKubernetesClusterLegacy) && (tagValue == t.ClusterID) {
171+
return true
172+
}
173+
174+
// For new tags, check if it's the cluster tag with "owned" value
175+
if tagKey == clusterTagKey && tagValue == string(ResourceLifecycleOwned) {
176+
return true
177+
}
178+
}
179+
return false
180+
}
181+
157182
func (t *awsTagging) hasNoClusterPrefixTag(tags []ec2types.Tag) bool {
158183
for _, tag := range tags {
159184
if strings.HasPrefix(aws.ToString(tag.Key), TagNameKubernetesClusterPrefix) {

0 commit comments

Comments
 (0)