From 2b2c54236edd57ab2f62b6df85753c0a1cc17de5 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Sun, 31 Mar 2019 21:20:13 -0700 Subject: [PATCH 1/2] pkg/destroy/aws: Catch NatGatewayNotFound from DeleteNatGateway Like we do for most of our other resource types, make deletion idempotent by treating "that's (already) gone" as a successful deletion. --- pkg/destroy/aws/aws.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/destroy/aws/aws.go b/pkg/destroy/aws/aws.go index c3f425bff2f..fc3557874c2 100644 --- a/pkg/destroy/aws/aws.go +++ b/pkg/destroy/aws/aws.go @@ -687,6 +687,9 @@ func deleteEC2NATGateway(client *ec2.EC2, id string, logger logrus.FieldLogger) NatGatewayId: aws.String(id), }) if err != nil { + if err.(awserr.Error).Code() == "NatGatewayNotFound" { + return nil + } return err } From ca0ee608f6d204ecff13c2d10d9ee2a210c2d3f2 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Sun, 31 Mar 2019 21:44:06 -0700 Subject: [PATCH 2/2] pkg/destroy/aws: Destroy NAT gateways by VPC too Sometimes these slip through without getting tagged. 
For example: $ aws ec2 describe-nat-gateways --filter Name=vpc-id,Values=vpc-030a62e79bc5bc0d3 --output json | jq '.NatGateways[] | {NatGatewayId, CreateTime, NumTags: (.Tags | length)}' { "NatGatewayId": "nat-0f85c55eae154e749", "CreateTime": "2019-03-28T15:10:30.000Z", "NumTags": 3 } { "NatGatewayId": "nat-0aba1c63954afbe32", "CreateTime": "2019-03-28T15:10:31.000Z", "NumTags": 3 } { "NatGatewayId": "nat-0b87f04521788a765", "CreateTime": "2019-03-28T15:12:06.000Z", "NumTags": 0 } { "NatGatewayId": "nat-0f26b4e1ba6ef97a5", "CreateTime": "2019-03-28T15:11:44.000Z", "NumTags": 0 } { "NatGatewayId": "nat-0f49c9061debf9777", "CreateTime": "2019-03-28T15:10:30.000Z", "NumTags": 3 } { "NatGatewayId": "nat-004b67f754f85ea4a", "CreateTime": "2019-03-28T15:11:43.000Z", "NumTags": 0 } In this case, the issue seems to have been the CI cluster evicting the setup container in the middle of its Terraform execution [1]: 2019/03/28 15:07:33 Running pod e2e-aws 2019/03/28 15:12:54 error: unable to signal to artifacts container to terminate in pod e2e-aws, triggering deletion: could not run remote command: unable to upgrade connection: container not found ("artifacts") 2019/03/28 15:12:54 error: unable to retrieve artifacts from pod e2e-aws: could not read gzipped artifacts: unable to upgrade connection: container not found ("artifacts") 2019/03/28 15:12:54 error: unable to signal to artifacts container to terminate in pod e2e-aws, triggering deletion: could not run remote command: pods "e2e-aws" is forbidden: pods "e2e-aws" not found 2019/03/28 15:12:54 error: unable to retrieve artifacts from pod e2e-aws: could not read gzipped artifacts: pods "e2e-aws" is forbidden: pods "e2e-aws" not found 2019/03/28 15:12:55 error: unable to signal to artifacts container to terminate in pod e2e-aws, triggering deletion: could not run remote command: pods "e2e-aws" is forbidden: pods "e2e-aws" not found 2019/03/28 15:12:55 error: unable to retrieve artifacts from pod e2e-aws: could not read 
gzipped artifacts: pods "e2e-aws" is forbidden: pods "e2e-aws" not found
  2019/03/28 15:13:00 Ran for 6m10s
  error: could not run steps: template pod "e2e-aws" failed: pod e2e-aws was already deleted

But deletion should be robust about that sort of thing.

[1]: https://storage.googleapis.com/origin-ci-test/logs/release-openshift-origin-installer-e2e-aws-4.0/6259/build-log.txt
---
 pkg/destroy/aws/aws.go | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/pkg/destroy/aws/aws.go b/pkg/destroy/aws/aws.go
index fc3557874c2..1aca4fce33e 100644
--- a/pkg/destroy/aws/aws.go
+++ b/pkg/destroy/aws/aws.go
@@ -697,6 +697,44 @@ func deleteEC2NATGateway(client *ec2.EC2, id string, logger logrus.FieldLogger)
 	return nil
 }
 
+// deleteEC2NATGatewaysByVPC deletes every NAT gateway that DescribeNatGateways
+// reports for the given VPC ID; the lookup filters on vpc-id only, so it does
+// not depend on the gateways being tagged.  It keeps going after a failed
+// deletion: earlier failures are logged at debug level and the most recent one
+// is returned.  If every deletion succeeds, the result of paging through
+// DescribeNatGateways is returned.
+func deleteEC2NATGatewaysByVPC(client *ec2.EC2, vpc string, logger logrus.FieldLogger) error {
+	var lastError error
+	err := client.DescribeNatGatewaysPages(
+		&ec2.DescribeNatGatewaysInput{
+			Filter: []*ec2.Filter{
+				{
+					Name:   aws.String("vpc-id"),
+					Values: []*string{&vpc},
+				},
+			},
+		},
+		func(results *ec2.DescribeNatGatewaysOutput, lastPage bool) bool {
+			for _, gateway := range results.NatGateways {
+				err := deleteEC2NATGateway(client, *gateway.NatGatewayId, logger.WithField("NAT gateway", *gateway.NatGatewayId))
+				if err != nil {
+					if lastError != nil {
+						logger.Debug(lastError) // report the failure we are about to overwrite
+					}
+					lastError = errors.Wrapf(err, "deleting EC2 NAT gateway %s", *gateway.NatGatewayId)
+				}
+			}
+
+			return !lastPage
+		},
+	)
+
+	if lastError != nil {
+		return lastError
+	}
+	return err
+}
+
 func deleteEC2RouteTable(client *ec2.EC2, id string, logger logrus.FieldLogger) error {
 	response, err := client.DescribeRouteTables(&ec2.DescribeRouteTablesInput{
 		RouteTableIds: []*string{aws.String(id)},
@@ -939,6 +977,7 @@ func deleteEC2VPC(ec2Client *ec2.EC2, elbClient *elb.ELB, elbv2Client *elbv2.ELB
 	}
 
 	for _, helper := range [](func(client *ec2.EC2, vpc string, logger logrus.FieldLogger) error){
+		deleteEC2NATGatewaysByVPC, // not always tagged
deleteEC2NetworkInterfaceByVPC, // not always tagged deleteEC2RouteTablesByVPC, // not always tagged deleteEC2VPCEndpointsByVPC, // not taggable