/
containerCleaner.ts
427 lines (393 loc) · 15.1 KB
/
containerCleaner.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
// This code is very aggressive about running requests in parallel and does not use
// a task queue, because the quota limits for GCR.io are absurdly high. At the time
// of writing, we can make 50K requests per 10m.
// https://cloud.google.com/container-registry/quotas
import * as clc from "cli-color";
import { FirebaseError } from "../../error";
import { previews } from "../../previews";
import { artifactRegistryDomain, containerRegistryDomain } from "../../api";
import { logger } from "../../logger";
import * as artifactregistry from "../../gcp/artifactregistry";
import * as backend from "./backend";
import * as docker from "../../gcp/docker";
import * as utils from "../../utils";
import * as poller from "../../operation-poller";
/**
 * Runs a promise-returning function with a per-attempt timeout, retrying on
 * failure with exponential backoff.
 *
 * @param func the operation to attempt; invoked once per attempt.
 * @return the first successful result.
 * @throws FirebaseError after MAX_RETRIES consecutive failures, wrapping the last error.
 */
async function retry<Return>(func: () => Promise<Return>): Promise<Return> {
  const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));
  const MAX_RETRIES = 3;
  const INITIAL_BACKOFF = 100;
  const TIMEOUT_MS = 10_000;
  let retry = 0;
  // eslint-disable-next-line no-constant-condition
  while (true) {
    let timeoutId: ReturnType<typeof setTimeout> | undefined;
    try {
      const timeout = new Promise<Return>((resolve, reject) => {
        timeoutId = setTimeout(() => reject(new Error("Timeout")), TIMEOUT_MS);
      });
      return await Promise.race([func(), timeout]);
    } catch (error) {
      logger.debug("Failed docker command with error ", error);
      retry += 1;
      if (retry >= MAX_RETRIES) {
        throw new FirebaseError("Failed to clean up artifacts", { original: error });
      }
      // Exponential backoff: INITIAL_BACKOFF, 2 * INITIAL_BACKOFF, ...
      // (The previous Math.pow(INITIAL_BACKOFF, retry - 1) made the first
      // sleep 100^0 = 1ms, i.e. effectively no backoff at all.)
      await sleep(INITIAL_BACKOFF * Math.pow(2, retry - 1));
    } finally {
      // Always cancel the pending timeout so it doesn't keep the event loop
      // alive (or fire) for up to TIMEOUT_MS after the race has settled.
      clearTimeout(timeoutId);
    }
  }
}
/**
 * Cleans up container images assembled by Cloud Build when deploying functions.
 *
 * Images may live in Container Registry or Artifact Registry (which one is a
 * server-side experiment), so both are cleaned. Failures are collected and
 * reported once as a warning containing console links where the images can be
 * deleted manually.
 *
 * @param haveFunctions functions that still exist; only their build images are cleaned.
 * @param deletedFunctions functions that were deleted; their build images and build caches are cleaned.
 * @param cleaners optional cleaner overrides, for injection in tests.
 */
export async function cleanupBuildImages(
  haveFunctions: backend.TargetIds[],
  deletedFunctions: backend.TargetIds[],
  cleaners: { gcr?: ContainerRegistryCleaner; ar?: ArtifactRegistryCleaner } = {}
): Promise<void> {
  utils.logBullet(clc.bold.cyan("functions: ") + "cleaning up build files...");
  const failedDomains: Set<string> = new Set();
  const cleanup: Array<Promise<void>> = [];
  const arCleaner = cleaners.ar || new ArtifactRegistryCleaner();
  // Whether the container was stored in GCR or AR is up to a server-side experiment;
  // clean up both, just in case.
  // TODO: remove GCR path once the experiment is rollback-safe.
  cleanup.push(
    ...haveFunctions.map(async (func) => {
      try {
        await arCleaner.cleanupFunction(func);
      } catch (err) {
        const path = `${func.project}/${func.region}/gcf-artifacts`;
        failedDomains.add(`https://console.cloud.google.com/artifacts/docker/${path}`);
      }
    })
  );
  cleanup.push(
    ...deletedFunctions.map(async (func) => {
      try {
        // Deleted functions also lose their build cache package.
        await Promise.all([arCleaner.cleanupFunction(func), arCleaner.cleanupFunctionCache(func)]);
      } catch (err) {
        const path = `${func.project}/${func.region}/gcf-artifacts`;
        failedDomains.add(`https://console.cloud.google.com/artifacts/docker/${path}`);
      }
    })
  );
  const gcrCleaner = cleaners.gcr || new ContainerRegistryCleaner();
  cleanup.push(
    ...[...haveFunctions, ...deletedFunctions].map(async (func) => {
      try {
        await gcrCleaner.cleanupFunction(func);
      } catch (err) {
        const path = `${func.project}/${docker.GCR_SUBDOMAIN_MAPPING[func.region]}/gcf`;
        failedDomains.add(`https://console.cloud.google.com/gcr/images/${path}`);
      }
    })
  );
  await Promise.all(cleanup);
  if (failedDomains.size) {
    let message =
      "Unhandled error cleaning up build images. This could result in a small monthly bill if not corrected. ";
    message +=
      "You can attempt to delete these images by redeploying or you can delete them manually at";
    if (failedDomains.size === 1) {
      message += " " + failedDomains.values().next().value;
    } else {
      message += [...failedDomains].map((domain) => "\n\t" + domain).join("");
    }
    utils.logLabeledWarning("functions", message);
  }
}
// TODO: AR has a very simple API but is a Google API and thus probably has much lower quotas
// than the raw Docker API. If there are reports of any quota issues we may have to run these
// requests through a ThrottlerQueue.
export class ArtifactRegistryCleaner {
  /** Returns the AR package resource name that holds a function's container images. */
  static packagePath(func: backend.TargetIds): string {
    // GCFv1 names can include upper-case letters, but docker images cannot.
    // To fix this, the artifact registry path for these images uses a custom encoding scheme:
    // * Underscores are doubled
    // * Dashes are doubled
    // * A leading capital letter is replaced with <lower><dash><lower>
    // * Other capital letters are replaced with <underscore><lower>
    const encodedId = func.id
      .replace(/_/g, "__")
      .replace(/-/g, "--")
      .replace(/^[A-Z]/, (first) => `${first.toLowerCase()}-${first.toLowerCase()}`)
      .replace(/[A-Z]/g, (upper) => `_${upper.toLowerCase()}`);
    return `projects/${func.project}/locations/${func.region}/repositories/gcf-artifacts/packages/${encodedId}`;
  }
  static POLLER_OPTIONS = {
    apiOrigin: artifactRegistryDomain,
    apiVersion: artifactregistry.API_VERSION,
    masterTimeout: 5 * 60 * 1_000,
  };
  // GCFv1 for AR has the following directory structure
  // Hostname: <region>-docker.pkg.dev
  // Directory structure:
  // gcf-artifacts/
  //     +- <function ID>
  //     +- <function ID>/cache
  // We leave the cache directory alone because it only costs
  // a few MB and improves performance. We only delete the cache if
  // the function was deleted in its entirety.
  /**
   * Deletes the function's image package and polls the returned long-running
   * operation to completion. A missing package (404) is treated as success.
   */
  async cleanupFunction(func: backend.TargetIds): Promise<void> {
    let op: artifactregistry.Operation;
    try {
      op = await artifactregistry.deletePackage(ArtifactRegistryCleaner.packagePath(func));
    } catch (err) {
      // The client was not enrolled in the AR experiment and the package
      // was missing
      if (err.status === 404) {
        return;
      }
      throw err;
    }
    if (op.done) {
      return;
    }
    await poller.pollOperation<void>({
      ...ArtifactRegistryCleaner.POLLER_OPTIONS,
      pollerName: `cleanup-${func.region}-${func.id}`,
      operationResourceName: op.name,
    });
  }
  /**
   * Deletes the function's build cache package. Only invoked when the
   * function itself has been deleted. A missing package (404) is treated as
   * success — mirroring cleanupFunction — so that a function which never had
   * a cache package is not reported as a cleanup failure.
   */
  async cleanupFunctionCache(func: backend.TargetIds): Promise<void> {
    // GCF uses "<id>/cache" as their package name, but AR percent-encodes this to
    // avoid parsing issues with OP.
    let op: artifactregistry.Operation;
    try {
      op = await artifactregistry.deletePackage(
        `${ArtifactRegistryCleaner.packagePath(func)}%2Fcache`
      );
    } catch (err) {
      if (err.status === 404) {
        return;
      }
      throw err;
    }
    if (op.done) {
      return;
    }
    await poller.pollOperation<void>({
      ...ArtifactRegistryCleaner.POLLER_OPTIONS,
      pollerName: `cleanup-cache-${func.region}-${func.id}`,
      operationResourceName: op.name,
    });
  }
}
// Temporary class to turn off AR cleaning if AR isn't enabled yet
// Temporary class to turn off AR cleaning if AR isn't enabled yet
export class NoopArtifactRegistryCleaner extends ArtifactRegistryCleaner {
  // Intentionally does nothing; resolves immediately.
  async cleanupFunction(): Promise<void> {
    // no-op
  }
  // Intentionally does nothing; resolves immediately.
  async cleanupFunctionCache(): Promise<void> {
    // no-op
  }
}
export class ContainerRegistryCleaner {
  readonly helpers: Record<string, DockerHelper> = {};

  /** Lazily creates and caches one DockerHelper per GCR subdomain (us/eu/asia/...). */
  private helper(location: string): DockerHelper {
    const subdomain = docker.GCR_SUBDOMAIN_MAPPING[location] || "us";
    let cached = this.helpers[subdomain];
    if (!cached) {
      cached = new DockerHelper(`https://${subdomain}.${containerRegistryDomain}`);
      this.helpers[subdomain] = cached;
    }
    return cached;
  }

  // GCFv1 has the directory structure:
  // gcf/
  //  +- <region>/
  //      +- <uuid>
  //      |    +- <hash> (tags: <FuncName>_version-<#>)
  //      +- cache/ (Only present in first deploy of region)
  //      |    +- <hash> (tags: latest)
  //      +- worker/ (Only present in first deploy of region)
  //           +- <hash> (tags: latest)
  //
  // We'll parallel search for the valid <uuid> and their children
  // until we find one with the right tag for the function name.
  // The underlying Helper's caching should make this expensive for
  // the first function and free for the next functions in the same
  // region.
  async cleanupFunction(func: backend.TargetIds): Promise<void> {
    const helper = this.helper(func.region);
    const root = `${func.project}/gcf/${func.region}`;
    const uuids = (await helper.ls(root)).children;
    // Fetch the tags under every UUID directory in parallel.
    const uuidTags: Record<string, string[]> = {};
    await Promise.all(
      uuids.map(async (uuid) => {
        const path = `${root}/${uuid}`;
        uuidTags[path] = (await helper.ls(path)).tags;
      })
    );
    // A function's images are tagged "<function ID>_version-<#>".
    const extractFunction = /^(.*)_version-\d+$/;
    const match = Object.entries(uuidTags).find(([, tags]) =>
      tags.some((tag) => extractFunction.exec(tag)?.[1] === func.id)
    );
    if (!match) {
      logger.debug("Could not find image for function", backend.functionName(func));
      return;
    }
    await helper.rm(match[0]);
  }
}
// Returns the cached DockerHelper for a subdomain, creating it on first use.
function getHelper(cache: Record<string, DockerHelper>, subdomain: string): DockerHelper {
  const existing = cache[subdomain];
  if (existing) {
    return existing;
  }
  const created = new DockerHelper(`https://${subdomain}.${containerRegistryDomain}`);
  cache[subdomain] = created;
  return created;
}
/**
 * List all paths from the GCF directory in GCR (e.g. us.gcr.io/project-id/gcf/location).
 * @param projectId the current project that contains GCF artifacts
 * @param locations the specific regions to search for artifacts. If omitted, will search all locations.
 * @param dockerHelpers a map of subdomains to {@link DockerHelper}. If omitted, will use the default value and create each {@link DockerHelper} internally.
 *
 * @throws {@link FirebaseError}
 * Thrown if a provided location is not a valid Google Cloud region or we fail to search subdomains.
 */
export async function listGcfPaths(
  projectId: string,
  locations?: string[],
  dockerHelpers: Record<string, DockerHelper> = {}
): Promise<string[]> {
  if (!locations) {
    locations = Object.keys(docker.GCR_SUBDOMAIN_MAPPING);
  }
  const invalidRegion = locations.find((loc) => !docker.GCR_SUBDOMAIN_MAPPING[loc]);
  if (invalidRegion) {
    throw new FirebaseError(`Invalid region ${invalidRegion} supplied`);
  }
  const locationsSet = new Set(locations); // for quick lookup
  const subdomains = new Set(Object.values(docker.GCR_SUBDOMAIN_MAPPING));
  const failedSubdomains: string[] = [];
  const listAll: Promise<Stat>[] = [];
  for (const subdomain of subdomains) {
    listAll.push(
      (async () => {
        try {
          // "return await" (not a bare "return") so a rejected ls() is caught
          // here and recorded in failedSubdomains. Previously the pending
          // promise was returned out of the try, so async failures escaped
          // the catch and rejected Promise.all below with a raw error.
          return await getHelper(dockerHelpers, subdomain).ls(`${projectId}/gcf`);
        } catch (err) {
          failedSubdomains.push(subdomain);
          logger.debug(err);
          const stat: Stat = {
            children: [],
            digests: [],
            tags: [],
          };
          return stat;
        }
      })()
    );
  }
  const gcfDirs = (await Promise.all(listAll))
    .flatMap((results) => results.children)
    .filter((loc) => locationsSet.has(loc));
  if (failedSubdomains.length === subdomains.size) {
    throw new FirebaseError("Failed to search all subdomains.");
  } else if (failedSubdomains.length > 0) {
    throw new FirebaseError(
      `Failed to search the following subdomains: ${failedSubdomains.join(",")}`
    );
  }
  return gcfDirs.map((loc) => {
    return `${docker.GCR_SUBDOMAIN_MAPPING[loc]}.${containerRegistryDomain}/${projectId}/gcf/${loc}`;
  });
}
/**
 * Deletes all artifacts from the GCF directory in GCR.
 * @param projectId the current project that contains GCF artifacts
 * @param locations the specific regions to clean up. If omitted, will delete all locations.
 * @param dockerHelpers a map of subdomains to {@link DockerHelper}. If omitted, will use the default value and create each {@link DockerHelper} internally.
 *
 * @throws {@link FirebaseError}
 * Thrown if a provided location is not a valid Google Cloud region or we fail to delete subdomains.
 */
export async function deleteGcfArtifacts(
  projectId: string,
  locations?: string[],
  dockerHelpers: Record<string, DockerHelper> = {}
): Promise<void> {
  if (!locations) {
    locations = Object.keys(docker.GCR_SUBDOMAIN_MAPPING);
  }
  const invalidRegion = locations.find((loc) => !docker.GCR_SUBDOMAIN_MAPPING[loc]);
  if (invalidRegion) {
    throw new FirebaseError(`Invalid region ${invalidRegion} supplied`);
  }
  const subdomains = new Set(Object.values(docker.GCR_SUBDOMAIN_MAPPING));
  const failedSubdomains: string[] = [];
  const deleteLocations = locations.map(async (loc) => {
    const subdomain = docker.GCR_SUBDOMAIN_MAPPING[loc]!;
    try {
      // Await inside the try so an rm() rejection is recorded here.
      // Previously the pending promise was returned from a non-async
      // callback, so async failures escaped the catch entirely and
      // rejected Promise.all below with a raw error.
      await getHelper(dockerHelpers, subdomain).rm(`${projectId}/gcf/${loc}`);
    } catch (err) {
      failedSubdomains.push(subdomain);
      logger.debug(err);
    }
  });
  await Promise.all(deleteLocations);
  // Error messages previously said "search" — copy-pasted from listGcfPaths.
  if (failedSubdomains.length === subdomains.size) {
    throw new FirebaseError("Failed to delete all subdomains.");
  } else if (failedSubdomains.length > 0) {
    throw new FirebaseError(
      `Failed to delete the following subdomains: ${failedSubdomains.join(",")}`
    );
  }
}
// A snapshot of one "directory" in a Docker registry: the nested
// repositories, image digests, and tags found at that path.
export interface Stat {
  // Names of child repositories directly under this path.
  children: string[];
  // Manifest digests of the images stored at this path.
  digests: docker.Digest[];
  // Tags pointing at images at this path.
  tags: docker.Tag[];
}
/**
 * Thin wrapper over the docker registry API, bound to a single registry
 * origin (e.g. https://us.gcr.io), that memoizes directory listings so
 * repeated traversals of the same tree are free.
 */
export class DockerHelper {
  readonly client: docker.Client;
  // Cache of path -> Stat. NOTE: rm() mutates cached entries in place as it
  // deletes, so the cache tracks what is believed to remain on the server.
  readonly cache: Record<string, Stat> = {};
  constructor(origin: string) {
    this.client = new docker.Client(origin);
  }
  /** Lists tags, digests, and child repositories at a path (memoized). */
  async ls(path: string): Promise<Stat> {
    if (!this.cache[path]) {
      const raw = await retry(() => this.client.listTags(path));
      this.cache[path] = {
        tags: raw.tags,
        digests: Object.keys(raw.manifest),
        children: raw.child,
      };
    }
    return this.cache[path];
  }
  // While we can't guarantee all promises will succeed, we can do our darndest
  // to expunge as much as possible before throwing.
  /**
   * Recursively deletes everything under a path. Children are deleted in
   * parallel with this directory's own tags and images; within this directory,
   * tags are removed before images. Remembers the last error seen and throws
   * it only after attempting every deletion.
   */
  async rm(path: string): Promise<void> {
    // Last failure observed; we keep going and throw it at the end.
    let toThrowLater: unknown = undefined;
    const stat = await this.ls(path);
    // Kick off recursive deletion of children, but don't await yet.
    const recursive = stat.children.map(async (child) => {
      try {
        await this.rm(`${path}/${child}`);
        // Drop successfully-deleted children from the cached Stat so the
        // cache stays accurate for any later traversal.
        stat.children.splice(stat.children.indexOf(child), 1);
      } catch (err) {
        toThrowLater = err;
      }
    });
    // Unlike a filesystem, we can delete a "directory" while its children are still being
    // deleted. Run these in parallel to improve performance and just wait for the result
    // before the function's end.
    // An image cannot be deleted until its tags have been removed. Do this in two phases.
    const deleteTags = stat.tags.map(async (tag) => {
      try {
        await retry(() => this.client.deleteTag(path, tag));
        // Successfully-deleted tags are removed from the cached Stat.
        stat.tags.splice(stat.tags.indexOf(tag), 1);
      } catch (err) {
        logger.debug("Got error trying to remove docker tag:", err);
        toThrowLater = err;
      }
    });
    await Promise.all(deleteTags);
    // Phase two: with tags gone, the images themselves can be deleted.
    const deleteImages = stat.digests.map(async (digest) => {
      try {
        await retry(() => this.client.deleteImage(path, digest));
        stat.digests.splice(stat.digests.indexOf(digest), 1);
      } catch (err) {
        logger.debug("Got error trying to remove docker image:", err);
        toThrowLater = err;
      }
    });
    await Promise.all(deleteImages);
    // Finally wait for the child deletions started above.
    await Promise.all(recursive);
    if (toThrowLater) {
      throw toThrowLater;
    }
  }
}