// ExportPipeline.java (forked from GoogleCloudPlatform/DataflowTemplates)
/*
* Copyright (C) 2018 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.teleport.spanner;
import com.google.cloud.spanner.Options.RpcPriority;
import com.google.cloud.spanner.SpannerOptions;
import com.google.cloud.teleport.metadata.Template;
import com.google.cloud.teleport.metadata.TemplateCategory;
import com.google.cloud.teleport.metadata.TemplateCreationParameter;
import com.google.cloud.teleport.metadata.TemplateParameter;
import com.google.cloud.teleport.metadata.TemplateParameter.TemplateEnumOption;
import com.google.cloud.teleport.spanner.ExportPipeline.ExportPipelineOptions;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.io.gcp.spanner.SpannerConfig;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.SerializableFunction;
/**
* Dataflow template that exports a Cloud Spanner database to Avro files in GCS.
*
* <p>Check out <a
* href="https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/v1/README_Cloud_Spanner_to_GCS_Avro.md">README</a>
* for instructions on how to use or modify this template.
*/
@Template(
name = "Cloud_Spanner_to_GCS_Avro",
category = TemplateCategory.BATCH,
displayName = "Cloud Spanner to Avro Files on Cloud Storage",
description =
"A pipeline to export a Cloud Spanner database to a set of Avro files in Cloud Storage.",
optionsClass = ExportPipelineOptions.class,
documentation =
"https://cloud.google.com/dataflow/docs/guides/templates/provided/cloud-spanner-to-avro",
contactInformation = "https://cloud.google.com/support")
public class ExportPipeline {
/**
* Options for Export pipeline.
*
* <p>Every option except {@code waitUntilFinish} is typed as a {@link ValueProvider}. Options
* annotated with {@code @TemplateParameter} carry an {@code order}, a description and help text;
* the declared orders run from 1 to 13 with 5 unused. {@code testJobId} and {@code
* waitUntilFinish} are annotated with {@code @TemplateCreationParameter} instead.
*/
public interface ExportPipelineOptions extends PipelineOptions {
/**
* Id of the Cloud Spanner instance that holds the database to export. The declared regex allows
* lowercase letters, digits and hyphens, starting with a letter and ending with a letter or
* digit.
*/
@TemplateParameter.Text(
order = 1,
regexes = {"[a-z][a-z0-9\\-]*[a-z0-9]"},
description = "Cloud Spanner instance id",
helpText = "The instance id of the Cloud Spanner database that you want to export.")
ValueProvider<String> getInstanceId();
void setInstanceId(ValueProvider<String> value);
/**
* Id of the Cloud Spanner database to export. Same shape as the instance id regex, but
* underscores are also allowed in the middle.
*/
@TemplateParameter.Text(
order = 2,
regexes = {"[a-z][a-z0-9_\\-]*[a-z0-9]"},
description = "Cloud Spanner database id",
helpText = "The database id of the Cloud Spanner database that you want to export.")
ValueProvider<String> getDatabaseId();
void setDatabaseId(ValueProvider<String> value);
/**
* Cloud Storage path the export is written under. Per the help text, a new directory holding the
* export is created beneath this path.
*/
@TemplateParameter.GcsWriteFolder(
order = 3,
description = "Cloud Storage output directory",
helpText =
"The Cloud Storage path where the Avro files should be exported to. A new directory"
+ " will be created under this path that contains the export.",
example = "gs://your-bucket/your-path")
ValueProvider<String> getOutputDir();
void setOutputDir(ValueProvider<String> value);
/** Optional Cloud Storage path in which temporary Avro files may be created. */
@TemplateParameter.GcsWriteFolder(
order = 4,
optional = true,
description = "Cloud Storage temp directory for storing Avro files",
helpText =
"The Cloud Storage path where the temporary Avro files can be created. Ex:"
+ " gs://your-bucket/your-path")
ValueProvider<String> getAvroTempDirectory();
void setAvroTempDirectory(ValueProvider<String> value);
/**
* Job identifier described as being for tests on the Beam Direct Runner. Declared as a
* template-creation parameter; both the creation value and the default are the empty string.
*/
@TemplateCreationParameter(value = "")
@Description("Test dataflow job identifier for Beam Direct Runner")
@Default.String(value = "")
ValueProvider<String> getTestJobId();
void setTestJobId(ValueProvider<String> jobId);
/**
* Optional Cloud Spanner endpoint; defaults to {@code https://batch-spanner.googleapis.com}.
* The help text states it is only used for testing.
*/
@TemplateParameter.Text(
order = 6,
optional = true,
description = "Cloud Spanner Endpoint to call",
helpText = "The Cloud Spanner endpoint to call in the template. Only used for testing.",
example = "https://batch-spanner.googleapis.com")
@Default.String("https://batch-spanner.googleapis.com")
ValueProvider<String> getSpannerHost();
void setSpannerHost(ValueProvider<String> value);
/**
* Whether the launcher should wait for the job to finish. This is a plain {@code boolean}, not a
* {@link ValueProvider}. The default is {@code true}, while the template-creation value is
* {@code "false"}.
*
* <p>NOTE(review): presumably the creation value overrides the default while a template is being
* staged; confirm against the template tooling.
*/
@TemplateCreationParameter(value = "false")
@Description("If true, wait for job finish")
@Default.Boolean(true)
boolean getWaitUntilFinish();
void setWaitUntilFinish(boolean value);
/**
* Optional snapshot timestamp; defaults to the empty string. The regex requires the form {@code
* YYYY-MM-DDThh:mm:ss[.fraction]Z}, i.e. always with a trailing {@code Z}.
*/
@TemplateParameter.Text(
order = 7,
optional = true,
regexes = {
"^([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2}):(([0-9]{2})(\\.[0-9]+)?)Z$"
},
description = "Snapshot time",
helpText =
"Specifies the snapshot time as RFC 3339 format in UTC time without the timezone"
+ " offset(always ends in 'Z'). Timestamp must be in the past and Maximum timestamp"
+ " staleness applies. See"
+ " https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness",
example = "1990-12-31T23:59:59Z")
@Default.String(value = "")
ValueProvider<String> getSnapshotTime();
void setSnapshotTime(ValueProvider<String> value);
/** Optional project id of the Cloud Spanner instance. No {@code @Default} is declared here. */
@TemplateParameter.ProjectId(
order = 8,
optional = true,
description = "Cloud Spanner Project Id",
helpText = "The project id of the Cloud Spanner instance.")
ValueProvider<String> getSpannerProjectId();
void setSpannerProjectId(ValueProvider<String> value);
/**
* Optional flag, default {@code false}. Per the help text, {@code true} exports timestamps as
* the timestamp-micros type instead of ISO8601 strings.
*/
@TemplateParameter.Boolean(
order = 9,
optional = true,
description = "Export Timestamps as Timestamp-micros type",
helpText =
"If true, Timestamps are exported as timestamp-micros type. Timestamps are exported as"
+ " ISO8601 strings at nanosecond precision by default.")
@Default.Boolean(false)
ValueProvider<Boolean> getShouldExportTimestampAsLogicalType();
void setShouldExportTimestampAsLogicalType(ValueProvider<Boolean> value);
/**
* Optional comma-separated list of table names; defaults to the empty string. The regex allows
* names made of letters, digits and underscores, separated by single commas with no spaces.
*/
@TemplateParameter.Text(
order = 10,
optional = true,
regexes = {"^[a-zA-Z0-9_]+(,[a-zA-Z0-9_]+)*$"},
description = "Cloud Spanner table name(s).",
helpText =
"If provided, only this comma separated list of tables are exported. Ancestor tables"
+ " and tables that are referenced via foreign keys are required. If not explicitly"
+ " listed, the `shouldExportRelatedTables` flag must be set for a successful"
+ " export.")
@Default.String(value = "")
ValueProvider<String> getTableNames();
void setTableNames(ValueProvider<String> value);
/**
* Optional flag, default {@code false}. The help text describes it as a companion to {@code
* tableNames} that adds the related tables needed for the export.
*/
@TemplateParameter.Boolean(
order = 11,
optional = true,
description = "Export necessary Related Spanner tables.",
helpText =
"Used in conjunction with `tableNames`. If true, add related tables necessary for the"
+ " export, such as interleaved parent tables and foreign keys tables. If"
+ " `tableNames` is specified but doesn't include related tables, this option must"
+ " be set to true for a successful export.")
@Default.Boolean(false)
ValueProvider<Boolean> getShouldExportRelatedTables();
void setShouldExportRelatedTables(ValueProvider<Boolean> value);
/**
* Optional request priority for Cloud Spanner calls, offered as the enum options LOW, MEDIUM and
* HIGH. No {@code @Default} is declared here.
*/
@TemplateParameter.Enum(
order = 12,
enumOptions = {
@TemplateEnumOption("LOW"),
@TemplateEnumOption("MEDIUM"),
@TemplateEnumOption("HIGH")
},
optional = true,
description = "Priority for Spanner RPC invocations",
helpText =
"The request priority for Cloud Spanner calls. The value must be one of:"
+ " [HIGH,MEDIUM,LOW].")
ValueProvider<RpcPriority> getSpannerPriority();
void setSpannerPriority(ValueProvider<RpcPriority> value);
/**
* Optional flag, default {@code false}. Per the help text, enabling it runs the export on
* independent Spanner compute (DataBoost) and incurs additional Spanner charges.
*/
@TemplateParameter.Boolean(
order = 13,
optional = true,
description = "Use independent compute resource (Spanner DataBoost).",
helpText =
"Use Spanner on-demand compute so the export job will run on independent compute"
+ " resources and have no impact to current Spanner workloads. This will incur"
+ " additional charges in Spanner.")
@Default.Boolean(false)
ValueProvider<Boolean> getDataBoostEnabled();
void setDataBoostEnabled(ValueProvider<Boolean> value);
}
/**
 * Entry point: builds the Cloud Spanner to Avro export pipeline from the command-line options and
 * runs it.
 *
 * <p>Once the pipeline has been submitted, this method blocks on its completion only when the
 * {@code waitUntilFinish} option is set and no Dataflow template location is configured.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  final ExportPipelineOptions exportOptions =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(ExportPipelineOptions.class);
  final Pipeline pipeline = Pipeline.create(exportOptions);

  // Temporary workaround: when spannerProjectId is not supplied, explicitly fall back to the
  // default project for SpannerConfig.projectId. Needed as of Beam 2.38, which rejects null
  // label values on metrics, while a bug in SpannerIO#setup() sets the label to the original
  // parameter value without falling back to the default project.
  // TODO: remove NestedValueProvider when this is fixed in Beam.
  final SerializableFunction<String, String> orDefaultProject =
      projectId -> {
        if (projectId == null) {
          return SpannerOptions.getDefaultProjectId();
        }
        return projectId;
      };
  final ValueProvider<String> projectIdProvider =
      NestedValueProvider.of(exportOptions.getSpannerProjectId(), orDefaultProject);

  final SpannerConfig spannerConfig =
      SpannerConfig.create()
          .withProjectId(projectIdProvider)
          .withHost(exportOptions.getSpannerHost())
          .withInstanceId(exportOptions.getInstanceId())
          .withDatabaseId(exportOptions.getDatabaseId())
          .withRpcPriority(exportOptions.getSpannerPriority())
          .withDataBoostEnabled(exportOptions.getDataBoostEnabled());

  final ExportTransform exportTransform =
      new ExportTransform(
          spannerConfig,
          exportOptions.getOutputDir(),
          exportOptions.getTestJobId(),
          exportOptions.getSnapshotTime(),
          exportOptions.getTableNames(),
          exportOptions.getShouldExportRelatedTables(),
          exportOptions.getShouldExportTimestampAsLogicalType(),
          exportOptions.getAvroTempDirectory());
  pipeline.begin().apply("Run Export", exportTransform);

  final PipelineResult runResult = pipeline.run();

  if (!exportOptions.getWaitUntilFinish()) {
    return;
  }
  // A non-null template location means this run only generated a template, which doesn't start
  // a dataflow job, so there is nothing to wait for.
  if (exportOptions.as(DataflowPipelineOptions.class).getTemplateLocation() != null) {
    return;
  }
  runResult.waitUntilFinish();
}
}