-
Notifications
You must be signed in to change notification settings - Fork 0
/
restructure.yml
168 lines (159 loc) · 5.3 KB
/
restructure.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# Service mode: controls whether the application keeps running and polls
# for new data instead of doing a single run.
service:
  # Whether to run the application as a polling service.
  enable: false
  # Polling interval in seconds.
  interval: 30
# Source data resource
# @since: 0.7.0
source:
  # Selects which of the sections below is used: hdfs, azure or s3.
  type: s3
  s3:
    endpoint: http://localhost:9000  # using AWS S3 endpoint is also possible.
    bucket: radar
    # NOTE(review): these look like local development credentials; do not
    # commit real secrets to this file.
    accessToken: minioadmin
    secretKey: minioadmin
    # If true, try to read the metadata property "endOffset" to determine the
    # final offset of an input object.
    #endOffsetFromTags: false
  azure:
    endpoint: https://MyBlobStorageAccount.blob.core.windows.net
    # when using personal login
    #username: User
    #password: Password
    # when using shared access tokens
    #accountName: MyBlobStorageAccount
    #accountKey: MyLongToken
    # when using a specially made SAS token
    #sasToken: MyLongToken
    # if no credentials are supplied, this only works with a publicly writable blob storage
    container: MySourceContainer
    # If true, try to read the metadata property "endOffset" to determine the
    # final offset of an input object.
    #endOffsetFromMetadata: false
  # only actually needed if source type is hdfs
  hdfs:
    nameNodes: [hdfs-namenode-1, hdfs-namenode-2]
# Target data resource
# @since: 0.7.0
target:
  # Selects which of the sections below is used: s3, azure or local.
  type: s3
  s3:
    endpoint: http://localhost:9000
    bucket: out
    # NOTE(review): these look like local development credentials; do not
    # commit real secrets to this file.
    accessToken: minioadmin
    secretKey: minioadmin
  azure:
    endpoint: https://MyBlobStorageAccount.blob.core.windows.net
    # when using personal login
    #username: User
    #password: Password
    # when using shared access tokens
    #accountName: MyBlobStorageAccount
    #accountKey: MyLongToken
    # when using a specially made SAS token
    #sasToken: MyLongToken
    # if no credentials are supplied, this only works with a publicly writable blob storage
    container: MyTargetContainer
  # only actually needed if target type is local
  local:
    userId: 1000  # write as user ID 1000, use -1 to use current user (default).
    groupId: 100  # write as group ID 100, use -1 to use the current user's group (default).
# Redis configuration
# @since: 0.7.0
redis:
  # Redis URI
  uri: redis://localhost:6379
  # Key prefix for locks
  lockPrefix: radar-output/lock/
# Compression characteristics
compression:
  # Compression type: none, zip or gzip
  type: gzip
  # Compression Factory class (optional; uncomment to override)
  # factory: org.radarbase.hdfs.compression.CompressionFactory
  # Additional compression properties (optional)
  # properties: {}
# File format
format:
  # Format type: CSV or JSON
  type: csv
  # Whether to deduplicate the files in each topic by default.
  # Individual topics can override this in the `topics` section.
  deduplication:
    enable: true
    # Use specific fields to consider records distinct. Disregarded if empty.
    # distinctFields: []
    # Ignore specific fields to consider records distinct. Disregarded if empty.
    # ignoreFields: []
  # Format factory class (optional; uncomment to override)
  # factory: org.radarbase.hdfs.format.FormatFactory
  # Additional format properties (optional)
  # properties: {}
# Worker settings. Each worker thread has its own cache and topic, so the
# settings only apply to a single thread.
worker:
  # Enable processing files for extraction
  enable: true
  # Maximum number of files and converters to keep open while processing.
  # A larger value keeps more files open and so needs more memory.
  # NOTE(review): an earlier comment claimed that increasing this decreases
  # memory pressure and slows processing; that looks inverted - confirm
  # against the application documentation.
  cacheSize: 300
  # Maximum number of offsets in cache. A larger value needs more memory.
  # NOTE(review): the effect on processing speed should be confirmed against
  # the application documentation.
  cacheOffsetsSize: 500000
  # Number of threads to do processing on
  numThreads: 2
  # Maximum number of files to process in any given topic.
  # null leaves this unset.
  maxFilesPerTopic: null
  # Minimum time in seconds since a source file was last modified before it is
  # considered for processing. This avoids synchronization issues when a file
  # has just been written.
  minimumFileAge: 60
# Cleaner settings: removal of old source files.
cleaner:
  # Enable cleaning up old source files
  enable: true
  # Interval in seconds to clean data
  interval: 1260  # 21 minutes
  # Number of days after which a source file is considered old
  age: 7
# Path settings
paths:
  # Input directories in source
  inputs:
    - /testIn
  # Root temporary directory for local file processing.
  temp: ./output/+tmp
  # Output directory in target
  output: /output
  # Output path construction factory
  factory: org.radarbase.output.path.FormattedPathFactory
  # Additional properties (optional; uncomment to pass to the path factory)
  # properties:
  #   format: ${projectId}/${userId}/${topic}/${time:mm}/${time:YYYYmmDD_HH'00'}${attempt}${extension}
  #   plugins: fixed time key value org.example.plugin.MyPathPlugin
# Individual topic configuration. Each key directly under `topics` is a
# topic name; its children are the settings for that topic.
topics:
  # topic name
  connect_fitbit_source:
    # deduplicate this topic, regardless of the format settings
    deduplication:
      enable: true
      # deduplicate this topic only using given fields.
      distinctFields: [key.sourceId, value.time]
  # topic name
  connect_fitbit_source2:
    # deduplicate this topic, regardless of the format settings
    deduplication:
      enable: true
      # deduplicate this topic without regard of given fields.
      ignoreFields: [value.timeReceived]
  connect_fitbit_bad:
    # Do not process this topic
    exclude: true
  biovotion_acceleration:
    # Disable deduplication
    deduplication:
      enable: false
  questionnaire_response:
    # Specify an alternative path format.
    pathProperties:
      format: ${projectId}/${userId}/${topic}/${value:name}/${filename}
      plugins: fixed value