forked from airbytehq/airbyte
-
Notifications
You must be signed in to change notification settings - Fork 0
/
array.sql
192 lines (161 loc) · 7.43 KB
/
array.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
{#
Adapter Macros for the following functions:
- Bigquery: unnest() -> https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#flattening-arrays-and-repeated-fields
- Snowflake: flatten() -> https://docs.snowflake.com/en/sql-reference/functions/flatten.html
- Redshift: -> https://blog.getdbt.com/how-to-unnest-arrays-in-redshift/
- postgres: unnest() -> https://www.postgresqltutorial.com/postgresql-array/
- MSSQL: openjson() –> https://docs.microsoft.com/en-us/sql/relational-databases/json/validate-query-and-change-json-data-with-built-in-functions-sql-server?view=sql-server-ver15
- ClickHouse: ARRAY JOIN –> https://clickhouse.com/docs/zh/sql-reference/statements/select/array-join/
- Databricks: LATERAL VIEW -> https://docs.databricks.com/spark/latest/spark-sql/language-manual/sql-ref-syntax-qry-select-lateral-view.html
#}
{# cross_join_unnest ------------------------------------------------- #}
{% macro cross_join_unnest(stream_name, array_col) -%}
{{ adapter.dispatch('cross_join_unnest')(stream_name, array_col) }}
{%- endmacro %}
{% macro default__cross_join_unnest(stream_name, array_col) -%}
{% do exceptions.warn("Undefined macro cross_join_unnest for this destination engine") %}
{%- endmacro %}
{% macro bigquery__cross_join_unnest(stream_name, array_col) -%}
cross join unnest({{ array_col }}) as {{ array_col }}
{%- endmacro %}
{% macro clickhouse__cross_join_unnest(stream_name, array_col) -%}
ARRAY JOIN {{ array_col }}
{%- endmacro %}
{% macro oracle__cross_join_unnest(stream_name, array_col) -%}
{% do exceptions.warn("Normalization does not support unnesting for Oracle yet.") %}
{%- endmacro %}
{% macro postgres__cross_join_unnest(stream_name, array_col) -%}
cross join jsonb_array_elements(
case jsonb_typeof({{ array_col }})
when 'array' then {{ array_col }}
else '[]' end
) as _airbyte_nested_data
{%- endmacro %}
{% macro mysql__cross_join_unnest(stream_name, array_col) -%}
left join joined on _airbyte_{{ stream_name }}_hashid = joined._airbyte_hashid
{%- endmacro %}
{% macro redshift__cross_join_unnest(stream_name, array_col) -%}
left join joined on _airbyte_{{ stream_name }}_hashid = joined._airbyte_hashid
{%- endmacro %}
{% macro snowflake__cross_join_unnest(stream_name, array_col) -%}
cross join table(flatten({{ array_col }})) as {{ array_col }}
{%- endmacro %}
{% macro databricks__cross_join_unnest(stream_name, array_col) -%}
lateral view outer explode(from_json({{ array_col }}, 'array<string>')) as _airbyte_nested_data
{%- endmacro %}
{% macro sqlserver__cross_join_unnest(stream_name, array_col) -%}
{# https://docs.microsoft.com/en-us/sql/relational-databases/json/convert-json-data-to-rows-and-columns-with-openjson-sql-server?view=sql-server-ver15#option-1---openjson-with-the-default-output #}
CROSS APPLY (
SELECT [value] = CASE
WHEN [type] = 4 THEN (SELECT [value] FROM OPENJSON([value]))
WHEN [type] = 5 THEN [value]
END
FROM OPENJSON({{ array_col }})
) AS {{ array_col }}
{%- endmacro %}
{# unnested_column_value -- this macro is related to unnest_cte #}
{% macro unnested_column_value(column_col) -%}
{{ adapter.dispatch('unnested_column_value')(column_col) }}
{%- endmacro %}
{% macro default__unnested_column_value(column_col) -%}
{{ column_col }}
{%- endmacro %}
{% macro postgres__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro snowflake__unnested_column_value(column_col) -%}
{{ column_col }}.value
{%- endmacro %}
{% macro redshift__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro mysql__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro databricks__unnested_column_value(column_col) -%}
_airbyte_nested_data
{%- endmacro %}
{% macro oracle__unnested_column_value(column_col) -%}
{{ column_col }}
{%- endmacro %}
{% macro sqlserver__unnested_column_value(column_col) -%}
{# unnested array/sub_array will be located in `value` column afterwards, we need to address to it #}
{{ column_col }}.value
{%- endmacro %}
{# unnest_cte ------------------------------------------------- #}
{% macro unnest_cte(from_table, stream_name, column_col) -%}
{{ adapter.dispatch('unnest_cte')(from_table, stream_name, column_col) }}
{%- endmacro %}
{% macro default__unnest_cte(from_table, stream_name, column_col) -%}{%- endmacro %}
{% macro redshift__unnest_cte(from_table, stream_name, column_col) -%}
{# -- based on https://docs.aws.amazon.com/redshift/latest/dg/query-super.html #}
{% if redshift_super_type() -%}
with joined as (
select
table_alias._airbyte_{{ stream_name }}_hashid as _airbyte_hashid,
_airbyte_nested_data
from {{ from_table }} as table_alias, table_alias.{{ column_col }} as _airbyte_nested_data
)
{%- else -%}
{# -- based on https://blog.getdbt.com/how-to-unnest-arrays-in-redshift/ #}
{%- if not execute -%}
{{ return('') }}
{% endif %}
{%- call statement('max_json_array_length', fetch_result=True) -%}
with max_value as (
select max(json_array_length({{ column_col }}, true)) as max_number_of_items
from {{ from_table }}
)
select
case when max_number_of_items is not null and max_number_of_items > 1
then max_number_of_items
else 1 end as max_number_of_items
from max_value
{%- endcall -%}
{%- set max_length = load_result('max_json_array_length') -%}
with numbers as (
{{dbt_utils.generate_series(max_length["data"][0][0])}}
),
joined as (
select
_airbyte_{{ stream_name }}_hashid as _airbyte_hashid,
json_extract_array_element_text({{ column_col }}, numbers.generated_number::int - 1, true) as _airbyte_nested_data
from {{ from_table }}
cross join numbers
-- only generate the number of records in the cross join that corresponds
-- to the number of items in {{ from_table }}.{{ column_col }}
where numbers.generated_number <= json_array_length({{ column_col }}, true)
)
{%- endif %}
{%- endmacro %}
{% macro mysql__unnest_cte(from_table, stream_name, column_col) -%}
{%- if not execute -%}
{{ return('') }}
{% endif %}
{%- call statement('max_json_array_length', fetch_result=True) -%}
with max_value as (
select max(json_length({{ column_col }})) as max_number_of_items
from {{ from_table }}
)
select
case when max_number_of_items is not null and max_number_of_items > 1
then max_number_of_items
else 1 end as max_number_of_items
from max_value
{%- endcall -%}
{%- set max_length = load_result('max_json_array_length') -%}
with numbers as (
{{ dbt_utils.generate_series(max_length["data"][0][0]) }}
),
joined as (
select
_airbyte_{{ stream_name }}_hashid as _airbyte_hashid,
{# -- json_extract(column_col, '$[i][0]') as _airbyte_nested_data #}
json_extract({{ column_col }}, concat("$[", numbers.generated_number - 1, "][0]")) as _airbyte_nested_data
from {{ from_table }}
cross join numbers
-- only generate the number of records in the cross join that corresponds
-- to the number of items in {{ from_table }}.{{ column_col }}
where numbers.generated_number <= json_length({{ column_col }})
)
{%- endmacro %}