"""
Functions for classifying Sentinel 2 images using a trained classification model
"""
import glob
import os

import dask
import numpy as np
import pandas as pd
import xarray as xr

import ebmodel as ebm

try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
class SurfaceClassifier:
# Bands to use in classifier
s2_bands_use = ['B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B8A', 'B11', 'B12']
# Level 2A scaling factor
l2a_scaling_factor = 10000
# names of bands as used in classifier
NAME_bands = 'bands'
NAME_x = 'x'
NAME_y = 'y'
def __init__(self, pkl):
""" Initialise classifier.
:param pkl: path of pickled classifier.
:type pkl: str
"""
self.classifier = joblib.load(pkl)
return
def load_img_to_xr(self, img_path, resolution, Icemask, Cloudmask):
""" Load Sentinel-2 JP2000s into xr.Dataset.
Load all bands of image (specified by s2_bands_use) into an xarray
Dataset, include Icemask and Cloudmask, return the in-memory Dataset.
Applies scaling factor.
"""
# Open link to each image
store = []
for band in self.s2_bands_use:
fn = glob.glob('%s*%s_%sm.jp2' %(img_path,band,resolution))
if len(fn) > 1:
raise ValueError("Multiple bands named {} in blob container. One expected.".format(fn))
else:
try:
fn = fn[0]
except:
raise IndexError("At least one band missing from S2 blob container")
da_band = xr.open_rasterio(fn, chunks={self.NAME_x: 2000, self.NAME_y: 2000})
crs = da_band.attrs['crs']
# Apply scaling factor
da_band = da_band / self.l2a_scaling_factor
da_band[self.NAME_bands] = band
store.append(da_band)
# Concatenate all bands into a single DataArray
da = xr.concat(store, dim=self.NAME_bands).squeeze()
# Create complete dataset
ds = xr.Dataset({ 'Data': ((self.NAME_bands,self.NAME_y,self.NAME_x), da),
'Icemask': ((self.NAME_y,self.NAME_x), Icemask),
'Cloudmask': ((self.NAME_y,self.NAME_x), Cloudmask) },
coords={self.NAME_bands:self.s2_bands_use,
self.NAME_y:da_band[self.NAME_y],
self.NAME_x:da_band[self.NAME_x]})
ds.Data.attrs['crs'] = crs
return ds
def classify_image(self, S2vals, savepath, tile, date,
savefigs=True):
""" Apply classifier to image.
function applies pickled classifier to multispectral S2 image saved as
NetCDF, saving plot and summary data to output folder.
"""
# stack the values into a 1D array
stacked = S2vals.Data.stack(allpoints=[self.NAME_y,self.NAME_x])
# Transpose and rename so that DataArray has exactly the same layout/labels as the training DataArray.
# mask out nan areas not masked out by GIMP
stackedT = stacked.T
stackedT = stackedT.rename({'allpoints': 'samples'})
# apply classifier
predicted = self.classifier.predict(stackedT)
# Unstack back to x,y grid
predicted = predicted.unstack(dim='samples')
return predicted
def calculate_albedo(self, S2vals):
""" Calculate albedo using Liang et al (2002) equation
For some reason B8A is being used in unforked version of repo, even
though Naegeli uses B08...
See also Naegeli et al 2017, Remote Sensing
"""
albedo = ( 0.356 * S2vals.Data.loc[{self.NAME_bands:'B02'}] \
+ 0.130 * S2vals.Data.loc[{self.NAME_bands:'B04'}] \
+ 0.373 * S2vals.Data.loc[{self.NAME_bands:'B8A'}] \
+ 0.085 * S2vals.Data.loc[{self.NAME_bands:'B11'}] \
+ 0.072 * S2vals.Data.loc[{self.NAME_bands:'B12'}] \
- 0.0018 )
return albedo
def calculate_2DBA(self, S2vals):
Index2DBA = S2vals.Data.loc[{self.NAME_bands:'B05'}]/S2vals.Data.loc[{self.NAME_bands:'B04'}]
predict2DBA = 201100* Index2DBA -194500
return Index2DBA, predict2DBA
def combine_masks(self, S2vals):
""" Combine ice mask and cloud masks """
mask2 = (S2vals.Icemask.values == 1) \
& (S2vals.Data.sum(dim=self.NAME_bands) > 0) \
& (S2vals.Cloudmask.values == 0)
return mask2
def invert_snicar_multi_LUT(self, S2vals, mask2, predictedxr, predict2DBA, wavelengths, idx, tile, year, month):
"""
function inverts snicar but with 2BDA band ratio acting as first-pass filter
i.e. the 2BDA value is used to restrict the LUT to a particular parameter range
This allows finer resolution of parameters without sacrificing computation time
"""
# REFORMAT INPUT DATA
# 1) reshape band data into 1D stack, rename and chunk
stacked = S2vals.Data.stack(allpoints=[self.NAME_y,self.NAME_x])
stackedT = stacked.T
stackedT = stackedT.rename({'allpoints': 'samples'})
stackedT = stackedT.chunk(20000,9)
# 2) Repeat for 2BDA data
stacked2DBA = predict2DBA.stack(allpoints=[self.NAME_y,self.NAME_x])
stacked2DBA = stacked2DBA.T
stacked2DBA = stacked2DBA.rename({'allpoints': 'samples'})
stacked2DBA.chunk(20000)
dz = [0.02, 0.03, 0.04, 0.05, 0.075 , 0.1]
densities = [600, 650, 700, 750, 800, 850]
grain_rds = [600, 700, 800, 900, 1000]
algae = [0, 5000, 7500, 10000, 12500, 15000, 17500, 20000, 25000, 30000]
solzens = [40, 45, 50, 55]
algae_low = [0, 5000, 7500, 10000, 12500]
algae_mid = [10000, 12500, 15000, 17500]
algae_high = [15000, 17500, 20000, 25000, 30000]
# load the LUT associated with the appropriate solar zenith angle
# then reshape to correct wavelength range and flatten
LUT = np.load(str(os.environ['PROCESS_DIR'] +'LUT.npy'))
# select appropriate SZA from file metadata
files = glob.glob(os.environ['PROCESS_DIR'] + '*.jp2') # grab current files
file = files[5] #take first file from list
print(file[52:60]) # print year,month and day from the filename
rowID = file[52:60] # make the date info from the filename rowID
# open the metadata file
metadataDF = pd.read_csv('/datadrive/BigIceSurfClassifier/Process_Dir/AllFiles_coszen.csv')
#find the row corresponding to the open file
DF2 = metadataDF[(metadataDF['filename'].str.contains(rowID)) & (metadataDF['filename'].str.contains(tile.upper()))]
# condition to protect against missing or invalid data in metadata file (defaults to most common SZA)
if len(DF2)==0:
coszen = 0.7
else:
# grab the coszen value from the metadata
coszen = DF2['coszen'].values[0]
coszen = int(coszen*10) #round to nearest tenth
if coszen >=0.75:
zen_idx = 0
elif (coszen >=0.65) & (coszen < 0.75):
zen_idx = 1
elif (coszen >=0.58) & (coszen < 0.65):
zen_idx = 2
else:
zen_idx = 3
print(LUT.shape)
LUT_low = LUT[zen_idx,:,:,:,0:5,10:].reshape(len(densities)*len(grain_rds)*len(dz)*len(algae_low),len(wavelengths[10:]))
LUT_mid = LUT[zen_idx,:,:,:,3:7,10:].reshape(len(densities)*len(grain_rds)*len(dz)*len(algae_mid),len(wavelengths[10:]))
LUT_high = LUT[zen_idx,:,:,:,5:,10:].reshape(len(densities)*len(grain_rds)*len(dz)*len(algae_high),len(wavelengths[10:]))
# reshape LUT to correct wavelengths and convert to xr.DataArray_hi
LUT_low = LUT_low[:,idx]
LUT_low = xr.DataArray(LUT_low,dims=('spectrum','bands')).astype(np.float32)
LUT_mid = LUT_mid[:,idx]
LUT_mid = xr.DataArray(LUT_mid,dims=('spectrum','bands')).astype(np.float32)
LUT_high = LUT_high[:,idx]
LUT_high = xr.DataArray(LUT_high,dims=('spectrum','bands')).astype(np.float32)
# set up empty dataArrays for calculating error between S2 and LUT
error_array = np.zeros(shape=stackedT.shape)
error_array1 = xr.DataArray(error_array,dims=('spectrum','bands')).astype(np.float32)
error_array2 = xr.DataArray(error_array,dims=('spectrum','bands')).astype(np.float32)
error_array3 = xr.DataArray(error_array,dims=('spectrum','bands')).astype(np.float32)
# subtract S2 reflectance from LUT
print("subtraction step")
error_array1 = LUT_low-stackedT.astype(np.float32)
error_array2 = LUT_mid-stackedT.astype(np.float32)
error_array3 = LUT_high-stackedT.astype(np.float32)
# make absolute (i.e. -ves to +ve)
print("taking abs")
idx_array1 = xr.apply_ufunc(abs,error_array1,dask='allowed')
idx_array2 = xr.apply_ufunc(abs,error_array2,dask='allowed')
idx_array3 = xr.apply_ufunc(abs,error_array3,dask='allowed')
# flush used error_arrays from memory
error_array1 = None
error_array2 = None
error_array3 = None
# take mean of absolute error across wavelength
print("finding mean")
idx_array1 = xr.apply_ufunc(np.mean,idx_array1,input_core_dims=[['bands']],dask='allowed',kwargs={'axis':-1})
idx_array2 = xr.apply_ufunc(np.mean,idx_array2,input_core_dims=[['bands']],dask='allowed',kwargs={'axis':-1})
idx_array3 = xr.apply_ufunc(np.mean,idx_array3,input_core_dims=[['bands']],dask='allowed',kwargs={'axis':-1})
# find index of lowest mean error
print("finding argmin")
idx_array1 = xr.apply_ufunc(np.argmin,idx_array1,input_core_dims=[['spectrum']],dask='allowed',kwargs={'axis':-1})
idx_array2 = xr.apply_ufunc(np.argmin,idx_array2,input_core_dims=[['spectrum']],dask='allowed',kwargs={'axis':-1})
idx_array3 = xr.apply_ufunc(np.argmin,idx_array3,input_core_dims=[['spectrum']],dask='allowed',kwargs={'axis':-1})
print("masking by 2BDA")
idx_array1 = np.where((stacked2DBA.values>=10000)&(stacked2DBA.values<20000),idx_array2, idx_array1)
idx_array1 = np.where(stacked2DBA.values>20000,idx_array3, idx_array1)
param_array1 = np.array(np.unravel_index(idx_array1,[len(densities),len(grain_rds),len(dz),len(algae_low)]))
param_array2 = np.array(np.unravel_index(idx_array2,[len(densities),len(grain_rds),len(dz),len(algae_mid)]))
param_array3 = np.array(np.unravel_index(idx_array3,[len(densities),len(grain_rds),len(dz),len(algae_high)]))
# creat esingle array with param idxs aligned with 2BDA range (i.e use different LUT depending on LUT)
param_array1 = np.where((stacked2DBA.values>=10000)&(stacked2DBA.values<20000), param_array2, param_array1)
param_array1 = np.where(stacked2DBA.values>=20000, param_array3, param_array1)
# flush now-redundant param_arrays from memory
param_array2 = None
param_array3= None
# flush no-redundant idx_arrays from memory
idx_array1 = None
idx_array2 = None
idx_array3 = None
## Start the bit that hurts my head...
# use the indexes to retrieve the actual parameter values for each pixel from the LUT indexes
# only the first value from the parameter arrays is needed.
param_names = ['density','reff','dz','algae']
# concatenate param values from each LUT into list of lists
algae = [algae_low, algae_mid, algae_high]
densities = [densities,densities, densities]
dz = [dz, dz, dz]
grain_rds = [grain_rds, grain_rds, grain_rds]
# define params to iterate through - top level iterator
params = [densities, grain_rds, dz, algae]
# loop through params (e.g. j == densities)
for paramChoice in np.arange(0,len(params),1):
# select parameter from parameter list
param = params[paramChoice]
# set up empty result arrays
result_array = np.empty(shape=stacked2DBA.shape)
result_array2 = np.empty(shape=stacked2DBA.shape)
result_array3 = np.empty(shape=stacked2DBA.shape)
# select array of parameter values from each parameter
for paramValues in np.arange(0,len(param),1):
# iterate through individual values in each parameter value list
for ParamVal in np.arange(0,len(param[paramValues]),1):
# condition checks which array of param values we are in
# each param value array is specific to an individual LUT
# create separate array for each LUT
if paramValues==0:
result_array[param_array1[paramChoice]==ParamVal] = param[paramValues][ParamVal]
elif paramValues==1:
result_array2[param_array1[paramChoice]==ParamVal] = param[paramValues][ParamVal]
elif paramValues==2:
result_array3[param_array1[paramChoice]==ParamVal] = param[paramValues][ParamVal]
# combine individual result arrays by selecting values from each based on 2BDA value
result_array = np.where((stacked2DBA.values>=10000)&(stacked2DBA.values<20000),result_array2, result_array)
result_array = np.where(stacked2DBA >= 20000, result_array3, result_array)
# reshape result_array to original dims, add metadata, convert to xr DataArray, apply mask
result_array = result_array.reshape(int(np.sqrt(len(stackedT))),int(np.sqrt(len(stackedT))))
resultxr = xr.DataArray(data=result_array,dims=['y','x'], coords={'x':S2vals.x, 'y':S2vals.y}).chunk(2000,2000)
# mask to ice area
resultxr = resultxr.where(mask2 > 0)
# send to netcdf and flush memory
resultxr.to_netcdf(str(os.environ['PROCESS_DIR']+ "{}.nc".format(param_names[paramChoice])))
# flush now-redundant result_arrays from memory
result_array = None
result_array2 = None
resultxr = None
return
def invert_snicar_single_LUT(self, S2vals, mask2, predictedxr, densities, grain_rds, algae, wavelengths, idx, tile, year, month):
"""
Pixelwise retrieval of snicar RT params by matching spectra against snicar-generated LUT loaded from process_dir
"""
# stack the values into a 1D array
stacked = S2vals.Data.stack(allpoints=[self.NAME_y,self.NAME_x])
# Transpose and rename so that DataArray has exactly the same layout/labels as the training DataArray.
stackedT = stacked.T
stackedT = stackedT.rename({'allpoints': 'samples'})
# RECHUNK STACKEDT: choice of chunk size is crucial for maximising speed while preventing memory over-allocation.
# While 20000 seems small there are very large intermediate arrays spawned by the compute() function.
# chunks >~20000 give memory error due to size of intermediate arrays
stackedT = stackedT.chunk(20000,9)
#################################
# SELECT AND LOAD LUT
#################################
# identify coszen and load appropriate lut
files = glob.glob(os.environ['PROCESS_DIR'] + '*.jp2') # grab current files
file = files[5] #take first file from list
print(file[52:60]) # print year,month and day from the filename
rowID = file[52:60] # make the date info from the filename rowID
print(rowID)
print(tile.upper())
# open the metadata file
metadataDF = pd.read_csv('/datadrive/BigIceSurfClassifier/Process_Dir/AllFiles_coszen.csv')
#find the row corresponding to the open file
DF2 = metadataDF[(metadataDF['filename'].str.contains(rowID)) & (metadataDF['filename'].str.contains(tile.upper()))]
# condition to protect against missing or invalid data in metadata file (defaults to most common SZA)
if len(DF2)==0:
coszen = 0.7
else:
# grab the coszen value from the metadata
coszen = DF2['coszen'].values[0]
coszen = int(coszen*10) #round to nearest tenth
# load the LUT associated with the appropriate solar zenith angle
LUT = np.load(str(os.environ['PROCESS_DIR'] +'LUT.npy'.format(coszen)))
# reformat LUT: flatten LUT from 3D to 2D array with one column per combination
# of RT params, one row per wavelength
LUT = LUT.reshape(len(densities)*len(grain_rds)*len(algae),len(wavelengths))
##################################
###################################
# find most similar LUT spectrum for each pixel in S2 image
# astype(float 16) to reduce memory allocation (default was float64)
LUT = LUT[:,idx] # reduce wavelengths to only the 9 that match the S2 image
LUT = xr.DataArray(LUT,dims=('spectrum','bands')).astype(np.float16)
################################
# original - uncomment to restore
#################################
# error_array = LUT - stackedT.astype(np.float16) # subtract reflectance from snicar reflectance pixelwise
####################
# NEW: EXPERIMENTAL
####################
def get_error_array(LUT,ds):
error_array = LUT-stackedT.astype(np.float32)
return error_array
error_array = xr.apply_ufunc(get_error_array,LUT,stackedT)
###################
# END
###################
idx_array = xr.apply_ufunc(abs,error_array,dask='allowed')
idx_array = xr.apply_ufunc(np.mean,idx_array,input_core_dims=[['bands']],dask='allowed',kwargs={'axis':-1})
idx_array = xr.apply_ufunc(np.argmin,idx_array,input_core_dims=[['spectrum']],dask='allowed',kwargs={'axis':-1})
# unravel index computes the index in the original n-dimeniona LUT from the index in the flattened LUT
@dask.delayed
def unravel(idx_array,densities,grain_rds,algae):
param_array = np.array(np.unravel_index(idx_array,[len(densities),len(grain_rds),len(algae)]))
return param_array
param_array = unravel(idx_array,densities,grain_rds,algae)
param_array = np.array(param_array.compute(scheduler='processes'))
# flush disk
idx_array = None
error_array = None
dirtyLUT = None
#use the indexes to retrieve the actual parameter values for each pixel from the LUT indexes
# only the first value from the parameter arrays is needed.
counter = 0
param_names = ['density','reff','algae']
for param in [densities, grain_rds, algae]:
for i in np.arange(0,len(param),1):
if i ==0: # in first loop, pixels !=i should be replaced by param_array values, as result_array doesn't exist yet
result_array = np.where(param_array[counter]==i, param[i], param_array[counter])
else:
result_array = np.where(param_array[counter]==i, param[i], result_array)
# reshape to original dims, add metadata, convert to xr DataArray, apply mask
result_array = result_array.reshape(int(np.sqrt(len(stackedT))),int(np.sqrt(len(stackedT))))
resultxr = xr.DataArray(data=result_array,dims=['y','x'], coords={'x':S2vals.x, 'y':S2vals.y}).chunk(2000,2000)
resultxr = resultxr.where(mask2 > 0)
# send to netcdf and flush memory
resultxr.to_netcdf(str(os.environ['PROCESS_DIR']+ "{}.nc".format(param_names[counter])))
result_array = None
resultxr = None
counter +=1
# retrieved params are saved as temporary netcdfs to the process_dir and then collated directly from
# file into the final dataset in run_classifier.py
return
def run_ebmodel(self, alb):
## Input Data, as per first row of Brock and Arnold (2000) spreadsheet
lat = 67.0666
lon = -49.38
lon_ref = 0
summertime = 0
slope = 1.
aspect = 90.
elevation = 1020.
albedo = alb
roughness = 0.005
met_elevation = 1020.
lapse = 0.0065
day = 202
time = 1200
inswrad = 571
avp = 900
airtemp = 5.612
windspd = 3.531
SWR,LWR,SHF,LHF = ebm.calculate_seb(lat, lon, lon_ref, day, time, summertime, slope, aspect, elevation,
met_elevation, lapse, inswrad, avp, airtemp, windspd, albedo, roughness)
sw_melt, lw_melt, shf_melt, lhf_melt, total = ebm.calculate_melt(
SWR,LWR,SHF,LHF, windspd, airtemp)
# flush memory
sw_melt = None
lw_melt = None
shf_melt = None
lhf_melt = None
SWR = None
LWR = None
SHF = None
LHF = None
return total