-
Notifications
You must be signed in to change notification settings - Fork 2
/
01 EDA - FITs.R
271 lines (204 loc) · 8.62 KB
/
01 EDA - FITs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
library(tidyverse)
library(dplyr)
library(lubridate)
library(rgdal)
library(raster)
library(tmap)
library(tmaptools)
library(sf)
library(forcats)
library(reshape2)
tmap_mode("view")
# Get FiT data ---------------------------------------------------
# FIT data comes in 3x ~25mb files - Read and combine all 3
# fits1 <- readxl::read_excel('data/fit_installation_report_-_april_2019_part_1_final.xlsx', skip=5)
# fits2 <- readxl::read_excel('data/fit_installation_report_-_april_2019_part_2_final.xlsx', skip=5)
# fits3 <-readxl::read_excel('data/fit_installation_report_-_april_2019_part_3_final.xlsx', skip=5)
# fits <- rbind(fits1, fits2, fits3) # Combine
# Rename columns for easier manipulation
# fits <- rename(fits,
# extension = `Extension: Y/N`,
# postcode = `Installation Postcode`,
# tech = `Technology`,
# cap = `Installed capacity` ,
# net_cap = `Declared net capacity`,
# app_date = `Application date`,
# comm_date = `Commissioning date`,
# msc_date = `MCS issue date`,
# export = `Export status`,
# tariff = `Tariff code`,
# tariff_desc = `Tariff description`,
# type = `Installation type`,
# country = `Country`,
# la = `Local authority`,
# govt_region = `Government office region`,
# accred = `Accreditation Route`,
# mpan = `METER NUMBER (MPAN)`,
# school = `Community/school category`,
# lsoa = `LLSOA code`)
# write_csv(fits, 'data/fits_data_april_2019_combined.csv')
fits <- read_csv('data/fits_data_april_2019_combined.csv')
# Other pre-prep to FiT Database
######################### Get Renewable Energy Planning Database (REPD)
repd <- readxl::read_xlsx('data/renewable-energy-public-database-q1-2019.xlsx', sheet = 'Database', skip = 6)
# Drop missing values
repd <- repd[!is.na(repd$`Installed Capacity (MWelec)`),]
# Filter for solar only
repd <- repd %>% filter(`Technology Type`=='Solar Photovoltaics')
# Create factors
repd$`Development Status (short)` <- as_factor(repd$`Development Status (short)`)
# Capacity by status
repd %>% group_by(`Development Status (short)`) %>%
summarise(capacity=sum(`Installed Capacity (MWelec)`, status = `Development Status (short)`)) %>%
mutate(x=fct_reorder(`Development Status (short)`, capacity) %>% fct_rev()) %>%
ggplot(aes(x, y=capacity)) + geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle=45, hjust=1))
# Operational only
repd <- repd %>% filter(`Development Status (short)`=='Operational')
# I forgot to remove duplicates as those in REPD receiving FIT payments are also in the FIT database
repd_fit <- repd[!is.na(repd$`FiT Tariff (p/kWh)`),]
sum(repd_fit$`Installed Capacity (MWelec)`)
# 224.901 MW down as receiving FiT data
# let's drop those values from repd that should exist in the FIT
repd <- repd[is.na(repd$`FiT Tariff (p/kWh)`),]
# Drop most columns
repd <- repd %>% dplyr::select('Ref ID', 'Site Name', 'Installed Capacity (MWelec)', 'Mounting Type for Solar', 'Address', 'County', 'Region', 'Post Code', 'X-coordinate', 'Y-coordinate', 'Operational')
# Rename sensibly
repd <- rename(repd,
id = 'Ref ID',
name = 'Site Name',
cap = 'Installed Capacity (MWelec)',
mounting = 'Mounting Type for Solar',
address = 'Address',
county = 'County',
region = 'Region',
postcode = 'Post Code',
x = 'X-coordinate',
y = 'Y-coordinate',
start_date = 'Operational')
repd$mounting <- as_factor(repd$mounting)
repd$county <- as_factor(repd$county)
repd$region <- as_factor(repd$region)
# Capacity by region
repd %>% group_by(region) %>% summarise(cap = sum(cap)) %>% ggplot() + geom_col(aes(region, cap)) +
theme(axis.text.x = element_text(angle=45, hjust=1))
# EDA on FiT---------------------------------------------------------------------
# Initial exploration of FiT Database
# Bear in mind FIT database has technology mix, not just solar!
# fits %>% group_by(tech) %>% summarise(count=n())
# Take only solar
fits <- fits %>% filter(tech=='Photovoltaic')
# Drop the tech column
fits <- fits %>% select(-tech)
# Size range
range(fits$cap)
# [1] 0.02 4999.80
# Histogram of installations by size
b = c(0, 0.5, 1.5, 2, 2.5, 3.5, 4, 5, 10, 50, 500, 5000)
ranges = paste(head(b,-1), b[-1], sep=" - ")
freq = hist(fits$cap, breaks=b, include.lowest=TRUE, plot=FALSE)
df <- data.frame(range = ranges, frequency = freq$counts, breaks=freq$breaks[-1])
df
# Plot
df$range <- factor(df$range, levels = df$range[order(df$breaks)]) # Order by the breaks
df %>%
ggplot(aes(x=range, y=frequency)) + geom_bar(stat="identity") +
theme_minimal()
# Banding due to FiT thresholds
fits %>% filter(cap > 50 & cap <= 1000) %>%
ggplot(aes(cap)) + geom_histogram()
fits %>% filter(cap > 1000) %>%
ggplot(aes(cap)) + geom_histogram()
# Counts
fits %>%
# group_by(Country) %>%
summarise(capacity = sum(cap), count=n())
# capacity count
# <dbl> <int>
# 1 4965576. 839694
# This is a little low, certainly on capacity. BEIS reports 13GW of solar, this has 4.9GW
# Total count is 839,694 installations. BEIS has 960,000 GB (985,000 UK)
# From the BEIS deployment stats, FiT only covers 5GW. Rest is Renewables Obligation (RO) (6.3GW), CfDs (tiny), Unaccredited (0.85GW)
# Join FiT and REPD -------------------------------------------------------
# Cut out crust
fits <- fits %>% dplyr::select(c(postcode,cap,net_cap,comm_date,export,tariff,tariff_desc,type,country,la,govt_region,school,lsoa))
repd <- repd %>% dplyr::select(id, name, cap, `RO Banding (ROC/MWh)`, `FiT Tariff (p/kWh)`, mounting, `Development Status (short)`,
address, county, region, Country, postcode, x, y, `Planning Authority`, start_date)
fits$id <- NA # Create dummy ID column to match with REPD
fits$source <- "fitsDB"
repd$source <- "REPD"
# Correct the fact that REPD plants have capacity in MW, not kW
repd$cap <- repd$cap * 1000
# Join up dbs
pv <- full_join(fits, repd, by=c("source", "postcode", "cap", "id"))
# Check all rows have gone in separately
nrow(pv) == count(fits) + count(repd)
# I suspect there are duplicates in here. Challenge will be to identify them.
# Try and merge region information
pv$region <- as.character(pv$region)
pv %>% group_by(region) %>% summarise(count = n())
pv %>% group_by(govt_region) %>% summarise(count = n())
# Find intersection of regions
intersect(unique(pv$region), unique(pv$govt_region))
# Regions not matching
union(setdiff(unique(pv$region), unique(pv$govt_region)), setdiff(unique(pv$govt_region), unique(pv$region)))
# Set Region from REPD to govt_region from FITs
pv$region[pv$region == "Eastern"] <- "East of England"
pv$region[pv$region == "Yorkshire and Humber"] <- "Yorkshire and The Humber"
# Turn unknown to NA
pv$govt_region[pv$govt_region == "Unknown"] <- NA
# Merge region columns
pv$region[is.na(pv$region)] <- pv$govt_region[is.na(pv$region)]
# Drop govt_region col
pv <- pv %>% dplyr::select(-govt_region)
# Net cap vs cap?
grouped <- pv %>% group_by(region) %>%
summarise(count = n(), cap=sum(cap), net_cap=sum(cap))
grouped
# melt the data frame for plotting
pv.melt <- melt(grouped, id.vars=c('region'))
# Plot
pv.melt %>%
ggplot(aes(region, value)) + geom_bar(aes(fill = variable), position="dodge", stat="identity")
# Tariff types?
pv %>% group_by(tariff) %>%
summarise(count=n())
# Export tariff
pv %>% group_by(export) %>%
summarise(count=n())
# Export tariff description
pv %>% group_by(tariff_desc) %>%
summarise(count=n())
# Only a tiny amount of tariff information so perhaps discard?
# Vast majority on Deemed Export Tariff. Small % on No Export and Standard. Discard.
# Type
pv %>% group_by(type) %>%
summarise(count=n())
# This is useful
# Country
pv %>% group_by(country) %>%
summarise(count=n())
# Ok I guess
# School
pv %>% group_by(school) %>%
summarise(count=n())
# Ok I guess
# Mounting
pv %>% group_by(mounting) %>%
summarise(count=n())
# Barely any information here, Discard.
# Address
pv %>% group_by(address) %>%
summarise(count=n())
# Mostly not but useful where it is.
# County
pv %>% group_by(county) %>%
summarise(count=n())
# Too much missing info
# Slim down and export ----------------------------------------------------
# Slim down the file
pv <- pv %>% dplyr::select(-c('net_cap', 'mounting', 'county', 'tariff', 'tariff_desc'))
pv %>% group_by(pv$name) %>% summarise(count=n())
# Write merged file
write_csv(pv, 'data/fit_repd_pv.csv')
# TODO Fix: Columns (11,12,16) have mixed types.