-
Notifications
You must be signed in to change notification settings - Fork 1
/
03-Simulation.Rmd
401 lines (325 loc) · 17 KB
/
03-Simulation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
```{r truncated class simulation, include=FALSE}
## Returns a function that returns a list with each of its elements defined as functions or vectors of one
## element that describe aspects of a truncated triangular distribution
## (pdf, cdf, inverse cdf, mean, median, mode, upper bound, lower bound, variance).
generate.truncated.triangular <- function(a, b, orig.tri.dist) {
## Original parameters of the triangular distribution
M <- orig.tri.dist$tri.mode
L <- orig.tri.dist$tri.lower
U <- orig.tri.dist$tri.upper
## Throw an error if the new bounds do not fall witin the old bounds of the triangular pdf.
if (a < L | b > U) {
stop("The new bounds of the pdf do not fall within the original range")
}
## Create a function that returns the pdf of the truncated triangular, given some x.
pdf <- function(x) {
if (a < x & x <= b) {
pdf.g <- orig.tri.dist$pdf
cumul.F <- orig.tri.dist$cdf
result <- pdf.g(x) / (cumul.F(b) - cumul.F(a))
return(result)
} else {
result <- 0
return(result)
}
}
## Create a function that returns the cdf of the truncated triangular, given some x.
cdf <- function(x) {
cumul.F <- orig.tri.dist$cdf
if (x <= a) {
result <- 0
} else if (a < x & x <= b) {
result <- (cumul.F(x) - cumul.F(a)) / (cumul.F(b) - cumul.F(a))
} else if (b < x) {
result <- 1
}
return(result)
}
## Create a function that returns the inverse cdf of the truncated triangular, given some probability p.
inverse.cdf <- function(p) {
result <-
orig.tri.dist$inverse.cdf(p * (orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)) + orig.tri.dist$cdf(a))
return(result)
}
## Create a vector of length 1 that describes the mean of the distribution
trun.tri.mean <- if (a <= b & b < M) {
result.numerator <-
(2 * (-3 * L * ((b ^ 2) / 2 - (a ^ 2) / 2) + b ^ 3 - a ^ 3)) / (3 * (U - L) * (M - L))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
result.numerator / result.denominator
} else if (a < M & b >= M) {
result.numerator <-
(
-M ^ 3 * U - 2 * a ^ 3 * U + 3 * L * a ^ 2 * U + 3 * M * b ^ 2 * U - 3 * L * b ^ 2 * U + L * M ^ 3 + 2 * M * a ^ 3 - 3 * L * M * a ^ 2 - 2 * M * b ^ 3 + 2 * L * b ^ 3
) / (3 * (U - L) * (M - L) * (U - M))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
result.numerator / result.denominator
} else if (M <= a & a <= b) {
result.numerator <-
(3 * b ^ 2 * U - 3 * a ^ 2 * U - 2 * b ^ 3 + 2 * a ^ 3) / (3 * (U - L) * (U - M))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
result.numerator / result.denominator
}
## Create a vector of length 1 that describes the median of the distribution
trun.tri.median <- inverse.cdf(0.5)
## Create a vector of length 1 that describes the mode of the distribution
trun.tri.mode <- if (a <= b & b <= M) {
b
} else if (a <= b & b >= M) {
M
} else if (a >= M & b >= a) {
a
}
## Create a vector of length 1 that describes the upper bound of the distribution's domain
trun.tri.upper <- b
## Create a vector of length 1 that describes the lower bound of the distribution's domain
trun.tri.lower <- a
## Create a vector of length 1 that describes the variance of the distribution
trun.tri.var <- if (a <= b & b < M) {
result.numerator <-
(-4 * L * ((b ^ 3) / 3 - (a ^ 3) / 3) + b ^ 4 - a ^ 4) / (2 * (U - L) * (M - L))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
(result.numerator / result.denominator) - trun.tri.mean ^ 2
} else if (a < M & b >= M) {
result.numerator <-
(
-M ^ 4 * U - 3 * a ^ 4 * U + 4 * L * a ^ 3 * U + 4 * M * b ^ 3 * U - 4 * L * b ^ 3 * U + L * M ^ 4 + 3 * M * a ^ 4 - 4 * L * M * a ^ 3 - 3 * M * b ^ 4 + 3 * L * b ^ 4
) / (6 * (U - L) * (M - L) * (U - M))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
(result.numerator / result.denominator) - trun.tri.mean ^ 2
} else if (M <= a & a <= b) {
result.numerator <-
(4 * (b ^ 3) * U - 4 * (a ^ 3) * U - 2 * b ^ 4 + 3 * a ^ 4) / (6 * (U - L) * (U - M))
result.denominator <-
orig.tri.dist$cdf(b) - orig.tri.dist$cdf(a)
(result.numerator / result.denominator) - trun.tri.mean ^ 2
}
## Build the list and return it. This list contains all major properties of the truncated triangular distribution
return(
list(
pdf = pdf,
cdf = cdf,
inverse.cdf = inverse.cdf,
trun.tri.mean = trun.tri.mean,
trun.tri.median = trun.tri.median,
trun.tri.mode = trun.tri.mode,
trun.tri.upper = trun.tri.upper,
trun.tri.lower = trun.tri.lower,
trun.tri.var = trun.tri.var
)
)
}
```
```{r triangular class simulation, include=FALSE}
## Returns a function that returns a list with each of its elements defined as functions or vectors of one
## element that describe aspects of a triangular distribution
## (pdf, cdf, inverse cdf, mean, median, mode, upper bound, lower bound, variance)
generate.triangular <- function(L, U, M) {
## Create a function that returns the pdf of the triangular, given some x.
pdf <- function(x) {
if (x < L) {
result <- 0
} else if (x < M) {
result <- 2 * (x - L) / ((U - L) * (M - L))
} else if (x == M) {
result <- 2 / (U - L)
} else if (x <= U) {
result <- 2 * (U - x) / ((U - L) * (U - M))
} else if (U < x) {
result <- 0
}
return(result)
}
## Create a function that returns the cdf of the triangular, given some x.
cdf <- function(x) {
if (x <= L) {
result <- 0
} else if (x <= M) {
result <- ((x - L) ^ 2) / ((U - L) * (M - L))
} else if (x < U) {
result <- 1 - ((U - x) ^ 2) / ((U - L) * (U - M))
} else if (U <= x) {
result <- 1
}
return(result)
}
## Create a function that returns the inverse cdf of the triangular, given some probability p.
inverse.cdf <- function(p) {
if (p < (M - L) / (U - L)) {
result <- L + sqrt(max(0, (M - L) * (U - L) * p))
} else if (p >= (M - L) / (U - L)) {
result <- U - sqrt(max(0, (U - L) * (U - M) * (1 - p)))
}
return(result)
}
## Create a vector of length 1 that describes the mean of the distribution
tri.mean <- (L + U + M) / 3
## Create a vector of length 1 that describes the median of the distribution
tri.median <- inverse.cdf(0.5)
## Create a vector of length 1 that describes the mode of the distribution
tri.mode <- M
## Create a vector of length 1 that describes the upper bound of the distribution's domain
tri.upper <- U
## Create a vector of length 1 that describes the lower bound of the distribution's domain
tri.lower <- L
## Create a vector of length 1 that describes the variance of the distribution
tri.var <-
((L ^ 2) + (U ^ 2) + (M ^ 2) - L * U - L * M - U * M) / 18
## Build the list and return it. This list contains all major properties of the triangular distribution
return(
list(
pdf = pdf,
cdf = cdf,
inverse.cdf = inverse.cdf,
tri.mean = tri.mean,
tri.median = tri.median,
tri.mode = tri.mode,
tri.upper = tri.upper,
tri.lower = tri.lower,
tri.var = tri.var
)
)
}
```
# Simulation
So far we have designed two constructor functions, `generate.triangular()` and `generate.truncated.triangular()`, with which we may create triangular distributions and truncated triangular distributions respectively. Let's move one to some code the actually does something. Say, for example, that the truncated triangular distribution is how you chose for a random variable to behave, how long it takes for a crop to grow or the amount of rain water of a particular time period. Being able to randomly sample instances of this variable without having to wait for rainfall or for crops to grow and study how a larger system behaves as a function of this random variable could be highly beneficial and is, in fact, the premise of Monte Carlo simulation.
Some definitions for this section:
> Let $Z \sim f(x) = Triangular(2, 9, 7)$ and $Z' \sim f(x|3 < X \leq 8)=h(x)$
```{r Distributions, echo=TRUE}
## Set a seed so results are replicable
set.seed(10)
## Create a list that contains the major properties of a triangular
## distribution with L = 2, U = 9, and M = 7
my.tri.dist <- generate.triangular(L = 2, U = 9, M = 7)
## Create a list that contains the major properties of a truncated
## triangular distribution between a = 3 and b = 8, based on the
## previous triangular
my.trun.tri.dist <-
generate.truncated.triangular(a = 3,
b = 8,
orig.tri.dist = my.tri.dist)
```
## Uniform Distribution
This may well be the most simple continuous probability density function. The uniform distribution, describes a random variable that is equiprobable at all values within an interval. This means that given an interval $[a,b]$, a random variable $X$ can be any value greater than or equal to $a$ and smaller than or equal to $b$ with the same likelihood. Let's express this mathematically!
> Let $U \sim g(x)=Uniform(a,b)$, where
$$
g(x) =
\begin{cases}
\frac{1}{b-a} \quad& (a\leq x\leq b)\\
0 \quad& (x\notin [a,b])
\end{cases}
$$
No need to implement this programatically, it is a very simple distribution that we use inderectly when generating random numbers.
## Pseudo-random Uniform Numbers
We'll concern ourselves with a method to simulate a uniform random variable, the _linear congruence method_. This is useful to generate numbers that follow a uniform distribution which we can then transform into any distribution. Know that it is not the most widespread method, but the most simple which will result in a reliable random rumber generator. Most software will use some implementation of the _Mersenne Twister_ if you're looking for further reading.
It is not possible to produce a sequence of numbers that is truly random. If you knew the initial conditions of a system, with enough information you could perfectly predict it. Rather, pseudo-random number generators (RNGs) take some _seed_ ($X_{0}$) as its initial state and produces numbers from it following a deterministic procedure that is too difficult to reverse without knowing the seed. The seed is properly hidden from the target of the random numbers and so the system seems random, since they have no way to predict what the next condition will be. All following numbers are indexed as $X_{1},X_{2},\cdots,X_{n}$. The general expression for the sequence is as follows:
$$
X_{n+1} = (aX_{n}+c) \mod m.
$$
Where $m$ is usually a prime number, $a$ is some positive integer smaller than $m$, and c is some integer in the interval $[0,m[$.
The numbers in this sequence are not samples of a uniform random variable. The actual sequence of uniformly distributed random numbers, $U_{n}$, is the following transform of the $X_{n}$ sequence.
$$
U_{n}=\frac{X_{n}}{m}
$$
$U_{n}$ is a sequence of uniformly distributed numbers on the interval $[0,1[$. If we wanted another sequence of random numbers, $U'_{n}$ on some other interval, $[a,b[$, we would do the following transform.
$$
U'_{n}=U_{n} (b - a) + a
$$
Notice how when $U_{n}$ is very close to $1$, $U'_{n}$ comes very close to being $b$, whereas when $U_{n}$ is $0$, $U'_{n}$ is just $a$, and so we now have a random sample of numbers uniformly distributed on the interval $[a,b[$.
We also won't implement this procedure since most programming languages already have a function to produce uniformly distributed numbers on the interval $[0,1[$, though knowing how one may do so will come in handy. In R, this function is `runif()`.
One last thing to note is that one should never return $U_{0}$ or $U'_{0}$ as a random number, since a user with malicious intent could easily derive the seed from it.
## Pseudo-Random Variables
Let us now consider two methods for generating samples of variables following arbitrary distributions off of a sequence of uniformly distributed random numbers, $U_{n}$.
### Inverse CDF Method
You may have noticed already when studying the triangular CDF, but given a random variable $Y$ with PDF $f(y)$ and CDF $F(y)$, the CDF produces a value on the interval $[0,1]$. The inverse CDF will also take a value on the interval $[0,1]$ and assign it a $y$.
Mathematicaly,
$$
F(y) = p \Rightarrow F^{-1}\left(F\left(y\right)\right)=F^{-1}\left(p\right) \Rightarrow y = F^{-1}\left(p\right).
$$
This means that given some value between 0 and 1, we can apply the inverse CDF to it and get a matching value, $y$. If we do this to all elements in a random sample of values between 0 and 1 we can get a random sample of $y$s that follow the PDF $f(y)$. Let's see this implemented in R
```{r Inverse CDF, echo=TRUE}
## 1. Generate a sequence of uniform random numbers on the interval [0, 1].
## 10000 should be enough.
U <- runif(n = 10000)
## 2. Transform all elements of the U sequence with the inverse CDF
## of the truncated distribution
simulated.Z.prime.Inv.CDF <- sapply(U, my.trun.tri.dist$inverse.cdf)
```
The variable `simulated.Z.prime` holds a sequence of numbers that follow the truncated triangular distribution defined at the start of this section. Let's plot a histogram of this sequence along with the curve of the pdf it should ressemble.
```{r Plot Simulated Inverse CDF, echo=TRUE}
## Plot and describe the first simulation with many bins
hist(simulated.Z.prime.Inv.CDF,
xlab = "z",
breaks = (my.trun.tri.dist$trun.tri.upper - my.trun.tri.dist$trun.tri.lower) * 10,
freq = FALSE)
plot.function(
function(z) {sapply(z, my.trun.tri.dist$pdf)},
from = 2.99999,
to = 8.0001,
n = 100000,
col = "red",
add = TRUE
)
```
The truncated triangular PDF seems to closely fit the random sample. This is a good sign that our procedure is correct but is not enough to say for sure.
### Trial and Error
An expression for the inverse CDF of a distribution is not always easy or possible to derive. If such is the case, we execute the following procedure. One weakness of this approach is it only works for random variables on a closed, finite interval, meaning that the likes of the normal and exponential distributions can't be simulated reliably. Also, this is a slower approach because we can't know how long it will take until we've sampled enough numbers from the distribution. Luckily, the truncated distribution requires that we simulate within an interval and the expected value of iterations until we reach the desired sample size is its minimum the way it is implemented in our code.
1. Find a constant function that is always greater than the PDF you are trying to simulate. The closer this constant is to being the max of the PDF the better.
2. Take a uniform random number and transform it on the interval of the domain of your PDF.
3. Take the next random uniform number, and transform it so it falls between 0 and the constant you found on step one.
4. Evaluate if the transformed value in step three is less than or equal to the PDF evaluated on the transformed value from step two.
5. If the condition in step four is met, add the transformed value from step 2 to your sample of the PDF.
6. Repeat steps two through five until you have gathered your goal sample size.
Let's see this procedure implemented programatically.
```{r Simulated Trial and Error, echo=TRUE}
## Simulate 10000 instances of a random variable with f(x) = truncated triangular by trial and error.
simulated.Z.prime.t.and.e <- rep(0, 10000)
success.count <- 0
## Stop once we've achieved 10000 succesful numbers
while (success.count < 10000) {
## Step 2
u.1 <-
runif(
n = 1,
min = my.trun.tri.dist$trun.tri.lower,
max = my.trun.tri.dist$trun.tri.upper
)
## Step 3
u.2 <-
runif(
n = 1,
min = 0,
max = my.trun.tri.dist$pdf(my.trun.tri.dist$trun.tri.mode)
)
## Step 4
if (u.2 <= my.trun.tri.dist$pdf(u.1)) {
# Step 5
success.count <- success.count + 1
simulated.Z.prime.t.and.e[success.count] <- u.1
}
}
```
In the case of the truncated triangular PDF we know it reaches its maximum when evaluated on its mode, meaning we can take the value $h(mode)$ as the constant of step 1.
Again, let's plot these results to see how we did.
```{r Plot Simulated Trial and Error, echo=TRUE}
## Plot and describe the first simulation with many bins
hist(simulated.Z.prime.t.and.e,
xlab = "z",
breaks = (my.trun.tri.dist$trun.tri.upper - my.trun.tri.dist$trun.tri.lower) * 10,
freq = FALSE)
plot.function(
function(z) {sapply(z, my.trun.tri.dist$pdf)},
from = 2.99999,
to = 8.0001,
n = 100000,
col = "red",
add = TRUE
)
```
Once again, the distribution closely ressembles the sampled data. Next, we will look into testing the hypothesis that the sample matches the distribution.
Whichever method is fine for simulating or sampling PDFs, but know that each has its benefits. You would use the inverse CDF ideally on all scenarios as it is easy to implement, given an inverse CDF, but if such is not available to you or there is no interest in deriving one, the slower simulation by trial and error would still prove reliable.