# Set plot options in Jupyter Lab
options(repr.plot.width = 20)
options(repr.plot.height = 6)
We will revisit some of the examples for Time Series Decomposition and Seasonal ARMA model estimation
?fma::airpass
airpass <- fma::airpass
We can examine the data plot:
plot.ts(airpass)
Note the totak number of years in the series:
total_years <- length(unique(floor(time(airpass))))
print(total_years)
We can also plot the seasonal plot:
forecast::seasonplot(airpass, year.labels = TRUE, year.labels.left = TRUE, col = 1:total_years)
Looking at the $\rm ACF$ and $\rm PACF$ plots:
forecast::tsdisplay(airpass)
and note the ACF slowly declining and the decline is in waves, which appear to repeat every 12
lags - this indicates seasonality. The time series chart indicates that there is an increasing trend.
The variation appears to be increasing, which also indicates that the series may be multiplicative: $Y_t = T_t \cdot S_t \cdot E_t$.
We want to take logarithms of the data, to get an additive form: $\log(Y_t) = \widetilde{Y}_t$ so that $\widetilde{Y}_t = \widetilde{T}_t + \widetilde{S}_t + \widetilde{E}_t$
log_passengers = log(airpass)
forecast::tsdisplay(log_passengers)
Now the series appears to be additive. We can move on to decomposition of the series.
one_sided_trd <- filter(log_passengers, filter = c(1/24, rep(1/12, total_years - 1), 1/24), sides = 1)
two_sided_trd <- filter(log_passengers, filter = c(1/24, rep(1/12, total_years - 1), 1/24), sides = 2)
plot.ts(log_passengers, main = "Log of Airpassangers")
lines(one_sided_trd, col = "red")
lines(two_sided_trd, col = "blue")
legend(x = "topleft", legend = c("Actual", "One-sided MA Trend", "Two-sided MA Trend"), lty = 1, col = c("black", "red", "blue"))
Next, we move to decompose the seasonality
#seasonal component of one-sided averaging
one_sided_season <- log_passengers - one_sided_trd
one_sided_season <- matrix(one_sided_season, ncol = 12, byrow = TRUE)
one_sided_season <- apply(one_sided_season, 2, mean, na.rm = TRUE)
#seasonal component of two-sided averaging
two_sided_season <- log_passengers - two_sided_trd
two_sided_season <- matrix(two_sided_season, ncol = 12, byrow = TRUE)
two_sided_season <- apply(two_sided_season, 2, mean, na.rm = TRUE)
We then scale the seasons to sum up to 0:
one_sided_season <- one_sided_season - sum(one_sided_season) / length(one_sided_season)
two_sided_season <- two_sided_season - sum(two_sided_season) / length(two_sided_season)
par(mfrow = c(1, 2))
plot.ts(one_sided_season, main = "Seasonality from One-sided trend decomposition")
plot.ts(two_sided_season, main = "Seasonality from Two-sided trend decomposition")
Using $S_t = S_{t+d}$, create the series for the full years:
one_sided_season <- rep(one_sided_season, times = total_years)
one_sided_season <- ts(one_sided_season, start = start(log_passengers), freq = frequency(log_passengers))
#
two_sided_season <- rep(two_sided_season, times = total_years)
two_sided_season <- ts(two_sided_season, start = start(log_passengers), freq = frequency(log_passengers))
Next, we can examine the decomposition visually
par(mfrow = c(1, 2))
#
plot(log_passengers, lwd = 2, main = "One-sided averaging decomposition")
lines(one_sided_trd, col = "blue", lwd = 2)
lines(one_sided_trd + one_sided_season, col = "red", lwd = 2)
#
plot(log_passengers, lwd = 2, main = "Two-sided averaging decomposition")
lines(two_sided_trd, col = "blue", lwd = 2)
lines(two_sided_trd + two_sided_season, col = "red", lwd = 2)
We can also use the decompose(...)
function to do all of this:
two_sided_decomp <- decompose(log_passengers, type = "additive")
Compared with the filter()
function:
two_sided_trd - two_sided_decomp$trend
The results are the same. The upside is we automatically get the seasonal decomposition as well, which are the same as our manually calculated ones:
two_sided_season - two_sided_decomp$seasonal
We can also plot the series:
plot.ts(log_passengers, main = "Log of Airpassangers")
lines(two_sided_decomp$trend, col = "blue", lwd = 2)
lines(two_sided_decomp$trend + two_sided_decomp$seasonal, col = "red", lwd = 2)
and examine the residuals:
forecast::tsdisplay(two_sided_decomp$random)
Looking at the $\rm ACF$, the residuals still appear to have some searonality.
s_es <- forecast::ses(log_passengers)
par(mfrow = c(1, 2))
#
plot.ts(log_passengers)#, main = "Simple Exponential smoothing")
lines(fitted.values(s_es), col = "blue", lty = 2, lwd = 2)
lines(fitted.values(forecast::ses(log_passengers, alpha = 0.1)), col = "darkorange", lty = 2, lwd = 2)
#
plot(forecast::forecast(s_es, h = 10))
d_es <- forecast::holt(log_passengers)
par(mfrow = c(1, 2))
#
plot.ts(log_passengers)#, main = "Double Exponential smoothing")
lines(fitted.values(d_es), col = "blue", lty = 2, lwd = 2)
lines(fitted.values(forecast::holt(log_passengers, alpha = 0.1, beta = 0.09)), col = "darkorange", lty = 2, lwd = 2)
#
plot(forecast::forecast(d_es, h = 10))
t_es <- forecast::hw(log_passengers)
par(mfrow = c(1, 2))
#
plot.ts(log_passengers)#, main = "Triple Exponential smoothing")
lines(fitted.values(t_es), col = "blue", lty = 2, lwd = 2)
#
plot(forecast::forecast(t_es, h = 10))
R
allows to use an automated procedure to choose the smoothing parameters - the function ets
selects the best variant automatically (by the minimum of AIC) and estimates its parameters:
ets_es <- forecast::ets(log_passengers)
par(mfrow = c(1, 2))
#
plot.ts(log_passengers)
lines(fitted.values(ets_es), col = "blue", lty = 2, lwd = 2)
#
plot(forecast::forecast(ets_es, h = 10))
Note the help file:
In all cases, "N"=none, "A"=additive, "M"=multiplicative and "Z"=automatically selected. So, for example, "ANN" is simple exponential smoothing with additive errors, "MAM" is multiplicative Holt-Winters' method with multiplicative errors, and so on.
forecast::tsdisplay(ets_es$residuals)
The residuals ppear to have a slight autocorrelation
tt_dt <- data.frame(tt = 1:length(log_passengers))
tt_dt$tt_sq <- tt_dt$tt^2
#
mdl_auto = forecast::auto.arima(airpass, max.d = 0, max.D = 0, xreg = tt_dt)
mdl_auto
plot(airpass, main = "SARMA with trend")
lines(mdl_auto$fitted, col = "red")
We can examine the residuals
forecast::tsdisplay(mdl_auto$residuals)
Note that there appears to be a large autocorrelation for every 12
-th lag. So the restriction of no differencing causes auto.arima
to fail to find a suitible model.
For somparison, if we were to specify a $SARIMA(1, 0, 1)\times(1, 0, 1)_{12}$. If we use the default specification, we may get a warning that the series is not stationary:
tryCatch({
forecast::Arima(airpass, order = c(1, 0, 1), seasonal = c(1, 0, 1), xreg = tt_dt)
}, warning = function(w) {
# warning-handler-code
}, error = function(e) {
print(e)
}, finally = {
# cleanup-code
})
If we are sure that the process is stationary, we can specify a different parameter estimation method:
mdl_manual = forecast::Arima(airpass, order = c(1, 0, 1), seasonal = c(1, 0, 1), xreg = tt_dt, method = "ML")
mdl_manual
plot(airpass, main = "SARMA with trend")
lines(mdl_manual$fitted, col = "red")
forecast::tsdisplay(mdl_manual$residuals)
The residuals look much better compared to the auto.arima
results with restriction that the data is not to be differenced.
Finally, we can forecast the process. we do need to have the soem future values of the trend. Thankfully, we have opted to use the standard indexing of $t = 1,2,...$ for the trend.
tt_forc <- data.frame(tt = tail(tt_dt$tt, 1) + 1:20)
tt_forc$tt_sq <- tt_forc$tt^2
plot(forecast::forecast(mdl_manual, xreg = tt_forc))
lines(mdl_manual$fitted, col = "red")
# Set plot options in Jupyter Lab
options(repr.plot.width = 15)
options(repr.plot.height = 4)
suppressPackageStartupMessages({library(forecast)})
nyse <- read.table(url("http://uosis.mif.vu.lt/~rlapinskas/(data%20R&GRETL/nyse.txt"), header = TRUE)
nyse <- ts(nyse, start = 1952, frequency = 12)
head(nyse, 36)
par(mfrow = c(1, 3))
plot(nyse, main = bquote(Y[t]))
plot(log(nyse), main = bquote(log(Y[t])))
plot(diff(log(nyse)), main = bquote(Delta~log(Y[t])))
Note that the growth appears to be exponential - it is more convenient to take logarithms to linearize the series.
lnyse <- log(nyse)
par(mfrow = c(1, 2))
forecast::Acf(lnyse, )
forecast::Pacf(lnyse)
Slow ACF decay - indication of non-stationarity, or a trend, or a nonstationarity and/or drift and/or trend. We need to test whether the data is TS or DS. To do this - we need to carry out unit root testing.
We can always load the relevant libraries to avoid the use of ::
notation.
library(forecast)
library(dynlm)
library(urca)
Unit root tests consist of:
We will examine three cases:
mod.1 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1:4) + time(lnyse))
summary(mod.1)
The last lag coefficient, L(d(lnyse), 1:4)4
is insignificant since its $p-value = 0.4070 > 0.05`. We will reduce the maximum lag and continue the procedure.
mod.1 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1:3) + time(lnyse))
print(round(coef(summary(mod.1)), 4))
p-value
of L(d(lnyse), 1:3)3
is > 0.05
.
mod.1 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1:2) + time(lnyse))
print(round(coef(summary(mod.1)), 4))
p-value
of L(d(lnyse), 1:2)2
is > 0.05
.
mod.1 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1) + time(lnyse))
print(round(coef(summary(mod.1)), 4))
p-value
of L(d(lnyse), 1)
is > 0.05
.
mod.1 <- dynlm(d(lnyse) ~ L(lnyse) + time(lnyse))
print(round(coef(summary(mod.1)), 4))
The final model contains no lags of $\Delta Y_{t-j}$ - so there is no autoregressive coefficient for the difference model.
mod.2 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1:5) + time(lnyse))
print(round(coef(summary(mod.2)), 4))
L(d(lnyse), 1:5)5
has p-value < 0.05
, so the final lag order is 5
.
The maximum lag via the rule-of-thumb is:
floor(12 * (length(lnyse) / 100)^(1/4))
mod.3 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1:18) + time(lnyse))
print(round(coef(summary(mod.3)), 4))
L(d(lnyse), 1:18)18
is insignificant. We will continue reducing the lag order untill the last lag is significant. We omit the intermediate model output with insignificant $p_\max$ coefficients. The final model
mod.3 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1:14) + time(lnyse))
print(round(coef(summary(mod.3)), 4))
has a $p_\max = 14$ lag order.
If we compare the $\rm AIC$ and $\rm BIC$ values of the three models:
a <- data.frame('mdl_1 (p = 0)' = c(AIC(mod.1), BIC(mod.1)),
'mdl_2 (p = 5)' = c(AIC(mod.2), BIC(mod.2)),
'mdl_3 (p = 14)' = c(AIC(mod.3), BIC(mod.3)),
check.names = FALSE)
rownames(a) <- c("AIC", "BIC")
a
The $\rm AIC$ and $\rm BIC$ values are smaller for the model, where we started with an arbitraty value of $p = 4$. Having determined the lag order, we asses the trend significance.
Note: we will examine the final models for each case. In reality we would need to select an arbitrary lag outselves, if we opt to manually test for a unit root.
For the 1st model:
print(round(coef(summary(mod.1)), 4))
The trend is insignificant, so we remove it.
mod.1 <- dynlm(d(lnyse) ~ L(lnyse))
print(round(coef(summary(mod.1)), 4))
For the 2nd model:
mod.2 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1:5) + time(lnyse))
print(round(coef(summary(mod.2)), 4))
The trend is significant, so we leave it.
For the 3rd model:
print(round(coef(summary(mod.3)), 4))
The trend is insignificant, so we remove it
mod.3 <- dynlm(d(lnyse) ~ L(lnyse) + L(d(lnyse), 1:14))
print(round(coef(summary(mod.3)), 4))
So, we have that:
The $\rm ADF$ test tests the null hypothesis: $$ H_0: \rho = 0 \text{ (i.e. the series contains a unit root)} $$
Depending on the trend, the critical values are:
Looking at the 1st model:
print(round(coef(summary(mod.1)), 4))
The coefficient $\widehat{\rho}$ t-value is L(lnyse) = -0.0720 > -2.89
. So we have no grounds to reject the null hypothesis and conclude that the series contains a unit root.
Since we have determined that L(lnyse)
is insignificant, we could remove it from the model, if we wanted to consider this model as a contender for the best model for this series:
mod.1 <- dynlm(d(lnyse) ~ 1)
print(round(coef(summary(mod.1)), 4))
Then the estimated for $\Delta Y_t$ is: $\Delta Y_t = 0.0069 + \epsilon_t$, which we can write it as a model for $Y_t$: $Y_t = 0.0069 + Y_{t-1} + \epsilon_t$, which is an $\rm AR(1)$ model for $Y_t$.
Looking at the 2nd model:
print(round(coef(summary(mod.2)), 4))
The coefficient $\widehat{\rho}$ t-value is L(lnyse) = -2.2920 > -3.45
. So we have no grounds to reject the null hypothesis and conclude that the series contains a unit root.
Since we have determined that L(lnyse)
is insignificant, we could remove it from the model, if we wanted to consider this model as a contender for the best model for this series:
mod.2 <- dynlm(d(lnyse) ~ L(d(lnyse), 1:5) + time(lnyse))
print(round(coef(summary(mod.2)), 5))
Since we needed the trend for unit root testing - having removed the insignificant $\rho$ coefficient, we could try to remove the insignificant trend and improve our model. But we will assume that this model is acceptible as is.
Then the estimated model for $\Delta Y_t$ is:
$\Delta Y_t = -0.07357 + 0.00004 \cdot t + 0.04398 \Delta Y_{t-1} -0.03932 \Delta Y_{t-2} + 0.01035 \Delta Y_{t-3} + 0.02288 \Delta Y_{t-4} + 0.09225 \Delta Y_{t-5} + \epsilon_t$,
which we can re-write as a model for $Y_t$.
We use the property $\gamma_j = -[\phi_{j+1} + ... + \phi_p]$, $j = 1, ..., p - 1$ and the expression of $\rho$ to get the coefficients:
$ \begin{aligned} \gamma_5 &= - \phi_6 &&\Longrightarrow \phi_6 = - \gamma_5 = -0.09225\\ \gamma_4 &= - [\phi_5 + \phi_6] &&\Longrightarrow \phi_5 = - \gamma_4 - \phi_6 = -0.02288+0.09225 = 0.06937\\ \gamma_3 &= - [\phi_4 + \phi_5 + \phi_6] &&\Longrightarrow \phi_4 = - \gamma_3 - [\phi_5 + \phi_6] = - \gamma_3 + \gamma_4 = -0.01035+0.02288 = 0.01253\\ \gamma_2 &= - [\phi_3 + \phi_4 + \phi_5 + \phi_6] &&\Longrightarrow \phi_3 = - \gamma_2 + \gamma_3 = 0.03932 + 0.01035 = 0.04967\\ \gamma_1 &= - [\phi_2 + \phi_3 + \phi_4 + \phi_5 + \phi_6] &&\Longrightarrow \phi_2 = - \gamma_1 + \gamma_2 = -0.04398 -0.03932 = -0.0833\\ \rho &= \phi_1 + \phi_2 + \phi_3 + \phi_4 + \phi_5 + \phi_6 - 1,\text{ since } \rho = 0 &&\Longrightarrow \phi_1 = 1 - (\phi_2 + \phi_3 + \phi_4 + \phi_5 + \phi_6) = 1 + \gamma_1= 1 + 0.04398 = 1.04398 \end{aligned} $
So, the equivalent model for $Y_t$ is:
$Y_t = -0.07357 + 0.00004 \cdot t + 1.04398 Y_{t-1} -0.0833 Y_{t-2} + 0.04967 Y_{t-3} + 0.01253 Y_{t-4} + 0.06937 Y_{t-5} -0.09225 Y_{t-6} + \epsilon_t$
To verify that this is indeed the correct transformation, we can try to simulate a few observations with the same residuals:
set.seed(123)
N <- 200
eps <- rnorm(mean = 0, sd = 1, n = N)
tt <- 1:length(eps)
First, we will simulate $Y_t$:
Y <- NULL
Y[1] <- -0.07357 + 0.00004 * tt[1] + eps[1]
Y[2] <- -0.07357 + 0.00004 * tt[2] + 1.04398 * Y[1] + eps[2]
Y[3] <- -0.07357 + 0.00004 * tt[3] + 1.04398 * Y[2] −0.0833 * Y[1] + eps[3]
Y[4] <- -0.07357 + 0.00004 * tt[4] + 1.04398 * Y[3] −0.0833 * Y[2] + 0.04967 * Y[1] + eps[4]
Y[5] <- -0.07357 + 0.00004 * tt[5] + 1.04398 * Y[4] −0.0833 * Y[3] + 0.04967 * Y[2] + 0.01253 * Y[1] + eps[5]
Y[6] <- -0.07357 + 0.00004 * tt[6] + 1.04398 * Y[5] −0.0833 * Y[4] + 0.04967 * Y[3] + 0.01253 * Y[2] + 0.06937 * Y[1] + eps[5]
for(j in 7:N){
Y[j] <- -0.07357 + 0.00004 * tt[j] + 1.04398 * Y[j-1] −0.0833 * Y[j-2] + 0.04967 * Y[j - 3] + 0.01253 * Y[j - 4] + 0.06937 * Y[j - 5] − 0.09225 * Y[j - 6] + eps[j]
}
Secondly, we will simulate $\Delta Y_t$:
dY <- NULL
dY[1] <- -0.0735 + 0.00004 * tt[1] + eps[1]
dY[2] <- -0.07357 + 0.00004 * tt[2] + 0.04398 * dY[1] + eps[2]
dY[3] <- -0.07357 + 0.00004 * tt[3] + 0.04398 * dY[2] −0.03932 * dY[1] + eps[3]
dY[4] <- -0.07357 + 0.00004 * tt[4] + 0.04398 * dY[3] −0.03932 * dY[2] + 0.01035 * dY[1] + eps[4]
dY[5] <- -0.07357 + 0.00004 * tt[5] + 0.04398 * dY[4] −0.03932 * dY[3] + 0.01035 * dY[2] + 0.02288 * dY[1] + eps[5]
for(j in 6:N){
dY[j] <- -0.07357 + 0.00004 * tt[j] + 0.04398 * dY[j-1] −0.03932 * dY[j-2] + 0.01035 * dY[j - 3] + 0.02288 * dY[j - 4] + 0.09225 * dY[j - 5] + eps[j]
}
Finally, we will assume that the first 100
, or so, observations of $Y_t$ are burn-in values (since the first couple of values assume that $Y_j = \epsilon_j = 0$, for $j \leq 0$). This means that we treat treat the first 101
$\Delta Y_t$ values as burn-in values as well and drop them:
Y <- Y[-c(1:100)]
dY <- dY[-c(1:101)]
We can compare $\Delta Y_t$ with $Y_t - Y_{t-1}$:
dt_compare <- data.frame(diff(Y), dY)
dt_compare$DIFFERENCE <- round(diff(Y) - dY, 10)
head(dt_compare)
We see that there is no difference between the simulate $Y_t - Y_{t-1}$ (diff.Y.
), which was calculated from the simulated $Y_t$ series, and $\Delta Y_t$ (dY
).
We can also visually compare the charts:
plot.ts(ts(diff(Y)), col = "red", lwd = 2)
lines(ts(dY), col = "blue", lty = 2, lwd = 2)
So, the two models are equivalent. For simulation and forecasting we may be more interested in $Y_t$ model, as it is the original series, which we are interested in.
The final, 3rd model
print(round(coef(summary(mod.3)), 4))
The coefficient $\widehat{\rho}$ t-value is L(lnyse) = -0.0471 > -2.89
. So we have no grounds to reject the null hypothesis and conclude that the series contains a unit root.
We will not re-write this models equations as they can be determined by applying the same methods as before.
Overall, regardless of the lag order we see that the series has a unit root.
It is important to note, that removing the trend increased the t-statistic of $\widehat{\rho}$, so it is very possible that, if we incorrectly remove the trend - we may make an incorrect test conclusion.
We will carry out different tests from different packages. For completeness sake, we will provide the same test results from more than one package.
The $\rm ADF$ test tests the null hypothesis: $H_0: \rho = 0 \text{ (i.e. the series contains a unit root)}$
We can set the maximum lag order, and choose the criterion for the order selection:
tst_adf_1 <- urca::ur.df(lnyse, type = "trend", lags = 18, selectlags = "BIC")
print(tst_adf_1)
We can examine the model
tst_adf_1@testreg
The critical values can be extracted via @cval
:
print(tst_adf_1@cval)
To carry out the unit root test, we examine the t-statistic of z.lag.1
- it is -2.096
(Note that this is the same as the first value in the output The value of the test statistic is: -2.0965 6.0649 2.2091
). The relevant critival value is tau
, which is -3.41
for the $5\%$ significance level.
Since The test statistic -2.096 > -3.45
, we do not reject the null hypothesis of a unit root.
The second way to carry out and $\rm ADF$ test:
tst_adf_2 <- tseries::adf.test(lnyse, alternative = "stationary")
We can either manually specify the lag order, or let the function select the maximum lag order itself.
tst_adf_2
This output is a bit easier to read - the $p-value = 0.6198 > 0.05$, so we have no grounds to reject the null hypothesis of a unit root.
The $\rm KPSS$ test tests the null hypothesis: $H_0: \text{ the data is stationary}$
The test involves specifying and estimating the following equation with a random-walk component: $$ Y_t = r_t + \delta \cdot t + \epsilon_t,\quad r_t = r_{t-1} + u_t $$
The null hypothesis of stationarity is equivalent to the hypothesis: $H_0: \sigma^2_u = 0$, which would mean that $r_t = r_{t-1} = ... = r_0 = \rm const$ (i.e. a constant intercept).
We can set the number of lags to use the rule of thumb (lags = "long"
, which is equivalent to $\root 4 \of {12 \times (n/100)}$), or lags = "short"
, which is equivalent to $\root 4 \of {4 \times (n/100)}$. The test types specify as deterministic component either a constant mu
or a constant with linear trend tau
.
tst_kpss_1 <- urca::ur.kpss(lnyse, type = "tau", lags = "long")
tst_kpss_1
We see that the test statistic is 0.3909
. The critical values are:
tst_kpss_1@cval
Since 0.3909 > 0.146
, we reject the null hypothesis of stationarity and conclude that the series is not stationary.
The second way to carry out the KPSS
test (where lshort = FALSE
indicates to use the rule-of-thumb larger lag selection):
tst_kpss_2 <- tseries::kpss.test(lnyse, null = "Trend", lshort = FALSE)
print(tst_kpss_2)
Since p-value = 0.01 < 0.05
, we reject the null hypothesis of stationarity and conclude that the series is not stationary.
The $\rm PP$ test tests builds on the ADF test and tests the null hypothesis: $H_0: \text{ (i.e. the series contains a unit root)}$.
We begin with the built-in function:
tst_pp_0 <- stats::PP.test(lnyse, lshort = FALSE)
tst_pp_0
p-value = 0.6039 > 0.05
, so we have no grounds to reject the null hypothesis of a unit root.
The urca
package also contains the $PP$ test:
tst_pp_1 <- urca::ur.pp(lnyse, model = "trend", type = "Z-tau", lags = "long")
tst_pp_1
Note: use type = "Z-tau"
, otherwise the critical values will be missing!.
If we are interested, we can examine the specified regression, which was used for this test:
tst_pp_1@testreg
Although the critical test statistic is calculated differently than the one in the $ADF$ test, the critical values for the PP test are the same as for the DF test.
The critical values are:
tst_pp_1@cval
Since the value of the test statistic -1.9393
is greater than the $5\%$ critical value -3.420314
, we have no grounds to reject the null hypothesis of a unit root.
The tseries
package also has the $PP$ test:
tst_pp_2 <- tseries::pp.test(lnyse, alternative = "stationary", type = "Z(t_alpha)", lshort = FALSE)
tst_pp_2
The p-value = 0.6039 > 0.05
, we have no grounds to reject the null hypothesis and conclude that the series has a unit root.
So, we can conclude that all tests indicate that the series has a unit root.
# Set plot options in Jupyter Lab
options(repr.plot.width = 10)
options(repr.plot.height = 4)
Since we have determined that the series has a unit root (and we see no indication of seasonality), we can take the differences of the series:
dlnyse <- diff(lnyse)
If we examine the series:
forecast::tsdisplay(dlnyse)
The series appear to be stationary. We will test this by repeating the unit root tests on the differenced data. For simplicity, we will use the test, which have the p-values
, from the tseries
package:
tseries::adf.test(dlnyse, alternative = "stationary")
$p-value = 0.01 < 0.05$, so we reject the null hypothesis of a unit root.
tseries::kpss.test(dlnyse, null = "Level", lshort = FALSE)
$p-value = 0.1 > 0.05$, so we have no grounds to reject the null hypothesis of stationarity.
tseries::pp.test(dlnyse, alternative = "stationary", type = "Z(t_alpha)", lshort = FALSE)
$p-value = 0.01 < 0.05$, so we reject the null hypothesis of a unit root.
In other words, the differences, $\Delta Y_t$, are stationary.
To make it interesting, we will specify three different models:
auto.arima
, without any restrictions.ADF
test we select and re-estimate mod.2
model (this is a different parametrization than the one on dynlm
)¶Furthermore, we will only be able to estimate the drift component, since if $Y_t$ contais a constant $\alpha$ and trend $\delta \cdot t$, then $\Delta Y_t$ contains $\delta$.
In this specification $\alpha$ is not identifiable.
mdl_4_1 <- forecast::Arima(lnyse, order = c(5, 1, 0), xreg = data.frame(tt = 1:length(lnyse)))
print(mdl_4_1)
The residuals of this model:
forecast::tsdisplay(mdl_4_1$residuals)
Appear to be white noise.
We will now expand upon what a different parametrization entails for models specified dynlm
versus the ones specified with Arima()
.
forecast::Arima
(or via stats::arima
) is specified as a process with a linear trend, whose deviations from the trend are described as the AR(1) process:¶For example, assume that we would want to specify a model as a stationary $\rm AR(1)$ model with a trend using forecast::Arima
(or stats::arima
) the specification is as follows:
$ \begin{cases} Y_t &= \alpha + \delta \cdot t + u_t,\\ u_t &= \phi u_{t-1} + \epsilon_t \end{cases} $
mod.a1 <- stats::arima(lnyse, order = c(1,0,0), xreg=time(lnyse))
mod.a1
mod.a2 <- forecast::Arima(lnyse, order = c(1,0,0), xreg=time(lnyse))
mod.a2
The estimated model is:
$ \begin{cases} Y_t &= -144.2528 + 0.0763 \cdot t + u_t,\\ u_t &= 0.9832 u_{t-1} + \epsilon_t \end{cases} $
In this case, we would want to test whether the residuals exhibit a unit root by testing whether $\phi = 1$.
dynlm::dynlm
is specified as an autoregressive process with a linear trend:¶From the (1) expression we would have that:
$ \begin{cases} u_t &= Y_t - \alpha - \delta \cdot t,\\ u_t &= \phi u_{t-1} + \epsilon_t \end{cases} \Longrightarrow Y_t - \alpha - \delta \cdot t = \phi \left(Y_{t-1} - \alpha - \delta \cdot (t-1) \right) + \epsilon_t $
which means that an alternative specification involves a reparametrization of the previous process as:
$ \begin{aligned} Y_t &= \tilde{\alpha} + \tilde{\delta} \cdot t + \phi Y_{t-1} + \epsilon_t = \left[ (1 - \phi) \alpha + \phi \delta\right] + \left[ (1 - \phi)\delta \right] \cdot t + \phi Y_{t-1} + \epsilon_t \end{aligned} $
which is as an autoregressive process with a linear trend.
This is the specification that dynlm::dynlm(...)
uses:
mod.b1 <- dynlm::dynlm(lnyse ~ time(lnyse) + L(lnyse))
summary(mod.b1)
The estimated model is:
$ \begin{aligned} Y_t &= 2.169524 + 0.001152 \cdot t + 0.984518 Y_{t-1} + \epsilon_t \end{aligned} $
The models are equivalent, but due to the parametrization $\alpha \neq \tilde \alpha$ and $\delta \neq \tilde{\delta}$. The autoregressive coefficient $\widehat{\phi}$ also differ due to different parameter estimation methods (which again may further differ for different parametrization options). However they are fairly close: ar1 = 0.9832
versus L(lnyse) = 0.984518
.
If we use the parametrization on the estimated coefficients from the first model, we would have that:
$ \begin{aligned} \tilde{\alpha} &= \left[ (1 - \phi) \alpha + \phi \delta\right]\\ \tilde{\delta} &=\left[ (1 - \phi)\delta \right] \end{aligned} $
# tilde{alpha}
(1 - 0.9832) * (-144.2528) + 0.9832 * 0.0763
# tilde{delta}
(1 - 0.9832) * 0.0763
We see that they are fairly close to (Intercept) = -2.169524
and time(lnyse) = 0.001152
respectively.
par(mfrow = c(1, 2))
forecast::Acf(diff(lnyse))
forecast::Pacf(diff(lnyse))
We can test the lag significance via the $Ljung-Box$ test. The null hypothesis is:
$$
\begin{aligned}
H_0&: \rho(1) = ... = \rho(k) = 0\\
H_1&: \exists j: \rho(j) \neq 0
\end{aligned}
$$
We will test, whether the first 20
lags have a significant autocorrelation with different values of $k$:
for(i in 1:20){
aa <- Box.test(diff(lnyse), lag = i, type = "Ljung-Box")
print(paste0("lag = ", i, " p-value = ", round(aa$p.value, 5)))
}
We see that in all cases the $p-value > 0.05$, so we have no grounds to reject the null hypothesis. This indicates that the differences, $\Delta Y_t$, are white noise. We can specify the model as:
mdl_4_2 <- forecast::Arima(lnyse, order = c(0, 1, 0), include.drift = TRUE)
print(mdl_4_2)
The residuals
forecast::tsdisplay(mdl_4_2$residuals)
for(i in (length(coef(mdl_4_2)) + 1):20){
aa <- Box.test(mdl_4_2$residuals, lag = i, type = "Ljung-Box", fitdf = length(coef(mdl_4_2)))
print(paste0("lag = ", i, " p-value = ", round(aa$p.value, 5)))
}
The residuals are white noise. So this model appears to be adequate. The residuals of the previous model:
for(i in (length(coef(mdl_4_1))+1):20){
aa <- Box.test(mdl_4_1$residuals, lag = i, type = "Ljung-Box", fitdf = length(coef(mdl_4_1)))
print(paste0("lag = ", i, " p-value = ", round(aa$p.value, 5)))
}
are also WN.
auto.arima
without any restrictions¶mdl_4_3 <- forecast::auto.arima(lnyse)
print(mdl_4_3)
So, mdl_4_3
is the same as mdl_4_2
.
If we'd like, we can compare the models via AIC/BIC:
a <- data.frame(mdl_4_1 = c(AIC(mdl_4_1), BIC(mdl_4_1)),
mdl_4_2 = c(AIC(mdl_4_2), BIC(mdl_4_2)))
rownames(a) <- c("AIC", "BIC")
print(a)
Overall, the second model, mdl_4_2
, appears to be better, since its AIC and BIC values are smaller.
Intuitively, we may assume that we can simply differenciate the data and then fit the model on the differences.
# Set plot options in Jupyter Lab
options(repr.plot.width = 5)
options(repr.plot.height = 4)
For, example, the $\rm ARIMA(0, 1, 0)$ on $Y_t$:
m_tst_1 <- forecast::Arima(lnyse, order = c(0, 1, 0), include.drift = TRUE)
m_tst_1
plot(lnyse)
lines(m_tst_1$fitted, col = "red")
Can be written as a $\rm ARIMA(0, 0, 0)$ model on the differences $\Delta Y_t$:
m_tst_2 <- forecast::Arima(diff(lnyse), order = c(0, 0, 0), include.constant = TRUE)
m_tst_2
While the model appears to be the same. If we were to estimate the fit - the fitted values are then calculated for $\widehat{\Delta Y_t}$:
head(fitted(m_tst_2), 10)
which are constant. This is to be expected, since our model for $\widehat{\Delta Y_t}$ is in fact a constant. Unfortunately, in this case the forecasts are NOT the same the $\rm ARIMA(0, 1, 0)$ models on $Y_t$, even if we compare the differences:
# Set plot options in Jupyter Lab
options(repr.plot.width = 15)
options(repr.plot.height = 4)
plot(diff(lnyse), lwd = 2)
lines(fitted(m_tst_2), col = "red")
lines(diff(m_tst_1$fitted), col = "blue", lty = 2)
legend("bottomleft", y.intersp = 3, cex = 0.8,
legend = c(as.expression(bquote(Delta~Y[t])), as.expression(bquote(widehat(Delta~Y)[t])), as.expression(bquote(widehat(Y)[t] - Y[t-1]))),
lty = c(1, 1, 2), col = c("black", "red", "blue"), lwd = 2)
Check out a comment on stackoverflow by the author of the forecast package. For $\Delta Y_t \sim AR(1)$ the difference is that:
When fitting a $I(1)$ model on $Y_t$, the fitted values are: $$\widehat{Y}_t = Y_{t-1} + \widehat{\gamma}\cdot(Y_{t-1} - Y_{t-2})$$
When fitting a $I(0)$ model on $\Delta Y_t$, the fitted values are: $$\widehat{\Delta {Y}}_t = \widehat{\gamma}\cdot(Y_{t-1} - Y_{t-2})$$
In our case this is equivalent to comparing the fitted values of $\widehat{(Y_{t-1} - Y_{t-2})} = \widehat{\alpha}$ versus $\widehat{Y}_t = \widehat{\alpha} + Y_{t-1}$.
The problem is that in general $\widehat{Y}_t - Y_{t-1} \neq \widehat{\Delta {Y}}_t$.
Fortunately, once we have the coefficient estimates we can use the parametrization (assuming that $H_0: \rho = 0$ is not rejected): $\Delta Y_t = \alpha + \delta \cdot t \sum_{j = 1}^{p-1} \gamma_j \Delta Y_{t-j} + \epsilon_t$
In this case, our equation is: $\Delta Y_t = \alpha + \epsilon_t$.
Since we do not want to calulate $\widehat{\Delta Y}_t$, but rather $\widehat{Y}_t$, we simply apply the definition of $\Delta$ and get: $Y_t = \alpha + Y_{t-1} + \epsilon_t$.
The two equations are equivalent, just with a different parametrization. If we fit a model on manually-calculated $\Delta Y_t$ - there is no way for the software to detect that you took the differences MANUALLY.
This is also evident from the fact that the coefficients are identical (though they may sometimes differ due to different starting values for differenced versus non-differenced data).
Remember that the fitted values are calculated as a conditional expectation on $Y_t$, conditionally that any variables on the right-hand-side are given, hence (replacing $\epsilon_t$ with its mean of zero): $$ \widehat{Y}_t = \widehat{\alpha} + Y_{t-1},\ t = 2,...,T $$ Assuming that our series starts at $Y_1$ with $Y_s = 0$ for $s \leq 0$.
(if we had a moving-average part, we could calculate $\widehat{\epsilon}_s = Y_s - \widehat{Y}_s$ for $s < t$ and use the residuals for for $t + 1$).
In the case of the differences model, we have that:
y_fit <- coef(m_tst_2) + lnyse[-length(lnyse)]
The first fitted value is equal to the true value
lnyse[1]
We can compare the fitted values with the ones from the ARIMA model on $Y_t$:
aa <- data.frame(m_tst_1$fitted, c(NA, y_fit))
colnames(aa) <- c("Y_ARIMA", "dY_ARMA")
aa$diff <- aa$Y_ARIMA - aa$dY_ARMA
head(aa)
summary(aa$diff)
Obviously then diff(aa$Y_ARIMA)
and diff(aa$dY_ARMA)
will also be equal. However, this is different from fitted(m_tst_2)
:
bbb <- data.frame(diff(m_tst_1$fitted), m_tst_2$fitted)
colnames(bbb) <- c("Y_ARIMA_diff", "dY_ARMA_diff")
bbb$diff <- bbb$Y_ARIMA - bbb$dY_ARMA
head(bbb)
summary(bbb$diff)
All in all, it is much easier to specify the model for $Y_t$, instead of manually calculating the differences and estimating a model on $\Delta Y_t$, as most forecasting and fittign functions are readily available.
Assume that we have selected the following model as the "best" one:
mdl_4_2
which we can write as: $\Delta Y_{t} = 0.0069 + \epsilon_t$, or equivalently as: $Y_t = 0.0069 + Y_{t-1} + \epsilon_t$. In other words, this is a random walk with drift.
Assume that we are not so "lucky" and our model is much more complex. For example
mod.2 <- dynlm(d(lnyse) ~ L(d(lnyse), 1:5) + time(lnyse))
print(round(coef(summary(mod.2)), 5))
Since we needed the trend for unit root testing - having removed the insignificant $\rho$ coefficient, we could try to remove the insignificant trend and improve our model. But we will assume that this model is acceptible as is.
Remembering the model parametrization of dynlm
, the estimated model for $\Delta Y_t$ is:
$\Delta Y_t = -0.07357 + 0.00004 \cdot t + 0.04398 \Delta Y_{t-1} -0.03932 \Delta Y_{t-2} + 0.01035 \Delta Y_{t-3} + 0.02288 \Delta Y_{t-4} + 0.09225 \Delta Y_{t-5} + \epsilon_t$,
which we can re-write as a model for $Y_t$.
We use the property $\gamma_j = -[\phi_{j+1} + ... + \phi_p]$, $j = 1, ..., p - 1$ and the expression of $\rho$ to get the coefficients:
$ \begin{aligned} \gamma_5 &= - \phi_6 &&\Longrightarrow \phi_6 = - \gamma_5 = -0.09225\\ \gamma_4 &= - [\phi_5 + \phi_6] &&\Longrightarrow \phi_5 = - \gamma_4 - \phi_6 = -0.02288+0.09225 = 0.06937\\ \gamma_3 &= - [\phi_4 + \phi_5 + \phi_6] &&\Longrightarrow \phi_4 = - \gamma_3 - [\phi_5 + \phi_6] = - \gamma_3 + \gamma_4 = -0.01035+0.02288 = 0.01253\\ \gamma_2 &= - [\phi_3 + \phi_4 + \phi_5 + \phi_6] &&\Longrightarrow \phi_3 = - \gamma_2 + \gamma_3 = 0.03932 + 0.01035 = 0.04967\\ \gamma_1 &= - [\phi_2 + \phi_3 + \phi_4 + \phi_5 + \phi_6] &&\Longrightarrow \phi_2 = - \gamma_1 + \gamma_2 = -0.04398 -0.03932 = -0.0833\\ \rho &= \phi_1 + \phi_2 + \phi_3 + \phi_4 + \phi_5 + \phi_6 - 1,\text{ since } \rho = 0 &&\Longrightarrow \phi_1 = 1 - (\phi_2 + \phi_3 + \phi_4 + \phi_5 + \phi_6) = 1 + \gamma_1= 1 + 0.04398 = 1.04398 \end{aligned} $
So, the equivalent model for $Y_t$ is:
$Y_t = -0.07357 + 0.00004 \cdot t + 1.04398 Y_{t-1} -0.0833 Y_{t-2} + 0.04967 Y_{t-3} + 0.01253 Y_{t-4} + 0.06937 Y_{t-5} -0.09225 Y_{t-6} + \epsilon_t$
To verify that this is indeed the correct transformation, we can try to simulate a few observations with the same residuals:
set.seed(123)
N <- 200
eps <- rnorm(mean = 0, sd = 1, n = N)
tt <- 1:length(eps)
First, we will simulate $Y_t$:
Y <- NULL
Y[1] <- -0.07357 + 0.00004 * tt[1] + eps[1]
Y[2] <- -0.07357 + 0.00004 * tt[2] + 1.04398 * Y[1] + eps[2]
Y[3] <- -0.07357 + 0.00004 * tt[3] + 1.04398 * Y[2] −0.0833 * Y[1] + eps[3]
Y[4] <- -0.07357 + 0.00004 * tt[4] + 1.04398 * Y[3] −0.0833 * Y[2] + 0.04967 * Y[1] + eps[4]
Y[5] <- -0.07357 + 0.00004 * tt[5] + 1.04398 * Y[4] −0.0833 * Y[3] + 0.04967 * Y[2] + 0.01253 * Y[1] + eps[5]
Y[6] <- -0.07357 + 0.00004 * tt[6] + 1.04398 * Y[5] −0.0833 * Y[4] + 0.04967 * Y[3] + 0.01253 * Y[2] + 0.06937 * Y[1] + eps[6]
for(j in 7:N){
Y[j] <- -0.07357 + 0.00004 * tt[j] + 1.04398 * Y[j-1] −0.0833 * Y[j-2] + 0.04967 * Y[j - 3] + 0.01253 * Y[j - 4] + 0.06937 * Y[j - 5] − 0.09225 * Y[j - 6] + eps[j]
}
Secondly, we will simulate $\Delta Y_t$:
dY <- NULL
dY[1] <- -0.0735 + 0.00004 * tt[1] + eps[1]
dY[2] <- -0.07357 + 0.00004 * tt[2] + 0.04398 * dY[1] + eps[2]
dY[3] <- -0.07357 + 0.00004 * tt[3] + 0.04398 * dY[2] −0.03932 * dY[1] + eps[3]
dY[4] <- -0.07357 + 0.00004 * tt[4] + 0.04398 * dY[3] −0.03932 * dY[2] + 0.01035 * dY[1] + eps[4]
dY[5] <- -0.07357 + 0.00004 * tt[5] + 0.04398 * dY[4] −0.03932 * dY[3] + 0.01035 * dY[2] + 0.02288 * dY[1] + eps[5]
for(j in 6:N){
dY[j] <- -0.07357 + 0.00004 * tt[j] + 0.04398 * dY[j-1] −0.03932 * dY[j-2] + 0.01035 * dY[j - 3] + 0.02288 * dY[j - 4] + 0.09225 * dY[j - 5] + eps[j]
}
Finally, we will assume that the first 100
, or so, observations of $Y_t$ are burn-in values (since the first couple of values assume that $Y_j = \epsilon_j = 0$, for $j \leq 0$). This means that we treat treat the first 101
$\Delta Y_t$ values as burn-in values as well and drop them:
Y <- Y[-c(1:100)]
dY <- dY[-c(1:101)]
We can compare $\Delta Y_t$ with $Y_t - Y_{t-1}$:
dt_compare <- data.frame(diff(Y), dY)
dt_compare$DIFFERENCE <- round(diff(Y) - dY, 10)
head(dt_compare)
We see that there is no difference between the simulate $Y_t - Y_{t-1}$ (diff.Y.
), which was calculated from the simulated $Y_t$ series, and $\Delta Y_t$ (dY
).
We can also visually compare the charts:
plot.ts(ts(diff(Y)), col = "red", lwd = 2)
lines(ts(dY), col = "blue", lty = 2, lwd = 2)
So, the two models are equivalent. For simulation and forecasting we may be more interested in $Y_t$ model, as it is the original series, which we are interested in.
On the other hand, if our model was specified via Arima
, we would need to differently specify the model.
forecast::Arima(lnyse, order = c(5, 1, 0), xreg = data.frame(tt = 1:length(lnyse)))
which is equivalent to:
forecast::Arima(lnyse, order = c(5, 1, 0), include.drift = TRUE)
or
arima(lnyse, order = c(5, 1, 0), xreg = 1:length(lnyse))
This model, estimated via Arima
for $\Delta Y_t$ is:
$ \begin{cases} \Delta Y_t &= \tilde{\delta} + u_t \\ u_t &= \gamma_1 u_{t-1} + \gamma_2 u_{t-2} + \gamma_3 u_{t-3} + \gamma_4 u_{t-4} + \gamma_5 u_{t-5} + \epsilon_t \end{cases} $
which is equivalent to:
$ \Delta Y_t = \tilde{\delta} (1 - \gamma_1 - \gamma_2 - \gamma_3 - \gamma_4 - \gamma_5) + \gamma_1 \Delta Y_{t-1} + \gamma_2 \Delta Y_{t-2} + \gamma_3 \Delta Y_{t-3} + \gamma_4 \Delta Y_{t-4} + \gamma_5 \Delta Y_{t-5} + \epsilon_t $
cfs <- coef(forecast::Arima(lnyse, order = c(5, 1, 0), include.drift = TRUE))
print(round(unname(cfs["drift"]) * (1 - sum(cfs[paste0("ar", 1:5)])), 4))
Along with the coefficient values gives us:
$ \Delta Y_t = 0.006 + 0.0380 \Delta Y_{t-1} - 0.0350 \Delta Y_{t-2} + 0.0089\Delta Y_{t-3} + 0.0228 \Delta Y_{t-4} + 0.0923 \Delta Y_{t-5} + \epsilon_t $
An important distinction is how we arrived at such a model. In the previous case - we have transformed $Y_t$ into an equivalent model via differences in order to test whether a unit root exists. Since it did - we removed an additional insignificant parameter and were left with a model, which included both a constant and a trend in the differences model.
Assuming that the parametrization is the same as before, with $\rho = 0$ - we arrive at the following coefficient parametrizations:
So the final equation is $$ \begin{aligned} Y_t &= \tilde{\delta} (1 - \gamma_1 - \gamma_2 - \gamma_3 - \gamma_4 - \gamma_5) + (1 + \gamma_1) Y_{t-1} + (\gamma_2-\gamma_1) Y_{t-2} + (\gamma_3 - \gamma_2) Y_{t-3} + (\gamma_4-\gamma_3) Y_{t-4} + (\gamma_5-\gamma_4) Y_{t-5} + (-\gamma_5)Y_{t-6} + \epsilon_t \end{aligned} $$
sprintf("phi_1 = %f", round(1 + cfs["ar1"], 4))
sprintf("phi_2 = %f", round(cfs["ar2"] - cfs["ar1"], 4))
sprintf("phi_3 = %f", round(cfs["ar3"] - cfs["ar2"], 4))
sprintf("phi_4 = %f", round(cfs["ar4"] - cfs["ar3"], 4))
sprintf("phi_5 = %f", round(cfs["ar5"] - cfs["ar4"], 4))
sprintf("phi_6 = %f", round(- cfs["ar5"], 4))
along with the coefficients can be written as:
\begin{aligned} Y_t &= 0.006 + 1.038 Y_{t-1} -0.0729 Y_{t-2} + 0.0438 Y_{t-3} + 0.0139 Y_{t-4} + 0.0695 Y_{t-5} -0.0923Y_{t-6} + \epsilon_t \end{aligned}We will show a simulation example to highlight that the two equations produce the same processes.
dynlm
]¶If we want to directly compare with dynlm
, we need to specify the trend to be $t = 1,...$, instead of time()
, which takes into account the frequency and year.
lnyse_tt <- ts(1:length(lnyse), start = start(lnyse), frequency = frequency(lnyse))
round(coef(summary(dynlm(d(lnyse) ~ L(d(lnyse), 1:5) + lnyse_tt))), 6)
Compared with the dynlm
models:
$\Delta Y_t = 0.005143 + 0.000003 \cdot t + 0.04398 \Delta Y_{t-1} -0.03932 \Delta Y_{t-2} + 0.01035 \Delta Y_{t-3} + 0.02288 \Delta Y_{t-4} + 0.09225 \Delta Y_{t-5} + \epsilon_t$,
and
$Y_t = 0.005143 + 0.000003 \cdot t + 1.04398 Y_{t-1} -0.0833 Y_{t-2} + 0.04967 Y_{t-3} + 0.01253 Y_{t-4} + 0.06937 Y_{t-5} -0.09225 Y_{t-6} + \epsilon_t$
The difference is due to the different parametrization and different parameter estimation methods employed. Again not thet since we changed the trend component to have values $t = 1,...$, only the (intercept)
and trend
parameters changed in the dynlm
model.
If we want our model to have a trend component, we need to use the fact that $Y_t = c_1 + c_2 \cdot t + \phi Y_{t-1} + w_t$ can be expressed as
$Y_t - Y_{t-1} = c_1 - c_1 + c_2 \cdot t - c2 \cdot (t-1) + \phi Y_{t-1} - \phi Y_{t-1} + w_t - w_{t-1}$
or
$\Delta Y_t = c_2 + \phi \Delta Y_{t-1} + \Delta w_t$. In other words, we cannot identify the constant $c_1$ and so we can set it to $0$.
set.seed(123)
N <- 200
eps <- rnorm(mean = 0, sd = 1, n = N)
tt <- 1:length(eps)
delta_coef <- 0.0069 * (1 - 0.0380 + 0.0350 - 0.0089 - 0.0228 - 0.0923)
delta_coef
Y <- NULL
Y[1] <- delta_coef + eps[1]
Y[2] <- delta_coef + (1 + 0.0380) * Y[1] + eps[2]
Y[3] <- delta_coef + (1 + 0.0380) * Y[2] + (-0.0350 - 0.0380) * Y[1] + eps[3]
Y[4] <- delta_coef + (1 + 0.0380) * Y[3] + (-0.0350 - 0.0380) * Y[2] + (0.0089 + 0.0350) * Y[1] + eps[4]
Y[5] <- delta_coef + (1 + 0.0380) * Y[4] + (-0.0350 - 0.0380) * Y[3] + (0.0089 + 0.0350) * Y[2] + (0.0228 - 0.0089) * Y[1] + eps[5]
Y[6] <- delta_coef + (1 + 0.0380) * Y[5] + (-0.0350 - 0.0380) * Y[4] + (0.0089 + 0.0350) * Y[3] + (0.0228 - 0.0089) * Y[2] + (0.0923 - 0.0228) * Y[1] + eps[6]
for(j in 7:N){
Y[j] <- delta_coef + (1 + 0.0380) * Y[j-1] + (-0.0350 - 0.0380) * Y[j-2] + (0.0089 + 0.0350) * Y[j - 3] + (0.0228 - 0.0089) * Y[j - 4] + (0.0923 - 0.0228) * Y[j - 5] − 0.0923 * Y[j - 6] + eps[j]
}
Secondly, we will simulate $\Delta Y_t$:
dY <- NULL
dY[1] <- delta_coef + eps[1]
dY[2] <- delta_coef + 0.0380 * dY[1] + eps[2]
dY[3] <- delta_coef + 0.0380 * dY[2] -0.0350 * dY[1] + eps[3]
dY[4] <- delta_coef + 0.0380 * dY[3] -0.0350 * dY[2] + 0.0089 * dY[1] + eps[4]
dY[5] <- delta_coef + 0.0380 * dY[4] -0.0350 * dY[3] + 0.0089 * dY[2] + 0.0228 * dY[1] + eps[5]
for(j in 6:N){
dY[j] <- delta_coef + 0.0380 * dY[j-1] -0.0350 * dY[j-2] + 0.0089 * dY[j - 3] + 0.0228 * dY[j - 4] + 0.0923 * dY[j - 5] + eps[j]
}
Finally, we will assume that the first 100
, or so, observations of $Y_t$ are burn-in values (since the first couple of values assume that $Y_j = \epsilon_j = 0$, for $j \leq 0$). This means that we treat treat the first 101
$\Delta Y_t$ values as burn-in values as well and drop them:
Y <- Y[-c(1:100)]
dY <- dY[-c(1:101)]
We can compare $\Delta Y_t$ with $Y_t - Y_{t-1}$:
dt_compare <- data.frame(diff(Y), dY)
dt_compare$DIFFERENCE <- round(diff(Y) - dY, 10)
head(dt_compare)
We see that there is no difference between the simulated $Y_t - Y_{t-1}$ (diff.Y.
), which was calculated from the simulated $Y_t$ series, and $\Delta Y_t$ (dY
).
We can also visually compare the charts:
plot.ts(ts(diff(Y)), col = "red", lwd = 2)
lines(ts(dY), col = "blue", lty = 2, lwd = 2)
Furthermore, we can compare with the values, simulated with dynlm
coefficients:
Y_dynlm <- NULL
Y_dynlm[1] <- 0.005143 + 0.000003 * tt[1] + eps[1]
Y_dynlm[2] <- 0.005143 + 0.000003 * tt[2] + 1.04398 * Y_dynlm[1] + eps[2]
Y_dynlm[3] <- 0.005143 + 0.000003 * tt[3] + 1.04398 * Y_dynlm[2] −0.0833 * Y[1] + eps[3]
Y_dynlm[4] <- 0.005143 + 0.000003 * tt[4] + 1.04398 * Y_dynlm[3] −0.0833 * Y_dynlm[2] + 0.04967 * Y_dynlm[1] + eps[4]
Y_dynlm[5] <- 0.005143 + 0.000003 * tt[5] + 1.04398 * Y_dynlm[4] −0.0833 * Y_dynlm[3] + 0.04967 * Y_dynlm[2] + 0.01253 * Y_dynlm[1] + eps[5]
Y_dynlm[6] <- 0.005143 + 0.000003 * tt[6] + 1.04398 * Y_dynlm[5] −0.0833 * Y_dynlm[4] + 0.04967 * Y_dynlm[3] + 0.01253 * Y_dynlm[2] + 0.06937 * Y_dynlm[1] + eps[6]
for(j in 7:N){
Y_dynlm[j] <- 0.005143 + 0.000003 * tt[j] + 1.04398 * Y_dynlm[j-1] −0.0833 * Y_dynlm[j-2] + 0.04967 * Y_dynlm[j - 3] + 0.01253 * Y_dynlm[j - 4] + 0.06937 * Y_dynlm[j - 5] − 0.09225 * Y_dynlm[j - 6] + eps[j]
}
Y_dynlm <- Y_dynlm[-c(1:100)]
plot.ts(ts(Y), col = "red", lwd = 2)
lines(ts(Y_dynlm), col = "blue", lty = 2, lwd = 2)
legend("topright", c("Y via forecast::Arima() parametrization", "", "Y via dynlm() parametrization"), col = c("red", NA, "blue"), lty = c(1, NA, 2))
# Set plot options in Jupyter Lab
options(repr.plot.width = 10)
options(repr.plot.height = 4)
tt_forc <- data.frame(tt = length(lnyse) + 1:10)
mdl_4_1_forc <- forecast(mdl_4_1, h = 10, xreg = tt_forc)
mdl_4_2_forc <- forecast(mdl_4_2, h = 10)
We will plot the original historical data, the fitted values and the forecasts.
plot.ts(exp(lnyse),
xlim = c(floor(min(time(lnyse))), floor(max(time(lnyse))) + 1),
ylim = c(0, max(exp(mdl_4_1_forc$mean), exp(mdl_4_2_forc$mean))))
#
lines(exp(fitted(mdl_4_1)), col = "red", lty = 2)
lines(exp(fitted(mdl_4_2)), col = "blue", lty = 2)
#
lines(exp(mdl_4_1_forc$mean), col = "red", lty = 2)
lines(exp(mdl_4_2_forc$mean), col = "blue", lty = 2)
As it is difficult to see, we will also reduce the time window to see the last few years
plot.ts(window(exp(lnyse), start = 1990),
xlim = c(1990, floor(max(time(lnyse))) + 1),
ylim = c(1800, max(exp(mdl_4_1_forc$mean), exp(mdl_4_2_forc$mean))),
lwd = 2)
#
lines(exp(fitted(mdl_4_1)), col = "red", lty = 2)
lines(exp(fitted(mdl_4_2)), col = "blue", lty = 2)
#
lines(exp(mdl_4_1_forc$mean), col = "red", lty = 2)
lines(exp(mdl_4_2_forc$mean), col = "blue", lty = 2)
We see that the model forecasts do not appear to differ by a whole lot.
Assume that our best model (again, we can select it via AIC or BIC but we choose a more complex one for a more general example) is:
mdl_4_1
forecast::Arima(lnyse, order = c(5, 1, 0), include.drift = TRUE)
So, we will use $k$ of around $20\%$ of our series and create $k = \lfloor 0.2 \cdot T \rfloor$ samples and for each sample we will:
forcast_errors <- NULL
mdl_params <- NULL
k <- floor(length(lnyse) * 0.2)
k
for(i in k:1){
# get the sample
smpl <- lnyse[1:(length(lnyse) - i)]
# re-estimate the model:
tt <- data.frame(tt = 1:length(smpl))
smpl_mdl <- forecast::Arima(smpl, order = c(5, 1, 0), include.drift = TRUE)
# calculate the one-step ahead forecast
smpl_forc_1 <- forecast(smpl_mdl, h = 1) # since we have decided to include the trend as an exogeneous regressor,
# alternatively - see the code in this file on how to include it automatically
# calculate the forecast error:
tmp_e <- lnyse[length(smpl) + 1] - smpl_forc_1$mean
# save the parameters:
mdl_params <- rbind(mdl_params, coef(smpl_mdl))
# Save the forecast errors:
forcast_errors <- c(forcast_errors, tmp_e)
}
tail(forcast_errors, 10)
The root mean squared errror is:
paste0("RMSE = ", round(sqrt(mean(forcast_errors^2)), 5))
The parameter mean is:
t(apply(mdl_params, 2, mean))
and the variance is:
t(apply(mdl_params, 2, var))
We can visualize the series as separate plots:
# Set plot options in Jupyter Lab
options(repr.plot.width = 10)
options(repr.plot.height = 6)
par(mfrow = c(3, 2))
for(i in 1:ncol(mdl_params)){
plot(mdl_params[, i], main = colnames(mdl_params)[i])
points(length(mdl_params[, i]) + 1, coef(mdl_4_1)[i], col = "red", pch = 19)
}
The red dot coincides with the full sample estimate (we can also use abline
to draw it as a horizontal line, but we do not want to treat it as a true coefficient value).
We see that as the sample size increases (here the sample size is T - k + index
), the estimates converge to some value (maybe if we had more data the values would continue to converge to some unknown value?). The drift
estimate exhibits more variation compared to the remaining parameters so it is unclear how stable it would be if we had a larger sample size.
We can also use a built-in function to carry out the cross validation for us. We do need to specify our model as a spearate function for this process to work:
far1 <- function(x, h){
forecast(Arima(x, order = c(5, 1, 0), include.drift = TRUE), h = h)
}
#
ts_cv_out <- tsCV(y = lnyse, forecastfunction = far1, h = 1)
head(ts_cv_out, 20)
Note that this process is long and performs a cross-validation on the whole dataset. We can compare the last few values with our manually calculated ones:
tail(ts_cv_out[-c(1:(length(log_passengers) - k - 1))], 11)
We see that the errors are the same as from our manual calculations:
tail(forcast_errors, 10)
The airpassangers series, which we analysed previously, may also be seasonally integrated. This would require us to repeat the previous steps not only on non-seasonal differences, but on the seasonal differences as well (as well as combining both seasonal and non-seasonal differences). Here we will simply provide the output of the auto.arima
function:
mdl_auto = forecast::auto.arima(log_passengers)
mdl_auto
The model appears to fit the data quite well. It appears that the series exhibited unit roots in both the seasonal and non-seasonal autoregressive components.
We can forecast the logarithms of the series, as well as plot the fitted values as follows:
# Set plot options in Jupyter Lab
options(repr.plot.width = 10)
options(repr.plot.height = 5)
plot(forecast(mdl_auto), main = forecast(mdl_auto)$method)
lines(fitted.values(mdl_auto), col = "red")
In order to get the original values, we would need to take exponents of appropriate elements from the forecast(mdl_auto)
variable. A more convenient way is fitting the original series and specifying a logarithmic transformation:
mdl_auto = forecast::auto.arima(airpass, lambda = 0)
mdl_auto
We can forecast the series, as well as plot the fitted values of the original data:
plot(forecast(mdl_auto), main = forecast(mdl_auto)$method)
lines(airpass, col = "darkgreen", lwd = 2) # to verify that it is indeed the original series
lines(fitted.values(mdl_auto), col = "red")