3.2 Example

Considerations

The following libraries will be useful for this exercise.

suppressPackageStartupMessages({
  suppressWarnings({
    suppressMessages({
      library(forecast)
      library(fma)
      library(astsa)
      library(ggplot2)
    })
  })
})      
#Import the required modules for vectors and matrix operations, data generation
import numpy as np
#Import the required modules for plot creation:
import matplotlib.pyplot as plt
#import the required modules for TimeSeries data generation:
import statsmodels.api as sm
#Import the required modules for test statistic calculation:
import statsmodels.stats as sm_stat
#Import the required modules for model estimation:
import statsmodels.tsa as smt
#Import dmatrix to fit regression splines
from patsy import dmatrix
#Import the required modules for time series data types
import pandas as pd
# Show all columns of a pd.DataFrame to prevent line breaks in wide tables
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', -1)
pd.set_option('display.expand_frame_repr', False)

We will also need some custom functions to make plotting easier:

def tsdisplay(y, figsize = (10, 8), title = "", lags = 20):
    tmp_data = pd.Series(y)
    fig = plt.figure(figsize = figsize)
    #Plot the time series
    _ = tmp_data.plot(ax = fig.add_subplot(311), title = "$Time\ Series\ " + title + "$", legend = False)
    #Plot the ACF:
    _ = sm.graphics.tsa.plot_acf(tmp_data, lags = lags, zero = False, ax = fig.add_subplot(323))
    _ = plt.xticks(np.arange(1,  lags + 1, 1.0))
    #Plot the PACF:
    _ = sm.graphics.tsa.plot_pacf(tmp_data, lags = lags, zero = False, ax = fig.add_subplot(324))
    _ = plt.xticks(np.arange(1,  lags + 1, 1.0))
    #Plot the QQ plot of the data:
    _ = sm.qqplot(tmp_data, line='s', ax = fig.add_subplot(325)) 
    _ = plt.title("QQ Plot")
    #Plot the residual histogram:
    _ = fig.add_subplot(326).hist(tmp_data, bins = 40, density = 1)
    _ = plt.title("Histogram")
    #Fix the layout of the plots:
    _ = plt.tight_layout()
    plt.show()
tsdiag2 <- function(y, title = "", lags = 30, df = 0){
  rez <- matrix(0, nrow = 4, ncol = lags)
  rownames(rez) <- c("Ljung-Box: X-squared", "Ljung-Box: p-value", "Box-Pierce: X-squared", "Box-Pierce: p-value")
  colnames(rez) <- 1:lags
  suppressWarnings({
    rez[1, ] <- sapply(1:lags, function(z){Box.test(y, lag = z, fitdf = df, type = "Ljung-Box")$statistic})
    rez[2, ] <- sapply(1:lags, function(z){Box.test(y, lag = z, fitdf = df, type = "Ljung-Box")$p.value})
    rez[3, ] <- sapply(1:lags, function(z){Box.test(y, lag = z, fitdf = df, type = "Box-Pierce")$statistic})
    rez[4, ] <- sapply(1:lags, function(z){Box.test(y, lag = z, fitdf = df, type = "Box-Pierce")$p.value})
  })
  # Fix for when df - fitdf = 0, since Box.test(y, lag = 0) and Box.test(y, lag = 1, fitdf = 1) do not always return NA
  rez[ c(2, 4), 1:lags == df] <- NA
  print(rez)
  # Plot Ljung-Box and Box-Pierce statistic p-values:
  plot(1:lags, rez[2, ], pch = 19, col = "blue", main = paste0("Time Series, with model fitted df = ", df),
       ylim = c(0, max(rez[c(2, 4),], 0.05, na.rm = TRUE)))
  points(1:lags, rez[4, ], pch = 19, col = "darkgreen")
  abline(h = 0.05, col = "red")
  axis(1, at = 1:lags, labels = 1:lags)
  legend("topleft", legend = c("Ljung-Box", "Box-Pierce", "5% critical value"),
         lty = c(NA, NA, 1), pch = c(19, 19, NA), col = c("blue", "darkgreen", "red"))
  # Return nothing:
  return(NULL)
}
def tsdiag(y, fig_size = (10, 8), title = "", lags = 30, df = 0):
    tmp_data = pd.Series(y.copy()).reset_index(drop = True)
    tmp_data.index = list(range(1, len(tmp_data) + 1))
    # Carry out the Ljung-Box and Box-Pierce tests:
    tmp_acor = list(sm_stat.diagnostic.acorr_ljungbox(tmp_data, lags = lags, model_df = df, boxpierce = True))
    #
    col_index = ["Ljung-Box: X-squared", "Ljung-Box: p-value", "Box-Pierce: X-squared", "Box-Pierce: p-value"]
    rez = pd.DataFrame(tmp_acor, index = col_index, columns = range(1, len(tmp_acor[0]) + 1))
    print(rez)
    # Plot Ljung-Box and Box-Pierce statistic p-values:
    fig = plt.figure(figsize = fig_size)
    _ = plt.plot(range(1, len(tmp_acor[0]) + 1), tmp_acor[1], 'bo', label = "Ljung-Box values")
    _ = plt.plot(range(1, len(tmp_acor[0]) + 1), tmp_acor[3], 'go', label = "Box-Pierce values")
    _ = plt.xticks(np.arange(1,  len(tmp_acor[0]) + 1, 1.0))
    _ = plt.axhline(y = 0.05, color = "blue", linestyle = "--", label = "5% critical value")
    _ = plt.title("$Time\ Series\ " + title + "$, with model fitted df = " + str(df))
    _ = plt.legend()
    # get the p-values of the Ljung-Box test:
    p_vals = pd.Series(tmp_acor[1])
    # Start the index from 1 instead of 0 (because Ljung-Box test is for lag values from 1 to k)
    p_vals.index += 1
    # Annotate the p-value points above and to the left of the vertex
    x = np.arange(p_vals.size) + 1
    for X, Y, Z in zip(x, p_vals, p_vals):
        plt.annotate(round(Z, 4), xy=(X,Y), xytext=(-5, 5), ha = 'left', textcoords='offset points')
    plt.show()
    # Return nothing
    return None

Note that we have slightly modified the tsdiag() function - it now adds the p-values of the \(Ljung-Box\) test to the plot itself.

Note that these functions are not necessary, but they make plotting much easier since we will frequently need to plot the data, its autocorrelation, as well as test whether the autocorrelation is significant.

3.2.1 Exercise Set 1: Exploratory Data Analysis (EDA)

We will show an example with the airpass dataset.

We will begin by reading in the data:

airpass <- fma::airpass
airpass = sm.datasets.get_rdataset("AirPassengers", "datasets")
airpass = pd.Series(airpass.data["value"])
airpass.index = pd.date_range(start = "1949-01", periods = len(airpass.index), freq = "M").to_period()
airpass.index = airpass.index.to_timestamp()
airpass.head()
## 1949-01-01    112
## 1949-02-01    118
## 1949-03-01    132
## 1949-04-01    129
## 1949-05-01    121
## Freq: MS, Name: value, dtype: int64

3.2.1.1 Task 1

We begin by plotting our data as follows:

#
#
plot(airpass, col = "cornflowerblue")

fig = plt.figure(figsize = (10, 8))
_ = airpass.plot(ax = fig.add_subplot(111))
plt.show()

We note a couple of visual indications of non-stationarity:

  • The mean of the process appears to be increasing as time increases;
  • The variance of the process (i.e. the fluctuations) appears to be smaller at the beginning and much larger at the end of the time series;
  • The monthly data appears to exhibit a seasonality pattern.
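
One way to quantify the first two points is to compare rolling statistics of the series. A minimal sketch, assuming the airpass pandas Series created above (the window length of 12 is our own choice, matching the monthly frequency):

# Rolling mean and standard deviation as an informal check of the changing level and variance:
roll_mean = airpass.rolling(window = 12).mean()
roll_std = airpass.rolling(window = 12).std()
fig = plt.figure(figsize = (10, 4))
_ = airpass.plot(label = "airpass")
_ = roll_mean.plot(label = "Rolling mean (window = 12)")
_ = roll_std.plot(label = "Rolling st. dev. (window = 12)")
_ = plt.legend()
plt.show()

If both the rolling mean and the rolling standard deviation increase over time, this supports the first two points above.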

We can view this seasonality by plotting each year as a separate plot.

Since we have a time series variable:

str(airpass)
##  Time-Series [1:144] from 1949 to 1961: 112 118 132 129 121 135 148 148 136 119 ...
airpass.index
## DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',
##                '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',
##                '1949-09-01', '1949-10-01',
##                ...
##                '1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01',
##                '1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01',
##                '1960-11-01', '1960-12-01'],
##               dtype='datetime64[ns]', length=144, freq='MS')

We can extract its year and month:

print(start(airpass)[1]:end(airpass)[1])
##  [1] 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
print(rep(start(airpass)[2]:frequency(airpass), times = end(airpass)[1] - start(airpass)[1] + 1)[1:length(airpass)])
##   [1]  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12
print(airpass.index.year)
## Int64Index([1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949,
##             ...
##             1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960],
##            dtype='int64', length=144)
print(airpass.index.month)
## Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
##             ...
##              3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
##            dtype='int64', length=144)

which we can use to transpose the data:

data_to_plot = pd.pivot_table(airpass.to_frame(), index = airpass.index.month, columns = airpass.index.year)
data_to_plot.head()
##   value                                                       
##    1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
## 1   112  115  145  171  196  204  242  284  315  340  360  417
## 2   118  126  150  180  196  188  233  277  301  318  342  391
## 3   132  141  178  193  236  235  267  317  356  362  406  419
## 4   129  135  163  181  235  227  269  313  348  348  396  461
## 5   121  125  172  183  229  234  270  318  355  363  420  472

and plot the seasonal plots:

# Two ways to create plots: the second one automatically adds colors and a legend
# forecast::seasonplot(airpass)
forecast::ggseasonplot(airpass) + ggplot2::theme_bw()

fig = plt.figure(figsize = (10, 8))
_ = data_to_plot.plot(ax = fig.add_subplot(111))
plt.show()

We see that there appears to be an increase in passengers between June and August each year, with the fewest airline passengers in February and November of each year.

This appears to be a deterministic (i.e. non-random and repeating) pattern.

Overall, the series is not stationary.

3.2.1.2 Task 2

One way to check for autocorrelation is to examine the ACF and PACF plots, which we can do using our own tsdisplay function:

forecast::tsdisplay(airpass)

tsdisplay(airpass)

To test for significant autocorrelation, we use the Ljung-Box test. We will again opt to use our own function, as it is a bit more convenient:

tsdiag2(airpass, lags = 20)
##                              1        2        3        4        5        6        7        8        9       10       11        12       13       14       15       16       17       18       19       20
## Ljung-Box: X-squared  132.1415 245.6462 342.6748 427.7387 504.7966 575.6019 643.0386 709.4845 779.5912 857.0686 944.3903 1036.4819 1117.992 1185.553 1241.504 1289.037 1330.381 1367.042 1401.081 1434.149
## Ljung-Box: p-value      0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000    0.0000    0.000    0.000    0.000    0.000    0.000    0.000    0.000    0.000
## Box-Pierce: X-squared 129.4263 239.8212 333.5270 415.0951 488.4584 555.3839 618.6636 680.5584 745.3831 816.4925 896.0390  979.2999 1052.435 1112.593 1162.029 1203.702 1239.665 1271.304 1300.447 1328.532
## Box-Pierce: p-value     0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000    0.0000    0.000    0.000    0.000    0.000    0.000    0.000    0.000    0.000

## NULL
tsdiag(airpass, lags = 20)
##                                  1             2             3             4              5              6              7              8              9              10             11             12             13             14             15             16             17             18             19             20
## Ljung-Box: X-squared   1.321415e+02  2.456462e+02  3.426748e+02  4.277387e+02   5.047966e+02   5.756019e+02   6.430386e+02   7.094845e+02   7.795912e+02   8.570686e+02   9.443903e+02   1.036482e+03   1.117992e+03   1.185553e+03   1.241504e+03   1.289037e+03   1.330381e+03   1.367042e+03   1.401081e+03   1.434149e+03
## Ljung-Box: p-value     1.393231e-30  4.556318e-54  5.751088e-74  2.817731e-91  7.360195e-107  4.264008e-121  1.305463e-134  6.496271e-148  5.249370e-162  1.100789e-177  1.766396e-195  2.682212e-214  7.708990e-231  2.212517e-244  1.982759e-255  1.137910e-264  1.384297e-272  1.691589e-279  7.482011e-286  5.300473e-292
## Box-Pierce: X-squared  1.294263e+02  2.398212e+02  3.335270e+02  4.150951e+02   4.884584e+02   5.553839e+02   6.186636e+02   6.805584e+02   7.453831e+02   8.164925e+02   8.960390e+02   9.792999e+02   1.052435e+03   1.112593e+03   1.162029e+03   1.203702e+03   1.239665e+03   1.271304e+03   1.300447e+03   1.328532e+03
## Box-Pierce: p-value    5.471060e-30  8.384679e-53  5.499710e-72  1.522155e-88  2.473624e-103  9.752966e-117  2.327567e-129  1.095987e-141  1.203020e-154  5.870025e-169  4.405101e-185  5.277465e-202  9.512605e-217  1.053877e-228  2.336849e-238  2.390890e-246  4.075967e-253  5.829188e-259  2.829002e-264  2.291495e-269

Since all the p-values are less than the 0.05 critical value, we reject the null hypothesis of no autocorrelation and conclude that the series is autocorrelated.

We also see from the ACF plot that:

  • the ACF decays very slowly - this indicates a trend effect.
  • the ACF plot exhibits a larger correlation at around every \(12^{th}\) lag (at lags 12, 24, 36, etc. the correlation spikes). This is an indication that there is a seasonal (or cyclic) component in our data.

Since we have monthly data, where every \(12^{th}\) lag exhibits a stronger autocorrelation, the seasonal period is \(d = 12\). Usually the seasonal period equals the frequency of the time series.
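
As a quick numerical check, we can look at the sample autocorrelations at the seasonal lags directly (a sketch using the ACF function from statsmodels; the choice of 48 lags is our own):

# Sample autocorrelations at the seasonal lags 12, 24, 36 and 48:
acf_vals = sm.tsa.acf(airpass, nlags = 48, fft = True)
print(pd.Series(acf_vals[[12, 24, 36, 48]], index = [12, 24, 36, 48]))

Large positive autocorrelations at these lags would be consistent with a seasonal period of 12.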

3.2.1.3 Task 3

We again state that the variance of the process (i.e. the fluctuations) appears to be smaller at the beginning and much larger at the end of the time series. This indicates that our series may be multiplicative: \(Y_t = T_t \cdot S_t \cdot E_t\).

Since in practice it is much easier to deal with additive series: \(Y_t = T_t + S_t + E_t\), we will transform the data by taking the logarithms:

log_passengers <- log(airpass)
log_passengers = np.log(airpass)

We can then re-examine the logarithms of the series:

forecast::tsdisplay(log_passengers)

tsdisplay(log_passengers)

The series appears to now have a constant variance - the fluctuations at the beginning appear to be similar to those at the end of the time series.

Considerations

Note that the minimum value in our series is a positive value:

print(min(airpass))
## [1] 104
print(airpass.min())
## 104

Hence, we can take the logarithms of our data.

3.2.2 Exercise Set 2: Time series decomposition without forecasts

3.2.2.1 Task 4

We will begin by examining a simple decomposition method based on averaging: we select the number of values to use for the averaging and “move” the averaging window along time \(t\). This is known as the moving-average method for decomposition.

We will begin by estimating the trend component of the series.

Trend Decomposition via Moving Average

We will provide an example of the moving-average method for decomposition.

If we select the length of the moving average as an odd number, for example \(l = 3\), then:

  • The two-sided moving-average:

\[\widehat{T}_t = \dfrac{Y_{t-1} + Y_t + Y_{t+1}}{3}\]

  • The one-sided moving-average:

\[\widehat{T}_t = \dfrac{Y_{t-2} + Y_{t-1} + Y_t}{3}\]

are calculated for integer times \(t\).
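
For illustration, both versions of the \(l = 3\) average can be reproduced with pandas rolling windows on the log-transformed series from Task 3 (a sketch only - this is not the method used for the decomposition below):

# l = 3 moving averages of the log series:
# Two-sided: average of Y_{t-1}, Y_t, Y_{t+1}
ma_two_sided = log_passengers.rolling(window = 3, center = True).mean()
# One-sided: average of Y_{t-2}, Y_{t-1}, Y_t
ma_one_sided = log_passengers.rolling(window = 3).mean()
print(pd.concat([log_passengers, ma_two_sided, ma_one_sided], axis = 1, keys = ["Y", "Two-sided", "One-sided"]).head())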

If the time series contains a seasonal component and we want to average it out, the length of the moving average must be equal to the seasonal frequency (for a monthly series, the length \(l = 12\)). However, there is a slight hurdle - suppose, our times series begin in January (\(t = 1\)) and we average up to December (\(t = 12\)). This average corresponds to a time \(t = 6.5\) - a time between June and July.

When we come to estimate seasonal effects, we need a moving-average at integer times. This can be achieved by averaging the average of January to December and the average of February (\(t = 2\)) to January (\(t = 13\)). This average of the two moving averages corresponds to \(t = 7\) and the process is called centering.

Thus, the trend at time \(t\) can be estimated by the centered moving average:

  • Two-sided averaging: \[ \begin{aligned} \widehat{T}_t &= \dfrac{(Y_{t-6} + \dots + Y_{t+5})/12 + (Y_{t-5} + \dots + Y_{t+6})/12}{2} = \dfrac{(1/2)Y_{t-6} + Y_{t-5} + \dots + Y_{t+5} + (1/2)Y_{t+6}}{12} \end{aligned} \]

  • One-sided averaging: \[ \begin{aligned} \widehat{T}_t &= \dfrac{(Y_{t-12} + \dots + Y_{t-1})/12 + (Y_{t-11} + \dots + Y_{t})/12}{2} = \dfrac{(1/2)Y_{t-12} + Y_{t-11} + \dots + Y_{t-1} + (1/2)Y_{t}}{12} \end{aligned} \]

Note that the weights are different for the first and last elements used in the averaging. We can use statsmodels.tsa.seasonal.seasonal_decompose to automatically decompose our series.
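
As a sanity check of the centering formula, the two-sided centered average can also be written as a single 13-term weighted rolling window (a sketch with pandas; the weights follow the formula above, and the result should match the two-sided trend estimate below up to rounding):

# The centered 12-month moving average written as a single 13-term weighted window:
weights = np.concatenate(([1/2], np.repeat(1.0, 11), [1/2])) / 12
trend_check = log_passengers.rolling(window = 13, center = True).apply(lambda x: np.sum(x * weights), raw = True)
print(trend_check.head(8))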

We will begin with a two-sided decomposition:

decomposition_1 <- decompose(log_passengers, type = "additive")
trend_1 <- decomposition_1$trend
decomposition_1  = smt.seasonal.seasonal_decompose(log_passengers, 
                        model = "additive", period = 12, two_sided = True)
trend_1 = decomposition_1.trend

Considerations

Note that we can also do the two-sided decomposition using the filter() function:

dec_1_manual <- filter(log_passengers,
                       filter = c(1/2, rep(1, times = frequency(airpass) - 1), 1/2) / frequency(airpass), sides = 2)
dec_1_manual
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug      Sep      Oct      Nov      Dec
## 1949       NA       NA       NA       NA       NA       NA 4.837280 4.841114 4.846596 4.851238 4.854488 4.859954
## 1950 4.869840 4.881389 4.893411 4.904293 4.912752 4.923701 4.940483 4.957406 4.974380 4.991942 5.013095 5.033804
## 1951 5.047776 5.060902 5.073812 5.088378 5.106906 5.124312 5.138282 5.152751 5.163718 5.171454 5.178401 5.189431
## 1952 5.203909 5.218093 5.231553 5.243722 5.257413 5.270736 5.282916 5.292150 5.304079 5.323338 5.343560 5.357427
## 1953 5.367695 5.378309 5.388417 5.397805 5.403849 5.407220 5.410364 5.410294 5.408381 5.406761 5.406218 5.410571
## 1954 5.419628 5.428330 5.435128 5.442237 5.450659 5.461103 5.473655 5.489713 5.503974 5.516367 5.529403 5.542725
## 1955 5.557864 5.572693 5.587498 5.602730 5.616658 5.631189 5.645937 5.659812 5.674172 5.687636 5.700766 5.714738
## 1956 5.727153 5.738856 5.750676 5.760658 5.770846 5.780430 5.788745 5.796524 5.804821 5.814072 5.823075 5.832692
## 1957 5.842665 5.853541 5.864863 5.875490 5.885654 5.894475 5.901555 5.907026 5.910012 5.910708 5.911637 5.913829
## 1958 5.917360 5.922887 5.926146 5.927563 5.929657 5.930458 5.932964 5.938377 5.946188 5.956352 5.967813 5.977291
## 1959 5.985269 5.994078 6.003991 6.014899 6.026589 6.040709 6.054492 6.066195 6.073088 6.080733 6.091930 6.102013
## 1960 6.112511 6.121153 6.128381 6.137437 6.145733 6.151526       NA       NA       NA       NA       NA       NA

which are identical to the decompose() results:

trend_1
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug      Sep      Oct      Nov      Dec
## 1949       NA       NA       NA       NA       NA       NA 4.837280 4.841114 4.846596 4.851238 4.854488 4.859954
## 1950 4.869840 4.881389 4.893411 4.904293 4.912752 4.923701 4.940483 4.957406 4.974380 4.991942 5.013095 5.033804
## 1951 5.047776 5.060902 5.073812 5.088378 5.106906 5.124312 5.138282 5.152751 5.163718 5.171454 5.178401 5.189431
## 1952 5.203909 5.218093 5.231553 5.243722 5.257413 5.270736 5.282916 5.292150 5.304079 5.323338 5.343560 5.357427
## 1953 5.367695 5.378309 5.388417 5.397805 5.403849 5.407220 5.410364 5.410294 5.408381 5.406761 5.406218 5.410571
## 1954 5.419628 5.428330 5.435128 5.442237 5.450659 5.461103 5.473655 5.489713 5.503974 5.516367 5.529403 5.542725
## 1955 5.557864 5.572693 5.587498 5.602730 5.616658 5.631189 5.645937 5.659812 5.674172 5.687636 5.700766 5.714738
## 1956 5.727153 5.738856 5.750676 5.760658 5.770846 5.780430 5.788745 5.796524 5.804821 5.814072 5.823075 5.832692
## 1957 5.842665 5.853541 5.864863 5.875490 5.885654 5.894475 5.901555 5.907026 5.910012 5.910708 5.911637 5.913829
## 1958 5.917360 5.922887 5.926146 5.927563 5.929657 5.930458 5.932964 5.938377 5.946188 5.956352 5.967813 5.977291
## 1959 5.985269 5.994078 6.003991 6.014899 6.026589 6.040709 6.054492 6.066195 6.073088 6.080733 6.091930 6.102013
## 1960 6.112511 6.121153 6.128381 6.137437 6.145733 6.151526       NA       NA       NA       NA       NA       NA

We can plot this decomposition alongside the time series:

#
plot(log_passengers, col = "cornflowerblue")
lines(trend_1, col = "orange")
legend("topleft", legend = c("log of Airpassangers", "Two-sided decomposition of the trend"),
       lty = 1, col = c("cornflowerblue", "orange"))

fig = plt.figure(figsize = (10, 8))
_ = log_passengers.plot(label = "log of Airpassangers")
_ = trend_1.plot(label = "Two-sided decomposition of the trend")
_ = plt.legend()
plt.show()

Since we used a two-sided decomposition, we cannot calculate the trend for the first and last 6 months of our monthly time series.

On the other hand, if we specify a one-sided decomposition:

trend_2 <- filter(log_passengers,
                       filter = c(1/2, rep(1, times = frequency(airpass) - 1), 1/2) / frequency(airpass), sides = 1)
# trend_2
decomposition_2  = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", period = 12, two_sided = False)
trend_2 = decomposition_2.trend
#
plot(log_passengers, col = "cornflowerblue")
lines(trend_2, col = "orange")
legend("topleft", legend = c("log of Airpassangers", "One-sided decomposition of the trend"),
       lty = 1, col = c("cornflowerblue", "orange"))

fig = plt.figure(figsize = (10, 8))
_ = log_passengers.plot(label = "log of Airpassangers")
_ = trend_2.plot(label = "One-sided decomposition of the trend")
_ = plt.legend()
plt.show()

Then we will not be able to calculate the trend for the first 12 months of the series.

Careful!

We would like to stress the importance of correctly selecting the period of the series. If we select a lower period value, then we will begin to capture the seasonal part with our trend filter.

For example, selecting the moving average length \(l = 3\) yields:

period = 3 # an odd number, so we have a different weight formula
trend_3 <- filter(log_passengers, filter = rep(1, period) / 3, sides = 1)
decomposition_3  = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", period = 3, two_sided = False)
trend_3 = decomposition_3.trend
plot(log_passengers, col = "cornflowerblue")
lines(trend_3, col = "orange")
lines(trend_2, col = "darkgreen")
legend("topleft", legend = c("log of Airpassangers", "One-sided decomposition of the trend with period d = 3",
                             "One-sided decomposition of the trend with period d = 12"),
       lty = 1, col = c("cornflowerblue", "orange", "darkgreen"))

fig = plt.figure(figsize = (10, 8))
_ = log_passengers.plot(label = "log of Airpassangers")
_ = trend_3.plot(label = "One-sided decomposition of the trend with d = 3")
_ = trend_2.plot(label = "One-sided decomposition of the trend with period d = 12")
_ = plt.legend()
plt.show()

This can be explained by looking at the formulas for the moving-average decomposition: with a shorter window we use fewer observations around time \(t\), so our estimated trend values will be closer to the actual series values. However, in doing so, we are not estimating the trend component correctly. As such, it is important to correctly determine whether your data exhibits a seasonal component and, if so, to correctly determine the seasonal frequency.

Furthermore, specifying too large of a period frequency length, say \(l = 24\) yields:

period = 24 # an even number - we use an appropriate weight formula
trend_4 <- filter(log_passengers, filter = c(1/2, rep(1, times = period - 1), 1/2) / period, sides = 1)
decomposition_4  = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", period = 24, two_sided = False)
trend_4 = decomposition_4.trend
plot(log_passengers, col = "cornflowerblue")
lines(trend_4, col = "orange")
lines(trend_2, col = "darkgreen")
legend("topleft", legend = c("log of Airpassangers", "One-sided decomposition of the trend with period d = 24",
                             "One-sided decomposition of the trend with period d = 12"),
       lty = 1, col = c("cornflowerblue", "orange", "darkgreen"))

fig = plt.figure(figsize = (10, 8))
_ = log_passengers.plot(label = "log of Airpassangers")
_ = trend_4.plot(label = "One-sided decomposition of the trend with period d = 24")
_ = trend_2.plot(label = "One-sided decomposition of the trend with period d = 12")
_ = plt.legend()
plt.show()

Notice that too large a period may not capture the trend as accurately. Since the averaging window now covers more than one seasonal period, we average over more than one period's worth of values, and hence our trend estimate is less accurate.

Considerations

Manual function specification

We could also try to specify our own function to decompose the trend only:

import math as math

def moving_average_decompose(y, l = 12):
    tmp_index = y.index
    #Time Series:
    y = np.array(y)
    #Trend Component:
    t_two_side = np.zeros(y.size)
    t_one_side = np.zeros(y.size)
    tmp_val = 0
    #Loop through each time point:
    for j in range(0, y.size):
        ###########################################################
        # The length is even - use centered averaging:
        if l % 2 == 0:
            # The centered averaging method weights - see the formula example with l = 12.
            tmp_weights = np.concatenate([np.array([1 / (l * 2)]),
                                          np.repeat(1 / l, l - 1),
                                          np.array([1 / (l * 2)])])
            #Two-sided
            if j + 1 <= l / 2 or j + 1 > y.size - l / 2: 
                # We do not have enough data to the left or right of the series to calculate trend at time t = j:
                t_two_side[j] = np.nan
            else:
                tmp_val = y[int(j - (l / 2)):int(j + (l / 2 + 1))] * tmp_weights
                t_two_side[j] = tmp_val.sum()
            #One-sided
            if j + 1 <= l:
                t_one_side[j] = np.nan
            else:
                tmp_val = y[int(j - l):int(j + 1)] * tmp_weights
                t_one_side[j] = tmp_val.sum()
        ###########################################################
        # The length is odd:
        else:
            #For the non-centered averaging the weights are simply the mean, i.e. the length l.
            #tmp_weights = np.repeat(1 / l, l)
            #Two-sided
            if j + 1 <= math.floor(l / 2) or j + 1 > y.size - math.floor(l / 2):
                # We do not have enough data to the left or right of the series to calculate trend at time t = j:
                t_two_side[j] = np.nan
            else:
                tmp_val = y[int(j - math.floor(l / 2)):int(j + (math.floor(l / 2) + 1))]
                t_two_side[j] = tmp_val.mean()
            
            #One-sided
            # Not enough observations to the left of time t = j:
            if j + 1 < l:
                t_one_side[j] = np.nan
            else:
                tmp_val = y[int(j - l + 1):int(j + 1)]
                t_one_side[j] = tmp_val.mean()
    #Return the same time series:
    t_one_side = pd.Series(t_one_side, name = "One-Sided MA with length = " + str(l))
    t_one_side.index = tmp_index
    t_two_side = pd.Series(t_two_side, name = "Two-Sided MA with length = " + str(l))
    t_two_side.index = tmp_index
    
    return {"Two-sided": t_two_side, "One-sided": t_one_side}

Here we have applied the formulas for calculating the moving average for both odd and even length of the period.

We can test our function as follows:

decomposition_custom_1 = moving_average_decompose(log_passengers)
decomposition_custom_2 = moving_average_decompose(log_passengers, l = 3)
decomposition_custom_3 = moving_average_decompose(log_passengers, l = 24)
fig = plt.figure(figsize = (10, 8))
_ = log_passengers.plot()
_ = decomposition_custom_1["One-sided"].plot()
_ = decomposition_custom_1["Two-sided"].plot()
_ = decomposition_custom_2["Two-sided"].plot()
_ = decomposition_custom_3["One-sided"].plot()
_ = plt.legend()
plt.show()

Furthermore, we may wish to compare it with the built-in function:

comparison = decomposition_custom_1["Two-sided"] - trend_1
print("Two-Sided MA, l = 12: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
## Two-Sided MA, l = 12: max. difference =  0.0 ; min. difference =  -0.0
comparison = decomposition_custom_1["One-sided"] - trend_2
print("One-Sided MA, l = 12: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
## One-Sided MA, l = 12: max. difference =  0.0 ; min. difference =  -0.0
comparison = decomposition_custom_2["One-sided"] - trend_3
print("One-Sided MA, l =  3: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
## One-Sided MA, l =  3: max. difference =  0.0 ; min. difference =  -0.0
comparison = decomposition_custom_3["One-sided"] - trend_4
print("One-Sided MA, l =  24: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
## One-Sided MA, l =  24: max. difference =  0.0 ; min. difference =  -0.0

Note that the smallest and largest differences are approximately zero. This means that our calculations are approximately the same as the ones provided by the built-in functions.

Having estimated the trend, we can move on to estimating the seasonal effect.

Seasonal Effect Decomposition

We will decompose the seasonal effect using both trend averaging methods and examine how the decomposition fits the data. We will employ the following algorithm:

  1. Estimate the trend component \(T_t\) using the moving average method;
  2. Estimate the seasonal component as \(\widehat{S}_t = Y_t - \widehat{T_t}\), \(t = 1,...,T\);
  3. Decide on the seasonal period \(d\) such that: \(S_t = S_{t+d}\) and \(\sum_{t = 1}^d S_t = 0\). Then, calculate the seasonal averages \(\overline{S}_k = \dfrac{1}{N} \sum_{j = 1}^N \widehat{S}_{k + (j-1)d}\), \(k = 1,...,d\), where \(N = \lfloor T / d \rfloor\);
  4. Calculate the total sum: \(S_{tot} = \sum_{k = 1}^d \overline{S}_k\);
  5. Calculate the adjustment factor: \(w = \dfrac{S_{tot}}{d} = \dfrac{1}{d}\sum_{k = 1}^d \overline{S}_k\);
  6. Adjust each seasonal factor: \(\widetilde{S}_k = \overline{S}_k - w\), \(k = 1,...,d\);
  7. We now have estimated the seasonal component \(\widetilde{S}_t\) such that \(\widetilde{S}_t = \widetilde{S}_{t+d}\) and \(\sum_{t= 1}^d \widetilde{S}_t = 0\).
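
A minimal numeric sketch of steps 2-6, using the two-sided trend estimate trend_1 obtained above (a fuller implementation is given in the Considerations at the end of this subsection):

# Step 2: detrend the series using the two-sided trend estimate
s_hat = log_passengers - trend_1
# Step 3: average the detrended values over each season (month), k = 1,...,12
s_bar = s_hat.groupby(s_hat.index.month).mean()
# Steps 4-5: adjustment factor w = S_tot / d
w = s_bar.sum() / 12
# Step 6: adjusted seasonal factors
s_tilde = s_bar - w
print(round(s_tilde, 4))
print("Sum over one period:", round(s_tilde.sum(), 6))

The resulting factors sum to (approximately) zero over one period, as required by step 7.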

We can use the results from the built-in decomposition, which decomposes not only the trend, but the seasonality and random component, and print the different components as follows:

par(mfrow = c(4, 1))
# Plot the series
plot(log_passengers, 
     col = "cornflowerblue",
     main = "Observed")
plot(decomposition_1$trend, 
     col = "cornflowerblue",
     main = "Trend")
plot(decomposition_1$seasonal, 
     col = "cornflowerblue", 
     main = "Seasonal")
plot(decomposition_1$random, 
     col = "cornflowerblue", 
     main = "Residual")

fig, ax = plt.subplots(4, 1, figsize = (10, 8))
# Plot the series
_ = decomposition_1.observed.plot(ax = ax[0])
_ = decomposition_1.trend.plot(ax = ax[1])
_ = decomposition_1.seasonal.plot(ax = ax[2])
_ = decomposition_1.resid.plot(ax = ax[3])
# Add the labels to the Y-axis
_ = ax[0].set_ylabel('Observed')
_ = ax[1].set_ylabel('Trend')
_ = ax[2].set_ylabel('Seasonal')
_ = ax[3].set_ylabel('Residual')
# Fix layout
_ = plt.tight_layout()
plt.show()

However, we would prefer to see how all of these components look alongside one another and the whole time series:

Since the decompose() function in R only performs a two-sided decomposition, we will manually calculate the seasonal component from the filter() function's one-sided trend decomposition results:

# Since the series starts in January 1949 and ends in December 1960, we can create an appropriate matrix:
seas_avg_2 <- colMeans(matrix(log_passengers -  trend_2, ncol = frequency(log_passengers), byrow = TRUE), na.rm = TRUE)
season_2 <- seas_avg_2 - sum(seas_avg_2) / length(seas_avg_2)
legend_args <- list("topleft", legend = c("Log of Airpassangers", "Trend", "Trend + Season"),
                    col = c("cornflowerblue", "darkorange", "forestgreen"), lty = 1, lwd = 2)
#
#
par(mfrow = c(2, 1))
plot(log_passengers, main = "One-sided decomposition", col = "cornflowerblue", lwd = 2)
lines(trend_2, col = "darkorange", lwd = 2)
lines(season_2 + trend_2, col = "forestgreen", lwd = 2)
do.call(legend, legend_args)
#
plot(log_passengers, main = "Two-sided decomposition", col = "cornflowerblue", lwd = 2)
lines(decomposition_1$trend, col = "darkorange", lwd = 2)
lines(decomposition_1$seasonal + decomposition_1$trend, col = "forestgreen", lwd = 2)
do.call(legend, legend_args)

fig = plt.figure(figsize = (10, 8))
#
_ = log_passengers.plot(ax = fig.add_subplot(211), label = "Log of Airpassangers", title = "One-sided decomposition")
_ = decomposition_2.trend.plot(label = "Trend")
_ = (decomposition_2.seasonal + decomposition_2.trend).plot(label = "Trend + Season")
_ = plt.legend()
#
_ = log_passengers.plot(ax = fig.add_subplot(212), label = "Log of Airpassangers", title = "Two-sided decomposition")
_ = decomposition_1.trend.plot(label = "Trend")
_ = (decomposition_1.seasonal + decomposition_1.trend).plot(label = "Trend + Season")
_ = plt.legend()
#
_ = plt.tight_layout()
plt.show()

The decomposed trend plus seasonality fits the data much better than the trend component alone. Furthermore, the two-sided decomposition appears to fit the data more closely than the one-sided method. The downside is that we do not have the trend - and consequently the trend + seasonality - for the first and last 6 months of the series.

Finally, we can check if the remainder is stationary:

par(mfrow = c(1, 2))
plot(log_passengers - trend_2 - season_2, 
     main = "One-sided decomposition Residuals", col = "cornflowerblue", lwd = 2)
plot(decomposition_1$random, 
     main = "Two-sided decomposition Residuals", col = "cornflowerblue", lwd = 2)

fig = plt.figure(figsize = (15, 8))
_ = decomposition_2.resid.plot(ax = fig.add_subplot(121), title = "One-sided decomposition Residuals")
_ = decomposition_1.resid.plot(ax = fig.add_subplot(122), title = "Two-sided decomposition Residuals")
_ = plt.tight_layout()
plt.show()

We note that the residuals might not be stationary - we can compare the variances from different time periods:

print(paste0("Two-sided decomp. Variance for [1949-01-01 - 1954-01-01]: ", round(var(window(decomposition_1$random, start = c(1949, 1), end = c(1954, 1)), na.rm = TRUE), 6)))
## [1] "Two-sided decomp. Variance for [1949-01-01 - 1954-01-01]: 0.001534"
print(paste0("Two-sided decomp. Variance for [1954-02-01 - 1960-12-01]: ", round(var(window(decomposition_1$random, start = c(1954, 2), end = c(1960, 12)), na.rm = TRUE), 6)))
## [1] "Two-sided decomp. Variance for [1954-02-01 - 1960-12-01]: 0.000845"
print(paste0("One-sided decomp. Variance for [1954-02-01 - 1960-12-01]: ", round(var(window(log_passengers - trend_2 - season_2, start = c(1949, 1), end = c(1954, 1)), na.rm = TRUE), 6)))
## [1] "One-sided decomp. Variance for [1954-02-01 - 1960-12-01]: 0.002611"
print(paste0("One-sided decomp. Variance for [1954-02-01 - 1960-12-01]: ", round(var(window(log_passengers - trend_2 - season_2, start = c(1954, 2), end = c(1960, 12)), na.rm = TRUE), 6)))
## [1] "One-sided decomp. Variance for [1954-02-01 - 1960-12-01]: 0.001329"
print("Two-sided decomp. Variance for [1949-01-01 - 1954-01-01]: ", round(decomposition_1.resid.loc['19490101':'19540101'].var(), 6))
## Two-sided decomp. Variance for [1949-01-01 - 1954-01-01]:  0.001534
print("Two-sided decomp. Variance for [1954-02-01 - 1960-12-01]: ", round(decomposition_1.resid.loc['19540201':'19601201'].var(), 6))
## Two-sided decomp. Variance for [1954-02-01 - 1960-12-01]:  0.000845
print("One-sided decomp. Variance for [1949-01-01 - 1954-01-01]: ", round(decomposition_2.resid.loc['19490101':'19540101'].var(), 6))
## One-sided decomp. Variance for [1949-01-01 - 1954-01-01]:  0.002611
print("One-sided decomp. Variance for [1954-02-01 - 1960-12-01]: ", round(decomposition_2.resid.loc['19540201':'19601201'].var(), 6))
## One-sided decomp. Variance for [1954-02-01 - 1960-12-01]:  0.001329

The residuals in the second half of the time series appear to have a smaller variance. (Note: it would be best to carry out a formal test for the equality of variances of the two subsets, as sketched below.)
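
For instance, such a test could be carried out with Levene's test from scipy (a sketch, assuming scipy is installed; the split point mirrors the one used above):

from scipy import stats
# Split the two-sided decomposition residuals into the two sub-periods and drop missing values:
resid_early = decomposition_1.resid.loc['19490101':'19540101'].dropna()
resid_late = decomposition_1.resid.loc['19540201':'19601201'].dropna()
# H0: the two subsets have equal variances
print(stats.levene(resid_early, resid_late))

A small p-value would indicate that the variances of the two sub-periods differ.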

Let’s also look at the residual correlation plots:

par(mfcol = c(2, 2), mar = c(5, 5, 3, 1))
forecast::Acf(decomposition_1$random, 
              main = "ACF of Two-sided decomposition Residuals")
forecast::Pacf(decomposition_1$random, 
               main = "PACF of Two-sided decomposition Residuals")
forecast::Acf(log_passengers - trend_2 - season_2, 
              main = "ACF of One-sided decomposition Residuals")
forecast::Pacf(log_passengers - trend_2 - season_2, 
               main = "PACF of One-sided decomposition Residuals")

fig = plt.figure(figsize = (10, 8))
fig = sm.graphics.tsa.plot_acf(decomposition_1.resid.dropna(), zero = False, lags=40, ax = fig.add_subplot(221), title = "ACF of Two-sided decomposition Residuals")
fig = sm.graphics.tsa.plot_acf(decomposition_2.resid.dropna(), zero = False, lags=40, ax = fig.add_subplot(222), title = "ACF of One-sided decomposition Residuals")
#
fig = sm.graphics.tsa.plot_pacf(decomposition_1.resid.dropna(), zero = False, lags=40, ax = fig.add_subplot(223), title = "PACF of Two-sided decomposition Residuals")
fig = sm.graphics.tsa.plot_pacf(decomposition_2.resid.dropna(), zero = False, lags=40, ax = fig.add_subplot(224), title = "PACF of One-sided decomposition Residuals")
#
_ = plt.tight_layout()
plt.show()

The residual series is not White Noise, so we would need to try to estimate an \(ARMA\) model for it, as long as the residuals are stationary. Unfortunately, it seems that the two-sided decomposition still leaves a seasonality effect, as shown by the oscillating residual ACF plot.
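
For example, a low-order ARMA model for the residuals could be specified along the following lines (a sketch only; the order (2, 1) is an arbitrary choice rather than the result of order selection, and we assume a statsmodels version where sm.tsa.ARIMA refers to statsmodels.tsa.arima.model.ARIMA):

# Fit an ARMA(2, 1) model to the two-sided decomposition residuals:
resid_model = sm.tsa.ARIMA(decomposition_1.resid.dropna(), order = (2, 0, 1)).fit()
print(resid_model.summary().tables[1])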

Looking at the residual plots from this decomposition, we note that the residuals do not appear to be stationary for the two-sided decomposition method on this data - the variance is smaller in the middle of the data compared to the beginning of the time series. The same appears to hold for the one-sided decomposition residuals.

We can see that this decomposition method does not fully eliminate the deterministic components from this specific time series data sample.

Note that this method does not allow us to produce forecasts.

Considerations

Manual function specification

Since we have already calculated the trend, we can implement the rest of the procedure (under the assumption that the time series starts from the first season, i.e. January for monthly data) as follows:

def decomp_seas(y, trend, period = 12, title = ""):
    #Initial seasonal component estimation:
    seas_0 = y - trend
    seas_1 = np.zeros(period)
    #Seasonal component averaging:
    for j in range(0, period):
        #Select every j-th month from each year in the dataset:
        seas_1[j] = seas_0[j::period].mean()
    #Adjustment factor:
    w = seas_1.mean()
    seas_1 = seas_1 - w
    #Save the seasonal component in a time series format:
    seasonal_results = np.zeros(y.size)
    for j in range(0, period):
        #Select every j-th month from each year in the dataset:
        seasonal_results[j::period] = seas_1[j]
    
    seasonal_results = pd.Series(seasonal_results, name = "Seasonal " + title)
    seasonal_results.index = trend.index
    return(seasonal_results)

We can now test it and estimate the seasonal component for both one-sided and two-sided trend decomposition:

seas_results = {"One-sided": decomp_seas(y = log_passengers, trend = decomposition_custom_1["One-sided"], period = 12, title = "(One-sided)"),
                "Two-sided": decomp_seas(y = log_passengers, trend = decomposition_custom_1["Two-sided"], period = 12, title = "(Two-sided)")}

We can compare how the seasonal components differ for one-sided and two-sided methods:

fig = plt.figure(figsize = (10, 8))
_ = seas_results["One-sided"].plot()
_ = seas_results["Two-sided"].plot()
_ = plt.legend()
_ = plt.tight_layout()
plt.show()

Note that by construction of the algorithm \(\widetilde{S}_t = \widetilde{S}_{t+d}\), meaning we have the seasonal component decomposed for the whole series, unlike the one-sided and two-sided decomposition of the trend component.

Furthermore, we can add the seasonality component to the trend component to get the overall deterministic component \(\widehat{T}_t + \widetilde{S}_t\):

T_S = {"One-sided": decomposition_custom_1["One-sided"] + seas_results["One-sided"], "Two-sided": decomposition_custom_1["Two-sided"] + seas_results["Two-sided"]}
T_S["One-sided"].name = "Trend + Season"
T_S["Two-sided"].name = "Trend + Season"

Finally, we want to see how all of these components look alongside one another and the whole time series:

fig = plt.figure(figsize = (10, 8))
_ = log_passengers.plot(ax = fig.add_subplot(211), label = "Log of Airpassangers", title = "One-sided decomposition")
_ = decomposition_custom_1["One-sided"].plot()
_ = T_S["One-sided"].plot()
_ = plt.legend()
#
_ = log_passengers.plot(ax = fig.add_subplot(212), label = "Log of Airpassangers", title = "Two-sided decomposition")
_ = decomposition_custom_1["Two-sided"].plot()
_ = T_S["Two-sided"].plot()
_ = plt.legend()
#
_ = plt.tight_layout()
plt.show()

We note that the manually estimated seasonal component does not differ from the one estimated by the built-in function:

comparison = seas_results["Two-sided"] - decomposition_1.seasonal
print("Two-Sided MA, l = 12: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
## Two-Sided MA, l = 12: max. difference =  0.0 ; min. difference =  -0.0
comparison = seas_results["One-sided"] - decomposition_2.seasonal
print("One-Sided MA, l = 12: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
## One-Sided MA, l = 12: max. difference =  0.0 ; min. difference =  -0.0

So, the manual estimation method is identical to the built-in method provided in statsmodels.

As with the built-in functions, we can manually examine the remainder:

fig = plt.figure(figsize = (15, 4))
_ = (log_passengers - T_S["One-sided"]).plot(ax = fig.add_subplot(121), title = "One-sided decomposition Residuals")
_ = (log_passengers - T_S["Two-sided"]).plot(ax = fig.add_subplot(122), title = "Two-sided decomposition Residuals")
_ = plt.tight_layout()
plt.show()

3.2.3 Exercise Set 3: Time series decomposition with forecasts

We will now examine a number of alternative decomposition methods, which allow us to forecast the series as well as decompose it. These methods are usually more robust and can capture seasonality and trend effects that may not be fully accounted for by the moving-average decomposition.

3.2.3.1 Task 5

The OLS method estimates coefficients of an additive model. If needed, we can also specify not only a linear but also a polynomial trend. We will estimate three different models via OLS:

  • Model 1: \(\text{AP}_t = \beta_0 + \beta_1 t + \beta_2 t^2 + \epsilon_t\)
  • Model 2: \(\text{AP}_t = \beta_0 + \beta_1 t + \beta_2 t^2 + \gamma_1 \text{dm}_{1,t} + ... + \gamma_{11} \text{dm}_{11,t} + \epsilon_t\)
  • Model 3: \(\log(\text{AP}_t) = \beta_0 + \beta_1 t + \beta_2 t^2 + \gamma_1 \text{dm}_{1,t} + ... + \gamma_{11} \text{dm}_{11,t} + \epsilon_t\)

where:

  • \(\text{AP}_t\) is the monthly airline passenger numbers, 1949-1960;
  • \(t\) is the trend, \(t = \text{1949/01/01}, \text{1949/02/01},..., \text{1960/12/01}\). Equivalently, we can specify \(t = 1,..., 144\) - the only difference would be the magnitude of the coefficients \(\beta_1, \beta_2\);
  • \(\text{dm}_{k,t}\), \(k = 1,2,...,11\), are dummy variables indicating the month of time \(t\).

Recall, that in order to avoid the dummy variable trap (a scenario in which the independent variables are multicollinear (highly correlated)) we have to exclude one dummy variable - in this case \(\text{dm}_{12,t}\) - from our regression models.

We begin by examining the initial (non-logarithmized) data once more:

tsdisplay(airpass)

tsdisplay(airpass)

  • From the ACF's slow, wavy decay it seems that we have a trend and a seasonal component.
  • From the time series plot it seems that the magnitude of the seasonal variance increases as time increases.

So the series appears to grow either exponentially or polynomially over time. We need to create additional explanatory variables:

AP <- airpass
AP = airpass
  • the trend component
# Create trend
time <- 1:length(airpass)
time_sq <- time^2
# Create trend
time = np.array(range(1, len(airpass.index) + 1))
time_sq = time ** 2
  • the seasonal indicator variables for January - November
dm <- forecast::seasonaldummy(airpass)
dm = pd.get_dummies(airpass.index.month)
print(head(dm, 6))
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## [1,]   1   0   0   0   0   0   0   0   0   0   0
## [2,]   0   1   0   0   0   0   0   0   0   0   0
## [3,]   0   0   1   0   0   0   0   0   0   0   0
## [4,]   0   0   0   1   0   0   0   0   0   0   0
## [5,]   0   0   0   0   1   0   0   0   0   0   0
## [6,]   0   0   0   0   0   1   0   0   0   0   0
print(dm.head(6))
##    1   2   3   4   5   6   7   8   9   10  11  12
## 0   1   0   0   0   0   0   0   0   0   0   0   0
## 1   0   1   0   0   0   0   0   0   0   0   0   0
## 2   0   0   1   0   0   0   0   0   0   0   0   0
## 3   0   0   0   1   0   0   0   0   0   0   0   0
## 4   0   0   0   0   1   0   0   0   0   0   0   0
## 5   0   0   0   0   0   1   0   0   0   0   0   0

We also need to drop the last column to exclude the last month, December, so as to avoid the dummy variable trap:

dm = dm.iloc[:, :-1]
print(dm.head(6))
##    1   2   3   4   5   6   7   8   9   10  11
## 0   1   0   0   0   0   0   0   0   0   0   0
## 1   0   1   0   0   0   0   0   0   0   0   0
## 2   0   0   1   0   0   0   0   0   0   0   0
## 3   0   0   0   1   0   0   0   0   0   0   0
## 4   0   0   0   0   1   0   0   0   0   0   0
## 5   0   0   0   0   0   1   0   0   0   0   0

Considerations

This can also be done manually:

#Create a dummy variable matrix of zeroes:
dm_manual = np.column_stack(np.zeros((11, airpass.size)))
#Create a DataFrame
dm_manual = pd.DataFrame(dm_manual)
#Add values to dummy variables:
for j in range(0, len(dm_manual.columns)):
    #Select every j-th month from each year in the dataset:
    dm_manual.iloc[j::12, j] = 1
print(dm_manual.head())
##     0    1    2    3    4    5    6    7    8    9    10
## 0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
## 1  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
## 2  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
## 3  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
## 4  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0

Finally, we can combine everything into one data matrix:

#
#Create a DataFrame
dtf <- data.frame(airpass, time, time_sq, dm)
#Rename the columns:
colnames(dtf) <- c("AP", "time", "time_sq", paste0("dm_", 1:ncol(dm)))
#
print(head(dtf, 13))
##     AP time time_sq dm_1 dm_2 dm_3 dm_4 dm_5 dm_6 dm_7 dm_8 dm_9 dm_10 dm_11
## 1  112    1       1    1    0    0    0    0    0    0    0    0     0     0
## 2  118    2       4    0    1    0    0    0    0    0    0    0     0     0
## 3  132    3       9    0    0    1    0    0    0    0    0    0     0     0
## 4  129    4      16    0    0    0    1    0    0    0    0    0     0     0
## 5  121    5      25    0    0    0    0    1    0    0    0    0     0     0
## 6  135    6      36    0    0    0    0    0    1    0    0    0     0     0
## 7  148    7      49    0    0    0    0    0    0    1    0    0     0     0
## 8  148    8      64    0    0    0    0    0    0    0    1    0     0     0
## 9  136    9      81    0    0    0    0    0    0    0    0    1     0     0
## 10 119   10     100    0    0    0    0    0    0    0    0    0     1     0
## 11 104   11     121    0    0    0    0    0    0    0    0    0     0     1
## 12 118   12     144    0    0    0    0    0    0    0    0    0     0     0
## 13 115   13     169    1    0    0    0    0    0    0    0    0     0     0
dtf = np.column_stack((airpass, time, time_sq, dm))
#Create a DataFrame
dtf = pd.DataFrame(dtf)
#Rename the columns:
dtf.columns = ["AP"] + ["time"] + ["time_sq"] + ["dm_" + str(number) for number in range(1, len(dm.columns) + 1)]
dtf.index = [str(number) for number in range(1, len(dtf.index) + 1)]
dtf.head(13)
##      AP  time  time_sq  dm_1  dm_2  dm_3  dm_4  dm_5  dm_6  dm_7  dm_8  dm_9  dm_10  dm_11
## 1   112     1        1     1     0     0     0     0     0     0     0     0      0      0
## 2   118     2        4     0     1     0     0     0     0     0     0     0      0      0
## 3   132     3        9     0     0     1     0     0     0     0     0     0      0      0
## 4   129     4       16     0     0     0     1     0     0     0     0     0      0      0
## 5   121     5       25     0     0     0     0     1     0     0     0     0      0      0
## 6   135     6       36     0     0     0     0     0     1     0     0     0      0      0
## 7   148     7       49     0     0     0     0     0     0     1     0     0      0      0
## 8   148     8       64     0     0     0     0     0     0     0     1     0      0      0
## 9   136     9       81     0     0     0     0     0     0     0     0     1      0      0
## 10  119    10      100     0     0     0     0     0     0     0     0     0      1      0
## 11  104    11      121     0     0     0     0     0     0     0     0     0      0      1
## 12  118    12      144     0     0     0     0     0     0     0     0     0      0      0
## 13  115    13      169     1     0     0     0     0     0     0     0     0      0      0

To make it a bit easier to specify the models, we will define the following formulas:

formula_2 <- paste0("AP ~ ", paste0(grep("AP", colnames(dtf), invert = TRUE, value = TRUE), collapse = " + "))
print(formula_2)
## [1] "AP ~ time + time_sq + dm_1 + dm_2 + dm_3 + dm_4 + dm_5 + dm_6 + dm_7 + dm_8 + dm_9 + dm_10 + dm_11"
formula_2 = "AP ~ " + " + ".join(dtf.columns.intersection(dtf.columns.difference(["AP"])))
print(formula_2)
## AP ~ time + time_sq + dm_1 + dm_2 + dm_3 + dm_4 + dm_5 + dm_6 + dm_7 + dm_8 + dm_9 + dm_10 + dm_11
formula_3 <- paste0("log(AP) ~ ", paste0(grep("AP", colnames(dtf), invert = TRUE, value = TRUE), collapse = " + "))
print(formula_3)
## [1] "log(AP) ~ time + time_sq + dm_1 + dm_2 + dm_3 + dm_4 + dm_5 + dm_6 + dm_7 + dm_8 + dm_9 + dm_10 + dm_11"
formula_3 = "np.log(AP) ~ " + " + ".join(dtf.columns.intersection(dtf.columns.difference(["AP"])))
print(formula_3)
## np.log(AP) ~ time + time_sq + dm_1 + dm_2 + dm_3 + dm_4 + dm_5 + dm_6 + dm_7 + dm_8 + dm_9 + dm_10 + dm_11

Now we can estimate the models.

  • Model 1: \(\text{AP}_t = \beta_0 + \beta_1 t + \beta_2 t^2 + \epsilon_t\)
AP_OLS_1 <- lm("AP ~ time + time_sq", data = dtf)
print(round(coef(summary(AP_OLS_1)), 3))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  112.380     11.384   9.872    0.000
## time           1.641      0.362   4.527    0.000
## time_sq        0.007      0.002   2.894    0.004
AP_OLS_1 = sm.formula.ols(formula = 'AP ~ time + time_sq', data = dtf)
print(AP_OLS_1.fit().summary().tables[1])
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## Intercept    112.3800     11.384      9.872      0.000      89.874     134.886
## time           1.6410      0.362      4.527      0.000       0.924       2.358
## time_sq        0.0070      0.002      2.894      0.004       0.002       0.012
## ==============================================================================

The first, quadratic model has all significant coefficients.

  • Model 2: \(\text{AP}_t = \beta_0 + \beta_1 t + \beta_2 t^2 + \gamma_1 \text{dm}_{1,t} + ... + \gamma_{11} \text{dm}_{11,t} + \epsilon_t\)
AP_OLS_2 <- lm(formula_2, data = dtf)
print(round(coef(summary(AP_OLS_2)), 3))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   79.378      9.004   8.816    0.000
## time           1.626      0.192   8.478    0.000
## time_sq        0.007      0.001   5.573    0.000
## dm_1           9.180      9.709   0.946    0.346
## dm_2          -0.159      9.706  -0.016    0.987
## dm_3          32.405      9.704   3.339    0.001
## dm_4          26.704      9.702   2.752    0.007
## dm_5          28.822      9.700   2.971    0.004
## dm_6          66.009      9.699   6.806    0.000
## dm_7         103.016      9.697  10.623    0.000
## dm_8         100.091      9.696  10.323    0.000
## dm_9          48.736      9.695   5.027    0.000
## dm_10         10.199      9.695   1.052    0.295
## dm_11        -26.268      9.694  -2.710    0.008
AP_OLS_2 = sm.formula.ols(formula = formula_2, data = dtf)
print(AP_OLS_2.fit().summary().tables[1])
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## Intercept     79.3775      9.004      8.816      0.000      61.564      97.191
## time           1.6255      0.192      8.478      0.000       1.246       2.005
## time_sq        0.0071      0.001      5.573      0.000       0.005       0.010
## dm_1           9.1803      9.709      0.946      0.346     -10.027      28.388
## dm_2          -0.1587      9.706     -0.016      0.987     -19.361      19.044
## dm_3          32.4048      9.704      3.339      0.001      13.207      51.603
## dm_4          26.7039      9.702      2.752      0.007       7.510      45.898
## dm_5          28.8221      9.700      2.971      0.004       9.631      48.013
## dm_6          66.0094      9.699      6.806      0.000      46.822      85.197
## dm_7         103.0157      9.697     10.623      0.000      83.831     122.201
## dm_8         100.0911      9.696     10.323      0.000      80.908     119.274
## dm_9          48.7356      9.695      5.027      0.000      29.554      67.917
## dm_10         10.1991      9.695      1.052      0.295      -8.981      29.379
## dm_11        -26.2683      9.694     -2.710      0.008     -45.448      -7.089
## ==============================================================================

While the January, February and October dummy variables are not statistically significantly different from zero (i.e. these months do not differ significantly from the baseline month, December), most of the remaining dummy variables are statistically significant, so we can conclude that the seasonal effect is significant in our data.

  • Model 3: \(\log(\text{AP}_t) = \beta_0 + \beta_1 t + \beta_2 t^2 + \gamma_1 \text{dm}_{1,t} + ... + \gamma_{11} \text{dm}_{11,t} + \epsilon_t\)
AP_OLS_3 <- lm(formula_3, data = dtf)
print(round(coef(summary(AP_OLS_3)), 3))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    4.630      0.018 253.331    0.000
## time           0.013      0.000  33.877    0.000
## time_sq        0.000      0.000  -8.265    0.000
## dm_1           0.021      0.020   1.082    0.281
## dm_2          -0.001      0.020  -0.048    0.962
## dm_3           0.129      0.020   6.555    0.000
## dm_4           0.098      0.020   4.962    0.000
## dm_5           0.095      0.020   4.838    0.000
## dm_6           0.217      0.020  11.041    0.000
## dm_7           0.321      0.020  16.323    0.000
## dm_8           0.312      0.020  15.855    0.000
## dm_9           0.167      0.020   8.511    0.000
## dm_10          0.029      0.020   1.497    0.137
## dm_11         -0.114      0.020  -5.797    0.000
AP_OLS_3 = sm.formula.ols(formula = formula_3, data = dtf)
print(AP_OLS_3.fit().summary().tables[1])
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## Intercept      4.6301      0.018    253.331      0.000       4.594       4.666
## time           0.0132      0.000     33.877      0.000       0.012       0.014
## time_sq    -2.148e-05    2.6e-06     -8.265      0.000   -2.66e-05   -1.63e-05
## dm_1           0.0213      0.020      1.082      0.281      -0.018       0.060
## dm_2          -0.0009      0.020     -0.048      0.962      -0.040       0.038
## dm_3           0.1291      0.020      6.555      0.000       0.090       0.168
## dm_4           0.0977      0.020      4.962      0.000       0.059       0.137
## dm_5           0.0953      0.020      4.838      0.000       0.056       0.134
## dm_6           0.2174      0.020     11.041      0.000       0.178       0.256
## dm_7           0.3213      0.020     16.323      0.000       0.282       0.360
## dm_8           0.3120      0.020     15.855      0.000       0.273       0.351
## dm_9           0.1675      0.020      8.511      0.000       0.129       0.206
## dm_10          0.0295      0.020      1.497      0.137      -0.009       0.068
## dm_11         -0.1141      0.020     -5.797      0.000      -0.153      -0.075
## ==============================================================================

In the final model we have “linearized” the time series by taking the logarithms. We see that we come to the same conclusions regarding coefficient significance as in the second model.

To check whether our models adequately capture the trend and seasonality, we use tsdisplay:

tsdisplay(AP_OLS_1$residuals, main = "Time Series: Model 1 residuals")

tsdisplay(AP_OLS_1.fit().resid, title = "Model\ 1\ residuals")

We clearly see that Model 1 does not capture the seasonality effects in our series - they remain in the model residuals.

tsdisplay(AP_OLS_2$residuals, main = "Time Series: Model 2 residuals")

tsdisplay(AP_OLS_2.fit().resid, title = "Model\ 2\ residuals")

While the middle of the series looks somewhat better, the beginning and end of the series still exhibit clear seasonal patterns (also visible in the residual ACF plot). Again, the seasonality remains in the residuals.

Finally, our third model:

tsdisplay(AP_OLS_3$residuals, main = "Time Series: Model 3 residuals")

tsdisplay(AP_OLS_3.fit().resid, title = "Model\ 3\ residuals")

An improvement! The ACF does not show a clear seasonality pattern as before. Just to be sure - we plot the ACF for more lags:

forecast::Acf(AP_OLS_3$residuals, lag.max = 50, xaxt = 'n')
axis(side = 1, at = 1:50, labels = 1:50)

fig = plt.figure(figsize = (10, 8))
_ = sm.graphics.tsa.plot_acf(AP_OLS_3.fit().resid, lags = 50, zero = False)
_ = plt.xticks(np.arange(1,  50 + 1, 1.0))
plt.show()

So, while the ACF bars show some oscillation, most of the seasonal pattern appears to be insignificant. We can conclude that Model 3 adequately captures the trend and seasonal patterns in our dataset.

Finally, the fitted values can be calculated as:

fit_1 <- AP_OLS_1$fitted.values
fit_2 <- AP_OLS_2$fitted.values
fit_3 <- AP_OLS_3$fitted.values
# In Python we store the fitted model objects; the fitted values themselves
# are available via fit_i.fittedvalues (or fit_i.predict() with no arguments):
fit_1 = AP_OLS_1.fit()
fit_2 = AP_OLS_2.fit()
fit_3 = AP_OLS_3.fit()

Considerations

It would be interesting to predict the series with each model. For that, we need to create a new sample of explanatory variables to predict and plot.

An important distinction for time series forecasting is that we need the future values of the exogenous variables. Thankfully, we have only used deterministic variables - the time index and the months - hence we know their exact values in the future. So, we construct the exogenous variable matrix, which we will use for forecasting.

Assume that we want to forecast \(h = 20\) steps into the future:

h <- 20
h = 20

We will begin by creating a time series index vector, which we will transform into the appropriate seasonal dummy variables:

forecast_date <- ts(data = NA, start = end(airpass) + c(0, 1),
                    end = end(airpass) + c(0, h), frequency = frequency(airpass))
#
print(time(forecast_date))
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug      Sep      Oct      Nov      Dec
## 1961 1961.000 1961.083 1961.167 1961.250 1961.333 1961.417 1961.500 1961.583 1961.667 1961.750 1961.833 1961.917
## 1962 1962.000 1962.083 1962.167 1962.250 1962.333 1962.417 1962.500 1962.583
forecast_date = pd.date_range(start = airpass.index[-1].to_pydatetime() + pd.DateOffset(months = 1), 
                              periods = h, freq = airpass.index.freq)
forecast_date = forecast_date.to_period().to_timestamp()
print(forecast_date)                                     
## DatetimeIndex(['1961-01-01', '1961-02-01', '1961-03-01', '1961-04-01',
##                '1961-05-01', '1961-06-01', '1961-07-01', '1961-08-01',
##                '1961-09-01', '1961-10-01', '1961-11-01', '1961-12-01',
##                '1962-01-01', '1962-02-01', '1962-03-01', '1962-04-01',
##                '1962-05-01', '1962-06-01', '1962-07-01', '1962-08-01'],
##               dtype='datetime64[ns]', freq='MS')

Note that the above R code works when the frequency is greater than 1. Adding c(0, 1) means that we are adding 0 years and 1 period into the future. For annual data we would instead add 1, rather than c(0, 1). The same applies to the end of the series.

Note that the above Python code works for monthly data only: for quarterly data we should add pd.DateOffset(months = 3), for daily data pd.DateOffset(days = 1), and for annual data pd.DateOffset(years = 1) to get the first date of the forecast horizon. See the documentation of pandas.tseries.offsets.DateOffset for more information. In Python, we can simply specify the number of periods that we want, \(h\), instead of the end date.
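
For illustration, here is a small hypothetical quarterly example (the toy series y_q below is not part of our data) showing how the offset and the frequency change together:

# Hypothetical quarterly series, used only to illustrate the offset logic:
y_q = pd.Series(range(8), index = pd.date_range("2019-01-01", periods = 8, freq = "QS"))
# The first forecast date is one quarter (3 months) after the last observation:
forecast_start_q = y_q.index[-1].to_pydatetime() + pd.DateOffset(months = 3)
print(pd.date_range(start = forecast_start_q, periods = 4, freq = y_q.index.freq))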

The above code allows us to create the seasonal dummy variables:

dm_forc <- forecast::seasonaldummy(forecast_date)
#
print(head(dm_forc, 6))
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## [1,]   1   0   0   0   0   0   0   0   0   0   0
## [2,]   0   1   0   0   0   0   0   0   0   0   0
## [3,]   0   0   1   0   0   0   0   0   0   0   0
## [4,]   0   0   0   1   0   0   0   0   0   0   0
## [5,]   0   0   0   0   1   0   0   0   0   0   0
## [6,]   0   0   0   0   0   1   0   0   0   0   0
dm_forc = pd.get_dummies(forecast_date.month)
dm_forc = dm_forc.iloc[:, :-1]
print(dm_forc.head(6))
##    1   2   3   4   5   6   7   8   9   10  11
## 0   1   0   0   0   0   0   0   0   0   0   0
## 1   0   1   0   0   0   0   0   0   0   0   0
## 2   0   0   1   0   0   0   0   0   0   0   0
## 3   0   0   0   1   0   0   0   0   0   0   0
## 4   0   0   0   0   1   0   0   0   0   0   0
## 5   0   0   0   0   0   1   0   0   0   0   0

Note the following:

  • For this dataset, the series starts at January.
  • Since there are 144 observations and the frequency is \(d = 12\), we have \(144 / 12 = 12\) full periods.

For this dataset the forecasts start at January; however, we could have cases where the historical data does not end at the end of the year. In such cases we would also forecast the remainder of the current year. If need be, the above code can be modified for different starting months, or adapted for different frequencies: weekly, quarterly, annual data, etc.

Consequently, the first forecast at \(T+1 = 145\) will be for January - this means that \(\text{dm_1}_{T+1} = 1\) (other \(dm\) variables are zero). For \(T+2\) \(\text{dm_2}_{T+2} = 1\) (other \(dm\) variables are zero) and so on.
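
We can verify this alignment directly from the forecast_date index created above (a quick sanity check):

# The month of the first forecast date determines which seasonal dummy equals 1:
print(forecast_date[0], "-> month:", forecast_date[0].month)  # January, so dm_1 = 1 at T+1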

We also need to create the appropriate time and time_sq variables to reflect the future values of time:

time_forc <- (nrow(dtf) + 1):(nrow(dtf) + h)
time_forc_sq <- time_forc^2
time_forc = np.array(range(len(dtf.index) + 1, len(dtf.index) + h + 1))
time_forc_sq = time_forc**2

We can verify that the trend variable is appropriately created:

print(time_forc)
##  [1] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
print(time_forc)
## [145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
##  163 164]

Finally, we can combine the results into a dataset, which we will use to predict the future values:

#Create a DataFrame
dtf_forc <- data.frame(time_forc, time_forc_sq, dm_forc)
#Rename the columns:
colnames(dtf_forc) <- c("time", "time_sq", paste0("dm_", 1:ncol(dm_forc)))
#
print(head(dtf_forc, 13))
##    time time_sq dm_1 dm_2 dm_3 dm_4 dm_5 dm_6 dm_7 dm_8 dm_9 dm_10 dm_11
## 1   145   21025    1    0    0    0    0    0    0    0    0     0     0
## 2   146   21316    0    1    0    0    0    0    0    0    0     0     0
## 3   147   21609    0    0    1    0    0    0    0    0    0     0     0
## 4   148   21904    0    0    0    1    0    0    0    0    0     0     0
## 5   149   22201    0    0    0    0    1    0    0    0    0     0     0
## 6   150   22500    0    0    0    0    0    1    0    0    0     0     0
## 7   151   22801    0    0    0    0    0    0    1    0    0     0     0
## 8   152   23104    0    0    0    0    0    0    0    1    0     0     0
## 9   153   23409    0    0    0    0    0    0    0    0    1     0     0
## 10  154   23716    0    0    0    0    0    0    0    0    0     1     0
## 11  155   24025    0    0    0    0    0    0    0    0    0     0     1
## 12  156   24336    0    0    0    0    0    0    0    0    0     0     0
## 13  157   24649    1    0    0    0    0    0    0    0    0     0     0
dtf_forc = np.column_stack((time_forc, time_forc_sq, dm_forc))
#Create a DataFrame
dtf_forc = pd.DataFrame(dtf_forc)
#Rename the columns:
dtf_forc.columns = ["time"] + ["time_sq"] + ["dm_" + str(number) for number in range(1, len(dm_forc.columns) + 1)]
dtf_forc.head(13)
##     time  time_sq  dm_1  dm_2  dm_3  dm_4  dm_5  dm_6  dm_7  dm_8  dm_9  dm_10  dm_11
## 0    145    21025     1     0     0     0     0     0     0     0     0      0      0
## 1    146    21316     0     1     0     0     0     0     0     0     0      0      0
## 2    147    21609     0     0     1     0     0     0     0     0     0      0      0
## 3    148    21904     0     0     0     1     0     0     0     0     0      0      0
## 4    149    22201     0     0     0     0     1     0     0     0     0      0      0
## 5    150    22500     0     0     0     0     0     1     0     0     0      0      0
## 6    151    22801     0     0     0     0     0     0     1     0     0      0      0
## 7    152    23104     0     0     0     0     0     0     0     1     0      0      0
## 8    153    23409     0     0     0     0     0     0     0     0     1      0      0
## 9    154    23716     0     0     0     0     0     0     0     0     0      1      0
## 10   155    24025     0     0     0     0     0     0     0     0     0      0      1
## 11   156    24336     0     0     0     0     0     0     0     0     0      0      0
## 12   157    24649     1     0     0     0     0     0     0     0     0      0      0

Note that in Python the index starts at zero. This would be problematic when plotting the forecasts, hence we need the index to reflect the future time periods as well:

dtf_forc.index = dtf_forc["time"].to_list()
dtf_forc.head(13)
##      time  time_sq  dm_1  dm_2  dm_3  dm_4  dm_5  dm_6  dm_7  dm_8  dm_9  dm_10  dm_11
## 145   145    21025     1     0     0     0     0     0     0     0     0      0      0
## 146   146    21316     0     1     0     0     0     0     0     0     0      0      0
## 147   147    21609     0     0     1     0     0     0     0     0     0      0      0
## 148   148    21904     0     0     0     1     0     0     0     0     0      0      0
## 149   149    22201     0     0     0     0     1     0     0     0     0      0      0
## 150   150    22500     0     0     0     0     0     1     0     0     0      0      0
## 151   151    22801     0     0     0     0     0     0     1     0     0      0      0
## 152   152    23104     0     0     0     0     0     0     0     1     0      0      0
## 153   153    23409     0     0     0     0     0     0     0     0     1      0      0
## 154   154    23716     0     0     0     0     0     0     0     0     0      1      0
## 155   155    24025     0     0     0     0     0     0     0     0     0      0      1
## 156   156    24336     0     0     0     0     0     0     0     0     0      0      0
## 157   157    24649     1     0     0     0     0     0     0     0     0      0      0

Next, we create forecasts for each model similarly to how we did it for cross-sectional data:

# Calculate forecasts
AP_forc_1 <- predict(AP_OLS_1, newdata = dtf_forc)
AP_forc_2 <- predict(AP_OLS_2, newdata = dtf_forc)
AP_forc_3 <- predict(AP_OLS_3, newdata = dtf_forc)
# Calculate forecasts
AP_forc_1 =  fit_1.predict(dtf_forc)
AP_forc_2 =  fit_2.predict(dtf_forc)
AP_forc_3 =  fit_3.predict(dtf_forc)

Then we can combine the output into a single Time Series with fitted and forecast values:

#Combine into a single time series with FITTED and FORECAST values:
AP_fited_1 <- c(fit_1, AP_forc_1)
AP_fited_1 <- ts(AP_fited_1)
#Combine into a single time series with FITTED and FORECAST values:
AP_fited_1 = np.concatenate((fit_1.predict(), np.array(AP_forc_1)))
AP_fited_1 = pd.Series(AP_fited_1)

Note that the index of the series is a generic index - it indicates neither the year nor the frequency:

time(AP_fited_1)
## Time Series:
## Start = 1 
## End = 164 
## Frequency = 1 
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
AP_fited_1.index
## RangeIndex(start=0, stop=164, step=1)

We can correct it as follows (in Python we also add a name to the series to make plotting easier):

# Add the correct time index:
AP_fited_1 <- ts(AP_fited_1, 
                 start = start(airpass), 
                 frequency = frequency(airpass))
#Add the correct time index:
AP_fited_1.index = pd.date_range(start = airpass.index[0].to_pydatetime(), 
                                 periods = len(AP_fited_1.index), 
                                 freq = airpass.index.freq).to_period()
AP_fited_1.index = AP_fited_1.index.to_timestamp()
AP_fited_1.name = "Fitted"
print(time(AP_fited_1))
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug      Sep      Oct      Nov      Dec
## 1949 1949.000 1949.083 1949.167 1949.250 1949.333 1949.417 1949.500 1949.583 1949.667 1949.750 1949.833 1949.917
## 1950 1950.000 1950.083 1950.167 1950.250 1950.333 1950.417 1950.500 1950.583 1950.667 1950.750 1950.833 1950.917
## 1951 1951.000 1951.083 1951.167 1951.250 1951.333 1951.417 1951.500 1951.583 1951.667 1951.750 1951.833 1951.917
## 1952 1952.000 1952.083 1952.167 1952.250 1952.333 1952.417 1952.500 1952.583 1952.667 1952.750 1952.833 1952.917
## 1953 1953.000 1953.083 1953.167 1953.250 1953.333 1953.417 1953.500 1953.583 1953.667 1953.750 1953.833 1953.917
## 1954 1954.000 1954.083 1954.167 1954.250 1954.333 1954.417 1954.500 1954.583 1954.667 1954.750 1954.833 1954.917
## 1955 1955.000 1955.083 1955.167 1955.250 1955.333 1955.417 1955.500 1955.583 1955.667 1955.750 1955.833 1955.917
## 1956 1956.000 1956.083 1956.167 1956.250 1956.333 1956.417 1956.500 1956.583 1956.667 1956.750 1956.833 1956.917
## 1957 1957.000 1957.083 1957.167 1957.250 1957.333 1957.417 1957.500 1957.583 1957.667 1957.750 1957.833 1957.917
## 1958 1958.000 1958.083 1958.167 1958.250 1958.333 1958.417 1958.500 1958.583 1958.667 1958.750 1958.833 1958.917
## 1959 1959.000 1959.083 1959.167 1959.250 1959.333 1959.417 1959.500 1959.583 1959.667 1959.750 1959.833 1959.917
## 1960 1960.000 1960.083 1960.167 1960.250 1960.333 1960.417 1960.500 1960.583 1960.667 1960.750 1960.833 1960.917
## 1961 1961.000 1961.083 1961.167 1961.250 1961.333 1961.417 1961.500 1961.583 1961.667 1961.750 1961.833 1961.917
## 1962 1962.000 1962.083 1962.167 1962.250 1962.333 1962.417 1962.500 1962.583
AP_fited_1.index
## DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',
##                '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',
##                '1949-09-01', '1949-10-01',
##                ...
##                '1961-11-01', '1961-12-01', '1962-01-01', '1962-02-01',
##                '1962-03-01', '1962-04-01', '1962-05-01', '1962-06-01',
##                '1962-07-01', '1962-08-01'],
##               dtype='datetime64[ns]', length=164, freq='MS')

For the remaining models (in Python we can reuse the corrected index of AP_fited_1):

AP_fited_2 <- ts(c(fit_2, AP_forc_2), 
                 start = start(airpass), 
                 frequency = frequency(airpass))
AP_fited_2 = np.concatenate((fit_2.predict(), np.array(AP_forc_2)))
AP_fited_2 = pd.Series(AP_fited_2)
AP_fited_2.index = AP_fited_1.index
AP_fited_2.name = AP_fited_1.name
AP_fited_3 <- ts(c(fit_3, AP_forc_3), 
                 start = start(airpass), 
                 frequency = frequency(airpass))
AP_fited_3 = np.concatenate((fit_3.predict(), np.array(AP_forc_3)))
AP_fited_3 = pd.Series(AP_fited_3)
AP_fited_3.index = AP_fited_1.index
AP_fited_3.name = AP_fited_1.name

Finally, we can plot the predicted time series alongside the actual data:

legend_args <- list("topleft", legend = c("Fitted", "Actual"),
                    col = c("cornflowerblue", "darkorange"), lty = c(2, 1))
#
par(mfrow = c(1, 3))
plot(AP_fited_1, lty = 2, col = "cornflowerblue", main = "Quadratic for Airpass")
lines(airpass, col = "darkorange")
do.call(legend, legend_args)
#
plot(AP_fited_2, lty = 2, col = "cornflowerblue", main = "Quadratic + Seasonal for Airpass")
lines(airpass, col = "darkorange")
do.call(legend, legend_args)
#
plot(AP_fited_3, lty = 2, col = "cornflowerblue", main = "Quadratic + Seasonal for log(Airpass)")
lines(log_passengers, col = "darkorange")
do.call(legend, legend_args)

fig = plt.figure(figsize = (20, 6))
_ = AP_fited_1.plot(ax = fig.add_subplot(131), linestyle = "--", title = "Quadratic for Airpass")
_ = airpass.plot(label = "Actual")
_ = plt.legend()
#
_ = AP_fited_2.plot(ax = fig.add_subplot(132), linestyle = "--", title = "Quadratic + Seasonal for Airpass")
_ = airpass.plot(label = "Actual")
_ = plt.legend()
#
_ = AP_fited_3.plot(ax = fig.add_subplot(133), linestyle = "--", title = "Quadratic + Seasonal for log(Airpass)")
_ = log_passengers.plot(label = "Actual")
_ = plt.legend()
#
_ = plt.tight_layout()
plt.show()

While the center graph attempts to capture the seasonal effect, and does so better than the left-side graph, we can see that the fluctuations in the actual data increase in size together with the level of the series, so an additive model is unable to capture this effect. The common approach to this problem is to take logarithms and estimate the additive model on the transformed series (right-side graph).

Regarding the last plot - we are interested in the number of air passengers, and not their logarithms, so we can easily transform the forecasts by taking the exponent:

par(mfrow = c(1, 2))
#
#
plot(AP_fited_3, lty = 2, col = "cornflowerblue", main = "Quadratic + Seasonal for log(Airpass)")
lines(log_passengers, col = "darkorange")
do.call(legend, legend_args)
#
#
#
plot(exp(AP_fited_3), lty = 2, col = "cornflowerblue", main = "Quadratic + Seasonal for Airpass")
lines(airpass, col = "darkorange")
do.call(legend, legend_args)

fig = plt.figure(figsize = (20, 6))
#
_ = AP_fited_3.plot(ax = fig.add_subplot(121), linestyle = "--", title = "Quadratic + Seasonal for log(Airpass)")
_ = log_passengers.plot(label = "Actual")
_ = plt.legend()
#
_ = np.exp(AP_fited_3).plot(ax = fig.add_subplot(122), linestyle = "--", title = "Quadratic + Seasonal for Airpass")
_ = airpass.plot(label = "Actual")
_ = plt.legend()
#
_ = plt.tight_layout()
plt.show()

The above chart makes it easier to compare the last model with the first two and to see that it is much better, not only in terms of the fitted values, but also in the magnitude of the forecasts.

3.2.3.2 Task 6

Exponential smoothing is a technique that can be applied to times series data, either to produce smoothed data for presentation or to make forecasts.

While in the moving average decomposition the past observations are weighted equally, exponential smoothing assigns exponentially decreasing weights to observations further in the past. It is an easily applied procedure that incorporates prior assumptions by the user, such as seasonality, and is widely used for the analysis of time series data.

3.2.3.2.1 Single (i.e. Simple) Exponential Smoothing

We state the simple exponential smoothing procedure as an algorithm for converting the observed series \(Y_t\) into a smoothed series \(\widehat{Y}_t\), \(t = 1,...,T\) and forecasts \(\widehat{Y}_{T+h,T}\):

  1. Initialize at \(t = 1\): \(\widehat{Y}_1 = Y_1\);
  2. Update: \(\widehat{Y}_t = \alpha Y_{t-1} + (1-\alpha) \widehat{Y}_{t-1}\), where \(t=2,...,T\) and \(\alpha \in (0,1)\);
  3. Forecast: \(\widehat{Y}_{T+h,T} = \widehat{Y}_T\), \(h = 1,2,...\).

The smaller \(\alpha\) is, the smoother the estimated level. As \(\alpha\) approaches 0, the smoothed series approaches a constant value. As \(\alpha\) approaches 1, the smoothed series approaches point-by-point interpolation.
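
Before turning to the built-in functions, a minimal sketch of this recursion (written to follow the three steps above exactly; the function name and defaults are our own) may help clarify what the smoothing parameter does:

def simple_exp_smoothing(y, alpha, h = 12):
    # Minimal sketch of the algorithm above: smooth y and produce a flat h-step forecast
    y = np.asarray(y, dtype = float)
    y_hat = np.zeros(len(y))
    y_hat[0] = y[0]                          # 1. initialize at t = 1
    for t in range(1, len(y)):               # 2. update the smoothed series
        y_hat[t] = alpha * y[t - 1] + (1 - alpha) * y_hat[t - 1]
    forecasts = np.repeat(y_hat[-1], h)      # 3. flat forecast at the last smoothed value
    return y_hat, forecasts

# Example usage (on the log of the airpass series):
# smoothed, forc = simple_exp_smoothing(log_passengers.values, alpha = 0.1)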

We will use the built-in functions to smooth the series:

airpass_smpl_exp = smt.holtwinters.SimpleExpSmoothing(log_passengers)

We can select different smoothing levels:

airpass_smpl_exp_1 <- forecast::ses(y = log_passengers, alpha = 0.1, initial = "simple")
airpass_smpl_exp_2 <- forecast::ses(y = log_passengers, alpha = 0.5, initial = "simple")
airpass_smpl_exp_3 <- forecast::ses(y = log_passengers, alpha = 0.8, initial = "simple")
airpass_smpl_exp_1 = airpass_smpl_exp.fit(smoothing_level = 0.1)
airpass_smpl_exp_2 = airpass_smpl_exp.fit(smoothing_level = 0.5)
airpass_smpl_exp_3 = airpass_smpl_exp.fit(smoothing_level = 0.8)

and plot the actual, fitted and forecast values as well:

#
plot(log_passengers, col = "cornflowerblue", xlim = c(1949, 1962))
#
lines(airpass_smpl_exp_1$fitted, col = "darkorange", lty = 2)
lines(airpass_smpl_exp_1$mean, col = "darkorange")
#
lines(airpass_smpl_exp_2$fitted, col = "forestgreen", lty = 2)
lines(airpass_smpl_exp_2$mean, col = "forestgreen")
#
lines(airpass_smpl_exp_3$fitted, col = "red", lty = 2)
lines(airpass_smpl_exp_3$mean, col = "red")

fig = plt.figure(figsize = (10, 8))
_ = plt.plot(log_passengers, label = "Actual")
_ = airpass_smpl_exp_1.fittedvalues.plot(label = "alpha = 0.1", linestyle = "--")
_ = airpass_smpl_exp_2.fittedvalues.plot(label = "alpha = 0.5", linestyle = "--")
_ = airpass_smpl_exp_3.fittedvalues.plot(label = "alpha = 0.8", linestyle = "--")
_ = airpass_smpl_exp_1.forecast(steps = 12).plot(label = "Forecast: with alpha = 0.1")
_ = airpass_smpl_exp_2.forecast(steps = 12).plot(label = "Forecast: with alpha = 0.5")
_ = airpass_smpl_exp_3.forecast(steps = 12).plot(label = "Forecast: with alpha = 0.8")
_ = plt.xlim(left = log_passengers.index[0] - pd.DateOffset(months = 5))
_ = plt.legend()
plt.show()

Note that the simple exponential smoothing assumes that the series only has a slowly evolving local level - no trend or seasonality.

Consequently, does it make sense for our smoothed series to capture the seasonal patterns? If we decrease the \(\alpha\) value in order to avoid capturing the seasonality (since capturing seasonality is counter-intuitive given the definition of single exponential smoothing), we clearly see that the trend is then underestimated.

Furthermore, we clearly see that the forecasts do not have any seasonality - they are a constant. This again highlights that the single exponential smoothing is only appropriate if the series has the same level - i.e. no trend and no seasonality.

3.2.3.2.2 Double Exponential Smoothing

Now imagine that we have not only a slowly evolving local level, but also a trend with a slowly evolving local slope. In such cases simple exponential smoothing does not do well. Then the optimal smoothing algorithm (Holt’s Linear Method) is as follows:

  1. Initialize at \(t = 2\):

1.1 \(\widehat{Y_2} = Y_2\);

1.2 \(F_2 = Y_2 - Y_1\);

  2. Update for \(t > 2\):

2.1 \(\widehat{Y_t} = \alpha Y_t + (1- \alpha)\cdot (\widehat{Y}_{t-1} + F_{t-1})\), where \(0 < \alpha < 1\);

2.2 \(F_t = \beta (\widehat{Y}_t - \widehat{Y}_{t-1}) + (1 - \beta) F_{t-1}\), where \(0 < \beta < 1\);

  3. Forecast: \(\widehat{Y}_{T+h,T} = \widehat{Y}_T + h \cdot F_T\)

Here \(F_t\) is the best estimate of the trend and \(\widehat{Y}_t\) is the smoothed series. The parameter \(\alpha\) controls smoothing of the level and \(\beta\) controls smoothing of the slope.
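
As before, a minimal sketch of this recursion (our own helper, following the steps above; index 0 of the level and slope arrays is left unused since the method initializes at t = 2) makes the roles of \(\alpha\) and \(\beta\) concrete:

def double_exp_smoothing(y, alpha, beta, h = 12):
    # Minimal sketch of Holt's linear method as described above
    y = np.asarray(y, dtype = float)
    T = len(y)
    S = np.zeros(T)          # smoothed level
    F = np.zeros(T)          # smoothed slope
    S[1] = y[1]              # 1.1 initialize the level at t = 2
    F[1] = y[1] - y[0]       # 1.2 initialize the slope at t = 2
    for t in range(2, T):    # 2. update level and slope
        S[t] = alpha * y[t] + (1 - alpha) * (S[t - 1] + F[t - 1])
        F[t] = beta * (S[t] - S[t - 1]) + (1 - beta) * F[t - 1]
    forecasts = S[-1] + np.arange(1, h + 1) * F[-1]   # 3. linear forecast
    return S, F, forecasts

# Example usage:
# level, slope, forc = double_exp_smoothing(log_passengers.values, alpha = 0.6, beta = 0.1)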

airpass_smpl_exp_x2 = smt.holtwinters.Holt(log_passengers)

We can also see how the smoothing changes depending on the \(\alpha\) value:

airpass_smpl_exp_x2_11 <- forecast::holt(log_passengers, alpha = 0.1, beta = 0.1, initial = "simple")
airpass_smpl_exp_x2_12 <- forecast::holt(log_passengers, alpha = 0.3, beta = 0.1, initial = "simple")
airpass_smpl_exp_x2_13 <- forecast::holt(log_passengers, alpha = 0.6, beta = 0.1, initial = "simple")
airpass_smpl_exp_x2_11 = airpass_smpl_exp_x2.fit(smoothing_level = 0.1, smoothing_slope = 0.1)
airpass_smpl_exp_x2_12 = airpass_smpl_exp_x2.fit(smoothing_level = 0.3, smoothing_slope = 0.1)
airpass_smpl_exp_x2_13 = airpass_smpl_exp_x2.fit(smoothing_level = 0.6, smoothing_slope = 0.1)

We can also examine how the smoothing changes depending on the \(\beta\) value:

airpass_smpl_exp_x2_21 <- forecast::holt(log_passengers, alpha = 0.1, beta = 0.01)
airpass_smpl_exp_x2_22 <- forecast::holt(log_passengers, alpha = 0.1, beta = 0.3, initial = "simple")
airpass_smpl_exp_x2_23 <- forecast::holt(log_passengers, alpha = 0.1, beta = 0.9, initial = "simple")
airpass_smpl_exp_x2_21 = airpass_smpl_exp_x2.fit(smoothing_level = 0.1, smoothing_slope = 0.01)
airpass_smpl_exp_x2_22 = airpass_smpl_exp_x2.fit(smoothing_level = 0.1, smoothing_slope = 0.3)
airpass_smpl_exp_x2_23 = airpass_smpl_exp_x2.fit(smoothing_level = 0.1, smoothing_slope = 0.9)
#
#
plot(log_passengers, col = "cornflowerblue", xlim = c(1949, 1962))
#
lines(airpass_smpl_exp_x2_11$fitted, col = "darkorange", lty = 2)
lines(airpass_smpl_exp_x2_11$mean, col = "darkorange")
#
lines(airpass_smpl_exp_x2_12$fitted, col = "forestgreen", lty = 2)
lines(airpass_smpl_exp_x2_12$mean, col = "forestgreen")
#
lines(airpass_smpl_exp_x2_13$fitted, col = "red", lty = 2)
lines(airpass_smpl_exp_x2_13$mean, col = "red")

fig = plt.figure(figsize = (10, 8))
_ = plt.plot(log_passengers, label = "Actual")
_ = airpass_smpl_exp_x2_11.fittedvalues.plot(label = "alpha = 0.1, beta = 0.1", linestyle = "--")
_ = airpass_smpl_exp_x2_12.fittedvalues.plot(label = "alpha = 0.3, beta = 0.1", linestyle = "--")
_ = airpass_smpl_exp_x2_13.fittedvalues.plot(label = "alpha = 0.6, beta = 0.1", linestyle = "--")
_ = airpass_smpl_exp_x2_11.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1, beta = 0.1")
_ = airpass_smpl_exp_x2_12.forecast(steps = 12).plot(label = "Forecast: alpha = 0.3, beta = 0.1")
_ = airpass_smpl_exp_x2_13.forecast(steps = 12).plot(label = "Forecast: alpha = 0.6, beta = 0.1")
# Fix if the beginning of the plot is cut-off
_ = plt.xlim(left = log_passengers.index[0] - pd.DateOffset(months = 5))
_ = plt.legend()
plt.show()

#
#
plot(log_passengers, col = "cornflowerblue", xlim = c(1949, 1962))
#
lines(airpass_smpl_exp_x2_21$fitted, col = "darkorange", lty = 2)
lines(airpass_smpl_exp_x2_21$mean, col = "darkorange")
#
lines(airpass_smpl_exp_x2_22$fitted, col = "forestgreen", lty = 2)
lines(airpass_smpl_exp_x2_22$mean, col = "forestgreen")
#
lines(airpass_smpl_exp_x2_23$fitted, col = "red", lty = 2)
lines(airpass_smpl_exp_x2_23$mean, col = "red")

fig = plt.figure(figsize = (10, 8))
_ = plt.plot(log_passengers, label = "Actual")
_ = airpass_smpl_exp_x2_21.fittedvalues.plot(label = "alpha = 0.1, beta = 0.01", linestyle = "--")
_ = airpass_smpl_exp_x2_22.fittedvalues.plot(label = "alpha = 0.1, beta = 0.3", linestyle = "--")
_ = airpass_smpl_exp_x2_23.fittedvalues.plot(label = "alpha = 0.1, beta = 0.9", linestyle = "--")
_ = airpass_smpl_exp_x2_21.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1, beta = 0.01")
_ = airpass_smpl_exp_x2_22.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1, beta = 0.3")
_ = airpass_smpl_exp_x2_23.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1, beta = 0.9")
# Fix if the beginning of the plot is cut-off
_ = plt.xlim(left = log_passengers.index[0] - pd.DateOffset(months = 5))
_ = plt.legend()
plt.show()

Single exponential smoothing does not follow the data well when there is a trend. This can be improved by introducing a second equation with a second smoothing parameter, \(\beta\), chosen in conjunction with \(\alpha\). However, if the data exhibit both trend and seasonality, double smoothing will not work well.

We see this from the plots: if we increase \(\alpha\), the seasonality remains in the smoothed series (which is counter-intuitive given the definition of double exponential smoothing). If we increase \(\beta\), the seasonal effect is also reduced, though not completely.

Consequently, does it make sense for our smoothed series to capture the seasonal patterns? If we decrease the \(\alpha\) value in order to avoid capturing the seasonality, we clearly see that the trend is then underestimated.

Furthermore, we clearly see that the forecasts do not have any seasonality - they only capture the general trend of the series, which also depends on the smoothing parameters. This highlights that double exponential smoothing is only appropriate if the series has a trend but no seasonality.

3.2.3.2.3 Triple Exponential Smoothing

A method known as Holt-Winters method is based on three smoothing equations - one for the level, one for the trend and one for seasonality. It is similar to Holt’s linear method, with one additional equation dealing with seasonality.

Triple exponential smoothing applies exponential smoothing three times, which is commonly used when there are three high frequency signals to be removed from a time series under study. The use of a triple application is considered a rule of thumb technique, rather than one based on theoretical foundations and has often been over-emphasized by practitioners.

The smoothing algorithm (where the seasonal effect is additive) is as follows:

  1. Initialization:

1.1 Set \(\widehat{Y}_L = \dfrac{Y_1 + ... + Y_L}{L}\)

1.2 The general formula for the initial trend: \(F_L = \dfrac{1}{L} \left(\dfrac{Y_{L+1} - Y_1}{L} + \dfrac{Y_{L+2} - Y_2}{L} + ... + \dfrac{Y_{L+L} - Y_L}{L} \right)\)

1.3 Suppose a cycle of seasonal change is of length L so that \(G_t = G_{t+L}\) and \(N\) - the number of complete cycles in the data. As a rule of thumb, a minimum of two full seasons (or 2L periods) of historical data is needed to initialize a set of seasonal factors. The initial estimates for the seasonal indices: \[ \begin{aligned} G_i &= \dfrac{1}{N}\sum_{j = 1}^N{\dfrac{Y_{L(j-1)+i}}{A_j}}, \quad i = 1,...,L\\ \text{where: } A_j &= \dfrac{1}{L} \sum_{k = 1}^L Y_{L(j-1)+k}, \quad j = 1,...,N \end{aligned} \] i.e. \(A_j\) is the average value of \(Y_t\) in the \(j-th\) cycle of your data and \(G_i\) is the average of the weighted values of the \(i-th\) seasonal index from all cycles.

For example, if we have monthly data spanning \(N\) years (so \(L = 12\)), then \(A_1\) would be the average of the first year's observations \(Y_1,...,Y_{12}\); \(A_2\) - the average of \(Y_{13},...,Y_{24}\), etc.

Then \(G_1\) would be the average of the weighted values \(\dfrac{Y_1}{A_1}, \dfrac{Y_{12 + 1}}{A_2},..., \dfrac{Y_{12 (N-1) + 1}}{A_N}\); \(G_2\) - the average of \(\dfrac{Y_2}{A_1}, \dfrac{Y_{12 + 2}}{A_2},..., \dfrac{Y_{12 (N-1) + 2}}{A_N}\), etc.

  2. Two possible scenarios:
  • Update for \(t > L\) of an additive model:

2.1 \(S_t = \alpha \left( Y_t - G_{t-L}\right) + (1- \alpha)\cdot (S_{t-1} + F_{t-1})\), where \(0 < \alpha < 1\);

2.2 \(F_t = \beta (S_t - S_{t-1}) + (1 - \beta) F_{t-1}\), where \(0 < \beta < 1\);

2.3 \(G_t = \gamma \left( Y_t-S_{t} \right) + (1 - \gamma) G_{t-L}\)

2.4 Forecast: \(\widehat{Y}_{t+h} = S_t + h \cdot F_t + G_{t + h - L}\)

  • Update for \(t > L\) of a multiplicative model:

2.1 \(S_t = \alpha \left( \dfrac{Y_t}{G_{t-L}}\right) + (1- \alpha)\cdot (S_{t-1} + F_{t-1})\), where \(0 < \alpha < 1\);

2.2 \(F_t = \beta (S_t - S_{t-1}) + (1 - \beta) F_{t-1}\), where \(0 < \beta < 1\);

2.3 \(G_t = \gamma \left( \dfrac{Y_t}{S_t} \right) + (1 - \gamma) G_{t-L}\)

2.4 Forecast: \(\widehat{Y}_{t+h} = (S_t + h \cdot F_t) \cdot G_{t + h - L}\)

Here \(F_t\) is the best estimate of the trend, \(G_t\) is the best estimate of the seasonal factors and \(S_t\) is the overall smoothed series. Note that in order to initialize the \(HW\) method, we need at least one complete season’s data to determine initial estimates of the seasonal indices.

The coefficients \(\alpha\), \(\beta\), and \(\gamma\) are constants that must be estimated in such a way that the MSE (i.e. mean squared error) of the residual is minimized, i.e.: \[ \begin{equation} \text{SSE}=\sum_{t=1}^T(Y_t - \widehat{Y}_{t})^2=\sum_{t=1}^Te_t^2 \longrightarrow \min_{\alpha,\ \beta,\ \gamma} \end{equation} \]
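
To make the algorithm concrete, below is a minimal sketch of the additive variant (our own helper function; as an assumption, the initial seasonal indices are taken as deviations from the cycle averages, i.e. the additive analogue of step 1.3, and the last season's indices are reused cyclically for forecasting):

def holt_winters_additive(y, L, alpha, beta, gamma, h = 12):
    # Minimal sketch of the additive Holt-Winters recursion described above
    y = np.asarray(y, dtype = float)
    T = len(y)
    N = T // L                                   # number of complete cycles
    S = np.zeros(T)                              # smoothed level
    F = np.zeros(T)                              # smoothed trend
    G = np.zeros(T)                              # seasonal indices
    S[L - 1] = y[:L].mean()                      # 1.1 initial level
    F[L - 1] = np.mean([(y[L + i] - y[i]) / L for i in range(L)])   # 1.2 initial trend
    # 1.3 initial seasonal indices (additive form: deviations from the cycle averages)
    A = np.array([y[L * j:L * (j + 1)].mean() for j in range(N)])
    for i in range(L):
        G[i] = np.mean([y[L * j + i] - A[j] for j in range(N)])
    # 2. updates for t >= L (additive model)
    for t in range(L, T):
        S[t] = alpha * (y[t] - G[t - L]) + (1 - alpha) * (S[t - 1] + F[t - 1])
        F[t] = beta * (S[t] - S[t - 1]) + (1 - beta) * F[t - 1]
        G[t] = gamma * (y[t] - S[t]) + (1 - gamma) * G[t - L]
    # 3. forecasts: last level + trend extrapolation + the most recent index of that season
    forecasts = np.array([S[-1] + k * F[-1] + G[T - L + ((k - 1) % L)] for k in range(1, h + 1)])
    return S, F, G, forecasts

# Example usage:
# level, trend, season, forc = holt_winters_additive(log_passengers.values, L = 12,
#                                                    alpha = 0.3, beta = 0.1, gamma = 0.1)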

#
#
airpass_smpl_exp_hw_1 <- forecast::hw(log_passengers, gamma = 0, 
                                      initial = "simple", h = 12)
airpass_smpl_exp_hw = smt.holtwinters.ExponentialSmoothing(log_passengers)
airpass_smpl_exp_hw_1 = airpass_smpl_exp_hw.fit()
#
#
airpass_smpl_exp_hw_2 <- forecast::hw(log_passengers, 
                                      seasonal = "additive", h = 12)
airpass_smpl_exp_hw2 = smt.holtwinters.ExponentialSmoothing(log_passengers, 
                                  seasonal = "additive", seasonal_periods = 12)
airpass_smpl_exp_hw_2 = airpass_smpl_exp_hw2.fit()
#
airpass_smpl_exp_hw_3 <- forecast::hw(log_passengers, 
                                      seasonal = "multiplicative", h = 12)
airpass_smpl_exp_hw3 = smt.holtwinters.ExponentialSmoothing(log_passengers, 
                            seasonal = "multiplicative", seasonal_periods = 12)
airpass_smpl_exp_hw_3 = airpass_smpl_exp_hw3.fit()
#
plot(log_passengers, col = "cornflowerblue", xlim = c(1949, 1962))
#
lines(airpass_smpl_exp_hw_1$fitted, col = "darkorange", lty = 2)
lines(airpass_smpl_exp_hw_1$mean, col = "darkorange")
#
lines(airpass_smpl_exp_hw_2$fitted, col = "forestgreen", lty = 2)
lines(airpass_smpl_exp_hw_2$mean, col = "forestgreen")
#
lines(airpass_smpl_exp_hw_3$fitted, col = "red", lty = 2)
lines(airpass_smpl_exp_hw_3$mean, col = "red")

fig = plt.figure(figsize = (10, 8))
_ = fig.add_subplot(111)
_ = plt.plot(log_passengers, label = "Actual")
_ = airpass_smpl_exp_hw_1.fittedvalues.plot(label = "Holt-Winter's Exponential Smoothing: No season specification", linestyle = "--")
_ = airpass_smpl_exp_hw_2.fittedvalues.plot(label = "Holt-Winter's Exponential Smoothing: with additive season specification", linestyle = "--")
_ = airpass_smpl_exp_hw_3.fittedvalues.plot(label = "Holt-Winter's Exponential Smoothing: with multiplicative season specification", linestyle = "--")
_ = airpass_smpl_exp_hw_1.forecast(steps = 12).plot(label = "HW: no season spec")
_ = airpass_smpl_exp_hw_2.forecast(steps = 12).plot(label = "HW: with additive season spec", marker = "x")
_ = airpass_smpl_exp_hw_3.forecast(steps = 12).plot(label = "HW: with multiplicative season spec", marker = "x")
_ = plt.legend()
plt.show()

The Holt-Winters exponential smoothing method is generally preferred to the simple and double exponential smoothing methods.

Since our data contains both a trend and a seasonal component - the triple exponential smoothing method is appropriate. Furthermore, we do not need to manually specify the smoothing parameters - they are estimated based on minimizing the \(MSE\).

Furthermore, we see that specifying triple exponential smoothing without a seasonal specification results in less accurate smoothing. On the other hand, specifying multiplicative seasonality on an additive series (recall that we are examining the log of air passengers) gives results similar to the additive specification (as long as the additive specification is appropriate; otherwise they will differ in accuracy).

Finally, the forecasts for the triple exponential smoothing (with the correct seasonal frequency specification) are closer to the trend and seasonality of the historical data (at least visually), compared to the single and double exponential smoothing.

3.2.3.3 Task 7

The easiest way to compare the methods, besides looking at the plots, is to examine the residuals by looking at the residual series and ACF plots - are the residuals stationary? Are they WN?

resid_OLS = fit_3.resid
tsdisplay(resid_OLS, lags = 30)

Note that only the first couple of lags are significant, while the later lags are mostly insignificant.

Considerations

We can also examine the residuals of the double exponential smoothing with \(\alpha = 0.6\) and \(\beta = 0.1\) as it appears to be the best fit:

resid_exp_x2 = log_passengers - airpass_smpl_exp_x2_13.fittedvalues
tsdisplay(resid_exp_x2, lags = 30)

Note that when using double exponential smoothing, there appears to be a seasonal or cyclic effect, as we see a spike in the ACF every few lags. Overall, the double exponential smoothing method was ineffective in capturing the seasonality effect.

Finally, the Holt-Winters exponential smoothing method with a multiplicative seasonal component appears to have the best residuals (in terms of their white noise property):

resid_hw = log_passengers - airpass_smpl_exp_hw_3.fittedvalues
tsdisplay(resid_hw, lags = 30)

Note that they may still be autocorrelated (again, the point of the decomposition is to separate the deterministic component from the irregular, stationary component, which is not necessarily white noise). We can check this with the \(Ljung-Box\) test.

Note that in the triple exponential smoothing, we have estimated \(\alpha\), \(\beta\) and \(\gamma\), as well as the initial season:

airpass_smpl_exp_hw_3.summary()
## <class 'statsmodels.iolib.summary.Summary'>
## """
##                        ExponentialSmoothing Model Results                       
## ================================================================================
## Dep. Variable:                    value   No. Observations:                  144
## Model:             ExponentialSmoothing   SSE                              0.215
## Optimized:                         True   AIC                           -909.212
## Trend:                             None   BIC                           -867.635
## Seasonal:                Multiplicative   AICC                          -904.929
## Seasonal Periods:                    12   Date:                 Tue, 06 Apr 2021
## Box-Cox:                          False   Time:                         23:19:18
## Box-Cox Coeff.:                    None                                         
## =================================================================================
##                           coeff                 code              optimized      
## ---------------------------------------------------------------------------------
## smoothing_level               0.8029297                alpha                 True
## smoothing_seasonal            0.1949752                gamma                 True
## initial_level                 5.4011643                  l.0                 True
## initial_seasons.0             0.8732819                  s.0                 True
## initial_seasons.1             0.8771272                  s.1                 True
## initial_seasons.2             0.9033847                  s.2                 True
## initial_seasons.3             0.9005346                  s.3                 True
## initial_seasons.4             0.8966069                  s.4                 True
## initial_seasons.5             0.9147479                  s.5                 True
## initial_seasons.6             0.9296504                  s.6                 True
## initial_seasons.7             0.9265155                  s.7                 True
## initial_seasons.8             0.9079395                  s.8                 True
## initial_seasons.9             0.8845327                  s.9                 True
## initial_seasons.10            0.8576898                 s.10                 True
## initial_seasons.11            0.8784410                 s.11                 True
## ---------------------------------------------------------------------------------
## """

Since the initial values are only used at the beginning of the series, we only count the \(\alpha\), \(\beta\) and \(\gamma\) parameters when setting the degrees of freedom. For the general case, you can refer to the Ljung-Box test documentation on MathWorks.

tsdiag(resid_hw, lags = 10, df = 2) # from the summary - only alpha and gamma 
##                              1         2         3          4          5          6          7          8          9          10
## Ljung-Box: X-squared   2.377088  2.423707  6.536591  10.938752  11.844569  14.276121  14.582733  16.217306  16.282081  18.086239
## Ljung-Box: p-value          NaN       NaN  0.010568   0.004214   0.007935   0.006464   0.012302   0.012634   0.022661   0.020589
## Box-Pierce: X-squared  2.328243  2.373585  6.345617  10.566868  11.429255  13.727571  14.015283  15.537898  15.597793  17.253664
## Box-Pierce: p-value         NaN       NaN  0.011767   0.005075   0.009617   0.008217   0.015513   0.016461   0.029056   0.027573
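
As an alternative to the custom tsdiag function, recent versions of statsmodels provide a built-in Ljung-Box/Box-Pierce test (the exact output format depends on the statsmodels version); a minimal sketch:

# Built-in Ljung-Box and Box-Pierce tests, accounting for the 2 estimated parameters:
from statsmodels.stats.diagnostic import acorr_ljungbox
print(acorr_ljungbox(resid_hw, lags = 10, model_df = 2, boxpierce = True))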

The residuals of the OLS model:

tsdiag(resid_OLS, lags = 20, df = len(fit_3.params))
##                               1           2           3           4           5           6           7           8           9           10          11          12          13          14            15            16            17            18            19            20
## Ljung-Box: X-squared   66.168830  100.254549  111.869080  114.139661  115.978600  117.026177  117.087231  117.092978  117.654866  118.251883  120.009057  122.795885  123.188765  123.859113  1.285903e+02  1.447487e+02  1.583279e+02  1.735237e+02  1.917378e+02  2.081217e+02
## Ljung-Box: p-value           NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN  8.336500e-30  3.700074e-32  4.206909e-34  1.832814e-36  1.660796e-39  3.538704e-42
## Box-Pierce: X-squared  64.809197   97.961060  109.177833  111.355103  113.105874  114.096049  114.153339  114.158693  114.678247  115.226194  116.826907  119.346505  119.699021  120.295906  1.244762e+02  1.386425e+02  1.504545e+02  1.635687e+02  1.791630e+02  1.930780e+02
## Box-Pierce: p-value          NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN         NaN  6.626795e-29  7.837228e-31  2.102387e-32  2.508773e-34  8.075955e-37  5.636515e-39

So, the residuals of the triple exponential smoothing are autocorrelated, just as the residuals of the OLS model.

Considerations

This can be addressed by fitting an ARMA model on the residuals:

resid_mdl_order = sm.tsa.stattools.arma_order_select_ic(resid_hw, ic = 'aic', trend = 'nc')
resid_mdl_order
## {'aic':             0           1           2
## 0         NaN -524.646826 -523.485199
## 1 -524.659288 -522.986397 -521.806157
## 2 -523.398190 -521.533280 -526.236312
## 3 -521.907342 -520.013061 -529.129832
## 4 -520.709608 -519.411251 -527.153374, 'aic_min_order': (3, 2)}

Note that since we are estimating the residuals separately, we assume that the residuals and the trend and seasonality are separate processes, such that \(Y_t = T_t + S_t + E_t\), where \(T_t + S_t\) is approximated by Holt-Winter’s exponential smoothing and the remainder, \(E_t\) can be estimated via an \(ARMA\) model.

print(resid_mdl_order.aic_min_order)
## (3, 2)

So, the residuals would need an \(ARMA(3, 2)\) model. We will leave this as an additional exercise. Note that the final fitted values would need to be:

\[ \text{Fitted values from Holt-Winter smoothing } + \text{ Fitted values from the residual ARMA model} \]

Similarly the overall series forecast would be:

\[ \text{Forecast from Holt-Winter smoothing } + \text{ Forecast from the residual ARMA model} \]
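
As a sketch of how this combination could be carried out (the exercise itself is left to the reader; the variable names below are our own, and we reuse the resid_hw series and the fitted airpass_smpl_exp_hw_3 model from above):

# Fit the suggested ARMA(3, 2) to the Holt-Winters residuals:
resid_arma = sm.tsa.statespace.SARIMAX(resid_hw, order = (3, 0, 2), trend = 'n')
resid_arma_fit = resid_arma.fit()
# Fitted values: HW fitted values + ARMA fitted values of the HW residuals (log scale):
combined_fit = airpass_smpl_exp_hw_3.fittedvalues + resid_arma_fit.fittedvalues
# Forecasts: HW forecast + ARMA forecast of the residuals, e.g. 12 steps ahead:
combined_forc = airpass_smpl_exp_hw_3.forecast(steps = 12) + resid_arma_fit.forecast(steps = 12)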

3.2.3.4 Task 8

Note that we have already examined the forecasting capabilities of the OLS model. Since the residuals from both models are autocorrelated, we will go with the Holt-Winters exponential smoothing, since its residual ACF appears to have no remaining seasonal effects, whereas the residual ACF of the OLS model seems to have some, albeit possibly insignificant, seasonal correlations.

fig = plt.figure(figsize = (10, 6))
_ = plt.plot(log_passengers, label = "Actual")
_ = airpass_smpl_exp_hw_3.fittedvalues.plot(label = "Holt-Winter's Exponential Smoothing: multiplicative season specification", linestyle = "--")
_ = airpass_smpl_exp_hw_3.forecast(steps = 12).plot(label = "HW: multiplicative season spec.", marker = "x")
# Fix if the beginning of the plot is cut-off
_ = plt.xlim(left = log_passengers.index[0] - pd.DateOffset(months = 5))
_ = plt.legend()
plt.show()

3.2.4 Exercise Set 4: Additional Decomposition Methods (Global NLS, SARMA, Kernel Smoothing, X13, etc.)

Next we will examine a number of additional decomposition methods, which are just as, if not more, important than the previously introduced methods.

3.2.4.1 Task 9

In order to fit the seasonal \(ARMA\) model, we first need to remove the trend component from the series (unless the series is a \(DS\) series - something which will be examined in later chapters).

We choose to use the residuals of the polynomial trend regression and we will estimate the model on \(\log(Y_t)\), since it appears that the series is multiplicative:

AP_OLS_trend = sm.formula.ols(formula = 'np.log(AP) ~ time + time_sq', data = dtf)
AP_OLS_trend_fit = AP_OLS_trend.fit()
print(AP_OLS_trend_fit.summary().tables[1])
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## Intercept      4.7364      0.034    138.117      0.000       4.669       4.804
## time           0.0132      0.001     12.112      0.000       0.011       0.015
## time_sq    -2.191e-05   7.29e-06     -3.004      0.003   -3.63e-05   -7.49e-06
## ==============================================================================

Checking the residuals:

tsdisplay(AP_OLS_trend_fit.resid, lags = 25)

This reveals that the seasonality remains (since we have only fitted a quadratic trend model, we expect the seasonality to remain in the residuals).

It appears the trend is estimated relatively well. The residuals appear to have a seasonal component and a constant variance.

In order to model seasonality, we will estimate an \(\rm SARIMA(p,d,q)(P,D,Q)_s\), restricting ourselves to \(d = D = 0\) (again, we will examine the model for differences in later chapters).

Similarly to \(ARMA\) models, a lower lag order is usually sufficient. For example, if we wanted to estimate a \(\rm SARIMA(2,0,2)(1,0,1)_{12}\) model (we select \(s = 12\), since the data is monthly):

resid_sarima = sm.tsa.statespace.SARIMAX(endog = AP_OLS_trend_fit.resid, 
                              order = (2, 0, 2), seasonal_order = (1, 0, 1, 12),
                              enforce_invertibility = True, trend = 'n')
resid_sarima_fit = resid_sarima.fit()
print(resid_sarima_fit.summary())
##                                       SARIMAX Results                                       
## ============================================================================================
## Dep. Variable:                                    y   No. Observations:                  144
## Model:             SARIMAX(2, 0, 2)x(1, 0, [1], 12)   Log Likelihood                 261.717
## Date:                              Tue, 06 Apr 2021   AIC                           -509.434
## Time:                                      23:19:24   BIC                           -488.645
## Sample:                                           0   HQIC                          -500.987
##                                               - 144                                         
## Covariance Type:                                opg                                         
## ==============================================================================
##                  coef    std err          z      P>|z|      [0.025      0.975]
## ------------------------------------------------------------------------------
## ar.L1          1.3301      2.087      0.637      0.524      -2.760       5.420
## ar.L2         -0.4445      1.611     -0.276      0.783      -3.601       2.712
## ma.L1         -0.7322      2.095     -0.349      0.727      -4.838       3.374
## ma.L2          0.1412      0.337      0.419      0.675      -0.519       0.801
## ar.S.L12       0.9900      0.008    122.460      0.000       0.974       1.006
## ma.S.L12      -0.5974      0.103     -5.784      0.000      -0.800      -0.395
## sigma2         0.0012      0.000      9.264      0.000       0.001       0.002
## ===================================================================================
## Ljung-Box (L1) (Q):                   0.04   Jarque-Bera (JB):                 9.07
## Prob(Q):                              0.83   Prob(JB):                         0.01
## Heteroskedasticity (H):               0.64   Skew:                            -0.16
## Prob(H) (two-sided):                  0.13   Kurtosis:                         4.19
## ===================================================================================
## 
## Warnings:
## [1] Covariance matrix calculated using the outer product of gradients (complex-step).

Note that sigma2 in the coefficients table is the estimate of the variance of the error term.

If we check the residuals of our model:

tsdisplay(resid_sarima_fit.resid)

we see that they appear to be uncorrelated. We do note that the variance of the residuals seems larger at the start of the series.

Considerations

If we wanted to create our own simple automatic order selection, we can do so:

# Specify the best AIC - default is infinite - so anything lower is better
best_aic = None
# Specify the best order
best_order = None
# Specify the best model
best_mdl = None
# Loop through different (p, q) combinations, starting from SARMA(0, 0)(0, 0), to SARMA(1, 1)(1, 1)
pq_rng = range(2) # range(5) would be [0,1,2,3,4]
for p in pq_rng:
    for q in pq_rng:
        for P in pq_rng:
            for Q in pq_rng:
                try:
                    tmp_mdl = sm.tsa.statespace.SARIMAX(AP_OLS_trend_fit.resid, 
                                                        order=(p, 0, q),
                                                        seasonal_order=(P, 0, Q, 12))
                    tmp_mdl_fit = tmp_mdl.fit()
                    tmp_aic = tmp_mdl_fit.aic
                    if best_aic == None:
                        best_aic = tmp_aic
                        best_order = {"Nonseasonal": (p, 0, q), "Seasonal": (P, 0, Q)}
                        best_mdl = tmp_mdl
                        print("Fitted an ARMA(" + str(p) + ", " + str(q) + ")(" + 
                              str(P) + ", " + str(Q) + ")_12 model - it was better")
                    elif tmp_aic < best_aic:
                        best_aic = tmp_aic
                        best_order = {"Nonseasonal": (p, 0, q), "Seasonal": (P, 0, Q)}
                        best_mdl = tmp_mdl
                        print("Fitted an ARMA(" + str(p) + ", " + str(q) + ")(" + 
                              str(P) + ", " + str(Q) + ")_12 model - it was better")
                    else:
                        print("Fitted an ARMA(" + str(p) + ", " + str(q) + ")(" + 
                              str(P) + ", " + str(Q) + ")_12 model - it was worse")
                except: continue
## Fitted an ARMA(0, 0)(0, 0)_12 model - it was better
## Fitted an ARMA(0, 0)(0, 1)_12 model - it was better
## Fitted an ARMA(0, 0)(1, 0)_12 model - it was better
## Fitted an ARMA(0, 0)(1, 1)_12 model - it was better
## Fitted an ARMA(0, 1)(0, 0)_12 model - it was worse
## Fitted an ARMA(0, 1)(0, 1)_12 model - it was worse
## Fitted an ARMA(0, 1)(1, 0)_12 model - it was better
## Fitted an ARMA(0, 1)(1, 1)_12 model - it was better
## Fitted an ARMA(1, 0)(0, 0)_12 model - it was worse
## Fitted an ARMA(1, 0)(0, 1)_12 model - it was worse
## Fitted an ARMA(1, 0)(1, 0)_12 model - it was better
## Fitted an ARMA(1, 0)(1, 1)_12 model - it was better
## Fitted an ARMA(1, 1)(0, 0)_12 model - it was worse
## Fitted an ARMA(1, 1)(0, 1)_12 model - it was worse
## Fitted an ARMA(1, 1)(1, 0)_12 model - it was worse
## Fitted an ARMA(1, 1)(1, 1)_12 model - it was better

We will fit a \(SARMA\) with a maximum lag order of 1 for all lags. The resulting model:

tmp_mdl = sm.tsa.statespace.SARIMAX(AP_OLS_trend_fit.resid, 
                                    order = (1, 0, 1),
                                    seasonal_order = (1, 0, 1, 12))
#
tmp_mdl_fit = tmp_mdl.fit()
#
print(tmp_mdl_fit.summary())
##                                      SARIMAX Results                                      
## ==========================================================================================
## Dep. Variable:                                  y   No. Observations:                  144
## Model:             SARIMAX(1, 0, 1)x(1, 0, 1, 12)   Log Likelihood                 261.602
## Date:                            Tue, 06 Apr 2021   AIC                           -513.203
## Time:                                    23:19:29   BIC                           -498.354
## Sample:                                         0   HQIC                          -507.169
##                                             - 144                                         
## Covariance Type:                              opg                                         
## ==============================================================================
##                  coef    std err          z      P>|z|      [0.025      0.975]
## ------------------------------------------------------------------------------
## ar.L1          0.7873      0.091      8.615      0.000       0.608       0.966
## ma.L1         -0.1896      0.127     -1.491      0.136      -0.439       0.060
## ar.S.L12       0.9900      0.008    130.377      0.000       0.975       1.005
## ma.S.L12      -0.5948      0.102     -5.849      0.000      -0.794      -0.395
## sigma2         0.0012      0.000      9.459      0.000       0.001       0.001
## ===================================================================================
## Ljung-Box (L1) (Q):                   0.06   Jarque-Bera (JB):                 9.59
## Prob(Q):                              0.81   Prob(JB):                         0.01
## Heteroskedasticity (H):               0.65   Skew:                            -0.16
## Prob(H) (two-sided):                  0.13   Kurtosis:                         4.22
## ===================================================================================
## 
## Warnings:
## [1] Covariance matrix calculated using the outer product of gradients (complex-step).
tsdisplay(tmp_mdl_fit.resid)

The residuals appear similar to those of our higher-order seasonal ARMA specification. If we compare the AIC/BIC values:

pd.DataFrame([[resid_sarima_fit.aic, tmp_mdl_fit.aic], 
              [resid_sarima_fit.bic, tmp_mdl_fit.bic]], 
             index = ["AIC", "BIC"], columns = ["Manual", "Auto"])
##          Manual        Auto
## AIC -509.434000 -513.203134
## BIC -488.645307 -498.354068

We see that the automatic specification has a lower AIC and BIC value, indicating that the \(\rm SARIMA(1, 0, 1)(1, 0, 1)_{12}\) is the better model.

We can write the model using the lag functions as:

\((1 - 0.7873 L)\cdot (1 - 0.9900 L^{12})Y_t = (1 - 0.1896 L)\cdot (1 - 0.5948 L^{12})\epsilon_t\)
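
The numerical coefficients in this expression come directly from the estimated parameter vector:

# Rounded parameter estimates of the SARIMA(1, 0, 1)(1, 0, 1)_12 model;
# note that statsmodels reports MA coefficients for the (1 + theta_1 L + ...) form,
# so ma.L1 = -0.1896 corresponds to the factor (1 - 0.1896 L) above:
print(tmp_mdl_fit.params.round(4))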

Careful!

It is worthwhile to examine this answer on stackoverflow, which states the drawbacks of using the \(Box-Pierce\) and \(Ljung-Box\) tests on time series models. Another answer discusses a more general case.

3.2.4.2 Task 10

Firstly, we will calculate the fitted values.

In general, the fitted values can be calculated by:

  • Calculate the fitted values of the Trend component \(\widehat{T}_t\);
  • Calculate the fitted values of the Seasonal component \(\widehat{S}_t\);
  • Calculate the fitted values of the Irregular component \(\widehat{E}_t\).

Then, we calculate the fitted values of the original series as \(\widehat{Y}_t = \widehat{T}_t + \widehat{S}_t + \widehat{E}_t\).

In our case, we have estimated a SARMA model (which includes the seasonal component) for \(E_t\), using the residuals from the OLS trend regression of \(\log(Y_t)\).

Hence \(\widehat{Y}_t = \exp \left( \widehat{T}_t + \widehat{E}_{t}^{(SARMA)} \right)\).

Y_fit = np.exp(AP_OLS_trend_fit.fittedvalues + resid_sarima_fit.fittedvalues)
Y_fit = pd.Series(Y_fit)
Y_fit.index = airpass.index
Y_trend = np.exp(AP_OLS_trend_fit.fittedvalues)
Y_trend = pd.Series(Y_trend)
Y_trend.index = airpass.index
fig = plt.figure(figsize = (10, 6))
_ = airpass.plot(label = "Actual")
_ = Y_fit.plot(label = "Fitted")
_ = Y_trend.plot(label = "Estimated Trend")
_ = plt.title("Actual vs fitted plot")
_ = plt.legend()
plt.show()

Next, we will calculate the forecasts for the trend component for \(h = 20\) steps.

h = 20

We will begin by creating a time series index vector, which we will transform into the appropriate seasonal dummy variables:

forecast_date = pd.date_range(start = airpass.index[-1].to_pydatetime() + pd.DateOffset(months = 1), 
                              periods = h, freq = airpass.index.freq)
forecast_date = forecast_date.to_period().to_timestamp()
print(forecast_date)                                     
## DatetimeIndex(['1961-01-01', '1961-02-01', '1961-03-01', '1961-04-01',
##                '1961-05-01', '1961-06-01', '1961-07-01', '1961-08-01',
##                '1961-09-01', '1961-10-01', '1961-11-01', '1961-12-01',
##                '1962-01-01', '1962-02-01', '1962-03-01', '1962-04-01',
##                '1962-05-01', '1962-06-01', '1962-07-01', '1962-08-01'],
##               dtype='datetime64[ns]', freq='MS')
dm_forc = pd.get_dummies(forecast_date.month)
dm_forc = dm_forc.iloc[:, :-1]
print(dm_forc.head(6))
##    1   2   3   4   5   6   7   8   9   10  11
## 0   1   0   0   0   0   0   0   0   0   0   0
## 1   0   1   0   0   0   0   0   0   0   0   0
## 2   0   0   1   0   0   0   0   0   0   0   0
## 3   0   0   0   1   0   0   0   0   0   0   0
## 4   0   0   0   0   1   0   0   0   0   0   0
## 5   0   0   0   0   0   1   0   0   0   0   0

We also need to create the appropriate time and time_sq variables to reflect the future values of time:

time_forc = np.array(range(len(dtf.index) + 1, len(dtf.index) + h + 1))
time_forc_sq = time_forc**2

Finally, we can combine the results into a dataset, which we will use to predict the future values:

dt_forc = np.column_stack((time_forc, time_forc_sq, dm_forc))
# Create a DataFrame
dt_forc = pd.DataFrame(dt_forc)
# Rename the columns:
dt_forc.columns = ["time"] + ["time_sq"] + ["dm_" + str(number) for number in range(1, len(dm_forc.columns) + 1)]
# Change the index to be the same as the time variable
dt_forc.index = dt_forc["time"].to_list()
dt_forc.head(13)
##      time  time_sq  dm_1  dm_2  dm_3  dm_4  dm_5  dm_6  dm_7  dm_8  dm_9  dm_10  dm_11
## 145   145    21025     1     0     0     0     0     0     0     0     0      0      0
## 146   146    21316     0     1     0     0     0     0     0     0     0      0      0
## 147   147    21609     0     0     1     0     0     0     0     0     0      0      0
## 148   148    21904     0     0     0     1     0     0     0     0     0      0      0
## 149   149    22201     0     0     0     0     1     0     0     0     0      0      0
## 150   150    22500     0     0     0     0     0     1     0     0     0      0      0
## 151   151    22801     0     0     0     0     0     0     1     0     0      0      0
## 152   152    23104     0     0     0     0     0     0     0     1     0      0      0
## 153   153    23409     0     0     0     0     0     0     0     0     1      0      0
## 154   154    23716     0     0     0     0     0     0     0     0     0      1      0
## 155   155    24025     0     0     0     0     0     0     0     0     0      0      1
## 156   156    24336     0     0     0     0     0     0     0     0     0      0      0
## 157   157    24649     1     0     0     0     0     0     0     0     0      0      0

Now, we can calculate the forecast of the trend:

trend_forc = AP_OLS_trend_fit.predict(dt_forc)
print(trend_forc.head())
## 145    6.193382
## 146    6.200231
## 147    6.207037
## 148    6.213799
## 149    6.220517
## dtype: float64

Note that this is the trend for the \(\log(Y)_t\) series and not the original series.

Next, we calculate the forecast of the residual \(SARMA\) model:

resid_forc = resid_sarima_fit.forecast(steps = len(dt_forc))

The residual predictions:

print(resid_forc.head())
## 144   -0.090139
## 145   -0.151580
## 146   -0.041015
## 147   -0.026209
## 148   -0.001812
## Name: predicted_mean, dtype: float64

are also for the \(\log(Y)_t\) series.
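
If forecast uncertainty for the residual model is also of interest, get_forecast() returns confidence intervals alongside the point forecasts. A minimal sketch, using resid_sarima_fit and dt_forc from above (the intervals are still on the \(\log(Y)_t\) scale):

# Sketch: point forecasts of the residual SARMA model together with 95% confidence intervals
resid_forc_obj = resid_sarima_fit.get_forecast(steps = len(dt_forc))
print(resid_forc_obj.predicted_mean.head())
print(resid_forc_obj.conf_int(alpha = 0.05).head())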

Also, the \(SARMA\) forecasts and the trend forecasts need to share the same index. Furthermore, since these are forecasts (and not fitted values, as before), the index needs to start right after the last date of the historical data:

# Last historical value date:
airpass.index[-1]
## Timestamp('1960-12-01 00:00:00', freq='MS')
# Date of the h step forecast
forc_index = pd.date_range(start = airpass.index[-1].to_pydatetime() + pd.DateOffset(months = 1),
                           periods = h, freq = airpass.index.freq).to_period()
forc_index = forc_index.to_timestamp()
print(forc_index)
## DatetimeIndex(['1961-01-01', '1961-02-01', '1961-03-01', '1961-04-01',
##                '1961-05-01', '1961-06-01', '1961-07-01', '1961-08-01',
##                '1961-09-01', '1961-10-01', '1961-11-01', '1961-12-01',
##                '1962-01-01', '1962-02-01', '1962-03-01', '1962-04-01',
##                '1962-05-01', '1962-06-01', '1962-07-01', '1962-08-01'],
##               dtype='datetime64[ns]', freq='MS')
resid_forc.index = forc_index
trend_forc.index = forc_index

In order to calculate the forecast of \(Y_t\), we proceed as before and either calculate \(\exp ( \widehat{T}_t + \widehat{E}_t )\) or, equivalently, \(\exp ( \widehat{T}_t ) \cdot \exp (\widehat{E}_t )\). We will use the first method:

Y_forc = np.exp(resid_forc + trend_forc)
print(Y_forc)
## 1961-01-01    447.305813
## 1961-02-01    423.541491
## 1961-03-01    476.287930
## 1961-04-01    486.671926
## 1961-05-01    502.052853
## 1961-06-01    573.601663
## 1961-07-01    654.841531
## 1961-08-01    651.965639
## 1961-09-01    548.462226
## 1961-10-01    488.775459
## 1961-11-01    424.241266
## 1961-12-01    469.618985
## 1962-01-01    485.473020
## 1962-02-01    459.739250
## 1962-03-01    516.059897
## 1962-04-01    526.821361
## 1962-05-01    542.911951
## 1962-06-01    619.031278
## 1962-07-01    705.311037
## 1962-08-01    701.812973
## Freq: MS, dtype: float64

Finally, we will plot our fitted and forecast values:

fig = plt.figure(figsize = (10, 6))
_ = airpass.plot(label = "Actual")
_ = Y_fit.plot(label = "Fitted: OLS trend + SARMA on residuals")
_ = Y_forc.plot(label = "Forecast: OLS trend + SARMA on residuals")
_ = plt.legend()
plt.show()

Considerations

How would this compare to our OLS model with quadratic trend and seasonal dummy variables?

fig = plt.figure(figsize = (10, 6))
_ = airpass.plot(label = "Actual")
_ = Y_fit.plot(label = "Fitted: OLS trend + SARMA on residuals")
_ = Y_forc.plot(label = "Forecast: OLS trend + SARMA on residuals")
_ = np.exp(AP_fited_3).plot(linestyle = "--", label = "Quadratic + Seasonal for Airpass")
_ = plt.legend()
plt.show()

To get a better look, we will plot the last few historical years together with the forecasts:

fig = plt.figure(figsize = (10, 6))
_ = airpass.plot(label = "Actual")
_ = Y_fit.plot(label = "Fitted: OLS trend + SARMA on residuals")
_ = Y_forc.plot(label = "Forecast: OLS trend + SARMA on residuals")
_ = np.exp(AP_fited_3).plot(linestyle = "--", label = "Quadratic + Seasonal for Airpass")
_ = plt.legend()
_ = plt.xlim(left = log_passengers.index[-36])
plt.show()

We see that the pure OLS model underestimates the seasonality in some cases: the peaks are captured more accurately by the combination of the OLS trend and the residual \(SARMA\) model. Furthermore, the pure OLS model overestimates some of the initial months at the beginning of each year. Hence it is quite likely that the same will hold for the forecasts.
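
One way to back this visual impression with a number is to compare simple in-sample error measures of the two fitted series. A sketch only: Y_fit is taken from the code above, and AP_fited_3 is assumed to be the log-scale fitted values of the quadratic trend + seasonal dummy OLS model from the earlier task:

# Sketch: in-sample RMSE of both fitted series on the original (passenger) scale
rmse_ols   = np.sqrt(np.mean((airpass - np.exp(AP_fited_3))**2))
rmse_combo = np.sqrt(np.mean((airpass - Y_fit)**2))
print(pd.DataFrame({"RMSE": [rmse_ols, rmse_combo]},
                   index = ["OLS: quadratic trend + seasonal dummies",
                            "OLS trend + SARMA on residuals"]))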

We can conclude that the OLS quadratic trend with \(SARMA\) residuals is the better fit for this series.

On the other hand, since we are estimating this series in two steps, it is quite likely that our parameter estimates are not as efficient as they would be if we were to estimate a single \(SARMA\) model with exogenous trend regressors.
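
For comparison, such a single-step alternative could be fitted directly with SARIMAX by passing the deterministic trend (and, if desired, the seasonal dummies) as exogenous regressors. This is only a sketch, assuming log_passengers and the dtf design matrix with time and time_sq columns are available from the earlier tasks:

# Sketch of the one-step alternative: SARMA errors with an exogenous quadratic trend.
# Seasonal dummy variables could be added to exog as well.
one_step_mdl = sm.tsa.statespace.SARIMAX(log_passengers,
                                         exog = dtf[["time", "time_sq"]],
                                         order = (1, 0, 1),
                                         seasonal_order = (1, 0, 1, 12),
                                         trend = "c")
one_step_fit = one_step_mdl.fit(disp = False)
print(one_step_fit.summary())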

3.2.4.3 Task 11

Here we will present some of the additional smoothing methods mentioned during the lectures.

3.2.4.3.1 Kernel Smoothing

Note that the “local constant” type of regression provided here is known as Nadaraya-Watson kernel regression, which is exactly what we need for the kernel smoothing method.

We need to pass the data as a float type, otherwise we would get a Buffer dtype mismatch, expected 'DOUBLE' but got 'long long' error. We will use the KernelReg function, which estimates the conditional mean \(\mathbb{E}(Y|X)\), where \(Y = g(X) + \epsilon\):

airpass_ks = sm.nonparametric.KernelReg(endog = airpass.astype(float), exog = time, var_type = "c", reg_type = "lc")
airpass_ks_fit = airpass_ks.fit()
airpass_ks_smoothed = pd.Series(airpass_ks_fit[0])
airpass_ks_smoothed.index = airpass.index
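
The default bandwidth is chosen by least-squares cross-validation; a user-specified bandwidth can be supplied instead via the bw argument. A sketch (the value 5 is an arbitrary illustrative choice - a larger bandwidth gives a smoother trend estimate):

# Sketch: kernel smoothing with a user-specified bandwidth instead of the cross-validated default
airpass_ks_wide = sm.nonparametric.KernelReg(endog = airpass.astype(float), exog = time,
                                             var_type = "c", reg_type = "lc", bw = [5])
airpass_ks_wide_smoothed = pd.Series(airpass_ks_wide.fit()[0], index = airpass.index)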

We can also compare it with the moving-average smoothing method (i.e. the MA decomposition):

airpass_ma  = smt.seasonal.seasonal_decompose(airpass, model = "multiplicative", period = 12, two_sided = False)
airpass_ma_smoothed = airpass_ma.trend

The plot of the two smoothing methods side-by-side:

fig = plt.figure(figsize = (10, 6))
_ = plt.plot(airpass, label = "Actual")
_ = plt.plot(airpass_ks_smoothed, label = "Kernel Smoothing")
_ = plt.plot(airpass_ma_smoothed, label = "Moving Average")
_ = plt.legend()
plt.show()

3.2.4.3.3 LOWESS (Locally Weighted Scatterplot Smoothing)

airpass_lowess = sm.nonparametric.lowess(endog = airpass, exog = time)
airpass_lowess_smoothed = pd.Series(airpass_lowess[:, 1])
airpass_lowess_smoothed.index = airpass.index

By default, \(66.67\%\) of the data is used for each value.

We can also adjust the fraction of the data used when estimating each value, e.g. \(5\%\):

airpass_lowess_5pct = sm.nonparametric.lowess(endog = airpass, exog = time, frac = 0.05)
airpass_lowess_5pct_smoothed = pd.Series(airpass_lowess_5pct[:, 1])
airpass_lowess_5pct_smoothed.index = airpass.index
fig = plt.figure(figsize = (10, 6))
_ = plt.plot(airpass, label = "Actual")
_ = plt.plot(airpass_ks_smoothed, label = "Kernel Smoothing")
_ = plt.plot(airpass_ma_smoothed, label = "Moving Average")
_ = plt.plot(airpass_lowess_smoothed, label = "LOWESS, default (2/3) obs. smoothing")
_ = plt.plot(airpass_lowess_5pct_smoothed, label = "LOWESS, 5% obs. smoothing", color = "red", linestyle = "--")
_ = plt.legend()
plt.show()

3.2.4.3.3 Cubic Smoothing Splines

Some examples of cubic splines in Python can be found online.

In order to fit regression splines in Python, we use the dmatrix function from the patsy library. Regression splines can be fitted by constructing an appropriate matrix of basis functions:

transformed_x2 = dmatrix("cr(time, df = 3)", {"time": time}, return_type = 'dataframe')
transformed_x2.index = airpass.index

where we have used cr() to specify a natural cubic regression spline basis:

airpass_cubic_spline = sm.GLM(airpass, transformed_x2)
airpass_cubic_smoothed = airpass_cubic_spline.fit()
fig = plt.figure(figsize = (10, 6))
_ = plt.plot(airpass, label = "Actual")
_ = plt.plot(airpass_cubic_smoothed.fittedvalues, label = "Cubic Smoothing Splines")
_ = plt.legend()
plt.show()
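
The flexibility of the spline is governed by the degrees of freedom passed to cr(). A sketch comparing the previous fit with a more flexible basis (df = 10 is an arbitrary illustrative choice):

# Sketch: a more flexible natural cubic spline basis (df = 10 chosen purely for illustration)
transformed_x10 = dmatrix("cr(time, df = 10)", {"time": time}, return_type = 'dataframe')
transformed_x10.index = airpass.index
airpass_cubic_smoothed_10 = sm.GLM(airpass, transformed_x10).fit()
fig = plt.figure(figsize = (10, 6))
_ = plt.plot(airpass, label = "Actual")
_ = plt.plot(airpass_cubic_smoothed.fittedvalues, label = "Natural cubic spline, df = 3")
_ = plt.plot(airpass_cubic_smoothed_10.fittedvalues, label = "Natural cubic spline, df = 10")
_ = plt.legend()
plt.show()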

Finally, we can compare all of these smoothing methods:

fig = plt.figure(figsize = (10, 6))
_ = plt.plot(airpass, label = "Actual")
_ = plt.plot(airpass_ks_smoothed, label = "Kernel Smoothing")
_ = plt.plot(airpass_ma_smoothed, label = "Moving Average")
_ = plt.plot(airpass_lowess_smoothed, label = "LOWESS, default (2/3) obs. smoothing")
_ = plt.plot(airpass_cubic_smoothed.fittedvalues, label = "Cubic Smoothing Splines")
_ = plt.legend()
plt.show()