#Import the required modules for vectors and matrix operations, data generation
import numpy as np
#Import the required modules for plot creation:
import matplotlib.pyplot as plt
#import the required modules for TimeSeries data generation:
import statsmodels.api as sm
#Import the required modules for test statistic calculation:
import statsmodels.stats as sm_stat
#Import the required modules for model estimation:
import statsmodels.tsa as smt
#Import formula specification
import statsmodels.formula.api as smf
# Import pandas dataset
import pandas as pd
We will also need some custom functions to make plotting easier:
def tsdisplay(y, figsize = (14, 8), title = "", lags = 20):
    tmp_data = pd.Series(y)
    fig = plt.figure(figsize = figsize)
    # Plot the time series:
    tmp_data.plot(ax = fig.add_subplot(311), title = "$Time\ Series\ " + title + "$", legend = False)
    # Plot the ACF:
    sm.graphics.tsa.plot_acf(tmp_data, lags = lags, zero = False, ax = fig.add_subplot(323))
    plt.xticks(np.arange(1, lags + 1, 1.0))
    # Plot the PACF:
    sm.graphics.tsa.plot_pacf(tmp_data, lags = lags, zero = False, ax = fig.add_subplot(324))
    plt.xticks(np.arange(1, lags + 1, 1.0))
    # Plot the QQ plot of the data:
    sm.qqplot(tmp_data, line = 's', ax = fig.add_subplot(325))
    plt.title("QQ Plot")
    # Plot the histogram of the data:
    fig.add_subplot(326).hist(tmp_data, bins = 40, density = True)
    plt.title("Histogram")
    # Fix the layout of the plots:
    plt.tight_layout()
    plt.show()
def tsdiag(y, figsize = (14, 8), title = "", lags = 10):
    # The data:
    tmp_data = pd.Series(y)
    # The Ljung-Box test results for the first k lags:
    tmp_acor = list(sm_stat.diagnostic.acorr_ljungbox(tmp_data, lags = lags, boxpierce = True))
    # Get the p-values:
    p_vals = pd.Series(tmp_acor[1])
    # Start the index from 1 instead of 0 (because the Ljung-Box test is for lag values from 1 to k):
    p_vals.index += 1
    fig = plt.figure(figsize = figsize)
    # Plot the p-values:
    p_vals.plot(ax = fig.add_subplot(313), linestyle = '', marker = 'o', title = "p-values for Ljung-Box statistic", legend = False)
    # Add the horizontal 0.05 critical value line:
    plt.axhline(y = 0.05, color = 'blue', linestyle = '--')
    # Annotate the p-value points above and to the left of the vertex:
    x = np.arange(p_vals.size) + 1
    for X, Y, Z in zip(x, p_vals, p_vals):
        plt.annotate(round(Z, 4), xy = (X, Y), xytext = (-5, 5), ha = 'left', textcoords = 'offset points')
    plt.show()
    # Return the statistics:
    col_index = ["Ljung-Box: X-squared", "Ljung-Box: p-value", "Box-Pierce: X-squared", "Box-Pierce: p-value"]
    return pd.DataFrame(tmp_acor, index = col_index, columns = range(1, len(tmp_acor[0]) + 1))
Note that these functions are not necessary, but they make plotting much easier, since we are always passing a pandas Series and will frequently need to plot the data and its autocorrelation, as well as test whether the autocorrelation is significant.
We will show an example with the airpass and shampoo datasets.
We will begin by reading in the data:
airpass = sm.datasets.get_rdataset("AirPassengers", "datasets")
We can examine the R documentation of this dataset with:
print(airpass.__doc__)
Finally, we need to get the data itself and transform it into a time series:
airpass = pd.Series(airpass.data["value"])
airpass.index = pd.date_range(start = "1949-01", periods = len(airpass.index), freq = "M").to_period()
airpass.index = airpass.index.to_timestamp()
airpass.head()
fig = plt.figure(figsize = (14, 5))
airpass.plot(ax = fig.add_subplot(111))
plt.show()
We note a couple of visual indications of non-stationarity:
- The mean of the process appears to increase as time increases;
- The variance of the process (i.e. the fluctuations) appears to be smaller at the beginning and much larger at the end of the time series;
- The monthly data appears to exhibit a seasonal pattern.
We can view this seasonality by plotting each year as a separate plot.
Note that since we have a time series variable
airpass.index
We can extract its year and month:
print(airpass.index.year)
print(airpass.index.month)
which we can use to transpose the data:
data_to_plot = pd.pivot_table(airpass.to_frame(), index = airpass.index.month, columns = airpass.index.year)
data_to_plot.head()
and plot the seasonal plots
fig = plt.figure(figsize = (20, 8))
data_to_plot.plot(ax = fig.add_subplot(111))
plt.show()
We see that there appears to be an increase in passengers between June and August each year, with the fewest airline passengers in February and November of each year.
This appears to be a deterministic (i.e. non-random and repeating) pattern.
Overall, we have that the series is not stationary.
One way to check for autocorrelation is to examine the ACF and PACF plots, which we can do using our own tsdisplay
function:
tsdisplay(airpass)
To test for significant autocorrelation we can use the Ljung-Box test. We will again opt to use our own function, as it is a bit more convenient:
tsdiag(airpass)
Since all the p-values are less than the 0.05 critical value, we reject the null hypothesis of no autocorrelation and conclude that the series is autocorrelated.
We also see from the ACF plot that:
Since we have monthly data, where every $12^{th}$ lag exhibits a stronger autocorrelation, the seasonal period is $d = 12$. Usually the seasonal period equals the frequency of the time series.
We again note that the variance of the process (i.e. the fluctuations) appears to be smaller at the beginning and much larger at the end of the time series. This indicates that our series may be multiplicative: $Y_t = T_t \cdot S_t \cdot E_t$.
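Taking logarithms turns a multiplicative decomposition into an additive one for the transformed series:
$$ \log Y_t = \log T_t + \log S_t + \log E_t $$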
Since in practice it is much easier to deal with an additive series, $Y_t = T_t + S_t + E_t$, we will transform the data by taking logarithms:
log_passengers = np.log(airpass)
tsdisplay(log_passengers)
The series appears to now have a constant variance - the fluctuations at the beginning appear to be similar to those at the end of the time series.
Note that the minimum value in our series is a positive value:
print(airpass.min())
hence we can take the logarithms of our data.
We will begin by examining a simple decomposition method based on averaging (more specifically, we select the number of values used for the averaging and "move" the averaging window based on time $t$). This is known as the moving-average method of decomposition.
We will begin by estimating the trend component of the series, using the moving-average method as an example.
If we select the length of the moving average as an odd number, for example $l = 3$, then the moving averages
$$ \widehat{T}_t = \dfrac{Y_{t-1} + Y_t + Y_{t+1}}{3} $$
are calculated for integer times $t$.
If the time series contains a seasonal component and we want to average it out, the length of the moving average must be equal to the seasonal frequency (for a monthly series, the length is $l = 12$). However, there is a slight hurdle - suppose our time series begins in January ($t = 1$) and we average up to December ($t = 12$). This average corresponds to a time $t = 6.5$ - a time between June and July.
When we come to estimate seasonal effects, we need a moving-average at integer times. This can be achieved by averaging the average of January to December and the average of February ($t = 2$) to January ($t = 13$). This average of the two moving averages corresponds to $t = 7$ and the process is called centering.
Thus, the trend at time $t$ can be estimated by the centered moving average:
Two-sided averaging: $$ \begin{aligned} \widehat{T}_t &= \dfrac{(Y_{t-6} + ... + Y_{t+5})/12 + (Y_{t-5} + ... + Y_{t+6})/12}{2} = \dfrac{(1/2)Y_{t-6} + Y_{t-5} + ... + Y_{t+5} + (1/2)Y_{t+6}}{12} \end{aligned} $$
One-sided averaging: $$ \begin{aligned} \widehat{T}_t &= \dfrac{(Y_{t-12} + ... + Y_{t-1})/12 + (Y_{t-11} + ... + Y_{t})/12}{2} = \dfrac{(1/2)Y_{t-12} + Y_{t-11} + ... + Y_{t-1} + (1/2)Y_{t}}{12} \end{aligned} $$
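To make the two-sided weights above concrete, here is a minimal sketch (assuming log_passengers is the log-transformed series created above) that computes the centered moving average for $l = 12$ directly with np.convolve; up to the 6 missing values at each end, it should agree with the statsmodels decomposition used below:
# Centered MA weights for l = 12: 1/24 at the ends, 1/12 for the 11 middle values (13 weights in total)
w = np.concatenate(([1 / 24], np.repeat(1 / 12, 11), [1 / 24]))
# "valid" convolution drops 6 observations at each end of the series:
trend_manual = np.convolve(log_passengers.values, w, mode = "valid")
trend_manual = pd.Series(np.concatenate((np.full(6, np.nan), trend_manual, np.full(6, np.nan))),
                         index = log_passengers.index, name = "Centered MA, l = 12")
trend_manual.head(10)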
Note that the weights are different for the first and last elements used in the averaging. We can use statsmodels.tsa.seasonal.seasonal_decompose to automatically decompose our series.
We will begin with a two-sided decomposition:
decomposition_1 = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", freq = 12, two_sided = True)
trend_1 = decomposition_1.trend
We can plot this decomposition alongside the time series
plt.figure(figsize = (15, 6))
log_passengers.plot(label = "Log of Airpassengers")
trend_1.plot(label = "Two-sided decomposition of the trend")
plt.legend()
plt.show()
Since we used a two-sided decomposition, we cannot calculate the trend for the first and last 6 months of our monthly time series.
On the other hand, if we specify a one-sided decomposition, then we will not be able to calculate the trend for the first 12 months:
decomposition_2 = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", freq = 12, two_sided = False)
trend_2 = decomposition_2.trend
plt.figure(figsize = (15, 6))
log_passengers.plot(label = "Log of Airpassengers")
trend_2.plot(label = "One-sided decomposition of the trend")
plt.legend()
plt.show()
We would like to stress the importance of correctly selecting the period of the series. If we select a lower period value, then we will begin to capture the seasonal part with our trend filter.
For example, selecting the moving average length $l = 3$ yields:
decomposition_3 = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", freq = 3, two_sided = False)
trend_3 = decomposition_3.trend
plt.figure(figsize = (10, 6))
log_passengers.plot(label = "Log of Airpassengers")
trend_3.plot(label = "One-sided decomposition of the trend with period d = 3")
trend_2.plot(label = "One-sided decomposition of the trend with period d = 12")
plt.legend()
plt.show()
This can be explained by looking at the formulas for the moving average decomposition: we are using fewer observations around time $t$, so our estimated trend values are closer to the overall series values. However, in doing so, we are not estimating the trend component correctly. As such, it is important to correctly determine whether your data exhibits a seasonal component and, if so, to correctly determine the seasonal frequency.
Furthermore, specifying too large of a period frequency length, say $l = 24$, yields:
decomposition_4 = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", freq = 24, two_sided = False)
trend_4 = decomposition_4.trend
plt.figure(figsize = (10, 6))
log_passengers.plot(label = "Log of Airpassengers")
trend_4.plot(label = "One-sided decomposition of the trend with period d = 24")
trend_2.plot(label = "One-sided decomposition of the trend with period d = 12")
plt.legend()
plt.show()
Notice that too large of a period may not capture the trend as accurately. Again, since we take more values, covering more than one seasonal period, we average over more than one period, and hence our trend estimate is less accurate.
We could also try to specify our own function to decompose the trend only:
import math as math
def moving_average_decompose(y, l = 12):
    tmp_index = y.index
    # Time series values:
    y = np.array(y)
    # Trend components:
    t_two_side = np.zeros(y.size)
    t_one_side = np.zeros(y.size)
    tmp_val = 0
    # Loop through each time point:
    for j in range(0, y.size):
        ###########################################################
        # The length is even - use centered averaging:
        if l % 2 == 0:
            # The centered averaging method weights - see the formula example with l = 12.
            tmp_weights = np.concatenate([np.array([1 / (l * 2)]),
                                          np.repeat(1 / l, l - 1),
                                          np.array([1 / (l * 2)])])
            # Two-sided
            if j + 1 <= l / 2 or j + 1 > y.size - l / 2:
                # Not enough data to the left or right of the series to calculate the trend at time t = j:
                t_two_side[j] = np.nan
            else:
                tmp_val = y[int(j - (l / 2)):int(j + (l / 2 + 1))] * tmp_weights
                t_two_side[j] = tmp_val.sum()
            # One-sided
            if j + 1 <= l:
                t_one_side[j] = np.nan
            else:
                tmp_val = y[int(j - l):int(j + 1)] * tmp_weights
                t_one_side[j] = tmp_val.sum()
        ###########################################################
        # The length is odd:
        else:
            # For the non-centered averaging the weights are simply 1 / l, i.e. we take the mean of l values.
            # Two-sided
            if j + 1 <= math.floor(l / 2) or j + 1 > y.size - math.floor(l / 2):
                # Not enough data to the left or right of the series to calculate the trend at time t = j:
                t_two_side[j] = np.nan
            else:
                tmp_val = y[int(j - math.floor(l / 2)):int(j + (math.floor(l / 2) + 1))]
                t_two_side[j] = tmp_val.mean()
            # One-sided
            if j + 1 < l:
                # Not enough data to the left of the series:
                t_one_side[j] = np.nan
            else:
                tmp_val = y[int(j - l + 1):int(j + 1)]
                t_one_side[j] = tmp_val.mean()
    # Return the results as time series with the original index:
    t_one_side = pd.Series(t_one_side, name = "One-Sided MA with length = " + str(l))
    t_one_side.index = tmp_index
    t_two_side = pd.Series(t_two_side, name = "Two-Sided MA with length = " + str(l))
    t_two_side.index = tmp_index
    return {"Two-sided": t_two_side, "One-sided": t_one_side}
Here we have applied the formulas for calculating the moving average for both odd and even length of the period.
We can test our function as follows:
decomposition_custom_1 = moving_average_decompose(log_passengers)
decomposition_custom_2 = moving_average_decompose(log_passengers, l = 3)
decomposition_custom_3 = moving_average_decompose(log_passengers, l = 24)
plt.figure(figsize = (20, 10))
log_passengers.plot()
decomposition_custom_1["One-sided"].plot()
decomposition_custom_1["Two-sided"].plot()
decomposition_custom_2["Two-sided"].plot()
decomposition_custom_3["One-sided"].plot()
plt.legend()
plt.show()
Furthermore, we may wish to compare it with the built-in function:
comparison = decomposition_custom_1["Two-sided"] - trend_1
print("Two-Sided MA, l = 12: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
comparison = decomposition_custom_1["One-sided"] - trend_2
print("One-Sided MA, l = 12: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
comparison = decomposition_custom_2["One-sided"] - trend_3
print("One-Sided MA, l = 3: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
comparison = decomposition_custom_3["One-sided"] - trend_4
print("One-Sided MA, l = 24: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
Note that the smallest and largest values are approximately zero. This means that our calculations are approximately the same as the ones provided by statsmodels.
Having estimated the trend, we can move on to estimating the seasonal effect.
We will decompose the seasonal effect using both trend averaging methods and examine how the decomposition fits the data. We will employ the following algorithm:
1. Calculate the initial seasonal estimate by removing the trend: $\widetilde{S}_t^{(0)} = Y_t - \widehat{T}_t$;
2. For each season $j = 1, ..., d$, average the values of $\widetilde{S}_t^{(0)}$ that correspond to season $j$;
3. Subtract the mean of these $d$ averages from each of them, so that the seasonal effects sum to zero;
4. Repeat the resulting $d$ seasonal values across the whole sample to obtain $\widetilde{S}_t$.
Since we have already calculated the trend, we can implement the rest of the procedure (under the assumption that the time series starts from the first season, i.e. January for monthly data) as follows:
def decomp_seas(y, trend, period = 12, title = ""):
    # Initial seasonal component estimation (the detrended series):
    seas_0 = y - trend
    seas_1 = np.zeros(period)
    # Seasonal component averaging:
    for j in range(0, period):
        # Select every j-th month from each year in the dataset:
        seas_1[j] = seas_0[j::period].mean()
    # Adjustment factor, so that the seasonal effects sum to zero:
    w = seas_1.mean()
    seas_1 = seas_1 - w
    # Save the seasonal component in a time series format:
    seasonal_results = np.zeros(y.size)
    for j in range(0, period):
        # Select every j-th month from each year in the dataset:
        seasonal_results[j::period] = seas_1[j]
    seasonal_results = pd.Series(seasonal_results, name = "Seasonal " + title)
    seasonal_results.index = trend.index
    return seasonal_results
We can now test it and estimate the seasonal component for both one-sided and two-sided trend decomposition:
seas_results = {"One-sided": decomp_seas(y = log_passengers, trend = decomposition_custom_1["One-sided"], period = 12, title = "(One-sided)"),
"Two-sided": decomp_seas(y = log_passengers, trend = decomposition_custom_1["Two-sided"], period = 12, title = "(Two-sided)")}
We can compare how the seasonal components differ for one-sided and two-sided methods:
plt.figure(figsize = (15, 6))
seas_results["One-sided"].plot()
seas_results["Two-sided"].plot()
plt.legend()
plt.tight_layout()
plt.show()
Note that by construction of the algorithm $\widetilde{S}_t = \widetilde{S}_{t+d}$, meaning we have the seasonal component decomposed for the whole series, unlike the one-sided and two-sided decomposition of the trend component.
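As a quick sanity check, we can verify this periodicity numerically - the estimated seasonal component repeats every 12 observations:
print(np.allclose(seas_results["One-sided"].values[:12], seas_results["One-sided"].values[12:24]))
print(np.allclose(seas_results["Two-sided"].values[:12], seas_results["Two-sided"].values[12:24]))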
Furthermore, we can add the seasonality component to the trend component to get the overall deterministic component $\widehat{T}_t + \widetilde{S}_t$:
T_S = {"One-sided": decomposition_custom_1["One-sided"] + seas_results["One-sided"], "Two-sided": decomposition_custom_1["Two-sided"] + seas_results["Two-sided"]}
T_S["One-sided"].name = "Trend + Season"
T_S["Two-sided"].name = "Trend + Season"
Finally, we want to see how all of these components look alongside one another and the whole time series:
fig = plt.figure(figsize = (15, 8))
log_passengers.plot(ax = fig.add_subplot(211), label = "Log of Airpassengers", title = "One-sided decomposition")
decomposition_custom_1["One-sided"].plot()
T_S["One-sided"].plot()
plt.legend()
#
log_passengers.plot(ax = fig.add_subplot(212), label = "Log of Airpassengers", title = "Two-sided decomposition")
decomposition_custom_1["Two-sided"].plot()
T_S["Two-sided"].plot()
plt.legend()
#
plt.tight_layout()
plt.show()
The decomposed trend plus seasonality fits the data much better than the trend component alone. Furthermore, the two-sided decomposition appears to fit the data more closely than the one-sided method. The downside is that we do not have the trend, and consequently the trend + seasonality, for the last 6 months of the series.
We can do the same estimation using the results from smt.seasonal.seasonal_decompose, which we have already calculated before. First, we note that the estimated seasonal component does not differ from the one we estimated manually:
comparison = seas_results["Two-sided"] - decomposition_1.seasonal
print("Two-Sided MA, l = 12: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
comparison = seas_results["One-sided"] - decomposition_2.seasonal
print("One-Sided MA, l = 12: max. difference = ", round(comparison.max(), 5), "; min. difference = ", round(comparison.min(), 5))
So, the manual estimation method is identical to the built-in method provided in statsmodels.
We can also plot the output from smt.seasonal.seasonal_decompose:
fig, ax = plt.subplots(4, 1, figsize=(15, 8))
# Plot the series
decomposition_1.observed.plot(ax = ax[0])
decomposition_1.trend.plot(ax = ax[1])
decomposition_1.seasonal.plot(ax = ax[2])
decomposition_1.resid.plot(ax = ax[3])
# Add the labels to the Y-axis
ax[0].set_ylabel('Observed')
ax[1].set_ylabel('Trend')
ax[2].set_ylabel('Seasonal')
ax[3].set_ylabel('Residual')
# Fix layout
plt.tight_layout()
plt.show()
Or, if we want to plot the same-style plot as before:
fig = plt.figure(figsize = (15, 6))
#
log_passengers.plot(ax = fig.add_subplot(211), label = "Log of Airpassengers", title = "One-sided decomposition")
decomposition_2.trend.plot(label = "Trend")
(decomposition_2.seasonal + decomposition_2.trend).plot(label = "Trend + Season")
plt.legend()
#
log_passengers.plot(ax = fig.add_subplot(212), label = "Log of Airpassengers", title = "Two-sided decomposition")
decomposition_1.trend.plot(label = "Trend")
(decomposition_1.seasonal + decomposition_1.trend).plot(label = "Trend + Season")
plt.legend()
#
plt.tight_layout()
plt.show()
As mentioned before, this is exactly the same method that we have used in our manual calculation.
Finally, we can check whether the remainder is stationary. We can either calculate it manually:
fig = plt.figure(figsize = (15, 4))
(log_passengers - T_S["One-sided"]).plot(ax = fig.add_subplot(121), title = "One-sided decomposition Residuals")
(log_passengers - T_S["Two-sided"]).plot(ax = fig.add_subplot(122), title = "Two-sided decomposition Residuals")
plt.tight_layout()
plt.show()
or extract it from the smt.seasonal.seasonal_decompose output:
fig = plt.figure(figsize = (15, 4))
decomposition_2.resid.plot(ax = fig.add_subplot(121), title = "One-sided decomposition Residuals")
decomposition_1.resid.plot(ax = fig.add_subplot(122), title = "Two-sided decomposition Residuals")
plt.tight_layout()
plt.show()
We note that the residuals might not be stationary - we can compare the variances from different time periods:
print("Two-sided decomp. Variance for [1949-01-01 - 1954-01-01]: ", round(decomposition_1.resid.loc['19490101':'19540101'].var(), 6))
print("Two-sided decomp. Variance for [1954-02-01 - 1960-12-01]: ", round(decomposition_1.resid.loc['19540201':'19600101'].var(), 6))
print("-----------------------------------------------------------")
print("One-sided decomp. Variance for [1949-01-01 - 1954-01-01]: ", round(decomposition_2.resid.loc['19490101':'19540101'].var(), 6))
print("One-sided decomp. Variance for [1954-02-01 - 1960-12-01]: ", round(decomposition_2.resid.loc['19540201':'19600101'].var(), 6))
The residuals in the second half of the time series appear to have a smaller variance.
Let's also look at the residual correlation plots:
fig = plt.figure(figsize = (14, 8))
fig = sm.graphics.tsa.plot_acf(decomposition_1.resid.dropna(), zero = False, lags=40, ax = fig.add_subplot(221), title = "ACF of Two-sided decomposition Residuals")
fig = sm.graphics.tsa.plot_acf(decomposition_2.resid.dropna(), zero = False, lags=40, ax = fig.add_subplot(222), title = "ACF of One-sided decomposition Residuals")
#
fig = sm.graphics.tsa.plot_pacf(decomposition_1.resid.dropna(), zero = False, lags=40, ax = fig.add_subplot(223), title = "PACF of Two-sided decomposition Residuals")
fig = sm.graphics.tsa.plot_pacf(decomposition_2.resid.dropna(), zero = False, lags=40, ax = fig.add_subplot(224), title = "PACF of One-sided decomposition Residuals")
#
plt.tight_layout()
plt.show()
The residual series is not White Noise, so we would need to try to estimate an $ARMA$ model for it, as long as the residuals are stationary. Unfortunately, it seems that the two-sided decomposition still leaves a seasonality effect, as shown by the oscillating residual ACF plot.
Looking at the residual plots from this decomposition, we note that the residuals do not appear to be stationary for the two-sided decomposition method on this data - the variance is smaller in the middle of the data compared to the beginning of the time series. The same seems to hold for the one-sided decomposition residuals.
We can see that this decomposition method does not fully eliminate the deterministic components from this specific time series data sample.
Note that this method does not allow us to produce forecasts.
We will now examine a number of alternative decomposition methods, which allow us to forecast the series as well as decompose it. These methods are usually more robust and can capture seasonality and trend patterns which may not be accounted for via the moving-average decomposition.
The OLS method estimates the coefficients of an additive model. If needed, we can also specify not only a linear but also a polynomial trend. We will estimate three different models via OLS:
- a quadratic trend model: $Y_t = \beta_0 + \beta_1 t + \beta_2 t^2 + \epsilon_t$;
- a quadratic trend model with monthly seasonal dummies: $Y_t = \beta_0 + \beta_1 t + \beta_2 t^2 + \gamma_1 \text{dm}_{1,t} + ... + \gamma_{11} \text{dm}_{11,t} + \epsilon_t$;
- the same model for the logarithm of the series: $\log(Y_t) = \beta_0 + \beta_1 t + \beta_2 t^2 + \gamma_1 \text{dm}_{1,t} + ... + \gamma_{11} \text{dm}_{11,t} + \epsilon_t$,
where $\text{dm}_{i,t} = 1$ if the month of observation $t$ is $i$, and $\text{dm}_{i,t} = 0$ otherwise.
Recall, that in order to avoid the dummy variable trap (a scenario in which the independent variables are multicollinear (highly correlated)) we have to exclude one dummy variable - in this case $\text{dm}_{12,t}$ - from our regression models.
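To illustrate the trap, here is a small sketch (using its own hypothetical month labels rather than the dm matrix constructed below): a design matrix containing a constant and all 12 monthly dummies is rank deficient, while dropping one dummy restores full column rank:
# 12 years of monthly labels 1, ..., 12:
months = np.tile(np.arange(1, 13), 12)
# Constant + all 12 monthly dummies - the dummies sum to the constant column:
D_full = np.column_stack([np.ones(months.size)] + [(months == m).astype(float) for m in range(1, 13)])
# Constant + 11 dummies (the December dummy dropped):
D_restricted = D_full[:, :-1]
print(np.linalg.matrix_rank(D_full), "independent columns out of", D_full.shape[1])
print(np.linalg.matrix_rank(D_restricted), "independent columns out of", D_restricted.shape[1])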
We begin by examining the initial (non-logged) data once more:
tsdisplay(airpass)
So it seems to be either an exponentially or a polynomially growing time series. We need to create additional dummy/indicator variables:
AP = airpass
#Create trend
time = np.array(range(1, len(airpass.index) + 1))
time_sq = time ** 2
#Create a dummy variable matrix of zeroes:
dm = np.column_stack(np.zeros((11, airpass.size)))
#Create a DataFrame
dm = pd.DataFrame(dm)
#Add values to dummy variables:
for j in range(0, len(dm.columns)):
    # Select every j-th month from each year in the dataset:
    dm.iloc[j::12, j] = 1
dm.head()
Finally, we can combine everything into one data matrix:
dtf = np.column_stack((airpass, time, time_sq, dm))
#Create a DataFrame
dtf = pd.DataFrame(dtf)
#Rename the columns:
dtf.columns = ["AP"] + ["time"] + ["time_sq"] + ["dm_" + str(number) for number in range(1, 12)]
dtf.index = [str(number) for number in range(1, len(dtf.index) + 1)]
dtf.head(13)
To make the model specification a bit easier, we will define these formulas:
formula_2 = "AP ~ " + " + ".join(dtf.columns.intersection(dtf.columns.difference(["AP"])))
print(formula_2)
formula_3 = "np.log(AP) ~ " + " + ".join(dtf.columns.intersection(dtf.columns.difference(["AP"])))
print(formula_3)
Now we can estimate the models.
AP_OLS_1 = smf.ols(formula = 'AP ~ time + time_sq', data = dtf)
print(AP_OLS_1.fit().summary().tables[1])
All of the coefficients in the first, quadratic trend model are statistically significant.
AP_OLS_2 = smf.ols(formula = formula_2, data = dtf)
print(AP_OLS_2.fit().summary().tables[1])
While some of the dummy variables (e.g. January and February) are not statistically significantly different from zero (and as such, those months are not significantly different from the baseline month of December), most of the remaining dummy variables are statistically significant, so we can conclude that the seasonal effect is significant in our data.
AP_OLS_3 = smf.ols(formula = formula_3, data = dtf)
print(AP_OLS_3.fit().summary().tables[1])
In the final model we "linearized" the time series by taking the logarithm. We see that we come to the same conclusions regarding coefficient significance as in the second model.
To calculate the fitted values, we first save the fitted models:
fit_1 = AP_OLS_1.fit()
fit_2 = AP_OLS_2.fit()
fit_3 = AP_OLS_3.fit()
Finally, it would be interesting to predict the series with each model. For that, we need to create a new sample of explanatory variables for prediction and plotting.
An important distinction for time series forecasting is that we need future values of the exogenous variables. Thankfully, we have only used deterministic variables - the time trend and the months - hence we know their exact values in the future. So, we construct the exogenous variable matrix which we will use for forecasting.
We will begin by taking the first 36 observations from our dataset and changing their index to be for the future values
dtf_forc = dtf.iloc[0:36, 1:].copy()
dtf_forc.index = [str(number) for number in range(len(dtf.index) + 1, len(dtf.index) + len(dtf_forc) + 1)]
For comparison - the last few observations of the historical data:
dtf.tail()
and the first few observations for the future forecast calculation
dtf_forc.head()
While we have changed the index, we still need to change the time and time_sq variables to reflect the future values of time:
dtf_forc["time"] = np.array(range(int(dtf_forc.index[0]), int(dtf_forc.index[-1]) + 1))
dtf_forc["time_sq"] = dtf_forc["time"] ** 2
dtf_forc.head()
Note the following: the historical data ends in December, so the first forecast, at $T+1 = 145$, will be for January - this means that $\text{dm_1}_{T+1} = 1$ (the other $dm$ variables are zero). For $T+2$, $\text{dm_2}_{T+2} = 1$ (the other $dm$ variables are zero), and so on. This is the reason that we took the first observations of our historical dataset - we do not need to assign different values to the dm variables.
On the other hand, if our forecasts did not start at the beginning of a season, we would need to change the dummy values accordingly. For example, take the dm_1 column:
dtf_forc.loc[:, ["dm_1"]].head(15).T
First, we assign zeros to the dm_1 column:
dtf_forc.loc[:, ["dm_1"]] = 0
dtf_forc.loc[:, ["dm_1"]].head(15).T
The column dm_1 is indexed as:
dtf_forc.columns.get_loc("dm_1")
We select the first index item (i.e. at index = 0), for which dm_1 = 1, then select every 12th observation and assign 1:
dtf_forc.iloc[0::12, dtf_forc.columns.get_loc("dm_1")] = 1
dtf_forc.loc[:, ["dm_1"]].head(15).T
If the first forecast $T+1$ was not for January but, say, February, then we would need to assign the values something like this:
for j, k in zip(range(0, 12), ["dm_" + str(number) for number in range(2, 12)]):
    print("First index of " + k + " is " + str(j))
dtf_forc.iloc[0::4, :]
dtf_forc.iloc[11, :]
dtf_forc.iloc[11 + 12, :]
Note that the 9th index is for November (dm_11), so the 10th index element would be December. However, we do not include the December dummy variable, hence the first index for January (dm_1) is 11.
Now assume that the first forecast $T+1$ is for June; then the indexes are:
for j, k in zip(range(0, 6), ["dm_" + str(number) for number in range(6, 12)]):
    print("First index of " + k + " is " + str(j))
Then index = 6 corresponds to December (which has no dummy), so the remaining first indexes are:
for j, k in zip(range(7, 12), ["dm_" + str(number) for number in range(1, 6)]):
    print("First index of " + k + " is " + str(j))
Going back to our example, to assign the values where the first forecast is for January, we do the following:
for j, k in zip(range(0, 12), ["dm_" + str(number) for number in range(1, 12)]):
    print("First index of " + k + " is " + str(j))
    # Reset the whole dummy column to zero:
    dtf_forc.loc[:, k] = 0
    # Set every 12th observation, starting from index j, to 1:
    dtf_forc.iloc[j::12, dtf_forc.columns.get_loc(k)] = 1
If we check our dataset, we can verify that this is correct
dtf_forc.head(24)
You should have no problem modifying and combining the previously provided code for different starting months.
Next, we create forecasts for each model, similarly to how we did it for cross-sectional data:
#Calculate forecasts
AP_forc_1 = fit_1.predict(dtf_forc)
AP_forc_2 = fit_2.predict(dtf_forc)
AP_forc_3 = fit_3.predict(dtf_forc)
Then we can combine the output into a single Time Series with fitted and forecast values:
#Combine into a single time series with FITTED and FORECAST values:
AP_fited_1 = np.concatenate((fit_1.predict(), np.array(AP_forc_1)))
AP_fited_1 = pd.Series(AP_fited_1)
Note that the index of the series is a generic index - it indicates neither the year nor the frequency:
AP_fited_1.index
We can correct it as follows (we also add a name for the series to make plotting easier):
#Add the correct time index:
AP_fited_1.index = pd.date_range(start = "1949-01", periods = len(AP_fited_1.index), freq = "M").to_period()
AP_fited_1.index = AP_fited_1.index.to_timestamp()
AP_fited_1.name = "Fitted"
AP_fited_1.index
For the remaining models, we can use the existing corrected index of AP_fited_1:
AP_fited_2 = np.concatenate((fit_2.predict(), np.array(AP_forc_2)))
AP_fited_2 = pd.Series(AP_fited_2)
AP_fited_2.index = AP_fited_1.index
AP_fited_2.name = AP_fited_1.name
AP_fited_3 = np.concatenate((fit_3.predict(), np.array(AP_forc_3)))
AP_fited_3 = pd.Series(AP_fited_3)
AP_fited_3.index = AP_fited_1.index
AP_fited_3.name = AP_fited_1.name
Finally, we can plot the predicted time series alongside the actual data:
fig = plt.figure(figsize = (20, 6))
AP_fited_1.plot(ax = fig.add_subplot(131), linestyle = "--", title = "Quadratic")
airpass.plot(label = "Actual")
plt.legend()
#
AP_fited_2.plot(ax = fig.add_subplot(132), linestyle = "--", title = "Quadratic + Seasonal for Airpass")
airpass.plot(label = "Actual")
plt.legend()
#
AP_fited_3.plot(ax = fig.add_subplot(133), linestyle = "--", title = "Quadratic + Seasonal for log(Airpass)")
log_passengers.plot(label = "Actual")
plt.legend()
#
plt.tight_layout()
plt.show()
While the graph in the center attempts to capture the seasonality effect, and does so better than the left-side graph, we can see that the actual data fluctuations increase in size together with the level of airpass, so an additive model is unable to capture this effect. The common approach to this problem is to take logarithms and estimate the additive model (right-side graph).
Exponential smoothing is a technique that can be applied to times series data, either to produce smoothed data for presentation or to make forecasts.
While in the moving average decomposition the past observations are weighted equally, exponential functions are used to assign exponentially decreasing weights over time. It is an easily applied procedure for making some determination based on prior assumptions by the user, such as seasonality. Exponential smoothing is often used for analysis of time-series data.
We state the simple exponential smoothing procedure as an algorithm for converting the observed series $Y_t$ into a smoothed series $\widehat{Y}_t$, $t = 1,...,T$, and forecasts $\widehat{Y}_{T+h,T}$:
Initialize at $t = 1$: $\widehat{Y}_1 = Y_1$;
Update for $t > 1$: $\widehat{Y}_t = \alpha Y_t + (1 - \alpha) \widehat{Y}_{t-1}$, where $0 < \alpha < 1$;
Forecast: $\widehat{Y}_{T+h,T} = \widehat{Y}_T$, $h = 1, 2, ...$
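Expanding this recursion makes the exponentially decreasing weights explicit:
$$ \widehat{Y}_t = \alpha Y_t + \alpha(1-\alpha) Y_{t-1} + \alpha(1-\alpha)^2 Y_{t-2} + ... + \alpha(1-\alpha)^{t-2} Y_2 + (1-\alpha)^{t-1} Y_1 $$
so each observation receives a weight that decays geometrically with its age.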
We will create a function which both smooths and forecasts a time series:
def exp_smooth_simple(pars, y_t, return_type = "simple", forc_period = 12):
    a = pars[0]
    T = len(y_t)
    new_y = np.zeros(T)
    for i in range(0, T):
        if i == 0:
            # The first element (when i = 0) is unchanged:
            new_y[i] = y_t[i]
        else:
            new_y[i] = a * y_t[i] + (1.0 - a) * new_y[i-1]
    # Forecast ahead - the forecast is constant and equal to the last smoothed value:
    y_forc = np.zeros(forc_period)
    for j in range(0, forc_period):
        y_forc[j] = new_y[T - 1]
    if return_type == "simple":
        return new_y
    else:
        return {"Y": new_y, "Forecast": y_forc}
The smaller $\alpha$ is, the smoother the estimated level. As $\alpha$ approaches 0, the smoothed series approaches a constant. As $\alpha$ approaches 1, the smoothed series approaches point-by-point interpolation.
smoothed_data_1 = exp_smooth_simple([0.1], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast")
smoothed_data_2 = exp_smooth_simple([0.5], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast")
smoothed_data_3 = exp_smooth_simple([0.8], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast")
We will combine the fitted and forecast output
smoothed_data_1 = pd.Series(np.append(smoothed_data_1["Y"], smoothed_data_1["Forecast"]))
smoothed_data_2 = pd.Series(np.append(smoothed_data_2["Y"], smoothed_data_2["Forecast"]))
smoothed_data_3 = pd.Series(np.append(smoothed_data_3["Y"], smoothed_data_3["Forecast"]))
smoothed_data_1.name = "alpha = 0.1"
smoothed_data_2.name = "alpha = 0.5"
smoothed_data_3.name = "alpha = 0.8"
Change its time index to have a year and month instead of a generic index
smoothed_data_1.index = pd.date_range(start = "1949-01", periods = len(smoothed_data_1.index), freq = "M").to_period()
smoothed_data_1.index = smoothed_data_1.index.to_timestamp()
#
smoothed_data_2.index = smoothed_data_3.index = smoothed_data_1.index
Finally, we can plot the results as follows:
plt.figure(figsize = (20, 8))
plt.plot(log_passengers, label = "Actual")
plt.plot(smoothed_data_1, linestyle = "--")
plt.plot(smoothed_data_2, linestyle = "--")
plt.plot(smoothed_data_3, linestyle = "--", color = "red")
plt.legend()
plt.show()
Thankfully, the simple exponential smoothing is also implemented in statsmodels
via statsmodels.tsa.holtwinters.SimpleExpSmoothing(...):
airpass_smpl_exp = smt.holtwinters.SimpleExpSmoothing(log_passengers)
We can select different smoothing levels:
airpass_smpl_exp_1 = airpass_smpl_exp.fit(smoothing_level = 0.1)
airpass_smpl_exp_2 = airpass_smpl_exp.fit(smoothing_level = 0.5)
airpass_smpl_exp_3 = airpass_smpl_exp.fit(smoothing_level = 0.8)
and plot the actual, fitted and forecast values as well:
# Suppress FutureWarning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
plt.figure(figsize = (20, 8))
plt.plot(log_passengers, label = "Actual")
airpass_smpl_exp_1.fittedvalues.plot(label = "alpha = 0.1", linestyle = "--")
airpass_smpl_exp_2.fittedvalues.plot(label = "alpha = 0.5", linestyle = "--")
airpass_smpl_exp_3.fittedvalues.plot(label = "alpha = 0.8", linestyle = "--")
airpass_smpl_exp_1.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1")
airpass_smpl_exp_2.forecast(steps = 12).plot(label = "Forecast: alpha = 0.5")
airpass_smpl_exp_3.forecast(steps = 12).plot(label = "Forecast: alpha = 0.8")
plt.legend()
plt.show()
Now imagine that we have not only a slowly evolving local level, but also a trend with a slowly evolving local slope. In such cases simple exponential smoothing does not do well. Then the optimal smoothing algorithm (Holt’s Linear Method) is as follows:
Initialize at $t = 2$:
1.1 $\widehat{Y_2} = Y_2$;
1.2 $F_2 = Y_2 - Y_1$;
Update for $t > 2$:
2.1 $\widehat{Y_t} = \alpha Y_t + (1- \alpha)\cdot (\widehat{Y}_{t-1} + F_{t-1})$, where $0 < \alpha < 1$;
2.2 $F_t = \beta (\widehat{Y}_t - \widehat{Y}_{t-1}) + (1 - \beta) F_{t-1}$, where $0 < \beta < 1$;
Forecast: $\widehat{Y}_{T+h,T} = \widehat{Y}_T + h \cdot F_T$
Here $F_t$ is the best estimate of the trend and $\widehat{Y}_t$ is the smoothed series.
We can also estimate this manually:
def exp_smooth_double(pars, y_t, return_type = "simple", forc_period = 12):
    a = pars[0]
    b = pars[1]
    T = len(y_t)
    new_y = np.zeros(T)
    new_f = np.zeros(T)
    for i in range(0, T):
        if i == 0:
            new_y[i] = y_t[i]
            new_f[i] = y_t[i]
        elif i == 1:
            new_y[i] = y_t[i]
            new_f[i] = y_t[i] - y_t[i-1]
        else:
            new_y[i] = a * y_t[i] + (1.0 - a) * (new_y[i-1] + new_f[i-1])
            new_f[i] = b * (new_y[i] - new_y[i-1]) + (1.0 - b) * new_f[i-1]
    # Forecast ahead using the last smoothed level and trend:
    y_forc = np.zeros(forc_period)
    for j in range(0, forc_period):
        y_forc[j] = new_y[T - 1] + (j + 1) * new_f[T - 1]
    if return_type == "simple":
        return new_y
    else:
        return {"Y": new_y, "F": new_f, "Forecast": y_forc}
The parameter $\alpha$ controls smoothing of the level and $\beta$ controls smoothing of the slope.
We will present some examples with different $\alpha$ parameter values:
#Example of double exponential smoothing with different alpha parameters:
smoothed_data_1 = pd.Series(exp_smooth_double([0.1, 0.1], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast"))
smoothed_data_2 = pd.Series(exp_smooth_double([0.3, 0.1], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast"))
smoothed_data_3 = pd.Series(exp_smooth_double([0.6, 0.1], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast"))
smoothed_data_1 = pd.Series(np.append(smoothed_data_1["Y"], smoothed_data_1["Forecast"]))
smoothed_data_2 = pd.Series(np.append(smoothed_data_2["Y"], smoothed_data_2["Forecast"]))
smoothed_data_3 = pd.Series(np.append(smoothed_data_3["Y"], smoothed_data_3["Forecast"]))
smoothed_data_1.name = "alpha = 0.1, beta = 0.1"
smoothed_data_2.name = "alpha = 0.3, beta = 0.1"
smoothed_data_3.name = "alpha = 0.6, beta = 0.1"
smoothed_data_1.index = pd.date_range(start = "1949-01", periods = len(smoothed_data_1.index), freq = "M").to_period()
smoothed_data_1.index = smoothed_data_1.index.to_timestamp()
smoothed_data_2.index = smoothed_data_3.index = smoothed_data_1.index
fig = plt.figure(figsize = (20, 10))
plt.plot(log_passengers)
plt.plot(smoothed_data_1, linestyle = "--")
plt.plot(smoothed_data_2, linestyle = "--")
plt.plot(smoothed_data_3, linestyle = "--", color = "red")
plt.legend()
plt.show()
We can also see how the smoothing changes depending on the $\beta$ value:
#Example of double exponential smoothing with different beta parameters:
smoothed_data_1 = pd.Series(exp_smooth_double([0.1, 0.01], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast"))
smoothed_data_2 = pd.Series(exp_smooth_double([0.1, 0.3], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast"))
smoothed_data_3 = pd.Series(exp_smooth_double([0.1, 0.9], np.array(pd.Series.tolist(log_passengers)), return_type = "forcast"))
smoothed_data_1 = pd.Series(np.append(smoothed_data_1["Y"], smoothed_data_1["Forecast"]))
smoothed_data_2 = pd.Series(np.append(smoothed_data_2["Y"], smoothed_data_2["Forecast"]))
smoothed_data_3 = pd.Series(np.append(smoothed_data_3["Y"], smoothed_data_3["Forecast"]))
smoothed_data_1.name = "alpha = 0.1, beta = 0.01"
smoothed_data_2.name = "alpha = 0.1, beta = 0.3"
smoothed_data_3.name = "alpha = 0.1, beta = 0.9"
smoothed_data_1.index = pd.date_range(start = "1949-01", periods = len(smoothed_data_1.index), freq = "M").to_period()
smoothed_data_1.index = smoothed_data_1.index.to_timestamp()
smoothed_data_2.index = smoothed_data_3.index = smoothed_data_1.index
fig = plt.figure(figsize = (20, 10))
plt.plot(log_passengers)
plt.plot(smoothed_data_1, linestyle = "--")
plt.plot(smoothed_data_2, linestyle = "--")
plt.plot(smoothed_data_3, linestyle = "--", color = "red")
plt.legend()
plt.show()
The double exponential smoothing method is also implemented in statsmodels.tsa.holtwinters.Holt(...):
airpass_smpl_exp_x2 = smt.holtwinters.Holt(log_passengers)
airpass_smpl_exp_x2_11 = airpass_smpl_exp_x2.fit(smoothing_level = 0.1, smoothing_slope = 0.1)
airpass_smpl_exp_x2_12 = airpass_smpl_exp_x2.fit(smoothing_level = 0.3, smoothing_slope = 0.1)
airpass_smpl_exp_x2_13 = airpass_smpl_exp_x2.fit(smoothing_level = 0.6, smoothing_slope = 0.1)
airpass_smpl_exp_x2_21 = airpass_smpl_exp_x2.fit(smoothing_level = 0.1, smoothing_slope = 0.01)
airpass_smpl_exp_x2_22 = airpass_smpl_exp_x2.fit(smoothing_level = 0.1, smoothing_slope = 0.3)
airpass_smpl_exp_x2_23 = airpass_smpl_exp_x2.fit(smoothing_level = 0.1, smoothing_slope = 0.9)
fig = plt.figure(figsize = (20, 8))
fig.add_subplot(121)
plt.plot(log_passengers, label = "Actual")
airpass_smpl_exp_x2_11.fittedvalues.plot(label = "alpha = 0.1, beta = 0.1", linestyle = "--")
airpass_smpl_exp_x2_12.fittedvalues.plot(label = "alpha = 0.3, beta = 0.1", linestyle = "--")
airpass_smpl_exp_x2_13.fittedvalues.plot(label = "alpha = 0.6, beta = 0.1", linestyle = "--")
airpass_smpl_exp_x2_11.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1, beta = 0.1")
airpass_smpl_exp_x2_12.forecast(steps = 12).plot(label = "Forecast: alpha = 0.3, beta = 0.1")
airpass_smpl_exp_x2_13.forecast(steps = 12).plot(label = "Forecast: alpha = 0.6, beta = 0.1")
plt.legend()
#
fig.add_subplot(122)
plt.plot(log_passengers, label = "Actual")
airpass_smpl_exp_x2_21.fittedvalues.plot(label = "alpha = 0.1, beta = 0.01", linestyle = "--")
airpass_smpl_exp_x2_22.fittedvalues.plot(label = "alpha = 0.1, beta = 0.3", linestyle = "--")
airpass_smpl_exp_x2_23.fittedvalues.plot(label = "alpha = 0.1, beta = 0.9", linestyle = "--")
airpass_smpl_exp_x2_21.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1, beta = 0.01")
airpass_smpl_exp_x2_22.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1, beta = 0.3")
airpass_smpl_exp_x2_23.forecast(steps = 12).plot(label = "Forecast: alpha = 0.1, beta = 0.9")
plt.legend()
plt.show()
A method known as the Holt-Winters method is based on three smoothing equations - one for the level, one for the trend and one for seasonality. It is similar to Holt's linear method, with one additional equation dealing with seasonality.
Triple exponential smoothing applies exponential smoothing three times, which is commonly used when there are three high frequency signals to be removed from a time series under study. The use of a triple application is considered a rule of thumb technique, rather than one based on theoretical foundations and has often been over-emphasized by practitioners.
The smoothing algorithm (where the seasonal effect is additive) is as follows:
Initialize:
1.1 Set $\widehat{Y}_L = \dfrac{Y_1 + ... + Y_L}{L}$
1.2 The general formula for the initial trend: $F_L = \dfrac{1}{L} \left(\dfrac{Y_{L+1} - Y_1}{L} + \dfrac{Y_{L+2} - Y_2}{L} + ... \dfrac{Y_{L+L} - Y_L}{L} \right)$
1.3 Suppose a cycle of seasonal change is of length L so that $G_t = G_{t+L}$ and $N$ is the number of complete cycles in the data. As a rule of thumb, a minimum of two full seasons (or 2L periods) of historical data is needed to initialize a set of seasonal factors. The initial estimates for the seasonal indices: $$ \begin{aligned} G_i &= \dfrac{1}{N}\sum_{j = 1}^N{\dfrac{Y_{L(j-1)+i}}{A_j}}, \quad i = 1,...,L\\ \text{where: } A_j &= \dfrac{1}{L} \sum_{k = 1}^L Y_{L(j-1)+k}, \quad j = 1,...,N \end{aligned} $$
i.e. $A_j$ is the average value of $Y_t$ in the $j$-th cycle of your data and $G_i$ is the average of the weighted values of the $i$-th seasonal index over all cycles.
For example, if we have monthly data covering $N$ years, then $A_1$ would be the average value of the first 12 months $Y_1,...,Y_{12}$; $A_2$ - the average of $Y_{13},...,Y_{24}$, etc.
Then $G_1$ would be the average of the weighted values $\dfrac{Y_1}{A_1}, \dfrac{Y_{12 + 1}}{A_2},..., \dfrac{Y_{12 (N-1) + 1}}{A_N}$, $G_2$ - the average of $\dfrac{Y_2}{A_1}, \dfrac{Y_{12 + 2}}{A_2},..., \dfrac{Y_{12 (N-1) + 2}}{A_N}$, etc.
Two possible scenarios:
2.1 $S_t = \alpha \left( Y_t - G_{t-L}\right) + (1- \alpha)\cdot (S_{t-1} + F_{t-1})$, where $0 < \alpha < 1$;
2.2 $F_t = \beta (S_t - S_{t-1}) + (1 - \beta) F_{t-1}$, where $0 < \beta < 1$;
2.3 $G_t = \gamma \left( Y_t-S_{t} \right) + (1 - \gamma) G_{t-L}$
2.4 Forecast: $\widehat{Y}_{t+h} = \widehat{Y}_t + h \cdot F_t + G_{t + h - L}$
2.1 $S_t = \alpha \left( \dfrac{Y_t}{G_{t-L}}\right) + (1- \alpha)\cdot (S_{t-1} + F_{t-1})$, where $0 < \alpha < 1$;
2.2 $F_t = \beta (S_t - S_{t-1}) + (1 - \beta) F_{t-1}$, where $0 < \beta < 1$;
2.3 $G_t = \gamma \left( \dfrac{Y_t}{S_t} \right) + (1 - \gamma) G_{t-L}$
2.4 Forecast: $\widehat{Y}_{t+h} = (\widehat{Y}_t + h \cdot F_t) \cdot G_{t + h - L}$
Here $F_t$ is the best estimate of the trend, $G_t$ is the best estimate of the seasonal factors and $S_t$ is the smoothed series.
It is also available in statsmodels as statsmodels.tsa.holtwinters.ExponentialSmoothing(...).
We will not re-implement this method manually as it somewhat varies across different software.
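Nevertheless, to make the initialization formulas (1.1 - 1.3) above more concrete, here is a minimal sketch of just that step for our monthly series (using the multiplicative seasonal indices, as in the formula for $G_i$ above); this is only an illustration, not a full implementation of the method:
L = 12                          # seasonal period
y = log_passengers.values
N = y.size // L                 # number of complete cycles
# 1.1 Initial level - the average of the first cycle:
level_init = y[:L].mean()
# 1.2 Initial trend - the average month-on-month change between the first two cycles:
trend_init = np.mean((y[L:2 * L] - y[:L]) / L)
# 1.3 Initial seasonal indices - cycle averages A_j and the averaged ratios G_i:
A = np.array([y[L * j:L * (j + 1)].mean() for j in range(N)])
G = np.array([np.mean(y[i::L] / A) for i in range(L)])
print(round(level_init, 4), round(trend_init, 4))
print(np.round(G, 4))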
airpass_smpl_exp_hw = smt.holtwinters.ExponentialSmoothing(log_passengers)
airpass_smpl_exp_hw_1 = airpass_smpl_exp_hw.fit()
airpass_smpl_exp_hw2 = smt.holtwinters.ExponentialSmoothing(log_passengers, seasonal = "additive", seasonal_periods = 12)
airpass_smpl_exp_hw_2 = airpass_smpl_exp_hw2.fit()
airpass_smpl_exp_hw3 = smt.holtwinters.ExponentialSmoothing(log_passengers, seasonal = "multiplicative", seasonal_periods = 12)
airpass_smpl_exp_hw_3 = airpass_smpl_exp_hw3.fit()
fig = plt.figure(figsize = (20, 8))
fig.add_subplot(111)
plt.plot(log_passengers, label = "Actual")
airpass_smpl_exp_hw_1.fittedvalues.plot(label = "Holt-Winter's Exponential Smoothing: No season specification", linestyle = "--")
airpass_smpl_exp_hw_2.fittedvalues.plot(label = "Holt-Winter's Exponential Smoothing: with additive season specification", linestyle = "--")
airpass_smpl_exp_hw_3.fittedvalues.plot(label = "Holt-Winter's Exponential Smoothing: with multiplicative season specification", linestyle = "--")
airpass_smpl_exp_hw_1.forecast(steps = 12).plot(label = "HW: no season spec")
airpass_smpl_exp_hw_2.forecast(steps = 12).plot(label = "HW: with additive season spec", marker = "x")
airpass_smpl_exp_hw_3.forecast(steps = 12).plot(label = "HW: with multiplicative season spec", marker = "x")
plt.legend()
plt.show()
(In R this can be carried out via the ets() function.)
The easiest way to compare the methods, besides looking at the plots, is to examine the residuals by looking at the residual series and ACF plots - are the residuals stationary? Are they white noise (WN)?
resid_OLS = fit_3.resid
tsdisplay(resid_OLS, lags = 30)
Note that only the first couple of lags are significant, while the later ones are not as significant.
We can also examine the residuals of the double exponential smoothing with $\alpha = 0.6$ and $\beta = 0.1$ as it appears to be the best fit:
resid_exp_x2 = log_passengers - airpass_smpl_exp_x2_13.fittedvalues
tsdisplay(resid_exp_x2, lags = 30)
Note that when using double exponential smoothing, there appears to be a seasonal or cyclical effect, as every so often we see a spike in the lags. Overall, the double exponential smoothing method was ineffective in capturing the seasonality effect.
Finally, the Holt-Winters exponential smoothing method with a multiplicative seasonal component appears to have the best residuals (in terms of their white noise property):
resid_hw = log_passengers - airpass_smpl_exp_hw_3.fittedvalues
tsdisplay(resid_hw, lags = 30)
Note that they may still be autocorrelated (again, the point of the decomposition is to separate the deterministic component from the irregular, stationary component, which is not necessarily white noise):
tsdiag(resid_hw)
but this can be solved by fitting an ARMA model on the residuals:
resid_mdl_order = sm.tsa.stattools.arma_order_select_ic(resid_hw, ic = 'aic', trend = 'nc')
resid_mdl_order
resid_mdl_order.aic_min_order
So, the residuals would need an $ARMA(1, 1)$ model. We will leave this as an additional exercise. Note that the final fitted values would need to be:
$ \text{Fitted values from Holt-Winter smoothing } + \text{ Fitted values from the residual ARMA model} $
Similarly the overall series forecast would be:
$ \text{Forecast from Holt-Winter smoothing } + \text{ Forecast from the residual ARMA model} $
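A minimal sketch of this combination (left as an exercise above) might look as follows; it uses sm.tsa.statespace.SARIMAX purely as a convenient way to fit an ARMA(1, 1) model on the Holt-Winters residuals, and assumes the indexes of the fitted and forecast series align:
# Fit an ARMA(1, 1) (no differencing, no seasonal part) on the Holt-Winters residuals:
resid_arma_fit = sm.tsa.statespace.SARIMAX(resid_hw, order = (1, 0, 1)).fit()
# Combined fitted values: Holt-Winters fitted values + ARMA fitted values of the residuals:
combined_fit = airpass_smpl_exp_hw_3.fittedvalues + resid_arma_fit.fittedvalues
# Combined forecast: Holt-Winters forecast + ARMA forecast of the residuals:
combined_forc = airpass_smpl_exp_hw_3.forecast(steps = 12) + resid_arma_fit.forecast(steps = 12)
combined_forc.head()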
Next we will examine a number of additional decomposition methods, which are no less important than the previously introduced methods.
In order to fit the seasonal model, we first need to remove the trend component from the series (unless the series is a $DS$ series - something which will be examined in the next lecture).
We choose to use the residuals of the polynomial trend regression, but we will estimate the model on $\log(Y_t)$, since it appears that the series is multiplicative:
AP_OLS_trend = smf.ols(formula = 'np.log(AP) ~ time + time_sq', data = dtf)
AP_OLS_trend_fit = AP_OLS_trend.fit()
print(AP_OLS_trend_fit.summary().tables[1])
tsdisplay(AP_OLS_trend_fit.resid, lags = 25)
It appears the trend is estimated relatively well. The residuals appear to have a seasonal component but also a constant variance.
In order to model the seasonality, we will estimate a $\rm SARIMA(p,d,q)(P,D,Q)_s$ model, restricting ourselves to $d = D = 0$ (again, we will examine models for differences in the following lectures).
Similarly to ARMA models, a lower lag order is usually sufficient. For example, if we wanted to estimate a $\rm SARIMA(2,0,2)(1,0,1)_{12}$ model (we select $s = 12$, since the data is monthly):
resid_sarima = sm.tsa.statespace.SARIMAX(AP_OLS_trend_fit.resid, order=(2, 0, 2), seasonal_order=(1, 0, 1, 12))
resid_sarima_fit = resid_sarima.fit()
print(resid_sarima_fit.summary())
Note that sigma2 in the coefficients table is the estimate of the variance of the error term.
If we check the residuals of our model:
tsdisplay(resid_sarima_fit.resid)
we see that they appear to be uncorrelated. We do note that the variance of the residuals seems larger at the start of the series.
If we wanted to create our own simple automatic order selection, we can do so by editing our previous ARMA order selection code (from the GARCH example notebook):
warnings.filterwarnings("ignore", category = FutureWarning)
warnings.filterwarnings("ignore", category = RuntimeWarning)
warnings.filterwarnings("ignore", category = sm.tools.sm_exceptions.ValueWarning)
warnings.filterwarnings("ignore", category = sm.tools.sm_exceptions.HessianInversionWarning)
warnings.filterwarnings("ignore", category = sm.tools.sm_exceptions.ConvergenceWarning)
# Specify the best AIC - default is infinite - so anything lower is better
best_aic = None
# Specify the best order
best_order = None
# Specify the best model
best_mdl = None
# Loop through different (p, q) combinations, starting from SARMA(0, 0)(0, 0), to SARMA(1, 1)(1, 1)
pq_rng = range(2) # range(5) would be [0,1,2,3,4]
for p in pq_rng:
    for q in pq_rng:
        for P in pq_rng:
            for Q in pq_rng:
                try:
                    tmp_mdl = sm.tsa.statespace.SARIMAX(AP_OLS_trend_fit.resid,
                                                        order = (p, 0, q),
                                                        seasonal_order = (P, 0, Q, 12))
                    tmp_mdl_fit = tmp_mdl.fit()
                    tmp_aic = tmp_mdl_fit.aic
                    if best_aic == None:
                        best_aic = tmp_aic
                        best_order = {"Nonseasonal": (p, 0, q), "Seasonal": (P, 0, Q)}
                        best_mdl = tmp_mdl
                        print("Fitted an ARMA(" + str(p) + ", " + str(q) + ")(" +
                              str(P) + ", " + str(Q) + ")_12 model - it was better")
                    elif tmp_aic < best_aic:
                        best_aic = tmp_aic
                        best_order = {"Nonseasonal": (p, 0, q), "Seasonal": (P, 0, Q)}
                        best_mdl = tmp_mdl
                        print("Fitted an ARMA(" + str(p) + ", " + str(q) + ")(" +
                              str(P) + ", " + str(Q) + ")_12 model - it was better")
                    else:
                        print("Fitted an ARMA(" + str(p) + ", " + str(q) + ")(" +
                              str(P) + ", " + str(Q) + ")_12 model - it was worse")
                except:
                    continue
For faster estimation, we have opted to fit a SARMA with a maximum lag order of 1 for all lags. The resulting model:
print(tmp_mdl_fit.summary())
tsdisplay(tmp_mdl_fit.resid)
This seems similar, in terms of residuals, to our higher order seasonal ARMA specification. If we compare the AIC/BIC values:
pd.DataFrame([[resid_sarima_fit.aic, tmp_mdl_fit.aic],
[resid_sarima_fit.bic, tmp_mdl_fit.bic]],
index = ["AIC", "BIC"], columns = ["Manual", "Auto"])
We see that the automatic specification has a lower AIC and BIC value, indicating that the $\rm SARIMA(1, 0, 1)(1, 0, 1)_{12}$ is the better model.
We can write the model using the lag functions as:
$(1 - 0.7873 L)\cdot (1 - 0.9900 L^{12})Y_t = (1 - 0.1896 L)\cdot (1 - 0.5948 L^{12})\epsilon_t$
Firstly, we will calculate the fitted values.
In general, the fitted values can be calculated by first calculating the fitted values of the deterministic components, $\widehat{T}_t$ and $\widehat{S}_t$, and of the remainder model, $\widehat{E}_t$.
Then, we calculate the fitted values of the original series as $\widehat{Y}_t = \widehat{T}_t + \widehat{S}_t + \widehat{E}_t$.
In our case, we have estimated a SARMA model for $E_t$, which includes the seasonal component, and the trend $\widehat{T}_t$ comes from the OLS trend regression for $\log(Y_t)$.
Hence $\widehat{Y}_t = \exp \left( \widehat{T}_t + \widehat{E}_{t}^{(SARMA)} \right)$.
Y_fit = np.exp(AP_OLS_trend_fit.fittedvalues + resid_sarima_fit.fittedvalues)
Y_fit = pd.Series(Y_fit)
Y_fit.index = airpass.index
Y_trend = np.exp(AP_OLS_trend_fit.fittedvalues)
Y_trend = pd.Series(Y_trend)
Y_trend.index = airpass.index
fig = plt.figure(figsize = (20, 6))
ax1 = fig.add_subplot(111)
#
airpass.plot(label = "Actual", ax = ax1)
Y_fit.plot(label = "Fitted", ax = ax1)
Y_trend.plot(label = "Estimated Trend", ax = ax1)
plt.title("Actual vs fitted plot")
plt.legend()
plt.show()
Next, we will calculate the forecasts for the trend component. We could do as before, and take an existing subset of dtf
and modify its values, but for this example, we will create a variable matrix for forecasts from scratch.
We will begin by looking at the last values of time in the design matrix which we used for fitting the OLS regression:
dtf.tail()
So, we will need the new series to start from 145. Since the OLS model was fitted only on the trend, we will only need two columns - time and time_sq:
time_forc = np.array(list(range(145, 145 + 20)))
dt_forc = pd.DataFrame([time_forc, time_forc**2], index = ["time", "time_sq"])
dt_forc.head()
We need to transpose the dataset so that the columns correspond to different variables:
dt_forc = dt_forc.T
dt_forc.head()
Now, we can calculate the forecast of the trend:
trend_forc = AP_OLS_trend_fit.predict(dt_forc)
trend_forc.head()
Note that this is the trend for the $\log(Y)_t$ series and not the original series.
To calculate the forecast of the $SARIMA$ model:
resid_forc = resid_sarima_fit.forecast(steps = len(dt_forc))
resid_forc.head()
which is also for the $\log(Y)_t$ series.
The forecasts of the trend and of the $SARMA$ model need to have the same index. Furthermore, since these are forecasts (and not fitted values, like before), the index needs to start after the last index value of the historical data, so:
# the last date in the historical data
airpass.index[-1]
forc_index = pd.date_range(start = airpass.index[-1] + pd.DateOffset(months = 1), periods = len(resid_forc.index), freq = "M").to_period()
forc_index = forc_index.to_timestamp()
forc_index
resid_forc.index = forc_index
trend_forc.index = forc_index
In order to calculate the forecast for $Y_t$, we do as before and either calculate $\exp ( \widehat{T}_t + \widehat{E}_t )$, or equivalently $\exp ( \widehat{T}_t ) \cdot \exp (\widehat{E}_t )$.
np.exp(resid_forc + trend_forc).head()
(np.exp(resid_forc) * np.exp(trend_forc)).head()
We will use the first method:
Y_forc = np.exp(resid_forc + trend_forc)
pd.Series(Y_forc)
fig = plt.figure(figsize = (20, 6))
ax1 = fig.add_subplot(111)
#
airpass.plot(label = "Actual", ax = ax1)
Y_fit.plot(label = "Fitted", ax = ax1)
Y_forc.plot(label = "Forecast", ax = ax1)
#
plt.legend()
plt.show()
Here we will present some of the additional smoothing methods mentioned during the lectures.
We need to pass the data as a float type, otherwise we would get a Buffer dtype mismatch, expected 'DOUBLE' but got 'long long' error.
We will use the KernelReg function, which estimates the conditional mean $\mathbb{E}(Y|X)$, where $Y = g(X) + \epsilon$.
Note that the "local constant" type of regression provided here is known as Nadaraya-Watson kernel regression - which is exactly what we need for the kernel smoothing method:
airpass_ks = sm.nonparametric.KernelReg(endog = airpass, exog = time, var_type = "c", reg_type = "lc")
airpass_ks_fit = airpass_ks.fit()
airpass_ks_smoothed = pd.Series(airpass_ks_fit[0])
airpass_ks_smoothed.index = airpass.index
We can also compare it with the moving-average smoothing method (i.e. the MA decomposition):
airpass_ma = smt.seasonal.seasonal_decompose(airpass, model = "multiplicative", freq = 12, two_sided = False)
airpass_ma_smoothed = airpass_ma.trend
The plot of the two smoothing methods side-by-side:
plt.figure(figsize = (20, 6))
#
plt.plot(airpass, label = "Actual")
plt.plot(airpass_ks_smoothed, label = "Kernel Smoothing")
plt.plot(airpass_ma_smoothed, label = "Moving Average")
#
plt.legend()
plt.show()
airpass_lowess = sm.nonparametric.lowess(endog = airpass, exog = time)
airpass_lowess_smoothed = pd.Series(airpass_lowess[:, 1])
airpass_lowess_smoothed.index = airpass.index
By default, $66.67\%$ of the data is used for each value.
We can also adjust the fraction of the data used when estimating each value, e.g. $5\%$:
airpass_lowess_5pct = sm.nonparametric.lowess(endog = airpass, exog = time, frac = 0.05)
airpass_lowess_5pct_smoothed = pd.Series(airpass_lowess_5pct[:, 1])
airpass_lowess_5pct_smoothed.index = airpass.index
plt.figure(figsize = (20, 8))
#
plt.plot(airpass, label = "Actual")
plt.plot(airpass_ks_smoothed, label = "Kernel Smoothing")
plt.plot(airpass_ma_smoothed, label = "Moving Average")
plt.plot(airpass_lowess_smoothed, label = "LOWESS, default (2/3) obs. smoothing")
plt.plot(airpass_lowess_5pct_smoothed, label = "LOWESS, 5% obs. smoothing", color = "red", linestyle = "--")
#
plt.legend()
plt.show()
from patsy import dmatrix
transformed_x2 = dmatrix("cr(time, df = 3)", {"time": time}, return_type = 'dataframe')
transformed_x2.index = airpass.index
where cr() is used for a natural cubic regression spline basis.
airpass_cubic_spline = sm.GLM(airpass, transformed_x2)
airpass_cubic_smoothed = airpass_cubic_spline.fit()
plt.figure(figsize = (20, 4))
#
plt.plot(airpass, label = "Actual")
plt.plot(airpass_cubic_smoothed.fittedvalues, label = "Cubic Smoothing Splines")
#
plt.legend()
plt.show()
Finally, we can compare all of these smoothing methods:
plt.figure(figsize = (20, 8))
#
plt.plot(airpass, label = "Actual")
plt.plot(airpass_ks_smoothed, label = "Kernel Smoothing")
plt.plot(airpass_ma_smoothed, label = "Moving Average")
plt.plot(airpass_lowess_smoothed, label = "LOWESS, default (2/3) obs. smoothing")
plt.plot(airpass_cubic_smoothed.fittedvalues, label = "Cubic Smoothing Splines")
#
plt.legend()
plt.show()