#Import the required modules for vectors and matrix operations, data generation
import numpy as np
#Import the required modules for plot creation:
import matplotlib.pyplot as plt
#import the required modules for TimeSeries data generation:
import statsmodels.api as sm
#Import the required modules for test statistic calculation:
import statsmodels.stats as sm_stat
#Import the required modules for model estimation:
import statsmodels.tsa as smt
#Import formula specification
import statsmodels.formula.api as smf
# Import pandas dataset
import pandas as pd
We will also need some custom functions to make ploting easier:
def tsdisplay(y, figsize = (14, 8), title = "", lags = 20):
tmp_data = pd.Series(y)
fig = plt.figure(figsize = figsize)
#Plot the time series
tmp_data.plot(ax = fig.add_subplot(311), title = "$Time\ Series\ " + title + "$", legend = False)
#Plot the ACF:
sm.graphics.tsa.plot_acf(tmp_data, lags = lags, zero = False, ax = fig.add_subplot(323))
plt.xticks(np.arange(1, lags + 1, 1.0))
#Plot the PACF:
sm.graphics.tsa.plot_pacf(tmp_data, lags = lags, zero = False, ax = fig.add_subplot(324))
plt.xticks(np.arange(1, lags + 1, 1.0))
#Plot the QQ plot of the data:
sm.qqplot(tmp_data, line='s', ax = fig.add_subplot(325))
plt.title("QQ Plot")
#Plot the residual histogram:
fig.add_subplot(326).hist(tmp_data, bins = 40, normed = 1)
plt.title("Histogram")
#Fix the layout of the plots:
plt.tight_layout()
plt.show()
def tsdiag(y, figsize = (14,8), title = "", lags = 10):
#The data:
tmp_data = pd.Series(y)
#The Ljung-Box test results for the first k lags:
tmp_acor = list(sm_stat.diagnostic.acorr_ljungbox(tmp_data, lags = lags, boxpierce = True))
# get the p-values
p_vals = pd.Series(tmp_acor[1])
#Start the index from 1 instead of 0 (because Ljung-Box test is for lag values from 1 to k)
p_vals.index += 1
fig = plt.figure(figsize = figsize)
#Plot the p-values:
p_vals.plot(ax = fig.add_subplot(313), linestyle='', marker='o', title = "p-values for Ljung-Box statistic", legend = False)
#Add the horizontal 0.05 critical value line
plt.axhline(y = 0.05, color = 'blue', linestyle='--')
# Annotate the p-value points above and to the left of the vertex
x = np.arange(p_vals.size) + 1
for X, Y, Z in zip(x, p_vals, p_vals):
plt.annotate(round(Z, 4), xy=(X,Y), xytext=(-5, 5), ha = 'left', textcoords='offset points')
plt.show()
# Return the statistics:
col_index = ["Ljung-Box: X-squared", "Ljung-Box: p-value", "Box-Pierce: X-squared", "Box-Pierce: p-value"]
return pd.DataFrame(tmp_acor, index = col_index, columns = range(1, len(tmp_acor[0]) + 1))
Note that these functions are not necessary, but they make plotting much easier since we are always passing a pandas Series
variable and we will frequently need to plot the data, its autocorrelation, as well as test whether the autocorrelation is significant.
We will show an example with airpass
and shampoo
datasets.
We will begin by reading in the data:
airpass = sm.datasets.get_rdataset("AirPassengers", "datasets")
We can examine the R
documentation of this dataset with
print(airpass.__doc__)
Finally, we need to get the dataitself and transform it into a time series
airpass = pd.Series(airpass.data["value"])
airpass.index = pd.date_range(start = "1949-01", periods = len(airpass.index), freq = "M").to_period()
airpass.index = airpass.index.to_timestamp()
airpass.head()
fig = plt.figure(figsize = (14, 5))
airpass.plot(ax = fig.add_subplot(111))
plt.show()
We note a couple of visuall indications of non-stationarity:
- The mean of the process appears to be increasing as time increases;
- The variance of the process (i.e. the fluctuations) appear to be smaller at the beginning and much larger at the end of the time series
- The monthly data appears to exhibit a seasonality pattern
We can view this seasonality by plotting each year as a separate plot.
Note that since we have a time series variable
airpass.index
We can extract its year
and month
print(airpass.index.year)
print(airpass.index.month)
which we can use to transpose the data:
data_to_plot = pd.pivot_table(airpass.to_frame(), index = airpass.index.month, columns = airpass.index.year)
data_to_plot.head()
and plot the seasonal plots
fig = plt.figure(figsize = (20, 8))
data_to_plot.plot(ax = fig.add_subplot(111))
plt.show()
We see that there appears to be an increase in passengers between June and August yeach year, with the least amount of airline passengers in February and November of each year.
This appears to be a deterministic (i.e. non-random and repeating) pattern.
Overall, we have that the series is not stationary.
One way to check for autocorrelation is to examine the ACF and PACF plots, which we can do using our own tsdisplay
function:
tsdisplay(airpass)
To test for significant autocorrelation with the Ljung-Box
test. We will again opt to use our won function as it is a bit more convenient:
tsdiag(airpass)
Since all the p-value
s are less than the 0.05 critical value, we reject the null hypothesis of no autocorrelation and conclude that the series is autocorrelated.
We also see from the ACF plot that:
Since we have monthly data, where every $12^{th}$ lag exhibits a stronger autocorrelation, the seasonal period is $d = 12$. Usually the seasonal period equal the frequency of the time series.
We again state that the variance of the process (i.e. the fluctuations) appear to be smaller at the beginning and much larger at the end of the time series. This indicated that our series may be multiplicative: $Y_t = T_t \cdot S_t \cdot E_t$.
Since in practice it is much easier to deal with additive series: $Y_t = T_t + S_t + E_t$, we will transform the data by taking the logarithms:
log_passengers = np.log(airpass)
tsdisplay(log_passengers)
The series appears to now have a constant variance - the fluctuations at the beginning appear to be similar to those at the end of the time series.
Note that the minimum value in our series is a positive value:
print(airpass.min())
hence we can take the logarithms of our data.
We will begin by examining a simple decomposition method based on averaging (more specifically we select the amount of values that we will use for the averaging and "move" the averaging window based on time $t$). This is known as the moving-average method for decomposition.
We will begin by estimating the trend component of the series.
We will provide an example of the moving-average method for decomposition.
If we select the length of the moving average as an odd number, for example $l = 3$, then:
are calculated for integer times $t$.
If the time series contains a seasonal component and we want to averate it out, the length of the moving average must be equal to the seasonal frequency (for a monthly series, the length $l = 12$). However, there is a slight hurdle - suppose, our times series begin in January ($t = 1$) and we average up to December ($t = 12$). This average corresponds to a time $t = 6.5$ - a time between June and July.
When we come to estimate seasonal effects, we need a moving-average at integer times. This can be achieved by averaging the average of January to December and the average of February ($t = 2$) to January ($t = 13$). This average of the two moving averages corresponds to $t = 7$ and the process is called centering.
Thus, the trend at time $t$ can be estimated by the centered moving average:
Two-sided averaging: $$ \begin{aligned} \widehat{T}_t &= \dfrac{(Y_{t-6} +...+ Y_{t+5})/12 + (Y_{t-5} +...+ Y_{t+6})/12}{2} = \dfrac{(1/2)Y_{t-6} + Y_{t-5}...+ Y_{t+5}+ (1/2)Y_{t+6}}{12} \end{aligned} $$
One-sided averaging: $$ \begin{aligned} \widehat{T}_t &= \dfrac{(Y_{t-12} +...+ Y_{t-1})/12 + (Y_{t-11} +...+ Y_{t})/12}{2} = \dfrac{(1/2)Y_{t-12} + Y_{t-11}...+ Y_{t-1}+ (1/2)Y_{t}}{12} \end{aligned} $$
Note that the weights are different for the first and last elements used in the averaging. we can use the statsmodels.tsa.seasonal.seasonal_decompose to automatically decompose our series.
We will begin with a two-sided decomposition:
decomposition_1 = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", freq = 12, two_sided = True)
trend_1 = decomposition_1.trend
We can plot this decomposition alongside the time series
plt.figure(figsize = (15, 6))
log_passengers.plot(label = "log of Airpassangers")
trend_1.plot(label = "Two-sided decomposition of the trend")
plt.legend()
plt.show()
Since we used a two-sided decomposition, we cannot calculate the trend for the first and last 6 months of our monthly time series.
On the other hand, if we specify a one-sided decomposition, then we will not be able to calculate the trend for the first 12 months:
decomposition_2 = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", freq = 12, two_sided = False)
trend_2 = decomposition_2.trend
plt.figure(figsize = (15, 6))
log_passengers.plot(label = "log of Airpassangers")
trend_2.plot(label = "One-sided decomposition of the trend")
plt.legend()
plt.show()
We would like to stress the importance of correctly selecting the period of the series. If we select a lower period value, then we will begin to capture the seasonal part with our trend filter.
For example, selecting the moving average length $l = 3$, or selecting too large of a period frequency length, say $l = 19$, yields:
decomposition_3 = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", freq = 3, two_sided = False)
trend_3 = decomposition_3.trend
plt.figure(figsize = (10, 6))
log_passengers.plot(label = "log of Airpassangers")
trend_3.plot(label = "One-sided decomposition of the trend with d = 3")
trend_2.plot(label = "One-sided decomposition of the trend woth period d = 12")
plt.legend()
plt.show()
This can be explained by looking at the formulas for the moving averagedecomposition - this is because we are using fewer observations around time $t$. So, our estimated trend values would be closer to the overall series value. However, in doing so, we are not estimating the trend component correctly. As such, it is important to correctly determine whether your data exhibits a seasonal component and if so - to correctly determine the seasonal frequency.
Furthermore, specifying too large of a period frequency length, say $l = 19$ yields:
decomposition_4 = smt.seasonal.seasonal_decompose(log_passengers, model = "additive", freq = 24, two_sided = False)
trend_4 = decomposition_4.trend
plt.figure(figsize = (10, 6))
log_passengers.plot(label = "log of Airpassangers")
trend_4.plot(label = "One-sided decomposition of the trend with period d = 24")
trend_2.plot(label = "One-sided decomposition of the trend woth period d = 12")
plt.legend()
plt.show()
Notice that too large of a period may not capture the trend as accurately. Again, as we take more values, which cover more than one period, we calculate the average value over more than one period, hence our trend estimate is not as accurate.
We could also try to specify our own function to decompose the trend only:
import math as math
def moving_average_decompose(y, l = 12):
tmp_index = y.index
#Time Series:
y = np.array(y)
#Trend Component:
t_two_side = pd.np.zeros(y.size)
t_one_side = pd.np.zeros(y.size)
tmp_val = 0
#Loop through each time point:
for j in range(0, y.size):
###########################################################
# The length is even - use centered averaging:
if l % 2 == 0:
# The centered averaging method weights - see the formula example with l = 12.
tmp_weights = np.concatenate([np.array([1 / (l * 2)]),
np.repeat(1 / l, l - 1),
np.array([1 / (l * 2)])])
#Two-sided
if j + 1 <= l / 2 or j + 1 > y.size - l / 2:
# We do not have enough data to the left or right of the series to calculate trend at time t = j:
t_two_side[j] = np.nan
else:
tmp_val = y[int(j - (l / 2)):int(j + (l / 2 + 1))] * tmp_weights
t_two_side[j] = tmp_val.sum()
#One-sided
if j + 1 <= l:
t_one_side[j] = np.nan
else:
tmp_val = y[int(j - l):int(j + 1)] * tmp_weights
t_one_side[j] = tmp_val.sum()
###########################################################
# The length is odd:
else:
#For the non-centered averaging the weights are simply the mean, i.e. the length l.
#tmp_weights = np.repeat(1 / l, l)
#Two-sided
if j + 1 <= math.floor(l / 2) or j + 1 > y.size - math.floor(l / 2):
# We do not have enough data to the left or right of the series to calculate trend at time t = j:
t_two_side[j] = np.nan
else:
tmp_val = y[int(j - math.floor(l / 2)):int(j + (math.floor(l / 2) + 1))]
t_two_side[j] = tmp_val.mean()
#One-sided
if j * 2 <= l:
t_one_side[j] = np.nan
else:
tmp_val = y[int(j - l + 1):int(j + 1)]
t_one_side[j] = tmp_val.mean()
#Return the same time series:
t_one_side = pd.Series(t_one_side, name = "One-Sided MA with length = " + str(l))
t_one_side.index = tmp_index
t_two_side = pd.Series(t_two_side, name = "Two-Sided MA with length = " + str(l))
t_two_side.index = tmp_index
return {"Two-sided": t_two_side, "One-sided": t_one_side}
Here we have applied the formulas for calculating the moving average for both odd and even length of the period.
We can test our function as follows:
decomposition_custom_1 = moving_average_decompose(log_passengers)
decomposition_custom_2 = moving_average_decompose(log_passengers, l = 3)
decomposition_custom_3 = moving_average_decompose(log_passengers, l = 24)
plt.figure(figsize = (20, 10))
log_passengers.plot()
decomposition_custom_1["One-sided"].plot()
decomposition_custom_1["Two-sided"].plot()
decomposition_custom_2["Two-sided"].plot()
decomposition_custom_3["One-sided"].plot()
plt.legend()
plt.show()