## 4.11 Example

Considerations

The following libraries will be useful for this exercise.

suppressPackageStartupMessages({
suppressWarnings({
suppressMessages({
library(lmtest)
library(lrmest)
library(tseries)
library(nortest)
library(car)
library(sandwich)
library(lattice)
library(viridisLite)
library(leaps)
})
})
})      
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
#
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.diagnostic as sm_diagnostic
import statsmodels.stats.stattools as sm_tools
import statsmodels.stats as smstats
import statsmodels.stats.outliers_influence as oi
#
from statsmodels.stats.outliers_influence import summary_table
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Additional methods for output formatting
from statsmodels.compat import lzip
# Additional method for custom plot legend creation
from matplotlib.lines import Line2D
#
import seaborn as sns
import skgof as skgof

We will continue using the dataset in the example from section 3.11, namely Dataset 4.

#
#
data_source <- "http://www.principlesofeconometrics.com/poe5/data/csv/cps5_small.csv"
dt4 <- read.csv(file = data_source, sep = ",", dec = ".", header = TRUE)
print(head(dt4, 10))
##    black educ exper faminc female metro midwest south  wage west
## 1      0   13    45      0      1     1       0     0 44.44    1
## 2      0   14    25  45351      1     1       1     0 16.00    0
## 3      0   18    27  91946      1     1       0     0 15.38    0
## 4      0   13    42  48370      0     1       1     0 13.54    0
## 5      0   13    41  10000      1     1       0     0 25.00    1
## 6      0   16    26 151308      1     1       0     0 24.05    0
## 7      0   16    11 110461      1     1       0     1 40.95    0
## 8      0   18    15      0      1     1       1     0 26.45    0
## 9      0   21    32  67084      0     1       1     0 30.76    0
## 10     0   14    12  14000      0     0       0     0 34.57    1
import pandas as pd
#
data_source = "http://www.principlesofeconometrics.com/poe5/data/csv/cps5_small.csv"
# BUG FIX: dt4 was never created on the Python side (only the URL was defined),
# so dt4.head(10) below would raise a NameError. Read the CSV, mirroring the R code.
dt4 = pd.read_csv(data_source)
print(dt4.head(10))
##    black  educ  exper  faminc  female  metro  midwest  south   wage  west
## 0      0    13     45       0       1      1        0      0  44.44     1
## 1      0    14     25   45351       1      1        1      0  16.00     0
## 2      0    18     27   91946       1      1        0      0  15.38     0
## 3      0    13     42   48370       0      1        1      0  13.54     0
## 4      0    13     41   10000       1      1        0      0  25.00     1
## 5      0    16     26  151308       1      1        0      0  24.05     0
## 6      0    16     11  110461       1      1        0      1  40.95     0
## 7      0    18     15       0       1      1        1      0  26.45     0
## 8      0    21     32   67084       0      1        1      0  30.76     0
## 9      0    14     12   14000       0      0        0      0  34.57     1

The variable definitions are as follows:

The variable list is as follows:

• black - =1 if black
• educ - years of education
• exper - potential experience = age - educ - 6
• faminc - other family income, $
• female - =1 if female
• metro - =1 if in metropolitan area
• midwest - =1 if midwest region
• south - =1 if south region
• wage - earnings per hour, $
• west - =1 if west region

Since we are now interested in determining which variables have a significant effect on wage - it is a good idea to check that the data types assigned to each column are correct:

print(str(dt4))
## 'data.frame':    1200 obs. of  10 variables:
##  $ black  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ educ   : int  13 14 18 13 13 16 16 18 21 14 ...
##  $ exper  : int  45 25 27 42 41 26 11 15 32 12 ...
##  $ faminc : int  0 45351 91946 48370 10000 151308 110461 0 67084 14000 ...
##  $ female : int  1 1 1 0 1 1 1 1 0 0 ...
##  $ metro  : int  1 1 1 1 1 1 1 1 1 0 ...
##  $ midwest: int  0 1 0 1 0 0 0 1 1 0 ...
##  $ south  : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ wage   : num  44.4 16 15.4 13.5 25 ...
##  $ west   : int  1 0 0 0 1 0 0 0 0 1 ...
## NULL
print(dt4.dtypes)
## black        int64
## educ         int64
## exper        int64
## faminc       int64
## female       int64
## metro        int64
## midwest      int64
## south        int64
## wage       float64
## west         int64
## dtype: object

and have a quick glance at the summary statistics of each variable:

print(summary(dt4))
##      black             educ          exper           faminc           female         metro           midwest           south            wage             west
##  Min.   :0.0000   Min.   : 0.0   Min.   : 0.00   Min.   :     0   Min.   :0.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :  3.94   Min.   :0.0000
##  1st Qu.:0.0000   1st Qu.:12.0   1st Qu.:12.00   1st Qu.:     0   1st Qu.:0.00   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.: 13.00   1st Qu.:0.0000
##  Median :0.0000   Median :14.0   Median :24.00   Median : 23679   Median :0.00   Median :1.0000   Median :0.0000   Median :0.000   Median : 19.30   Median :0.0000
##  Mean   :0.0875   Mean   :14.2   Mean   :23.37   Mean   : 35304   Mean   :0.44   Mean   :0.8217   Mean   :0.2475   Mean   :0.325   Mean   : 23.64   Mean   :0.2525
##  3rd Qu.:0.0000   3rd Qu.:16.0   3rd Qu.:34.00   3rd Qu.: 53029   3rd Qu.:1.00   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.: 29.80   3rd Qu.:1.0000
##  Max.   :1.0000   Max.   :21.0   Max.   :62.00   Max.   :469000   Max.   :1.00   Max.   :1.0000   Max.   :1.0000   Max.   :1.000   Max.   :221.10   Max.   :1.0000
print(dt4.describe())
##              black         educ        exper         faminc       female        metro     midwest       south         wage         west
## count  1200.000000  1200.000000  1200.000000    1200.000000  1200.000000  1200.000000  1200.00000  1200.00000  1200.000000  1200.000000
## mean      0.087500    14.202500    23.374167   35304.421667     0.440000     0.821667     0.24750     0.32500    23.640042     0.252500
## std       0.282684     2.890811    13.269296   45026.488224     0.496594     0.382953     0.43174     0.46857    15.216554     0.434628
## min       0.000000     0.000000     0.000000       0.000000     0.000000     0.000000     0.00000     0.00000     3.940000     0.000000
## 25%       0.000000    12.000000    12.000000       0.000000     0.000000     1.000000     0.00000     0.00000    13.000000     0.000000
## 50%       0.000000    14.000000    24.000000   23679.000000     0.000000     1.000000     0.00000     0.00000    19.300000     0.000000
## 75%       0.000000    16.000000    34.000000   53029.000000     1.000000     1.000000     0.00000     1.00000    29.800000     1.000000
## max       1.000000    21.000000    62.000000  469000.000000     1.000000     1.000000     1.00000     1.00000   221.100000     1.000000

Everything appears to be in order - we can move on to the tasks.

### 4.11.1 Exercise Set 1

We will begin by dividing our data into a training and test sets:

dt_index <- 1:nrow(dt4)
print(head(dt_index, 5))
## [1] 1 2 3 4 5
dt_index = list(range(0, len(dt4.index)))
print(dt_index[:5])
## [0, 1, 2, 3, 4]

Then $$80\%$$ of the data can be selected by subsetting the indexes:

set.seed(123)
#
smpl_index <- sample(dt_index, size = floor(0.8 * length(dt_index)), replace = FALSE)
print(head(smpl_index, 5))
## [1] 415 463 179 526 195
print(head(sort(smpl_index), 5))
## [1] 2 3 5 6 8
np.random.seed(123)
#
smpl_index = np.random.choice(dt_index, size = int(np.floor(0.8 * len(dt_index))), replace = False)
print(smpl_index[:5])
## [156 920 971 897  35]
print(np.sort(smpl_index)[:5])
## [0 1 4 5 6]

Considerations

Since we are interested in comparing the results from both R and Python - we will simply take the first $$80\%$$ of the observations, with the assumption that the initial dataset file is already randomly shuffled:

smpl_index <- 1:floor(0.8 * length(dt_index))
smpl_index = list(range(0, int(np.floor(0.8 * len(dt_index)))))

In general, it is recommended to take a random sample instead of the first values, unless you are certain that they are already randomly ordered beforehand.

dt4_train <- dt4[smpl_index, ]
dt4_test  <- dt4[setdiff(dt_index, smpl_index), ]
dt4_train = dt4.iloc[smpl_index, :].reset_index(drop = True)
# Use sorted() so the test-set row order is deterministic: iterating a raw set
# has implementation-defined order and should not be relied on for indexing.
dt4_test  = dt4.iloc[sorted(set(dt_index) - set(smpl_index)), ].reset_index(drop = True)

Now we can continue with our tasks using the training and test sets, where appropriate.

#### 4.11.1.1 Task 1

In this example data, we have the following:

• $$Y$$ in our case is wage;
• $$X_j$$ in our case are the remaining variables: black, educ, exper, faminc, female, metro, south, midwest, west.

We will begin by plotting pairwise scatter-plots for non-indicator variables:

We need some additional functions for convenient plotting:

#From: https://stat.ethz.ch/R-manual/R-devel/library/graphics/html/pairs.html
panel.hist <- function(x, ...){
  # Diagonal panel for pairs(): draw a histogram of x scaled to the panel.
  # Based on: https://stat.ethz.ch/R-manual/R-devel/library/graphics/html/pairs.html
  usr <- par("usr"); on.exit(par(usr))
  # Keep the x-range, reserve vertical space [0, 1.5] for the bars
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE, breaks = 30)
  # BUG FIX: the next two statements were fused onto one line with no separator
  # ("nB <- length(breaks) y <- h$counts"), which is a syntax error in R.
  breaks <- h$breaks; nB <- length(breaks)
  y <- h$counts; y <- y / max(y)  # normalise bar heights to [0, 1]
  rect(breaks[-nB], 0, breaks[-1], y, ...)
}
panel.abs_cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
  # Upper panel for pairs(): print |cor(x, y)| in the centre of the panel.
  usr <- par("usr")
  on.exit(par(usr))
  # Work in a fixed [0, 1] x [0, 1] coordinate system for text placement
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y, use = "complete.obs"))
  label <- paste0(prefix, format(c(r, 0.123456789), digits = digits)[1])
  # NOTE(review): cex.cor is computed for compatibility with the classic
  # pairs() help-page example, but the text size below is fixed at 2 —
  # confirm whether the scaled size was intended instead.
  if (missing(cex.cor)) cex.cor <- 0.8 / strwidth(label)
  text(0.5, 0.5, label, cex = 2)
}
pairs(dt4_train[, c('educ','exper', 'faminc', 'wage')],
diag.panel = panel.hist,
lower.panel = panel.smooth, #upper.panel = panel.cor,
col = "dodgerblue4",
pch = 21,
bg = adjustcolor("dodgerblue3", alpha = 0.2))

_ = pd.plotting.scatter_matrix(dt4_train[['educ','exper', 'faminc', 'wage']],
alpha = 0.2, figsize = (20, 15), marker = "o",
hist_kwds = dict(edgecolor = "black", linewidth = 1, bins = 30),
edgecolor = "black")
_ = plt.tight_layout()
plt.show()

Note that the diagonal elements are the histogram of the variables, while the upper and lower triangle of the plot matrix are the scatter-plots of the same variables.

So, we will examine the diagonal plots and the plots in either the upper, or lower, triangle.

From the plots we can say that:

• None of the variables appear to be normally distributed;
• The wage and faminc data appear to be scattered more for larger values of wage.
• There is a clear relationship between: - educ and exper; - educ and faminc; - educ and wage;
• The relationship between exper and faminc is not as clear from the plots;

We also see that the correlation between explanatory variables is weaker, compared to the correlation between educ and the remaining variables.

print(cor(dt4_train[, c('educ', 'exper', 'faminc', 'wage')]))
##              educ       exper      faminc       wage
## educ    1.0000000 -0.22308357  0.12465415 0.48194764
## exper  -0.2230836  1.00000000 -0.02835951 0.05648362
## faminc  0.1246542 -0.02835951  1.00000000 0.12177966
## wage    0.4819476  0.05648362  0.12177966 1.00000000
print(dt4_train[['educ', 'exper', 'faminc', 'wage']].corr())
##             educ     exper    faminc      wage
## educ    1.000000 -0.223084  0.124654  0.481948
## exper  -0.223084  1.000000 -0.028360  0.056484
## faminc  0.124654 -0.028360  1.000000  0.121780
## wage    0.481948  0.056484  0.121780  1.000000

We can also plot the scatter-matrix of the whole dataset with the absolute correlation values:

#
#
#
#
pairs(dt4_train,
diag.panel = panel.hist,
lower.panel = panel.smooth,
upper.panel = panel.abs_cor,
col = "dodgerblue4",
pch = 21,
bg = adjustcolor("dodgerblue3", alpha = 0.2))

axes = pd.plotting.scatter_matrix(dt4_train, alpha = 0.2, figsize = (20, 15), marker = "o",
                                  hist_kwds = dict(edgecolor = "black", linewidth = 1, bins = 30),
                                  edgecolor = "black")
abs_corr = np.abs(dt4_train.corr().values)
# BUG FIX: 'plt.np' depended on matplotlib internally re-exporting numpy, which
# modern matplotlib no longer does — call numpy directly.
for i, j in zip(*np.triu_indices_from(axes, k = 1)):  # triu - TRI-angle U-pper
    # NOTE(review): the tiny axis limits appear chosen to push the scatter
    # points out of view so only the annotation is visible — confirm intent.
    _ = axes[i, j].set_xlim((1.1, 1.12))
    _ = axes[i, j].set_ylim((1.1, 1.12))
    _ = axes[i, j].annotate("%.3f" % abs_corr[i, j], (0.5, 0.5), xycoords = 'axes fraction',
                            ha = 'center', va = 'center', fontsize = 20)
_ = plt.tight_layout()
plt.show()

As we can see, for indicator variables these scatterplots do not show much.

The absolute values of the correlation are useful in determining the overall strength of the linear relationship, regardless of the direction of the correlation.

We may also be interested in checking out the correlation matrix heatmap separately as per 4.9.7.1

myPanel <- function(x, y, z, ...){
  # Level-plot panel that overlays each cell with its value rounded to 4 digits.
  lattice::panel.levelplot(x, y, z, ...)
  # Blank out NA cells instead of printing "NA"
  labels <- ifelse(is.na(z), "", paste0(round(z, 4)))
  lattice::panel.text(x, y, labels)
}
# base colors: terrain.colors(), rainbow(), heat.colors(), topo.colors() or cm.colors()
# Viridis: viridis, magma, inferno, plasma
lattice::levelplot(cor(dt4_train, use = "complete.obs"), panel = myPanel,
col.regions = viridisLite::viridis(100), main = 'Correlation of numerical variables')

import seaborn as sns
#
#
fig = plt.figure(figsize = (10, 8))
_ = plt.title('Correlation of numerical variables', size = 25)
_ = sns.heatmap(dt4_train.corr(), annot = True)
_ = plt.ylim((len(dt4_train.corr()), 0)) # See bug on bottom cutoff: https://github.com/mwaskom/seaborn/issues/1773
#
plt.show()

To make it a bit more readable, we can only plot the lower triangle:

# RECONSTRUCTION(review): the original rendering dropped the masking line and
# the lattice::levelplot(...) call head, leaving dangling arguments (a syntax
# error). Restored from the surrounding text ("only plot the lower triangle")
# and the matching full-matrix call above — please confirm against the source.
mask <- cor(dt4_train, use = "complete.obs")
# Hide the redundant upper triangle; NA cells are left blank by myPanel
mask[upper.tri(mask)] <- NA
lattice::levelplot(mask,
panel = myPanel,
col.regions = viridisLite::viridis(100),
main = 'Correlation of numerical variables')

# Generate a mask for the upper triangle
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24 — use the
# builtin bool as the dtype.
mask = np.zeros_like(dt4_train.corr(), dtype = bool)
# BUG FIX: the mask was all False, so the full matrix was still drawn even
# though the text says only the lower triangle should be shown. k = 1 keeps
# the diagonal visible, matching the R upper.tri() behaviour.
mask[np.triu_indices_from(mask, k = 1)] = True
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap = True)
#
fig = plt.figure(figsize = (10, 8))
_ = plt.title('Correlation of numerical variables', size = 25)
_ = sns.heatmap(dt4_train.corr(), mask = mask, cmap = cmap, annot = True)
_ = plt.ylim((len(dt4_train.corr()), 0)) # See bug on bottom cutoff: https://github.com/mwaskom/seaborn/issues/1773
plt.show()

We see that

• wage and educ are positively correlated. This is the strongest positive correlation in the dataset.
• south, midwest and west would appear to be negatively correlated, however is to be expected - they are indicator variables for the geographical location, so an observation can only have one of these variables with a value of $$1$$ at a time.

Considerations

An important conclusion is that you should avoid using the indicator variables when calculating the correlation. So, we should only be concerned with the correlation matrix of the actual numerical values:

# RECONSTRUCTION(review): as with the full-matrix version above, the rendering
# dropped the masking line and the lattice::levelplot(...) call head, leaving
# dangling arguments (a syntax error). Restored from context — please confirm
# against the original source.
mask <- cor(dt4_train[, c('educ', 'exper', 'faminc', 'wage')])
# Hide the redundant upper triangle; NA cells are left blank by myPanel
mask[upper.tri(mask)] <- NA
lattice::levelplot(mask,
panel = myPanel,
col.regions = viridisLite::viridis(100),
main = 'Correlation of numerical variables')

# Generate a mask for the upper triangle
corr_mat = dt4_train[['educ', 'exper', 'faminc', 'wage']].corr()
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24 — use the
# builtin bool as the dtype.
mask = np.zeros_like(corr_mat, dtype = bool)
# BUG FIX: the mask was all False, masking nothing; mark the upper triangle
# (k = 1 keeps the diagonal) so only the lower triangle is drawn, as stated.
mask[np.triu_indices_from(mask, k = 1)] = True
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap = True)
#
fig = plt.figure(figsize = (10, 8))
_ = plt.title('Correlation of numerical variables', size = 25)
_ = sns.heatmap(corr_mat, mask = mask, cmap = cmap, annot = True)
_ = plt.ylim((len(corr_mat), 0)) # See bug on bottom cutoff: https://github.com/mwaskom/seaborn/issues/1773
plt.show()

We will examine plots, which are more suited for categorical/indicator values in chapter 5.

Considerations

We will quickly check if the regional indicator variables cover all of the regions in the dataset. Assume that we are not sure whether metro belongs to the same regional categorical variable as south, west and midwest:

print(head(dt4_train[, c("south", "west", "midwest", "metro")]))
##   south west midwest metro
## 1     0    1       0     1
## 2     0    0       1     1
## 3     0    0       0     1
## 4     0    0       1     1
## 5     0    1       0     1
## 6     0    0       0     1
print(dt4_train[["south", "west", "midwest", "metro"]].head())
##    south  west  midwest  metro
## 0      0     1        0      1
## 1      0     0        1      1
## 2      0     0        0      1
## 3      0     0        1      1
## 4      0     1        0      1
print(min(dt4_train[, "south"] + dt4_train[, "west"] + dt4_train[, "midwest"]))
## [1] 0
print(np.min(dt4_train["south"] + dt4_train["west"] + dt4_train["midwest"]))
## 0

Since they do not sum to one - we can include all of these variables in our model without falling into a dummy variable trap (otherwise we would need to exclude one regional indicator variable from the model and treat it as a base region).

We can also look at their frequency table:

print(table(dt4_train[, "south"] + dt4_train[, "west"] + dt4_train[, "midwest"]))
##
##   0   1
## 168 792
print(pd.crosstab(index = dt4_train["south"] + dt4_train["west"] + dt4_train["midwest"], columns="count"))
## col_0  count
## row_0
## 0        168
## 1        792

Note that the maximum value is 1. If the maximum was 2 - this would show that some of the variables indicate something else, than the rest.

For example, if we include the metro indicator variable:

print(table(dt4_train[, "south"] + dt4_train[, "west"] + dt4_train[, "midwest"] + dt4_train[, "metro"]))
##
##   0   1   2
##  32 279 649
print(pd.crosstab(index = dt4_train["south"] + dt4_train["west"] + dt4_train["midwest"] + dt4_train["metro"], columns="count"))
## col_0  count
## row_0
## 0         32
## 1        279
## 2        649

We see that there are a number of rows that have a sum of 2, which means that metro indicates something else, rather than region.

In other words, south, west and midwest regions will be compared to a base OTHER region.

#### 4.11.1.2 Task 2

We will begin by specifying the following model: \begin{aligned} \log(wage_i) &= \beta_0 + \beta_1 educ_i + \beta_2 educ_i^2 + \beta_3 exper_i + \beta_4 exper_i^2 \\ &+ \beta_5 metro_i + \beta_6 south_i + \beta_7 west_i + \beta_8 midwest_i + \beta_9 female_i + \beta_{10} black_i + \epsilon_i,\quad i = 1,...,N \end{aligned}

We expect the following signs for the non-intercept coefficients:

• $$\beta_1 > 0$$ - additional years of education should increase the hourly wage;
• $$\beta_2 > 0$$ - generally it may be hard to say - with each additional year of education, the effect on wage may be lessened, however, if the additional year is for PhD-level education, then the additional year of education may have an increased effect on wage. For now, we will assume that is the case, i.e. $$\beta_2 > 0$$.
• $$\beta_3 > 0$$ - additional years of experience should increase the hourly wage;
• $$\beta_4 < 0$$ - generally, experience is correlated with age. Furthermore, the level of knowledge tends to even-out for each additional year of experience the more years of experience you have. In other words, if you already have a lot of experience, then an additional year of experience would have a lessened (but not necessarily negative) effect on wage. Note: exper^2 is alongside exper, so we do not know (as of right now), if there is a number of years of experience that results in a decrease in wage. We are assuming that it would result in an increase, but at a lower rate ($$\beta_4 < 0$$), compared to someone with less initial years of experience;
• $$\beta_5 > 0$$ - a metropolitan is a highly dense population area. Generally, businesses are located in such high-density areas, and competition is higher. Consequently, we would expect someone from a metropolitan area to earn more than someone else, from the less-populated areas.
• $$\beta_6, \beta_7, \beta_8$$ - we are not sure of the sign as of yet. The region may also be correlated with the metropolitan indicator variable, as some regions could be less populated.
• $$\beta_{9}, \beta_{10}$$ - we are interested in evaluating if there is discrimination in the workforce (in which case these variables would be negative). Though it is important to note - as we will later see - the model may exhibit all sorts of problems - multicollinearity, autocorrelation, heteroskedasticity - which would result in biased and/or insignificant coefficients. If our goal is to evaluate the effect - we should be careful that our model is correctly specified!

We assume that other family income, faminc, should not affect the wage of a person, i.e. we treat it as an insignificant variable, whose correlation may be spurious and as such, we do not include it in the model.

Considerations

Note: it is possible to also include interaction variables in the model. As we already have 10 variables - we will skip this for now, but later on, we will examine some of these additional terms.

#### 4.11.1.3 Task 3

We will now estimate the previously specified model via OLS:

#
mdl_0_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + west + midwest + female + black", data = dt4_train)
print(summary(mdl_0_fit))
##
## Call:
## lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + west + midwest + female + black",
##     data = dt4_train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.57059 -0.31234 -0.03434  0.30483  1.37067
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.621e+00  1.868e-01   8.679  < 2e-16 ***
## educ         3.606e-02  2.523e-02   1.429  0.15326
## I(educ^2)    2.484e-03  9.100e-04   2.729  0.00646 **
## exper        2.799e-02  4.000e-03   6.998 4.91e-12 ***
## I(exper^2)  -4.106e-04  8.043e-05  -5.105 4.01e-07 ***
## metro        1.384e-01  3.864e-02   3.581  0.00036 ***
## south       -1.089e-01  4.368e-02  -2.494  0.01281 *
## west        -4.537e-02  4.578e-02  -0.991  0.32193
## midwest     -4.802e-02  4.561e-02  -1.053  0.29267
## female      -1.548e-01  2.996e-02  -5.168 2.88e-07 ***
## black       -4.372e-02  5.574e-02  -0.784  0.43305
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4519 on 949 degrees of freedom
## Multiple R-squared:  0.348,  Adjusted R-squared:  0.3412
## F-statistic: 50.66 on 10 and 949 DF,  p-value: < 2.2e-16
mdl_0 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + west + midwest + female + black", data = dt4_train)
mdl_0_fit = mdl_0.fit()
print(mdl_0_fit.summary())
##                             OLS Regression Results
## ==============================================================================
## Dep. Variable:           np.log(wage)   R-squared:                       0.348
## Model:                            OLS   Adj. R-squared:                  0.341
## Method:                 Least Squares   F-statistic:                     50.66
## Date:                Tue, 13 Oct 2020   Prob (F-statistic):           2.29e-81
## Time:                        21:44:16   Log-Likelihood:                -594.15
## No. Observations:                 960   AIC:                             1210.
## Df Residuals:                     949   BIC:                             1264.
## Df Model:                          10
## Covariance Type:            nonrobust
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6215      0.187      8.679      0.000       1.255       1.988
## educ                   0.0361      0.025      1.429      0.153      -0.013       0.086
## np.power(educ, 2)      0.0025      0.001      2.729      0.006       0.001       0.004
## exper                  0.0280      0.004      6.998      0.000       0.020       0.036
## np.power(exper, 2)    -0.0004   8.04e-05     -5.105      0.000      -0.001      -0.000
## metro                  0.1384      0.039      3.581      0.000       0.063       0.214
## south                 -0.1089      0.044     -2.494      0.013      -0.195      -0.023
## west                  -0.0454      0.046     -0.991      0.322      -0.135       0.044
## midwest               -0.0480      0.046     -1.053      0.293      -0.138       0.041
## female                -0.1548      0.030     -5.168      0.000      -0.214      -0.096
## black                 -0.0437      0.056     -0.784      0.433      -0.153       0.066
## ==============================================================================
## Omnibus:                        1.027   Durbin-Watson:                   2.030
## Prob(Omnibus):                  0.598   Jarque-Bera (JB):                1.008
## Skew:                           0.079   Prob(JB):                        0.604
## Kurtosis:                       2.995   Cond. No.                     1.28e+04
## ==============================================================================
##
## Warnings:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 1.28e+04. This might indicate that there are
## strong multicollinearity or other numerical problems.

From the estimated model we see that:

• the coefficients are the same as we would expect for educ, exper and their squared values;
• the coefficient of metro is also as we would expect;
• the regional coefficients are negative and insignificant for west and midwest, indicating that the regions themselves (since only one of them is significant) may not be indicative of different wages. On the other hand, it may be possible that people working in the south region earn significantly less than people in all of the remaining regions (other + west + midwest), however, we can only check this if we remove the remaining regional variables and only leave the south variable.
• coefficient sign of female is negative and significant, indicating possible discrimination in the work force (note that this is only the initial model, so we cannot be sure yet);
• coefficient sign of black is negative but it is insignificant, indicating that there is no racial discrimination, based on the data sample.

Considerations

If we specify our own power function, we can estimate the same model:

poly_var <- function(x, p){
  # Raise each element of x to the power p (vectorised elementwise power).
  x^p
}
print(summary(lm(formula = "log(wage) ~ educ + poly_var(educ, 2) + exper + poly_var(exper, 2) + metro + south + west + midwest + female + black", data = dt4_train)))
##
## Call:
## lm(formula = "log(wage) ~ educ + poly_var(educ, 2) + exper + poly_var(exper, 2) + metro + south + west + midwest + female + black",
##     data = dt4_train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.57059 -0.31234 -0.03434  0.30483  1.37067
##
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.621e+00  1.868e-01   8.679  < 2e-16 ***
## educ                3.606e-02  2.523e-02   1.429  0.15326
## poly_var(educ, 2)   2.484e-03  9.100e-04   2.729  0.00646 **
## exper               2.799e-02  4.000e-03   6.998 4.91e-12 ***
## poly_var(exper, 2) -4.106e-04  8.043e-05  -5.105 4.01e-07 ***
## metro               1.384e-01  3.864e-02   3.581  0.00036 ***
## south              -1.089e-01  4.368e-02  -2.494  0.01281 *
## west               -4.537e-02  4.578e-02  -0.991  0.32193
## midwest            -4.802e-02  4.561e-02  -1.053  0.29267
## female             -1.548e-01  2.996e-02  -5.168 2.88e-07 ***
## black              -4.372e-02  5.574e-02  -0.784  0.43305
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4519 on 949 degrees of freedom
## Multiple R-squared:  0.348,  Adjusted R-squared:  0.3412
## F-statistic: 50.66 on 10 and 949 DF,  p-value: < 2.2e-16
def poly_var(x, p):
    """Raise ``x`` to the power ``p``, element-wise.

    A user-defined power function, used inside a patsy/statsmodels formula
    as an alternative to ``np.power(x, p)``.

    Parameters
    ----------
    x : array_like
        Values to be raised to a power.
    p : int or float
        The exponent applied to every element of ``x``.

    Returns
    -------
    numpy.ndarray
        A new array equal to ``x ** p``.
    """
    # np.array (not asarray) so a fresh array is always returned,
    # matching the original implementation's copy semantics.
    return np.power(np.array(x), p)
#
print(smf.ols(formula = "np.log(wage) ~ educ + poly_var(educ, 2) + exper + poly_var(exper, 2) + metro + south + west + midwest + female + black", data = dt4_train).fit().summary())
##                             OLS Regression Results
## ==============================================================================
## Dep. Variable:           np.log(wage)   R-squared:                       0.348
## Model:                            OLS   Adj. R-squared:                  0.341
## Method:                 Least Squares   F-statistic:                     50.66
## Date:                Tue, 13 Oct 2020   Prob (F-statistic):           2.29e-81
## Time:                        21:44:16   Log-Likelihood:                -594.15
## No. Observations:                 960   AIC:                             1210.
## Df Residuals:                     949   BIC:                             1264.
## Df Model:                          10
## Covariance Type:            nonrobust
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6215      0.187      8.679      0.000       1.255       1.988
## educ                   0.0361      0.025      1.429      0.153      -0.013       0.086
## poly_var(educ, 2)      0.0025      0.001      2.729      0.006       0.001       0.004
## exper                  0.0280      0.004      6.998      0.000       0.020       0.036
## poly_var(exper, 2)    -0.0004   8.04e-05     -5.105      0.000      -0.001      -0.000
## metro                  0.1384      0.039      3.581      0.000       0.063       0.214
## south                 -0.1089      0.044     -2.494      0.013      -0.195      -0.023
## west                  -0.0454      0.046     -0.991      0.322      -0.135       0.044
## midwest               -0.0480      0.046     -1.053      0.293      -0.138       0.041
## female                -0.1548      0.030     -5.168      0.000      -0.214      -0.096
## black                 -0.0437      0.056     -0.784      0.433      -0.153       0.066
## ==============================================================================
## Omnibus:                        1.027   Durbin-Watson:                   2.030
## Prob(Omnibus):                  0.598   Jarque-Bera (JB):                1.008
## Skew:                           0.079   Prob(JB):                        0.604
## Kurtosis:                       2.995   Cond. No.                     1.28e+04
## ==============================================================================
##
## Warnings:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 1.28e+04. This might indicate that there are
## strong multicollinearity or other numerical problems.

#### 4.11.1.4 Task 4

We want to separately test the hypothesis that a coefficient is significant: $H_0: \beta_j = 0\\ H_1: \beta_j \neq 0$ The test is automatically carried out and the $$t$$-statistic and $$p$$-values are presented in the model summary output.

Insignificant variables are those, whose $$p$$-value is greater than the 0.05 significance level:

print(round(coef(summary(mdl_0_fit)), 4))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   1.6215     0.1868  8.6791   0.0000
## educ          0.0361     0.0252  1.4292   0.1533
## I(educ^2)     0.0025     0.0009  2.7293   0.0065
## exper         0.0280     0.0040  6.9978   0.0000
## I(exper^2)   -0.0004     0.0001 -5.1045   0.0000
## metro         0.1384     0.0386  3.5810   0.0004
## south        -0.1089     0.0437 -2.4936   0.0128
## west         -0.0454     0.0458 -0.9910   0.3219
## midwest      -0.0480     0.0456 -1.0529   0.2927
## female       -0.1548     0.0300 -5.1681   0.0000
## black        -0.0437     0.0557 -0.7843   0.4331
print(mdl_0_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6215      0.187      8.679      0.000       1.255       1.988
## educ                   0.0361      0.025      1.429      0.153      -0.013       0.086
## np.power(educ, 2)      0.0025      0.001      2.729      0.006       0.001       0.004
## exper                  0.0280      0.004      6.998      0.000       0.020       0.036
## np.power(exper, 2)    -0.0004   8.04e-05     -5.105      0.000      -0.001      -0.000
## metro                  0.1384      0.039      3.581      0.000       0.063       0.214
## south                 -0.1089      0.044     -2.494      0.013      -0.195      -0.023
## west                  -0.0454      0.046     -0.991      0.322      -0.135       0.044
## midwest               -0.0480      0.046     -1.053      0.293      -0.138       0.041
## female                -0.1548      0.030     -5.168      0.000      -0.214      -0.096
## black                 -0.0437      0.056     -0.784      0.433      -0.153       0.066
## ======================================================================================

We will begin by removing the insignificant variable (where the $$p$$-value is greater than 0.05) with the largest $$p$$-values - in this case it is the indicator variable black.

#
# Model 1: re-estimate the log-wage equation after dropping `black`,
# the insignificant variable with the largest p-value in the initial model.
mdl_1_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + west + midwest + female", data = dt4_train)
# Coefficient table (estimate, std. error, t value, p-value), rounded to 4 d.p.
print(round(coef(summary(mdl_1_fit)), 4))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   1.6194     0.1868  8.6707   0.0000
## educ          0.0359     0.0252  1.4227   0.1552
## I(educ^2)     0.0025     0.0009  2.7500   0.0061
## exper         0.0280     0.0040  7.0060   0.0000
## I(exper^2)   -0.0004     0.0001 -5.1066   0.0000
## metro         0.1373     0.0386  3.5553   0.0004
## south        -0.1132     0.0433 -2.6121   0.0091
## west         -0.0450     0.0458 -0.9832   0.3258
## midwest      -0.0477     0.0456 -1.0451   0.2962
## female       -0.1571     0.0298 -5.2699   0.0000
# Model 1 (Python): same specification with `black` removed.
mdl_1 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + west + midwest + female", data = dt4_train)
mdl_1_fit = mdl_1.fit()
# tables[1] of the statsmodels summary is the coefficient table.
print(mdl_1_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6194      0.187      8.671      0.000       1.253       1.986
## educ                   0.0359      0.025      1.423      0.155      -0.014       0.085
## np.power(educ, 2)      0.0025      0.001      2.750      0.006       0.001       0.004
## exper                  0.0280      0.004      7.006      0.000       0.020       0.036
## np.power(exper, 2)    -0.0004   8.04e-05     -5.107      0.000      -0.001      -0.000
## metro                  0.1373      0.039      3.555      0.000       0.061       0.213
## south                 -0.1132      0.043     -2.612      0.009      -0.198      -0.028
## west                  -0.0450      0.046     -0.983      0.326      -0.135       0.045
## midwest               -0.0477      0.046     -1.045      0.296      -0.137       0.042
## female                -0.1571      0.030     -5.270      0.000      -0.216      -0.099
## ======================================================================================

Next up, we will remove the indicator variable west. Then we will remove the indicator variable midwest. We note that after doing so, the base group would be other + west + midwest.

#
# Model 2: additionally drop the insignificant `west` indicator;
# the base region now implicitly includes `west` as well.
mdl_2_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + midwest + female", data = dt4_train)
# Coefficient table, rounded to 4 d.p.
print(round(coef(summary(mdl_2_fit)), 4))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   1.5891     0.1842  8.6269   0.0000
## educ          0.0364     0.0252  1.4421   0.1496
## I(educ^2)     0.0025     0.0009  2.7386   0.0063
## exper         0.0281     0.0040  7.0282   0.0000
## I(exper^2)   -0.0004     0.0001 -5.1163   0.0000
## metro         0.1348     0.0385  3.4994   0.0005
## south        -0.0869     0.0341 -2.5484   0.0110
## midwest      -0.0217     0.0372 -0.5837   0.5596
## female       -0.1568     0.0298 -5.2609   0.0000
# Model 2 (Python): same specification with `west` removed as well.
mdl_2 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + midwest + female", data = dt4_train)
mdl_2_fit = mdl_2.fit()
# Coefficient table from the statsmodels summary.
print(mdl_2_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.5891      0.184      8.627      0.000       1.228       1.951
## educ                   0.0364      0.025      1.442      0.150      -0.013       0.086
## np.power(educ, 2)      0.0025      0.001      2.739      0.006       0.001       0.004
## exper                  0.0281      0.004      7.028      0.000       0.020       0.036
## np.power(exper, 2)    -0.0004   8.04e-05     -5.116      0.000      -0.001      -0.000
## metro                  0.1348      0.039      3.499      0.000       0.059       0.210
## south                 -0.0869      0.034     -2.548      0.011      -0.154      -0.020
## midwest               -0.0217      0.037     -0.584      0.560      -0.095       0.051
## female                -0.1568      0.030     -5.261      0.000      -0.215      -0.098
## ======================================================================================
#
# Model 3 (final): drop the insignificant `midwest` indicator too;
# only `south` remains, so the base group is "not south".
mdl_3_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + female", data = dt4_train)
# Coefficient table, rounded to 4 d.p.
print(round(coef(summary(mdl_3_fit)), 4))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   1.5827     0.1838  8.6104   0.0000
## educ          0.0357     0.0252  1.4174   0.1567
## I(educ^2)     0.0025     0.0009  2.7729   0.0057
## exper         0.0281     0.0040  7.0214   0.0000
## I(exper^2)   -0.0004     0.0001 -5.1059   0.0000
## metro         0.1379     0.0381  3.6157   0.0003
## south        -0.0789     0.0312 -2.5287   0.0116
## female       -0.1574     0.0298 -5.2834   0.0000
# Model 3 (Python, final): only `south` remains of the regional indicators.
mdl_3 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + female", data = dt4_train)
mdl_3_fit = mdl_3.fit()
# Coefficient table from the statsmodels summary.
print(mdl_3_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.5827      0.184      8.610      0.000       1.222       1.943
## educ                   0.0357      0.025      1.417      0.157      -0.014       0.085
## np.power(educ, 2)      0.0025      0.001      2.773      0.006       0.001       0.004
## exper                  0.0281      0.004      7.021      0.000       0.020       0.036
## np.power(exper, 2)    -0.0004   8.04e-05     -5.106      0.000      -0.001      -0.000
## metro                  0.1379      0.038      3.616      0.000       0.063       0.213
## south                 -0.0789      0.031     -2.529      0.012      -0.140      -0.018
## female                -0.1574      0.030     -5.283      0.000      -0.216      -0.099
## ======================================================================================

As long as we do not plan to include any interaction terms with the now-removed regional indicator variables - we make a mental note of the new interpretation of the base region (which we can treat as a not-south region).

Note that while the $$p$$-value of educ is also greater than 0.05, we have also included its squared value educ^2. Following the recommendations outlined in these notes (and in most econometric modelling literature), we should keep the lower order polynomial educ, since educ^2 is significant.

We have removed the insignificant variables from our model, so we will now write down the estimated model.

#### 4.11.1.5 Task 5

From the model output we can write down the estimated model as:

\begin{aligned} \underset{(se)}{\widehat{\log(wage)}} &= \underset{(0.1838)}{1.5827} + \underset{(0.0252)}{0.0357} \cdot educ + \underset{(0.0009)}{0.0025} \cdot educ^2 \\ &+ \underset{(0.0040)}{0.0281} \cdot exper - \underset{(0.0001)}{0.0004} \cdot exper^2 \\ &+ \underset{(0.0381)}{0.1379} \cdot metro - \underset{(0.0312)}{0.0789} \cdot south - \underset{(0.0298)}{0.1574} \cdot female \end{aligned}

Considerations

Furthermore, we can interpret the variables in the following way:

• An increase in educ (education) by 1 year results in an increasing effect on wage (i.e. the positive effect on wage is further increased depending on the initial level of educ), ceteris paribus:
• If educ = 0 and then increases by $$1$$ year, then wage increases by $$100 \cdot (0.0357 + 0.0025)= 3.82\%$$;
• If educ > 0 and then increases by $$1$$ year, then the difference in the change in educ results in: \begin{aligned} \underset{(se)}{\widehat{\log(wage)}} \bigg|_{educ + 1} - \underset{(se)}{\widehat{\log(wage)}} \bigg|_{educ} &= 0.0357 + 0.0025 \cdot (educ + 1)^2 - 0.0025 \cdot educ^2\\ &= 0.0357 +0.0025 \cdot (2 \cdot educ + 1) \end{aligned} then wage increases by $$100 \cdot \left[ 0.0357 + 0.0025 \cdot (2 \cdot educ + 1) \right] \%$$
• An increase in exper (years of experience) by 1 year results in a decreasing effect on wage (i.e. the positive effect on wage is decreased depending on the initial level of exper, ceteris paribus:
• If exper = 0 and then increases by $$1$$ year, then wage increases by $$100 \cdot (0.0281 - 0.0004) = 2.77\%$$;
• If exper > 0 increases by $$1$$ year, then the difference in the change in exper results in: \begin{aligned} \underset{(se)}{\widehat{\log(wage)}} \bigg|_{exper + 1} - \underset{(se)}{\widehat{\log(wage)}} \bigg|_{exper} &= 0.0281 - 0.0004 \cdot (exper + 1)^2 + 0.0004 \cdot exper^2\\ &= 0.0281 - 0.0004 \cdot (2 \cdot exper + 1) \end{aligned} then wage changes by $$100 \cdot \left[ 0.0281 - 0.0004 \cdot (2 \cdot exper + 1) \right] \%$$.
• We may be interested in finding whether there are values of exper that result in a maximum (or minimum) value of wage, ceteris paribus. Taking the partial derivative (calculating the marginal effect) and equating it to zero yields: \begin{aligned} \dfrac{\partial {\widehat{\log(wage)}}}{\partial exper} &= 0.0281 - 0.0004 \cdot 2 \cdot exper = 0.0281 - 0.0008 \cdot exper = 0 \end{aligned}
print(cbind(coef(mdl_3_fit)))
##                      [,1]
## (Intercept)  1.5827424723
## educ         0.0356981405
## I(educ^2)    0.0025178325
## exper        0.0280583007
## I(exper^2)  -0.0004103015
## metro        0.1379159163
## south       -0.0788749581
## female      -0.1573849396
print(paste0("Maximum Wage when exper = ", coef(mdl_3_fit)[4] / (-coef(mdl_3_fit)[5] * 2)))
## [1] "Maximum Wage when exper = 34.1922988677104"
print(mdl_3_fit.params)
## Intercept             1.582742
## educ                  0.035698
## np.power(educ, 2)     0.002518
## exper                 0.028058
## np.power(exper, 2)   -0.000410
## metro                 0.137916
## south                -0.078875
## female               -0.157385
## dtype: float64
print("Maximum Wage when exper = " + str(mdl_3_fit.params[3] / (-mdl_3_fit.params[4] * 2)))
## Maximum Wage when exper = 34.19229886771548

So, when $$exper = 34.19$$, wage will be at its maximum since $$\dfrac{\partial^2 {\widehat{\log(wage)}}}{\partial^2 exper} < 0$$.

Note that we used the exact values instead of the rounded values from the formulas. Reason being that the rounding error would give us a different value:

print(0.0281 / 0.0008)
## [1] 35.125
print(0.0281 / 0.0008)
## 35.125

This would not be the same value that we get once we verify the results with the actual data.

• We can further examine when a unit increase in exper results in no change in wage: \begin{aligned} 0.0281 - 0.0004 \cdot (2 \cdot exper + 1) &= 0 \end{aligned}
zero_inc <- (coef(mdl_3_fit)[4] / (- coef(mdl_3_fit)[5]) - 1) / 2
print(zero_inc)
##   exper
## 33.6923
zero_inc = (mdl_3_fit.params[3] / (-mdl_3_fit.params[4]) - 1) / 2
print(zero_inc)
## 33.69229886771548

So, if the initial value of $$exper = 33.69$$, wage would not change with an additional unit increase in exper. Remember that exper can only be integer-valued.

Furthermore, for $$exper > 33.69$$, wage would decrease from a unit increase in exper. We can verify this by taking the initial value exper = 36:

# Build a two-row test dataset that differs only in `exper` (36 vs 37),
# to illustrate the ceteris-paribus effect of one extra year of experience.
# Repeat the first row twice:
tst_dt <- dt4[c(1, 1), ]
# Reset row index numbering to avoid duplicate row index numbers
rownames(tst_dt) <- NULL
# Set exper column values:
tst_dt[, "exper"] = c(36, 37)
# Print the sample data:
print(tst_dt)
##   black educ exper faminc female metro midwest south  wage west
## 1     0   13    36      0      1     1       0     0 44.44    1
## 2     0   13    37      0      1     1       0     0 44.44    1
# Python equivalent: two identical rows, differing only in `exper` (36 vs 37).
# Repeat the first row twice:
tst_dt = dt4.iloc[[0,0], :]
# Reset row index numbering to avoid duplicate row index numbers
tst_dt = tst_dt.reset_index(drop = True)
# Set exper column values:
tst_dt.loc[:, "exper"] = [36, 37]
# Print the sample data:
print(tst_dt)
##    black  educ  exper  faminc  female  metro  midwest  south   wage  west
## 0      0    13     36       0       1      1        0      0  44.44     1
## 1      0    13     37       0       1      1        0      0  44.44     1

Note the ceteris paribus condition - there is only a unit change in exper, while the remaining values do not change.

Now, we can calculate and compare the predicted values:

tst_pred <- predict(mdl_3_fit, newdata = tst_dt)
print(tst_pred)
##        1        2
## 2.931211 2.929317
tst_pred = mdl_3_fit.predict(tst_dt)
print(tst_pred)
## 0    2.931211
## 1    2.929317
## dtype: float64
print(diff(tst_pred))
##            2
## -0.001893706
print(np.diff(tst_pred))
## [-0.00189371]

which would be a $$0.189\%$$ decrease in wage for someone with 36 years of experience earning an additional year of experience.

This can be verified manually, by taking the exponents of the predicted values (since these predicted values are from the log-linear model) and calculating their percentage change: $\dfrac{WAGE_{NEW} - WAGE_{OLD}}{WAGE_{OLD}} \cdot 100$

print(diff(exp(tst_pred)) / exp(tst_pred[1]) * 100)
##          2
## -0.1891914
print(np.diff(np.exp(tst_pred)) / np.exp(tst_pred[0]) * 100 )
## [-0.18919144]

We note that this is an approximation only, and not a true equality between logarithm differences and percentage change.

It is very close when the percentage change is small, but for larger percentage changes, it may differ greatly.

• Another example when the wage would still increase is when someone has 33 years of experience and it increases to 34 (since the maximum is attained when exper = 34.19):
tst_dt[, "exper"] <- c(33, 34)
print(tst_dt)
##   black educ exper faminc female metro midwest south  wage west
## 1     0   13    33      0      1     1       0     0 44.44    1
## 2     0   13    34      0      1     1       0     0 44.44    1
tst_dt.loc[:, "exper"] = [33, 34]
print(tst_dt)
##    black  educ  exper  faminc  female  metro  midwest  south   wage  west
## 0      0    13     33       0       1      1        0      0  44.44     1
## 1      0    13     34       0       1      1        0      0  44.44     1
tst_pred <- predict(mdl_3_fit, newdata = tst_dt)
print(tst_pred)
##        1        2
## 2.931969 2.932537
tst_pred = mdl_3_fit.predict(tst_dt)
print(tst_pred)
## 0    2.931969
## 1    2.932537
## dtype: float64
print(diff(tst_pred))
##            2
## 0.0005681025
np.diff(tst_pred)
## array([0.0005681])
• On the other hand, if we were to ignore that exper is only integer valued, then we can verify the point, where wage does not change from a unit increase in exper:
tst_dt[, "exper"] <- c(zero_inc, zero_inc + 1)
print(tst_dt)
##   black educ   exper faminc female metro midwest south  wage west
## 1     0   13 33.6923      0      1     1       0     0 44.44    1
## 2     0   13 34.6923      0      1     1       0     0 44.44    1
tst_dt.loc[:, "exper"] = [zero_inc, zero_inc + 1]
print(tst_dt)
##    black  educ      exper  faminc  female  metro  midwest  south   wage  west
## 0      0    13  33.692299       0       1      1        0      0  44.44     1
## 1      0    13  34.692299       0       1      1        0      0  44.44     1
tst_pred <- predict(mdl_3_fit, newdata = tst_dt)
print(tst_pred)
##        1        2
## 2.932449 2.932449
tst_pred = mdl_3_fit.predict(tst_dt)
print(tst_pred)
## 0    2.932449
## 1    2.932449
## dtype: float64
print(diff(tst_pred))
## 2
## 0
print(np.diff(tst_pred))
## [0.]
• Continuing our coefficient interpretation:
• A person from a metropolitan area (metro = 1) earns around $$100 \cdot 0.1379 = 13.79\%$$ more than someone not from a metro area, ceteris paribus;
• A person from the south region (south = 1) earns around $$100 \cdot 0.0789 = 7.89\%$$ less (since the coefficient is $$-0.0789 < 0$$) than someone not from the south.
• Take note that we have removed other regional coefficients, and as a result, we are left with only one regional indicator. Consequently, we only compare someone from the south, versus someone not from the south (they could be from either west, midwest, or some other region).
• If we were to leave other regional indicator variables, then the south indicator variable would indicate how much more/less someone from the south earns compared to some other base region category, which does NOT include west or midwest.
• As a result, this coefficient also indicates that the wages in the south are lower than wages in the rest of the country.
• If the person is female (female = 1), then they earn around $$100 \cdot 0.1574 = 15.74\%$$ less than someone who is not female.

### 4.11.2 Exercise Set 2

#### 4.11.2.1 Task 6

# Define the plot layout matrix:
layout_mat <- matrix(c(1, 1, 2, 2, 3, 3, 3, 3),
nrow = 2, byrow = TRUE)
# print(layout_mat)
layout(layout_mat)
plot(mdl_3_fit$fitted.values, mdl_3_fit$residuals, type = "p",
pch = 21, bg = "cornflowerblue", main = "Residuals vs Fitted",
ylab = "residuals", xlab = "fitted values", cex = 1.5)
hist(mdl_3_fit$residuals, col = "cornflowerblue", breaks = 30, main = "Residual Histogram") qqnorm(mdl_3_fit$residuals, main = "Q-Q plot of residuals",
pch = 21, bg = "cornflowerblue", cex = 1.5)
qqline(mdl_3_fit$residuals, col = "red", lwd = 2) fig = plt.figure(num = 2, figsize = (10, 8)) # Plot fitted vs residual plots: ax = fig.add_subplot(2, 2, 1) _ = ax.plot(mdl_3_fit.fittedvalues, mdl_3_fit.resid, linestyle = "None", marker = "o", markeredgecolor = "black") # Plot the residual histogram ax = fig.add_subplot(2, 2, 2) _ = ax.hist(mdl_3_fit.resid, bins = 30, edgecolor = "black") # Plot the residual Q-Q plot: ax = fig.add_subplot(2, 1, 2) _ = stats.probplot(mdl_3_fit.resid, dist = "norm", plot = ax) # Fix layout in case the labels do overlap: _ = plt.tight_layout() plt.show() We can see that: • From the residual vs. fitted plot - the residuals appear to have a constant variance, though there are some points that have large residual values for low fitted values (fitted values are on the horizontal axis), and low residual values for very large fitted values, but those points do not make up a majority (the heteroskedasticity test will help us answer whether their variance is the same across observations). • From the residual histogram the residuals appear to be normal, though the histogram does appear to have a longer right tail; • The residuals in the Q-Q plot appear to fall along a straight line with the theoretical quantiles of a normal distribution, except for one point (which is most likely an outlier). Next, we move on to testing a few hypotheses. 
##### 4.11.2.1.1 Homoskedasticity tests The hypothesis that we want to test is: $\begin{cases} H_0&: \text{residuals are homoskedastic}\\ H_1&: \text{residuals are heteroskedastic} \end{cases}$ We will begin with the Breusch-Pagan Test: # # Breusch–Pagan Test # print(lmtest::bptest(mdl_3_fit)) ## ## studentized Breusch-Pagan test ## ## data: mdl_3_fit ## BP = 29.724, df = 7, p-value = 0.0001067 name = ['LM statistic', 'LM p-value', 'F-value', 'F p-value'] bp_test = sm_diagnostic.het_breuschpagan(resid = mdl_3_fit.resid, exog_het = pd.DataFrame(mdl_3.exog, columns = mdl_3.exog_names)) print(pd.DataFrame([np.round(bp_test, 8)], columns=name)) ## LM statistic LM p-value F-value F p-value ## 0 29.723534 0.000107 4.345376 0.000094 We are interested in the $$LM$$ (BP) statistic and its associated $$p$$-value. We have that the $$p$$-value < 0.05, so we reject the null hypothesis that the residuasl are homoskedastic. Which means that the residuals are heteroskedastic. Next, we look at the Goldfeld-Quandt Test results: # # Goldfeld–Quandt Test # gq_test <- lmtest::gqtest(mdl_3_fit, alternative = "two.sided") print(gq_test) ## ## Goldfeld-Quandt test ## ## data: mdl_3_fit ## GQ = 0.85481, df1 = 472, df2 = 472, p-value = 0.0887 ## alternative hypothesis: variance changes from segment 1 to 2 name = ['F statistic', 'p-value', 'type'] # Goldfeld-Quandt Test gq_test = sm_diagnostic.het_goldfeldquandt(y = mdl_3_fit.model.endog, x = mdl_3_fit.model.exog, alternative = "two-sided") print(pd.DataFrame([gq_test], columns = name)) ## F statistic p-value type ## 0 0.854814 0.088702 two-sided The $$p$$-value > 0.05, so we have no grounds to reject the null hypothesis and conclude that the residuals are homoskedastic. 
Finally, we look at the White Test results: # # White Test w_test <- lmtest::bptest(mdl_3_fit, ~ metro*south*midwest*female*educ*exper + I(educ^2) + I(exper^2), data = dt4_train) print(w_test) ## ## studentized Breusch-Pagan test ## ## data: mdl_3_fit ## BP = 72.506, df = 49, p-value = 0.01618 name = ['LM statistic', 'LM p-value', 'F-value', 'F p-value'] # White Test w_test = sm_diagnostic.het_white(resid = mdl_3_fit.resid, exog = mdl_3_fit.model.exog) print(pd.DataFrame([np.round(w_test, 8)], columns = name)) ## LM statistic LM p-value F-value F p-value ## 0 72.415095 0.000023 2.526467 0.000014 The $$p$$-value < 0.05, so we reject the null hypothesis and conclude that the residuals are heteroskedastic. From these test results we can say that the residuals are heteroskedastic. ##### 4.11.2.1.2 Autocorrelation Tests The hypothesis that we want to test is: $\begin{cases} H_0&:\text{the errors are serially uncorrelated}\\ H_1&:\text{the errors are autocorrelated (the exact order of the autocorrelation depends on the test carried out)} \end{cases}$ We will begin with the Durbin-Watson Test, where the alternative hypothesis is that the autocorrelation is of order 1: # Durbin–Watson Test print(lmtest::dwtest(mdl_3_fit, alternative = "two.sided")) ## ## Durbin-Watson test ## ## data: mdl_3_fit ## DW = 2.0304, p-value = 0.6409 ## alternative hypothesis: true autocorrelation is not 0 # Durbin-Watson Test print(sm_tools.durbin_watson(mdl_3_fit.resid)) ## 2.030433345345015 The DW statistic is close to 2 (this can be verified in R - since the associated $$p$$-value > 0.05), so we do not reject the null hypothesis that there is no serial correlation. Next up is the Breusch-Godfrey Test, where we can select the autocorrelation order ourselves. 
We have selected a 2nd order autocorrelation: # Breusch-Godfrey Test bg_test <- lmtest::bgtest(mdl_3_fit, order = 2) print(bg_test) ## ## Breusch-Godfrey test for serial correlation of order up to 2 ## ## data: mdl_3_fit ## LM test = 0.97837, df = 2, p-value = 0.6131 name = ['LM statistic', 'LM p-value', 'F-value', 'F p-value'] # Breusch-Godfrey Test bg_test = sm_diagnostic.acorr_breusch_godfrey(mdl_3_fit, nlags = 2) print(pd.DataFrame([np.round(bg_test, 8)], columns = name)) ## LM statistic LM p-value F-value F p-value ## 0 0.978369 0.613126 0.484582 0.616106 The $$p$$-value = 0.613126 > 0.05, so we have no grounds to reject the null hypothesis of no autocorrelation. We could test with higher order autocorrelation and examine the results, lets try with up to order 20: for(i in 2:20){ print(paste0("BG Test for autocorrelation order = ", i, "; p-value = ", round(lmtest::bgtest(mdl_3_fit, order = i)$p.value, 5)))
}
## [1] "BG Test for autocorrelation order = 2; p-value = 0.61313"
## [1] "BG Test for autocorrelation order = 3; p-value = 0.36075"
## [1] "BG Test for autocorrelation order = 4; p-value = 0.39979"
## [1] "BG Test for autocorrelation order = 5; p-value = 0.45806"
## [1] "BG Test for autocorrelation order = 6; p-value = 0.50955"
## [1] "BG Test for autocorrelation order = 7; p-value = 0.61729"
## [1] "BG Test for autocorrelation order = 8; p-value = 0.53359"
## [1] "BG Test for autocorrelation order = 9; p-value = 0.63379"
## [1] "BG Test for autocorrelation order = 10; p-value = 0.63069"
## [1] "BG Test for autocorrelation order = 11; p-value = 0.70566"
## [1] "BG Test for autocorrelation order = 12; p-value = 0.66598"
## [1] "BG Test for autocorrelation order = 13; p-value = 0.63818"
## [1] "BG Test for autocorrelation order = 14; p-value = 0.69411"
## [1] "BG Test for autocorrelation order = 15; p-value = 0.68445"
## [1] "BG Test for autocorrelation order = 16; p-value = 0.66285"
## [1] "BG Test for autocorrelation order = 17; p-value = 0.62826"
## [1] "BG Test for autocorrelation order = 18; p-value = 0.56064"
## [1] "BG Test for autocorrelation order = 19; p-value = 0.62158"
## [1] "BG Test for autocorrelation order = 20; p-value = 0.68268"
# Breusch-Godfrey autocorrelation test for every lag order from 2 to 20;
# element [1] of the returned tuple is the LM-statistic p-value.
for i in range(2, 21):
print("BG Test for autocorrelation order = " + str(i) +
"; p-value = " + str(np.round(sm_diagnostic.acorr_breusch_godfrey(mdl_3_fit, nlags = i)[1], 5)))
#    
## BG Test for autocorrelation order = 2; p-value = 0.61313
## BG Test for autocorrelation order = 3; p-value = 0.36075
## BG Test for autocorrelation order = 4; p-value = 0.39979
## BG Test for autocorrelation order = 5; p-value = 0.45806
## BG Test for autocorrelation order = 6; p-value = 0.50955
## BG Test for autocorrelation order = 7; p-value = 0.61729
## BG Test for autocorrelation order = 8; p-value = 0.53359
## BG Test for autocorrelation order = 9; p-value = 0.63379
## BG Test for autocorrelation order = 10; p-value = 0.63069
## BG Test for autocorrelation order = 11; p-value = 0.70566
## BG Test for autocorrelation order = 12; p-value = 0.66598
## BG Test for autocorrelation order = 13; p-value = 0.63818
## BG Test for autocorrelation order = 14; p-value = 0.69411
## BG Test for autocorrelation order = 15; p-value = 0.68445
## BG Test for autocorrelation order = 16; p-value = 0.66285
## BG Test for autocorrelation order = 17; p-value = 0.62826
## BG Test for autocorrelation order = 18; p-value = 0.56064
## BG Test for autocorrelation order = 19; p-value = 0.62158
## BG Test for autocorrelation order = 20; p-value = 0.68268

As we can see, we have no grounds to reject the null hypothesis of no autocorrelation in any of the cases.

From these test results we can conclude that the residuals are not serially correlated.

##### 4.11.2.1.3 Normality Tests

The hypothesis that we want to test is: $\begin{cases} H_0&:\text{residuals follow a normal distribution}\\ H_1&:\text{residuals do not follow a normal distribution} \end{cases}$

We will carry out the following tests and combine their $$p$$-values into a single output:

# Names of the normality tests, in the same order as the p-values
# collected into the data.frame below.
# (Use `<-` for assignment, consistent with the rest of the file.)
norm_tests <- c("Anderson-Darling",
                "Shapiro-Wilk",
                "Kolmogorov-Smirnov",
                "Cramer-von Mises",
                "Jarque-Bera")
# Names of the normality tests, in the same order as the p-values
# collected into the DataFrame below.
norm_tests = [
    "Anderson-Darling",
    "Shapiro-Wilk",
    "Kolmogorov-Smirnov",
    "Cramer-von Mises",
    "Jarque-Bera",
]
# Collect the p-values of the five residual-normality tests into one
# data.frame, one row per test, labelled with the names in norm_tests.
# NOTE: ks.test() compares the residuals to a standard normal ("pnorm")
# and warns when the data contain ties.
norm_test <- data.frame(
p_value = c(nortest::ad.test(mdl_3_fit$residuals)$p.value,
shapiro.test(mdl_3_fit$residuals)$p.value,
ks.test(mdl_3_fit$residuals, y = "pnorm", alternative = "two.sided")$p.value,
nortest::cvm.test(mdl_3_fit$residuals)$p.value,
tseries::jarque.bera.test(mdl_3_fit$residuals)$p.value),
Test = norm_tests)
## Warning in ks.test(mdl_3_fit$residuals, y = "pnorm", alternative = "two.sided"): ties should not be present for the Kolmogorov-Smirnov test norm_test = pd.DataFrame() norm_test["p_value"] = [ sm_diagnostic.normal_ad(x = mdl_3_fit.resid)[1], stats.shapiro(x = mdl_3_fit.resid)[1], sm_diagnostic.kstest_normal(x = mdl_3_fit.resid, dist = "norm")[1], skgof.cvm_test(data = mdl_3_fit.resid, dist = stats.norm(0, np.sqrt(np.var(mdl_3_fit.resid))))[1], sm_tools.jarque_bera(mdl_3_fit.resid)[1] ] norm_test["Test"] = norm_tests print(norm_test) ## p_value Test ## 1 0.002023493 Anderson-Darling ## 2 0.018325238 Shapiro-Wilk ## 3 0.000000000 Kolmogorov-Smirnov ## 4 0.002934992 Cramer-von Mises ## 5 0.609095835 Jarque-Bera print(norm_test) ## p_value Test ## 0 0.002023 Anderson-Darling ## 1 0.018323 Shapiro-Wilk ## 2 0.030978 Kolmogorov-Smirnov ## 3 0.229736 Cramer-von Mises ## 4 0.609096 Jarque-Bera We see that the $$p$$-value is less than the $$5\%$$ significance level for the Anderson-Darling, Shapiro-Wilk and Kolmogorov-Smirnov tests, where we would reject the null hypothesis of normality. On the other hand the $$p$$-value is greater than 0.05 for Jarque-Bera test, where we would not reject the null hypothesis of normality. For Cramer-von Mises test - we reject the null hypothesis in R ($$p$$-value < 0.05), but we do not reject it in Python ($$p$$-value > 0.05). The difference in the result for Cramer-von Mises may be due to a different implementation - it is best to not rely on this test alone in Python. As indicated in the lecture notes, the Shapiro–Wilk test has the best power for a given significance, furthermore, 3 out of 5 tests suggest non-normal residuals, so we will go with their results. From these test results we can conclude that the residuals are not normally distributed.
OVERALL CONCLUSIONS: • Residuals are not serially correlated (assumption (MR.4) is NOT violated); • Residuals are Heteroskedastic (assumption (MR.3) IS violated); • Residuals are non-normally distributed (assumption (MR.6) IS violated). Assumption (MR.5) is related to multicollinearity and will be examined in a later Task, but from what we have seen so far, almost all of the coefficients are statistically significant, with correct signs. Furthermore, since we were able to estimate the model via OLS, there is no exact collinearity (i.e. no exact linear dependence) between the regressors. So, there may be no collinear variables in our model. Considerations ##### 4.11.2.1.4 Comparison With Univariate Regression Model Results If we look back at our final univariate regression model - the log-linear model: $\log(\text{wage}_i) = \beta_0 + \beta_1 \cdot \text{educ}_i + \epsilon_i$ We can estimate it here as well, and re-examine its residuals: lm_univar_fit <- lm(formula = "log(wage) ~ educ", data = dt4_train) print(coef(summary(lm_univar_fit))) ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.62529902 0.077090004 21.08314 2.330772e-81 ## educ 0.09664525 0.005322977 18.15624 1.537423e-63 lm_univar = smf.ols(formula = "np.log(wage) ~ educ", data = dt4_train) lm_univar_fit = lm_univar.fit() print(lm_univar_fit.summary2().tables[1]) ## Coef. Std.Err. t P>|t| [0.025 0.975] ## Intercept 1.625299 0.077090 21.083136 2.330772e-81 1.474014 1.776584 ## educ 0.096645 0.005323 18.156238 1.537423e-63 0.086199 0.107091 layout(layout_mat) # Plot fitted vs residual plots: plot(lm_univar_fit$fitted.values, lm_univar_fit$residuals, type = "p", pch = 21, bg = "cornflowerblue", main = "Residuals vs Fitted", ylab = "residuals", xlab = "fitted values", cex = 1.5) # Plot the residual histogram: hist(lm_univar_fit$residuals, col = "cornflowerblue",
breaks = 30, main = "Residual Histogram")
# Plot the residual Q-Q plot:
qqnorm(lm_univar_fit$residuals, main = "Q-Q plot of residuals", pch = 21, bg = "cornflowerblue", cex = 1.5) qqline(lm_univar_fit$residuals, col = "red", lwd = 2)

# Residual diagnostics for the univariate model: fitted-vs-residual scatter
# and histogram on the top row, Q-Q plot spanning the full bottom row.
fig = plt.figure(num = 3, figsize = (10, 8))
# Plot fitted vs residual plots (top-left panel):
ax = fig.add_subplot(2, 2, 1)
_ = ax.plot(lm_univar_fit.fittedvalues, lm_univar_fit.resid,
linestyle = "None", marker = "o", markeredgecolor = "black")
# Plot the residual histogram (top-right panel):
ax = fig.add_subplot(2, 2, 2)
_ = ax.hist(lm_univar_fit.resid, bins = 30, edgecolor = "black")
# Plot the residual Q-Q plot against the normal distribution (bottom panel):
ax = fig.add_subplot(2, 1, 2)
_ = stats.probplot(lm_univar_fit.resid, dist = "norm", plot = ax)
# Fix layout in case the labels do overlap:
_ = plt.tight_layout()
plt.show()

Compared to the univariate model:

• The residual vs fitted value plot visually looks better in the multiple regression model;
• The residual histogram is similar, but visually looks a bit better in the multiple regression model;
• The residual Q-Q plots are similar in both cases - the outlier still remains in the data.

Again note that the fitted values are on the horizontal axis, which also highlights another interesting point regarding the range of attainable fitted values in these models.

Looking at the residual vs fitted value plot, the number of fitted values, greater than 3.2, but less than 3.35 is:

• For the univariate regression with one variable:
length(lm_univar_fit$fitted.values[lm_univar_fit$fitted.values > 3.2 & lm_univar_fit$fitted.values < 3.35]) ## [1] 0 len(lm_univar_fit.fittedvalues[np.logical_and(lm_univar_fit.fittedvalues > 3.2, lm_univar_fit.fittedvalues < 3.35)]) ## 0 • For the multiple regression: length(mdl_3_fit$fitted.values[mdl_3_fit$fitted.values > 3.2 & mdl_3_fit$fitted.values < 3.35])
## [1] 106
len(mdl_3_fit.fittedvalues[np.logical_and(mdl_3_fit.fittedvalues > 3.2, mdl_3_fit.fittedvalues < 3.35)])
## 106

By using the multiple regression model specification we now get fitted values, which are more evenly scattered across their interval, whereas in the univariate case, we had fitted values clustered along a limited range.

#### 4.11.2.2 Task 7

To make everything easier to follow, we will examine the interaction terms one-by-one, so as not to overwhelm with too many variables in the model.

• It may be interesting to include the interaction term: $female \times black$ to see if there is a different discrimination for females based on race.
#
# Estimate the wage model with a gender-race interaction on the training set;
# female*black expands to female + black + female:black.
mdl_4_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + female*black", data = dt4_train)
print(summary(mdl_4_fit))
##
## Call:
## lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + female*black",
##     data = dt4_train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.54016 -0.31101 -0.03236  0.30491  1.37002
##
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)   1.584e+00  1.839e-01   8.610  < 2e-16 ***
## educ          3.563e-02  2.521e-02   1.413 0.157860
## I(educ^2)     2.511e-03  9.090e-04   2.763 0.005846 **
## exper         2.806e-02  3.999e-03   7.017 4.29e-12 ***
## I(exper^2)   -4.111e-04  8.042e-05  -5.112 3.86e-07 ***
## metro         1.392e-01  3.819e-02   3.644 0.000283 ***
## south        -7.488e-02  3.175e-02  -2.358 0.018569 *
## female       -1.504e-01  3.112e-02  -4.834 1.56e-06 ***
## black        -6.639e-03  8.593e-02  -0.077 0.938435
## female:black -6.167e-02  1.110e-01  -0.556 0.578555
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4519 on 950 degrees of freedom
## Multiple R-squared:  0.3473, Adjusted R-squared:  0.3411
## F-statistic: 56.17 on 9 and 950 DF,  p-value: < 2.2e-16

Note that we may want to have a more readable format, which can be achieved with the stargazer package.

# Same specification in Python via a patsy formula; female*black expands
# to female + black + female:black, as in the R model above.
mdl_4 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + female*black", data = dt4_train)
mdl_4_fit = mdl_4.fit()
print(mdl_4_fit.summary2())
##                  Results: Ordinary least squares
## ==================================================================
## Model:              OLS              Adj. R-squared:     0.341
## Dependent Variable: np.log(wage)     AIC:                1209.3199
## Date:               2020-10-13 21:44 BIC:                1257.9893
## No. Observations:   960              Log-Likelihood:     -594.66
## Df Model:           9                F-statistic:        56.17
## Df Residuals:       950              Prob (F-statistic): 4.89e-82
## R-squared:          0.347            Scale:              0.20423
## ------------------------------------------------------------------
##                     Coef.  Std.Err.    t    P>|t|   [0.025  0.975]
## ------------------------------------------------------------------
## Intercept           1.5837   0.1839  8.6097 0.0000  1.2227  1.9447
## educ                0.0356   0.0252  1.4134 0.1579 -0.0138  0.0851
## np.power(educ, 2)   0.0025   0.0009  2.7625 0.0058  0.0007  0.0043
## exper               0.0281   0.0040  7.0175 0.0000  0.0202  0.0359
## np.power(exper, 2) -0.0004   0.0001 -5.1115 0.0000 -0.0006 -0.0003
## metro               0.1392   0.0382  3.6436 0.0003  0.0642  0.2141
## south              -0.0749   0.0318 -2.3581 0.0186 -0.1372 -0.0126
## female             -0.1504   0.0311 -4.8342 0.0000 -0.2115 -0.0894
## black              -0.0066   0.0859 -0.0773 0.9384 -0.1753  0.1620
## female:black       -0.0617   0.1110 -0.5557 0.5786 -0.2795  0.1561
## ------------------------------------------------------------------
## Omnibus:               0.969        Durbin-Watson:           2.028
## Prob(Omnibus):         0.616        Jarque-Bera (JB):        0.978
## Skew:                  0.077        Prob(JB):                0.613
## Kurtosis:              2.976        Condition No.:           12581
## ==================================================================
## * The condition number is large (1e+04). This might indicate
## strong multicollinearity or other numerical problems.

Both black and female * black are insignificant, so we can remove them from the regression.

Alternatively, we may want to carry out an $$F$$-test to test the joint hypothesis that: $\begin{cases} H_0&: \beta_{female} = 0, \beta_{black} = 0, \beta_{female \times black} = 0\\ H_1&: \text{at least one of the tested parameters is not zero} \end{cases}$

If we fail to reject the null hypothesis, then both race and gender have no significant effect on the model.

print(car::linearHypothesis(mdl_4_fit, c("female=0", "black=0", "female:black=0")))
## Linear hypothesis test
##
## Hypothesis:
## female = 0
## black = 0
## female:black = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female * black
##
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)
## 1    953 199.89
## 2    950 194.01  3    5.8788 9.5952 3.033e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(mdl_4_fit.f_test("female=0, black=0, female:black=0"))
## <F test: F=array([[9.59522978]]), p=3.0326890687892767e-06, df_denom=950, df_num=3>

Since the $$p$$-value < 0.05, we reject the null hypothesis and conclude that at least one of the variables is statistically significant.

If we only look at the joint hypothesis for black and female:black:

print(car::linearHypothesis(mdl_4_fit, c("black=0", "female:black=0")))
## Linear hypothesis test
##
## Hypothesis:
## black = 0
## female:black = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female * black
##
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    952 194.20
## 2    950 194.01  2   0.18448 0.4517 0.6367
print(mdl_4_fit.f_test("black=0, female:black=0"))
## <F test: F=array([[0.45165102]]), p=0.6367129022873298, df_denom=950, df_num=2>

Then we do not reject the null hypothesis that both black and female:black are not statistically significant and thus we can remove them both from our model.

We can also do this with the ANOVA test: by specifying the restricted model under the null:

#
mdl_4_restricted_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south", data = dt4_train)
print(anova(mdl_4_restricted_fit, mdl_4_fit))
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female * black
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)
## 1    953 199.89
## 2    950 194.01  3    5.8788 9.5952 3.033e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mdl_4_restricted = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south", data = dt4_train)
mdl_4_restricted_fit = mdl_4_restricted.fit()
print(sm.stats.anova_lm(mdl_4_restricted_fit, mdl_4_fit))
##    df_resid         ssr  df_diff  ss_diff        F    Pr(>F)
## 0     953.0  199.892933      0.0      NaN      NaN       NaN
## 1     950.0  194.014163      3.0  5.87877  9.59523  0.000003
#
mdl_4_restricted_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + female", data = dt4_train)
print(anova(mdl_4_restricted_fit, mdl_4_fit))
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female * black
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    952 194.20
## 2    950 194.01  2   0.18448 0.4517 0.6367
mdl_4_restricted = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + female", data = dt4_train)
mdl_4_restricted_fit = mdl_4_restricted.fit()
print(sm.stats.anova_lm(mdl_4_restricted_fit, mdl_4_fit))
##    df_resid         ssr  df_diff   ss_diff         F    Pr(>F)
## 0     952.0  194.198640      0.0       NaN       NaN       NaN
## 1     950.0  194.014163      2.0  0.184477  0.451651  0.636713

Note: In case of RunTime Warning - these specific RuntimeWarnings are coming from scipy.stats.distributions, but are “by design”. In statsmodels these “invalid” RuntimeWarnings should not cause problems.

We see that we get the exact same $$F$$-statistic and the exact same $$p$$-value. So, we can use either method to carry out the $$F$$-test for multiple coefficient significance (i.e. multiple restrictions).

Considerations

Note: we may omit print() to get a more concise output:

car::linearHypothesis(mdl_4_fit, c("black=0", "female:black=0"))
## Linear hypothesis test
##
## Hypothesis:
## black = 0
## female:black = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female * black
##
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    952 194.20
## 2    950 194.01  2   0.18448 0.4517 0.6367
anova(mdl_4_restricted_fit, mdl_4_fit)
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female * black
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    952 194.20
## 2    950 194.01  2   0.18448 0.4517 0.6367

Do note though - when working with .R scripts or via a command terminal (e.g. when executing the scripts on a remote server) the print() command is essential for printing output, whereas simply calling the function would not print the result by default.

• We may also be interested if an additional year of education has a different effect based on gender: $$female \times educ$$. Since the coefficient of female is negative, it would be interesting to see, whether a higher education has a different effect based on a person's gender.
#
mdl_4_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + female*educ", data = dt4_train)
print(summary(mdl_4_fit))
##
## Call:
## lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + female*educ",
##     data = dt4_train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.61648 -0.30485 -0.02365  0.31103  1.41734
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.621e+00  1.836e-01   8.827  < 2e-16 ***
## educ         4.064e-02  2.515e-02   1.616 0.106426
## I(educ^2)    1.932e-03  9.271e-04   2.083 0.037477 *
## exper        2.858e-02  3.985e-03   7.173 1.48e-12 ***
## I(exper^2)  -4.152e-04  8.007e-05  -5.186 2.63e-07 ***
## metro        1.423e-01  3.803e-02   3.743 0.000193 ***
## south       -8.088e-02  3.108e-02  -2.602 0.009405 **
## female      -6.067e-01  1.586e-01  -3.825 0.000139 ***
## educ:female  3.121e-02  1.082e-02   2.884 0.004016 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4499 on 951 degrees of freedom
## Multiple R-squared:  0.3524, Adjusted R-squared:  0.3469
## F-statistic: 64.68 on 8 and 951 DF,  p-value: < 2.2e-16
mdl_4 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + female*educ", data = dt4_train)
mdl_4_fit = mdl_4.fit()
print(mdl_4_fit.summary2())
##                  Results: Ordinary least squares
## ==================================================================
## Model:              OLS              Adj. R-squared:     0.347
## Dependent Variable: np.log(wage)     AIC:                1199.8730
## Date:               2020-10-13 21:44 BIC:                1243.6754
## No. Observations:   960              Log-Likelihood:     -590.94
## Df Model:           8                F-statistic:        64.68
## Df Residuals:       951              Prob (F-statistic): 1.55e-84
## R-squared:          0.352            Scale:              0.20243
## ------------------------------------------------------------------
##                     Coef.  Std.Err.    t    P>|t|   [0.025  0.975]
## ------------------------------------------------------------------
## Intercept           1.6205   0.1836  8.8272 0.0000  1.2602  1.9808
## educ                0.0406   0.0251  1.6160 0.1064 -0.0087  0.0900
## np.power(educ, 2)   0.0019   0.0009  2.0835 0.0375  0.0001  0.0038
## exper               0.0286   0.0040  7.1727 0.0000  0.0208  0.0364
## np.power(exper, 2) -0.0004   0.0001 -5.1857 0.0000 -0.0006 -0.0003
## metro               0.1423   0.0380  3.7431 0.0002  0.0677  0.2170
## south              -0.0809   0.0311 -2.6023 0.0094 -0.1419 -0.0199
## female             -0.6067   0.1586 -3.8253 0.0001 -0.9179 -0.2955
## female:educ         0.0312   0.0108  2.8839 0.0040  0.0100  0.0524
## ------------------------------------------------------------------
## Omnibus:               0.848        Durbin-Watson:           2.025
## Prob(Omnibus):         0.655        Jarque-Bera (JB):        0.839
## Skew:                  0.072        Prob(JB):                0.657
## Kurtosis:              2.990        Condition No.:           12667
## ==================================================================
## * The condition number is large (1e+04). This might indicate
## strong multicollinearity or other numerical problems.

We note that the $$p$$-value of educ > 0.05. On the other hand the interaction variable between gender and education is significant (as well as the squared education, $$educ^2$$), so we will leave the variables included. We will further motivate this decision via the $$F$$-test.

Looking at the $$F$$-test for the hypothesis: $\begin{cases} H_0&: \beta_{educ} = 0, \beta_{female \times educ} = 0\\ H_1&: \text{at least one of the tested parameters is not zero} \end{cases}$

car::linearHypothesis(mdl_4_fit, c("educ=0", "educ:female=0"))
## Linear hypothesis test
##
## Hypothesis:
## educ = 0
## educ:female = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south +
##     female * educ
##
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)
## 1    953 194.61
## 2    951 192.51  2    2.0935 5.1708 0.005841 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(mdl_4_fit.f_test("educ=0, female:educ=0"))
## <F test: F=array([[5.17076646]]), p=0.005840994535426029, df_denom=951, df_num=2>

The $$p$$-value is less than 0.05, so we reject the null hypothesis and conclude that at least one variable is statistically significant.

However, removing only educ, but leaving the interaction term would further complicate interpretation, especially since its $$p$$-value is so close to the $$5\%$$ significance level. If we relax the significance level, then all of the variables are statistically significant at the 0.15 ($$15\%$$) significance level.

Considerations

Note: THE NAMES OF THE INTERACTION VARIABLE MUST MATCH EXACTLY THE ONES IN THE OUTPUT. The coefficient names can be viewed by:

print(t(names(coef(mdl_4_fit))))
##      [,1]          [,2]   [,3]        [,4]    [,5]         [,6]    [,7]    [,8]     [,9]
## [1,] "(Intercept)" "educ" "I(educ^2)" "exper" "I(exper^2)" "metro" "south" "female" "educ:female"
print(mdl_4_fit.model.exog_names)
## ['Intercept', 'educ', 'np.power(educ, 2)', 'exper', 'np.power(exper, 2)', 'metro', 'south', 'female', 'female:educ']

If we use female:educ instead of educ:female - we would get an error:

# Demonstrate that car::linearHypothesis() needs coefficient names exactly
# as they appear in coef(mdl_4_fit): "female:educ" (instead of the actual
# "educ:female") is not recognised and raises an error, which we catch and
# print instead of letting it stop the script.
tryCatch({
car::linearHypothesis(mdl_4_fit, c("educ=0", "female:educ=0"))
}, error = function(e) {
print(e)
})
## Warning in constants(lhs, cnames_symb): NAs introduced by coercion
## <simpleError in constants(lhs, cnames_symb): The hypothesis "female:educ=0" is not well formed: contains bad coefficient/variable names.>
# The analogous mistake in Python: the design matrix only contains the term
# "female:educ", so "educ:female" in the restriction string raises an
# exception, which we catch and print instead of crashing.
try:
    print(mdl_4_fit.f_test("educ=0, educ:female=0"))
except Exception as err:
    print(err)
## unrecognized token in constraint
##     educ=0, educ:female=0
##                 ^

Note: in order to avoid our code breaking, we wrap the code in a try-catch block and have a function specifically to deal with errors. In this case, it simply prints the error.

In practice, we may need to save the output to a separate file, or have additional methods to handle any possible errors.

INTERPRETATION:

Looking at the model coefficients: \begin{aligned} \underset{(se)}{\widehat{\log(wage)}} &= \underset{(0.1836)}{1.6205} + \underset{(0.0251)}{0.0406} \cdot educ + \underset{(0.0009)}{0.0019} \cdot educ^2 + \underset{(0.0040)}{0.0286} \cdot exper - \underset{(0.0001)}{0.0004} \cdot exper^2 \\ &+ \underset{(0.0380)}{0.1423} \cdot metro - \underset{(0.0311)}{0.0809} \cdot south - \underset{(0.1586)}{0.6067} \cdot female + \underset{(0.0108)}{0.0312} \cdot \left(female \times educ\right) \end{aligned}

or, with a little bit of rearranging, to highlight the effect of gender, we get: \begin{aligned} \underset{(se)}{\widehat{\log(wage)}} &= \underset{(0.1836)}{1.6205} + \underset{(0.0251)}{0.0406} \cdot educ + \underset{(0.0009)}{0.0019} \cdot educ^2 + \underset{(0.0040)}{0.0286} \cdot exper - \underset{(0.0001)}{0.0004} \cdot exper^2 \\ &+ \underset{(0.0380)}{0.1423} \cdot metro - \underset{(0.0311)}{0.0809} \cdot south + \left(\underset{(0.0108)}{0.0312} \cdot educ - \underset{(0.1586)}{0.6067}\right) \cdot female \end{aligned}

a possible interpretation could be as follows: if the person is female, then their $$\log(wage)$$ differs by $$\left(0.0312 \cdot educ - 0.6067\right)$$, compared to males (or the base non-female group), ceteris paribus.

By specifying this model we can see how much education offsets discrimination based on gender. Notice that in this case, if educ = 0, then there is a large difference in wage - the wage is lower by around $$100 \cdot 0.6067 = 60.67 \%$$ for females.

HOWEVER, if we look at the sample data:

print(dt4[dt4[, "educ"] == 0, ])
##      black educ exper faminc female metro midwest south  wage west
## 813      0    0    46  20000      0     0       0     1 12.50    0
## 1083     0    0    35  17000      1     1       0     0  9.19    1
print(dt4.loc[dt4["educ"] == 0])
##       black  educ  exper  faminc  female  metro  midwest  south   wage  west
## 812       0     0     46   20000       0      0        0      1  12.50     0
## 1082      0     0     35   17000       1      1        0      0   9.19     1

We only have two cases when educ = 0 - ONE FOR FEMALES and ONE FOR MALES. Looking at the difference:

(12.50 - 9.19)/9.19
## [1] 0.3601741
(12.50 - 9.19)/9.19
## 0.36017410228509256

is around $$36\%$$, however, other factors, like metro, south and exper are different, while the coefficient in our model holds these values constant (i.e. the same), with only gender being different (this explains the $$60.67\%$$ value in our model).

Having so few datapoints does not reflect the case when educ = 0, hence we should be careful when identifying it.

Considerations

If we check on the training set only:

print(dt4_train[dt4_train[, "educ"] == 0, ])
##     black educ exper faminc female metro midwest south wage west
## 813     0    0    46  20000      0     0       0     1 12.5    0
print(dt4_train.loc[dt4_train["educ"] == 0])
##      black  educ  exper  faminc  female  metro  midwest  south  wage  west
## 812      0     0     46   20000       0      0        0      1  12.5     0

We have only one example point, when educ = 0.

• Yet another possible interaction term is between the regional indicator(s) and education - since education quality most likely differs depending on the region (especially concerning university quality), it may have a significant effect: $$south \times educ$$.
#
# Add a regional interaction: does an extra year of education have a
# different effect in the south (south*educ), alongside the gender
# interaction (female*educ)?
# (Use `<-` for assignment, consistent with the rest of the file.)
mdl_4_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south*educ + female*educ", data = dt4_train)
print(round(coef(summary(mdl_4_fit)), 5))
##             Estimate Std. Error  t value Pr(>|t|)
## (Intercept)  1.61982    0.18804  8.61402  0.00000
## educ         0.04067    0.02522  1.61270  0.10714
## I(educ^2)    0.00193    0.00093  2.07677  0.03809
## exper        0.02858    0.00399  7.16213  0.00000
## I(exper^2)  -0.00042    0.00008 -5.17555  0.00000
## metro        0.14237    0.03807  3.73932  0.00020
## south       -0.07823    0.15482 -0.50531  0.61346
## female      -0.60656    0.15887 -3.81804  0.00014
## educ:south  -0.00019    0.01067 -0.01746  0.98608
## educ:female  0.03120    0.01084  2.87874  0.00408
mdl_4 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south*educ + female*educ", data = dt4_train)
mdl_4_fit = mdl_4.fit()
print(mdl_4_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6198      0.188      8.614      0.000       1.251       1.989
## educ                   0.0407      0.025      1.613      0.107      -0.009       0.090
## np.power(educ, 2)      0.0019      0.001      2.077      0.038       0.000       0.004
## exper                  0.0286      0.004      7.162      0.000       0.021       0.036
## np.power(exper, 2)    -0.0004   8.02e-05     -5.176      0.000      -0.001      -0.000
## metro                  0.1424      0.038      3.739      0.000       0.068       0.217
## south                 -0.0782      0.155     -0.505      0.613      -0.382       0.226
## south:educ            -0.0002      0.011     -0.017      0.986      -0.021       0.021
## female                -0.6066      0.159     -3.818      0.000      -0.918      -0.295
## female:educ            0.0312      0.011      2.879      0.004       0.010       0.052
## ======================================================================================

We see that the interaction variable between south and educ is insignificant, so we will not include it in our model.

• There may also be other interaction variables, like $$south \times female \times educ$$, or $$metro \times south$$, or many more. But we will not go through all of these step-by-step, as we have done so previously to save space and make this example readable. We have tried a couple more of these interaction terms and arrived at the following model:
#
# Final interaction specification: the three-way term metro*female*educ
# expands to all lower-order terms plus metro:female:educ; south enters
# as a plain level shift.
# (Use `<-` for assignment, consistent with the rest of the file.)
mdl_4_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + south + metro*female*educ", data = dt4_train)
print(round(coef(summary(mdl_4_fit)), 5))
##                   Estimate Std. Error  t value Pr(>|t|)
## (Intercept)        2.06909    0.25791  8.02243  0.00000
## educ               0.01140    0.02805  0.40637  0.68456
## I(educ^2)          0.00156    0.00093  1.67501  0.09426
## exper              0.02830    0.00397  7.12098  0.00000
## I(exper^2)        -0.00041    0.00008 -5.11295  0.00000
## south             -0.08607    0.03103 -2.77343  0.00566
## metro             -0.46074    0.23565 -1.95515  0.05086
## female            -1.59775    0.38997 -4.09705  0.00005
## metro:female       1.15712    0.42041  2.75239  0.00603
## educ:metro         0.04613    0.01733  2.66224  0.00789
## educ:female        0.10495    0.02748  3.81886  0.00014
## educ:metro:female -0.08573    0.02947 -2.90905  0.00371
mdl_4 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + south + metro*female*educ", data = dt4_train)
mdl_4_fit = mdl_4.fit()
print(mdl_4_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              2.0691      0.258      8.022      0.000       1.563       2.575
## educ                   0.0114      0.028      0.406      0.685      -0.044       0.066
## np.power(educ, 2)      0.0016      0.001      1.675      0.094      -0.000       0.003
## exper                  0.0283      0.004      7.121      0.000       0.021       0.036
## np.power(exper, 2)    -0.0004   7.99e-05     -5.113      0.000      -0.001      -0.000
## south                 -0.0861      0.031     -2.773      0.006      -0.147      -0.025
## metro                 -0.4607      0.236     -1.955      0.051      -0.923       0.002
## female                -1.5977      0.390     -4.097      0.000      -2.363      -0.832
## metro:female           1.1571      0.420      2.752      0.006       0.332       1.982
## metro:educ             0.0461      0.017      2.662      0.008       0.012       0.080
## female:educ            0.1049      0.027      3.819      0.000       0.051       0.159
## metro:female:educ     -0.0857      0.029     -2.909      0.004      -0.144      -0.028
## ======================================================================================

Note that the $$metro$$ coefficient is now negative - it is possible that due to the dense population and abundance of job-seekers, the limited supply of jobs could result in a lower wage, if $$educ = 0$$ and/or $$female = 0$$. This is because we have also included $$educ \times metro$$ and $$metro \times female$$, which capture specific factors when wages increase in metropolitan areas, as well as $$educ \times metro \times female$$, which results in a decrease in (the logarithm of) $$wage$$.

The $$F$$-test for the joint significance for education significance: $\begin{cases} H_0&: \beta_{educ} = \beta_{educ^2} = \beta_{female \times educ} = \beta_{metro \times educ} = \beta_{metro \times female \times educ}= 0\\ H_1&: \text{at least one of the tested parameters is not zero} \end{cases}$

t(names(coef(mdl_4_fit)))
##      [,1]          [,2]   [,3]        [,4]    [,5]         [,6]    [,7]    [,8]     [,9]           [,10]        [,11]         [,12]
## [1,] "(Intercept)" "educ" "I(educ^2)" "exper" "I(exper^2)" "south" "metro" "female" "metro:female" "educ:metro" "educ:female" "educ:metro:female"
print(mdl_4_fit.params.index.format())
## ['Intercept', 'educ', 'np.power(educ, 2)', 'exper', 'np.power(exper, 2)', 'south', 'metro', 'female', 'metro:female', 'metro:educ', 'female:educ', 'metro:female:educ']
print(car::linearHypothesis(mdl_4_fit, c("educ=0", "I(educ^2)=0", "educ:female=0", "educ:metro=0", "educ:metro:female=0")))
## Linear hypothesis test
##
## Hypothesis:
## educ = 0
## I(educ^2) = 0
## educ:female = 0
## educ:metro = 0
## educ:metro:female = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + south + metro *
##     female * educ
##
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)
## 1    953 275.80
## 2    948 190.52  5    85.282 84.871 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(mdl_4_fit.f_test("educ=0, np.power(educ, 2) = 0, female:educ=0, metro:educ=0, metro:female:educ=0"))
## <F test: F=array([[84.87053176]]), p=9.519094246918181e-74, df_denom=948, df_num=5>

With $$p$$-value < 0.05, we reject the null hypothesis and conclude that educ is statistically significant in our model.

On the other hand, we could remove the squared value of educ. Though we will examine this in more detail in the collinearity task.

Furthermore, testing the significance of only the $$educ$$ and its polynomial $$educ^2$$: $\begin{cases} H_0&: \beta_{educ} = \beta_{educ^2} = 0\\ H_1&: \text{at least one of the tested parameters is not zero} \end{cases}$ yields:

car::linearHypothesis(mdl_4_fit, c("educ=0", "I(educ^2)=0"))
## Linear hypothesis test
##
## Hypothesis:
## educ = 0
## I(educ^2) = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + south + metro *
##     female * educ
##
##   Res.Df    RSS Df Sum of Sq     F   Pr(>F)
## 1    950 193.08
## 2    948 190.52  2     2.564 6.379 0.001771 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(mdl_4_fit.f_test("educ=0, np.power(educ, 2) = 0"))
## <F test: F=array([[6.37903426]]), p=0.001770503344307574, df_denom=948, df_num=2>

a $$p$$-value < 0.05, which means that we still reject the null hypothesis and conclude that education has a significant effect on wage.

Finally, the $$R^2_{adj}$$ is:

print(summary(mdl_4_fit)$adj.r.squared)
## [1] 0.351647
print(mdl_4_fit.rsquared_adj)
## 0.35164699449392944

Interaction terms are not restricted to indicator variables - we can include interactions where BOTH variables are non-indicators. Consequently, let us look at yet another interaction variable, but this time between educ and exper. The motivation for including this interaction variable can be formulated as a question:

• Does an additional year of experience have a magnified effect based on years of education?
• Does an additional year of education have a magnified effect based on the years of experience?
• Does a unit increase in education result in a change in the effectiveness of experience on wage? (e.g. an increase in education lowers the effect (i.e. the coefficient) of experience)
• Does a unit increase in experience result in a change in the effectiveness of education on wage? (e.g. an increase in experience lowers the effect (i.e. the coefficient) of education)

In other words, we want to include an additional variable, $$educ \times exper$$, in our model:

#
mdl_4_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + south + metro*female*educ + educ:exper", data = dt4_train)
print(round(coef(summary(mdl_4_fit)), 5))
##                   Estimate Std. Error  t value Pr(>|t|)
## (Intercept)        1.18919    0.36300  3.27602  0.00109
## educ               0.09552    0.03717  2.57008  0.01032
## I(educ^2)         -0.00009    0.00104 -0.08836  0.92961
## exper              0.05289    0.00819  6.45440  0.00000
## I(exper^2)        -0.00050    0.00008 -5.96478  0.00000
## south             -0.08416    0.03087 -2.72660  0.00652
## metro             -0.43202    0.23448 -1.84244  0.06572
## female            -1.61971    0.38784 -4.17624  0.00003
## metro:female       1.16966    0.41806  2.79780  0.00525
## educ:metro         0.04387    0.01724  2.54423  0.01111
## educ:female        0.10563    0.02733  3.86538  0.00012
## educ:exper        -0.00145    0.00042 -3.42515  0.00064
## educ:metro:female -0.08643    0.02930 -2.94950  0.00326
mdl_4 = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + south + metro*female*educ + educ:exper", data = dt4_train)
mdl_4_fit = mdl_4.fit()
print(mdl_4_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.1892      0.363      3.276      0.001       0.477       1.902
## educ                   0.0955      0.037      2.570      0.010       0.023       0.168
## np.power(educ, 2)  -9.233e-05      0.001     -0.088      0.930      -0.002       0.002
## exper                  0.0529      0.008      6.454      0.000       0.037       0.069
## np.power(exper, 2)    -0.0005   8.38e-05     -5.965      0.000      -0.001      -0.000
## south                 -0.0842      0.031     -2.727      0.007      -0.145      -0.024
## metro                 -0.4320      0.234     -1.842      0.066      -0.892       0.028
## female                -1.6197      0.388     -4.176      0.000      -2.381      -0.859
## metro:female           1.1697      0.418      2.798      0.005       0.349       1.990
## metro:educ             0.0439      0.017      2.544      0.011       0.010       0.078
## female:educ            0.1056      0.027      3.865      0.000       0.052       0.159
## metro:female:educ     -0.0864      0.029     -2.949      0.003      -0.144      -0.029
## educ:exper            -0.0015      0.000     -3.425      0.001      -0.002      -0.001
## ======================================================================================

The coefficient of the interaction term educ:exper is statistically significant ($$p$$-value < 0.05).
INTERPRETATION: This means that we can write our model as (note, we will keep a general notation to make it easier to see what we want to explain):

$$\begin{aligned} \log(wage) &= \beta_0 + \beta_1 educ + \beta_2 educ^2 + \beta_3 exper + \beta_4 exper^2 \\ &+ \beta_5 metro + \beta_6 south + \beta_7 west + \beta_8 midwest + \beta_9 female + \beta_{10} black \\ &+ \beta_{11} \left( educ \times exper \right) + \epsilon \end{aligned}$$

We can re-write this equation as:

$$\begin{aligned} \log(wage) &= \beta_0 + \left(\beta_1 + \beta_{11} exper \right)educ + \beta_2 educ^2 + \beta_3 exper + \beta_4 exper^2 \\ &+ \beta_5 metro + \beta_6 south + \beta_7 west + \beta_8 midwest + \beta_9 female + \beta_{10} black \\ &+ \epsilon \end{aligned}$$

So, the coefficient $$\beta_{11}$$ can be interpreted as the change in effectiveness of education for a one unit increase in experience.

Alternatively, rewriting the equation as:

$$\begin{aligned} \log(wage) &= \beta_0 + \beta_1 educ + \beta_2 educ^2 + \left( \beta_3 + \beta_{11} educ \right) exper + \beta_4 exper^2 \\ &+ \beta_5 metro + \beta_6 south + \beta_7 west + \beta_8 midwest + \beta_9 female + \beta_{10} black \\ &+ \epsilon \end{aligned}$$

In this case, the coefficient $$\beta_{11}$$ can be interpreted as the change in effectiveness of experience for a one unit increase in education.

We would also like to point out that the adjusted $$R^2$$ is larger than in the previous model. The $$R^2_{adj}$$ of the new model is slightly larger than before:

print(summary(mdl_4_fit)$adj.r.squared)
## [1] 0.3589044
print(mdl_4_fit.rsquared_adj)
## 0.3589044149386491

We do note one more result: the square of educ is insignificant - I(educ^2) has a $$p$$-value of 0.92961, in which case we would not reject the null hypothesis that it is insignificant.

Considerations

Let us drop this squared variable and compare the $$R_{adj}^2$$, AIC and BIC values of the two models.

The unrestricted model:

print(summary(mdl_4_fit))
##
## Call:
## lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + south + metro*female*educ + educ:exper",
##     data = dt4_train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.69268 -0.30245 -0.02787  0.29376  1.46367
##
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)        1.189e+00  3.630e-01   3.276 0.001091 **
## educ               9.552e-02  3.717e-02   2.570 0.010319 *
## I(educ^2)         -9.233e-05  1.045e-03  -0.088 0.929610
## exper              5.289e-02  8.195e-03   6.454 1.73e-10 ***
## I(exper^2)        -4.999e-04  8.381e-05  -5.965 3.46e-09 ***
## south             -8.416e-02  3.087e-02  -2.727 0.006517 **
## metro             -4.320e-01  2.345e-01  -1.842 0.065723 .
## female            -1.620e+00  3.878e-01  -4.176 3.24e-05 ***
## metro:female       1.170e+00  4.181e-01   2.798 0.005250 **
## educ:metro         4.387e-02  1.724e-02   2.544 0.011110 *
## educ:female        1.056e-01  2.733e-02   3.865 0.000118 ***
## educ:exper        -1.451e-03  4.237e-04  -3.425 0.000641 ***
## educ:metro:female -8.643e-02  2.930e-02  -2.949 0.003261 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4458 on 947 degrees of freedom
## Multiple R-squared:  0.3669, Adjusted R-squared:  0.3589
## F-statistic: 45.74 on 12 and 947 DF,  p-value: < 2.2e-16
print(AIC(mdl_4_fit))
## [1] 1188.05
print(BIC(mdl_4_fit))
## [1] 1256.187
print(mdl_4_fit.summary2())
##                  Results: Ordinary least squares
## ==================================================================
## Model:              OLS              Adj. R-squared:     0.359
## Dependent Variable: np.log(wage)     AIC:                1186.0497
## Date:               2020-10-13 21:44 BIC:                1249.3199
## No. Observations:   960              Log-Likelihood:     -580.02
## Df Model:           12               F-statistic:        45.74
## Df Residuals:       947              Prob (F-statistic): 1.35e-85
## R-squared:          0.367            Scale:              0.19872
## ------------------------------------------------------------------
##                     Coef.  Std.Err.    t    P>|t|   [0.025  0.975]
## ------------------------------------------------------------------
## Intercept           1.1892   0.3630  3.2760 0.0011  0.4768  1.9016
## educ                0.0955   0.0372  2.5701 0.0103  0.0226  0.1685
## np.power(educ, 2)  -0.0001   0.0010 -0.0884 0.9296 -0.0021  0.0020
## exper               0.0529   0.0082  6.4544 0.0000  0.0368  0.0690
## np.power(exper, 2) -0.0005   0.0001 -5.9648 0.0000 -0.0007 -0.0003
## south              -0.0842   0.0309 -2.7266 0.0065 -0.1447 -0.0236
## metro              -0.4320   0.2345 -1.8424 0.0657 -0.8922  0.0281
## female             -1.6197   0.3878 -4.1762 0.0000 -2.3808 -0.8586
## metro:female        1.1697   0.4181  2.7978 0.0052  0.3492  1.9901
## metro:educ          0.0439   0.0172  2.5442 0.0111  0.0100  0.0777
## female:educ         0.1056   0.0273  3.8654 0.0001  0.0520  0.1593
## metro:female:educ  -0.0864   0.0293 -2.9495 0.0033 -0.1439 -0.0289
## educ:exper         -0.0015   0.0004 -3.4252 0.0006 -0.0023 -0.0006
## ------------------------------------------------------------------
## Omnibus:               1.478        Durbin-Watson:           2.019
## Prob(Omnibus):         0.478        Jarque-Bera (JB):        1.338
## Skew:                  0.064        Prob(JB):                0.512
## Kurtosis:              3.130        Condition No.:           43030
## ==================================================================
## * The condition number is large (4e+04). This might indicate
## strong multicollinearity or other numerical problems.

The restricted model:

#
# Restricted model: the same specification as mdl_4_fit, but without
# the statistically insignificant I(educ^2) term.
mdl_4_R_fit <- lm(formula = "log(wage) ~ educ + exper + I(exper^2) + south + metro*female*educ + educ:exper", data = dt4_train)
print(summary(mdl_4_R_fit))
##
## Call:
## lm(formula = "log(wage) ~ educ + exper + I(exper^2) + south + metro*female*educ + educ:exper",
##     data = dt4_train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.69322 -0.30260 -0.02759  0.29370  1.46339
##
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)        1.210e+00  2.730e-01   4.433 1.04e-05 ***
## educ               9.271e-02  1.933e-02   4.795 1.88e-06 ***
## exper              5.261e-02  7.566e-03   6.954 6.61e-12 ***
## I(exper^2)        -4.993e-04  8.351e-05  -5.979 3.17e-09 ***
## south             -8.419e-02  3.085e-02  -2.729 0.006466 **
## metro             -4.310e-01  2.341e-01  -1.841 0.065892 .
## female            -1.614e+00  3.813e-01  -4.232 2.54e-05 ***
## metro:female       1.166e+00  4.156e-01   2.805 0.005133 **
## educ:metro         4.377e-02  1.720e-02   2.545 0.011086 *
## educ:female        1.052e-01  2.687e-02   3.916 9.66e-05 ***
## educ:exper        -1.434e-03  3.756e-04  -3.818 0.000143 ***
## educ:metro:female -8.614e-02  2.910e-02  -2.960 0.003155 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4455 on 948 degrees of freedom
## Multiple R-squared:  0.3669, Adjusted R-squared:  0.3596
## F-statistic: 49.95 on 11 and 948 DF,  p-value: < 2.2e-16
print(AIC(mdl_4_R_fit))
## [1] 1186.058
print(BIC(mdl_4_R_fit))
## [1] 1249.328
# Restricted model in Python: same formula as mdl_4, but without np.power(educ, 2).
mdl_4_R = smf.ols(formula = "np.log(wage) ~ educ + exper + np.power(exper, 2) + south + metro*female*educ + educ:exper", data = dt4_train)
mdl_4_R_fit = mdl_4_R.fit()
print(mdl_4_R_fit.summary2())
##                  Results: Ordinary least squares
## ==================================================================
## Model:              OLS              Adj. R-squared:     0.360
## Dependent Variable: np.log(wage)     AIC:                1184.0577
## Date:               2020-10-13 21:44 BIC:                1242.4609
## No. Observations:   960              Log-Likelihood:     -580.03
## Df Model:           11               F-statistic:        49.95
## Df Residuals:       948              Prob (F-statistic): 1.87e-86
## R-squared:          0.367            Scale:              0.19851
## ------------------------------------------------------------------
##                     Coef.  Std.Err.    t    P>|t|   [0.025  0.975]
## ------------------------------------------------------------------
## Intercept           1.2103   0.2730  4.4329 0.0000  0.6745  1.7461
## educ                0.0927   0.0193  4.7954 0.0000  0.0548  0.1307
## exper               0.0526   0.0076  6.9539 0.0000  0.0378  0.0675
## np.power(exper, 2) -0.0005   0.0001 -5.9793 0.0000 -0.0007 -0.0003
## south              -0.0842   0.0308 -2.7292 0.0065 -0.1447 -0.0237
## metro              -0.4310   0.2341 -1.8413 0.0659 -0.8904  0.0284
## female             -1.6135   0.3813 -4.2318 0.0000 -2.3618 -0.8653
## metro:female        1.1658   0.4156  2.8051 0.0051  0.3502  1.9815
## metro:educ          0.0438   0.0172  2.5450 0.0111  0.0100  0.0775
## female:educ         0.1052   0.0269  3.9157 0.0001  0.0525  0.1579
## metro:female:educ  -0.0861   0.0291 -2.9598 0.0032 -0.1433 -0.0290
## educ:exper         -0.0014   0.0004 -3.8179 0.0001 -0.0022 -0.0007
## ------------------------------------------------------------------
## Omnibus:               1.491        Durbin-Watson:           2.019
## Prob(Omnibus):         0.475        Jarque-Bera (JB):        1.351
## Skew:                  0.065        Prob(JB):                0.509
## Kurtosis:              3.131        Condition No.:           42574
## ==================================================================
## * The condition number is large (4e+04). This might indicate
## strong multicollinearity or other numerical problems.

While the coefficient of educ is now significant, we see that the adjusted $$R^2$$ is virtually unchanged (0.3589 vs. 0.3596), while the AIC and BIC are slightly lower (indicating a slightly better model).

All in all dropping the variable appears to not yield any noticeable improvement.

In such a case it is useful to:

• re-examine the signs of the coefficients - are they satisfactory, maybe some other variables need to be removed, or additional ones to be added?
• examine the magnitude of the coefficients with the polynomial variable, and without it - are there any values which greatly change (this may indicate overfitting and in general a less-robust model)?

The relevant coefficients, which we want to compare, are:

# Side-by-side comparison of the unrestricted vs restricted estimates.
# The restricted model drops I(educ^2) (the 3rd coefficient), so an NA
# is inserted at position 3 to keep the two vectors aligned.
b_unres <- coef(mdl_4_fit)
b_res   <- append(x = coef(mdl_4_R_fit), values = NA, after = 2)
coef_mat <- data.frame(
  COEFS = names(b_unres),
  UNRESTRICTED = b_unres,
  RESTRICTED = b_res
)
# Direction of the change and its relative magnitude (in percent).
coef_mat <- data.frame(
  coef_mat,
  'sign(diff)' = sign(b_res - b_unres),
  'ABS(CHANGE) (%)' = abs(b_res / b_unres - 1) * 100,
  check.names = FALSE
)
rownames(coef_mat) <- NULL
print(format(coef_mat, scientific = FALSE))
##                COEFS   UNRESTRICTED    RESTRICTED sign(diff) ABS(CHANGE) (%)
## 1        (Intercept)  1.18919202081  1.2103143771          1      1.77619392
## 2               educ  0.09551713594  0.0927131346         -1      2.93560036
## 3          I(educ^2) -0.00009232766            NA         NA              NA
## 4              exper  0.05289124629  0.0526140023         -1      0.52417739
## 5         I(exper^2) -0.00049991054 -0.0004993319          1      0.11574543
## 6              south -0.08415721868 -0.0841887701         -1      0.03749103
## 7              metro -0.43201586345 -0.4310380474          1      0.22633800
## 8             female -1.61971063354 -1.6135327478          1      0.38141910
## 9       metro:female  1.16965593234  1.1658479108         -1      0.32556766
## 10        educ:metro  0.04386827922  0.0437738433         -1      0.21527148
## 11       educ:female  0.10563419362  0.1051985288         -1      0.41242779
## 12        educ:exper -0.00145129350 -0.0014339979          1      1.19173466
## 13 educ:metro:female -0.08643491958 -0.0861441754          1      0.33637352
# Side-by-side comparison of the unrestricted vs restricted estimates.
# The restricted model drops np.power(educ, 2) (index 2), so NaN is
# inserted there to keep the two coefficient vectors aligned.
b_unres = np.array(mdl_4_fit.params)
b_res = np.insert(np.array(mdl_4_R_fit.params), 2, np.nan)
coef_mat = pd.DataFrame({
    "COEFS": mdl_4.exog_names,
    "UNRESTRICTED": b_unres,
    "RESTRICTED": b_res,
})
# Direction of the change and its relative magnitude (in percent).
coef_mat["sign(diff)"] = np.sign(b_res - b_unres)
coef_mat["ABS(CHANGE) (%)"] = np.abs(b_res / b_unres - 1) * 100
print(coef_mat)
##                  COEFS  UNRESTRICTED  RESTRICTED  sign(diff)  ABS(CHANGE) (%)
## 0            Intercept      1.189192    1.210314         1.0         1.776194
## 1                 educ      0.095517    0.092713        -1.0         2.935600
## 2    np.power(educ, 2)     -0.000092         NaN         NaN              NaN
## 3                exper      0.052891    0.052614        -1.0         0.524177
## 4   np.power(exper, 2)     -0.000500   -0.000499         1.0         0.115745
## 5                south     -0.084157   -0.084189        -1.0         0.037491
## 6                metro     -0.432016   -0.431038         1.0         0.226338
## 7               female     -1.619711   -1.613533         1.0         0.381419
## 8         metro:female      1.169656    1.165848        -1.0         0.325568
## 9           metro:educ      0.043868    0.043774        -1.0         0.215271
## 10         female:educ      0.105634    0.105199        -1.0         0.412428
## 11   metro:female:educ     -0.086435   -0.086144         1.0         0.336374
## 12          educ:exper     -0.001451   -0.001434         1.0         1.191735

We see that educ coefficient value is affected the most by the removal of educ^2 - decreasing by around $$2.9\%$$, while the remaining parameters (excluding the intercept and south) decreased between $$0.11\%$$ and $$1.19\%$$.

All in all, removing this variable neither improves, nor worsens our model.

Generally, we may want to remove the insignificant variables. However, before deciding on the removal of this variable, let us examine, whether any linear restrictions can be applied.

Maybe re-estimating the coefficients via RLS would improve the significance of the squared educ variable in our model?

#### 4.11.2.3 Task 8

On the other hand, looking at the coefficient signs and magnitude for educ and exper, we may want to verify the following hypothesis: $\text{Each additional year of education has the same effect as each additional year of experience on }\log(wage)$ Note that this concerns not only educ and exper, but their polynomial terms as well!

This restriction can be formulated as the following hypothesis: $\begin{cases} H_0&: \beta_{educ} = \beta_{exper},\text{ and } \beta_{educ^2} = \beta_{exper^2}\\\\ H_1&: \beta_{educ} \neq \beta_{exper}\text{ or } \beta_{educ^2} \neq \beta_{exper^2} \text{ or both} \end{cases}$ Note that in a previous task we have already carried out a number of multiple restriction tests, but we simply tested whether multiple parameters are significant or not. We did not test whether some parameters are statistically significantly identical to one another.

print(car::linearHypothesis(mdl_4_fit, c("educ-exper=0", "I(educ^2)-I(exper^2)=0")))
## Linear hypothesis test
##
## Hypothesis:
## educ - exper = 0
## I(educ^2) - I(exper^2) = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + south + metro *
##     female * educ + educ:exper
##
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)
## 1    949 190.25
## 2    947 188.19  2     2.061 5.1857 0.005756 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(mdl_4_fit.f_test("educ-exper=0, np.power(educ, 2) - np.power(exper, 2)=0"))
## <F test: F=array([[5.18574332]]), p=0.005755772616090293, df_denom=947, df_num=2>

So, we reject the null hypothesis (since p-value = 0.005756 < 0.05) and conclude that education and experience have different effects on $$\log(wage)$$.

Nevertheless, we may still be interested to test if the non-squared coefficients are equal, that is: $\begin{cases} H_0&: \beta_{educ} = \beta_{exper}\\\\ H_1&: \beta_{educ} \neq \beta_{exper} \end{cases}$

Note in this case, there is less economic reasoning for this restriction test, since we are ignoring their polynomial variables.

print(car::linearHypothesis(mdl_4_fit, c("educ-exper=0")))
## Linear hypothesis test
##
## Hypothesis:
## educ - exper = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + south + metro *
##     female * educ + educ:exper
##
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    948 188.51
## 2    947 188.19  1   0.32524 1.6367 0.2011
print(mdl_4_fit.f_test("educ-exper=0"))
## <F test: F=array([[1.63665044]]), p=0.20109902228182291, df_denom=947, df_num=1>

In this case we do not reject the null hypothesis that the coefficients are equal.

This conclusion allows us to re-estimate the regression via restricted least squares (RLS).

### 4.11.3 Exercise Set 3

#### 4.11.3.1 Task 9

In order to re-estimate the model via RLS in Python, we need to specify our model as a Generalized Linear Model (GLM). This is pretty straightforward:

# Re-specify the same model as a GLM; the linear restrictions will be
# imposed afterwards via fit_constrained().
mdl_4_rls = sm.GLM.from_formula(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + south + metro*female*educ + educ:exper", data = dt4_train)
mdl_4_rls_fit = mdl_4_rls.fit()

Let’s look again at our initial model, which we want to apply RLS:

round(coef(summary(mdl_4_fit)), 5)
##                   Estimate Std. Error  t value Pr(>|t|)
## (Intercept)        1.18919    0.36300  3.27602  0.00109
## educ               0.09552    0.03717  2.57008  0.01032
## I(educ^2)         -0.00009    0.00104 -0.08836  0.92961
## exper              0.05289    0.00819  6.45440  0.00000
## I(exper^2)        -0.00050    0.00008 -5.96478  0.00000
## south             -0.08416    0.03087 -2.72660  0.00652
## metro             -0.43202    0.23448 -1.84244  0.06572
## female            -1.61971    0.38784 -4.17624  0.00003
## metro:female       1.16966    0.41806  2.79780  0.00525
## educ:metro         0.04387    0.01724  2.54423  0.01111
## educ:female        0.10563    0.02733  3.86538  0.00012
## educ:exper        -0.00145    0.00042 -3.42515  0.00064
## educ:metro:female -0.08643    0.02930 -2.94950  0.00326
print(mdl_4_rls_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          z      P>|z|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.1892      0.363      3.276      0.001       0.478       1.901
## educ                   0.0955      0.037      2.570      0.010       0.023       0.168
## np.power(educ, 2)  -9.233e-05      0.001     -0.088      0.930      -0.002       0.002
## exper                  0.0529      0.008      6.454      0.000       0.037       0.069
## np.power(exper, 2)    -0.0005   8.38e-05     -5.965      0.000      -0.001      -0.000
## south                 -0.0842      0.031     -2.727      0.006      -0.145      -0.024
## metro                 -0.4320      0.234     -1.842      0.065      -0.892       0.028
## female                -1.6197      0.388     -4.176      0.000      -2.380      -0.860
## metro:female           1.1697      0.418      2.798      0.005       0.350       1.989
## metro:educ             0.0439      0.017      2.544      0.011       0.010       0.078
## female:educ            0.1056      0.027      3.865      0.000       0.052       0.159
## metro:female:educ     -0.0864      0.029     -2.949      0.003      -0.144      -0.029
## educ:exper            -0.0015      0.000     -3.425      0.001      -0.002      -0.001
## ======================================================================================

We see that the output table is pretty much identical to OLS(). GLM maximizes the likelihood function in order to estimate the model, rather than using the exact OLS expression.

We can apply the linear restriction via the restriction matrices, as defined in 4.4.

Based on the positions of educ and exper, in order to apply educ-exper=0 restriction, we need to specify the restriction matrix and value vector as:

# Single restriction educ - exper = 0, expressed as LL %*% beta = rr.
# The 13 columns follow the printed coefficient order: (Intercept), educ,
# I(educ^2), exper, I(exper^2), south, metro, female, metro:female,
# educ:metro, educ:female, educ:exper, educ:metro:female — so +1 on educ
# (position 2) and -1 on exper (position 4).
LL = matrix(c(0, 1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0), nrow = 1, byrow = TRUE)
rr = 0
# Same restriction for the Python model: in the exog column order,
# educ is index 1 and exper is index 3, hence +1 and -1 there.
LL = [[0, 1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
rr = [0]

Then, we can estimate the model via RLS with these restrictions:

mdl_4_rls_fit = lrmest::rls(formula = formula(mdl_4_fit), R = LL, r = rr, data = dt4_train, delt = rep(0, length(rr)))
## Warning in model.matrix.default(md, cal, contrasts): non-list contrasts argument ignored
mdl_4_rls_fit = mdl_4_rls.fit_constrained((LL, rr))
print(mdl_4_rls_fit[[1]])
##                   Estimate Standard_error t_statistic pvalue
## (Intercept)         1.6043         0.1627           0      1
## educ                0.0490         0.0076          NA     NA
## I(educ^2)           0.0011         0.0005          NA     NA
## exper               0.0490         0.0076          NA     NA
## I(exper^2)         -0.0005         0.0001          NA     NA
## south              -0.0861         0.0308          NA     NA
## metro              -0.5489         0.2159          NA     NA
## female             -1.6741         0.3855          NA     NA
## metro:female        1.2501         0.4133          NA     NA
## educ:metro          0.0525         0.0159          NA     NA
## educ:female         0.1101         0.0271          NA     NA
## educ:exper         -0.0012         0.0004          NA     NA
## educ:metro:female  -0.0925         0.0289          NA     NA
print(mdl_4_rls_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          z      P>|z|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6043      0.163      9.855      0.000       1.285       1.923
## educ                   0.0490      0.008      6.441      0.000       0.034       0.064
## np.power(educ, 2)      0.0011      0.001      1.967      0.049    3.55e-06       0.002
## exper                  0.0490      0.008      6.441      0.000       0.034       0.064
## np.power(exper, 2)    -0.0005   8.38e-05     -5.992      0.000      -0.001      -0.000
## south                 -0.0861      0.031     -2.793      0.005      -0.147      -0.026
## metro                 -0.5489      0.216     -2.541      0.011      -0.972      -0.126
## female                -1.6741      0.386     -4.341      0.000      -2.430      -0.918
## metro:female           1.2501      0.413      3.024      0.002       0.440       2.060
## metro:educ             0.0525      0.016      3.304      0.001       0.021       0.084
## female:educ            0.1101      0.027      4.063      0.000       0.057       0.163
## metro:female:educ     -0.0925      0.029     -3.197      0.001      -0.149      -0.036
## educ:exper            -0.0012      0.000     -3.231      0.001      -0.002      -0.000
## ======================================================================================

Note that the t-statistics are not calculated. Fortunately, we can do this manually following the lecture notes (or other literature references therein):

# Manual RLS computation. Defect fixed: several statements had been
# collapsed onto a single line; restored to one statement per line.
#
# Calculate the OLS estimate: beta_ols = (X'X)^(-1) X'y
X_d <- model.matrix(mdl_4_fit)
xtx_inv  <- solve(t(X_d) %*% X_d)
beta_ols <- xtx_inv %*% t(X_d) %*% mdl_4_fit$model[, 1]
# Calculate the Adjustment component needed for the RLS:
# RA = (X'X)^(-1) L' [L (X'X)^(-1) L']^(-1) (L beta_ols - r)
RA_1 <- xtx_inv %*% t(LL)
RA_2 <- solve(LL %*% xtx_inv %*% t(LL))
RA_3 <- LL %*% beta_ols - rr
RA   <- RA_1 %*% RA_2 %*% RA_3
# RLS = OLS - 'Restriction Adjustment'
beta_rls <- beta_ols - RA
# Get RLS standard error: residual variance with degrees of freedom
# adjusted by the number of restrictions, df = n - (k - q)
y_fit <- X_d %*% beta_rls
resid <- mdl_4_fit$model[, 1] - y_fit
sigma2_rls <- sum(resid^2) / (nrow(dt4_train) - (length(beta_rls) - length(rr)))
# Get the RLS parameter variance-covariance matrix:
D_mat <- diag(length(beta_rls)) - RA_1 %*% RA_2 %*% LL
rls_vcov <- sigma2_rls * D_mat %*% xtx_inv
beta_rls_se <- sqrt(diag(rls_vcov))

Now we have the model output:

# Two-sided p-values for the RLS t-statistics; the degrees of freedom are
# adjusted for the number of restrictions: df = n - (k - q).
tmp_p_val <- 2 * pt(-abs(beta_rls / beta_rls_se), df = nrow(dt4_train) - (length(beta_rls) - length(rr)), lower = TRUE)
# Assemble the RLS output table: estimates, standard errors, t-stats, p-values.
print(round(data.frame(Estimate = beta_rls, Standard_error = beta_rls_se, t_stat = beta_rls / beta_rls_se, p_val = tmp_p_val), 5))
##                   Estimate Standard_error   t_stat   p_val
## (Intercept)        1.60430        0.16279  9.85523 0.00000
## educ               0.04898        0.00760  6.44057 0.00000
## I(educ^2)          0.00105        0.00054  1.96658 0.04952
## exper              0.04898        0.00760  6.44057 0.00000
## I(exper^2)        -0.00050        0.00008 -5.99229 0.00000
## south             -0.08613        0.03084 -2.79289 0.00533
## metro             -0.54893        0.21601 -2.54125 0.01120
## female            -1.67411        0.38563 -4.34123 0.00002
## metro:female       1.25008        0.41345  3.02356 0.00257
## educ:metro         0.05247        0.01588  3.30392 0.00099
## educ:female        0.11014        0.02711  4.06271 0.00005
## educ:exper        -0.00117        0.00036 -3.23135 0.00127
## educ:metro:female -0.09248        0.02893 -3.19678 0.00144

We can also apply the linear restriction as follows:

mdl_4_rls_fit = mdl_4_rls.fit_constrained("educ - exper = 0")
print(mdl_4_rls_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          z      P>|z|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6043      0.163      9.855      0.000       1.285       1.923
## educ                   0.0490      0.008      6.441      0.000       0.034       0.064
## np.power(educ, 2)      0.0011      0.001      1.967      0.049    3.55e-06       0.002
## exper                  0.0490      0.008      6.441      0.000       0.034       0.064
## np.power(exper, 2)    -0.0005   8.38e-05     -5.992      0.000      -0.001      -0.000
## south                 -0.0861      0.031     -2.793      0.005      -0.147      -0.026
## metro                 -0.5489      0.216     -2.541      0.011      -0.972      -0.126
## female                -1.6741      0.386     -4.341      0.000      -2.430      -0.918
## metro:female           1.2501      0.413      3.024      0.002       0.440       2.060
## metro:educ             0.0525      0.016      3.304      0.001       0.021       0.084
## female:educ            0.1101      0.027      4.063      0.000       0.057       0.163
## metro:female:educ     -0.0925      0.029     -3.197      0.001      -0.149      -0.036
## educ:exper            -0.0012      0.000     -3.231      0.001      -0.002      -0.000
## ======================================================================================

We can see from the output that the restricted coefficients are now equal to one another.

Furthermore, a consequence of RLS is that the associated standard errors are smaller. Consequently, $$educ^2$$ is now significant.

#### 4.11.3.2 Task 10

We will calculate the Variance Inflation Factor for each parameter (note that we do not calculate VIF for the intercept).

# Compute the Variance Inflation Factor (VIF) for every regressor in the
# design matrix, skipping column 0 (the intercept).
vif = pd.DataFrame()
#
vif_out = np.array([])
# NOTE: the loop body below must be indented - the flat version is a syntax error.
for i in range(1, mdl_4.exog.shape[1]):
    # VIF_j = 1 / (1 - R_j^2) from regressing column j on the remaining columns
    tmp_val = smstats.outliers_influence.variance_inflation_factor(mdl_4.exog, i)
    vif_out = np.append(vif_out, [tmp_val])
#
vif["Variable"]   = mdl_4.exog_names[1:]
vif["VIF Factor"] = vif_out
# R equivalent: car::vif() on the fitted model; cbind() prints it as a column
print(cbind(car::vif(mdl_4_fit)))
##                         [,1]
## educ               56.631608
## I(educ^2)          33.892108
## exper              56.051007
## I(exper^2)         14.796290
## south               1.012951
## metro              39.591809
## female            178.920795
## metro:female      197.221692
## educ:metro         54.017813
## educ:female       201.224041
## educ:exper         29.766293
## educ:metro:female 219.815937
print(vif)
##               Variable  VIF Factor
## 0                 educ   56.631608
## 1    np.power(educ, 2)   33.892108
## 2                exper   56.051007
## 3   np.power(exper, 2)   14.796290
## 4                south    1.012951
## 5                metro   39.591809
## 6               female  178.920795
## 7         metro:female  197.221692
## 8           metro:educ   54.017813
## 9          female:educ  201.224041
## 10   metro:female:educ  219.815937
## 11          educ:exper   29.766293

Alternatively, a more compact way of using a for loop (without variable names):

# One-liner: list comprehension over all non-intercept columns of the design matrix
[smstats.outliers_influence.variance_inflation_factor(mdl_4.exog, i) for i in range(1, mdl_4.exog.shape[1])]
## [56.6316082950089, 33.892107555039, 56.05100686110664, 14.796290155581115, 1.012951111202197, 39.59180923834962, 178.92079451605525, 197.221692463743, 54.01781280151604, 201.2240408332581, 219.81593699931122, 29.766293479460398]

Considerations

A couple of points regarding high VIF for polynomial and indicator variables:

• If a model has $$x$$ and $$x^2$$, or if it has $$x$$, $$z$$ and $$x \cdot z$$, there is a good chance that they will be collinear. Fortunately, this is not something to be concerned about, because the $$p$$-value for $$x \cdot z$$ is not affected by the multicollinearity. This can be verified by centering the variables:
• You can reduce the correlations by centering the variables (i.e., subtracting their means) before creating the powers or the products.
• The $$p$$-value for $$x_{centered}^2$$ or for $$x_{centered} \cdot z_{centered}$$ will be exactly the same, regardless of whether or not you center. And all the results for the other variables (including the $$R^2$$ but not including the lower-order terms) will be the same in either case. So the multicollinearity has no adverse consequences.
• The variables with high VIFs are indicator (dummy) variables that represent a categorical variable with three or more categories.
• If the proportion of cases in the reference category is small, the indicator variables will necessarily have high VIFs, even if the categorical variable is not associated with other variables in the regression model.
• The $$p$$-values for the indicator variables may be high. But the overall ($$F$$) test that all indicators have coefficients of zero is unaffected by the high VIFs. And nothing else in the regression is affected.
• If you really want to avoid the high VIFs, choose a reference category with a larger fraction of the cases. That may be desirable in order to avoid situations where none of the individual indicators is statistically significant even though the overall set of indicators is significant.

So, in our cases, we see that the interaction terms and indicator variables are taken for all variable combinations. Nevertheless, we may be interested in checking whether educ and exper are collinear.

To do this, we can either define the regression model without any interaction or polynomial variables, or specify the auxiliary regressions manually. We will define a new regression to save some space, but you are encouraged to verify the VIF values by calculating them manually (i.e. without the built-in VIF functions).

# R: reduced model without any polynomial or interaction terms
mdl_small <- lm(formula = "log(wage) ~ educ + exper + south + metro + female", data = dt4_train)
# Python: same reduced specification. VIF only needs the design matrix,
# so the model does not have to be fitted.
mdl_small = smf.ols(formula = "np.log(wage) ~ educ + exper + south + metro + female", data = dt4_train)
#
n_cols = mdl_small.exog.shape[1]
factors = []
for col_idx in range(1, n_cols):
    factors.append(smstats.outliers_influence.variance_inflation_factor(mdl_small.exog, col_idx))
vif_small = pd.DataFrame()
vif_small["VIF Factor"] = factors
vif_small["Variable"]   = mdl_small.exog_names[1:]
print(cbind(car::vif(mdl_small)))
##            [,1]
## educ   1.079138
## exper  1.057703
## south  1.004699
## metro  1.016984
## female 1.024949
print(vif_small)
##    VIF Factor Variable
## 0    1.079138     educ
## 1    1.057703    exper
## 2    1.004699    south
## 3    1.016984    metro
## 4    1.024949   female

Note that from the definition of $$VIF$$, the regression itself for wage does not matter - we are using the design matrix to estimate a model for the exogenous regressors, but we only use those exogenous regressors that we intend to include in our final model.

From the results we see that these variables are NOT collinear. The collinearity only appears from the inclusion of polynomial and interaction variables and is not a cause for concern.

Considerations

In this specific case, the absence of collinearity may initially appear strange, since from the exper and educ variable definitions for this dataset we have that:

• educ - years of education
• exper - potential experience = age - educ - 6

So, we would expect that educ and exper are collinear. We will examine this with more detail right now!

We will begin by taking a subset of the data with only educ and exper since we do not want to modify the original dataset:

# Work on a copy so that the original dataset is left unchanged
dt4_new <- dt4
dt4_new$age <- dt4_new[, "exper"] + dt4_new[, "educ"] + 6
head(dt4_new)
##   black educ exper faminc female metro midwest south  wage west age
## 1     0   13    45      0      1     1       0     0 44.44    1  64
## 2     0   14    25  45351      1     1       1     0 16.00    0  45
## 3     0   18    27  91946      1     1       0     0 15.38    0  51
## 4     0   13    42  48370      0     1       1     0 13.54    0  61
## 5     0   13    41  10000      1     1       0     0 25.00    1  60
## 6     0   16    26 151308      1     1       0     0 24.05    0  48
dt4_new = dt4
dt4_new = dt4_new.assign(age = dt4_new["exper"] + dt4_new["educ"] + 6)
dt4_new.head()
##    black  educ  exper  faminc  female  metro  midwest  south   wage  west  age
## 0      0    13     45       0       1      1        0      0  44.44     1   64
## 1      0    14     25   45351       1      1        1      0  16.00     0   45
## 2      0    18     27   91946       1      1        0      0  15.38     0   51
## 3      0    13     42   48370       0      1        1      0  13.54     0   61
## 4      0    13     41   10000       1      1        0      0  25.00     1   60

If we were to calculate the correlation between these variables:

print(cor(dt4_new[, c("educ", "exper", "age")]))
##              educ      exper        age
## educ   1.00000000 -0.2026401 0.01553757
## exper -0.20264013  1.0000000 0.97598653
## age    0.01553757  0.9759865 1.00000000
dt4_new[["educ", "exper", "age"]].corr()
##            educ     exper       age
## educ   1.000000 -0.202640  0.015538
## exper -0.202640  1.000000  0.975987
## age    0.015538  0.975987  1.000000

We would find that:

• The correlation between educ and age is very small;
• The correlation between educ and exper is around -0.2 - while small, it may be somewhat significant;
• The correlation between exper and age is very large;

So, it may very well be that age and exper are collinear, but not exper and educ. In other words:

• exper - the potential experience (from the definition: years spent not in education, presumably spent working) - is primarily driven by one's age;
• since exper and age are highly correlated - and from the definition of exper - we should be able to use age as a proxy variable (i.e.
a substitute, or as we will later see - an instrumental variable) for exper • the education variable provides additional information, which is not fully captured by exper, or age variables; We can very easily verify this, by replacing exper with age: # mdl_4_age_fit <- lm(formula = "log(wage) ~ educ + I(educ^2) + age + I(age^2) + south + metro*female*educ + educ:age", data = dt4_new) print(round(coef(summary(mdl_4_age_fit)), 5)) ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.07479 0.35266 3.04763 0.00236 ## educ 0.02993 0.03100 0.96560 0.33444 ## I(educ^2) 0.00108 0.00083 1.31175 0.18986 ## age 0.05373 0.00776 6.92426 0.00000 ## I(age^2) -0.00049 0.00007 -6.58779 0.00000 ## south -0.07031 0.02790 -2.51949 0.01188 ## metro -0.47101 0.21936 -2.14721 0.03198 ## female -1.38342 0.35568 -3.88949 0.00011 ## metro:female 1.04811 0.38339 2.73379 0.00635 ## educ:metro 0.04471 0.01610 2.77690 0.00557 ## educ:female 0.08658 0.02503 3.45904 0.00056 ## educ:age -0.00016 0.00036 -0.45085 0.65218 ## educ:metro:female -0.07611 0.02686 -2.83404 0.00467 mdl_4_age = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + age + np.power(age, 2) + south + metro*female*educ + educ:age", data = dt4_new) mdl_4_age_fit = mdl_4_age.fit() print(mdl_4_age_fit.summary().tables[1]) ## ===================================================================================== ## coef std err t P>|t| [0.025 0.975] ## ------------------------------------------------------------------------------------- ## Intercept 1.0748 0.353 3.048 0.002 0.383 1.767 ## educ 0.0299 0.031 0.966 0.334 -0.031 0.091 ## np.power(educ, 2) 0.0011 0.001 1.312 0.190 -0.001 0.003 ## age 0.0537 0.008 6.924 0.000 0.039 0.069 ## np.power(age, 2) -0.0005 7.41e-05 -6.588 0.000 -0.001 -0.000 ## south -0.0703 0.028 -2.519 0.012 -0.125 -0.016 ## metro -0.4710 0.219 -2.147 0.032 -0.901 -0.041 ## female -1.3834 0.356 -3.889 0.000 -2.081 -0.686 ## metro:female 1.0481 0.383 2.734 0.006 0.296 1.800 ## metro:educ 0.0447 0.016 
2.777 0.006 0.013 0.076 ## female:educ 0.0866 0.025 3.459 0.001 0.037 0.136 ## metro:female:educ -0.0761 0.027 -2.834 0.005 -0.129 -0.023 ## educ:age -0.0002 0.000 -0.451 0.652 -0.001 0.001 ## ===================================================================================== comparing the coefficients with the previous model: print(round(coef(summary(mdl_4_fit)), 5)) ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.18919 0.36300 3.27602 0.00109 ## educ 0.09552 0.03717 2.57008 0.01032 ## I(educ^2) -0.00009 0.00104 -0.08836 0.92961 ## exper 0.05289 0.00819 6.45440 0.00000 ## I(exper^2) -0.00050 0.00008 -5.96478 0.00000 ## south -0.08416 0.03087 -2.72660 0.00652 ## metro -0.43202 0.23448 -1.84244 0.06572 ## female -1.61971 0.38784 -4.17624 0.00003 ## metro:female 1.16966 0.41806 2.79780 0.00525 ## educ:metro 0.04387 0.01724 2.54423 0.01111 ## educ:female 0.10563 0.02733 3.86538 0.00012 ## educ:exper -0.00145 0.00042 -3.42515 0.00064 ## educ:metro:female -0.08643 0.02930 -2.94950 0.00326 print(mdl_4_fit.summary().tables[1]) ## ====================================================================================== ## coef std err t P>|t| [0.025 0.975] ## -------------------------------------------------------------------------------------- ## Intercept 1.1892 0.363 3.276 0.001 0.477 1.902 ## educ 0.0955 0.037 2.570 0.010 0.023 0.168 ## np.power(educ, 2) -9.233e-05 0.001 -0.088 0.930 -0.002 0.002 ## exper 0.0529 0.008 6.454 0.000 0.037 0.069 ## np.power(exper, 2) -0.0005 8.38e-05 -5.965 0.000 -0.001 -0.000 ## south -0.0842 0.031 -2.727 0.007 -0.145 -0.024 ## metro -0.4320 0.234 -1.842 0.066 -0.892 0.028 ## female -1.6197 0.388 -4.176 0.000 -2.381 -0.859 ## metro:female 1.1697 0.418 2.798 0.005 0.349 1.990 ## metro:educ 0.0439 0.017 2.544 0.011 0.010 0.078 ## female:educ 0.1056 0.027 3.865 0.000 0.052 0.159 ## metro:female:educ -0.0864 0.029 -2.949 0.003 -0.144 -0.029 ## educ:exper -0.0015 0.000 -3.425 0.001 -0.002 -0.001 ## 
====================================================================================== We see that, because exper and age are highly correlated - the coefficient of age and age^2 are very similar to exper and exper^2 in terms of value, sign and significance ($$t$$ and $$p$$ values). On the other hand, because educ and age have a very small correlation, the coefficient educ:exper: is insignificant. Furthermore, if we were to replace educ with exper, then, since exper and age are highly correlated, we should run into a collinearity problem: # mdl_4_collin <- lm(formula = "log(wage) ~ exper + I(exper^2) + age + I(age^2) + south + metro*female*exper + exper:age", data = dt4_new) print(round(coef(summary(mdl_4_collin)), 5)) ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 0.27310 0.47927 0.56983 0.56890 ## exper -0.04755 0.03697 -1.28639 0.19856 ## I(exper^2) 0.00159 0.00081 1.95374 0.05097 ## age 0.10040 0.04091 2.45398 0.01427 ## I(age^2) 0.00090 0.00090 1.00019 0.31742 ## south -0.06127 0.02805 -2.18383 0.02917 ## metro 0.00534 0.09113 0.05859 0.95329 ## female -0.12614 0.13574 -0.92925 0.35295 ## metro:female -0.00846 0.14713 -0.05748 0.95417 ## exper:metro 0.00456 0.00318 1.43560 0.15138 ## exper:female -0.00287 0.00487 -0.58976 0.55546 ## exper:age -0.00299 0.00168 -1.77717 0.07580 ## exper:metro:female 0.00080 0.00533 0.14966 0.88106 mdl_4_collin = smf.ols(formula = "np.log(wage) ~ exper + np.power(exper, 2) + age + np.power(age, 2) + south + metro*female*exper + exper:age", data = dt4_new) mdl_4_collin_fit = mdl_4_collin.fit() print(mdl_4_collin_fit.summary().tables[1]) ## ====================================================================================== ## coef std err t P>|t| [0.025 0.975] ## -------------------------------------------------------------------------------------- ## Intercept 0.2731 0.479 0.570 0.569 -0.667 1.213 ## exper -0.0476 0.037 -1.286 0.199 -0.120 0.025 ## np.power(exper, 2) 0.0016 0.001 1.954 0.051 -6.7e-06 0.003 ## age 0.1004 
0.041 2.454 0.014 0.020 0.181 ## np.power(age, 2) 0.0009 0.001 1.000 0.317 -0.001 0.003 ## south -0.0613 0.028 -2.184 0.029 -0.116 -0.006 ## metro 0.0053 0.091 0.059 0.953 -0.173 0.184 ## female -0.1261 0.136 -0.929 0.353 -0.392 0.140 ## metro:female -0.0085 0.147 -0.057 0.954 -0.297 0.280 ## metro:exper 0.0046 0.003 1.436 0.151 -0.002 0.011 ## female:exper -0.0029 0.005 -0.590 0.555 -0.012 0.007 ## metro:female:exper 0.0008 0.005 0.150 0.881 -0.010 0.011 ## exper:age -0.0030 0.002 -1.777 0.076 -0.006 0.000 ## ====================================================================================== What we immediately notice (compared with mdl_4_age_fit) that by replacing this one variable in the model: • most of the parameters are now insignificant; • the sign of exper is negative (more experience leads to a smaller wage, which is questionable); • the signs for metro, age^2, metro:female changed; Furthermore, if we were to carry out an $$F$$-test to check the overall significance (it is immediately available in one of the model output tables): print(summary(mdl_4_collin)) ## ## Call: ## lm(formula = "log(wage) ~ exper + I(exper^2) + age + I(age^2) + south + metro*female*exper + exper:age", ## data = dt4_new) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.61648 -0.30066 -0.02562 0.29658 2.05121 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 0.2730996 0.4792679 0.570 0.5689 ## exper -0.0475536 0.0369668 -1.286 0.1986 ## I(exper^2) 0.0015912 0.0008145 1.954 0.0510 . ## age 0.1004006 0.0409134 2.454 0.0143 * ## I(age^2) 0.0009045 0.0009043 1.000 0.3174 ## south -0.0612663 0.0280545 -2.184 0.0292 * ## metro 0.0053391 0.0911299 0.059 0.9533 ## female -0.1261411 0.1357444 -0.929 0.3529 ## metro:female -0.0084569 0.1471297 -0.057 0.9542 ## exper:metro 0.0045598 0.0031762 1.436 0.1514 ## exper:female -0.0028734 0.0048721 -0.590 0.5555 ## exper:age -0.0029885 0.0016816 -1.777 0.0758 . 
## exper:metro:female 0.0007971 0.0053261 0.150 0.8811 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.4522 on 1187 degrees of freedom ## Multiple R-squared: 0.3598, Adjusted R-squared: 0.3533 ## F-statistic: 55.58 on 12 and 1187 DF, p-value: < 2.2e-16 print(mdl_4_collin_fit.summary2()) ## Results: Ordinary least squares ## ================================================================== ## Model: OLS Adj. R-squared: 0.353 ## Dependent Variable: np.log(wage) AIC: 1513.8148 ## Date: 2020-10-13 21:44 BIC: 1579.9858 ## No. Observations: 1200 Log-Likelihood: -743.91 ## Df Model: 12 F-statistic: 55.58 ## Df Residuals: 1187 Prob (F-statistic): 4.44e-106 ## R-squared: 0.360 Scale: 0.20451 ## ------------------------------------------------------------------ ## Coef. Std.Err. t P>|t| [0.025 0.975] ## ------------------------------------------------------------------ ## Intercept 0.2731 0.4793 0.5698 0.5689 -0.6672 1.2134 ## exper -0.0476 0.0370 -1.2864 0.1986 -0.1201 0.0250 ## np.power(exper, 2) 0.0016 0.0008 1.9537 0.0510 -0.0000 0.0032 ## age 0.1004 0.0409 2.4540 0.0143 0.0201 0.1807 ## np.power(age, 2) 0.0009 0.0009 1.0002 0.3174 -0.0009 0.0027 ## south -0.0613 0.0281 -2.1838 0.0292 -0.1163 -0.0062 ## metro 0.0053 0.0911 0.0586 0.9533 -0.1735 0.1841 ## female -0.1261 0.1357 -0.9293 0.3529 -0.3925 0.1402 ## metro:female -0.0085 0.1471 -0.0575 0.9542 -0.2971 0.2802 ## metro:exper 0.0046 0.0032 1.4356 0.1514 -0.0017 0.0108 ## female:exper -0.0029 0.0049 -0.5898 0.5555 -0.0124 0.0067 ## metro:female:exper 0.0008 0.0053 0.1497 0.8811 -0.0097 0.0112 ## exper:age -0.0030 0.0017 -1.7772 0.0758 -0.0063 0.0003 ## ------------------------------------------------------------------ ## Omnibus: 7.520 Durbin-Watson: 2.017 ## Prob(Omnibus): 0.023 Jarque-Bera (JB): 8.476 ## Skew: 0.117 Prob(JB): 0.014 ## Kurtosis: 3.339 Condition No.: 109116 ## ================================================================== ## * The condition 
number is large (1e+05). This might indicate ## strong multicollinearity or other numerical problems. We can look at specific part of this table to focus our attention on the $$F$$-value: print(cat(paste0(tail(capture.output(summary(mdl_4_collin)), 4), collapse = "\n"))) ## Residual standard error: 0.4522 on 1187 degrees of freedom ## Multiple R-squared: 0.3598, Adjusted R-squared: 0.3533 ## F-statistic: 55.58 on 12 and 1187 DF, p-value: < 2.2e-16 ## NULL We can also calculate the exact $$p$$-value with: print(pf(summary(mdl_4_collin)$fstatistic[1], summary(mdl_4_collin)$fstatistic[2], summary(mdl_4_collin)$fstatistic[3], lower.tail = FALSE))
##         value
## 4.436202e-106
print(mdl_4_collin_fit.summary().tables[0])
##                             OLS Regression Results
## ==============================================================================
## Dep. Variable:           np.log(wage)   R-squared:                       0.360
## Model:                            OLS   Adj. R-squared:                  0.353
## Method:                 Least Squares   F-statistic:                     55.58
## Date:                Tue, 13 Oct 2020   Prob (F-statistic):          4.44e-106
## Time:                        21:44:31   Log-Likelihood:                -743.91
## No. Observations:                1200   AIC:                             1514.
## Df Residuals:                    1187   BIC:                             1580.
## Df Model:                          12
## Covariance Type:            nonrobust
## ==============================================================================

With $$p$$-value = 4.44e-106 < 0.05, we reject the null hypothesis that all of the coefficients (except the intercept) are jointly insignificant, while the individual $$t$$-statistics and their associated $$p$$-values indicate that almost all of the coefficients are insignificant.

If we were to examine the VIF of the parameters from this model:

# R: auxiliary model containing both exper and age (but no educ)
mdl_small <- lm(formula = "log(wage) ~ exper + age + south + metro + female", data = dt4_new)
# Python: VIFs for the exper/age specification (design matrix only, no fit needed)
mdl_small = smf.ols(formula = "np.log(wage) ~ exper + age + south + metro + female", data = dt4_new)
#
exog_mat = mdl_small.exog
vif_vals = []
for j in range(1, exog_mat.shape[1]):
    vif_vals.append(smstats.outliers_influence.variance_inflation_factor(exog_mat, j))
vif_small = pd.DataFrame()
vif_small["Variable"]   = mdl_small.exog_names[1:]
vif_small["VIF Factor"] = vif_vals
print(cbind(car::vif(mdl_small)))
##             [,1]
## exper  21.533254
## age    21.489966
## south   1.002666
## metro   1.011143
## female  1.017045
print(vif_small)
##   Variable  VIF Factor
## 0    exper   21.533254
## 1      age   21.489966
## 2    south    1.002666
## 3    metro    1.011143
## 4   female    1.017045

We see that exper and age are highly collinear.

If we were to further include educ, then we would have a perfect multicollinearity, which would result in an error or warning, depending on the software used:

tryCatch({
# Perfect multicollinearity: age = educ + exper + 6, so car::vif() errors out
mdl_small <- lm(formula = "log(wage) ~ educ + exper + age + south + metro + female", data = dt4_new)
print(cbind(car::vif(mdl_small)))
}, error = function(e) {
# Capture and print the error so the remaining chunks keep running
print(e)
})
## <simpleError in vif.default(mdl_small): there are aliased coefficients in the model>

As before, in order to avoid our code breaking, we wrap the code in a tryCatch({...}) so that our code does not stop at the error. Since we know for a fact that this code produces an error because of multicollinearity, and since this is an additional example, which is not used in any later tasks, we can safely capture the error output, print it and have our code continue running further blocks.

In practice, blindly using tryCatch in such a way that the error is simply ignored would most surely result in a significant oversight in your analysis. Generally, error handling is important, but it is not the core focus of this course.

# Python does not raise here - perfectly collinear columns simply yield VIF = inf
mdl_small = smf.ols(formula = "np.log(wage) ~ educ + exper + age + south + metro + female", data = dt4_new)
#
design = mdl_small.exog
inflation = []
for idx in range(1, design.shape[1]):
    inflation.append(smstats.outliers_influence.variance_inflation_factor(design, idx))
vif_small = pd.DataFrame()
vif_small["Variable"]   = mdl_small.exog_names[1:]
vif_small["VIF Factor"] = inflation
#
print(vif_small)
##   Variable  VIF Factor
## 0     educ         inf
## 1    exper         inf
## 2      age         inf
## 3    south    1.002666
## 4    metro    1.011143
## 5   female    1.017045

Since $$R_j^2$$ would be 1, for $$j \in \{educ,\ exper,\ age\}$$, then $$VIF = \dfrac{1}{1 - R_j^2} = \dfrac{1}{0} = \infty$$.

Note that in R “aliased” in the message there are aliased coefficients in the model refers to variables that are linearly dependent (i.e. cause perfect multicollinearity).

Fortunately, R has a function to test for a linear dependence between terms - alias:

# alias() reports linearly dependent ("aliased") terms: here age = 6 + educ + exper
print(alias(lm(formula = "log(wage) ~ educ + exper + age + south + metro + female", data = dt4_new)))
## Model :
## log(wage) ~ educ + exper + age + south + metro + female
##
## Complete :
##     (Intercept) educ exper south metro female
## age 6           1    1     0     0     0

So, we have determined the following:

• educ and exper are correlated, but the correlation is not high enough to warrant collinearity - educ has additional information, which is not included in exper;
• exper and age are highly correlated, which results in collinearity between them - when estimating a regression model, we need to choose one of them to include in our model;

Possible explanations for the fact that the correlation between educ and exper is smaller, even though it directly enters the formula, used to calculate exper:

• Most people tend to stop gaining additional years in education, while they continue to age. As a result, exper increases with age, while educ tends to level-off (i.e. stop changing) after a certain number of years gained. For example, once someone gains a Master’s degree, or a PhD, it may be very likely that they stop pursuing additional degrees. As a result, their ‘years spent in education’ stops increasing, while they continue to age, and gain additional years of potential experience;
• From the definition of the data educ is more like a categorical variable, with categories corresponding to years in education, these range from 0 (the minimum) to 21 (the maximum), but since the numerical values assigned usually coincide with the number of years, it is treated like a non-categorical variable.

Further Considerations: Using the Full Sample

Note that we can get the full dataset, and examine its description.

# Load the full cps5 dataset (the earlier examples used the cps5_small subset)
dt4_full <- read.csv(file = "http://www.principlesofeconometrics.com/poe5/data/csv/cps5.csv", sep = ",", dec = ".", header = TRUE)
head(dt4_full)
##   age asian black divorced educ exper faminc female hrswork insure married mcaid mcare metro midwest nchild northeast single south union  wage west white
## 1  45     0     0        0   13    26  39200      1      38      1       1     0     0     0       0      0         1      0     0     1 14.38    0     1
## 2  40     0     0        0   14    20  38400      0      40      0       1     0     0     0       0      0         1      0     0     0 10.50    0     1
## 3  63     0     0        0   18    39    902      1      40      1       1     0     0     0       0      0         1      0     0     0 21.63    0     1
## 4  61     0     0        1   16    39      0      0      40      1       0     0     0     0       0      0         1      0     0     0 18.50    0     1
## 5  46     0     0        0   12    28  48303      1      40      1       1     0     0     0       0      0         1      0     0     0 13.14    0     1
## 6  47     0     0        0   12    29  27376      0      40      1       1     0     0     0       0      0         1      0     0     0 20.88    0     1
# Python equivalent: pandas reads the CSV straight from the URL
dt4_full = pd.read_csv("http://www.principlesofeconometrics.com/poe5/data/csv/cps5.csv")
dt4_full.head()
##    age  asian  black  divorced  educ  exper  faminc  female  hrswork  insure  married  mcaid  mcare  metro  midwest  nchild  northeast  single  south  union   wage  west  white
## 0   45      0      0         0    13     26   39200       1       38       1        1      0      0      0        0       0          1       0      0      1  14.38     0      1
## 1   40      0      0         0    14     20   38400       0       40       0        1      0      0      0        0       0          1       0      0      0  10.50     0      1
## 2   63      0      0         0    18     39     902       1       40       1        1      0      0      0        0       0          1       0      0      0  21.63     0      1
## 3   61      0      0         1    16     39       0       0       40       1        0      0      0      0        0       0          1       0      0      0  18.50     0      1
## 4   46      0      0         0    12     28   48303       1       40       1        1      0      0      0        0       0          1       0      0      0  13.14     0      1
print(paste0("Sample size: N = ", nrow(dt4_full)))
## [1] "Sample size: N = 9799"
print("Sample size: N = " + str(len(dt4_full.index)))
## Sample size: N = 9799

Not only does the dataset contain more observations, but it also contains additional variables. The full variable list is as follows:

• age - age
• asian - =1 if asian
• black - =1 if black
• divorced - =1 if divorced
• educ - years of education
• exper - potential experience = age - educ - 6
• faminc - other family income, $$\$$
• female - =1 if female
• hrswork - hours worked last week
• insure - covered by private health insurance
• married - =1 if married
• mcaid - =1 if covered by Medicaid last year
• mcare - =1 if covered by Medicare last year
• metro - =1 if in metropolitan area
• midwest - =1 if midwest region
• nchild - number of own children in household
• northeast - =1 if northeast region
• single - =1 if single
• south - =1 if south region
• union - =1 if a union member
• wage - earnings per hour, $$\$$
• west - =1 if west region
• white - =1 if white

In fact, if we look at the regional indicator variables:

print(table(rowSums(dt4_full[, c("south", "west", "midwest")])))
##
##    0    1
## 1989 7810
print(pd.crosstab(index = dt4_full["south"] + dt4_full["west"] + dt4_full["midwest"], columns="count"))
## col_0  count
## row_0
## 0       1989
## 1       7810
print(table(rowSums(dt4_full[, c("south", "west", "midwest", "northeast")])))
##
##    1
## 9799
print(pd.crosstab(index = dt4_full["south"] + dt4_full["west"] + dt4_full["midwest"] + dt4_full["northeast"], columns="count"))
## col_0  count
## row_0
## 1       9799

We see that using all four indicator variables always sums up to one: $\text{south}_i + \text{west}_i + \text{midwest}_i + \text{northeast}_i = 1,\quad \forall i = 1,...,N$ In other words, including all four of the regional variables would result in a dummy variable trap, as they are collinear. So, in our smaller dataset the $$other$$ region is actually the excluded northeast column of the full dataset.

On the other hand, if we were to also examine the metro variable instead of northeast:

print(table(rowSums(dt4_full[, c("south", "west", "midwest", "metro")])))
##
##    0    1    2
##  366 3070 6363
print(pd.crosstab(index = dt4_full["south"] + dt4_full["west"] + dt4_full["midwest"] + dt4_full["metro"], columns="count"))
## col_0  count
## row_0
## 0        366
## 1       3070
## 2       6363

We see that not only do some rows sum to zero - some sum up even to $$2$$. This clearly shows that metro variable indicates something completely different than the regional variables.

If we were to look back at our initial model:

#
# Re-estimate the baseline specification on the full sample, now with all
# three regional indicators (northeast is the omitted reference category)
mdl_fulldt <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + west + midwest + female + black", data = dt4_full)
print(round(coef(summary(mdl_fulldt)), 4))
##             Estimate Std. Error  t value Pr(>|t|)
## (Intercept)   1.6465     0.0650  25.3334   0.0000
## educ          0.0345     0.0087   3.9804   0.0001
## I(educ^2)     0.0024     0.0003   7.8699   0.0000
## exper         0.0298     0.0013  22.9993   0.0000
## I(exper^2)   -0.0005     0.0000 -17.2396   0.0000
## metro         0.1136     0.0123   9.2442   0.0000
## south        -0.0463     0.0135  -3.4282   0.0006
## west         -0.0089     0.0144  -0.6180   0.5366
## midwest      -0.0609     0.0141  -4.3312   0.0000
## female       -0.1635     0.0095 -17.2027   0.0000
## black        -0.1059     0.0169  -6.2627   0.0000
# Python equivalent of the full-sample model
mdl_fulldt = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + west + midwest + female + black", data = dt4_full)
mdl_fulldt_fit = mdl_fulldt.fit()
print(mdl_fulldt_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6465      0.065     25.333      0.000       1.519       1.774
## educ                   0.0345      0.009      3.980      0.000       0.017       0.051
## np.power(educ, 2)      0.0024      0.000      7.870      0.000       0.002       0.003
## exper                  0.0298      0.001     22.999      0.000       0.027       0.032
## np.power(exper, 2)    -0.0005   2.63e-05    -17.240      0.000      -0.001      -0.000
## metro                  0.1136      0.012      9.244      0.000       0.090       0.138
## south                 -0.0463      0.014     -3.428      0.001      -0.073      -0.020
## west                  -0.0089      0.014     -0.618      0.537      -0.037       0.019
## midwest               -0.0609      0.014     -4.331      0.000      -0.088      -0.033
## female                -0.1635      0.010    -17.203      0.000      -0.182      -0.145
## black                 -0.1059      0.017     -6.263      0.000      -0.139      -0.073
## ======================================================================================

We see a completely different result regarding race. Furthermore, the regional indicator variables are also significant in most cases, except for the west indicator.

As was mentioned during lectures, a larger sample leads to smaller standard errors and more precise estimates. If we want to account for complex interaction effects and a large amount of variables - we need a large dataset, which would cover many possible combinations of these values (i.e. a larger variable value range).

Further looking at the interaction terms:

#
# Same full-data model, now replacing the separate female and black terms with
# female*black, which expands to female + black + female:black
mdl_fulldt <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + west + midwest + female*black", data = dt4_full)
print(round(coef(summary(mdl_fulldt)), 4))
##              Estimate Std. Error  t value Pr(>|t|)
## (Intercept)    1.6477     0.0650  25.3636   0.0000
## educ           0.0349     0.0087   4.0300   0.0001
## I(educ^2)      0.0024     0.0003   7.8283   0.0000
## exper          0.0298     0.0013  22.9907   0.0000
## I(exper^2)    -0.0005     0.0000 -17.2179   0.0000
## metro          0.1135     0.0123   9.2397   0.0000
## south         -0.0462     0.0135  -3.4174   0.0006
## west          -0.0090     0.0144  -0.6291   0.5293
## midwest       -0.0611     0.0141  -4.3476   0.0000
## female        -0.1729     0.0100 -17.3687   0.0000
## black         -0.1614     0.0243  -6.6309   0.0000
## female:black   0.1045     0.0330   3.1694   0.0015
# Python equivalent: patsy's female*black expands the same way as R's formula
mdl_fulldt = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + west + midwest + female*black", data = dt4_full)
mdl_fulldt_fit = mdl_fulldt.fit()
print(mdl_fulldt_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.6477      0.065     25.364      0.000       1.520       1.775
## educ                   0.0349      0.009      4.030      0.000       0.018       0.052
## np.power(educ, 2)      0.0024      0.000      7.828      0.000       0.002       0.003
## exper                  0.0298      0.001     22.991      0.000       0.027       0.032
## np.power(exper, 2)    -0.0005   2.63e-05    -17.218      0.000      -0.001      -0.000
## metro                  0.1135      0.012      9.240      0.000       0.089       0.138
## south                 -0.0462      0.014     -3.417      0.001      -0.073      -0.020
## west                  -0.0090      0.014     -0.629      0.529      -0.037       0.019
## midwest               -0.0611      0.014     -4.348      0.000      -0.089      -0.034
## female                -0.1729      0.010    -17.369      0.000      -0.192      -0.153
## black                 -0.1614      0.024     -6.631      0.000      -0.209      -0.114
## female:black           0.1045      0.033      3.169      0.002       0.040       0.169
## ======================================================================================

We now see that the interaction term $$female \times black$$ is statistically significant.

We can further create even more complex models by including even more interaction terms.

#
# Even richer specification: metro*female*educ adds all two- and three-way
# interactions of those variables; educ:exper adds only that single interaction
mdl_fulldt <- lm(formula = "log(wage) ~ educ + I(educ^2) + exper + I(exper^2) + metro + south + west + midwest + female*black + metro*female*educ + educ:exper", data = dt4_full)
print(round(coef(summary(mdl_fulldt)), 4))
##                   Estimate Std. Error  t value Pr(>|t|)
## (Intercept)         1.4298     0.1203  11.8857   0.0000
## educ                0.0638     0.0120   5.3331   0.0000
## I(educ^2)           0.0013     0.0003   4.0380   0.0001
## exper               0.0483     0.0026  18.3188   0.0000
## I(exper^2)         -0.0005     0.0000 -18.8645   0.0000
## metro              -0.2907     0.0869  -3.3450   0.0008
## south              -0.0450     0.0135  -3.3419   0.0008
## west               -0.0037     0.0143  -0.2555   0.7983
## midwest            -0.0581     0.0140  -4.1495   0.0000
## female             -0.6457     0.1284  -5.0278   0.0000
## black              -0.1620     0.0242  -6.6807   0.0000
## female:black        0.1080     0.0329   3.2802   0.0010
## metro:female        0.3901     0.1393   2.8013   0.0051
## educ:metro          0.0284     0.0064   4.4407   0.0000
## educ:female         0.0315     0.0092   3.4128   0.0006
## educ:exper         -0.0011     0.0001  -7.9522   0.0000
## educ:metro:female  -0.0254     0.0099  -2.5585   0.0105
# Python equivalent of the richer interaction model above
mdl_fulldt = smf.ols(formula = "np.log(wage) ~ educ + np.power(educ, 2) + exper + np.power(exper, 2) + metro + south + west + midwest + female*black + metro*female*educ + educ:exper", data = dt4_full)
mdl_fulldt_fit = mdl_fulldt.fit()
print(mdl_fulldt_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.4298      0.120     11.886      0.000       1.194       1.666
## educ                   0.0638      0.012      5.333      0.000       0.040       0.087
## np.power(educ, 2)      0.0013      0.000      4.038      0.000       0.001       0.002
## exper                  0.0483      0.003     18.319      0.000       0.043       0.054
## np.power(exper, 2)    -0.0005   2.71e-05    -18.865      0.000      -0.001      -0.000
## metro                 -0.2907      0.087     -3.345      0.001      -0.461      -0.120
## south                 -0.0450      0.013     -3.342      0.001      -0.071      -0.019
## west                  -0.0037      0.014     -0.256      0.798      -0.032       0.024
## midwest               -0.0581      0.014     -4.150      0.000      -0.086      -0.031
## female                -0.6457      0.128     -5.028      0.000      -0.898      -0.394
## black                 -0.1620      0.024     -6.681      0.000      -0.210      -0.114
## female:black           0.1080      0.033      3.280      0.001       0.043       0.173
## metro:female           0.3901      0.139      2.801      0.005       0.117       0.663
## metro:educ             0.0284      0.006      4.441      0.000       0.016       0.041
## female:educ            0.0315      0.009      3.413      0.001       0.013       0.050
## metro:female:educ     -0.0254      0.010     -2.558      0.011      -0.045      -0.006
## educ:exper            -0.0011      0.000     -7.952      0.000      -0.001      -0.001
## ======================================================================================

This highlights the importance of sample size - more data includes more information about the variance and correlation between the dependent variable and the regressors, which allows us to estimate the coefficients with better precision.

#### 4.11.3.3 Task 11

Our finalized model is the following:

print(round(coef(summary(mdl_4_fit)), 4))
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.1892     0.3630  3.2760   0.0011
## educ                0.0955     0.0372  2.5701   0.0103
## I(educ^2)          -0.0001     0.0010 -0.0884   0.9296
## exper               0.0529     0.0082  6.4544   0.0000
## I(exper^2)         -0.0005     0.0001 -5.9648   0.0000
## south              -0.0842     0.0309 -2.7266   0.0065
## metro              -0.4320     0.2345 -1.8424   0.0657
## female             -1.6197     0.3878 -4.1762   0.0000
## metro:female        1.1697     0.4181  2.7978   0.0052
## educ:metro          0.0439     0.0172  2.5442   0.0111
## educ:female         0.1056     0.0273  3.8654   0.0001
## educ:exper         -0.0015     0.0004 -3.4252   0.0006
## educ:metro:female  -0.0864     0.0293 -2.9495   0.0033
# Python coefficient table of the finalized model
print(mdl_4_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.1892      0.363      3.276      0.001       0.477       1.902
## educ                   0.0955      0.037      2.570      0.010       0.023       0.168
## np.power(educ, 2)  -9.233e-05      0.001     -0.088      0.930      -0.002       0.002
## exper                  0.0529      0.008      6.454      0.000       0.037       0.069
## np.power(exper, 2)    -0.0005   8.38e-05     -5.965      0.000      -0.001      -0.000
## south                 -0.0842      0.031     -2.727      0.007      -0.145      -0.024
## metro                 -0.4320      0.234     -1.842      0.066      -0.892       0.028
## female                -1.6197      0.388     -4.176      0.000      -2.381      -0.859
## metro:female           1.1697      0.418      2.798      0.005       0.349       1.990
## metro:educ             0.0439      0.017      2.544      0.011       0.010       0.078
## female:educ            0.1056      0.027      3.865      0.000       0.052       0.159
## metro:female:educ     -0.0864      0.029     -2.949      0.003      -0.144      -0.029
## educ:exper            -0.0015      0.000     -3.425      0.001      -0.002      -0.001
## ======================================================================================

We begin by testing the model residuals for autocorrelation via Breusch-Godfrey test: $\begin{cases} H_0&:\text{the errors are serially uncorrelated}\\ H_1&:\text{the errors are autocorrelated at lag order 2} \end{cases}$

#
# Breusch-Godfrey LM test for serial correlation up to lag order 2
bg_t <- lmtest::bgtest(mdl_4_fit, order = 2)
print(bg_t)
##
##  Breusch-Godfrey test for serial correlation of order up to 2
##
## data:  mdl_4_fit
## LM test = 0.57951, df = 2, p-value = 0.7484
# Column labels reused below for the Breusch-Pagan output as well
name = ['LM statistic','LM p-value','F-value','F  p-value']
# Breusch-Godfrey test, lag order 2; returns (LM stat, LM p, F stat, F p)
bg_t = sm_diagnostic.acorr_breusch_godfrey(mdl_4_fit, nlags = 2)
print(pd.DataFrame([np.round(bg_t, 6)], columns = name))
##    LM statistic  LM p-value   F-value  F  p-value
## 0      0.579514    0.748445  0.285402    0.751777

Since the $$p$$-value of the $$LM$$ statistic is greater than the $$5\%$$ significance level, we have no grounds to reject the null hypothesis and conclude that the residuals are not serially correlated.

Next up, we will test for heteroskedasticity in the errors: $\begin{cases} H_0&: \gamma_1 = 0 \text{ (residuals are homoskedastic)}\\ H_1&: \gamma_1 \neq 0 \text{ (residuals are heteroskedastic)} \end{cases}$ For simplicity, we will carry out the Breusch-Pagan Test:

# Studentized Breusch-Pagan test for heteroskedasticity of the residuals
BP_t <- lmtest::bptest(mdl_4_fit)
print(BP_t)
##
##  studentized Breusch-Pagan test
##
## data:  mdl_4_fit
## BP = 30.901, df = 12, p-value = 0.00204
# Breusch-Pagan test; exog_het = the model's own design matrix (mdl_4.exog)
BP_t = sm_diagnostic.het_breuschpagan(resid = mdl_4_fit.resid, exog_het = mdl_4.exog)
# reuses the column labels defined for the Breusch-Godfrey output above
print(pd.DataFrame([np.round(BP_t, 6)], columns = name))
##    LM statistic  LM p-value   F-value  F  p-value
## 0      30.90116     0.00204  2.624712    0.001871

Because the $$p$$-value < 0.05, we reject the null hypothesis and conclude that the residuals are heteroskedastic.

Note that since our model already includes the appropriate interaction and polynomial terms - this is similar to the White test.

#### 4.11.3.4 Task 12

Our test results indicated that:

• the model residuals are not serially correlated;
• the model residuals are heteroskedastic.

As a result, we need to correct the OLS standard errors for heteroskedasticity - We can use $$HC0$$, $$HC1$$, $$HC2$$ or $$HC3$$ estimators to consistently estimate the coefficient variance.

We have no need to correct for autocorrelation, as they are not serially correlated - there is no need to use HAC, yet it is a robust method that also takes into account heteroskedasticity, so, as an example, we will use it as well.

For comparison, our current model and its coefficient standard errors:

# Baseline (non-robust) OLS coefficient table, for comparison with HCE/HAC below
print(round(coef(summary(mdl_4_fit)), 4))
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.1892     0.3630  3.2760   0.0011
## educ                0.0955     0.0372  2.5701   0.0103
## I(educ^2)          -0.0001     0.0010 -0.0884   0.9296
## exper               0.0529     0.0082  6.4544   0.0000
## I(exper^2)         -0.0005     0.0001 -5.9648   0.0000
## south              -0.0842     0.0309 -2.7266   0.0065
## metro              -0.4320     0.2345 -1.8424   0.0657
## female             -1.6197     0.3878 -4.1762   0.0000
## metro:female        1.1697     0.4181  2.7978   0.0052
## educ:metro          0.0439     0.0172  2.5442   0.0111
## educ:female         0.1056     0.0273  3.8654   0.0001
## educ:exper         -0.0015     0.0004 -3.4252   0.0006
## educ:metro:female  -0.0864     0.0293 -2.9495   0.0033
# Baseline (non-robust) OLS coefficient table in Python
print(mdl_4_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.1892      0.363      3.276      0.001       0.477       1.902
## educ                   0.0955      0.037      2.570      0.010       0.023       0.168
## np.power(educ, 2)  -9.233e-05      0.001     -0.088      0.930      -0.002       0.002
## exper                  0.0529      0.008      6.454      0.000       0.037       0.069
## np.power(exper, 2)    -0.0005   8.38e-05     -5.965      0.000      -0.001      -0.000
## south                 -0.0842      0.031     -2.727      0.007      -0.145      -0.024
## metro                 -0.4320      0.234     -1.842      0.066      -0.892       0.028
## female                -1.6197      0.388     -4.176      0.000      -2.381      -0.859
## metro:female           1.1697      0.418      2.798      0.005       0.349       1.990
## metro:educ             0.0439      0.017      2.544      0.011       0.010       0.078
## female:educ            0.1056      0.027      3.865      0.000       0.052       0.159
## metro:female:educ     -0.0864      0.029     -2.949      0.003      -0.144      -0.029
## educ:exper            -0.0015      0.000     -3.425      0.001      -0.002      -0.001
## ======================================================================================

Then, the standard errors corrected via the different HCE methods, as well as the OLS s.e.’s (which are biased, because the errors are heteroskedastic), can be summarized as follows:

# Collect the HC0-HC3 heteroskedasticity-consistent standard errors alongside
# the (biased, since the errors are heteroskedastic) OLS standard errors.
# vapply() over the estimator types builds the matrix in one step, instead of
# growing it with cbind() inside a for loop (a grow-in-loop anti-pattern),
# and is type-safe about the length of each returned column.
hc_types <- c("HC0", "HC1", "HC2", "HC3")
dt_hce_se <- vapply(hc_types, function(hc_type) {
  # column 2 of the coeftest output holds the standard errors
  lmtest::coeftest(mdl_4_fit,
                   vcov. = sandwich::vcovHC(mdl_4_fit, type = hc_type))[, 2]
}, numeric(length(coef(mdl_4_fit))))
# Append the plain OLS standard errors as the final column
dt_hce_se <- cbind(dt_hce_se, coef(summary(mdl_4_fit))[, 2])
colnames(dt_hce_se) <- c(hc_types, "OLS")
# statsmodels exposes the HC0-HC3 robust s.e.'s directly as fit attributes;
# .bse holds the plain OLS standard errors. Transpose so coefficients are rows.
dt_hce_se = pd.DataFrame([mdl_4_fit.HC0_se,
mdl_4_fit.HC1_se,
mdl_4_fit.HC2_se,
mdl_4_fit.HC3_se,
mdl_4_fit.bse]).T
dt_hce_se.columns = ["HC0", "HC1", "HC2", "HC3", "OLS"]
print(round(dt_hce_se, 6))
##                        HC0      HC1      HC2      HC3      OLS
## (Intercept)       0.311114 0.313242 0.315766 0.320623 0.362999
## educ              0.031597 0.031813 0.032085 0.032593 0.037165
## I(educ^2)         0.000873 0.000879 0.000888 0.000904 0.001045
## exper             0.008899 0.008960 0.009021 0.009147 0.008195
## I(exper^2)        0.000092 0.000093 0.000093 0.000095 0.000084
## south             0.031315 0.031529 0.031513 0.031716 0.030865
## metro             0.165834 0.166968 0.168471 0.171268 0.234480
## female            0.329030 0.331281 0.337242 0.345918 0.387839
## metro:female      0.358484 0.360937 0.366820 0.375640 0.418062
## educ:metro        0.011828 0.011909 0.012026 0.012235 0.017242
## educ:female       0.023211 0.023370 0.023836 0.024497 0.027328
## educ:exper        0.000467 0.000470 0.000475 0.000483 0.000424
## educ:metro:female 0.025299 0.025472 0.025926 0.026589 0.029305
print(dt_hce_se)
##                          HC0       HC1       HC2       HC3       OLS
## Intercept           0.311114  0.313242  0.315766  0.320623  0.362999
## educ                0.031597  0.031813  0.032085  0.032593  0.037165
## np.power(educ, 2)   0.000873  0.000879  0.000888  0.000904  0.001045
## exper               0.008899  0.008960  0.009021  0.009147  0.008195
## np.power(exper, 2)  0.000092  0.000093  0.000093  0.000095  0.000084
## south               0.031315  0.031529  0.031513  0.031716  0.030865
## metro               0.165834  0.166968  0.168471  0.171268  0.234480
## female              0.329030  0.331281  0.337242  0.345918  0.387839
## metro:female        0.358484  0.360937  0.366820  0.375640  0.418062
## metro:educ          0.011828  0.011909  0.012026  0.012235  0.017242
## female:educ         0.023211  0.023370  0.023836  0.024497  0.027328
## metro:female:educ   0.025299  0.025472  0.025926  0.026589  0.029305
## educ:exper          0.000467  0.000470  0.000475  0.000483  0.000424

We see that the difference between the four HCE methods is not incredibly large, nevertheless, we will select HC3 and examine the coefficient summary output:

# Coefficient tests using the HC3 robust covariance estimator
tmp_out <- lmtest::coeftest(mdl_4_fit, vcov. = sandwich::vcovHC(mdl_4_fit, type = "HC3"))
print(round(tmp_out, 4))
##
## t test of coefficients:
##
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.1892     0.3206  3.7090   0.0002 ***
## educ                0.0955     0.0326  2.9306   0.0035 **
## I(educ^2)          -0.0001     0.0009 -0.1021   0.9187
## exper               0.0529     0.0091  5.7826   <2e-16 ***
## I(exper^2)         -0.0005     0.0001 -5.2756   <2e-16 ***
## south              -0.0842     0.0317 -2.6535   0.0081 **
## metro              -0.4320     0.1713 -2.5225   0.0118 *
## female             -1.6197     0.3459 -4.6824   <2e-16 ***
## metro:female        1.1697     0.3756  3.1138   0.0019 **
## educ:metro          0.0439     0.0122  3.5854   0.0004 ***
## educ:female         0.1056     0.0245  4.3122   <2e-16 ***
## educ:exper         -0.0015     0.0005 -3.0040   0.0027 **
## educ:metro:female  -0.0864     0.0266 -3.2507   0.0012 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Re-express the fitted results with an HC3 robust covariance matrix
tmp_out = mdl_4_fit.get_robustcov_results(cov_type = "HC3")
print(tmp_out.summary())
##                             OLS Regression Results
## ==============================================================================
## Dep. Variable:           np.log(wage)   R-squared:                       0.367
## Model:                            OLS   Adj. R-squared:                  0.359
## Method:                 Least Squares   F-statistic:                     52.09
## Date:                Tue, 13 Oct 2020   Prob (F-statistic):           1.20e-95
## Time:                        21:44:38   Log-Likelihood:                -580.02
## No. Observations:                 960   AIC:                             1186.
## Df Residuals:                     947   BIC:                             1249.
## Df Model:                          12
## Covariance Type:                  HC3
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.1892      0.321      3.709      0.000       0.560       1.818
## educ                   0.0955      0.033      2.931      0.003       0.032       0.159
## np.power(educ, 2)  -9.233e-05      0.001     -0.102      0.919      -0.002       0.002
## exper                  0.0529      0.009      5.783      0.000       0.035       0.071
## np.power(exper, 2)    -0.0005   9.48e-05     -5.276      0.000      -0.001      -0.000
## south                 -0.0842      0.032     -2.654      0.008      -0.146      -0.022
## metro                 -0.4320      0.171     -2.522      0.012      -0.768      -0.096
## female                -1.6197      0.346     -4.682      0.000      -2.299      -0.941
## metro:female           1.1697      0.376      3.114      0.002       0.432       1.907
## metro:educ             0.0439      0.012      3.585      0.000       0.020       0.068
## female:educ            0.1056      0.024      4.312      0.000       0.058       0.154
## metro:female:educ     -0.0864      0.027     -3.251      0.001      -0.139      -0.034
## educ:exper            -0.0015      0.000     -3.004      0.003      -0.002      -0.001
## ==============================================================================
## Omnibus:                        1.478   Durbin-Watson:                   2.019
## Prob(Omnibus):                  0.478   Jarque-Bera (JB):                1.338
## Skew:                           0.064   Prob(JB):                        0.512
## Kurtosis:                       3.130   Cond. No.                     4.30e+04
## ==============================================================================
##
## Warnings:
## [1] Standard Errors are heteroscedasticity robust (HC3)
## [2] The condition number is large, 4.3e+04. This might indicate that there are
## strong multicollinearity or other numerical problems.

Note the results - $$\text{educ}^2$$ is still insignificant, the $$p$$-value of metro decreased and is now less than $$0.05$$, the $$p$$-value of south increased slightly.

If we wanted to also extract the HAC correction standard errors:

# Newey-West HAC covariance with 2 lags; sqrt of its diagonal gives the s.e.'s
V_HAC <- sandwich::NeweyWest(mdl_4_fit, lag = 2)
print(round(data.frame(HAC = sqrt(diag(V_HAC))), 6))
##                        HAC
## (Intercept)       0.321951
## educ              0.032934
## I(educ^2)         0.000887
## exper             0.008877
## I(exper^2)        0.000087
## south             0.031121
## metro             0.166630
## female            0.317430
## metro:female      0.350372
## educ:metro        0.011962
## educ:female       0.022412
## educ:exper        0.000487
## educ:metro:female 0.024744
# Newey-West style HAC covariance (2 lags) via statsmodels' sandwich utilities
V_HAC = smstats.sandwich_covariance.cov_hac_simple(mdl_4_fit, nlags = 2)
# standard errors = sqrt of the covariance diagonal, labelled by regressor name
print(pd.DataFrame(np.sqrt(np.diag(V_HAC)), index = mdl_4.exog_names, columns = ["HAC"]))
##                          HAC
## Intercept           0.319538
## educ                0.032808
## np.power(educ, 2)   0.000896
## exper               0.008957
## np.power(exper, 2)  0.000090
## south               0.031330
## metro               0.166811
## female              0.322378
## metro:female        0.356004
## metro:educ          0.011933
## female:educ         0.022734
## metro:female:educ   0.025097
## educ:exper          0.000483

And the full model output:

# Full coefficient table using the Newey-West HAC covariance (2 lags)
print(round(lmtest::coeftest(mdl_4_fit, sandwich::NeweyWest(mdl_4_fit, lag = 2))[, ], 4))
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.1892     0.3220  3.6937   0.0002
## educ                0.0955     0.0329  2.9003   0.0038
## I(educ^2)          -0.0001     0.0009 -0.1041   0.9171
## exper               0.0529     0.0089  5.9585   0.0000
## I(exper^2)         -0.0005     0.0001 -5.7189   0.0000
## south              -0.0842     0.0311 -2.7042   0.0070
## metro              -0.4320     0.1666 -2.5927   0.0097
## female             -1.6197     0.3174 -5.1026   0.0000
## metro:female        1.1697     0.3504  3.3383   0.0009
## educ:metro          0.0439     0.0120  3.6673   0.0003
## educ:female         0.1056     0.0224  4.7132   0.0000
## educ:exper         -0.0015     0.0005 -2.9813   0.0029
## educ:metro:female  -0.0864     0.0247 -3.4931   0.0005
# Same model summary with HAC (maxlags = 2) robust standard errors
print(mdl_4_fit.get_robustcov_results(cov_type = 'HAC', maxlags = 2).summary())
##                             OLS Regression Results
## ==============================================================================
## Dep. Variable:           np.log(wage)   R-squared:                       0.367
## Model:                            OLS   Adj. R-squared:                  0.359
## Method:                 Least Squares   F-statistic:                     56.27
## Date:                Tue, 13 Oct 2020   Prob (F-statistic):          5.33e-102
## Time:                        21:44:39   Log-Likelihood:                -580.02
## No. Observations:                 960   AIC:                             1186.
## Df Residuals:                     947   BIC:                             1249.
## Df Model:                          12
## Covariance Type:                  HAC
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.1892      0.317      3.747      0.000       0.566       1.812
## educ                   0.0955      0.033      2.931      0.003       0.032       0.159
## np.power(educ, 2)  -9.233e-05      0.001     -0.104      0.917      -0.002       0.002
## exper                  0.0529      0.009      5.945      0.000       0.035       0.070
## np.power(exper, 2)    -0.0005   8.95e-05     -5.583      0.000      -0.001      -0.000
## south                 -0.0842      0.031     -2.705      0.007      -0.145      -0.023
## metro                 -0.4320      0.166     -2.608      0.009      -0.757      -0.107
## female                -1.6197      0.320     -5.059      0.000      -2.248      -0.991
## metro:female           1.1697      0.354      3.308      0.001       0.476       1.864
## metro:educ             0.0439      0.012      3.701      0.000       0.021       0.067
## female:educ            0.1056      0.023      4.678      0.000       0.061       0.150
## metro:female:educ     -0.0864      0.025     -3.468      0.001      -0.135      -0.038
## educ:exper            -0.0015      0.000     -3.027      0.003      -0.002      -0.001
## ==============================================================================
## Omnibus:                        1.478   Durbin-Watson:                   2.019
## Prob(Omnibus):                  0.478   Jarque-Bera (JB):                1.338
## Skew:                           0.064   Prob(JB):                        0.512
## Kurtosis:                       3.130   Cond. No.                     4.30e+04
## ==============================================================================
##
## Warnings:
## [1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 2 lags and without small sample correction
## [2] The condition number is large, 4.3e+04. This might indicate that there are
## strong multicollinearity or other numerical problems.

We see a decrease in the $$p$$-values, similarly to the results when using HCE. We again emphasize the fact that in this case the residuals were found to be uncorrelated, hence we did not expect HAC to offer any large changes in variable significance, compared to HCE.

Since we have determined that the residuals are heteroskedastic, but not autocorrelated, we can use WLS with a generic weight function $$\widehat{h}_i = \exp\left(\widehat{\log(\epsilon_i^2)}\right)$$, where $$\widehat{\log(\epsilon_i^2)}$$ are the fitted values from the following residual regression $$\log(\epsilon_i^2) = \alpha_0 + \alpha_1 Z_{1,i} + ... + \alpha_m Z_{m,i} + v_i$$:

log_resid_sq_ols <- lm.fit(y = log(mdl_4_fit$residuals^2), x = model.matrix(mdl_4_fit))
h_est <- exp(log_resid_sq_ols$fitted.values)

Note that lm.fit returns a different output from lm, but we can pass the endogenous variable as y and the exogenous variable design matrix (including the intercept and interaction/polynomial terms) as x.

log_resid_sq_ols = sm.OLS(np.log(mdl_4_fit.resid**2), mdl_4.exog)
# Estimated weights: exponentiate the fitted values of the log-squared-residual regression
h_est = np.exp(log_resid_sq_ols.fit().fittedvalues)

Next, we can use the diagonal elements of $$\widehat{\mathbf{\Omega}}^{-1} = \text{diag} \left(\widehat{h}_1^{-1},...,\widehat{h}_N^{-1} \right)$$ as the weights:

#
# WLS: same formula as the OLS model, weighting each observation by 1 / h_est
mdl_4_wls_fit <- lm(formula = formula(mdl_4_fit), data = dt4_train, weights =  1 / h_est)
print(round(coef(summary(mdl_4_wls_fit)), 4))
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.2147     0.2921  4.1579   0.0000
## educ                0.0874     0.0294  2.9728   0.0030
## I(educ^2)           0.0004     0.0008  0.5075   0.6119
## exper               0.0541     0.0077  7.0094   0.0000
## I(exper^2)         -0.0005     0.0001 -5.8605   0.0000
## south              -0.0859     0.0310 -2.7704   0.0057
## metro              -0.4406     0.1758 -2.5067   0.0124
## female             -1.5567     0.3820 -4.0749   0.0000
## metro:female        1.2851     0.4014  3.2018   0.0014
## educ:metro          0.0440     0.0128  3.4300   0.0006
## educ:female         0.1010     0.0275  3.6704   0.0003
## educ:exper         -0.0016     0.0004 -4.3637   0.0000
## educ:metro:female  -0.0938     0.0289 -3.2473   0.0012
# Python WLS with the same formula and weights 1 / h_est
mdl_4_wls = smf.wls(formula = mdl_4.formula, data = dt4_train, weights = 1.0 / h_est)
mdl_4_wls_fit = mdl_4_wls.fit()
# WLS coefficient table
print(mdl_4_wls_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.2147      0.292      4.158      0.000       0.641       1.788
## educ                   0.0874      0.029      2.973      0.003       0.030       0.145
## np.power(educ, 2)      0.0004      0.001      0.508      0.612      -0.001       0.002
## exper                  0.0541      0.008      7.009      0.000       0.039       0.069
## np.power(exper, 2)    -0.0005   8.09e-05     -5.860      0.000      -0.001      -0.000
## south                 -0.0859      0.031     -2.770      0.006      -0.147      -0.025
## metro                 -0.4406      0.176     -2.507      0.012      -0.786      -0.096
## female                -1.5567      0.382     -4.075      0.000      -2.306      -0.807
## metro:female           1.2851      0.401      3.202      0.001       0.497       2.073
## metro:educ             0.0440      0.013      3.430      0.001       0.019       0.069
## female:educ            0.1010      0.028      3.670      0.000       0.047       0.155
## metro:female:educ     -0.0938      0.029     -3.247      0.001      -0.151      -0.037
## educ:exper            -0.0016      0.000     -4.364      0.000      -0.002      -0.001
## ======================================================================================

Compared to our OLS results:

# OLS coefficient table repeated here for side-by-side comparison with the WLS results
print(round(coef(summary(mdl_4_fit)), 4))
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.1892     0.3630  3.2760   0.0011
## educ                0.0955     0.0372  2.5701   0.0103
## I(educ^2)          -0.0001     0.0010 -0.0884   0.9296
## exper               0.0529     0.0082  6.4544   0.0000
## I(exper^2)         -0.0005     0.0001 -5.9648   0.0000
## south              -0.0842     0.0309 -2.7266   0.0065
## metro              -0.4320     0.2345 -1.8424   0.0657
## female             -1.6197     0.3878 -4.1762   0.0000
## metro:female        1.1697     0.4181  2.7978   0.0052
## educ:metro          0.0439     0.0172  2.5442   0.0111
## educ:female         0.1056     0.0273  3.8654   0.0001
## educ:exper         -0.0015     0.0004 -3.4252   0.0006
## educ:metro:female  -0.0864     0.0293 -2.9495   0.0033
# Python OLS coefficient table, for comparison with the WLS results above
print(mdl_4_fit.summary().tables[1])
## ======================================================================================
##                          coef    std err          t      P>|t|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## Intercept              1.1892      0.363      3.276      0.001       0.477       1.902
## educ                   0.0955      0.037      2.570      0.010       0.023       0.168
## np.power(educ, 2)  -9.233e-05      0.001     -0.088      0.930      -0.002       0.002
## exper                  0.0529      0.008      6.454      0.000       0.037       0.069
## np.power(exper, 2)    -0.0005   8.38e-05     -5.965      0.000      -0.001      -0.000
## south                 -0.0842      0.031     -2.727      0.007      -0.145      -0.024
## metro                 -0.4320      0.234     -1.842      0.066      -0.892       0.028
## female                -1.6197      0.388     -4.176      0.000      -2.381      -0.859
## metro:female           1.1697      0.418      2.798      0.005       0.349       1.990
## metro:educ             0.0439      0.017      2.544      0.011       0.010       0.078
## female:educ            0.1056      0.027      3.865      0.000       0.052       0.159
## metro:female:educ     -0.0864      0.029     -2.949      0.003      -0.144      -0.029
## educ:exper            -0.0015      0.000     -3.425      0.001      -0.002      -0.001
## ======================================================================================

we see that the WLS coefficient estimate for metro is statistically significant. The sign of $$educ^2$$ has changed but it remains insignificant.

Regarding the $$R^2$$ - in the WLS it is:

print(summary(mdl_4_wls_fit)$adj.r.squared) ## [1] 0.39521 print(mdl_4_wls_fit.rsquared_adj) ## 0.39521003912064123 But do note, that it is calculated on the weighted (i.e. transformed) data, so it is not directly comparable to the OLS $$R^2$$. #### 4.11.3.5 Task 13 In general, the coefficients themselves are mostly unchanged dt_diff <- data.frame('OLS' = coef(mdl_4_fit), 'WLS' = coef(mdl_4_wls_fit), 'diff (%)' = (coef(mdl_4_fit) / coef(mdl_4_wls_fit) - 1)* 100, check.names = FALSE) print(round(dt_diff, 5)) ## OLS WLS diff (%) ## (Intercept) 1.18919 1.21473 -2.10238 ## educ 0.09552 0.08740 9.28230 ## I(educ^2) -0.00009 0.00041 -122.48548 ## exper 0.05289 0.05407 -2.18734 ## I(exper^2) -0.00050 -0.00047 5.41701 ## south -0.08416 -0.08587 -1.98894 ## metro -0.43202 -0.44060 -1.94776 ## female -1.61971 -1.55675 4.04463 ## metro:female 1.16966 1.28515 -8.98669 ## educ:metro 0.04387 0.04401 -0.32098 ## educ:female 0.10563 0.10105 4.54016 ## educ:exper -0.00145 -0.00163 -11.14899 ## educ:metro:female -0.08643 -0.09382 -7.87008 dt_diff = pd.DataFrame([mdl_4_fit.params, mdl_4_wls_fit.params, (mdl_4_fit.params / mdl_4_wls_fit.params - 1) * 100]).T dt_diff.columns = ["OLS", "WLS", "diff (%)"] print(dt_diff.round(5)) ## OLS WLS diff (%) ## Intercept 1.18919 1.21473 -2.10238 ## educ 0.09552 0.08740 9.28230 ## np.power(educ, 2) -0.00009 0.00041 -122.48548 ## exper 0.05289 0.05407 -2.18734 ## np.power(exper, 2) -0.00050 -0.00047 5.41701 ## south -0.08416 -0.08587 -1.98894 ## metro -0.43202 -0.44060 -1.94776 ## female -1.61971 -1.55675 4.04463 ## metro:female 1.16966 1.28515 -8.98669 ## metro:educ 0.04387 0.04401 -0.32098 ## female:educ 0.10563 0.10105 4.54016 ## metro:female:educ -0.08643 -0.09382 -7.87008 ## educ:exper -0.00145 -0.00163 -11.14899 with the exception of the squared term $$educ^2$$. Only one variable - metro- had a change in significance - and even it is also included in interaction terms, so it does not effect our final model. 
Hence, we are inclined to believe that this model is likely correctly specified. Furthermore, the sign of $$educ^2$$ has changed, though it remains insignificant. On the other hand - it is significant in the RLS model (at this point we may even start thinking of a restricted WLS model). On the other hand, we may be more interested in comparing the model residuals of the WLS. It would make sense to compare the WLS residuals which are from the transformed data, since the model was fitted on the transformed values: e_star <- 1.0 / sqrt(h_est) * mdl_4_wls_fit$residuals
e_star = 1.0 / np.sqrt(h_est) * mdl_4_wls_fit.resid
# Compare the WLS residuals (computed on the transformed data, e_star)
# against the OLS residuals of mdl_4_fit, side by side:
# left panel - fitted-vs-residual scatter, right panel - overlaid histograms.
par(mfrow = c(1, 2))
#
# Fitted vs residual scatter plots:
plot(mdl_4_wls_fit$fitted.values, e_star,
     pch = 21, col = "black", bg = "cornflowerblue")
points(mdl_4_fit$fitted.values, mdl_4_fit$residuals,
       pch = 21, col = "black", bg = "red")
legend("topright", legend = c("WLS", "OLS"),
       pch = c(21, 21), pt.cex = 1.2, cex = 0.8,
       pt.bg = c("cornflowerblue", "red"))
# Residual histograms (OLS drawn on top of WLS):
hist(e_star, breaks = 30, col = "cornflowerblue", main = NULL)
hist(mdl_4_fit$residuals, breaks = 30, col = "red", add = TRUE)
legend("topright", legend = c("WLS", "OLS"),
       pch = c(22, 22), pt.cex = 1.2, cex = 0.8,
       pt.bg = c("cornflowerblue", "red"))

# Figure 9: compare WLS residuals (e_star, on the transformed data) against
# the OLS residuals, mirroring the preceding R plots.
fig = plt.figure(num = 9, figsize = (10, 8))
# Plot fitted vs residual plots:
ax = fig.add_subplot(1, 2, 1)
_ = ax.plot(mdl_4_wls_fit.fittedvalues, e_star, linestyle = "None", marker = "o", markeredgecolor = "black", label = "WLS")
_ = ax.plot(mdl_4_fit.fittedvalues, mdl_4_fit.resid, linestyle = "None", marker = "o", color = "red", markeredgecolor = "black", label = "OLS")
_ = ax.legend()
# Plot the residual histogram
ax = fig.add_subplot(1, 2, 2)
_ = ax.hist(e_star, bins = 30, edgecolor = "black", label = "WLS")
_ = ax.hist(mdl_4_fit.resid, bins = 30, edgecolor = "black", color = "red", label = "OLS")
_ = ax.legend()
# Fix layout in case the labels do overlap:
_ = plt.tight_layout()
plt.show()

We do note that the residual variance is larger in the transformed data. Generally, we would hope that WLS (and (F)GLS) would reduce the variance of the residuals. This may indicate, that we need different weights. On the other hand, this may also depend on the magnitude and shape of the residual heteroskedasticity. Nevertheless, for now, we will use the WLS model.

Looking at it in a bit more detail:

layout(layout_mat)
#
#
# Plot fitted vs residual plots:
plot(mdl_4_wls_fit$fitted.values, e_star, type = "p", pch = 21, bg = "cornflowerblue", main = "Residuals vs Fitted", ylab = "residuals", xlab = "fitted values", cex = 1.5) # # Plot the residual histogram hist(e_star, col = "cornflowerblue", breaks = 30, main = "Residual Histogram") # # Plot the residual Q-Q plot: qqnorm(e_star, main = "Q-Q plot of residuals", pch = 21, bg = "cornflowerblue", cex = 1.5) qqline(e_star, col = "red", lwd = 2) fig = plt.figure(num = 10, figsize = (10, 8)) # Plot fitted vs residual plots: ax = fig.add_subplot(2, 2, 1) _ = ax.plot(mdl_4_wls_fit.fittedvalues, e_star, linestyle = "None", marker = "o", markeredgecolor = "black", label = "WLS") _ = ax.legend() # Plot the residual histogram ax = fig.add_subplot(2, 2, 2) _ = ax.hist(e_star, bins = 30, edgecolor = "black", label = "WLS") _ = ax.legend() # Plot the residual Q-Q plot: ax = fig.add_subplot(2, 1, 2) _ = stats.probplot(e_star, dist = "norm", plot = ax) # Fix layout in case the labels do overlap: _ = plt.tight_layout() plt.show() Visually, the scatterplot of the residuals may be better, but we are not sure. Thankfully, we know some tests, which can help us out. # bg_t = lmtest::bgtest(mdl_4_wls_fit, order = 2) print(bg_t) ## ## Breusch-Godfrey test for serial correlation of order up to 2 ## ## data: mdl_4_wls_fit ## LM test = 0.57951, df = 2, p-value = 0.7484 name = ['LM-stat', 'LM: p-value', 'F-value', 'F: p-value'] bg_t = sm_diagnostic.acorr_breusch_godfrey(mdl_4_wls_fit, nlags = 2) print(pd.DataFrame(bg_t, index = name).T) ## LM-stat LM: p-value F-value F: p-value ## 0 2.95002 0.228776 0.290531 0.747933 Since the $$p$$-value is greater than 0.05 - we have no grounds to reject the null hypothesis and conclude that the WLS residuals are not serially correlated (as was the case with the OLS residuals). 
# print(car::ncvTest(mdl_4_wls_fit)) ## Non-constant Variance Score Test ## Variance formula: ~ fitted.values ## Chisquare = 0.6093169, Df = 1, p = 0.43505 BP_t = sm_diagnostic.het_breuschpagan(resid = e_star, exog_het = mdl_4.exog) print(pd.DataFrame(lzip(['LM statistic', 'p-value', 'F-value', 'F: p-value'], BP_t))) ## 0 1 ## 0 LM statistic 8.955895 ## 1 p-value 0.706693 ## 2 F-value 0.743151 ## 3 F: p-value 0.709505 The $$p$$-value is larger - we would not reject the null hypothesis that the residuals are homoskedastic. So, our WLS procedure did take into account all of the heteroskedasticity. Note that since we calculated the weights using the same exogeneous variables, as in the main model, it may be very likely, that the residuals variance depends on some additional exogeneous variables, which we did not include in our main model. Though in this case, it appears that we have accounted for most of the significant heteroskedasticity. Considerations Instead of bptest(), we have used ncvTest(): print(car::ncvTest(mdl_4_wls_fit)) ## Non-constant Variance Score Test ## Variance formula: ~ fitted.values ## Chisquare = 0.6093169, Df = 1, p = 0.43505 Note that ncvTest() supports either a weighted or unweighted linear model, produced by lm. See also this answer for comparison between bptest() and ncvTest(). Another discussion can be found here: • for lmtest::bptest() - the error variance is a function of a linear combination of the predictors in the model; • for car::ncvTest() - the error variance is a function of the expectation of $$Y$$ (i.e. $$\widehat{Y}$$) (variance formula ~ fitted.values). For the weighted model, ncvTest() uses the Pearson residuals and hence takes the weights into account. 
Unfortunately, even specifying varformula to use the fitted values, the result does not match that of ncvTest(): # BP test doesn't give the same results for the WLS case as ncvTest lmtest::bptest(mdl_4_wls_fit, varformula = ~ fitted.values(mdl_4_wls_fit), studentize = FALSE) ## ## Breusch-Pagan test ## ## data: mdl_4_wls_fit ## BP = 8.3746, df = 1, p-value = 0.003805 This is something to keep in mind in case you ever need to carry out a WLS and check its residuals. Considerations On the other hand, if there (hypothetically speaking) were still some heteroskedasticity - we would need to correct our WLS standard errors. We can do this quite easily with: tmp_out <- lmtest::coeftest(mdl_4_wls_fit, vcov. = sandwich::vcovHC(mdl_4_wls_fit, type = "HC3")) print(round(tmp_out, 4)) ## ## t test of coefficients: ## ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.2147 0.2809 4.3241 <2e-16 *** ## educ 0.0874 0.0282 3.0971 0.0020 ** ## I(educ^2) 0.0004 0.0008 0.5322 0.5947 ## exper 0.0541 0.0082 6.5746 <2e-16 *** ## I(exper^2) -0.0005 0.0001 -5.4087 <2e-16 *** ## south -0.0859 0.0305 -2.8164 0.0050 ** ## metro -0.4406 0.1479 -2.9800 0.0030 ** ## female -1.5567 0.3273 -4.7565 <2e-16 *** ## metro:female 1.2851 0.3540 3.6303 0.0003 *** ## educ:metro 0.0440 0.0106 4.1400 <2e-16 *** ## educ:female 0.1010 0.0232 4.3614 <2e-16 *** ## educ:exper -0.0016 0.0004 -3.8593 0.0001 *** ## educ:metro:female -0.0938 0.0251 -3.7391 0.0002 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 print(mdl_4_wls_fit.get_robustcov_results(cov_type = "HC3").summary()) ## WLS Regression Results ## ============================================================================== ## Dep. Variable: np.log(wage) R-squared: 0.403 ## Model: WLS Adj. R-squared: 0.395 ## Method: Least Squares F-statistic: 54.60 ## Date: Tue, 13 Oct 2020 Prob (F-statistic): 1.77e-99 ## Time: 21:44:43 Log-Likelihood: -565.53 ## No. Observations: 960 AIC: 1157. ## Df Residuals: 947 BIC: 1220. 
## Df Model: 12 ## Covariance Type: HC3 ## ====================================================================================== ## coef std err t P>|t| [0.025 0.975] ## -------------------------------------------------------------------------------------- ## Intercept 1.2147 0.281 4.324 0.000 0.663 1.766 ## educ 0.0874 0.028 3.097 0.002 0.032 0.143 ## np.power(educ, 2) 0.0004 0.001 0.532 0.595 -0.001 0.002 ## exper 0.0541 0.008 6.575 0.000 0.038 0.070 ## np.power(exper, 2) -0.0005 8.77e-05 -5.409 0.000 -0.001 -0.000 ## south -0.0859 0.030 -2.816 0.005 -0.146 -0.026 ## metro -0.4406 0.148 -2.980 0.003 -0.731 -0.150 ## female -1.5567 0.327 -4.757 0.000 -2.199 -0.914 ## metro:female 1.2851 0.354 3.630 0.000 0.590 1.980 ## metro:educ 0.0440 0.011 4.140 0.000 0.023 0.065 ## female:educ 0.1010 0.023 4.361 0.000 0.056 0.147 ## metro:female:educ -0.0938 0.025 -3.739 0.000 -0.143 -0.045 ## educ:exper -0.0016 0.000 -3.859 0.000 -0.002 -0.001 ## ============================================================================== ## Omnibus: 1.502 Durbin-Watson: 2.011 ## Prob(Omnibus): 0.472 Jarque-Bera (JB): 1.497 ## Skew: 0.096 Prob(JB): 0.473 ## Kurtosis: 2.983 Cond. No. 4.31e+04 ## ============================================================================== ## ## Warnings: ## [1] Standard Errors are heteroscedasticity robust (HC3) ## [2] The condition number is large, 4.31e+04. This might indicate that there are ## strong multicollinearity or other numerical problems. We would again return to the conclusion that we should remove $$educ^2$$ as it is insignificant (though we might get different results using HC0, HC1 or HC2, instead of HC3). In such a case, if heteroskedasticity were still present in WLS residuals - we could conclude the following: • the WLS correction did not take into account all possible heteroskedasticity. 
• After correcting WLS standard errors with HAC, we came to the same conclusion regarding $$educ^2$$ - we should remove it from our model as it is not significantly different from zero (note that we have not included any interactions with the polynomial term itself, so it would be safe to remove it). Consequently, having carried out all of these tests and different estimation methods, we would still like to account for the remaining heteroskedasticity. To do this we could: • examine residual vs explanatory variable scatter plots to get better weights - maybe there is a specific explanatory variable, which we should use as a weight? • remove outliers - if there are a few points which significantly skew the data - we should remove, as long as they are either due to an error, or some kind of exception. On the other hand, the residual vs fitted scatter plot does not indicate that any single outlier could be the only cause for heteroskedasticity. So it will most likely not completely fix heteroskedasticity. • get additional observations and additional variables - as we have seen from the full dataset - there are many more additional variables, and observations, which may result in a different model (as an example, the variable black is significant in the full dataset). Considerations For interests sake, if we were to compare the residuals for the original data - they would have minor differences: par(mfrow = c(1, 2)) # Plot fitted vs residual plots: plot(mdl_4_wls_fit$fitted.values, mdl_4_wls_fit$residuals, pch = 21, col = "black", bg = "cornflowerblue") points(mdl_4_fit$fitted.values, mdl_4_fit$residuals, pch = 21, col = "black", bg = "red") legend("topright", legend = c("WLS", "OLS"), pch = c(21, 21), pt.cex = 1.2, cex = 0.8, pt.bg = c("cornflowerblue", "red")) # Plot the residual histogram hist(mdl_4_fit$residuals, breaks = 30, col = "red", main = NULL)
hist(mdl_4_wls_fit$residuals, breaks = 30, col = "cornflowerblue", add = TRUE) legend("topright", legend = c("WLS", "OLS"), pch = c(22, 22), pt.cex = 1.2, cex = 0.8, pt.bg = c("cornflowerblue", "red")) fig = plt.figure(num = 11, figsize = (10, 8)) # Plot fitted vs residual plots: ax = fig.add_subplot(1, 2, 1) _ = ax.plot(mdl_4_wls_fit.fittedvalues, mdl_4_wls_fit.resid, linestyle = "None", marker = "o", markeredgecolor = "black", label = "WLS") _ = ax.plot(mdl_4_fit.fittedvalues, mdl_4_fit.resid, linestyle = "None", marker = "o", color = "red", markeredgecolor = "black", label = "OLS") _ = ax.legend() # Plot the residual histogram ax = fig.add_subplot(1, 2, 2) _ = ax.hist(mdl_4_fit.resid, bins = 30, edgecolor = "black", color = "red") _ = ax.hist(mdl_4_wls_fit.resid, bins = 30, edgecolor = "black") # # Fix layout in case the labels do overlap: _ = plt.tight_layout() plt.show() Again, since WLS fits a model on the transformed data, we are interested if the residuals, from the fitted transformed data adhere to our (MR.3) - (MR.6) assumptions. 
### 4.11.4 Exercise Set 4 #### 4.11.4.1 Task 14 We will carry out the Rainbow Test for Linearity: # rain_t1 <- lmtest::raintest(formula(mdl_4_fit), order.by = ~ educ, data = dt4_train) print(rain_t1) ## ## Rainbow test ## ## data: formula(mdl_4_fit) ## Rain = 0.88066, df1 = 480, df2 = 467, p-value = 0.9165 mdl_ord_1 = smf.ols(mdl_4.formula, data = dt4_train.iloc[np.argsort(dt4_train["educ"]), :]).fit() rain_t1 = sm_diagnostic.linear_rainbow(mdl_ord_1) print(pd.DataFrame(rain_t1, index = ["F-stat", "p-value"]).T) ## F-stat p-value ## 0 1.046716 0.309963 # rain_t2 <- lmtest::raintest(formula(mdl_4_fit), order.by = ~ exper, data = dt4_train) print(rain_t2) ## ## Rainbow test ## ## data: formula(mdl_4_fit) ## Rain = 1.1624, df1 = 480, df2 = 467, p-value = 0.05105 mdl_ord_2 = smf.ols(mdl_4.formula, data = dt4_train.iloc[np.argsort(dt4_train["exper"]), :]).fit() rain_t2 = sm_diagnostic.linear_rainbow(mdl_ord_2) print(pd.DataFrame(rain_t2, index = ["F-stat", "p-value"]).T) ## F-stat p-value ## 0 1.149015 0.06567 We see that: • Ordering the data by $$educ$$ gives the $$p$$-value > 0.05, so we have no grounds to reject the null hypothesis that the true model is linear (in other words, our specified linear model captures the nonlinearities). • Ordering the data by $$exper$$ gives the $$p$$-values which is quite close to 0.05, which may indicate, that there may indeed be some nonlinearities remaining. We could try to add $$exper^3$$ to the model, with the hope that it captures any remaining nonlinearities. On the other hand, there may be another explanatory variable, which is correlated with $$exper$$, but which we do not have in our dataset. #### 4.11.4.2 Task 15 Next, we will carry out the Ramsey Regression Specification Error Test. 
We will begin by testing for a possibility that we do not account for all quadratic terms in our model (in such a case, $$\widehat{Y}^2$$ would be significant in our model): reset_t2 <- lmtest::resettest(formula(mdl_4_fit), data = dt4_train, power = 2, type = "fitted") print(reset_t2) ## ## RESET test ## ## data: formula(mdl_4_fit) ## RESET = 0.45826, df1 = 1, df2 = 946, p-value = 0.4986 reset_t2 = oi.reset_ramsey(mdl_4_fit, degree = 2) print(reset_t2) ## <F test: F=array([[0.45826259]]), p=0.4986025708488683, df_denom=946, df_num=1> In this case, the $$p$$-value > 0.05, so we have no grounds to reject the null hypothesis, that our model adequately captures the quadratic properties of the dependent variable, $$wage$$. On the other hand, if we were to check for the cubic specification: reset_t3 <- lmtest::resettest(formula(mdl_4_fit), data = dt4_train, power = 3, type = "fitted") print(reset_t3) ## ## RESET test ## ## data: formula(mdl_4_fit) ## RESET = 10.962, df1 = 1, df2 = 946, p-value = 0.0009652 reset_t3 = oi.reset_ramsey(mdl_4_fit, degree = 3) print(reset_t3) ## <F test: F=array([[5.55307281]]), p=0.004003065215105123, df_denom=945, df_num=2> In this case, we have that $$p$$-value < 0.05, so we reject the null hypothesis and conclude that our model is inadequate. Considerations Would including cubic terms improve our model? While it might, it is more likely that there are additional explanatory (or even interaction) variables, which could help explain some of the remaining non-linearity in our model. #### 4.11.4.3 Task 16 We will use the automatic variable selection procedure to select the best model from the available regressor combinations. To make it easier to compare, we will plot the coefficients, $$R^2_{adj}$$ and $$BIC$$ for our best model: print(round(coef(summary(mdl_4_fit)), 4)) ## Estimate Std. 
Error t value Pr(>|t|) ## (Intercept) 1.1892 0.3630 3.2760 0.0011 ## educ 0.0955 0.0372 2.5701 0.0103 ## I(educ^2) -0.0001 0.0010 -0.0884 0.9296 ## exper 0.0529 0.0082 6.4544 0.0000 ## I(exper^2) -0.0005 0.0001 -5.9648 0.0000 ## south -0.0842 0.0309 -2.7266 0.0065 ## metro -0.4320 0.2345 -1.8424 0.0657 ## female -1.6197 0.3878 -4.1762 0.0000 ## metro:female 1.1697 0.4181 2.7978 0.0052 ## educ:metro 0.0439 0.0172 2.5442 0.0111 ## educ:female 0.1056 0.0273 3.8654 0.0001 ## educ:exper -0.0015 0.0004 -3.4252 0.0006 ## educ:metro:female -0.0864 0.0293 -2.9495 0.0033 print(mdl_4_fit.summary().tables[1]) ## ====================================================================================== ## coef std err t P>|t| [0.025 0.975] ## -------------------------------------------------------------------------------------- ## Intercept 1.1892 0.363 3.276 0.001 0.477 1.902 ## educ 0.0955 0.037 2.570 0.010 0.023 0.168 ## np.power(educ, 2) -9.233e-05 0.001 -0.088 0.930 -0.002 0.002 ## exper 0.0529 0.008 6.454 0.000 0.037 0.069 ## np.power(exper, 2) -0.0005 8.38e-05 -5.965 0.000 -0.001 -0.000 ## south -0.0842 0.031 -2.727 0.007 -0.145 -0.024 ## metro -0.4320 0.234 -1.842 0.066 -0.892 0.028 ## female -1.6197 0.388 -4.176 0.000 -2.381 -0.859 ## metro:female 1.1697 0.418 2.798 0.005 0.349 1.990 ## metro:educ 0.0439 0.017 2.544 0.011 0.010 0.078 ## female:educ 0.1056 0.027 3.865 0.000 0.052 0.159 ## metro:female:educ -0.0864 0.029 -2.949 0.003 -0.144 -0.029 ## educ:exper -0.0015 0.000 -3.425 0.001 -0.002 -0.001 ## ====================================================================================== print(BIC(mdl_4_fit)) ## [1] 1256.187 print(mdl_4_fit.bic) ## 1249.3198720313121 print(summary(mdl_4_fit)$adj.r.squared)
## [1] 0.3589044
print(mdl_4_fit.rsquared_adj)
## 0.3589044149386491

We will begin by simply using only the variables, which are in the original dataset:

# Exhaustive best-subset search over the original regressors only
# (nbest = 1 keeps the single best model per subset size).
out_vs1 <- leaps::regsubsets(log(wage) ~ ., data = dt4_train, nbest = 1)
print(summary(out_vs1))
## Subset selection object
## Call: regsubsets.formula(log(wage) ~ ., data = dt4_train, nbest = 1)
## 9 Variables  (and intercept)
##         Forced in Forced out
## black       FALSE      FALSE
## educ        FALSE      FALSE
## exper       FALSE      FALSE
## faminc      FALSE      FALSE
## female      FALSE      FALSE
## metro       FALSE      FALSE
## midwest     FALSE      FALSE
## south       FALSE      FALSE
## west        FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          black educ exper faminc female metro midwest south west
## 1  ( 1 ) " "   "*"  " "   " "    " "    " "   " "     " "   " "
## 2  ( 1 ) " "   "*"  "*"   " "    " "    " "   " "     " "   " "
## 3  ( 1 ) " "   "*"  "*"   " "    "*"    " "   " "     " "   " "
## 4  ( 1 ) " "   "*"  "*"   " "    "*"    "*"   " "     " "   " "
## 5  ( 1 ) " "   "*"  "*"   " "    "*"    "*"   " "     "*"   " "
## 6  ( 1 ) " "   "*"  "*"   "*"    "*"    "*"   " "     "*"   " "
## 7  ( 1 ) "*"   "*"  "*"   "*"    "*"    "*"   " "     "*"   " "
## 8  ( 1 ) " "   "*"  "*"   "*"    "*"    "*"   "*"     "*"   "*"
1
## 1
# Visualize the subset search: default scale is BIC; the second
# plot ranks models by adjusted R^2 instead.
par(mfrow = c(1, 2))
plot(out_vs1)
plot(out_vs1, scale = "adjr2")

In this case, we see that the recommended model with the smallest $$BIC$$ value is (Intercept) + educ + exper + female + metro.

1
## 1

If we compare the $$BIC$$ and $$R^2_{adj}$$:

# Refit the BIC-best specification from the subset search as a plain lm()
# so that standard model summaries (BIC, adjusted R^2) can be extracted.
mdl_auto <- lm(log(wage) ~ 1 + educ + exper + female + metro, dt4_train)
print(BIC(mdl_auto))
## [1] 1267.625
print(summary(mdl_auto)$adj.r.squared) ## [1] 0.3187659 1 ## 1 We see that they are worse than our model. Considerations We will next try to include the polynomial and interaction terms, which were significant in our model, in order to see whether a different combination of those parameters could have offered some improvement: out_vs2 <- leaps::regsubsets(log(wage) ~ . + I(educ^2) + I(exper^2) + educ:exper, data = dt4_train, nbest = 1) print(summary(out_vs2)) ## Subset selection object ## Call: regsubsets.formula(log(wage) ~ . + I(educ^2) + I(exper^2) + educ:exper, ## data = dt4_train, nbest = 1) ## 12 Variables (and intercept) ## Forced in Forced out ## black FALSE FALSE ## educ FALSE FALSE ## exper FALSE FALSE ## faminc FALSE FALSE ## female FALSE FALSE ## metro FALSE FALSE ## midwest FALSE FALSE ## south FALSE FALSE ## west FALSE FALSE ## I(educ^2) FALSE FALSE ## I(exper^2) FALSE FALSE ## educ:exper FALSE FALSE ## 1 subsets of each size up to 8 ## Selection Algorithm: exhaustive ## black educ exper faminc female metro midwest south west I(educ^2) I(exper^2) educ:exper ## 1 ( 1 ) " " " " " " " " " " " " " " " " " " "*" " " " " ## 2 ( 1 ) " " " " "*" " " " " " " " " " " " " "*" " " " " ## 3 ( 1 ) " " " " "*" " " " " " " " " " " " " "*" "*" " " ## 4 ( 1 ) " " " " "*" " " "*" " " " " " " " " "*" "*" " " ## 5 ( 1 ) " " "*" "*" " " "*" " " " " " " " " " " "*" "*" ## 6 ( 1 ) " " "*" "*" " " "*" "*" " " " " " " " " "*" "*" ## 7 ( 1 ) " " "*" "*" " " "*" "*" " " "*" " " " " "*" "*" ## 8 ( 1 ) " " "*" "*" "*" "*" "*" " " "*" " " " " "*" "*" 1 ## 1 par(mfrow = c(1, 2)) plot(out_vs2) plot(out_vs2, scale = "adjr2") In this case, the best model in terms of $$BIC$$ is (Intercept) + educ + exper + female + metro + I(exper^2) + educ:exper. 1 ## 1 The estimated automatically selected model is: mdl_auto <- lm(log(wage) ~ 1 + educ + exper + female + metro + I(exper^2) + educ:exper, dt4_train) print(round(coef(summary(mdl_auto)), 5)) ## Estimate Std. 
Error t value Pr(>|t|) ## (Intercept) 0.48289 0.17201 2.80730 0.00510 ## educ 0.14629 0.01102 13.27979 0.00000 ## exper 0.05523 0.00761 7.25554 0.00000 ## female -0.17155 0.02970 -5.77520 0.00000 ## metro 0.13339 0.03795 3.51527 0.00046 ## I(exper^2) -0.00050 0.00008 -5.92142 0.00000 ## educ:exper -0.00164 0.00038 -4.34780 0.00002 1 ## 1 print(BIC(mdl_auto)) ## [1] 1240.213 print(summary(mdl_auto)$adj.r.squared)
## [1] 0.3459774
1
## 1

which is slightly better in terms of $$BIC$$, as we have not removed the insignificant variable $$I(educ^2)$$, but also slightly worse in terms of $$R^2_{adj}$$. The signs are the same as in our manually specified model.

Note that we would also need to carry out all of the residual tests, which we have done before, as well as the model specification test and correct for heteroskedasticity, if we want to make sure that the significance is correct (note the significance of metro in the WLS model is the same as the one here, but it does not mean that this automatically selected model accounts for heteroskedasticity).

Furthermore:

• our original model includes more interaction terms - maybe including them here would also lead to a similar model;
• if we would not have our manually model - we would have to include all possible combinations of interaction terms. Some of these interactions may also cause collinearity problems.

We will try to specify a number of additional interaction terms (assume that we are fine with a maximum of $$10$$ explanatory variables in our model):

# Wider search: add regional/demographic four-way interaction expansions and
# the polynomial terms; nvmax = 10 caps the model size at 10 regressors.
out_vs3 <- leaps::regsubsets(log(wage) ~ . + black*female*educ*exper +
metro*south*educ*exper +
metro*west*educ*exper +
metro*midwest*educ*exper +
I(educ^2) + I(exper^2), data = dt4_train,
nvmax = 10, nbest = 1)
1
## 1
plot(out_vs3)

From the vertical axis, we see that there are several models, which have $$BIC$$ close to $$-380$$.

We can also plot some additional diagnostic plots to see how the $$BIC$$, $$R^2_{adj}$$ and number of variables relate:

# Diagnostic plots for the exhaustive search: adjusted R^2 and BIC
# as a function of the number of explanatory variables.
out_vs3_summary <- summary(out_vs3)
par(mfrow = c(1, 2))
# Fix: axis-label typo "djusted RSq" -> "Adjusted RSq".
plot(out_vs3_summary$adjr2, xlab = "Number of Variables",
     ylab = "Adjusted RSq", type = "l")
plot(out_vs3_summary$bic, xlab = "Number of Variables",
     ylab = "BIC", type = "l")

This exhaustive search suggests that our best model in terms of smallest $$BIC$$ is the one with $$8$$ explanatory variables (excluding the constant):

# Index (number of variables) of the subset with the smallest BIC;
# print that subset's coefficient estimates.
min_bic <- which.min(out_vs3_summary$bic) #out_vs3_summary$which
print(round(cbind(coef(out_vs3, min_bic)), 5))
##                            [,1]
## (Intercept)             0.75214
## educ                    0.13462
## exper                   0.05500
## female                 -0.61751
## I(exper^2)             -0.00050
## educ:female             0.03110
## educ:exper             -0.00195
## educ:exper:metro        0.00055
## educ:exper:metro:south -0.00038
1
## 1

Again, if we were to estimate this model (as it is difficult to extract certain model characteristics from the automated procedure output):

# Refit the BIC-best 8-variable specification as a plain lm().
# NOTE(review): the selected subset above reports educ:exper:metro, while this
# refit includes educ:metro instead - confirm against the regsubsets output.
mdl_auto <- lm(log(wage) ~ 1 + educ + exper + female +
I(exper^2) + educ:female + educ:exper +
educ:metro + educ:exper:metro:south, dt4_train)
1
## 1
print(round(coef(summary(mdl_auto)), 5))
##                        Estimate Std. Error  t value Pr(>|t|)
## (Intercept)             0.78535    0.18057  4.34926  0.00002
## educ                    0.12134    0.01225  9.90339  0.00000
## exper                   0.05440    0.00754  7.21134  0.00000
## female                 -0.61607    0.15346 -4.01448  0.00006
## I(exper^2)             -0.00052    0.00008 -6.22199  0.00000
## educ:female             0.03119    0.01050  2.97075  0.00305
## educ:exper             -0.00142    0.00038 -3.77580  0.00017
## educ:metro              0.01262    0.00277  4.55629  0.00001
## educ:exper:metro:south -0.00035    0.00009 -3.76491  0.00018
print(BIC(mdl_auto))
## [1] 1230.481
print(summary(mdl_auto)$adj.r.squared) ## [1] 0.3604282 1 ## 1 print(round(coef(summary(mdl_4_fit)), 4)) ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.1892 0.3630 3.2760 0.0011 ## educ 0.0955 0.0372 2.5701 0.0103 ## I(educ^2) -0.0001 0.0010 -0.0884 0.9296 ## exper 0.0529 0.0082 6.4544 0.0000 ## I(exper^2) -0.0005 0.0001 -5.9648 0.0000 ## south -0.0842 0.0309 -2.7266 0.0065 ## metro -0.4320 0.2345 -1.8424 0.0657 ## female -1.6197 0.3878 -4.1762 0.0000 ## metro:female 1.1697 0.4181 2.7978 0.0052 ## educ:metro 0.0439 0.0172 2.5442 0.0111 ## educ:female 0.1056 0.0273 3.8654 0.0001 ## educ:exper -0.0015 0.0004 -3.4252 0.0006 ## educ:metro:female -0.0864 0.0293 -2.9495 0.0033 print(BIC(mdl_4_fit)) ## [1] 1256.187 print(summary(mdl_4_fit)$adj.r.squared)
## [1] 0.3589044
1
## 1

The automated procedure resulted in a model with a lower $$BIC$$ and a higher $$R^2$$. Furthermore, it appears that all of the coefficients are statistically significant. Nevertheless, we may also need to carry out all of the residuals tests to verify that this model is adequate.

Careful!

While the automated procedure resulted in a model with a lower $$BIC$$ and a higher $$R^2$$ - the main problem may come from the variable signs - one of the differences from our manually specified model is that we have $$educ \times exper \times metro \times south$$.

However, does this coefficient sign make economic sense, especially considering the fact that the automated model includes $$south$$ only in the interaction term? Since $$south$$ is not included in the model itself, what is the region, or the base case(-s), that this interaction is being compared to?

Furthermore, imagine if the automated model is estimated at the first step of the analysis - before making any assumptions about variable signs, functional form, etc. You run the risk of tailoring your interpretations to the estimated model - you run the risk of evaluating a spurious relationship (not to mention data dredging as well).

Finally, as already mentioned, residual and model adequacy tests are still needed to be carried out for this model. If the model residuals are inadequate - this furthers the previously mentioned data dredging problem.

#### 4.11.4.4 Task 17

Taking note of our concerns for the automatically selected model in the previous task, we opt to use our manually specified mdl_4_wls_fit and can calculate the fitted values on the test subset as follows:

dt4_test_predict_wls <- predict(mdl_4_wls_fit, newdata = dt4_test)
dt4_test_predict_wls = mdl_4_wls_fit.predict(exog = dt4_test, transform = True)

We can calculate the $$MSE$$ and compare with the in-sample MSE for the logarithm of wage:

print(paste0("out-of-sample MSE = ", mean((log(dt4_test$wage) - dt4_test_predict_wls)^2))) ## [1] "out-of-sample MSE = 0.221343528223734" print("out-of-sample MSE = " + str(np.mean((np.log(dt4_test["wage"]) - dt4_test_predict_wls)**2))) ## out-of-sample MSE = 0.22134352822373488 print(paste0("in-sample MSE = ", mean((mdl_4_wls_fit$residuals)^2)))
## [1] "in-sample MSE = 0.196516478628273"
print("in-sample MSE = " + str(np.mean((mdl_4_wls_fit.resid)**2)))
## in-sample MSE = 0.19651647862827215

As expected, the in-sample MSE is lower than the out-of-sample MSE on new data.

Considerations

For interest's sake, we can compare the fitted values from the WLS and OLS:

# R: out-of-sample predictions from the OLS model, for comparison with WLS
dt4_test_predict_ols <- predict(mdl_4_fit, newdata = dt4_test)
# Python equivalent:
dt4_test_predict_ols = mdl_4_fit.predict(exog = dt4_test, transform = True)
# R: scatterplot of actual log(wage) (black) with OLS (blue) and WLS (red)
# fitted values on the test subset
plot(1:nrow(dt4_test), log(dt4_test$wage), col = "black")
#
points(1:nrow(dt4_test), dt4_test_predict_ols, col = "blue")
#
points(1:nrow(dt4_test), dt4_test_predict_wls, col = "red")
#
legend("topleft", legend = c("actual", "fit_ols", "fit_wls"),
       pch = 1, col = c("black", "blue", "red"))

# Python equivalent:
fig = plt.figure(num = 15, figsize = (10, 8))
_ = plt.plot(list(range(0, len(dt4_test.index))), np.log(dt4_test["wage"]),
             label = "actual", linestyle = "None", marker = "o",
             color = "None", markeredgecolor = "black")
_ = plt.plot(list(range(0, len(dt4_test.index))), dt4_test_predict_ols,
             label = "fit_ols", linestyle = "None", marker = "o",
             color = "None", markeredgecolor = "blue")
_ = plt.plot(list(range(0, len(dt4_test.index))), dt4_test_predict_wls,
             label = "fit_wls", linestyle = "None", marker = "o",
             color = "None", markeredgecolor = "red")
_ = plt.legend(loc = "upper left")
plt.show()

We can calculate the percentage difference:

# R: five-number summary of the percentage difference between OLS and WLS fits
print(summary((dt4_test_predict_ols - dt4_test_predict_wls) / dt4_test_predict_wls * 100))
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
##  -14.78772  -0.24819   0.17354  -0.01992   0.41373   1.96355
# Python equivalent:
print(pd.DataFrame({"diff": (dt4_test_predict_ols - dt4_test_predict_wls) / dt4_test_predict_wls * 100}).describe())
##              diff
## count  240.000000
## mean    -0.019919
## std      1.176930
## min    -14.787719
## 25%     -0.248192
## 50%      0.173544
## 75%      0.413728
## max      1.963548

We see that the predictions are on average very close. We see that the OLS fitted values are $$\sim 14.8\%$$ lower than WLS at the minimum and only $$\sim 2\%$$ larger than WLS at the maximum. Looking at the scatterplot, we see that the largest difference is for the prediction of the lowest value. Whether this is better, or worse, depends on the residuals:

# R: test-set residuals (actual minus fitted) for OLS (blue) and WLS (red)
plot(1:nrow(dt4_test), log(dt4_test$wage) - dt4_test_predict_ols,
     col = "blue")
#
points(1:nrow(dt4_test), log(dt4_test$wage) - dt4_test_predict_wls,
       col = "red")

# Python equivalent:
fig = plt.figure(num = 15, figsize = (10, 8))
_ = plt.plot(list(range(0, len(dt4_test.index))), np.log(dt4_test["wage"]) - dt4_test_predict_ols,
             linestyle = "None", marker = "o", color = "None", markeredgecolor = "blue")
_ = plt.plot(list(range(0, len(dt4_test.index))), np.log(dt4_test["wage"]) - dt4_test_predict_wls,
             linestyle = "None", marker = "o", color = "None", markeredgecolor = "red")
plt.show()