## 5.4 Example

Considerations

The following libraries will be useful for this exercise.

# Attach every package required for this exercise, silencing the
# startup messages, warnings, and other load-time noise they emit.
suppressPackageStartupMessages(
  suppressWarnings(
    suppressMessages(
      invisible(lapply(
        c("lmtest", "lrmest", "tseries", "nortest", "car", "sandwich",
          "lattice", "viridisLite", "leaps", "InformationValue", "ROCR",
          "mfx"),
        library,
        character.only = TRUE
      ))
    )
  )
)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import patsy as patsy
#
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.diagnostic as sm_diagnostic
import statsmodels.stats.stattools as sm_tools
import statsmodels.stats as smstats
import statsmodels.stats.outliers_influence as oi
#
from sklearn import metrics
#
from statsmodels.stats.outliers_influence import summary_table
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Additional methods for output formatting
from statsmodels.compat import lzip
# Additional method for custom plot legend creation
from matplotlib.lines import Line2D
#
import seaborn as sns

Note that the response variables in these datasets are fundamentally different from the ones in 3.11 and 4.11.

We will use the fifth dataset. The MROZ dataset contains cross-sectional labor force participation data:

• inlf - =1 if in labor force, 1975

• hours- hours worked, 1975

• kidslt6- # kids < 6 years

• kidsge6- # kids 6-18

• age- woman’s age in yrs

• educ- years of schooling

• wage- estimated wage from earns., hours

• repwage- reported wage at interview in 1976

• hushrs- hours worked by husband, 1975

• husage- husband’s age

• huseduc- husband’s years of schooling

• huswage- husband’s hourly wage, 1975

• faminc- family income, 1975

• mtr - fed. marginal tax rate facing woman

• motheduc- mother’s years of schooling

• fatheduc- father’s years of schooling

• unem- unem. rate in county of resid.

• city- =1 if live in SMSA

• exper- actual labor mkt exper

• nwifeinc- (faminc - wage*hours)/1000

• lwage- log(wage)

• expersq - exper^2

Assume that we are interested in modelling the labor force participation, inlf, of married women.

# Download the MROZ labor-force dataset (Stata format) and coerce it to a
# plain data.frame, then preview the first six rows.
dt5 <- data.frame(
  foreign::read.dta("http://fmwww.bc.edu/ec-p/data/wooldridge/mroz.dta")
)
print(head(dt5, n = 6))
##   inlf hours kidslt6 kidsge6 age educ   wage repwage hushrs husage huseduc huswage faminc    mtr motheduc fatheduc unem city exper  nwifeinc     lwage expersq
## 1    1  1610       1       0  32   12 3.3540    2.65   2708     34      12  4.0288  16310 0.7215       12        7  5.0    0    14 10.910060 1.2101541     196
## 2    1  1656       0       2  30   12 1.3889    2.65   2310     30       9  8.4416  21800 0.6615        7        7 11.0    1     5 19.499981 0.3285121      25
## 3    1  1980       1       3  35   12 4.5455    4.04   3072     40      12  3.5807  21040 0.6915       12        7  5.0    0    15 12.039910 1.5141380     225
## 4    1   456       0       3  34   12 1.0965    3.25   1920     53      10  3.5417   7300 0.7815        7        7  5.0    0     6  6.799996 0.0921233      36
## 5    1  1568       1       2  31   14 4.5918    3.60   2000     32      12 10.0000  27300 0.6215       12       14  9.5    1     7 20.100060 1.5242720      49
## 6    1  2032       0       0  54   12 4.7421    4.70   1040     57      11  6.7106  19495 0.6915       14        7  7.5    1    33  9.859054 1.5564801    1089
#
print(dt5.head(6))
##    inlf   hours  kidslt6  kidsge6   age  educ    wage  repwage  hushrs  husage  huseduc  huswage   faminc     mtr  motheduc  fatheduc  unem  city  exper   nwifeinc     lwage  expersq
## 0   1.0  1610.0      1.0      0.0  32.0  12.0  3.3540     2.65  2708.0    34.0     12.0   4.0288  16310.0  0.7215      12.0       7.0   5.0   0.0   14.0  10.910060  1.210154    196.0
## 1   1.0  1656.0      0.0      2.0  30.0  12.0  1.3889     2.65  2310.0    30.0      9.0   8.4416  21800.0  0.6615       7.0       7.0  11.0   1.0    5.0  19.499981  0.328512     25.0
## 2   1.0  1980.0      1.0      3.0  35.0  12.0  4.5455     4.04  3072.0    40.0     12.0   3.5807  21040.0  0.6915      12.0       7.0   5.0   0.0   15.0  12.039910  1.514138    225.0
## 3   1.0   456.0      0.0      3.0  34.0  12.0  1.0965     3.25  1920.0    53.0     10.0   3.5417   7300.0  0.7815       7.0       7.0   5.0   0.0    6.0   6.799996  0.092123     36.0
## 4   1.0  1568.0      1.0      2.0  31.0  14.0  4.5918     3.60  2000.0    32.0     12.0  10.0000  27300.0  0.6215      12.0      14.0   9.5   1.0    7.0  20.100060  1.524272     49.0
## 5   1.0  2032.0      0.0      0.0  54.0  12.0  4.7421     4.70  1040.0    57.0     11.0   6.7106  19495.0  0.6915      14.0       7.0   7.5   1.0   33.0   9.859054  1.556480   1089.0

### 5.4.1 Exercise Set 1

We will begin by dividing our data into training and test sets. These are extremely important for binary outcome modelling - we need to make sure our model can correctly classify new data.

y <- h$counts; y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, ...) } panel.abs_cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){ usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y, use = "complete.obs")) txt <- format(c(r, 0.123456789), digits = digits)[1] txt <- paste0(prefix, txt) if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = 2) } plt_cols <- c("hours", "age", "educ", "wage", "exper", "husage", "huseduc", "huswage", "faminc") # # # # pairs(dt5_train[, plt_cols], diag.panel = panel.hist, lower.panel = panel.smooth, upper.panel = panel.abs_cor, col = "dodgerblue4", pch = 21, bg = adjustcolor("dodgerblue3", alpha = 0.2)) plt_cols = ["hours", "age", "educ", "wage", "exper", "husage", "huseduc", "huswage", "faminc"] axes = pd.plotting.scatter_matrix(dt5_train[plt_cols], alpha = 0.2, figsize = (20, 15), marker = "o", hist_kwds = dict(edgecolor = "black", linewidth = 1, bins = 30), edgecolor = "black") abs_corr = np.abs(dt5_train.corr().values) for i, j in zip(*plt.np.triu_indices_from(axes, k = 1)): #triu - TRI-angle U-pper _ = axes[i, j].set_xlim((1.1, 1.12)) _ = axes[i, j].set_ylim((1.1, 1.12)) _ = axes[i, j].annotate("%.3f" %abs_corr[i,j], (0.5, 0.5), xycoords = 'axes fraction', ha = 'center', va = 'center', fontsize = 20) _ = plt.tight_layout() plt.show() myPanel <- function(x, y, z, ...){ lattice::panel.levelplot(x,y,z,...) 
my_text <- ifelse(!is.na(z), paste0(round(z, 4)), "") lattice::panel.text(x, y, my_text) } # mask = cor(dt5_train[, plt_cols], use = "complete.obs") mask[upper.tri(mask, diag = TRUE)] <- NA # lattice::levelplot(mask, panel = myPanel, col.regions = viridisLite::viridis(100), main = 'Correlation of numerical variables') # # # Generate a mask for the upper triangle mask = np.zeros_like(dt5_train[plt_cols].corr(), dtype = np.bool) mask[np.triu_indices_from(mask)] = True # Generate a custom diverging colormap cmap = sns.diverging_palette(220, 10, as_cmap = True) # fig = plt.figure(figsize = (10, 8)) _ = plt.title('Correlation of numerical variables', size = 25) _ = sns.heatmap(dt5_train[plt_cols].corr(), mask = mask, cmap = cmap, annot = True) _ = plt.ylim((len(dt5_train[plt_cols].corr()), 0)) # See bug on bottom cutoff: https://github.com/mwaskom/seaborn/issues/1773 plt.show() We see that for some variables - like age, educ, exper, there is a difference between the two groups. We would expect the following relationship between the independent variables and the participation rate: • $$\beta_{educ} > 0$$ - a higher education is more likely to lead to an active labor force participation, since the person has already invested their time into improving themselves and have a better chance of earning a higher wage, hence they are more likely to be motivated to return to the workforce; • $$\beta_{exper} > 0$$ - the more experience they have, the more likely that they will return to a higher pay job, hence they are more motivated to return to the workforce; • $$\beta_{age} < 0$$ - the older they are, the less likely they are to be hired back in a good paying position. 
Hence they may be discouraged from returning to the workforce; In addition: • $$\beta_{kidslt6} < 0$$ - the more younger kinds in the family, the higher chance that one of the parents will remain at home to look after them; • $$\beta_{kidsge6} > 0$$ - older children are at school or daycare, so the parent is less likely to stay at home, when they could be working (especially since buying school supplies, clothes, etc. would benefit from additional income). Considerations We can see that the correlations in R and Python are different. We can check for missing values: print(cbind(apply(dt5_train[plt_cols], 2, function(x){sum(is.na(x))}))) ## [,1] ## hours 0 ## age 0 ## educ 0 ## wage 174 ## exper 0 ## husage 0 ## huseduc 0 ## huswage 0 ## faminc 0 print(dt5_train[plt_cols].isna().sum()) ## hours 0 ## age 0 ## educ 0 ## wage 174 ## exper 0 ## husage 0 ## huseduc 0 ## huswage 0 ## faminc 0 ## dtype: int64 So, wage has 174 missing observations. If we compare the correlation matrix of all of these variables: print(cor(dt5_train[, plt_cols], use = "complete.obs")) ## hours age educ wage exper husage huseduc huswage faminc ## hours 1.00000000 0.05486465 -0.06487945 -0.09762756 0.29923008 0.04584942 -0.08593461 -0.11031196 0.15073816 ## age 0.05486465 1.00000000 -0.05217135 0.03039444 0.48364624 0.89442269 -0.06931940 0.08869675 0.11392319 ## educ -0.06487945 -0.05217135 1.00000000 0.34195444 -0.01520617 -0.06985821 0.59434325 0.30300522 0.36232809 ## wage -0.09762756 0.03039444 0.34195444 1.00000000 0.05499150 0.02565592 0.16632982 0.21588573 0.30265172 ## exper 0.29923008 0.48364624 -0.01520617 0.05499150 1.00000000 0.41387168 -0.08321311 -0.11167355 -0.02748859 ## husage 0.04584942 0.89442269 -0.06985821 0.02565592 0.41387168 1.00000000 -0.11392087 0.07238782 0.08672004 ## huseduc -0.08593461 -0.06931940 0.59434325 0.16632982 -0.08321311 -0.11392087 1.00000000 0.39641601 0.35468441 ## huswage -0.11031196 0.08869675 0.30300522 0.21588573 -0.11167355 0.07238782 0.39641601 
1.00000000 0.66875621 ## faminc 0.15073816 0.11392319 0.36232809 0.30265172 -0.02748859 0.08672004 0.35468441 0.66875621 1.00000000 print(dt5_train[plt_cols].corr()) ## hours age educ wage exper husage huseduc huswage faminc ## hours 1.000000 -0.009723 0.079324 -0.097628 0.399119 -0.020243 -0.033895 -0.096275 0.186113 ## age -0.009723 1.000000 -0.131189 0.030394 0.370998 0.888971 -0.154236 0.039179 0.065755 ## educ 0.079324 -0.131189 1.000000 0.341954 0.065681 -0.160103 0.592238 0.295962 0.348235 ## wage -0.097628 0.030394 0.341954 1.000000 0.054991 0.025656 0.166330 0.215886 0.302652 ## exper 0.399119 0.370998 0.065681 0.054991 1.000000 0.291947 -0.042077 -0.100794 -0.012938 ## husage -0.020243 0.888971 -0.160103 0.025656 0.291947 1.000000 -0.198246 0.029199 0.048197 ## huseduc -0.033895 -0.154236 0.592238 0.166330 -0.042077 -0.198246 1.000000 0.389622 0.342842 ## huswage -0.096275 0.039179 0.295962 0.215886 -0.100794 0.029199 0.389622 1.000000 0.679731 ## faminc 0.186113 0.065755 0.348235 0.302652 -0.012938 0.048197 0.342842 0.679731 1.000000 with the correlation matrix without wage: print(cor(dt5_train[, plt_cols[-4]], use = "complete.obs")) ## hours age educ exper husage huseduc huswage faminc ## hours 1.000000000 -0.009722653 0.07932400 0.39911906 -0.02024281 -0.03389459 -0.09627502 0.18611296 ## age -0.009722653 1.000000000 -0.13118868 0.37099790 0.88897071 -0.15423616 0.03917928 0.06575474 ## educ 0.079323997 -0.131188678 1.00000000 0.06568143 -0.16010274 0.59223834 0.29596240 0.34823451 ## exper 0.399119061 0.370997900 0.06568143 1.00000000 0.29194744 -0.04207658 -0.10079356 -0.01293827 ## husage -0.020242809 0.888970711 -0.16010274 0.29194744 1.00000000 -0.19824603 0.02919907 0.04819651 ## huseduc -0.033894595 -0.154236156 0.59223834 -0.04207658 -0.19824603 1.00000000 0.38962214 0.34284234 ## huswage -0.096275018 0.039179278 0.29596240 -0.10079356 0.02919907 0.38962214 1.00000000 0.67973079 ## faminc 0.186112959 0.065754743 0.34823451 -0.01293827 
0.04819651 0.34284234 0.67973079 1.00000000 print(dt5_train[plt_cols[:2] + plt_cols[4:]].corr()) ## hours age exper husage huseduc huswage faminc ## hours 1.000000 -0.009723 0.399119 -0.020243 -0.033895 -0.096275 0.186113 ## age -0.009723 1.000000 0.370998 0.888971 -0.154236 0.039179 0.065755 ## exper 0.399119 0.370998 1.000000 0.291947 -0.042077 -0.100794 -0.012938 ## husage -0.020243 0.888971 0.291947 1.000000 -0.198246 0.029199 0.048197 ## huseduc -0.033895 -0.154236 -0.042077 -0.198246 1.000000 0.389622 0.342842 ## huswage -0.096275 0.039179 -0.100794 0.029199 0.389622 1.000000 0.679731 ## faminc 0.186113 0.065755 -0.012938 0.048197 0.342842 0.679731 1.000000 We see that Python calculates pairwise correlations, based on their individual observation availability, while R excludes all the rows that have NA values and calculates the correlation for each variables on the same data. When estimating a model, all of the rows, where at least one of the included explanatory variable is NA, will be dropped. • In this case, R calculates the correlation matrix, which would show the variable correlations if we were to include wage in the model. • On the other hand, if we won’t include wage, then the correlation matrix in Python is convenient. 
If we wanted, we could make Python drop the rows with NA values and calculate the correlation matrix, to get the same results as in R: print(dt5_train[plt_cols].dropna().corr()) ## hours age educ wage exper husage huseduc huswage faminc ## hours 1.000000 0.054865 -0.064879 -0.097628 0.299230 0.045849 -0.085935 -0.110312 0.150738 ## age 0.054865 1.000000 -0.052171 0.030394 0.483646 0.894423 -0.069319 0.088697 0.113923 ## educ -0.064879 -0.052171 1.000000 0.341954 -0.015206 -0.069858 0.594343 0.303005 0.362328 ## wage -0.097628 0.030394 0.341954 1.000000 0.054991 0.025656 0.166330 0.215886 0.302652 ## exper 0.299230 0.483646 -0.015206 0.054991 1.000000 0.413872 -0.083213 -0.111674 -0.027489 ## husage 0.045849 0.894423 -0.069858 0.025656 0.413872 1.000000 -0.113921 0.072388 0.086720 ## huseduc -0.085935 -0.069319 0.594343 0.166330 -0.083213 -0.113921 1.000000 0.396416 0.354684 ## huswage -0.110312 0.088697 0.303005 0.215886 -0.111674 0.072388 0.396416 1.000000 0.668756 ## faminc 0.150738 0.113923 0.362328 0.302652 -0.027489 0.086720 0.354684 0.668756 1.000000 Finally, looking at the above plots and the data definitions we note that someone not in the labor force should not have any wage. So, it may be of interest whether wage is only NA for someone not in the labor force? We will use the whole dataset to get a better look: print(table(dt5$inlf[is.na(dt5$wage)])) ## ## 0 ## 325 print(dt5["inlf"][pd.isna(dt5['wage']) == True].value_counts()) ## 0.0 325 ## Name: inlf, dtype: int64 As we have suspected, wage is only NA if someone is not in the labor force. We might at first think that it may be more practical to replace the NA values with zero to indicate that such people “earn” zero dollars. On the other hand, it is natural that someone not working will also not earn any money. So the cases with wage = 0 will naturally mean that someone is not in the labor force. 
In other words, if our model were to include wage with 0 instead of NA - then it will perfectly predict when someone is not in the labor force. In other words the statement that: If someone “earns” a wage of 0 - they are not in the labor force. is obvious, as only working people receive wages. Hence why we should not replace NA in wage with 0, since the NA values are used to indicate whether someone is in the labor force - they are not just missing data! • Overall, the current wage itself will not help us determine whether someone will return to the labor force or not. • If we were to have a variable that indicated, for example, the average wage earned by the person in question - it may be useful in determining whether such person would return to the labor force or not. #### 5.4.1.3 Task 3 We will estimate both a logit and probit models. my_formula <- inlf ~ 1 + educ + exper + age + kidslt6 + kidsge6 logit_glm <- glm(my_formula, data = dt5_train, family = binomial(link = "logit")) print(round(coef(summary(logit_glm)), 5)) ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 1.44243 1.00899 1.42958 0.15284 ## educ 0.18468 0.04800 3.84757 0.00012 ## exper 0.12986 0.01691 7.67801 0.00000 ## age -0.09074 0.01700 -5.33889 0.00000 ## kidslt6 -1.42174 0.22830 -6.22740 0.00000 ## kidsge6 0.07344 0.08762 0.83818 0.40193 my_formula = "inlf ~ 1 + educ + exper + age + kidslt6 + kidsge6" logit_glm = smf.logit(my_formula, data = dt5_train).fit(disp = 0) print(logit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. 
z P>|z| [0.025 0.975] ## Intercept 1.44243 1.00899 1.42958 0.15284 -0.53515 3.42001 ## educ 0.18468 0.04800 3.84757 0.00012 0.09060 0.27875 ## exper 0.12986 0.01691 7.67801 0.00000 0.09671 0.16302 ## age -0.09074 0.01700 -5.33889 0.00000 -0.12405 -0.05743 ## kidslt6 -1.42174 0.22830 -6.22740 0.00000 -1.86921 -0.97427 ## kidsge6 0.07344 0.08762 0.83818 0.40193 -0.09829 0.24517 probit_glm <- glm(my_formula, data = dt5_train, family = binomial(link = "probit")) print(round(coef(summary(probit_glm)), 5)) ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 0.93704 0.58188 1.61038 0.10731 ## educ 0.11016 0.02755 3.99892 0.00006 ## exper 0.07380 0.00925 7.98287 0.00000 ## age -0.05496 0.00969 -5.67441 0.00000 ## kidslt6 -0.86249 0.13284 -6.49269 0.00000 ## kidsge6 0.03735 0.05087 0.73418 0.46284 probit_glm = smf.probit(my_formula, data = dt5_train).fit(disp = 0) print(probit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. z P>|z| [0.025 0.975] ## Intercept 0.93705 0.58526 1.60109 0.10936 -0.21003 2.08413 ## educ 0.11016 0.02764 3.98617 0.00007 0.05600 0.16433 ## exper 0.07380 0.00903 8.16866 0.00000 0.05610 0.09151 ## age -0.05496 0.00971 -5.66135 0.00000 -0.07399 -0.03594 ## kidslt6 -0.86249 0.13360 -6.45577 0.00000 -1.12434 -0.60064 ## kidsge6 0.03735 0.05020 0.74404 0.45685 -0.06104 0.13574 The signs are as we would expect, but kidsge6 appears to be insignificant. 
#### 5.4.1.4 Task 4 We will check for collinearity via the VIF (or GVIF, if needed): # # # print(cbind(car::vif(logit_glm))) ## [,1] ## educ 1.074126 ## exper 1.156804 ## age 1.703891 ## kidslt6 1.450408 ## kidsge6 1.242962 vif = pd.DataFrame() vif["VIF Factor"] = [oi.variance_inflation_factor(logit_glm.model.exog, i) for i in range(1, logit_glm.model.exog.shape[1])] vif["Variable"] = logit_glm.model.exog_names[1:] print(vif) ## VIF Factor Variable ## 0 1.048660 educ ## 1 1.232963 exper ## 2 1.521952 age ## 3 1.242066 kidslt6 ## 4 1.234228 kidsge6 Note that if we were to calculate the $$VIF$$ on an lm() model: print(cbind(car::vif(lm(my_formula, data = dt5_train)))) ## [,1] ## educ 1.048660 ## exper 1.232963 ## age 1.521952 ## kidslt6 1.242066 ## kidsge6 1.234228 Then we would get the same results as in Python. Since all of the $$VIF < 5$$, we do not have any collinear variables. ### 5.4.2 Exercise Set 2 #### 5.4.2.1 Task 5 We may want to include the following variables: • $$educ^2$$ - each additional year of education would have a decreasing effect on the participation rate, so that $$\beta_{educ^2} < 0$$; • $$exper^2$$ - each additional year of experience would have a decreasing effect on the participation rate, so that $$\beta_{exper^2} < 0$$; • $$age^2$$ - each additional year of experience would have an increasing negative effect on the participation rate, so that $$\beta_{age^2} < 0$$; • $$kidslt6^2$$ - each additional young kid would significantly reduce the participation rate further, so that $$\beta_{kidslt6^2} < 0$$ • $$kidsge6^2$$ - each additional older kid would further increase the participation rate, since more money is needed for clothes, books, trips, after school activities, etc. On the other hand, younger children may get hand-me-downs from their older siblings. 
Regarding interaction terms, lets assume that we want to include: • $$kidslt6 \times kidsge6$$ - if we already have a number of older kids, and then we have a younger child, then it may be more likely that the older sibling will babysit the younger ones. Meanwhile the parents can go to work. If we try to include all of them at the same time: logit_glm <- glm(inlf ~ 1 + educ + I(educ^2) + exper + I(exper^2) + age + I(age^2) + kidslt6 + I(kidslt6^2) + kidsge6 + I(kidsge6^2), data = dt5_train, family = binomial(link = "logit")) print(round(coef(summary(logit_glm)), 5)) ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 2.37291 3.89189 0.60971 0.54206 ## educ -0.21699 0.31343 -0.69231 0.48874 ## I(educ^2) 0.01618 0.01287 1.25699 0.20876 ## exper 0.20877 0.03959 5.27303 0.00000 ## I(exper^2) -0.00293 0.00128 -2.29261 0.02187 ## age -0.04097 0.16160 -0.25351 0.79987 ## I(age^2) -0.00054 0.00187 -0.28590 0.77495 ## kidslt6 -0.87533 0.62116 -1.40920 0.15878 ## I(kidslt6^2) -0.34784 0.36620 -0.94985 0.34219 ## kidsge6 0.10136 0.22997 0.44074 0.65940 ## I(kidsge6^2) -0.00725 0.05133 -0.14126 0.88767 logit_glm = smf.logit("inlf ~ 1 + educ + np.power(educ, 2) + exper + np.power(exper, 2) + age + np.power(age, 2) + kidslt6 + np.power(kidslt6, 2) + kidsge6 + np.power(kidsge6, 2)", data = dt5_train).fit(disp = 0) print(logit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. 
z P>|z| [0.025 0.975] ## Intercept 2.37291 3.89189 0.60971 0.54206 -5.25505 10.00088 ## educ -0.21699 0.31343 -0.69231 0.48874 -0.83129 0.39731 ## np.power(educ, 2) 0.01618 0.01287 1.25699 0.20876 -0.00905 0.04141 ## exper 0.20877 0.03959 5.27303 0.00000 0.13117 0.28638 ## np.power(exper, 2) -0.00293 0.00128 -2.29261 0.02187 -0.00544 -0.00043 ## age -0.04097 0.16160 -0.25351 0.79987 -0.35769 0.27576 ## np.power(age, 2) -0.00054 0.00187 -0.28590 0.77495 -0.00421 0.00314 ## kidslt6 -0.87533 0.62116 -1.40920 0.15878 -2.09278 0.34211 ## np.power(kidslt6, 2) -0.34784 0.36620 -0.94985 0.34219 -1.06557 0.36990 ## kidsge6 0.10136 0.22997 0.44074 0.65940 -0.34938 0.55210 ## np.power(kidsge6, 2) -0.00725 0.05133 -0.14126 0.88767 -0.10785 0.09335 We see lots of insignificant variables. Instead, let’s try adding one by one. logit_glm <- glm(inlf ~ 1 + educ + I(educ^2) + exper + age + kidslt6 + kidsge6, data = dt5_train, family = binomial(link = "logit")) print(round(coef(summary(logit_glm)), 5)) ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 3.57652 2.06186 1.73461 0.08281 ## educ -0.17722 0.30834 -0.57476 0.56546 ## I(educ^2) 0.01497 0.01265 1.18385 0.23647 ## exper 0.12976 0.01686 7.69479 0.00000 ## age -0.09132 0.01699 -5.37619 0.00000 ## kidslt6 -1.44724 0.23135 -6.25566 0.00000 ## kidsge6 0.08243 0.08795 0.93722 0.34864 logit_glm = smf.logit("inlf ~ 1 + educ + np.power(educ, 2) + exper + age + kidslt6 + kidsge6", data = dt5_train).fit(disp = 0) print(logit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. 
z P>|z| [0.025 0.975] ## Intercept 3.57652 2.06186 1.73461 0.08281 -0.46465 7.61770 ## educ -0.17722 0.30834 -0.57476 0.56546 -0.78155 0.42711 ## np.power(educ, 2) 0.01497 0.01265 1.18385 0.23647 -0.00982 0.03977 ## exper 0.12976 0.01686 7.69479 0.00000 0.09671 0.16281 ## age -0.09132 0.01699 -5.37619 0.00000 -0.12461 -0.05803 ## kidslt6 -1.44724 0.23135 -6.25566 0.00000 -1.90068 -0.99381 ## kidsge6 0.08243 0.08795 0.93722 0.34864 -0.08996 0.25482 We see that educ^2 and kidsge6 are insignificant. We remove educ^2 for now and continue adding our variables (ignoring the insignificance of kidsge6 for now): logit_glm <- glm(inlf ~ educ + exper + I(exper^2) + age + kidslt6 + kidsge6, data = dt5_train, family = binomial(link = "logit")) print(round(coef(summary(logit_glm)), 5)) ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 1.17661 1.02507 1.14783 0.25104 ## educ 0.17237 0.04844 3.55860 0.00037 ## exper 0.20720 0.03857 5.37226 0.00000 ## I(exper^2) -0.00293 0.00124 -2.36180 0.01819 ## age -0.08862 0.01729 -5.12716 0.00000 ## kidslt6 -1.41915 0.22987 -6.17366 0.00000 ## kidsge6 0.07470 0.08919 0.83753 0.40229 logit_glm = smf.logit("inlf ~ 1 + educ + exper + np.power(exper, 2) + age + kidslt6 + kidsge6", data = dt5_train).fit(disp = 0) print(logit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. z P>|z| [0.025 0.975] ## Intercept 1.17661 1.02513 1.14776 0.25107 -0.83262 3.18584 ## educ 0.17237 0.04844 3.55834 0.00037 0.07743 0.26731 ## exper 0.20720 0.03857 5.37172 0.00000 0.13160 0.28280 ## np.power(exper, 2) -0.00293 0.00124 -2.36144 0.01820 -0.00535 -0.00050 ## age -0.08862 0.01729 -5.12677 0.00000 -0.12251 -0.05474 ## kidslt6 -1.41915 0.22989 -6.17327 0.00000 -1.86971 -0.96858 ## kidsge6 0.07470 0.08919 0.83749 0.40232 -0.10011 0.24951 We note that exper^2 is significant. 
Before removing kidsge6 - let’s try to include the interaction term, which we have been considering: logit_glm <- glm(inlf ~ educ + exper + I(exper^2) + age + kidslt6*kidsge6, data = dt5_train, family = binomial(link = "logit")) print(round(coef(summary(logit_glm)), 5)) ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 1.70098 1.04919 1.62123 0.10497 ## educ 0.19791 0.05020 3.94197 0.00008 ## exper 0.21693 0.03877 5.59589 0.00000 ## I(exper^2) -0.00318 0.00123 -2.57767 0.00995 ## age -0.10395 0.01832 -5.67236 0.00000 ## kidslt6 -2.28803 0.37195 -6.15140 0.00000 ## kidsge6 -0.08289 0.09919 -0.83566 0.40334 ## kidslt6:kidsge6 0.53630 0.16900 3.17341 0.00151 logit_glm = smf.logit("inlf ~ 1 + educ + exper + np.power(exper, 2) + age + kidslt6*kidsge6", data = dt5_train).fit(disp = 0) print(logit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. z P>|z| [0.025 0.975] ## Intercept 1.70098 1.04919 1.62123 0.10497 -0.35540 3.75735 ## educ 0.19791 0.05020 3.94197 0.00008 0.09951 0.29631 ## exper 0.21693 0.03877 5.59589 0.00000 0.14095 0.29291 ## np.power(exper, 2) -0.00318 0.00123 -2.57767 0.00995 -0.00560 -0.00076 ## age -0.10395 0.01832 -5.67236 0.00000 -0.13986 -0.06803 ## kidslt6 -2.28803 0.37195 -6.15140 0.00000 -3.01705 -1.55902 ## kidsge6 -0.08289 0.09919 -0.83566 0.40334 -0.27729 0.11152 ## kidslt6:kidsge6 0.53630 0.16900 3.17341 0.00151 0.20507 0.86753 It is significant. But the squared terms of the interaction variables are not: logit_glm <- glm(inlf ~ educ + exper + I(exper^2) + age + kidslt6*kidsge6 + I(kidslt6^2) + I(kidsge6^2), data = dt5_train, family = binomial(link = "logit")) print(round(coef(summary(logit_glm)), 5)) ## Estimate Std. 
Error z value Pr(>|z|) ## (Intercept) 1.73274 1.09457 1.58304 0.11341 ## educ 0.19860 0.05039 3.94126 0.00008 ## exper 0.21690 0.03890 5.57533 0.00000 ## I(exper^2) -0.00319 0.00124 -2.57792 0.00994 ## age -0.10446 0.01904 -5.48616 0.00000 ## kidslt6 -2.26474 0.74439 -3.04240 0.00235 ## kidsge6 -0.11724 0.22999 -0.50978 0.61021 ## I(kidslt6^2) -0.01828 0.37168 -0.04918 0.96078 ## I(kidsge6^2) 0.00829 0.04956 0.16729 0.86714 ## kidslt6:kidsge6 0.53807 0.17471 3.07977 0.00207 logit_glm = smf.logit("inlf ~ 1 + educ + exper + np.power(exper, 2) + age + kidslt6*kidsge6 + np.power(kidslt6, 2) + np.power(kidsge6, 2)", data = dt5_train).fit(disp = 0) print(logit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. z P>|z| [0.025 0.975] ## Intercept 1.73274 1.09457 1.58304 0.11341 -0.41257 3.87806 ## educ 0.19860 0.05039 3.94126 0.00008 0.09984 0.29736 ## exper 0.21690 0.03890 5.57533 0.00000 0.14065 0.29315 ## np.power(exper, 2) -0.00319 0.00124 -2.57792 0.00994 -0.00562 -0.00076 ## age -0.10446 0.01904 -5.48616 0.00000 -0.14178 -0.06714 ## kidslt6 -2.26474 0.74439 -3.04240 0.00235 -3.72372 -0.80576 ## kidsge6 -0.11724 0.22999 -0.50978 0.61021 -0.56801 0.33352 ## kidslt6:kidsge6 0.53807 0.17471 3.07977 0.00207 0.19564 0.88050 ## np.power(kidslt6, 2) -0.01828 0.37168 -0.04918 0.96078 -0.74677 0.71021 ## np.power(kidsge6, 2) 0.00829 0.04956 0.16729 0.86714 -0.08885 0.10543 Hence, our finalized model is: logit_glm <- glm(inlf ~ educ + exper + I(exper^2) + age + kidslt6*kidsge6, data = dt5_train, family = binomial(link = "logit")) print(round(coef(summary(logit_glm)), 5)) ## Estimate Std. 
Error z value Pr(>|z|) ## (Intercept) 1.70098 1.04919 1.62123 0.10497 ## educ 0.19791 0.05020 3.94197 0.00008 ## exper 0.21693 0.03877 5.59589 0.00000 ## I(exper^2) -0.00318 0.00123 -2.57767 0.00995 ## age -0.10395 0.01832 -5.67236 0.00000 ## kidslt6 -2.28803 0.37195 -6.15140 0.00000 ## kidsge6 -0.08289 0.09919 -0.83566 0.40334 ## kidslt6:kidsge6 0.53630 0.16900 3.17341 0.00151 logit_glm = smf.logit("inlf ~ 1 + educ + exper + np.power(exper, 2) + age + kidslt6*kidsge6", data = dt5_train).fit(disp = 0) print(logit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. z P>|z| [0.025 0.975] ## Intercept 1.70098 1.04919 1.62123 0.10497 -0.35540 3.75735 ## educ 0.19791 0.05020 3.94197 0.00008 0.09951 0.29631 ## exper 0.21693 0.03877 5.59589 0.00000 0.14095 0.29291 ## np.power(exper, 2) -0.00318 0.00123 -2.57767 0.00995 -0.00560 -0.00076 ## age -0.10395 0.01832 -5.67236 0.00000 -0.13986 -0.06803 ## kidslt6 -2.28803 0.37195 -6.15140 0.00000 -3.01705 -1.55902 ## kidsge6 -0.08289 0.09919 -0.83566 0.40334 -0.27729 0.11152 ## kidslt6:kidsge6 0.53630 0.16900 3.17341 0.00151 0.20507 0.86753 Again note that kidslt6:kidsge6 is significant, so we cannot remove the main effect kidsge6, even though it is significant (unless we want to make interpretation harder for ourselves, which we don’t). The same model is estimated for the probit regression: probit_glm <- glm(formula(logit_glm), data = dt5_train, family = binomial(link = "probit")) print(round(coef(summary(probit_glm)), 5)) ## Estimate Std. 
Error z value Pr(>|z|) ## (Intercept) 0.95926 0.60504 1.58545 0.11286 ## educ 0.11640 0.02866 4.06061 0.00005 ## exper 0.12800 0.02226 5.75096 0.00000 ## I(exper^2) -0.00188 0.00071 -2.65361 0.00796 ## age -0.06025 0.01036 -5.81485 0.00000 ## kidslt6 -1.34130 0.21021 -6.38088 0.00000 ## kidsge6 -0.04653 0.05730 -0.81207 0.41675 ## kidslt6:kidsge6 0.31497 0.09829 3.20436 0.00135 probit_glm = smf.probit(logit_glm.model.formula, data = dt5_train).fit(disp = 0) print(probit_glm.summary2().tables[1].round(5)) ## Coef. Std.Err. z P>|z| [0.025 0.975] ## Intercept 0.95926 0.60729 1.57958 0.11420 -0.23100 2.14953 ## educ 0.11640 0.02868 4.05878 0.00005 0.06019 0.17260 ## exper 0.12800 0.02195 5.83111 0.00000 0.08498 0.17102 ## np.power(exper, 2) -0.00188 0.00070 -2.69562 0.00703 -0.00324 -0.00051 ## age -0.06025 0.01031 -5.84510 0.00000 -0.08045 -0.04005 ## kidslt6 -1.34130 0.20747 -6.46511 0.00000 -1.74793 -0.93467 ## kidsge6 -0.04653 0.05694 -0.81717 0.41383 -0.15813 0.06507 ## kidslt6:kidsge6 0.31497 0.09781 3.22010 0.00128 0.12326 0.50668 The coefficients are also significant. #### 5.4.2.2 Task 6 We will calculate the predicted probability for different variable value combinations in a single plot. To do this we will use: • the range of ages in the training dataset as continuous data • and the number of kids less than 6 years old, kidslt6, as discrete data, since the number of kids is a finite number from the training dataset. Note that the amount of different values for age is: print(max(dt5_train$age) - min(dt5_train$age) + 1) ## [1] 31 print(dt5_train["age"].max() - dt5_train["age"].min() + 1) ## 31.0 So, we will need to create a dataset for prediction, which consists of 31 different values of age, ceteris paribus. 
To do this, we will firstly repeat the first line of the data 31 times: predict_data_0 <- dt5[rep(1, 31), , drop = FALSE] rownames(predict_data_0) <- NULL print(head(predict_data_0, 5)) ## inlf hours kidslt6 kidsge6 age educ wage repwage hushrs husage huseduc huswage faminc mtr motheduc fatheduc unem city exper nwifeinc lwage expersq ## 1 1 1610 1 0 32 12 3.354 2.65 2708 34 12 4.0288 16310 0.7215 12 7 5 0 14 10.91006 1.210154 196 ## 2 1 1610 1 0 32 12 3.354 2.65 2708 34 12 4.0288 16310 0.7215 12 7 5 0 14 10.91006 1.210154 196 ## 3 1 1610 1 0 32 12 3.354 2.65 2708 34 12 4.0288 16310 0.7215 12 7 5 0 14 10.91006 1.210154 196 ## 4 1 1610 1 0 32 12 3.354 2.65 2708 34 12 4.0288 16310 0.7215 12 7 5 0 14 10.91006 1.210154 196 ## 5 1 1610 1 0 32 12 3.354 2.65 2708 34 12 4.0288 16310 0.7215 12 7 5 0 14 10.91006 1.210154 196 predict_data_0 = dt5.iloc[[0]*31].reset_index(drop = True) # print(predict_data_0.head(5)) ## inlf hours kidslt6 kidsge6 age educ wage repwage hushrs husage huseduc huswage faminc mtr motheduc fatheduc unem city exper nwifeinc lwage expersq ## 0 1.0 1610.0 1.0 0.0 32.0 12.0 3.354 2.65 2708.0 34.0 12.0 4.0288 16310.0 0.7215 12.0 7.0 5.0 0.0 14.0 10.91006 1.210154 196.0 ## 1 1.0 1610.0 1.0 0.0 32.0 12.0 3.354 2.65 2708.0 34.0 12.0 4.0288 16310.0 0.7215 12.0 7.0 5.0 0.0 14.0 10.91006 1.210154 196.0 ## 2 1.0 1610.0 1.0 0.0 32.0 12.0 3.354 2.65 2708.0 34.0 12.0 4.0288 16310.0 0.7215 12.0 7.0 5.0 0.0 14.0 10.91006 1.210154 196.0 ## 3 1.0 1610.0 1.0 0.0 32.0 12.0 3.354 2.65 2708.0 34.0 12.0 4.0288 16310.0 0.7215 12.0 7.0 5.0 0.0 14.0 10.91006 1.210154 196.0 ## 4 1.0 1610.0 1.0 0.0 32.0 12.0 3.354 2.65 2708.0 34.0 12.0 4.0288 16310.0 0.7215 12.0 7.0 5.0 0.0 14.0 10.91006 1.210154 196.0 Next, we will replace age with values to be within the ranges of the actual data: predict_data_1 <- predict_data_0 predict_data_1$age <- min(dt5_train$age):max(dt5_train$age)
predict_data_1        = predict_data_0.copy()
# Replace the repeated `age` column with every integer age in the training range (inclusive)
predict_data_1["age"] = list(range(int(dt5_train["age"].min()), int(dt5_train["age"].max() + 1)))

And we will separate two cases: kidslt6 = 0 and kidslt6 = 1:

# Copy the age grid, then set the two kidslt6 scenarios
# (the two assignments were jammed onto a single line in the original, which is invalid R)
predict_data_2 <- predict_data_1
#
predict_data_1$kidslt6 <- 0
predict_data_2$kidslt6 <- 1
# Copy the age grid before assigning the two kidslt6 scenarios
predict_data_2 = predict_data_1.copy()
#
# Scenario 1: no kids under 6; scenario 2: one kid under 6
predict_data_1["kidslt6"] = 0
predict_data_2["kidslt6"] = 1

Then, the predicted values are (here we also calculate their standard errors, which we will use to calculate the confidence bounds):

# Predicted probabilities on the response scale, with standard errors (se.fit = TRUE)
predict_1 <- predict(logit_glm, newdata = predict_data_1, type = "response", se.fit = TRUE)
predict_2 <- predict(logit_glm, newdata = predict_data_2, type = "response", se.fit = TRUE)
# statsmodels' .predict() returns point predictions only; the standard errors
# are derived manually below from the parameter covariance matrix (delta method)
predict_1 = logit_glm.predict(exog = predict_data_1)
predict_2 = logit_glm.predict(exog = predict_data_2)

The parameter variance-covariance matrix, $$\mathbb{V}{\rm ar}( \widehat{\boldsymbol{\beta}})$$:

print(vcov(logit_glm))
##                   (Intercept)          educ         exper    I(exper^2)           age       kidslt6       kidsge6 kidslt6:kidsge6
## (Intercept)      1.1007987013 -2.844867e-02 -2.930640e-03  1.411362e-04 -1.508642e-02 -1.278700e-01 -4.944921e-02    2.712274e-02
## educ            -0.0284486727  2.520516e-03 -1.496552e-04  5.820870e-06 -1.804033e-05 -4.677607e-03 -5.880139e-05    1.352341e-03
## exper           -0.0029306398 -1.496552e-04  1.502766e-03 -4.365827e-05 -7.334632e-05 -1.375430e-03  8.216827e-05    7.159497e-04
## I(exper^2)       0.0001411362  5.820870e-06 -4.365827e-05  1.524588e-06 -5.787982e-07  2.636548e-05  3.380823e-06   -1.767184e-05
## age             -0.0150864155 -1.804033e-05 -7.334632e-05 -5.787982e-07  3.358019e-04  3.584616e-03  7.871850e-04   -8.362006e-04
## kidslt6         -0.1278700208 -4.677607e-03 -1.375430e-03  2.636548e-05  3.584616e-03  1.383491e-01  1.643111e-02   -4.760569e-02
## kidsge6         -0.0494492103 -5.880139e-05  8.216827e-05  3.380823e-06  7.871850e-04  1.643111e-02  9.837965e-03   -7.636762e-03
## kidslt6:kidsge6  0.0271227419  1.352341e-03  7.159497e-04 -1.767184e-05 -8.362006e-04 -4.760569e-02 -7.636762e-03    2.856037e-02
print(logit_glm.cov_params())
##                     Intercept      educ     exper  np.power(exper, 2)           age   kidslt6   kidsge6  kidslt6:kidsge6
## Intercept            1.100799 -0.028449 -0.002931        1.411362e-04 -1.508642e-02 -0.127870 -0.049449         0.027123
## educ                -0.028449  0.002521 -0.000150        5.820871e-06 -1.804037e-05 -0.004678 -0.000059         0.001352
## exper               -0.002931 -0.000150  0.001503       -4.365828e-05 -7.334637e-05 -0.001375  0.000082         0.000716
## np.power(exper, 2)   0.000141  0.000006 -0.000044        1.524588e-06 -5.787973e-07  0.000026  0.000003        -0.000018
## age                 -0.015086 -0.000018 -0.000073       -5.787973e-07  3.358020e-04  0.003585  0.000787        -0.000836
## kidslt6             -0.127870 -0.004678 -0.001375        2.636550e-05  3.584617e-03  0.138349  0.016431        -0.047606
## kidsge6             -0.049449 -0.000059  0.000082        3.380825e-06  7.871852e-04  0.016431  0.009838        -0.007637
## kidslt6:kidsge6      0.027123  0.001352  0.000716       -1.767184e-05 -8.362008e-04 -0.047606 -0.007637         0.028560

We can create the design $$\mathbf{X}$$ matrix using the model formulas as follows:

# Build the design matrix X for the prediction data from the fitted model's formula
pd_1 <- model.matrix(formula(logit_glm), predict_data_1)
print(head(pd_1, 5))
##   (Intercept) educ exper I(exper^2) age kidslt6 kidsge6 kidslt6:kidsge6
## 1           1   12    14        196  30       0       0               0
## 2           1   12    14        196  31       0       0               0
## 3           1   12    14        196  32       0       0               0
## 4           1   12    14        196  33       0       0               0
## 5           1   12    14        196  34       0       0               0
# patsy needs only the right-hand side of the formula, hence the split on " ~ "
pd_1 = patsy.dmatrix(logit_glm.model.formula.split(" ~ ")[1], predict_data_1, return_type = "dataframe")
print(pd_1.head(5))
##    Intercept  educ  exper  np.power(exper, 2)   age  kidslt6  kidsge6  kidslt6:kidsge6
## 0        1.0  12.0   14.0               196.0  30.0      0.0      0.0              0.0
## 1        1.0  12.0   14.0               196.0  31.0      0.0      0.0              0.0
## 2        1.0  12.0   14.0               196.0  32.0      0.0      0.0              0.0
## 3        1.0  12.0   14.0               196.0  33.0      0.0      0.0              0.0
## 4        1.0  12.0   14.0               196.0  34.0      0.0      0.0              0.0

Next, we can calculate $$\mathbb{V}{\rm ar}(\mathbf{X} \widehat{\boldsymbol{\beta}})$$:

# Delta-method standard errors of the linear predictor: sqrt of the diagonal of X V X'
# (despite the name, vcov_pd_1 holds standard errors, not a covariance matrix)
vcov_pd_1 <- sqrt(diag(pd_1 %*% vcov(logit_glm) %*% t(pd_1)))
print(unname(vcov_pd_1))
##  [1] 0.4006169 0.3848265 0.3692703 0.3539793 0.3389892 0.3243418 0.3100858 0.2962775 0.2829825 0.2702766 0.2582468 0.2469919 0.2366224 0.2272597 0.2190327 0.2120739 0.2065113 0.2024601 0.2000123 0.1992268 0.2001233 0.2026794 0.2068336 0.2124923 0.2195390 0.2278452 0.2372784 0.2477101 0.2590195 0.2710970 0.2838443
# Python equivalent: SE of X*beta via sqrt(diag(X V X')) using the fitted covariance
vcov_pd_1 = np.sqrt(np.diag(pd_1.dot(logit_glm.cov_params()).dot(pd_1.T)))
print(vcov_pd_1)
## [0.40061697 0.38482656 0.36927039 0.35397932 0.33898923 0.32434187
##  0.31008579 0.29627749 0.28298253 0.27027666 0.25824686 0.24699192
##  0.23662245 0.22725968 0.21903275 0.21207389 0.20651132 0.20246017
##  0.20001229 0.1992268  0.20012328 0.20267939 0.20683363 0.21249228
##  0.21953905 0.22784516 0.23727842 0.24771008 0.25901954 0.27109697
##  0.28384434]

Note that this gives the same results as in R, if we predict the log-odds:

predict(logit_glm, newdata = predict_data_1, type = "link", se.fit = TRUE)$se.fit ## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 ## 0.4006169 0.3848265 0.3692703 0.3539793 0.3389892 0.3243418 0.3100858 0.2962775 0.2829825 0.2702766 0.2582468 0.2469919 0.2366224 0.2272597 0.2190327 0.2120739 0.2065113 0.2024601 0.2000123 0.1992268 0.2001233 0.2026794 0.2068336 0.2124923 0.2195390 0.2278452 0.2372784 0.2477101 0.2590195 0.2710970 0.2838443 Finally, we can do the same for the other data frame: pd_2 <- model.matrix(formula(logit_glm), predict_data_2) vcov_pd_2 <- sqrt(diag(pd_2 %*% vcov(logit_glm) %*% t(pd_2))) print(unname(vcov_pd_2)) ## [1] 0.3431336 0.3354213 0.3285511 0.3225768 0.3175490 0.3135132 0.3105082 0.3085639 0.3077007 0.3079274 0.3092418 0.3116301 0.3150678 0.3195211 0.3249482 0.3313013 0.3385283 0.3465744 0.3553841 0.3649020 0.3750743 0.3858491 0.3971776 0.4090135 0.4213143 0.4340403 0.4471553 0.4606260 0.4744222 0.4885162 0.5028830 pd_2 = patsy.dmatrix(logit_glm.model.formula.split(" ~ ")[1], predict_data_2, return_type = "dataframe") vcov_pd_2 = np.sqrt(np.diag(pd_2.dot(logit_glm.cov_params()).dot(pd_2.T))) print(vcov_pd_2) ## [0.34313361 0.33542129 0.32855107 0.32257678 0.31754899 0.31351322 ## 0.31050817 0.30856396 0.30770068 0.30792743 0.30924182 0.31163009 ## 0.3150678 0.3195211 0.32494823 0.33130133 0.33852828 0.34657442 ## 0.3553841 0.36490204 0.37507431 0.38584917 0.39717758 0.40901355 ## 0.4213143 0.43404032 0.44715531 0.46062603 0.47442219 0.48851621 ## 0.50288306] Considerations As per (this answer on stackoverflow), the standard errors for the type = "response" give a first order approximation for the true standard error. This first order approximation is known as the (Delta method) and an example is given (here). We can compare with the results in R: predict_1$se.fit
##          1          2          3          4          5          6          7          8          9         10         11         12         13         14         15         16         17         18         19         20         21         22         23         24         25         26         27         28         29         30         31
## 0.01286851 0.01361606 0.01438090 0.01516033 0.01595159 0.01675199 0.01755923 0.01837176 0.01918925 0.02001320 0.02084766 0.02170008 0.02258218 0.02351082 0.02450861 0.02560416 0.02683167 0.02822957 0.02983826 0.03169688 0.03383959 0.03629184 0.03906718 0.04216511 0.04557000 0.04925119 0.05316388 0.05725080 0.06144430 0.06566886 0.06984389

As the link function is generally non-linear, it is better to get the confidence interval for the link function, and then map it to the response variable scale.

There is also a problem with the way that glm() produces “prediction” intervals — see (1), (2), (3) and (4).

And finally, we can calculate the prediction interval upper and lower bounds:

# Linear predictor (log-odds scale) for both kidslt6 scenarios
xb_1 <- predict(logit_glm, newdata = predict_data_1, type = "link", se.fit = FALSE)
xb_2 <- predict(logit_glm, newdata = predict_data_2, type = "link", se.fit = FALSE)
#
# Map the 95% CI bounds from the log-odds scale back to the probability scale
inv_logit <- function(z) exp(z) / (1 + exp(z))
predict_1_bounds <- data.frame(lwr = inv_logit(xb_1 - 1.96 * vcov_pd_1),
                               upr = inv_logit(xb_1 + 1.96 * vcov_pd_1))
predict_2_bounds <- data.frame(lwr = inv_logit(xb_2 - 1.96 * vcov_pd_2),
                               upr = inv_logit(xb_2 + 1.96 * vcov_pd_2))
# Linear predictor X*beta (log-odds scale)
# NOTE(review): `linear = True` is deprecated in recent statsmodels in favour of
# `which = "linear"` -- confirm against the installed version.
xb_1 = logit_glm.predict(exog = predict_data_1, linear = True)
xb_2 = logit_glm.predict(exog = predict_data_2, linear = True)
#
# Map the 95% CI bounds from the log-odds scale to probabilities: p = exp(z) / (1 + exp(z))
predict_1_bounds = pd.DataFrame([np.exp(xb_1 - 1.96 * vcov_pd_1)/(1 + np.exp(xb_1 - 1.96 * vcov_pd_1)),
np.exp(xb_1 + 1.96 * vcov_pd_1)/(1 + np.exp(xb_1 + 1.96 * vcov_pd_1))], index = ["lwr", "upr"]).T
predict_2_bounds = pd.DataFrame([np.exp(xb_2 - 1.96 * vcov_pd_2)/(1 + np.exp(xb_2 - 1.96 * vcov_pd_2)),
np.exp(xb_2 + 1.96 * vcov_pd_2)/(1 + np.exp(xb_2 + 1.96 * vcov_pd_2))], index = ["lwr", "upr"]).T

The plot, along with the $$95\%$$ confidence interval for the predicted probability is:

# Scatter the observed 0/1 outcomes against age, then overlay predictions
plot(dt5_train$age, dt5_train$inlf)
title("Predicted probability values of the Logistic Regression with 95% confidence intervals")
# Shade the 95% confidence band for each scenario
# (note: the polygons are drawn after the points, so they partially cover them)
polygon(c(predict_data_1$age, rev(predict_data_1$age)),
c(predict_1_bounds$upr, rev(predict_1_bounds$lwr)),
col = "grey90", border = NA)
polygon(c(predict_data_2$age, rev(predict_data_2$age)),
c(predict_2_bounds$upr, rev(predict_2_bounds$lwr)),
col = "grey90", border = NA)
# Plot the fitted probability values:
lines(predict_data_1$age, predict_1$fit, col = "red")
lines(predict_data_2$age, predict_2$fit, col = "blue")
# Plot the confidence bounds:
lines(predict_data_1$age, predict_1_bounds$lwr, col = "red", lty = 3)
lines(predict_data_1$age, predict_1_bounds$upr, col = "red", lty = 3)
# Plot the confidence bounds
# (predict_data_1$age and predict_data_2$age hold the same grid, so reusing it here is harmless):
lines(predict_data_1$age, predict_2_bounds$lwr, col = "blue", lty = 3)
lines(predict_data_1$age, predict_2_bounds$upr, col = "blue", lty = 3)
#
legend("bottomleft", legend = c("kidslt6 = 0", "kidslt6 = 1"), col = c("red", "blue"), lty = 1)

# matplotlib version of the same plot: observed outcomes, fitted probabilities,
# and 95% confidence bands for the two kidslt6 scenarios
fig = plt.figure(figsize = (10, 8))
#
#
_ = plt.scatter(dt5_train["age"], dt5_train["inlf"], color = "None", edgecolor = "black")
_ = plt.title("Predicted probability values of the Logistic Regression with 95% confidence intervals")
_ = plt.fill_between(predict_data_1["age"], predict_1_bounds["lwr"], predict_1_bounds["upr"], color = "0.8")
_ = plt.fill_between(predict_data_2["age"], predict_2_bounds["lwr"], predict_2_bounds["upr"], color = "0.8")
# Plot the fitted probability values (legend labels fixed: "kidlt6" -> "kidslt6"):
_ = plt.plot(predict_data_1["age"], predict_1, color = "red", label = "kidslt6 = 0")
_ = plt.plot(predict_data_2["age"], predict_2, color = "blue", label = "kidslt6 = 1")
# Plot the confidence bounds for scenario 1:
_ = plt.plot(predict_data_1["age"], predict_1_bounds["lwr"], color = "red", linestyle = "--")
_ = plt.plot(predict_data_1["age"], predict_1_bounds["upr"], color = "red", linestyle = "--")
# Plot the confidence bounds for scenario 2 (use the matching data frame's age grid):
_ = plt.plot(predict_data_2["age"], predict_2_bounds["lwr"], color = "blue", linestyle = "--")
_ = plt.plot(predict_data_2["age"], predict_2_bounds["upr"], color = "blue", linestyle = "--")
#
_ = plt.legend(loc = "lower left")
plt.show()

From the above we see that:

• If someone does not have any kids under 6 years old (i.e. kidslt6 = 0), then the probability that they would return to the workforce is higher than if someone has one child under 6 years (i.e. kidslt6 = 1).

• If a woman is 30 years old and has no children under 6 years old (i.e. age = 30 and kidslt6 = 0), then the probability of returning to the labor force is very high, and the confidence interval is quite narrow.

Considerations

Furthermore, at least visually, the confidence intervals for the predicted probability appear to be smaller for someone with kidslt6 = 0, up until their age is 60. However, how likely is it that someone will have children in their late 50s?

We can even iterate this for various values of number kids under 6 years old:

#
#
#
# Iterate over every observed value of kidslt6 and overlay the logit and probit
# predicted-probability curves on the observed data
predict_data_temp <- predict_data_1
plot(dt5_train$age, dt5_train$inlf)
#
for(i in 0:max(dt5_train$kidslt6)){
  # Set the kidslt6 scenario for this iteration, ceteris paribus
  # (this assignment was accidentally commented out by a stray '#' in the original)
  predict_data_temp$kidslt6 <- i
  #
  tmp_predict_logit  <- predict(logit_glm, newdata = predict_data_temp, type = "response", se.fit = FALSE)
  tmp_predict_probit <- predict(probit_glm, newdata = predict_data_temp, type = "response", se.fit = FALSE)
  #
  # Solid line = logit, dashed = probit; colour indexed by the number of kids
  lines(predict_data_temp$age, tmp_predict_logit, col = i + 1)
  lines(predict_data_temp$age, tmp_predict_probit, col = i + 1, lty = 2)
}
legend(x = 29,  y = 0.5, legend = paste0(0:max(dt5_train$kidslt6)), lty = rep(1, max(dt5_train$kidslt6) + 1), col = 0:max(dt5_train$kidslt6) + 1, cex = 0.8) legend(x = 29, y = 0.6, legend = c("Logit", "Probit"), lty = c(1, 2), col = c(1, 1), cex = 0.8) # title("Probability to return to the workforce based on the number of children under the age of 6\n Logit vs Probit regression") predict_data_temp = predict_data_1.copy() clrs = ["black", "red", "green", "blue"] # fig = plt.figure(figsize = (10, 8)) _ = plt.scatter(dt5_train["age"], dt5_train["inlf"], color = "None", edgecolor = "black") _ = plt.title("Probability to return to the workforce based on the number of children under the age of 6\n Logit vs Probit regression") for i in range(0, int(dt5_train["kidslt6"].max() + 1)): predict_data_temp["kidslt6"] = i tmp_predict_logit = logit_glm.predict(exog = predict_data_temp) tmp_predict_probit = probit_glm.predict(exog = predict_data_temp) # _ = plt.plot(predict_data_temp["age"], tmp_predict_logit, color = clrs[i], linestyle = "-") _ = plt.plot(predict_data_temp["age"], tmp_predict_probit, color = clrs[i], linestyle = "--") # _ = plt.legend(loc = "center left", labels = ["Logit", "Probit"] + list(range(0, int(dt5_train["kidslt6"].max() + 1))), handles = [Line2D([0], [0], color = "black", linestyle = "-"), Line2D([0], [0], color = "black", linestyle = "--")] + [Line2D([0], [0], color = i, linestyle = "-") for i in ["black", "red", "green", "blue"]]) _ = plt.xlabel("age") plt.show() • We can conclude that the more children under 6 years old are in the family, the less likely that the mother would return to the labor force. If kidslt6 = 3, then the predicted probability is very close to zero, regardless of age. • The logit and probit models provide similar predicted probability values, hence we see little to no reason to use one over the other in terms of the prediction. 
Finally, we can also do the same with kids over 6 years kidsge6: predict_data_temp <- predict_data_1 predict_data_temp$kidslt6 <- 0
#
plot(dt5_train$age, dt5_train$inlf)
# Same exercise for the number of kids aged 6-18 (kidsge6), ceteris paribus
for(i in 0:max(dt5_train$kidsge6)){
  predict_data_temp$kidsge6 <- i
  tmp_predict_logit  <- predict(logit_glm, newdata = predict_data_temp, type = "response", se.fit = FALSE)
  tmp_predict_probit <- predict(probit_glm, newdata = predict_data_temp, type = "response", se.fit = FALSE)
  #
  # Solid line = logit, dashed = probit; colour indexed by the number of kids
  # (the two lines() calls were jammed onto one line in the original, which is invalid R)
  lines(predict_data_temp$age, tmp_predict_logit, col = i + 1)
  lines(predict_data_temp$age, tmp_predict_probit, col = i + 1, lty = 2)
}
legend(x = 29,  y = 0.5, legend = paste0(0:max(dt5_train$kidsge6)), lty = rep(1, max(dt5_train$kidsge6) + 1), col = 0:max(dt5_train$kidsge6) + 1, cex = 0.8) legend(x = 29, y = 0.6, legend = c("Logit", "Probit"), lty = c(1, 2), col = c(1, 1), cex = 0.8) # title("Probability to return to the workforce based on the number of children OVER the age of 6\n Logit vs Probit regression") predict_data_temp = predict_data_1.copy() predict_data_temp["kidslt6"] = 0 # fig = plt.figure(figsize = (10, 8)) _ = plt.scatter(dt5_train["age"], dt5_train["inlf"], color = "None", edgecolor = "black") _ = plt.title("Probability to return to the workforce based on the number of children OVER the age of 6\n Logit vs Probit regression") for i in range(0, int(dt5_train["kidsge6"].max() + 1)): predict_data_temp["kidsge6"] = i tmp_predict_logit = logit_glm.predict(exog = predict_data_temp) tmp_predict_probit = probit_glm.predict(exog = predict_data_temp) # _ = plt.plot(predict_data_temp["age"], tmp_predict_logit, label = "Logit : kidsge6 = " + str(i), linestyle = "-") _ = plt.plot(predict_data_temp["age"], tmp_predict_probit, label = "Probit: kidsge6 = " + str(i), linestyle = "--") # _ = plt.legend(loc = "center left") _ = plt.xlabel("age") plt.show() • We see a similar trend - the more children there are, the less likely that the mother would return to the labor force. • On the other hand, if we compare with the previous plots with the changes in kidslt6 - we see that the decrease in probability of being in the labor force with each additional kidsge6 is less than the decrease in probability with each additional kidslt6. Considerations Note that both for kidslt6 and kidsge6, we have only used existing cases, i.e. up to the maximum value from the dataset. How much could we trust our model(-s) to predict for higher values of kidslt6 and kidsge6, than in the dataset? 
#### 5.4.2.3 Task 7 We begin by examining the results using the default cutoff $$0.5$$: predicted_probs <- predict(logit_glm, dt5_train, type = "response") tmp_out <- InformationValue::confusionMatrix(dt5_train$inlf, predicted_probs, threshold = 0.5)
print(tmp_out)
##    0   1
## 0 77  42
## 1 97 386
# In-sample predicted probabilities from the fitted logit model
predicted_probs = logit_glm.predict(exog = dt5_train)
# Confusion matrix at the default 0.5 cutoff; sklearn layout: rows = actual, columns = predicted
tmp_out = metrics.confusion_matrix(logit_glm.model.endog, np.where(predicted_probs >= 0.5, 1, 0))
print(tmp_out)
## [[ 77  97]
##  [ 42 386]]

We compare the results with the true values:

print(table(dt5_train$inlf)) ## ## 0 1 ## 174 428 print(dt5_train["inlf"].value_counts()) ## 1.0 428 ## 0.0 174 ## Name: inlf, dtype: int64 Considerations From the above results it may be easier to format the results as: colnames(tmp_out) <- paste0("Actual ", colnames(tmp_out)) rownames(tmp_out) <- paste0("Predicted ", rownames(tmp_out)) print(tmp_out) ## Actual 0 Actual 1 ## Predicted 0 77 42 ## Predicted 1 97 386 tmp_out = pd.DataFrame(tmp_out, columns = ["Predicted 0", "Predicted 1"], index = ["Actual 0", "Actual 1"]) print(tmp_out) ## Predicted 0 Predicted 1 ## Actual 0 77 97 ## Actual 1 42 386 This also highlights the differences in the way that the results are formatted in R and Python We see that: • We have correctly predicted that 386 women will return to the labor force. • Furthermore, we have correctly predicted that 77 women will not return to the labor force. On the other hand: • We have incorrectly predicted that 42 women will return to the labor force (they actually did not return); • We have incorrectly predicted that 97 women will not return to the labor force (they actually did return); Considerations In total, if we were interested in determining the amount of jobs needed for women, returning to the labor force - the difference between the number of predicted returns, and the true returns to labor force is: print(tmp_out['Predicted 1', 'Actual 1'] + tmp_out['Predicted 1', 'Actual 0'] - table(dt5_train$inlf)[2])
##  1
## 55
# Total predicted returns (true positives + false positives) minus the actual
# number of women in the labor force -> the over-estimate of jobs needed
print(tmp_out.loc['Actual 1', 'Predicted 1'] +
tmp_out.loc['Actual 0', 'Predicted 1'] -
dt5_train["inlf"].value_counts()[1])
## 55

In other words, we would have overestimated the overall amount of women returning to the workforce. This would have several complications:

• too many vacant job positions may be created;
• too many school/daycare workers would be hired (as some of the women decided to remain at home to take care of their children);
• if we made any other decisions regarding compensation/wage based on the expected value of returns to labor force - it would be overestimated;

If we were to use the optimal cutoff, which is based on minimizing the misclassification error:

#
#
#
#
#
#
#
optimalCutoff_logit <- InformationValue::optimalCutoff(dt5_train$inlf, predicted_probs, optimiseFor = "misclasserror") print(optimalCutoff_logit) ## [1] 0.4551387 # Similar to optimalCutoff from R: https://github.com/selva86/InformationValue/blob/master/R/Main.R thresholds = np.arange(start = np.max(predicted_probs), stop = np.min(predicted_probs), step = -0.01) # misclasserror = [] for k in thresholds: y_pred = np.where(predicted_probs >= k, 1, 0) misclasserror.append(1 - metrics.accuracy_score(dt5_train['inlf'], y_pred)) # optimalCutoff_logit = thresholds[misclasserror == np.min(misclasserror)][0] print(optimalCutoff_logit) ## 0.45513867283261955 The optimal cutoff is around 0.045 less than the default - this seems like a small value, but the final confusion matrix is now different: tmp_out_opt <- InformationValue::confusionMatrix(dt5_train$inlf,
predicted_probs, threshold = optimalCutoff_logit)
colnames(tmp_out_opt) <- paste0("Actual ", colnames(tmp_out_opt))
rownames(tmp_out_opt) <- paste0("Predicted ", rownames(tmp_out_opt))
print(tmp_out_opt)
##             Actual 0 Actual 1
## Predicted 0       71       27
## Predicted 1      103      401
# Confusion matrix at the optimal (misclassification-error-minimizing) cutoff
tmp_out_opt = metrics.confusion_matrix(logit_glm.model.endog,
np.where(predicted_probs >= optimalCutoff_logit, 1, 0))
# Label sklearn's layout explicitly: rows = actual class, columns = predicted class
tmp_out_opt = pd.DataFrame(tmp_out_opt, columns = ["Predicted 0", "Predicted 1"],
index = ["Actual 0", "Actual 1"])
print(tmp_out_opt)
##           Predicted 0  Predicted 1
## Actual 0           71          103
## Actual 1           27          401

Take note, whether the columns (as in R), or the rows (as in Python) sum up to the true totals.

• the amount of correct returns to labor force predictions increased from 386 to 401 (GOOD);
• the amount of correct non-returns to labor force predictions decreased from 77 to 71 (BAD);
• the amount of incorrect returns to labor force predictions decreased from 42 to 27 (GOOD);
• the amount of incorrect non-returns to labor force predictions increased from 97 to 103 (BAD);

Considerations

In total, if we were interested in determining the amount of jobs needed for women, returning to the labor force - the difference between the number of predicted returns, and the true returns to labor force is:

print(tmp_out_opt['Predicted 1', 'Actual 1'] +
tmp_out_opt['Predicted 1', 'Actual 0'] -
table(dt5_train$inlf)[2]) ## 1 ## 76 print(tmp_out_opt.loc['Actual 1', 'Predicted 1'] + tmp_out_opt.loc['Actual 0', 'Predicted 1'] - dt5_train["inlf"].value_counts()[1]) ## 76 We see an important result: • Our total prediction of correct labor force returns increased; • Our incorrect predictions resulted in even more vacant jobs created than before In general it is important to determine what our objective is: • If we are interested in the correct prediction on an individual level, for example, if our correct predictions allow us to create specific specialized job positions - then the increased correct predictions - True Positive Rate ($$TRP$$) - outweigh the False Positive Rate ($$FPR$$) increase. Compared to the default cutoff: print((tmp_out_opt['Predicted 1', 'Actual 1'] / tmp_out['Predicted 1', 'Actual 1'] - 1) * 100) ## [1] 3.88601 print((tmp_out_opt.loc['Actual 1', 'Predicted 1'] / tmp_out.loc['Actual 1', 'Predicted 1'] - 1) * 100) ## 3.886010362694292 We have correctly classified $$3.89\%$$ more returns to labor force with the optimal cutoff. On the other hand: print((tmp_out_opt['Predicted 1', 'Actual 0'] / tmp_out['Predicted 1', 'Actual 0'] - 1) * 100) ## [1] 6.185567 print((tmp_out_opt.loc['Actual 0', 'Predicted 1'] / tmp_out.loc['Actual 0', 'Predicted 1'] - 1) * 100) ## 6.185567010309279 We have incorrectly classified $$6.19\%$$ more false returns to labor force (i.e. we predicted 1, but it was actually 0) using the optimal cutoff, which is based on minimizing the misclassification error. If we want to look at the total correctly predicted returns to labor out of all predicted returns to labor (i.e. predicted 1’s): # # # print(InformationValue::precision(dt5_train$inlf, predicted_probs, threshold = 0.5))
## [1] 0.7991718
# precision_recall_fscore_support returns four arrays (one per row below),
# with one column per class
tmp_diagnostics = pd.DataFrame(
metrics.precision_recall_fscore_support(logit_glm.model.endog, np.where(predicted_probs >= 0.5, 1, 0)),
index = ["precision","recall", "fbeta_score", "support"])
# Row 0 = precision; column 1 = class "1" (returned to the labor force)
print(tmp_diagnostics.iloc[0, 1])
## 0.7991718426501035
#
#
#
print(InformationValue::precision(dt5_train$inlf, predicted_probs, threshold = optimalCutoff_logit)) ## [1] 0.7956349 tmp_diagnostics = pd.DataFrame( metrics.precision_recall_fscore_support(logit_glm.model.endog, np.where(predicted_probs >= optimalCutoff_logit, 1, 0)), index = ["precision","recall", "fbeta_score", "support"]) print(tmp_diagnostics.iloc[0, 1])  ## 0.7956349206349206 The optimal cutoff gives a slightly lower precision value, indicating that our model may be too optimistic (we are assuming that inlf = 1 - the return/participation in the labor force - is a positive result). On the other hand, basing our analysis on the results using the optimal cutoff may result in the creation of more job positions - if this is important to our analysis, we may need to fine-tune the cutoff further. Considerations See 5.2.6.5, 5.2.6.7.1 and 5.2.6.8 for alternative methods for choosing the optimal cutoff. Comparing with the probit model: predicted_probs <- predict(probit_glm, dt5_train, type = "response") tmp_out <- InformationValue::confusionMatrix(dt5_train$inlf, predicted_probs, threshold = 0.5)
colnames(tmp_out) <- paste0("Actual ", colnames(tmp_out))
rownames(tmp_out) <- paste0("Predicted ", rownames(tmp_out))
print(tmp_out)
##             Actual 0 Actual 1
## Predicted 0       77       42
## Predicted 1       97      386
# Same evaluation for the probit model at the default 0.5 cutoff
predicted_probs = probit_glm.predict(exog = dt5_train)
tmp_out = metrics.confusion_matrix(probit_glm.model.endog, np.where(predicted_probs >= 0.5, 1, 0))
# rows = actual class, columns = predicted class
tmp_out = pd.DataFrame(tmp_out, columns = ["Predicted 0", "Predicted 1"],
index = ["Actual 0", "Actual 1"])
print(tmp_out)
##           Predicted 0  Predicted 1
## Actual 0           77           97
## Actual 1           42          386

We see that the default cutoff produces the same results. Meanwhile, the optimal cutoff:

#
#
#
#
#
#
optimalCutoff_probit <- InformationValue::optimalCutoff(dt5_train$inlf, predicted_probs, optimiseFor = "misclasserror") print(optimalCutoff_probit) ## [1] 0.6030785 thresholds = np.arange(start = np.max(predicted_probs), stop = np.min(predicted_probs), step = -0.01) # misclasserror = [] for k in thresholds: y_pred = np.where(predicted_probs >= k, 1, 0) misclasserror.append(1 - metrics.accuracy_score(dt5_train['inlf'], y_pred)) # optimalCutoff_probit = thresholds[misclasserror == np.min(misclasserror)][0] print(optimalCutoff_probit) ## 0.6030785219207573 is a little bit higher, compared to the logit case. This results in the following confusion matrix table using the optimal cutoff: tmp_out_opt <- InformationValue::confusionMatrix(dt5_train$inlf,
predicted_probs, threshold = optimalCutoff_probit)
colnames(tmp_out_opt) <- paste0("Actual ", colnames(tmp_out_opt))
rownames(tmp_out_opt) <- paste0("Predicted ", rownames(tmp_out_opt))
print(tmp_out_opt)
##             Actual 0 Actual 1
## Predicted 0      107       63
## Predicted 1       67      365
# Probit confusion matrix at its own optimal (misclassification-minimizing) cutoff
tmp_out_opt = metrics.confusion_matrix(probit_glm.model.endog,
np.where(predicted_probs >= optimalCutoff_probit, 1, 0))
# rows = actual class, columns = predicted class
tmp_out_opt = pd.DataFrame(tmp_out_opt, columns = ["Predicted 0", "Predicted 1"],
index = ["Actual 0", "Actual 1"])
print(tmp_out_opt)
##           Predicted 0  Predicted 1
## Actual 0          107           67
## Actual 1           63          365

Our insights are now slightly different:

• The amount of correctly predicted returns to labor force decreased from 386 to 365 (BAD);
• The amount of correctly predicted non-returns to labor increased from 77 to 107 (GOOD);
• The amount of incorrectly predicted non-returns to labor increased from 42 to 63 (BAD);
• The amount of incorrectly predicted returns to labor decreased from 97 to 67 (GOOD);

So, while the models are close, the automated optimal cutoff procedures give slightly different optimal cutoff points.

Considerations

Now, the total amount of jobs that we would create is:

print(tmp_out_opt['Predicted 1', 'Actual 1'] +
tmp_out_opt['Predicted 1', 'Actual 0'] -
table(dt5_train$inlf)[2]) ## 1 ## 4 print(tmp_out_opt.loc['Actual 1', 'Predicted 1'] + tmp_out_opt.loc['Actual 0', 'Predicted 1'] - dt5_train["inlf"].value_counts()[1]) ## 4 We would predict 4 more people returning to the labor force, than would return in reality On the other hand, we may have just gotten lucky with the amount of false positives. After all, our model has to accurately predict the true 1. Furthermore, we now incorrectly classify that a woman will not return to the labor force (when in fact she will return): print((tmp_out_opt['Predicted 0', 'Actual 1'] / tmp_out['Predicted 0', 'Actual 1'] - 1) * 100) ## [1] 50 print((tmp_out_opt.loc['Actual 1', 'Predicted 0'] / tmp_out.loc['Actual 1', 'Predicted 0'] - 1) * 100) ## 50.0 With the optimal cutoff, we would make this incorrect classification $$50\%$$ more than with the default cutoff! If it is vital that we correctly classify those women, who will return to the labor force, as returning to the labor force (maybe based on their work experience, number of children, and other factors, which are specifically different for these misclassification cases), then this is quite bad. Furthermore: print((tmp_out_opt['Predicted 1', 'Actual 1'] / tmp_out['Predicted 1', 'Actual 1'] - 1) * 100) ## [1] -5.440415 print((tmp_out_opt.loc['Actual 1', 'Predicted 1'] / tmp_out.loc['Actual 1', 'Predicted 1'] - 1) * 100) ## -5.440414507772018 While our overall amount of predicted labor is close to the true amount of women returning to the labor force, We see that, using the optimal cutoff, we have made more mistakes in classifying which specific women will return to the labor force. If the women are identical in terms of family composition, education, age and all other factors - this may not be a problem. In reality, unfortunately, this is not the case - we may have misclassified women, which have more experience, as not returning to the workforce, while women, with less - experience -as returning. 
As a result, we may opt to create more lower skill job positions, for which the actual women returning to the workforce will be overqualified, which may result in those women not taking the jobs and either being unemployed, or - even worse - leaving the workforce altogether! On the other hand, our model precision appears to have increased: # # # print(InformationValue::precision(dt5_train$inlf, predicted_probs, threshold = optimalCutoff_probit))
## [1] 0.8449074
# Precision/recall/F-score/support per class for the probit model at the optimal cutoff
tmp_diagnostics = pd.DataFrame(
metrics.precision_recall_fscore_support(probit_glm.model.endog, np.where(predicted_probs >= optimalCutoff_probit, 1, 0)),
index = ["precision","recall", "fbeta_score", "support"])
# Row 0 = precision; column 1 = class "1" (returned to the labor force)
print(tmp_diagnostics.iloc[0, 1])
## 0.8449074074074074

So, looking at it from an economic/policymaker perspective - it is quite difficult to determine what is more important - it all depends on our task and the data that we have. Consequently, we need to take care when analysing this type of data as the results of $$TPR$$, $$FPR$$, $$Precision$$, $$Accuracy$$ and any other statistic may not be as useful in every case.

# Compute the ROC points (TPR, FPR) for the logit and probit models over a grid
# of cutoffs. The original jammed two statements onto one line inside the inner
# loop, which is invalid R; 1:ncol() is replaced by the safer seq_len().
thresholds <- seq(from = 0.01, to = 0.99, by = 0.01)
ppred <- data.frame(logit = predict(logit_glm, dt5_train, type = "response"),
probit = predict(probit_glm, dt5_train, type = "response"))
TPR <- NULL
FPR <- NULL
#
for (j in seq_len(ncol(ppred))) {
  tpr <- NULL
  fpr <- NULL
  for (i in thresholds) {
    # Sensitivity = TPR; 1 - specificity = FPR, evaluated at cutoff i
    tpr <- c(tpr, InformationValue::sensitivity(dt5_train$inlf, ppred[, j], threshold = i))
    fpr <- c(fpr, 1 - InformationValue::specificity(dt5_train$inlf, ppred[, j], threshold = i))
  }
  # One column per model
  TPR <- cbind(TPR, tpr)
  FPR <- cbind(FPR, fpr)
}
#
#
# Python equivalent of the R loop above: TPR/FPR for both models over
# the same grid of 99 cutoff thresholds.
thresholds = np.arange(start = 0.01, stop = 0.99 + 0.01, step = 0.01)
# Fitted probabilities, column 0 = logit, column 1 = probit.
ppred = pd.DataFrame((logit_glm.predict(exog = dt5_train),
                      probit_glm.predict(exog = dt5_train))).T
TPR = np.empty(shape = (0, 99))
FPR = np.empty(shape = (0, 99))
#
# FIX: the loop bodies below had lost their indentation, which is a
# syntax error in Python; restored here.
for j in range(0, len(ppred.columns)):
    tpr = np.array([])
    fpr = np.array([])
    for i in thresholds:
        # recall of the positive class = sensitivity = TPR
        tpr = np.hstack((tpr,
                         metrics.recall_score(dt5_train[["inlf"]], np.where(ppred[j] >= i, 1, 0))))
        # recall of class 0 (index [1][0]) = specificity, so FPR = 1 - it
        fpr = np.hstack((fpr,
                         1 - metrics.precision_recall_fscore_support(dt5_train[["inlf"]], np.where(ppred[j] >= i, 1, 0))[1][0]))
    #
    TPR = np.vstack((TPR, tpr))
    FPR = np.vstack((FPR, fpr))
#
# Transpose so each model is a column, matching the R layout.
TPR = pd.DataFrame(TPR).T
FPR = pd.DataFrame(FPR).T
#
# Plot the 45-degree reference line (the ROC of a random classifier),
# then overlay one ROC curve per model.
plot(thresholds, thresholds,
lty = 2, col = "red", type = "l", lwd = 2,
xlab = "FPR or (1 - Specificity)",
ylab = "TPR or Sensitivity",
main = "ROC Curve")
# One curve per column of ppred (logit, probit).
for(j in 1:ncol(ppred)){
lines(FPR[, j], TPR[, j], type = "l",
col = c("black", "blue", "darkorange")[j], lwd = 2)
}
# Legend labels come from the model names stored as column names of ppred.
legend("topleft",
legend = colnames(ppred), lty = 1,
col = c("black", "blue", "darkorange"), lwd = 2, cex = 1.5)

# Python equivalent of the R ROC plot above.
fig = plt.figure(figsize = (10, 8))
# 45-degree reference line: the ROC of a random classifier.
_ = plt.plot(thresholds,
             thresholds, linestyle = "--", color = "red")
_ = plt.xlabel("FPR or (1 - Specificity)")
_ = plt.ylabel("TPR or Sensitivity")
_ = plt.title("ROC Curve")
# FIX: the loop body below had lost its indentation (a syntax error
# in Python); restored here.
for j in range(0, len(ppred.columns)):
    _ = plt.plot(FPR[j], TPR[j], linestyle = "-",
                 color = ["black", "blue", "darkorange"][j],
                 label = ["logit", "probit", "random_coin_toss"][j])
#
_ = plt.legend(loc = "upper left")
plt.show()

We do see that the logit and probit models do not differ much in terms of ROC.

Finally, we want to examine the AUC:

# ROCR: wrap the fitted probabilities and the true labels in a
# prediction object, then extract the area under the ROC curve (AUC).
pred_logit  <- ROCR::prediction(predictions = ppred$logit, labels = dt5_train$inlf)
#
auc_logit <- ROCR::performance(pred_logit, measure = "auc")
# performance() returns an S4 object; the AUC value sits in the y.values slot.
auc_logit <- unlist(auc_logit@y.values)
print(auc_logit)
## [1] 0.8091028
#
#
# sklearn equivalent: compute the full ROC curve for the logit model
# (ppred column 0), then integrate it with the trapezoidal rule.
fpr, tpr, thresholds = metrics.roc_curve(dt5_train[["inlf"]], ppred[0], pos_label = 1)
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)
## 0.8091027500268557
# Same AUC calculation for the probit model.
pred_probit  <- ROCR::prediction(predictions = ppred$probit, labels = dt5_train$inlf)
#
auc_probit <- ROCR::performance(pred_probit, measure = "auc")
# Extract the AUC from the y.values slot of the S4 performance object.
auc_probit <- unlist(auc_probit@y.values)
print(auc_probit)
## [1] 0.808861
#
#
# sklearn AUC for the probit model (ppred column 1).
fpr, tpr, thresholds = metrics.roc_curve(dt5_train[["inlf"]], ppred[1], pos_label = 1)
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)
## 0.8088610484477388

Generally, an AUC of $$0.7 - 0.8$$ indicates an adequate model; $$0.5$$ indicates random guessing; $$0.5-0.7$$ indicates a poor model; Ideally, we would strive for the AUC value to be $$> 0.8$$.

The AUC for both logit and probit models is around $$0.8$$, indicating that our estimated models are adequate. The AUC is slightly higher for the logit regression.

Considerations

We can plot both the $$ROC$$ and the $$AUC$$ for each model separately:

def plotROC(fpr, tpr, thresholds):
    """Plot a ROC curve with the AUC annotated inside the plot area.

    Parameters
    ----------
    fpr, tpr : array-like
        False/true positive rates, e.g. as returned by metrics.roc_curve.
    thresholds : array-like
        Cutoff values corresponding to each (fpr, tpr) point; every 10th
        one is printed next to its point on the curve.
    """
    roc_auc = metrics.auc(fpr, tpr)
    #
    fig = plt.figure(figsize = (10, 8))
    # FIX: `ax` was referenced below (set_facecolor) but never created.
    ax = fig.add_subplot(111)
    plt.plot(fpr, tpr, linestyle = "-", color = "cornflowerblue")
    # Fill in the area under the ROC curve
    plt.fill_between(fpr, tpr, 0, color = "cornflowerblue")
    # Pass the annotation text positionally: the keyword was `s` before
    # matplotlib 3.3 and `text` afterwards, so positional works everywhere.
    plt.annotate("AUROC: " + str(roc_auc.round(4)), xy = (0.3, 0.3), color = "white", fontsize = 25)
    # Label every 10th point with its threshold value.
    for i in range(1, len(fpr))[::10]:
        _ = plt.annotate(str(thresholds[i].round(2)), xy = (fpr[i], tpr[i]))
    #
    plt.xlabel("1-Specificity (FPR)", color = "cornflowerblue", fontsize = 25)
    plt.ylabel("Sensitivity (TPR)", color = "cornflowerblue", fontsize = 25)
    plt.title("ROC Curve", color = "cornflowerblue", fontsize = 25)
    # Setting the background color
    ax.set_facecolor("lightgray")
    plt.grid(True, zorder = 0, color = "white")
    # Make sure the grid is in the background
    plt.rcParams['axes.axisbelow'] = True
    plt.tight_layout()
    plt.show()
#
#
InformationValue::plotROC(dt5_train$inlf, ppred$logit, Show.labels = TRUE)

fpr, tpr, thresholds = metrics.roc_curve(dt5_train[["inlf"]], ppred[0], pos_label = 1)
plotROC(fpr, tpr, thresholds)

#
InformationValue::plotROC(dt5_train$inlf, ppred$probit, Show.labels = TRUE)

fpr, tpr, thresholds = metrics.roc_curve(dt5_train[["inlf"]], ppred[1], pos_label = 1)
plotROC(fpr, tpr, thresholds)

Note that the values may be slightly different in R due to the way that the threshold vector is created.

### 5.4.3 Exercise Set 3

The parameters of a linear regression model have a straightforward interpretation:

$\beta_j \text{ measures the effect of a unit change in } X_j \text{ on } \mathbb{E}(Y|\mathbf{X})$

On the other hand, the parameters of nonlinear logit and probit models are not as easy to interpret, since the linear predictor affects the mean through the link function. Nevertheless, we can measure this effect via the marginal effects:

$\dfrac{\partial \mu}{\partial X_j} = \dfrac{\partial g^{-1}(\mathbf{X} \boldsymbol{\beta})}{\partial X_j}$

which have the same interpretations as the coefficients in the linear model.

The partial effects differ by regressor values and make it difficult to present them in a compact way. Therefore, it is common to aggregate the effects in two ways:

• Partial effects at the average value of $$X$$ ($$PEA$$);

• Average partial effects ($$APE$$).

We will begin by examining the coefficients themselves of the logistic regression:

print(round(coef(summary(logit_glm)), 5))
##                 Estimate Std. Error  z value Pr(>|z|)
## (Intercept)      1.70098    1.04919  1.62123  0.10497
## educ             0.19791    0.05020  3.94197  0.00008
## exper            0.21693    0.03877  5.59589  0.00000
## I(exper^2)      -0.00318    0.00123 -2.57767  0.00995
## age             -0.10395    0.01832 -5.67236  0.00000
## kidslt6         -2.28803    0.37195 -6.15140  0.00000
## kidsge6         -0.08289    0.09919 -0.83566  0.40334
## kidslt6:kidsge6  0.53630    0.16900  3.17341  0.00151
print(logit_glm.summary2().tables[1].round(5))
##                       Coef.  Std.Err.        z    P>|z|   [0.025   0.975]
## Intercept           1.70098   1.04919  1.62123  0.10497 -0.35540  3.75735
## educ                0.19791   0.05020  3.94197  0.00008  0.09951  0.29631
## exper               0.21693   0.03877  5.59589  0.00000  0.14095  0.29291
## np.power(exper, 2) -0.00318   0.00123 -2.57767  0.00995 -0.00560 -0.00076
## age                -0.10395   0.01832 -5.67236  0.00000 -0.13986 -0.06803
## kidslt6            -2.28803   0.37195 -6.15140  0.00000 -3.01705 -1.55902
## kidsge6            -0.08289   0.09919 -0.83566  0.40334 -0.27729  0.11152
## kidslt6:kidsge6     0.53630   0.16900  3.17341  0.00151  0.20507  0.86753

In the logit model, we can interpret the coefficients in terms of the log-odds-ratio, as we did before. Some of the coefficients can be interpreted as follows:

• If educ increases by 1 year then the log-odds of returning to the labor force (versus not returning) increase by 0.19791.
• If age increases by 1 year, then the log-odds of returning to the labor force (versus not returning) decrease by -0.10395.
• If kidslt6 increases by 1 (i.e. if we have an additional child, younger than 6 years), then the log-odds of returning to the labor force (versus not returning) changes by $$−2.2880 + 0.53630 \cdot kidsge6$$, ceteris paribus.
• If exper increases by 1 year, then the log-odds of returning to the labor force (versus not returning) changes by $$0.21693 -0.00318(2 \cdot exper + 1)$$

We can also interpret them in terms of the odds-ratio:

round(exp(coef(summary(logit_glm)))[, 1, drop = FALSE], 5)
##                 Estimate
## (Intercept)      5.47929
## educ             1.21885
## exper            1.24225
## I(exper^2)       0.99682
## age              0.90127
## kidslt6          0.10147
## kidsge6          0.92046
## kidslt6:kidsge6  1.70967
print(np.exp(logit_glm.summary2().tables[1].iloc[:, 0]).round(5))
## Intercept             5.47929
## educ                  1.21885
## exper                 1.24225
## np.power(exper, 2)    0.99682
## age                   0.90127
## kidslt6               0.10147
## kidsge6               0.92046
## kidslt6:kidsge6       1.70967
## Name: Coef., dtype: float64

The interpretation is then as follows:

• If educ increases by 1 year then the odds of returning to the labor force (versus not returning) increase by a factor of 1.21885 (i.e. by roughly 22%).
• If age increases by 1 year, then the odds of returning to the labor force (versus not returning) change by a factor of 0.90127, i.e. they decrease by roughly 10%.
• and so on.

Next up, we want to evaluate the partial effects of the explanatory variables on the probability:

PEA_logit = mfx::logitmfx(formula = logit_glm$formula, data = logit_glm$data, atmean = TRUE)
print(PEA_logit$mfxest) ## dF/dx Std. Err. z P>|z| ## educ 0.0356010216 0.0089769875 3.9658094 7.314729e-05 ## exper 0.0390228423 0.0069853509 5.5863825 2.318482e-08 ## I(exper^2) -0.0005725429 0.0002234722 -2.5620319 1.040618e-02 ## age -0.0186986384 0.0032165601 -5.8132408 6.127481e-09 ## kidslt6 -0.4115914788 0.0666696754 -6.1735936 6.675501e-10 ## kidsge6 -0.0149103840 0.0177874785 -0.8382517 4.018894e-01 ## kidslt6:kidsge6 0.0964745724 0.0304359033 3.1697621 1.525638e-03 PEA_logit = logit_glm.get_margeff(at = "mean") print(PEA_logit.summary()) ## Logit Marginal Effects ## ===================================== ## Dep. Variable: inlf ## Method: dydx ## At: mean ## ====================================================================================== ## dy/dx std err z P>|z| [0.025 0.975] ## -------------------------------------------------------------------------------------- ## educ 0.0356 0.009 3.966 0.000 0.018 0.053 ## exper 0.0390 0.007 5.586 0.000 0.025 0.053 ## np.power(exper, 2) -0.0006 0.000 -2.562 0.010 -0.001 -0.000 ## age -0.0187 0.003 -5.813 0.000 -0.025 -0.012 ## kidslt6 -0.4116 0.067 -6.174 0.000 -0.542 -0.281 ## kidsge6 -0.0149 0.018 -0.838 0.402 -0.050 0.020 ## kidslt6:kidsge6 0.0965 0.030 3.170 0.002 0.037 0.156 ## ====================================================================================== PEA - the Partial Effects at the Average - shows that: • a unit increase in the sample average value of educ results in a 0.0356 increase in the probability of returning to the labor force ; • a unit increase in the sample average value of age results in a 0.018698 decrease in the probability of returning to the labor force ; • if the sample average number of kids under the age of 6, kidslt6 increases by one, then the probability of returning to the labor force changes by $$-0.411591 + 0.096475 \cdot \overline{kidsge6}$$; • if the average number of exper increases by one, then the probability of returning to the labor force changes by $$0.039023 - 
0.000573 \cdot (2 \cdot \overline{exper})$$. APE_logit = mfx::logitmfx(formula = logit_glm$formula, data = logit_glm$data, atmean = FALSE) print(APE_logit$mfxest)
##                         dF/dx    Std. Err.          z        P>|z|
## educ             0.0299008551 0.0081927749  3.6496615 2.625861e-04
## exper            0.0327747997 0.0068154334  4.8089091 1.517562e-06
## I(exper^2)      -0.0004808716 0.0001937699 -2.4816628 1.307709e-02
## age             -0.0157047537 0.0032096172 -4.8930302 9.929517e-07
## kidslt6         -0.3456905620 0.0667862419 -5.1760745 2.266030e-07
## kidsge6         -0.0125230460 0.0150278296 -0.8333237 4.046622e-01
## kidslt6:kidsge6  0.0810277930 0.0268936290  3.0128992 2.587649e-03
APE_logit = logit_glm.get_margeff(at = "overall")
print(APE_logit.summary())
##         Logit Marginal Effects
## =====================================
## Dep. Variable:                   inlf
## Method:                          dydx
## At:                           overall
## ======================================================================================
##                         dy/dx    std err          z      P>|z|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## educ                   0.0299      0.007      4.112      0.000       0.016       0.044
## exper                  0.0328      0.005      6.186      0.000       0.022       0.043
## np.power(exper, 2)    -0.0005      0.000     -2.634      0.008      -0.001      -0.000
## age                   -0.0157      0.003     -6.214      0.000      -0.021      -0.011
## kidslt6               -0.3457      0.050     -6.894      0.000      -0.444      -0.247
## kidsge6               -0.0125      0.015     -0.836      0.403      -0.042       0.017
## kidslt6:kidsge6        0.0810      0.025      3.263      0.001       0.032       0.130
## ======================================================================================

APE - the Average Partial Effect - shows that:

• a unit increase in educ results in the sample average estimated increase of 0.0299 in the probability of returning to the labor force;
• a unit increase in age results in the sample average estimated decrease of 0.015705 in the probability of returning to the labor force;
• a unit increase in kidslt6 results in the sample average estimated change of $$-0.34569 + 0.08103 \cdot \overline{kidsge6}$$ in the probability of returning to the labor force.
Note: this expression has the same form as the corresponding PEA expression. The partial effect of kidslt6 depends on kidsge6 (because of the interaction term): the APE averages the partial effects over the sample, so kidsge6 turns into its sample average $$\overline{kidsge6}$$, while the PEA evaluates the effect at $$\overline{kidsge6}$$ directly. The two expressions are therefore equivalent in form, although with different coefficient estimates.

We can do the same with the probit regression:

print(round(coef(summary(probit_glm)), 5))
##                 Estimate Std. Error  z value Pr(>|z|)
## (Intercept)      0.95926    0.60504  1.58545  0.11286
## educ             0.11640    0.02866  4.06061  0.00005
## exper            0.12800    0.02226  5.75096  0.00000
## I(exper^2)      -0.00188    0.00071 -2.65361  0.00796
## age             -0.06025    0.01036 -5.81485  0.00000
## kidslt6         -1.34130    0.21021 -6.38088  0.00000
## kidsge6         -0.04653    0.05730 -0.81207  0.41675
## kidslt6:kidsge6  0.31497    0.09829  3.20436  0.00135
print(probit_glm.summary2().tables[1].round(5))
##                       Coef.  Std.Err.        z    P>|z|   [0.025   0.975]
## Intercept           0.95926   0.60729  1.57958  0.11420 -0.23100  2.14953
## educ                0.11640   0.02868  4.05878  0.00005  0.06019  0.17260
## exper               0.12800   0.02195  5.83111  0.00000  0.08498  0.17102
## np.power(exper, 2) -0.00188   0.00070 -2.69562  0.00703 -0.00324 -0.00051
## age                -0.06025   0.01031 -5.84510  0.00000 -0.08045 -0.04005
## kidslt6            -1.34130   0.20747 -6.46511  0.00000 -1.74793 -0.93467
## kidsge6            -0.04653   0.05694 -0.81717  0.41383 -0.15813  0.06507
## kidslt6:kidsge6     0.31497   0.09781  3.22010  0.00128  0.12326  0.50668

Interpretation follows a similar idea to the logit, with the difference being that instead of the log-odds-ratio we now have the inverse normal cdf of the probability, $$\Phi^{-1}(p_i)$$.

Consequently, it makes more sense to evaluate the PEA and APE:

PEA_probit = mfx::probitmfx(formula = probit_glm$formula, data = probit_glm$data, atmean = TRUE)
print(PEA_probit$mfxest) ## dF/dx Std. Err. z P>|z| ## educ 0.0364118090 0.0089471120 4.0696718 4.707942e-05 ## exper 0.0400413820 0.0069878069 5.7301786 1.003249e-08 ## I(exper^2) -0.0005869283 0.0002220151 -2.6436419 8.201938e-03 ## age -0.0188477517 0.0032028840 -5.8846188 3.989724e-09 ## kidslt6 -0.4195922325 0.0658753362 -6.3694891 1.896590e-10 ## kidsge6 -0.0145560517 0.0178951313 -0.8134085 4.159839e-01 ## kidslt6:kidsge6 0.0985304764 0.0307986942 3.1991771 1.378205e-03 PEA_probit = probit_glm.get_margeff(at = "mean") print(PEA_probit.summary()) ## Probit Marginal Effects ## ===================================== ## Dep. Variable: inlf ## Method: dydx ## At: mean ## ====================================================================================== ## dy/dx std err z P>|z| [0.025 0.975] ## -------------------------------------------------------------------------------------- ## educ 0.0364 0.009 4.073 0.000 0.019 0.054 ## exper 0.0400 0.007 5.806 0.000 0.027 0.054 ## np.power(exper, 2) -0.0006 0.000 -2.684 0.007 -0.001 -0.000 ## age -0.0188 0.003 -5.924 0.000 -0.025 -0.013 ## kidslt6 -0.4196 0.065 -6.476 0.000 -0.547 -0.293 ## kidsge6 -0.0146 0.018 -0.819 0.413 -0.049 0.020 ## kidslt6:kidsge6 0.0985 0.031 3.220 0.001 0.039 0.159 ## ====================================================================================== APE_probit = mfx::probitmfx(formula = probit_glm$formula, data = probit_glm$data, atmean = FALSE) print(APE_probit$mfxest)
##                         dF/dx    Std. Err.          z        P>|z|
## educ             0.0301981575 0.0071744405  4.2091307 2.563551e-05
## exper            0.0332083463 0.0053187829  6.2435987 4.276162e-10
## I(exper^2)      -0.0004867694 0.0001802354 -2.7007420 6.918499e-03
## age             -0.0156313952 0.0024919704 -6.2727051 3.548286e-10
## kidslt6         -0.3479890919 0.0495031781 -7.0296313 2.070800e-12
## kidsge6         -0.0120720710 0.0148541006 -0.8127097 4.163845e-01
## kidslt6:kidsge6  0.0817163149 0.0249294073  3.2779085 1.045793e-03
APE_probit = probit_glm.get_margeff(at = "overall")
print(APE_probit.summary())
##        Probit Marginal Effects
## =====================================
## Dep. Variable:                   inlf
## Method:                          dydx
## At:                           overall
## ======================================================================================
##                         dy/dx    std err          z      P>|z|      [0.025      0.975]
## --------------------------------------------------------------------------------------
## educ                   0.0302      0.007      4.213      0.000       0.016       0.044
## exper                  0.0332      0.005      6.344      0.000       0.023       0.043
## np.power(exper, 2)    -0.0005      0.000     -2.743      0.006      -0.001      -0.000
## age                   -0.0156      0.002     -6.312      0.000      -0.020      -0.011
## kidslt6               -0.3480      0.049     -7.153      0.000      -0.443      -0.253
## kidsge6               -0.0121      0.015     -0.818      0.413      -0.041       0.017
## kidslt6:kidsge6        0.0817      0.025      3.296      0.001       0.033       0.130
## ======================================================================================

The interpretation of PEA and APE of the probit model are analogous to the logit model - a unit increase in the (average, if PEA) regressor results in the specified change (or average change in the case of APE) in the probability.

Considerations

Marginal effects show the change in probability when the predictor, or independent, variable increases by one unit. For continuous variables this represents the instantaneous change given that the ‘unit’ may be very small. For binary variables, the change is from 0 to 1, so one ‘unit’ as it is usually thought.

As in the multiple linear regression case, the intercept does not have a meaningful interpretation.

The fitted Logit model:

\begin{aligned} \log \left( \dfrac{\widehat{p}_i}{1 - \widehat{p}_i} \right) &= \underset{(1.0492)}{1.7010} + \underset{(0.050)}{0.1979} \cdot educ + \underset{(0.0388)}{0.2169} \cdot exper - \underset{(0.0012)}{0.0032} \cdot exper^2 - \underset{(0.0183)}{0.1039} \cdot age \\ &- \underset{(0.3720)}{2.2880} \cdot kidslt6 - \underset{(0.0992)}{0.0829} \cdot kidsge6 + \underset{(0.1690)}{0.5363} \cdot (kidslt6 \times kidsge6) \end{aligned}

The fitted Probit model:

\begin{aligned} \widehat{p}_i = \mathbf{\Phi} \bigg( \underset{(0.6050)}{0.9593} &+ \underset{(0.0287)}{0.1164} \cdot educ + \underset{(0.0223)}{0.1280} \cdot exper - \underset{(0.0007)}{0.0019} \cdot exper^2 - \underset{(0.0104)}{0.0603} \cdot age \\ &- \underset{(0.2102)}{1.3413} \cdot kidslt6 - \underset{(0.0573)}{0.0465} \cdot kidsge6 + \underset{(0.0983)}{0.3150} \cdot (kidslt6 \times kidsge6) \bigg) \end{aligned}

We will check some arbitrary linear restrictions to highlight how such tests could be carried out:

print(round(coef(summary(logit_glm)), 5))
##                 Estimate Std. Error  z value Pr(>|z|)
## (Intercept)      1.70098    1.04919  1.62123  0.10497
## educ             0.19791    0.05020  3.94197  0.00008
## exper            0.21693    0.03877  5.59589  0.00000
## I(exper^2)      -0.00318    0.00123 -2.57767  0.00995
## age             -0.10395    0.01832 -5.67236  0.00000
## kidslt6         -2.28803    0.37195 -6.15140  0.00000
## kidsge6         -0.08289    0.09919 -0.83566  0.40334
## kidslt6:kidsge6  0.53630    0.16900  3.17341  0.00151
print(logit_glm.summary2().tables[1].round(5))
##                       Coef.  Std.Err.        z    P>|z|   [0.025   0.975]
## Intercept           1.70098   1.04919  1.62123  0.10497 -0.35540  3.75735
## educ                0.19791   0.05020  3.94197  0.00008  0.09951  0.29631
## exper               0.21693   0.03877  5.59589  0.00000  0.14095  0.29291
## np.power(exper, 2) -0.00318   0.00123 -2.57767  0.00995 -0.00560 -0.00076
## age                -0.10395   0.01832 -5.67236  0.00000 -0.13986 -0.06803
## kidslt6            -2.28803   0.37195 -6.15140  0.00000 -3.01705 -1.55902
## kidsge6            -0.08289   0.09919 -0.83566  0.40334 -0.27729  0.11152
## kidslt6:kidsge6     0.53630   0.16900  3.17341  0.00151  0.20507  0.86753

We will begin with a simple economic hypothesis: $\text{ education is twice as effective, in absolute value, on the log-odds-ratio, compared to age}$ this is equivalent to the hypothesis that:

$\begin{cases} H_0&: educ + 2 \cdot age = 0\\ H_1&: educ + 2 \cdot age \neq 0 \end{cases}$

print(car::linearHypothesis(logit_glm, c("educ + 2 * age = 0")))
## Linear hypothesis test
##
## Hypothesis:
## educ  + 2 age = 0
##
## Model 1: restricted model
## Model 2: inlf ~ educ + exper + I(exper^2) + age + kidslt6 * kidsge6
##
##   Res.Df Df  Chisq Pr(>Chisq)
## 1    595
## 2    594  1 0.0263     0.8712
print(logit_glm.wald_test("educ + 2 * age = 0", use_f = False))
## <Wald test (chi2): statistic=[[0.02629718]], p-value=0.8711766750958096, df_denom=1>

Since p-value = 0.8712 > 0.05, we have no grounds to reject the null hypothesis and conclude that education's effect on the log-odds-ratio is, in absolute value, twice as strong as the effect of age.

Furthermore, we may examine another hypothesis:

$\begin{cases} H_0&: educ + 2 \cdot age + kidslt6= 0\\ H_1&: educ + 2 \cdot age + kidslt6 \neq 0 \end{cases}$

print(car::linearHypothesis(logit_glm, c("educ + 2 * age + kidslt6 = 0")))
## Linear hypothesis test
##
## Hypothesis:
## educ  + 2 age  + kidslt6 = 0
##
## Model 1: restricted model
## Model 2: inlf ~ educ + exper + I(exper^2) + age + kidslt6 * kidsge6
##
##   Res.Df Df  Chisq Pr(>Chisq)
## 1    595
## 2    594  1 35.894  2.083e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(logit_glm.wald_test("educ + 2 * age + kidslt6 = 0", use_f = False))
## <Wald test (chi2): statistic=[[35.89410407]], p-value=2.0833820432189484e-09, df_denom=1>

Since p-value < 0.05, we reject the null hypothesis of this linear restriction.

Finally, we may examine a joint hypothesis of two linear restrictions:

$\begin{cases} H_0&: educ + 2 \cdot age = 0 \text{ and } kidslt6 = 28 \cdot kidsge6\\ H_1&: educ + 2 \cdot age \neq 0, \text{ or } kidslt6 \neq 28 \cdot kidsge6, \text{ or both} \end{cases}$

print(car::linearHypothesis(logit_glm, c("educ + 2 * age = 0", "kidslt6 = 28 * kidsge6")))
## Linear hypothesis test
##
## Hypothesis:
## educ  + 2 age = 0
## kidslt6 - 28 kidsge6 = 0
##
## Model 1: restricted model
## Model 2: inlf ~ educ + exper + I(exper^2) + age + kidslt6 * kidsge6
##
##   Res.Df Df  Chisq Pr(>Chisq)
## 1    596
## 2    594  2 0.0271     0.9865
print(logit_glm.wald_test("educ + 2 * age = 0, kidslt6 = 28 * kidsge6", use_f = False))
## <Wald test (chi2): statistic=[[0.02710237]], p-value=0.9865402197730535, df_denom=2>

Since p-value = 0.9865 > 0.05 we do not reject the joint hypothesis of these two linear restrictions.

Before examining the model on the test data subset, we examine the $$Y$$ values:

print(table(dt5_test$inlf))
##
##   0
## 151
print(dt5_test["inlf"].value_counts())
## 0.0    151
## Name: inlf, dtype: int64

Note that in this data sample, our test set contains only zeros!

Considerations

This may be a nice example of the type of new data that we may obtain for testing - there is always a possibility that the new data, for which we do not yet know the outcome, may have only 1's, or only 0's.

Firstly, using the naive $$0.5$$ threshold:

probs_logit_test <- predict(logit_glm, newdata = dt5_test, type = "response")
test <- InformationValue::confusionMatrix(dt5_test$inlf, probs_logit_test, threshold = 0.5)
# Relabel the confusion matrix rows/columns for readability.
colnames(test) <- paste0("Actual ", colnames(test))
rownames(test) <- paste0("Predicted ", rownames(test))
print(test)
##             Actual 0
## Predicted 0       59
## Predicted 1       92
# Out-of-sample predicted probabilities from the logit model.
probs_logit_test = logit_glm.predict(exog = dt5_test)
# Confusion matrix at the naive 0.5 cutoff. This test set contains no
# actual 1s, so the "Actual 1" row is all zeros.
test = metrics.confusion_matrix(dt5_test["inlf"], np.where(probs_logit_test >= 0.5, 1, 0))
test = pd.DataFrame(test, columns = ["Predicted 0", "Predicted 1"], index = ["Actual 0", "Actual 1"])
#
print(test)
##           Predicted 0  Predicted 1
## Actual 0           59           92
## Actual 1            0            0
probs_probit_test <- predict(probit_glm, newdata = dt5_test, type = "response")
test <- InformationValue::confusionMatrix(dt5_test$inlf, probs_probit_test, threshold = 0.5)
colnames(test) <- paste0("Actual ", colnames(test))
rownames(test) <- paste0("Predicted ", rownames(test))
print(test)
##             Actual 0
## Predicted 0       59
## Predicted 1       92
probs_probit_test = probit_glm.predict(exog = dt5_test)
test = metrics.confusion_matrix(dt5_test["inlf"], np.where(probs_probit_test >= 0.5, 1, 0))
test = pd.DataFrame(test, columns = ["Predicted 0", "Predicted 1"], index = ["Actual 0", "Actual 1"])
#
print(test)
##           Predicted 0  Predicted 1
## Actual 0           59           92
## Actual 1            0            0

This gives the same results - more than half of the data is incorrectly classified. If we look at the optimal cutoff prediction rule (the cutoffs are different for the logit and probit models):

probs_logit_test <- predict(logit_glm, newdata = dt5_test, type = "response")
test <- InformationValue::confusionMatrix(dt5_test$inlf, probs_logit_test, threshold = optimalCutoff_logit)
# Relabel and print the confusion matrix at the logit model's optimal cutoff.
colnames(test) <- paste0("Actual ", colnames(test))
rownames(test) <- paste0("Predicted ", rownames(test))
print(test)
##             Actual 0
## Predicted 0       51
## Predicted 1      100
# Same confusion matrix in Python, this time using the optimal cutoff
# found for the logit model instead of the naive 0.5 threshold.
probs_logit_test = logit_glm.predict(exog = dt5_test)
test = metrics.confusion_matrix(dt5_test["inlf"], np.where(probs_logit_test >= optimalCutoff_logit, 1, 0))
test = pd.DataFrame(test, columns = ["Predicted 0", "Predicted 1"], index = ["Actual 0", "Actual 1"])
#
print(test)
##           Predicted 0  Predicted 1
## Actual 0           51          100
## Actual 1            0            0
probs_probit_test <- predict(probit_glm, newdata = dt5_test, type = "response")
test <- InformationValue::confusionMatrix(dt5_test$inlf, probs_probit_test, threshold = optimalCutoff_probit) colnames(test) <- paste0("Actual ", colnames(test)) rownames(test) <- paste0("Predicted ", rownames(test)) print(test) ## Actual 0 ## Predicted 0 87 ## Predicted 1 64 probs_probit_test = probit_glm.predict(exog = dt5_test) test = metrics.confusion_matrix(dt5_test["inlf"], np.where(probs_probit_test >= optimalCutoff_probit, 1, 0)) test = pd.DataFrame(test, columns = ["Predicted 0", "Predicted 1"], index = ["Actual 0", "Actual 1"]) # print(test) ## Predicted 0 Predicted 1 ## Actual 0 87 64 ## Actual 1 0 0 In this case, the results from using an optimal cutoff point are slightly better for the probit regression. Unfortunately, since we do not have any 1 in our dataset - we cannot calculate the ROC and AUC. Considerations At this point our analysis on the test data would come to an end - since we do not have any 1 in the test data, we cannot do any further analysis. For interests sake, assume that out new data contains both zeros and ones (we will ignore that some (or all) of the sample observations may come from our train set): set.seed(123) # new_index <- sample(1:nrow(dt5), nrow(dt5) - floor(0.8 * length(dt_index)), replace = FALSE) dt5_test_v2 <- dt5[new_index, ] row.names(dt5_test_v2) <- NULL print(head(dt5_test_v2, 5)) ## inlf hours kidslt6 kidsge6 age educ wage repwage hushrs husage huseduc huswage faminc mtr motheduc fatheduc unem city exper nwifeinc lwage expersq ## 1 1 384 0 3 36 14 1.6927 0 3060 43 17 5.4020 32650 0.5815 10 10 14.0 1 1 32.00000 0.5263249 1 ## 2 0 0 2 3 34 12 NA 0 3160 30 12 0.7595 5000 0.9415 12 12 7.5 0 10 5.00000 NA 100 ## 3 1 528 0 2 51 12 3.7879 0 2500 50 11 10.4000 40000 0.6215 7 7 7.5 0 8 37.99999 1.3318120 64 ## 4 0 0 0 0 52 11 NA 0 2210 51 12 10.4070 24000 0.6615 7 7 11.0 1 3 24.00000 NA 9 ## 5 1 800 0 3 33 13 1.7500 0 3000 37 14 8.3333 32600 0.6215 12 7 7.5 1 4 31.20000 0.5596158 16 np.random.seed(123) # new_index = 
np.random.choice(list(range(0, len(dt5.index))), size = len(dt5.index) - int(np.floor(0.8 * len(dt_index))), replace = False) dt5_test_v2 = dt5.iloc[new_index, :].reset_index() # print(dt5_test_v2.head(5)) ## index inlf hours kidslt6 kidsge6 age educ wage repwage hushrs husage huseduc huswage faminc mtr motheduc fatheduc unem city exper nwifeinc lwage expersq ## 0 373 1.0 2143.0 1.0 1.0 34.0 12.0 4.4797 4.80 1935.0 40.0 12.0 3.4109 16200.0 0.7215 10.0 10.0 14.0 0.0 16.0 6.600003 1.499556 256.0 ## 1 246 1.0 1980.0 0.0 4.0 31.0 9.0 2.5253 2.56 2070.0 36.0 10.0 4.6860 19900.0 0.7515 10.0 7.0 5.0 1.0 2.0 14.899910 0.926360 4.0 ## 2 686 0.0 0.0 0.0 0.0 54.0 15.0 NaN 0.00 1920.0 52.0 15.0 10.4170 20000.0 0.6915 14.0 16.0 14.0 0.0 6.0 20.000000 NaN 36.0 ## 3 691 0.0 0.0 0.0 0.0 55.0 16.0 NaN 0.00 2304.0 56.0 11.0 2.6701 13000.0 0.7215 12.0 12.0 7.5 0.0 33.0 13.000000 NaN 1089.0 ## 4 381 1.0 1960.0 0.0 1.0 43.0 14.0 3.9286 3.85 2000.0 45.0 14.0 5.3350 24470.0 0.6915 7.0 7.0 7.5 0.0 14.0 16.769939 1.368283 196.0 print(table(dt5_test_v2$inlf))
##
##  0  1
## 61 90
print(dt5_test_v2["inlf"].value_counts())
## 1.0    85
## 0.0    66
## Name: inlf, dtype: int64

Now that our dataset has both 1 and 0 - we can compare the models with the optimal cutoff’s:

probs_logit_test <- predict(logit_glm, newdata = dt5_test_v2, type = "response")
test <- InformationValue::confusionMatrix(dt5_test_v2$inlf, probs_logit_test, threshold = optimalCutoff_logit)
colnames(test) <- paste0("Actual ", colnames(test))
rownames(test) <- paste0("Predicted ", rownames(test))
print(test)
##             Actual 0 Actual 1
## Predicted 0       22        5
## Predicted 1       39       85
probs_logit_test = logit_glm.predict(exog = dt5_test_v2)
test = metrics.confusion_matrix(dt5_test_v2["inlf"], np.where(probs_logit_test >= optimalCutoff_logit, 1, 0))
test = pd.DataFrame(test, columns = ["Predicted 0", "Predicted 1"], index = ["Actual 0", "Actual 1"])
#
print(test)
##           Predicted 0  Predicted 1
## Actual 0           17           49
## Actual 1           11           74
probs_probit_test <- predict(probit_glm, newdata = dt5_test_v2, type = "response")
test <- InformationValue::confusionMatrix(dt5_test_v2$inlf, probs_probit_test, threshold = optimalCutoff_probit)
# Relabel and print the probit confusion matrix on the second test set
# (which contains both outcome classes).
colnames(test) <- paste0("Actual ", colnames(test))
rownames(test) <- paste0("Predicted ", rownames(test))
print(test)
##             Actual 0 Actual 1
## Predicted 0       34       11
## Predicted 1       27       79
# Probit confusion matrix on the second test set, at the probit model's
# optimal cutoff.
probs_probit_test = probit_glm.predict(exog = dt5_test_v2)
test = metrics.confusion_matrix(dt5_test_v2["inlf"], np.where(probs_probit_test >= optimalCutoff_probit, 1, 0))
test = pd.DataFrame(test, columns = ["Predicted 0", "Predicted 1"], index = ["Actual 0", "Actual 1"])
#
print(test)
##           Predicted 0  Predicted 1
## Actual 0           33           33
## Actual 1           18           67

We see that for this test set, we correctly predict the majority of 1’s. Again, part of this is because some of these observations may also have been included in our training sample. In this case, the logit model appears to be better at classifying 1 correctly, compared to the probit model. For the correct prediction of 0 - it appears that the probit may be better suited.

We can also plot the $$ROC$$ for the test data as:

#
InformationValue::plotROC(dt5_test_v2$inlf, probs_logit_test, Show.labels = FALSE)

fpr, tpr, thresholds = metrics.roc_curve(dt5_test_v2[["inlf"]], probs_logit_test, pos_label = 1)
plotROC(fpr, tpr, thresholds)

#
InformationValue::plotROC(dt5_test_v2$inlf, probs_probit_test, Show.labels = FALSE)

# ROC curve of the probit model on the second test set.
fpr, tpr, thresholds = metrics.roc_curve(dt5_test_v2[["inlf"]], probs_probit_test, pos_label = 1)
plotROC(fpr, tpr, thresholds)

Note: since we have used randomization to select the subset - the results in R and Python will be different due to the different data subsamples selected.