Example: linear regression

We will use the following libraries in some capacity:

R
Python

suppressPackageStartupMessages({
  suppressWarnings({
    library(tidyverse)
    library(data.table)
    library(GGally)
    library(lmtest)
    library(lrmest)
    library(tseries)
    library(nortest)
    library(car)
    library(sandwich)
    library(lattice)
    library(viridisLite)
    library(leaps)
    library(skimr)
  })
})

import pandas as pd
# https://pandas.pydata.org/docs/user_guide/options.html
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
#
import numpy as np
import scipy
import skimpy
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
import patsy
import plotnine as pl
# For plot size options, see: plotnine.options.figure_size = (., .)
import matplotlib.pyplot as plt
# plt.figure(figsize = (., .))
import seaborn as sns

We will use the cps5_small dataset. As an alternative to directly downloading it online, we will download the .csv file and place it in the same directory as our code. We are doing this to reduce the load on the website that this dataset is hosted, as well as to avoid periods, when the website is in maintanance mode and the files are temporary unavailable.

R
Python

dt <- read.csv(file = "./cps5_small.csv", sep = ",", dec = ".", header = TRUE) %>% as_tibble()
dt %>% head(10) %>% print()

# A tibble: 10 × 10
   black  educ exper faminc female metro midwest south  wage  west
   <int> <int> <int>  <int>  <int> <int>   <int> <int> <dbl> <int>
 1     0    13    45      0      1     1       0     0  44.4     1
 2     0    14    25  45351      1     1       1     0  16       0
 3     0    18    27  91946      1     1       0     0  15.4     0
 4     0    13    42  48370      0     1       1     0  13.5     0
 5     0    13    41  10000      1     1       0     0  25       1
 6     0    16    26 151308      1     1       0     0  24.0     0
 7     0    16    11 110461      1     1       0     1  41.0     0
 8     0    18    15      0      1     1       1     0  26.4     0
 9     0    21    32  67084      0     1       1     0  30.8     0
10     0    14    12  14000      0     0       0     0  34.6     1

dt = pd.read_csv("./cps5_small.csv")
print(dt.head(10))

   black  educ  exper  faminc  female  metro  midwest  south   wage  west
0      0    13     45       0       1      1        0      0  44.44     1
1      0    14     25   45351       1      1        1      0  16.00     0
2      0    18     27   91946       1      1        0      0  15.38     0
3      0    13     42   48370       0      1        1      0  13.54     0
4      0    13     41   10000       1      1        0      0  25.00     1
5      0    16     26  151308       1      1        0      0  24.05     0
6      0    16     11  110461       1      1        0      1  40.95     0
7      0    18     15       0       1      1        1      0  26.45     0
8      0    21     32   67084       0      1        1      0  30.76     0
9      0    14     12   14000       0      0        0      0  34.57     1

Alternative libraries to read the data (click to expand)

There are many different libraries that provide dataframe-like objects. These objects differ in terms of their efficiency and speed when dealing with large-scale datasets, but also in terms of their syntax, when used for daga transformation and aggregation. A large benchmark of various such libraries is provided by the developers of duckdb - you can find the newest benchmark results here.

Below we present some code to read the data using the aformentioned libraries:

dt_datatable <- data.table::fread(file = "./cps5_small.csv", sep = ",", dec = ".", header = TRUE)
dt_datatable %>% head(10) %>% print()

    black educ exper faminc female metro midwest south  wage west
 1:     0   13    45      0      1     1       0     0 44.44    1
 2:     0   14    25  45351      1     1       1     0 16.00    0
 3:     0   18    27  91946      1     1       0     0 15.38    0
 4:     0   13    42  48370      0     1       1     0 13.54    0
 5:     0   13    41  10000      1     1       0     0 25.00    1
 6:     0   16    26 151308      1     1       0     0 24.05    0
 7:     0   16    11 110461      1     1       0     1 40.95    0
 8:     0   18    15      0      1     1       1     0 26.45    0
 9:     0   21    32  67084      0     1       1     0 30.76    0
10:     0   14    12  14000      0     0       0     0 34.57    1

import datatable
#
dt_datatable = datatable.fread(file = "./cps5_small.csv", sep = ",", dec = ".")
print(dt_datatable.head(10))

   | black   educ  exper  faminc  female  metro  midwest  south     wage   west
   | bool8  int32  int32   int32   bool8  bool8    bool8  bool8  float64  bool8
-- + -----  -----  -----  ------  ------  -----  -------  -----  -------  -----
 0 |     0     13     45       0       1      1        0      0    44.44      1
 1 |     0     14     25   45351       1      1        1      0    16         0
 2 |     0     18     27   91946       1      1        0      0    15.38      0
 3 |     0     13     42   48370       0      1        1      0    13.54      0
 4 |     0     13     41   10000       1      1        0      0    25         1
 5 |     0     16     26  151308       1      1        0      0    24.05      0
 6 |     0     16     11  110461       1      1        0      1    40.95      0
 7 |     0     18     15       0       1      1        1      0    26.45      0
 8 |     0     21     32   67084       0      1        1      0    30.76      0
 9 |     0     14     12   14000       0      0        0      0    34.57      1
[10 rows x 10 columns]

dt_polars <- polars::pl$read_csv(path = "./cps5_small.csv")
print(dt_polars$head(n = 10))

shape: (10, 10)
┌───────┬──────┬───────┬────────┬───┬─────────┬───────┬───────┬──────┐
│ black ┆ educ ┆ exper ┆ faminc ┆ … ┆ midwest ┆ south ┆ wage  ┆ west │
│ ---   ┆ ---  ┆ ---   ┆ ---    ┆   ┆ ---     ┆ ---   ┆ ---   ┆ ---  │
│ i64   ┆ i64  ┆ i64   ┆ i64    ┆   ┆ i64     ┆ i64   ┆ f64   ┆ i64  │
╞═══════╪══════╪═══════╪════════╪═══╪═════════╪═══════╪═══════╪══════╡
│ 0     ┆ 13   ┆ 45    ┆ 0      ┆ … ┆ 0       ┆ 0     ┆ 44.44 ┆ 1    │
│ 0     ┆ 14   ┆ 25    ┆ 45351  ┆ … ┆ 1       ┆ 0     ┆ 16.0  ┆ 0    │
│ 0     ┆ 18   ┆ 27    ┆ 91946  ┆ … ┆ 0       ┆ 0     ┆ 15.38 ┆ 0    │
│ 0     ┆ 13   ┆ 42    ┆ 48370  ┆ … ┆ 1       ┆ 0     ┆ 13.54 ┆ 0    │
│ 0     ┆ 13   ┆ 41    ┆ 10000  ┆ … ┆ 0       ┆ 0     ┆ 25.0  ┆ 1    │
│ 0     ┆ 16   ┆ 26    ┆ 151308 ┆ … ┆ 0       ┆ 0     ┆ 24.05 ┆ 0    │
│ 0     ┆ 16   ┆ 11    ┆ 110461 ┆ … ┆ 0       ┆ 1     ┆ 40.95 ┆ 0    │
│ 0     ┆ 18   ┆ 15    ┆ 0      ┆ … ┆ 1       ┆ 0     ┆ 26.45 ┆ 0    │
│ 0     ┆ 21   ┆ 32    ┆ 67084  ┆ … ┆ 1       ┆ 0     ┆ 30.76 ┆ 0    │
│ 0     ┆ 14   ┆ 12    ┆ 14000  ┆ … ┆ 0       ┆ 0     ┆ 34.57 ┆ 1    │
└───────┴──────┴───────┴────────┴───┴─────────┴───────┴───────┴──────┘

import polars
#
dt_polars = polars.read_csv(source = "./cps5_small.csv")
print(dt_polars.head(10))

shape: (10, 10)
┌───────┬──────┬───────┬────────┬───┬─────────┬───────┬───────┬──────┐
│ black ┆ educ ┆ exper ┆ faminc ┆ … ┆ midwest ┆ south ┆ wage  ┆ west │
│ ---   ┆ ---  ┆ ---   ┆ ---    ┆   ┆ ---     ┆ ---   ┆ ---   ┆ ---  │
│ i64   ┆ i64  ┆ i64   ┆ i64    ┆   ┆ i64     ┆ i64   ┆ f64   ┆ i64  │
╞═══════╪══════╪═══════╪════════╪═══╪═════════╪═══════╪═══════╪══════╡
│ 0     ┆ 13   ┆ 45    ┆ 0      ┆ … ┆ 0       ┆ 0     ┆ 44.44 ┆ 1    │
│ 0     ┆ 14   ┆ 25    ┆ 45351  ┆ … ┆ 1       ┆ 0     ┆ 16.0  ┆ 0    │
│ 0     ┆ 18   ┆ 27    ┆ 91946  ┆ … ┆ 0       ┆ 0     ┆ 15.38 ┆ 0    │
│ 0     ┆ 13   ┆ 42    ┆ 48370  ┆ … ┆ 1       ┆ 0     ┆ 13.54 ┆ 0    │
│ 0     ┆ 13   ┆ 41    ┆ 10000  ┆ … ┆ 0       ┆ 0     ┆ 25.0  ┆ 1    │
│ 0     ┆ 16   ┆ 26    ┆ 151308 ┆ … ┆ 0       ┆ 0     ┆ 24.05 ┆ 0    │
│ 0     ┆ 16   ┆ 11    ┆ 110461 ┆ … ┆ 0       ┆ 1     ┆ 40.95 ┆ 0    │
│ 0     ┆ 18   ┆ 15    ┆ 0      ┆ … ┆ 1       ┆ 0     ┆ 26.45 ┆ 0    │
│ 0     ┆ 21   ┆ 32    ┆ 67084  ┆ … ┆ 1       ┆ 0     ┆ 30.76 ┆ 0    │
│ 0     ┆ 14   ┆ 12    ┆ 14000  ┆ … ┆ 0       ┆ 0     ┆ 34.57 ┆ 1    │
└───────┴──────┴───────┴────────┴───┴─────────┴───────┴───────┴──────┘

library("duckdb")

Loading required package: DBI

# to start an in-memory database
con <- dbConnect(duckdb(), dbdir = ":memory:")
dbWriteTable(con, name = "'cps5_small.csv'", 
             value = read.csv("./cps5_small.csv", sep = ",", dec = ".", header = TRUE), 
             overwrite = TRUE)
print(dbGetQuery(con, "SELECT * FROM 'cps5_small.csv' LIMIT 10;"))

   black educ exper faminc female metro midwest south  wage west
1      0   13    45      0      1     1       0     0 44.44    1
2      0   14    25  45351      1     1       1     0 16.00    0
3      0   18    27  91946      1     1       0     0 15.38    0
4      0   13    42  48370      0     1       1     0 13.54    0
5      0   13    41  10000      1     1       0     0 25.00    1
6      0   16    26 151308      1     1       0     0 24.05    0
7      0   16    11 110461      1     1       0     1 40.95    0
8      0   18    15      0      1     1       1     0 26.45    0
9      0   21    32  67084      0     1       1     0 30.76    0
10     0   14    12  14000      0     0       0     0 34.57    1

# close the connection and frees resources
dbDisconnect(con, shutdown = TRUE)

import duckdb
#
con = duckdb.connect()
con.read_csv("./cps5_small.csv")

con.sql("SELECT * FROM 'cps5_small.csv' LIMIT 10;")

┌───────┬───────┬───────┬────────┬────────┬───────┬─────────┬───────┬────────┬───────┐
│ black │ educ  │ exper │ faminc │ female │ metro │ midwest │ south │  wage  │ west  │
│ int64 │ int64 │ int64 │ int64  │ int64  │ int64 │  int64  │ int64 │ double │ int64 │
├───────┼───────┼───────┼────────┼────────┼───────┼─────────┼───────┼────────┼───────┤
│     0 │    13 │    45 │      0 │      1 │     1 │       0 │     0 │  44.44 │     1 │
│     0 │    14 │    25 │  45351 │      1 │     1 │       1 │     0 │   16.0 │     0 │
│     0 │    18 │    27 │  91946 │      1 │     1 │       0 │     0 │  15.38 │     0 │
│     0 │    13 │    42 │  48370 │      0 │     1 │       1 │     0 │  13.54 │     0 │
│     0 │    13 │    41 │  10000 │      1 │     1 │       0 │     0 │   25.0 │     1 │
│     0 │    16 │    26 │ 151308 │      1 │     1 │       0 │     0 │  24.05 │     0 │
│     0 │    16 │    11 │ 110461 │      1 │     1 │       0 │     1 │  40.95 │     0 │
│     0 │    18 │    15 │      0 │      1 │     1 │       1 │     0 │  26.45 │     0 │
│     0 │    21 │    32 │  67084 │      0 │     1 │       1 │     0 │  30.76 │     0 │
│     0 │    14 │    12 │  14000 │      0 │     0 │       0 │     0 │  34.57 │     1 │
├───────┴───────┴───────┴────────┴────────┴───────┴─────────┴───────┴────────┴───────┤
│ 10 rows                                                                 10 columns │
└────────────────────────────────────────────────────────────────────────────────────┘

# close the connection and frees resources
con.close()

The variable list is as follows:

black - $=1$ if black
educ - years of education
exper - $\rm potential\_experience = age - educ - 6$
faminc - other family income, $\$$
female - $=1$ if female
metro - $=1$ if in metropolitan area
midwest - $=1$ if midwest region
south - $=1$ if south region
wage - earnings per hour, $\$$
west - $=1$ if west region

Important

We want to identify which explanatory variables (educ, exper and other columns) affects the wage (wage column) of a person from the data sample.