Code Illustrations

Introductory Example

We first revisit the very simple two-dimensional Gaussian example we have seen a few times. However, instead of using an MCAR missingness mechanism, we introduce a MAR mechanism where

\[ P(M=m_2 \mid x)=0.8 \mathbf{1}\{ x_2 > 0\} \]

That is, \(X_1\) is missing (pattern \(m_2=(1,0)\)) with probability \(p=0.8\) whenever \(X_2 > 0\).

###Code adapted from Stef van Buuren's book
###https://stefvanbuuren.name/fimd/sec-true.html
library(mice)
library(energy)

create.X <- function(beta = 1, sigma2 = 2, n = 50,
                        run = 1) {
  # Draw X1 ~ N(0,1) and X2 = beta * X1 + N(0, sigma2)
  set.seed(seed = run)
  x <- rnorm(n)
  y <- beta * x + rnorm(n, sd = sqrt(sigma2))
  cbind(X1 = x, X2 = y)
}


set.seed(10)
n<-2000

Xstar <- create.X(run = 1, n=n)

plot(Xstar, cex=0.8, col="darkblue")

## MCAR Example
#M<-cbind(sample(c(0,1), size=nrow(Xstar), replace=T, prob = c(1-0.2,0.2) ), 0  )
#X<-Xstar
#X[M==1] <- NA
## MAR Example
M<-matrix(0, nrow=nrow(Xstar), ncol=ncol(Xstar))
M[Xstar[,2] > 0, 1] <- sample(c(0,1), size=sum(Xstar[,2] > 0), replace=T, prob = c(1-0.8,0.8) )
colnames(M) <- c("M1", "M2")


X<-Xstar
X[M==1] <- NA


head(cbind(X, M))
             X1         X2 M1 M2
[1,] -0.6264538 -1.8796586  0  0
[2,]  0.1836433 -2.5348356  0  0
[3,]         NA  1.4549741  1  0
[4,]         NA  2.3296393  1  0
[5,]         NA  0.2505240  1  0
[6,]         NA  0.1644148  1  0
head(Xstar)
             X1         X2
[1,] -0.6264538 -1.8796586
[2,]  0.1836433 -2.5348356
[3,] -0.8356286  1.4549741
[4,]  1.5952808  2.3296393
[5,]  0.3295078  0.2505240
[6,] -0.8204684  0.1644148
## (0) Mean Imputation ##

# 1. Estimate the mean
meanX1<-mean(X[!is.na(X[,1]),1])

## 2. Impute
meanimp<-X
meanimp[is.na(X[,1]),1] <-meanX1

## (1) Regression Imputation ##

# 1. Estimate Regression
lmodelX1X2<-lm(X1~X2, data=as.data.frame(X[!is.na(X[,1]),])   )

## 2. Impute
impnormpredict<-X
impnormpredict[is.na(X[,1]),1] <-predict(lmodelX1X2, newdata= as.data.frame(X[is.na(X[,1]),])  )


## (2) Gaussian Imputation ##

# 1. Estimate Regression
#lmodelX1X2<-lm(X1~X2, data=as.data.frame(X[!is.na(X[,1]),]))
# (same as before)

## 2. Impute
impnorm<-X
meanx<-predict(lmodelX1X2, newdata= as.data.frame(X[is.na(X[,1]),])  )
sigma2hat <- var(lmodelX1X2$residuals) # residual variance (avoids masking base::var)
impnorm[is.na(X[,1]),1] <-rnorm(n=length(meanx), mean = meanx, sd=sqrt(sigma2hat) )

##RMSE calculation

RMSEcalc<-function(impX){
  # Average Euclidean distance between the true and the imputed rows
  round(mean(apply(Xstar - impX,1,function(x) norm(as.matrix(x), type="F"))),2)
}

energycalc <- function(impX){
  # Energy distance (package energy) between the true and the imputed data set
  round(eqdist.e(rbind(Xstar,impX), c(nrow(Xstar), nrow(impX))),2)
}

We now recreate the plots we have seen in the first part, with some more details:

# Plot the Imputations

par(mfrow=c(2,2))


plot(meanimp[!is.na(X[,1]),c("X2","X1")], main=paste("Mean Imputation", "\nRMSE", RMSEcalc(meanimp), "\nEnergy", energycalc(meanimp)), cex=0.8, col="darkblue", cex.main=1.5)
points(meanimp[is.na(X[,1]),c("X2","X1")], col="darkred", cex=0.8 )

plot(impnormpredict[!is.na(X[,1]),c("X2","X1")], main=paste("Regression Imputation","\nRMSE", RMSEcalc(impnormpredict), "\nEnergy", energycalc(impnormpredict)), cex=0.8, col="darkblue", cex.main=1.5)
points(impnormpredict[is.na(X[,1]),c("X2","X1")], col="darkred", cex=0.8 )

plot(impnorm[!is.na(X[,1]),c("X2","X1")], main=paste("Gaussian Imputation","\nRMSE", RMSEcalc(impnorm), "\nEnergy", energycalc(impnorm)), cex=0.8, col="darkblue", cex.main=1.5)
points(impnorm[is.na(X[,1]),c("X2","X1")], col="darkred", cex=0.8 )

#plot(Xstar[,c("X2","X1")], main="Truth", col="darkblue", cex.main=1.5)
plot(Xstar[!is.na(X[,1]),c("X2","X1")], main="Truth", col="darkblue", cex.main=1.5)
points(Xstar[is.na(X[,1]),c("X2","X1")], col="darkgreen", cex=0.8 )

\[ \textbf{ Lesson I: Imputation is a Generative Approach } \]

\[ \textbf{ Lesson II: FCS might just work, but it is hard} \]

\[ \textbf{ Lesson III: Imputation should be evaluated as a Generative Approach } \]

We now try to estimate the regression coefficient when regressing \(X_2\) on \(X_1\) (exactly the reverse of the regression we used for imputation). Let's call this parameter of interest \(\theta\). In this case, the truth is \(\theta=1\).

Approach 1: MLE without imputation

Since this is a MAR example and \(\theta\) is not part of the conditional distribution of \(M \mid X\), we can use MLE here. However, since \(X_1\) is unobserved in pattern \(m_2\), those observations contribute only through the marginal distribution of \(X_2\), which under the model is \(N(0, \theta^2 + 2)\) (as \(X_2 = \theta X_1 + \varepsilon\) with \(X_1 \sim N(0,1)\) and \(\varepsilon \sim N(0,2)\)). The observed-data likelihood is thus maximized as

\[ \hat{\theta} = \arg\max_{\theta} \prod_{i=1}^n \mathcal{L}(\theta; o(X_i,M_i)) = \arg\max_{\theta} \prod_{i:M_i=m_1} \mathcal{N}(X_{2,i};\, \theta X_{1,i},\, 2) \prod_{i:M_i=m_2} \mathcal{N}(X_{2,i};\, 0,\, \theta^2 + 2). \]

This is already quite complicated to optimize analytically, so we estimate \(\theta\) numerically:

library(optimx)

X1<-X[!is.na(X[,1]),1]    # observed X1 values (pattern m1)
X2m1<-X[!is.na(X[,1]),2]  # X2 for pattern m1
X2m2<-X[is.na(X[,1]),2]   # X2 for pattern m2

f <- function(theta){
  # Negative observed-data log-likelihood
  Likm1<-sum(dnorm(X2m1, mean=theta*X1, sd=sqrt(2), log=T))
  Likm2<-sum(dnorm(X2m2, mean=0, sd=sqrt(theta^2+2), log=T))
  
  return( -(Likm1 + Likm2))
}

res <- optimx(0, fn = f, method = "BFGS", control=list(dowarn=F))

print( paste0("Correct Likelihood Optimization: ", round(res$p1,2) ) )
[1] "Correct Likelihood Optimization: 1.06"
## Complete-case analysis is biased!

lmodelX2X1<-lm(X2~X1, data=as.data.frame(X[!is.na(X[,1]),])   )
hattheta_cc <- lmodelX2X1$coefficients["X1"]

print( paste0("Complete-case analysis: ", round(hattheta_cc,2) ) )
[1] "Complete-case analysis: 0.83"

Approach 2: Imputation

We now estimate \(\theta\) using the different imputations above.

hattheta_meanimp<-lm(X2~X1, data=data.frame(meanimp))$coefficient["X1"]
hattheta_normpredict<-lm(X2~X1, data=data.frame(impnormpredict))$coefficient["X1"]
hattheta_norm<-lm(X2~X1, data=data.frame(impnorm))$coefficient["X1"]

print(paste0("Mean Imputation: ", round(hattheta_meanimp,2)) )
[1] "Mean Imputation: 0.83"
print(paste0("Mean Imputation: ", round(hattheta_normpredict,2)) )
[1] "Mean Imputation: 1.38"
print(paste0("Mean Imputation: ", round(hattheta_norm,2)) )
[1] "Mean Imputation: 1.04"

Clearly, the Gaussian imputation with random draws is the only one doing the job correctly. It is also the one that allows for multiple imputation. To demonstrate this, we regenerate the data set and create \(m=50\) imputations.

set.seed(10)
n<-2000

m<-50

Xstar <- create.X(run = 1, n=n)

## MCAR Example
#M<-cbind(sample(c(0,1), size=nrow(Xstar), replace=T, prob = c(1-0.2,0.2) ), 0  )
#X<-Xstar
#X[M==1] <- NA
## MAR Example
M<-matrix(0, nrow=nrow(Xstar), ncol=ncol(Xstar))
M[Xstar[,2] > 0, 1] <- sample(c(0,1), size=sum(Xstar[,2] > 0), replace=T, prob = c(1-0.8,0.8) )


## (2) Gaussian Imputation ##

# 1. Estimate Regression
#lmodelX1X2<-lm(X1~X2, data=as.data.frame(X[!is.na(X[,1]),]))
# (same as before)

## 2. Impute
meanx<-predict(lmodelX1X2, newdata= as.data.frame(X[is.na(X[,1]),])  )
sigma2hat <- var(lmodelX1X2$residuals)


# ##CI with only one imputation:
# impnorm<-X
# impnorm[is.na(X[,1]),1] <-rnorm(n=length(meanx), mean = meanx, sd=sqrt(sigma2hat))
# 
# ##Calculate theta for the single imputation
# model<-lm(X2~X1, data=data.frame(impnorm))
# hattheta<-model$coefficient["X1"]
# vartheta <- vcov(model)[2,2]
# 
# # CI:
# CIupper <- hattheta + qnorm(0.975)*sqrt(vartheta)
# CIlower <- hattheta - qnorm(0.975)*sqrt(vartheta)
# 
# print( paste("lower: ", round(CIlower,2), "upper: ", round(CIupper,2)  ) )

##Create m imputations
implist<-list()
hattheta_norm<-c()
varvec<-c()
for (j in 1:m){
  implist[[j]]<-X
  implist[[j]][is.na(X[,1]),1] <-rnorm(n=length(meanx), mean = meanx, sd=sqrt(sigma2hat))
  
  ##Calculate theta for each imputation
  modelj<-lm(X2~X1, data=data.frame(implist[[j]]))
  hattheta_norm[j]<-modelj$coefficient["X1"]
  varvec[j] <- vcov(modelj)[2,2]
}

## Mean Estimate:
hattheta <- mean(hattheta_norm)
# Within-imputation variance
Ubar<-mean(varvec)
# Between-imputation variance
B <- var(hattheta_norm)

Tvar <- Ubar + (1+1/m)*B

# CI:
CIupper <- hattheta + qnorm(0.975)*sqrt(Tvar)
CIlower <- hattheta - qnorm(0.975)*sqrt(Tvar)

print( paste("lower: ", round(CIlower,2), "upper: ", round(CIupper,2)  ) )
[1] "lower:  0.96 upper:  1.09"

However, notice that the Gaussian imputation we use is technically still not proper! For that, one would also need to factor in the uncertainty coming from the imputation method itself, which essentially means accounting for the sampling distribution of the estimated regression coefficients and the residual variance. A method that does this is the "norm.boot" imputation in mice. However, this is outside the scope of this workshop and I refer to the following paper.
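As a rough sketch under the setup above ("norm.boot" refits the regression on a bootstrap sample before each set of draws, thereby propagating parameter uncertainty):

library(mice)
imp <- mice(data.frame(X), m = 50, method = "norm.boot", printFlag = FALSE)
fit <- with(imp, lm(X2 ~ X1))
summary(pool(fit))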

The following two real-data examples are adapted to a large extent from the guidance in Rmisstastic_descriptive_statistics_with_missing_values:

Air Quality Data

Air pollution is currently one of the most serious public health concerns worldwide. Many epidemiological studies have demonstrated the influence that chemical compounds such as sulphur dioxide (SO2), nitrogen dioxide (NO2), and ozone (O3) can have on our health. Associations that monitor air quality are active all over the world, measuring the concentration of these pollutants.

The data set we use here is a small subset of a cleaned version of air pollution measurements in the US. For more details, I refer to Appendix C of the following paper. In this example, I actually induced the missing values, so that we have full control over the missingness mechanism and access to the true data.

We first load a real (prepared) data set:

library(mice)

## Naniar provides principled, tidy ways to summarise, visualise, and manipulate missing data with minimal deviations from the workflows in ggplot2 and tidy data.
library(naniar)
library(VIM)
library(FactoMineR)


X<- read.csv("data.csv", header=T, row.names=1)
Xstar<- read.csv("fulldata.csv", header=T, row.names=1)

head(X)
  max_PM2.5 max_O3 max_PM10  Longitude Latitude Elevation Land.Use_AGRICULTURAL
1      4.75  0.027       11 -106.58520 35.13430      1591                     0
2      7.15  0.055       25 -120.68028 38.20185        NA                     1
3      1.45  0.044        8 -105.30353 42.76697      1508                     1
4      7.00  0.058       20  -70.74802 43.07537         8                     0
5     12.00  0.017       NA -112.09577 33.50383        NA                     0
6      5.40  0.049       17  -85.89804 38.06091       135                     0
  Land.Use_COMMERCIAL Land.Use_INDUSTRIAL Location.Setting_RURAL
1                   0                   0                      0
2                   0                   0                      1
3                   0                   0                      1
4                   0                   0                      0
5                   0                   0                      0
6                   0                   0                      0
  Location.Setting_SUBURBAN
1                         0
2                         0
3                         0
4                         0
5                         0
6                         1
head(Xstar)
  max_PM2.5 max_O3 max_PM10  Longitude Latitude Elevation Land.Use_AGRICULTURAL
1      4.75  0.027       11 -106.58520 35.13430      1591                     0
2      7.15  0.055       25 -120.68028 38.20185       310                     1
3      1.45  0.044        8 -105.30353 42.76697      1508                     1
4      7.00  0.058       20  -70.74802 43.07537         8                     0
5     12.00  0.017       15 -112.09577 33.50383       343                     0
6      5.40  0.049       17  -85.89804 38.06091       135                     0
  Land.Use_COMMERCIAL Land.Use_INDUSTRIAL Location.Setting_RURAL
1                   0                   0                      0
2                   0                   0                      1
3                   0                   0                      1
4                   0                   0                      0
5                   0                   0                      0
6                   0                   0                      0
  Location.Setting_SUBURBAN
1                         0
2                         0
3                         0
4                         0
5                         0
6                         1
summary(X)
   max_PM2.5          max_O3          max_PM10        Longitude      
 Min.   :-3.300   Min.   :0.0010   Min.   :  0.00   Min.   :-124.18  
 1st Qu.: 4.100   1st Qu.:0.0330   1st Qu.:  9.00   1st Qu.:-117.33  
 Median : 6.200   Median :0.0410   Median : 15.00   Median :-106.51  
 Mean   : 7.332   Mean   :0.0413   Mean   : 18.31   Mean   :-102.33  
 3rd Qu.: 9.200   3rd Qu.:0.0490   3rd Qu.: 23.00   3rd Qu.: -88.22  
 Max.   :56.333   Max.   :0.0910   Max.   :143.00   Max.   : -68.26  
 NA's   :159      NA's   :162      NA's   :338      NA's   :159      
    Latitude       Elevation    Land.Use_AGRICULTURAL Land.Use_COMMERCIAL
 Min.   :26.05   Min.   : -14   Min.   :0.000         Min.   :0.0000     
 1st Qu.:34.49   1st Qu.:  68   1st Qu.:0.000         1st Qu.:0.0000     
 Median :38.20   Median : 269   Median :0.000         Median :0.0000     
 Mean   :38.35   Mean   : 430   Mean   :0.125         Mean   :0.3115     
 3rd Qu.:41.30   3rd Qu.: 571   3rd Qu.:0.000         3rd Qu.:1.0000     
 Max.   :48.64   Max.   :2195   Max.   :1.000         Max.   :1.0000     
                 NA's   :353                                             
 Land.Use_INDUSTRIAL Location.Setting_RURAL Location.Setting_SUBURBAN
 Min.   :0.00        Min.   :0.000          Min.   :0.000            
 1st Qu.:0.00        1st Qu.:0.000          1st Qu.:0.000            
 Median :0.00        Median :0.000          Median :0.000            
 Mean   :0.03        Mean   :0.193          Mean   :0.428            
 3rd Qu.:0.00        3rd Qu.:0.000          3rd Qu.:1.000            
 Max.   :1.00        Max.   :1.000          Max.   :1.000            
                                                                     

1) Descriptive statistics with missing values

We start by inspecting various plots for the missing values:


res<-summary(aggr(X, sortVar=TRUE))$combinations


 Variables sorted by number of missings: 
                  Variable  Count
                 Elevation 0.1765
                  max_PM10 0.1690
                    max_O3 0.0810
                 max_PM2.5 0.0795
                 Longitude 0.0795
                  Latitude 0.0000
     Land.Use_AGRICULTURAL 0.0000
       Land.Use_COMMERCIAL 0.0000
       Land.Use_INDUSTRIAL 0.0000
    Location.Setting_RURAL 0.0000
 Location.Setting_SUBURBAN 0.0000
res[rev(order(res[,2])),]
           Combinations Count Percent
1 0:0:0:0:0:0:0:0:0:0:0  1008   50.40
5 0:0:1:0:0:1:0:0:0:0:0   179    8.95
2 0:0:0:0:0:1:0:0:0:0:0   174    8.70
6 0:1:0:0:0:0:0:0:0:0:0   162    8.10
7 1:0:0:0:0:0:0:0:0:0:0   159    7.95
4 0:0:1:0:0:0:0:0:0:0:0   159    7.95
3 0:0:0:1:0:0:0:0:0:0:0   159    7.95
#The VIM function matrixplot creates a matrix plot in which all cells of a data matrix
#are visualized by rectangles. Available data is coded according to a continuous color
#scheme (gray scale), while missing/imputed data is visualized by a clearly
#distinguishable color (red). If you use RStudio the plot is not interactive (hence the
#warnings), but if you use R directly, you can click on a column of your choice: the
#rows are then sorted in decreasing order of the values of this column. This is useful
#to check if there is an association between the value of a variable and the
#missingness of another one.

Creating the res variable renders a nice plot showing the percentage of missing values for each variable. Moreover, the next command nicely shows the patterns (M), as well as their frequency of occurrence in the data set. We can further visualize the patterns using the matrixplot function:

matrixplot(X, sortby = 3)

The VIM function marginplot creates a scatterplot with additional information on the missing values. If you plot the variables (x, y), the points with no missing values are represented as in a standard scatterplot. The points for which x (resp. y) is missing are represented in red along the y (resp. x) axis. In addition, boxplots of the x and y variables are represented along the axes, with and without missing values (in red, all values of x where y is missing; in blue, all values of x where y is observed).

marginplot(X[,2:3])

This plot can be used to check whether MCAR might hold. Under MCAR, the distribution of a variable should be the same whether or not another variable is missing. Under MAR this can be violated, as we have seen (distribution shifts!), so this plot provides a convenient first check.
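Beyond visual inspection, Little's test offers a formal check of MCAR. A minimal sketch, assuming the installed naniar version exports mcar_test:

# Little's (1988) chi-squared test; the null hypothesis is that the data are MCAR
mcar_test(as.data.frame(X))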

There are many more plotting possibilities with VIM, as demonstrated e.g., in 2012ADAC.pdf (tuwien.ac.at).
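For instance, a sketch of one more VIM display (pbox shows boxplots of one variable split by the missingness of the others):

# Boxplots of max_O3, split by missingness in max_PM10
pbox(X[, c("max_O3", "max_PM10")], pos = 1)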

2) Imputation

We now finally use the mice package for imputation. We consider several methods and start by choosing the best one according to the new I-Score. As the best version of the score evaluates an imputation method on this dataset rather than a single imputation, we need to define a function for each method:

library(mice)
source("Iscore.R")

X<-as.matrix(X)

methods<-c("pmm",      # mice-pmm
           "cart",     # mice-cart
           "rf",       #mice-rf
           "sample",   #mice-sample
           "norm.nob") # Gaussian Imputation


imputationfuncs<-list()
for (method in methods){
  ## Probably not the smartest solution
  imputationfuncs[[method]][[1]] <- method
  imputationfuncs[[method]][[2]] <- function(X, m, method, maxit=5){
    tmp<-mice(X, m = m, method = method, printFlag = F, visitSequence="arabic", maxit = maxit)
    return(mice::complete(tmp, action="all"))
  }
}


scoreslist <- Iscores_new(X,imputations=NULL,score="mIScore2", imputationfuncs=imputationfuncs, N=30)  

scores<-do.call(cbind,lapply(scoreslist, function(x) x$score ))
names(scores)<-methods
scores[order(scores)]

The score considers mice-cart to be the best method. As a side note, mice-rf is deemed second best and might have better properties for uncertainty estimation and multiple imputation, so both should be considered. Here, we go with mice-cart:

imp.mice <- mice(X, m = 10, method = "cart", printFlag = F)

Since we have the true data in this case, we can analyze the imputation method a bit more closely:

## This here is not possible without the fully observed data ###
Ximp<-mice::complete(imp.mice)

index1<-1
index2<-2

par(mfrow=c(1,2))
plot(Xstar[is.na(X[,index1]) | is.na(X[,index2]),c(index1,index2)])
plot(Ximp[is.na(X[,index1]) | is.na(X[,index2]),c(index1,index2)])

# Replicating first and second moments
colMeans(Xstar)-colMeans(Ximp)
                max_PM2.5                    max_O3                  max_PM10 
             -0.008211667               0.000190000              -0.130166667 
                Longitude                  Latitude                 Elevation 
             -0.038610589               0.000000000               0.166295199 
    Land.Use_AGRICULTURAL       Land.Use_COMMERCIAL       Land.Use_INDUSTRIAL 
              0.000000000               0.000000000               0.000000000 
   Location.Setting_RURAL Location.Setting_SUBURBAN 
              0.000000000               0.000000000 
norm(cov(Xstar) - cov(Ximp))/norm(cov(Xstar))
[1] 0.006222356
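Without access to the true data, one would instead rely on mice's built-in lattice-based diagnostics, for example:

# Observed (blue) versus imputed (red) marginal densities
densityplot(imp.mice)
# Imputed values of max_PM10 across the m imputations
stripplot(imp.mice, max_PM10 ~ .imp, pch = 20)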

3) Analysis

# Apply the regression to each of the multiple imputations
lm.mice.out <- with(imp.mice, lm(max_O3 ~ max_PM2.5 +Longitude +Latitude +Elevation +Land.Use_AGRICULTURAL+Land.Use_COMMERCIAL+Land.Use_INDUSTRIAL+Location.Setting_RURAL+Location.Setting_SUBURBAN))


# Use Rubin's rules to aggregate the estimates
res<-pool(lm.mice.out)
summary(res)
                        term      estimate    std.error   statistic        df
1                (Intercept)  3.921780e-02 3.795853e-03 10.33174933  738.5496
2                  max_PM2.5  1.243004e-04 5.515270e-05  2.25374925  624.5218
3                  Longitude -1.052071e-05 2.064805e-05 -0.50952566  925.0302
4                   Latitude -5.239621e-06 7.496017e-05 -0.06989874  690.3293
5                  Elevation -1.120471e-06 6.984538e-07 -1.60421579 1180.1607
6      Land.Use_AGRICULTURAL -1.788547e-04 1.445738e-03 -0.12371167  749.1631
7        Land.Use_COMMERCIAL  6.776164e-04 7.146750e-04  0.94814623  660.7492
8        Land.Use_INDUSTRIAL -1.979572e-03 1.940207e-03 -1.02028919  232.4925
9     Location.Setting_RURAL  1.879744e-03 1.243417e-03  1.51175635  365.6486
10 Location.Setting_SUBURBAN  8.833360e-04 7.031050e-04  1.25633580  543.8848
        p.value
1  1.832133e-23
2  2.455788e-02
3  6.105053e-01
4  9.442945e-01
5  1.089341e-01
6  9.015768e-01
7  3.434017e-01
8  3.086518e-01
9  1.314596e-01
10 2.095336e-01

Importantly, this works here because we have all the ingredients for the pool function, which are (according to ?pool):

  • the estimates of the model;

  • the standard error of each estimate;

  • the residual degrees of freedom of the model.
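These ingredients can be inspected directly on any one of the fitted analyses stored in the mira object returned by with (a sketch):

fit1 <- lm.mice.out$analyses[[1]]  # first of the m fitted models
coef(fit1)                         # the estimates
sqrt(diag(vcov(fit1)))             # their standard errors
df.residual(fit1)                  # residual degrees of freedom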

Just to double check, we also perform the regression on \(X^*\):

## This here is not possible without the fully observed data ###
res.not.attainable<-lm(max_O3 ~ max_PM2.5 +Longitude +Latitude +Elevation +Land.Use_AGRICULTURAL+Land.Use_COMMERCIAL+Land.Use_INDUSTRIAL+Location.Setting_RURAL+Location.Setting_SUBURBAN, data=as.data.frame(Xstar))

summary(res.not.attainable)

Call:
lm(formula = max_O3 ~ max_PM2.5 + Longitude + Latitude + Elevation + 
    Land.Use_AGRICULTURAL + Land.Use_COMMERCIAL + Land.Use_INDUSTRIAL + 
    Location.Setting_RURAL + Location.Setting_SUBURBAN, data = as.data.frame(Xstar))

Residuals:
      Min        1Q    Median        3Q       Max 
-0.043412 -0.008181 -0.000635  0.007901  0.049148 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                4.092e-02  3.627e-03  11.282   <2e-16 ***
max_PM2.5                  1.305e-04  5.333e-05   2.447   0.0145 *  
Longitude                  1.314e-08  1.988e-05   0.001   0.9995    
Latitude                  -2.495e-05  7.142e-05  -0.349   0.7269    
Elevation                 -9.333e-07  6.769e-07  -1.379   0.1681    
Land.Use_AGRICULTURAL      3.083e-04  1.382e-03   0.223   0.8235    
Land.Use_COMMERCIAL        5.986e-04  6.794e-04   0.881   0.3784    
Land.Use_INDUSTRIAL       -1.777e-03  1.751e-03  -1.015   0.3103    
Location.Setting_RURAL     1.883e-03  1.152e-03   1.634   0.1023    
Location.Setting_SUBURBAN  8.635e-04  6.632e-04   1.302   0.1931    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.01288 on 1990 degrees of freedom
Multiple R-squared:  0.007379,  Adjusted R-squared:  0.00289 
F-statistic: 1.644 on 9 and 1990 DF,  p-value: 0.09762
cbind(round( (res$pooled$estimate-res.not.attainable$coefficients)/res.not.attainable$coefficients, 3)  )
                              [,1]
(Intercept)                 -0.042
max_PM2.5                   -0.047
Longitude                 -801.369
Latitude                    -0.790
Elevation                    0.201
Land.Use_AGRICULTURAL       -1.580
Land.Use_COMMERCIAL          0.132
Land.Use_INDUSTRIAL          0.114
Location.Setting_RURAL      -0.002
Location.Setting_SUBURBAN    0.023

Note that the huge relative difference for Longitude simply reflects that its true coefficient is essentially zero, so the relative error is not meaningful there. This is a very simple i.i.d. example of the kind we have treated throughout this workshop. Of course there are many more challenges, especially for data that are partly dependent (for instance repeated measurements or panel data). Please refer to the provided links for more information, in particular the CRAN Task View on Missing Data.