9.4 Exercises Solutions

Exercise 9.4 Exercise 1 Solution

# Load the Titanic CSV. Empty fields are read as NA and text columns are kept
# as character vectors rather than converted to factors.
data <- read.csv(
    file = "./Data/titanic3.csv",
    na.strings = "",
    stringsAsFactors = FALSE
)
# Print one line for every column of `df` that contains missing values.
#
# Args:
#   df: a data frame to inspect.
#
# Output: for each column with at least one NA, prints
#   "The column <name> have <count> missing values".
#   Columns without NAs produce no output. Returns NULL invisibly.
n_nan_print <- function(df) {
    # Take the names from the argument itself, not from a global object, so
    # the function reports correctly for any data frame passed in.
    col_names <- colnames(df)
    # seq_len() yields an empty sequence for a zero-column data frame,
    # whereas 1:ncol(df) would iterate over c(1, 0).
    for (i in seq_len(ncol(df))) {
        n_missing <- sum(is.na(df[[i]]))
        if (n_missing > 0) {
            print(paste("The column", col_names[i], "have", n_missing, "missing values"))
        }
    }
}

# Report the columns of the loaded data set that contain missing values.
n_nan_print(df = data)

## [1] "The column age have 263 missing values"
## [1] "The column fare have 1 missing values"
## [1] "The column cabin have 1014 missing values"
## [1] "The column embarked have 2 missing values"
## [1] "The column boat have 823 missing values"
## [1] "The column body have 1188 missing values"
## [1] "The column home.dest have 564 missing values"

Exercise 9.5 Exercise 2 Solution

# Drop columns with too many missing values, then drop incomplete rows.
#
# Args:
#   df:  a data frame.
#   per: threshold in [0, 1]; a column is kept only when its fraction of
#        missing values is strictly below `per` (default 0.8).
#
# Returns: a data frame holding the kept columns and only the rows that have
#   no NA in any of those columns (as produced by na.omit()).
remove_mv <- function(df, per = 0.8) {
    missing_frac <- colMeans(is.na(df))
    # With zero rows the mean is NaN; there is nothing missing, so keep the
    # column instead of letting an NA mask break the subsetting below.
    missing_frac[is.nan(missing_frac)] <- 0
    keep <- missing_frac < per
    # drop = FALSE keeps the result a data frame even when a single column
    # survives; without it the result collapses to a bare vector.
    df_new <- df[, keep, drop = FALSE]
    na.omit(df_new)
}

# Apply a 40% missing-value threshold, then show the dimensions of the
# original and the cleaned data side by side.
z <- remove_mv(data, per = 0.4)
c(dim(data), dim(z))

## [1] 1309   14 1043   10

Exercise 9.6 Exercise 3 Solution

# Impute missing values on a copy so the original `data` stays unchanged.
dataImputed <- data
# Missing `body` entries are replaced with 0.
dataImputed$body[is.na(dataImputed$body)] <- 0
# Missing `home.dest` entries are replaced with the label "Unknown".
dataImputed$home.dest[is.na(dataImputed$home.dest)] <- "Unknown"
# Missing ages are replaced with the rounded mean of the observed ages.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
dataImputed$age[is.na(dataImputed$age)] <- round(mean(data$age, na.rm = TRUE))
# NOTE(review): datatable() is presumably DT::datatable; the package load is
# not visible in this section -- confirm.
datatable(dataImputed)