9.4 Exercise Solutions
Exercise 9.4 Exercise 1 Solution
# Load the Titanic data set. Empty fields are read as NA, and text columns
# are kept as character vectors rather than converted to factors.
data <- read.csv(
  "./Data/titanic3.csv",
  stringsAsFactors = FALSE,
  na.strings = ""
)
#' Print the number of missing values for every column that has any
#'
#' Columns without missing values are skipped. One line is printed per
#' reported column via `print()`.
#'
#' @param df A data frame to inspect.
#' @return `NULL`, invisibly; the function is called for its printed output.
n_nan_print <- function(df) {
  # Take the names from the argument itself, not from a global object, so
  # the report is correct for whatever data frame is passed in.
  col_names <- colnames(df)
  # seq_len() yields an empty sequence for a zero-column input, whereas
  # 1:ncol(df) would iterate over c(1, 0).
  for (i in seq_len(ncol(df))) {
    # Count once and reuse the value for both the check and the message.
    n_missing <- sum(is.na(df[, i]))
    if (n_missing > 0) {
      print(paste(
        "The column", col_names[i], "have", n_missing, "missing values"
      ))
    }
  }
  invisible(NULL)
}
# Report the columns of `data` that contain missing values.
# The lines starting with `##` are the recorded console output.
n_nan_print(data)
## [1] "The column age have 263 missing values"
## [1] "The column fare have 1 missing values"
## [1] "The column cabin have 1014 missing values"
## [1] "The column embarked have 2 missing values"
## [1] "The column boat have 823 missing values"
## [1] "The column body have 1188 missing values"
## [1] "The column home.dest have 564 missing values"
Exercise 9.5 Exercise 2 Solution
#' Drop sparse columns, then drop rows that still contain missing values
#'
#' First removes every column whose fraction of missing values is `per` or
#' more, then removes every remaining row that has at least one `NA`.
#'
#' @param df A data frame.
#' @param per Threshold on the per-column fraction of missing values;
#'   columns with a fraction strictly below `per` are kept. Defaults to 0.8.
#' @return A data frame holding the retained columns and only complete rows
#'   (as returned by `na.omit()`).
remove_mv <- function(df, per = 0.8) {
  # Fraction of missing entries per column. max() guards the zero-row case
  # so the division never becomes 0 / 0.
  na_fraction <- colSums(is.na(df)) / max(nrow(df), 1L)
  # drop = FALSE keeps the result a data frame even when only one column
  # survives; without it the selection collapses to a bare vector.
  df_new <- df[, na_fraction < per, drop = FALSE]
  na.omit(df_new)
}
# Run remove_mv on the full data set with per = 0.4, then show the
# dimensions before and after side by side (rows, columns, rows, columns).
z <- remove_mv(data, per = 0.4)
c(dim(data), dim(z))
## [1] 1309 14 1043 10
Exercise 9.6 Exercise 3 Solution
# Impute missing values column by column on a copy, leaving `data` untouched.
dataImputed <- data

# body: missing entries become 0.
dataImputed$body[is.na(dataImputed$body)] <- 0

# home.dest: missing entries become the literal label "Unknown".
dataImputed$home.dest[is.na(dataImputed$home.dest)] <- "Unknown"

# age: missing entries become the mean of the observed ages, rounded to a
# whole number. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
dataImputed$age[is.na(dataImputed$age)] <- round(mean(data$age, na.rm = TRUE))

# Display the imputed table. `datatable` is not defined in this section;
# presumably DT::datatable loaded elsewhere -- TODO confirm.
datatable(dataImputed)