This is another piece of new knowledge I picked up after attending a Predictive Analysis course at NUS.

Logistic Regression. It is used to predict the odds of being a case based on the values of the independent variable (predictors). The Odds are defined as the probability that a particular outcome is a case divided by the probability that it is a noncase. (Wikipedia)

odds = p / (1 - p)

(Note: p/(1-p) is the odds of an outcome; an odds *ratio* is the ratio of two such odds.)

Below is the take home exercise given to me and the code.

data source

Case :

Logistic Regression. It is used to predict the odds of being a case based on the values of the independent variable (predictors). The Odds are defined as the probability that a particular outcome is a case divided by the probability that it is a noncase. (Wikipedia)

odds = p / (1 - p)

(Note: p/(1-p) is the odds of an outcome; an odds *ratio* is the ratio of two such odds.)

Below is the take home exercise given to me and the code.

data source

Case :

I don't plan to spend too much time discussing the preliminary data assessment and the basic descriptive data analysis we need to do before proceeding to any deeper data analysis. It is common sense.

# Note: the script was tested working (1/10/2019)
# Reference: https://stats.idre.ucla.edu/r/dae/logit-regression/
# Reference: https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/
# Applicable to standard logistic regression analysis
# Not tested on any real case

# Set the working directory (edit the path for your machine)
setwd("set your own working directory")

# Load (and, if needed, install) the required libraries via pacman
pacman::p_load(
  tidyverse, lubridate, zoo, forecast, fUnitRoots, readxl, fpp2,
  ggplot2, Ecdat, caret, ggpubr, rpart, caTools, MASS, ROCR,
  oddsratio, broom
)

# Load the data from the working directory into the Raw_Data frame,
# then convert the qualitative columns to factors
Raw_Data <- read.csv("BURN1000_1.csv", header = TRUE)
cols <- c("Fac_Class", "DEATH", "RACEC", "INH_INJ", "FLAME", "GENDER")
Raw_Data[, cols] <- lapply(Raw_Data[, cols], factor)

# Split the data into train and test sets with a 70:30 ratio.
# set.seed() makes the split reproducible; sample.split() (caTools)
# stratifies on DEATH so both sets keep the same outcome proportion.
set.seed(1)
split_values <- sample.split(Raw_Data$DEATH, SplitRatio = 0.7)
# Use TRUE/FALSE rather than the reassignable shorthands T/F
train_set <- subset(Raw_Data, split_values == TRUE)
test_set <- subset(Raw_Data, split_values == FALSE)

# Perform a quick model assessment without interactions:
# a main-effects-only formula to screen which predictors matter
factor_screening <- DEATH ~ AGE + RACEC + TBSA + FLAME + GENDER +
  Fac_Class + INH_INJ

# Fit the main-effects logistic model on the full data set and
# inspect the coefficient summary
# NOTE(review): this screening fit uses Raw_Data, not train_set —
# presumably intentional for screening; confirm.
Main_Factor <- glm(factor_screening, data = Raw_Data, family = "binomial")
summary(Main_Factor)

# Run a full factorial GLM model for a general overview: all main
# effects plus two-way interactions.
# NOTE(review): the FLAME:GENDER, FLAME:Fac_Class and FLAME:INH_INJ
# interactions are not in this list — confirm whether that is intentional.
Full_Term <- DEATH ~ AGE + RACEC + TBSA + FLAME + GENDER + Fac_Class +
  INH_INJ +
  AGE:RACEC + AGE:TBSA + AGE:FLAME + AGE:GENDER + AGE:Fac_Class +
  AGE:INH_INJ +
  RACEC:TBSA + RACEC:FLAME + RACEC:GENDER + RACEC:Fac_Class +
  RACEC:INH_INJ +
  TBSA:FLAME + TBSA:GENDER + TBSA:Fac_Class + TBSA:INH_INJ +
  GENDER:Fac_Class + GENDER:INH_INJ +
  Fac_Class:INH_INJ

Full_Model <- glm(Full_Term, data = train_set, family = "binomial")
summary(Full_Model)

# Run stepAIC (backward elimination) to reach the model with the
# minimum AIC value; uncomment the line below to re-run the search.
# NOTE(review): presumably this search produced the simplified terms
# used further down — confirm.
# stepAIC(Full_Model, direction = 'backward')

# Simplified model terms: the surviving main effects plus the three
# retained interactions
Sim_Term_1 <- DEATH ~ AGE + RACEC + TBSA + FLAME + GENDER + INH_INJ +
  AGE:INH_INJ + RACEC:FLAME + TBSA:INH_INJ

# Model training: fit the simplified model on the training set
Sim_Model_1 <- glm(Sim_Term_1, data = train_set, family = "binomial")
summary(Sim_Model_1)

# Odds ratios for the chosen increment of each predictor
or_glm(
  data = train_set,
  model = Sim_Model_1,
  incr = list(AGE = 10, FLAME = 1, TBSA = 10, INH_INJ = 1)
)

# Model validation with the training data set: predicted probabilities
predict1 <- predict(Sim_Model_1, train_set, type = "response")
summary(predict1)
train_set$prob1 <- predict1
sapply(train_set, class)

# Classify the training predictions with a 0.5 probability cutoff
# and express the result as a factor for confusionMatrix()
x1 <- as.factor(ifelse(predict1 > 0.5, 1, 0))

# Confusion matrix for the training predictions at the 0.5 cutoff
con_mx_train <- confusionMatrix(data = x1, reference = train_set$DEATH)

# Generate a table of precision / recall / F1 across probability
# cutoffs 0.1-0.8 so the cutoff with the optimum F1 score can be read
# off. Rows are built with lapply() + rbind instead of growing a
# data.frame inside the loop (O(n^2) append), and the metrics are
# extracted from caret's byClass vector by NAME rather than by the
# magic row indices 5/6/7, which would silently break if caret
# reordered its statistics.
list1 <- seq_len(8) / 10
F1_Tb_Train <- do.call(rbind, lapply(list1, function(i) {
  pred_class <- as.factor(ifelse(predict1 > i, 1, 0))
  cm <- confusionMatrix(data = pred_class, reference = train_set$DEATH)
  stats <- cm$byClass
  data.frame(
    i = i,
    Precision = stats[["Precision"]],
    Recall = stats[["Recall"]],
    F1 = stats[["F1"]]
  )
}))

# Round the metrics table for display
F1_Tb_Train <- round(F1_Tb_Train, 4)

# Plot the F1 score against the cutoff. Use bare column names inside
# aes() instead of F1_Tb_Train$i / F1_Tb_Train$F1: df$col in aes()
# bypasses ggplot2's data masking and breaks facetting/layer reuse.
p1 <- ggplot(F1_Tb_Train, aes(x = i, y = F1)) +
  geom_line(color = "blue") +
  labs(x = "i", y = "F1 Score", title = "F1 Score chart")

# Model validation with the test data set
predict2 <- predict(Sim_Model_1, test_set, type = "response")
summary(predict2)
test_set$prob1 <- predict2
sapply(test_set, class)

# Confusion matrix at the 0.5 probability cutoff
x3 <- as.factor(ifelse(predict2 > 0.5, 1, 0))
con_mx_test <- confusionMatrix(data = x3, reference = test_set$DEATH)

# Generate a table to find the optimum F1 score on the test set.
# As for the training table: build rows with lapply() + rbind rather
# than growing a data.frame in a loop, and extract caret's byClass
# statistics by name instead of magic row indices 5/6/7.
list1 <- seq_len(8) / 10
F1_Tb_Test <- do.call(rbind, lapply(list1, function(i) {
  pred_class <- as.factor(ifelse(predict2 > i, 1, 0))
  cm <- confusionMatrix(data = pred_class, reference = test_set$DEATH)
  stats <- cm$byClass
  data.frame(
    i = i,
    Precision = stats[["Precision"]],
    Recall = stats[["Recall"]],
    F1 = stats[["F1"]]
  )
}))
F1_Tb_Test <- round(F1_Tb_Test, 4)

# Plot the F1 score chart (bare column names in aes(), not df$col)
p2 <- ggplot(F1_Tb_Test, aes(x = i, y = F1)) +
  geom_line(color = "blue") +
  labs(x = "i", y = "F1 Score", title = "F1 Score chart")
ggarrange(p1, p2, ncol = 2, nrow = 1)

# Application
# The file below is specially created for prediction purposes: the
# user keys in the input variables and the script scores them with
# the fitted model.
Application <- read.csv("BURN1000_20_80_1.csv", header = TRUE)

# Define the data type of each qualitative column
cols <- c("Fac_Class", "DEATH", "RACEC", "INH_INJ", "FLAME", "GENDER")
Application[, cols] <- lapply(Application[, cols], factor)

# Calculate the predicted probability and the odds, p / (1 - p)
Predict_App <- predict(Sim_Model_1, Application, type = "response")
summary(Predict_App)
Application$prob1 <- Predict_App
sapply(Application, class)
Application$Odd_Ratio <- Predict_App / (1 - Predict_App)

Result