library(bayesrules)
library(rstanarm)
library(bayesplot)
library(tidyverse)
library(tidybayes)
library(broom.mixed)
library(janitor)
library(pROC)
data <- read.csv("/Users/ruilinwu/Desktop/Stats/stats115/final_project/breast_cancer/breast-cancer.data",
header = FALSE,
na.strings = "?")
colnames(data) <- c("Class", "age", "menopause", "tumorsize", "invnodes",
"nodecaps", "degmalig", "breast", "breastquad", "irradiat")
head(data)
## Class age menopause tumorsize invnodes nodecaps degmalig
## 1 no-recurrence-events 30-39 premeno 30-34 0-2 no 3
## 2 no-recurrence-events 40-49 premeno 20-24 0-2 no 2
## 3 no-recurrence-events 40-49 premeno 20-24 0-2 no 2
## 4 no-recurrence-events 60-69 ge40 15-19 0-2 no 2
## 5 no-recurrence-events 40-49 premeno 0-4 0-2 no 2
## 6 no-recurrence-events 60-69 ge40 15-19 0-2 no 2
## breast breastquad irradiat
## 1 left left_low no
## 2 right right_up no
## 3 left left_low no
## 4 right left_up no
## 5 right right_low no
## 6 left left_low no
summary(data)
## Class age menopause tumorsize
## Length:286 Length:286 Length:286 Length:286
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## invnodes nodecaps degmalig breast
## Length:286 Length:286 Min. :1.000 Length:286
## Class :character Class :character 1st Qu.:2.000 Class :character
## Mode :character Mode :character Median :2.000 Mode :character
## Mean :2.049
## 3rd Qu.:3.000
## Max. :3.000
## breastquad irradiat
## Length:286 Length:286
## Class :character Class :character
## Mode :character Mode :character
##
##
##
#View(data)
na_per_column <- colSums(is.na(data))
na_per_column
## Class age menopause tumorsize invnodes nodecaps degmalig
## 0 0 0 0 0 8 0
## breast breastquad irradiat
## 0 1 0
data_complete <- na.omit(data)
# Check the structure of the dataset
str(data_complete)
## 'data.frame': 277 obs. of 10 variables:
## $ Class : chr "no-recurrence-events" "no-recurrence-events" "no-recurrence-events" "no-recurrence-events" ...
## $ age : chr "30-39" "40-49" "40-49" "60-69" ...
## $ menopause : chr "premeno" "premeno" "premeno" "ge40" ...
## $ tumorsize : chr "30-34" "20-24" "20-24" "15-19" ...
## $ invnodes : chr "0-2" "0-2" "0-2" "0-2" ...
## $ nodecaps : chr "no" "no" "no" "no" ...
## $ degmalig : int 3 2 2 2 2 2 2 1 2 2 ...
## $ breast : chr "left" "right" "left" "right" ...
## $ breastquad: chr "left_low" "right_up" "left_low" "left_up" ...
## $ irradiat : chr "no" "no" "no" "no" ...
## - attr(*, "na.action")= 'omit' Named int [1:9] 146 164 165 184 185 207 234 264 265
## ..- attr(*, "names")= chr [1:9] "146" "164" "165" "184" ...
# Get a summary of each variable
summary(data_complete)
## Class age menopause tumorsize
## Length:277 Length:277 Length:277 Length:277
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## invnodes nodecaps degmalig breast
## Length:277 Length:277 Min. :1.000 Length:277
## Class :character Class :character 1st Qu.:2.000 Class :character
## Mode :character Mode :character Median :2.000 Mode :character
## Mean :2.058
## 3rd Qu.:3.000
## Max. :3.000
## breastquad irradiat
## Length:277 Length:277
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Frequency tables for each variable
lapply(data_complete, table)
## $Class
##
## no-recurrence-events recurrence-events
## 196 81
##
## $age
##
## 20-29 30-39 40-49 50-59 60-69 70-79
## 1 36 89 91 55 5
##
## $menopause
##
## ge40 lt40 premeno
## 123 5 149
##
## $tumorsize
##
## 0-4 10-14 15-19 20-24 25-29 30-34 35-39 40-44 45-49 5-9 50-54
## 8 28 29 48 51 57 19 22 3 4 8
##
## $invnodes
##
## 0-2 12-14 15-17 24-26 3-5 6-8 9-11
## 209 3 6 1 34 17 7
##
## $nodecaps
##
## no yes
## 221 56
##
## $degmalig
##
## 1 2 3
## 66 129 82
##
## $breast
##
## left right
## 145 132
##
## $breastquad
##
## central left_low left_up right_low right_up
## 21 106 94 23 33
##
## $irradiat
##
## no yes
## 215 62
#only 1 invnodes 24-26
data_complete <- data_complete %>%
filter(invnodes != "24-26")
#only 1 age 20-29
data_complete <- data_complete %>%
filter(age != "20-29")
library(ggplot2)
ggplot(data_complete, aes(x = Class)) +
geom_bar(fill = "skyblue", color = "black") +
labs(title = "Distribution of Recurrence Events",
x = "Class",
y = "Count")

# Cross-tabulation between Class and age
age_class_table <- table(data_complete$Class, data_complete$age)
print(age_class_table)
##
## 30-39 40-49 50-59 60-69 70-79
## no-recurrence-events 21 62 69 38 5
## recurrence-events 15 27 22 16 0
data_complete$Class <- factor(data_complete$Class)
data_complete$age <- factor(data_complete$age)
data_complete$menopause <- factor(data_complete$menopause)
data_complete$tumorsize <- factor(data_complete$tumorsize)
data_complete$invnodes <- factor(data_complete$invnodes)
data_complete$nodecaps <- factor(data_complete$nodecaps)
data_complete$breast <- factor(data_complete$breast)
data_complete$breastquad<- factor(data_complete$breastquad)
data_complete$irradiat <- factor(data_complete$irradiat)
# Fit the Bayesian logistic regression model
breast_model <- stan_glm(Class ~ age + menopause + tumorsize + invnodes +
nodecaps + degmalig + breast + breastquad + irradiat,
data = data_complete,
family = binomial(link = "logit"),
prior = normal(location = 0, scale = 2.5),
prior_intercept = normal(location = 0, scale = 5),
chains = 4,
iter = 10000,
seed = 12345)
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.000367 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 3.67 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 10000 [ 0%] (Warmup)
## Chain 1: Iteration: 1000 / 10000 [ 10%] (Warmup)
## Chain 1: Iteration: 2000 / 10000 [ 20%] (Warmup)
## Chain 1: Iteration: 3000 / 10000 [ 30%] (Warmup)
## Chain 1: Iteration: 4000 / 10000 [ 40%] (Warmup)
## Chain 1: Iteration: 5000 / 10000 [ 50%] (Warmup)
## Chain 1: Iteration: 5001 / 10000 [ 50%] (Sampling)
## Chain 1: Iteration: 6000 / 10000 [ 60%] (Sampling)
## Chain 1: Iteration: 7000 / 10000 [ 70%] (Sampling)
## Chain 1: Iteration: 8000 / 10000 [ 80%] (Sampling)
## Chain 1: Iteration: 9000 / 10000 [ 90%] (Sampling)
## Chain 1: Iteration: 10000 / 10000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.612 seconds (Warm-up)
## Chain 1: 0.574 seconds (Sampling)
## Chain 1: 1.186 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 7e-06 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.07 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 10000 [ 0%] (Warmup)
## Chain 2: Iteration: 1000 / 10000 [ 10%] (Warmup)
## Chain 2: Iteration: 2000 / 10000 [ 20%] (Warmup)
## Chain 2: Iteration: 3000 / 10000 [ 30%] (Warmup)
## Chain 2: Iteration: 4000 / 10000 [ 40%] (Warmup)
## Chain 2: Iteration: 5000 / 10000 [ 50%] (Warmup)
## Chain 2: Iteration: 5001 / 10000 [ 50%] (Sampling)
## Chain 2: Iteration: 6000 / 10000 [ 60%] (Sampling)
## Chain 2: Iteration: 7000 / 10000 [ 70%] (Sampling)
## Chain 2: Iteration: 8000 / 10000 [ 80%] (Sampling)
## Chain 2: Iteration: 9000 / 10000 [ 90%] (Sampling)
## Chain 2: Iteration: 10000 / 10000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.612 seconds (Warm-up)
## Chain 2: 0.627 seconds (Sampling)
## Chain 2: 1.239 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 7e-06 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.07 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 10000 [ 0%] (Warmup)
## Chain 3: Iteration: 1000 / 10000 [ 10%] (Warmup)
## Chain 3: Iteration: 2000 / 10000 [ 20%] (Warmup)
## Chain 3: Iteration: 3000 / 10000 [ 30%] (Warmup)
## Chain 3: Iteration: 4000 / 10000 [ 40%] (Warmup)
## Chain 3: Iteration: 5000 / 10000 [ 50%] (Warmup)
## Chain 3: Iteration: 5001 / 10000 [ 50%] (Sampling)
## Chain 3: Iteration: 6000 / 10000 [ 60%] (Sampling)
## Chain 3: Iteration: 7000 / 10000 [ 70%] (Sampling)
## Chain 3: Iteration: 8000 / 10000 [ 80%] (Sampling)
## Chain 3: Iteration: 9000 / 10000 [ 90%] (Sampling)
## Chain 3: Iteration: 10000 / 10000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.614 seconds (Warm-up)
## Chain 3: 0.638 seconds (Sampling)
## Chain 3: 1.252 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9e-06 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 10000 [ 0%] (Warmup)
## Chain 4: Iteration: 1000 / 10000 [ 10%] (Warmup)
## Chain 4: Iteration: 2000 / 10000 [ 20%] (Warmup)
## Chain 4: Iteration: 3000 / 10000 [ 30%] (Warmup)
## Chain 4: Iteration: 4000 / 10000 [ 40%] (Warmup)
## Chain 4: Iteration: 5000 / 10000 [ 50%] (Warmup)
## Chain 4: Iteration: 5001 / 10000 [ 50%] (Sampling)
## Chain 4: Iteration: 6000 / 10000 [ 60%] (Sampling)
## Chain 4: Iteration: 7000 / 10000 [ 70%] (Sampling)
## Chain 4: Iteration: 8000 / 10000 [ 80%] (Sampling)
## Chain 4: Iteration: 9000 / 10000 [ 90%] (Sampling)
## Chain 4: Iteration: 10000 / 10000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.616 seconds (Warm-up)
## Chain 4: 0.673 seconds (Sampling)
## Chain 4: 1.289 seconds (Total)
## Chain 4:
neff_ratio(breast_model)
## (Intercept) age40-49 age50-59 age60-69
## 0.51680 0.74220 0.62620 0.59875
## age70-79 menopauselt40 menopausepremeno tumorsize10-14
## 0.93740 1.06840 0.77680 0.56740
## tumorsize15-19 tumorsize20-24 tumorsize25-29 tumorsize30-34
## 0.39260 0.30115 0.30475 0.31725
## tumorsize35-39 tumorsize40-44 tumorsize45-49 tumorsize5-9
## 0.39865 0.41655 0.83410 0.96570
## tumorsize50-54 invnodes12-14 invnodes15-17 invnodes3-5
## 0.48235 1.16310 1.10170 0.87420
## invnodes6-8 invnodes9-11 nodecapsyes degmalig
## 0.87205 1.04725 0.79400 1.11965
## breastright breastquadleft_low breastquadleft_up breastquadright_low
## 1.12955 0.49630 0.49110 0.66070
## breastquadright_up irradiatyes
## 0.55110 1.09580
rhat(breast_model)
## (Intercept) age40-49 age50-59 age60-69
## 1.0000469 0.9999426 1.0000072 1.0000395
## age70-79 menopauselt40 menopausepremeno tumorsize10-14
## 0.9999379 0.9999480 1.0002746 1.0003117
## tumorsize15-19 tumorsize20-24 tumorsize25-29 tumorsize30-34
## 1.0008661 1.0011012 1.0010895 1.0008840
## tumorsize35-39 tumorsize40-44 tumorsize45-49 tumorsize5-9
## 1.0006955 1.0008287 0.9999609 1.0000347
## tumorsize50-54 invnodes12-14 invnodes15-17 invnodes3-5
## 1.0005021 0.9999772 0.9999421 1.0002229
## invnodes6-8 invnodes9-11 nodecapsyes degmalig
## 1.0000993 0.9999372 1.0001835 0.9999927
## breastright breastquadleft_low breastquadleft_up breastquadright_low
## 0.9999531 1.0005100 1.0005409 1.0002414
## breastquadright_up irradiatyes
## 1.0005083 1.0000498
posterior_interval(breast_model, prob = 0.80)
## 10% 90%
## (Intercept) -5.38444454 -2.37157858
## age40-49 -1.09481641 0.18199070
## age50-59 -1.30295484 0.14325187
## age60-69 -0.72593090 1.10554441
## age70-79 -4.21864960 0.38741244
## menopauselt40 -4.33277717 0.02497944
## menopausepremeno -0.12491821 1.13371262
## tumorsize10-14 -3.16408902 -0.19069111
## tumorsize15-19 -0.96564582 1.29938069
## tumorsize20-24 -0.61718921 1.45431471
## tumorsize25-29 -0.36286918 1.64788647
## tumorsize30-34 -0.19740845 1.81605558
## tumorsize35-39 -0.95896168 1.44248986
## tumorsize40-44 -1.12368365 1.19288982
## tumorsize45-49 -1.91022581 1.69556225
## tumorsize5-9 -4.08444322 0.55922338
## tumorsize50-54 -0.52810278 2.13128368
## invnodes12-14 -0.78909432 2.59464908
## invnodes15-17 -0.67018539 1.79890969
## invnodes3-5 0.03486566 1.36387297
## invnodes6-8 0.02638643 1.76838214
## invnodes9-11 0.25834655 2.77072098
## nodecapsyes -0.33708799 0.90032633
## degmalig 0.58317327 1.25755322
## breastright -0.75737351 0.14925873
## breastquadleft_low -0.48173600 1.19616289
## breastquadleft_up -0.69824898 0.98971879
## breastquadright_low -1.17256523 0.98122318
## breastquadright_up -0.14451700 1.76397080
## irradiatyes -0.05295653 0.93283460
exp(posterior_interval(breast_model, prob = 0.80))
## 10% 90%
## (Intercept) 0.004587388 0.09333328
## age40-49 0.334601031 1.19960303
## age50-59 0.271727693 1.15402043
## age60-69 0.483873923 3.02086861
## age70-79 0.014718507 1.47316396
## menopauselt40 0.013131030 1.02529404
## menopausepremeno 0.882569088 3.10717087
## tumorsize10-14 0.042252615 0.82638781
## tumorsize15-19 0.380737231 3.66702493
## tumorsize20-24 0.539458614 4.28154837
## tumorsize25-29 0.695677435 5.19598633
## tumorsize30-34 0.820855283 6.14756202
## tumorsize35-39 0.383290659 4.23121784
## tumorsize40-44 0.325080106 3.29659401
## tumorsize45-49 0.148046953 5.44970919
## tumorsize5-9 0.016832509 1.74931342
## tumorsize50-54 0.589722741 8.42567573
## invnodes12-14 0.454256020 13.39188705
## invnodes15-17 0.511613723 6.04305507
## invnodes3-5 1.035480593 3.91131239
## invnodes6-8 1.026737636 5.86136282
## invnodes9-11 1.294787448 15.97014406
## nodecapsyes 0.713846022 2.46040589
## degmalig 1.791715017 3.51680612
## breastright 0.468896362 1.16097333
## breastquadleft_low 0.617710115 3.30740169
## breastquadleft_up 0.497455596 2.69047777
## breastquadright_low 0.309571799 2.66771735
## breastquadright_up 0.865440202 5.83556332
## irradiatyes 0.948421244 2.54170369
predicted_probs <- predict(breast_model, newdata = data_complete, type = "response")
roc_obj <- roc(data_complete$Class, predicted_probs,
levels = c("no-recurrence-events", "recurrence-events"),
direction = "<")
plot(roc_obj, main = "ROC Curve for Bayesian Logistic Regression")

best_cutoff <- coords(roc_obj, "best", ret = "threshold")
print(best_cutoff)
## threshold
## 1 0.3471479
# Perform 10-fold cross-validation classification summary
cv_accuracy <- classification_summary_cv(
model = breast_model,
data = data_complete,
cutoff = 0.33,
k = 10
)
cv_accuracy$cv
## sensitivity specificity overall_accuracy
## 1 0.613254 0.7209879 0.6940476
# Posterior predictive check
pp_check(breast_model) +
ggtitle("Posterior Predictive Check") +
theme(plot.title = element_text(color = "black", face = "bold", size = 14, hjust = 0.5))

# Extract posterior samples with chain info
posterior_array <- as.array(breast_model)
# Density overlay for all parameters
mcmc_dens_overlay(posterior_array) +
ggtitle("MCMC Density Overlay for All Parameters") +
theme(plot.title = element_text(color = "black", face = "bold", size = 14, hjust = 0.5))
