library(bayesrules)
library(rstanarm)
library(bayesplot)
library(tidyverse)
library(tidybayes)
library(broom.mixed)
library(janitor)
library(pROC)
data <- read.csv("/Users/ruilinwu/Desktop/Stats/stats115/final_project/breast_cancer/breast-cancer.data", 
                 header = FALSE, 
                 na.strings = "?")
colnames(data) <- c("Class", "age", "menopause", "tumorsize", "invnodes", 
                    "nodecaps", "degmalig", "breast", "breastquad", "irradiat")
head(data)
##                  Class   age menopause tumorsize invnodes nodecaps degmalig
## 1 no-recurrence-events 30-39   premeno     30-34      0-2       no        3
## 2 no-recurrence-events 40-49   premeno     20-24      0-2       no        2
## 3 no-recurrence-events 40-49   premeno     20-24      0-2       no        2
## 4 no-recurrence-events 60-69      ge40     15-19      0-2       no        2
## 5 no-recurrence-events 40-49   premeno       0-4      0-2       no        2
## 6 no-recurrence-events 60-69      ge40     15-19      0-2       no        2
##   breast breastquad irradiat
## 1   left   left_low       no
## 2  right   right_up       no
## 3   left   left_low       no
## 4  right    left_up       no
## 5  right  right_low       no
## 6   left   left_low       no
summary(data)
##     Class               age             menopause          tumorsize        
##  Length:286         Length:286         Length:286         Length:286        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    invnodes           nodecaps            degmalig        breast         
##  Length:286         Length:286         Min.   :1.000   Length:286        
##  Class :character   Class :character   1st Qu.:2.000   Class :character  
##  Mode  :character   Mode  :character   Median :2.000   Mode  :character  
##                                        Mean   :2.049                     
##                                        3rd Qu.:3.000                     
##                                        Max.   :3.000                     
##   breastquad          irradiat        
##  Length:286         Length:286        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
#View(data)
na_per_column <- colSums(is.na(data))
na_per_column
##      Class        age  menopause  tumorsize   invnodes   nodecaps   degmalig 
##          0          0          0          0          0          8          0 
##     breast breastquad   irradiat 
##          0          1          0
data_complete <- na.omit(data)
# Check the structure of the dataset
str(data_complete)
## 'data.frame':    277 obs. of  10 variables:
##  $ Class     : chr  "no-recurrence-events" "no-recurrence-events" "no-recurrence-events" "no-recurrence-events" ...
##  $ age       : chr  "30-39" "40-49" "40-49" "60-69" ...
##  $ menopause : chr  "premeno" "premeno" "premeno" "ge40" ...
##  $ tumorsize : chr  "30-34" "20-24" "20-24" "15-19" ...
##  $ invnodes  : chr  "0-2" "0-2" "0-2" "0-2" ...
##  $ nodecaps  : chr  "no" "no" "no" "no" ...
##  $ degmalig  : int  3 2 2 2 2 2 2 1 2 2 ...
##  $ breast    : chr  "left" "right" "left" "right" ...
##  $ breastquad: chr  "left_low" "right_up" "left_low" "left_up" ...
##  $ irradiat  : chr  "no" "no" "no" "no" ...
##  - attr(*, "na.action")= 'omit' Named int [1:9] 146 164 165 184 185 207 234 264 265
##   ..- attr(*, "names")= chr [1:9] "146" "164" "165" "184" ...
# Get a summary of each variable
summary(data_complete)
##     Class               age             menopause          tumorsize        
##  Length:277         Length:277         Length:277         Length:277        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    invnodes           nodecaps            degmalig        breast         
##  Length:277         Length:277         Min.   :1.000   Length:277        
##  Class :character   Class :character   1st Qu.:2.000   Class :character  
##  Mode  :character   Mode  :character   Median :2.000   Mode  :character  
##                                        Mean   :2.058                     
##                                        3rd Qu.:3.000                     
##                                        Max.   :3.000                     
##   breastquad          irradiat        
##  Length:277         Length:277        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Frequency tables for each variable
lapply(data_complete, table)
## $Class
## 
## no-recurrence-events    recurrence-events 
##                  196                   81 
## 
## $age
## 
## 20-29 30-39 40-49 50-59 60-69 70-79 
##     1    36    89    91    55     5 
## 
## $menopause
## 
##    ge40    lt40 premeno 
##     123       5     149 
## 
## $tumorsize
## 
##   0-4 10-14 15-19 20-24 25-29 30-34 35-39 40-44 45-49   5-9 50-54 
##     8    28    29    48    51    57    19    22     3     4     8 
## 
## $invnodes
## 
##   0-2 12-14 15-17 24-26   3-5   6-8  9-11 
##   209     3     6     1    34    17     7 
## 
## $nodecaps
## 
##  no yes 
## 221  56 
## 
## $degmalig
## 
##   1   2   3 
##  66 129  82 
## 
## $breast
## 
##  left right 
##   145   132 
## 
## $breastquad
## 
##   central  left_low   left_up right_low  right_up 
##        21       106        94        23        33 
## 
## $irradiat
## 
##  no yes 
## 215  62
#only 1 invnodes 24-26
data_complete <- data_complete %>%
  filter(invnodes != "24-26")

#only 1 age 20-29
data_complete <- data_complete %>%
   filter(age != "20-29")
library(ggplot2)

ggplot(data_complete, aes(x = Class)) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(title = "Distribution of Recurrence Events",
       x = "Class",
       y = "Count")

# Cross-tabulation between Class and age
age_class_table <- table(data_complete$Class, data_complete$age)
print(age_class_table)
##                       
##                        30-39 40-49 50-59 60-69 70-79
##   no-recurrence-events    21    62    69    38     5
##   recurrence-events       15    27    22    16     0
data_complete$Class     <- factor(data_complete$Class)
data_complete$age       <- factor(data_complete$age)
data_complete$menopause <- factor(data_complete$menopause)
data_complete$tumorsize <- factor(data_complete$tumorsize)
data_complete$invnodes  <- factor(data_complete$invnodes)
data_complete$nodecaps  <- factor(data_complete$nodecaps)
data_complete$breast    <- factor(data_complete$breast)
data_complete$breastquad<- factor(data_complete$breastquad)
data_complete$irradiat  <- factor(data_complete$irradiat)
# Fit the Bayesian logistic regression model
breast_model <- stan_glm(Class ~ age + menopause + tumorsize + invnodes +
                           nodecaps + degmalig + breast + breastquad + irradiat,
                         data = data_complete,
                         family = binomial(link = "logit"),
                         prior = normal(location = 0, scale = 2.5),
                         prior_intercept = normal(location = 0, scale = 5),
                         chains = 4,
                         iter = 10000,
                         seed = 12345)
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000367 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 3.67 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 10000 [  0%]  (Warmup)
## Chain 1: Iteration: 1000 / 10000 [ 10%]  (Warmup)
## Chain 1: Iteration: 2000 / 10000 [ 20%]  (Warmup)
## Chain 1: Iteration: 3000 / 10000 [ 30%]  (Warmup)
## Chain 1: Iteration: 4000 / 10000 [ 40%]  (Warmup)
## Chain 1: Iteration: 5000 / 10000 [ 50%]  (Warmup)
## Chain 1: Iteration: 5001 / 10000 [ 50%]  (Sampling)
## Chain 1: Iteration: 6000 / 10000 [ 60%]  (Sampling)
## Chain 1: Iteration: 7000 / 10000 [ 70%]  (Sampling)
## Chain 1: Iteration: 8000 / 10000 [ 80%]  (Sampling)
## Chain 1: Iteration: 9000 / 10000 [ 90%]  (Sampling)
## Chain 1: Iteration: 10000 / 10000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.612 seconds (Warm-up)
## Chain 1:                0.574 seconds (Sampling)
## Chain 1:                1.186 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 7e-06 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.07 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 10000 [  0%]  (Warmup)
## Chain 2: Iteration: 1000 / 10000 [ 10%]  (Warmup)
## Chain 2: Iteration: 2000 / 10000 [ 20%]  (Warmup)
## Chain 2: Iteration: 3000 / 10000 [ 30%]  (Warmup)
## Chain 2: Iteration: 4000 / 10000 [ 40%]  (Warmup)
## Chain 2: Iteration: 5000 / 10000 [ 50%]  (Warmup)
## Chain 2: Iteration: 5001 / 10000 [ 50%]  (Sampling)
## Chain 2: Iteration: 6000 / 10000 [ 60%]  (Sampling)
## Chain 2: Iteration: 7000 / 10000 [ 70%]  (Sampling)
## Chain 2: Iteration: 8000 / 10000 [ 80%]  (Sampling)
## Chain 2: Iteration: 9000 / 10000 [ 90%]  (Sampling)
## Chain 2: Iteration: 10000 / 10000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.612 seconds (Warm-up)
## Chain 2:                0.627 seconds (Sampling)
## Chain 2:                1.239 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 7e-06 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.07 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 10000 [  0%]  (Warmup)
## Chain 3: Iteration: 1000 / 10000 [ 10%]  (Warmup)
## Chain 3: Iteration: 2000 / 10000 [ 20%]  (Warmup)
## Chain 3: Iteration: 3000 / 10000 [ 30%]  (Warmup)
## Chain 3: Iteration: 4000 / 10000 [ 40%]  (Warmup)
## Chain 3: Iteration: 5000 / 10000 [ 50%]  (Warmup)
## Chain 3: Iteration: 5001 / 10000 [ 50%]  (Sampling)
## Chain 3: Iteration: 6000 / 10000 [ 60%]  (Sampling)
## Chain 3: Iteration: 7000 / 10000 [ 70%]  (Sampling)
## Chain 3: Iteration: 8000 / 10000 [ 80%]  (Sampling)
## Chain 3: Iteration: 9000 / 10000 [ 90%]  (Sampling)
## Chain 3: Iteration: 10000 / 10000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.614 seconds (Warm-up)
## Chain 3:                0.638 seconds (Sampling)
## Chain 3:                1.252 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 9e-06 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 10000 [  0%]  (Warmup)
## Chain 4: Iteration: 1000 / 10000 [ 10%]  (Warmup)
## Chain 4: Iteration: 2000 / 10000 [ 20%]  (Warmup)
## Chain 4: Iteration: 3000 / 10000 [ 30%]  (Warmup)
## Chain 4: Iteration: 4000 / 10000 [ 40%]  (Warmup)
## Chain 4: Iteration: 5000 / 10000 [ 50%]  (Warmup)
## Chain 4: Iteration: 5001 / 10000 [ 50%]  (Sampling)
## Chain 4: Iteration: 6000 / 10000 [ 60%]  (Sampling)
## Chain 4: Iteration: 7000 / 10000 [ 70%]  (Sampling)
## Chain 4: Iteration: 8000 / 10000 [ 80%]  (Sampling)
## Chain 4: Iteration: 9000 / 10000 [ 90%]  (Sampling)
## Chain 4: Iteration: 10000 / 10000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 0.616 seconds (Warm-up)
## Chain 4:                0.673 seconds (Sampling)
## Chain 4:                1.289 seconds (Total)
## Chain 4:
neff_ratio(breast_model)
##         (Intercept)            age40-49            age50-59            age60-69 
##             0.51680             0.74220             0.62620             0.59875 
##            age70-79       menopauselt40    menopausepremeno      tumorsize10-14 
##             0.93740             1.06840             0.77680             0.56740 
##      tumorsize15-19      tumorsize20-24      tumorsize25-29      tumorsize30-34 
##             0.39260             0.30115             0.30475             0.31725 
##      tumorsize35-39      tumorsize40-44      tumorsize45-49        tumorsize5-9 
##             0.39865             0.41655             0.83410             0.96570 
##      tumorsize50-54       invnodes12-14       invnodes15-17         invnodes3-5 
##             0.48235             1.16310             1.10170             0.87420 
##         invnodes6-8        invnodes9-11         nodecapsyes            degmalig 
##             0.87205             1.04725             0.79400             1.11965 
##         breastright  breastquadleft_low   breastquadleft_up breastquadright_low 
##             1.12955             0.49630             0.49110             0.66070 
##  breastquadright_up         irradiatyes 
##             0.55110             1.09580
rhat(breast_model)
##         (Intercept)            age40-49            age50-59            age60-69 
##           1.0000469           0.9999426           1.0000072           1.0000395 
##            age70-79       menopauselt40    menopausepremeno      tumorsize10-14 
##           0.9999379           0.9999480           1.0002746           1.0003117 
##      tumorsize15-19      tumorsize20-24      tumorsize25-29      tumorsize30-34 
##           1.0008661           1.0011012           1.0010895           1.0008840 
##      tumorsize35-39      tumorsize40-44      tumorsize45-49        tumorsize5-9 
##           1.0006955           1.0008287           0.9999609           1.0000347 
##      tumorsize50-54       invnodes12-14       invnodes15-17         invnodes3-5 
##           1.0005021           0.9999772           0.9999421           1.0002229 
##         invnodes6-8        invnodes9-11         nodecapsyes            degmalig 
##           1.0000993           0.9999372           1.0001835           0.9999927 
##         breastright  breastquadleft_low   breastquadleft_up breastquadright_low 
##           0.9999531           1.0005100           1.0005409           1.0002414 
##  breastquadright_up         irradiatyes 
##           1.0005083           1.0000498
posterior_interval(breast_model, prob = 0.80)
##                             10%         90%
## (Intercept)         -5.38444454 -2.37157858
## age40-49            -1.09481641  0.18199070
## age50-59            -1.30295484  0.14325187
## age60-69            -0.72593090  1.10554441
## age70-79            -4.21864960  0.38741244
## menopauselt40       -4.33277717  0.02497944
## menopausepremeno    -0.12491821  1.13371262
## tumorsize10-14      -3.16408902 -0.19069111
## tumorsize15-19      -0.96564582  1.29938069
## tumorsize20-24      -0.61718921  1.45431471
## tumorsize25-29      -0.36286918  1.64788647
## tumorsize30-34      -0.19740845  1.81605558
## tumorsize35-39      -0.95896168  1.44248986
## tumorsize40-44      -1.12368365  1.19288982
## tumorsize45-49      -1.91022581  1.69556225
## tumorsize5-9        -4.08444322  0.55922338
## tumorsize50-54      -0.52810278  2.13128368
## invnodes12-14       -0.78909432  2.59464908
## invnodes15-17       -0.67018539  1.79890969
## invnodes3-5          0.03486566  1.36387297
## invnodes6-8          0.02638643  1.76838214
## invnodes9-11         0.25834655  2.77072098
## nodecapsyes         -0.33708799  0.90032633
## degmalig             0.58317327  1.25755322
## breastright         -0.75737351  0.14925873
## breastquadleft_low  -0.48173600  1.19616289
## breastquadleft_up   -0.69824898  0.98971879
## breastquadright_low -1.17256523  0.98122318
## breastquadright_up  -0.14451700  1.76397080
## irradiatyes         -0.05295653  0.93283460
exp(posterior_interval(breast_model, prob = 0.80))
##                             10%         90%
## (Intercept)         0.004587388  0.09333328
## age40-49            0.334601031  1.19960303
## age50-59            0.271727693  1.15402043
## age60-69            0.483873923  3.02086861
## age70-79            0.014718507  1.47316396
## menopauselt40       0.013131030  1.02529404
## menopausepremeno    0.882569088  3.10717087
## tumorsize10-14      0.042252615  0.82638781
## tumorsize15-19      0.380737231  3.66702493
## tumorsize20-24      0.539458614  4.28154837
## tumorsize25-29      0.695677435  5.19598633
## tumorsize30-34      0.820855283  6.14756202
## tumorsize35-39      0.383290659  4.23121784
## tumorsize40-44      0.325080106  3.29659401
## tumorsize45-49      0.148046953  5.44970919
## tumorsize5-9        0.016832509  1.74931342
## tumorsize50-54      0.589722741  8.42567573
## invnodes12-14       0.454256020 13.39188705
## invnodes15-17       0.511613723  6.04305507
## invnodes3-5         1.035480593  3.91131239
## invnodes6-8         1.026737636  5.86136282
## invnodes9-11        1.294787448 15.97014406
## nodecapsyes         0.713846022  2.46040589
## degmalig            1.791715017  3.51680612
## breastright         0.468896362  1.16097333
## breastquadleft_low  0.617710115  3.30740169
## breastquadleft_up   0.497455596  2.69047777
## breastquadright_low 0.309571799  2.66771735
## breastquadright_up  0.865440202  5.83556332
## irradiatyes         0.948421244  2.54170369
predicted_probs <- predict(breast_model, newdata = data_complete, type = "response")

roc_obj <- roc(data_complete$Class, predicted_probs,
               levels = c("no-recurrence-events", "recurrence-events"),
               direction = "<")

plot(roc_obj, main = "ROC Curve for Bayesian Logistic Regression")

best_cutoff <- coords(roc_obj, "best", ret = "threshold")
print(best_cutoff)
##   threshold
## 1 0.3471479
# Perform 10-fold cross-validation classification summary
cv_accuracy <- classification_summary_cv(
  model = breast_model, 
  data = data_complete, 
  cutoff = 0.33, 
  k = 10
)
cv_accuracy$cv
##   sensitivity specificity overall_accuracy
## 1    0.613254   0.7209879        0.6940476
# Posterior predictive check
pp_check(breast_model) +
  ggtitle("Posterior Predictive Check") +
  theme(plot.title = element_text(color = "black", face = "bold", size = 14, hjust = 0.5))

# Extract posterior samples with chain info
posterior_array <- as.array(breast_model)

# Density overlay for all parameters
mcmc_dens_overlay(posterior_array) +
  ggtitle("MCMC Density Overlay for All Parameters")  +
  theme(plot.title = element_text(color = "black", face = "bold", size = 14, hjust = 0.5))