Shiny--Code/samplesize/survival/app.R

# ==============================================================================
# ỨNG DỤNG SHINY TÍNH CỠ MẪU CHO PHÂN TÍCH SỐNG CÒN (LOG-RANK TEST)
# - Xây dựng dựa trên cấu trúc ứng dụng Hồi quy Logistic.
# - Bao gồm tính toán, mô phỏng, và diễn giải chi tiết.
# Author: Gemini & User Collaboration
# Date: 2025-10-17
# ==============================================================================

# ------------------------------------------------------------------------------
# THIẾT LẬP: Tải các thư viện cần thiết
# ------------------------------------------------------------------------------
library(shiny)
library(bslib)
library(ggplot2)
library(shinycssloaders)
library(dplyr)
library(purrr)
library(scales)
library(survival)

# ==============================================================================
# PHẦN 1: CÁC HÀM HỖ TRỢ (HELPER FUNCTIONS)
# ==============================================================================

#' Tính số biến cố (events) cần thiết cho kiểm định Log-rank (Schoenfeld, 1983).
#'
#' @param hr Tỷ số rủi ro (Hazard Ratio) kỳ vọng.
#' @param power Công suất mong muốn (1 - beta).
#' @param sig.level Mức ý nghĩa alpha.
#' @param p_alloc Tỷ lệ phân bổ vào nhóm 1 (can thiệp).
#' @return Số nguyên là tổng số biến cố cần thiết.
calc_events_logrank <- function(hr, power = 0.8, sig.level = 0.05, p_alloc = 0.5) {
  if (hr <= 0) return(NA_real_)
  z_a <- qnorm(1 - sig.level / 2)
  z_b <- qnorm(power)

  num <- (z_a + z_b)^2
  den <- (log(hr)^2) * p_alloc * (1 - p_alloc)

  if (den <= 0) return(NA_real_)

  ceiling(num / den)
}


#' Ước tính công suất thực nghiệm cho kiểm định Log-rank bằng mô phỏng Monte Carlo.
#'
#' @param n Tổng cỡ mẫu.
#' @param hr Tỷ số rủi ro.
#' @param surv_prob_control Tỷ lệ sống sót ở nhóm chứng sau một thời gian theo dõi.
#' @param follow_up_time Thời gian theo dõi.
#' @param alpha Mức ý nghĩa.
#' @param p_alloc Tỷ lệ phân bổ vào nhóm 1.
#' @param n_sim Số lần mô phỏng.
#' @return Giá trị công suất thực nghiệm (từ 0 đến 1).
simulate_logrank_power <- function(n, hr, surv_prob_control, follow_up_time = 1,
                                   alpha = 0.05, p_alloc = 0.5, n_sim = 500) {

  # Từ tỷ lệ sống, tính rate lambda cho phân phối mũ
  rate_control <- -log(surv_prob_control) / follow_up_time
  rate_treatment <- rate_control * hr

  n1 <- round(n * p_alloc)
  n2 <- n - n1

  sim_results <- replicate(n_sim, {
    # Tạo dữ liệu thời gian sống cho 2 nhóm
    times1 <- rexp(n1, rate = rate_treatment)
    times2 <- rexp(n2, rate = rate_control)

    # Tạo dữ liệu kiểm duyệt (censoring) tại thời điểm follow_up_time
    event1 <- ifelse(times1 < follow_up_time, 1, 0)
    event2 <- ifelse(times2 < follow_up_time, 1, 0)

    time_obs1 <- pmin(times1, follow_up_time)
    time_obs2 <- pmin(times2, follow_up_time)

    # Gộp dữ liệu
    all_times <- c(time_obs1, time_obs2)
    all_events <- c(event1, event2)
    group <- factor(c(rep("Treatment", n1), rep("Control", n2)))

    # Thực hiện kiểm định log-rank
    sdf <- try(survdiff(Surv(all_times, all_events) ~ group), silent = TRUE)

    if (inherits(sdf, "try-error")) return(NA)

    # Lấy p-value
    p_val <- 1 - pchisq(sdf$chisq, df = 1)
    return(p_val < alpha)
  })

  mean(sim_results, na.rm = TRUE)
}


# ==============================================================================
# PHẦN 2: GIAO DIỆN NGƯỜI DÙNG (USER INTERFACE - UI)
# ==============================================================================

ui <- fluidPage(
  theme = bs_theme(version = 5, bootswatch = "cerulean"),
  withMathJax(),

  titlePanel("Công Cụ Tính Cỡ Mẫu Cho Phân Tích Sống Còn (Log-Rank)"),

  sidebarLayout(
    sidebarPanel(
      width = 4,
      h4("Nhập Tham Số"),
      br(),
      h5("Tham số hiệu ứng & Quần thể"),
      sliderInput("s_a", "Tỷ lệ sống còn kỳ vọng ở Nhóm 1 (Can thiệp):", value = 0.45, min = 0.01, max = 0.99),
      sliderInput("s_b", "Tỷ lệ sống còn kỳ vọng ở Nhóm 2 (Chứng):", value = 0.30, min = 0.01, max = 0.99),
      numericInput("follow_up", "Thời gian theo dõi (đơn vị tùy chọn, ví dụ: năm):", value=3, min=0.1),

      hr(),
      h5("Tham số kiểm định & Thiết kế"),
      sliderInput("power_surv", "Công suất mong muốn (1 - \\(\\beta\\)):", value = 0.8, min = 0.5, max = 0.99),
      sliderInput("alpha_surv", "Mức ý nghĩa (\\(\\alpha\\)):", value = 0.05, min = 0.01, max = 0.1),
      sliderInput("p_alloc_surv", "Tỷ lệ phân bổ vào Nhóm 1:", value = 0.5, min = 0.1, max = 0.9),
      helpText("Giá trị 0.5 tương ứng tỷ lệ 1:1."),

      hr(),
      actionButton("go_surv", "Tính toán & Phân tích", class = "btn-primary w-100", icon = icon("calculator"))
    ),

    mainPanel(
      width = 8,
      tabsetPanel(
        id = "surv_results_tabs",
        type = "pills",
        tabPanel("Kết quả & Diễn giải",
                 withSpinner(uiOutput("surv_result_text"), type = 6, color = "#007bff")),
        tabPanel("Đồ thị Power & Mô phỏng",
                 withSpinner(plotOutput("surv_power_plot"), type = 6, color = "#007bff"),
                 hr(),
                 h4("Phân tích Mô phỏng Chi tiết"),
                 p("Chạy mô phỏng sâu hơn với một cỡ mẫu cụ thể để xem phân phối của p-value và ước tính power chính xác hơn."),
                 fluidRow(
                   column(6, numericInput("n_for_sim_detail_surv", "Nhập cỡ mẫu (N) để mô phỏng:", value = 300, min = 20)),
                   column(6, actionButton("run_sim_detail_surv", "Chạy mô phỏng chi tiết", icon = icon("play-circle"), class="btn-success", style="margin-top: 25px;"))
                 ),
                 withSpinner(plotOutput("surv_sim_detail_plot"), type = 6, color = "#007bff")),
        tabPanel("Phân tích Effect Size",
                 withSpinner(uiOutput("surv_effect_size_ui"), type = 6, color = "#007bff")),

        tabPanel("Giả thuyết, Công thức & Ví dụ",
                 h4("Giả thuyết thống kê"),
                 HTML("Kiểm định Log-rank so sánh toàn bộ đường cong sống còn giữa các nhóm. Giả thuyết không có tham số cụ thể như hồi quy, mà là:"),
                 p(strong("$$H_0: S_1(t) = S_2(t) \\text{ for all } t$$")),
                 em("Diễn giải: Không có sự khác biệt về hàm sống còn giữa nhóm 1 và nhóm 2 tại mọi thời điểm."),
                 br(),
                 p(strong("$$H_a: S_1(t) \\neq S_2(t) \\text{ for some } t$$")),
                 em("Diễn giải: Có sự khác biệt về hàm sống còn giữa hai nhóm."),
                 hr(),

                 h4("Công thức tính cỡ mẫu (Schoenfeld, 1983)"),
                 p("Quá trình tính toán gồm 2 bước:"),
                 p(strong("Bước 1: Tính tổng số biến cố (events) cần thiết (d)")),
                 withMathJax(HTML("$$d = \\frac{(Z_{1-\\alpha/2} + Z_{\\text{power}})^2}{(\\ln(HR))^2 \\cdot p_1 \\cdot p_2}$$")),
                 p(strong("Bước 2: Tính tổng cỡ mẫu (N) từ số biến cố")),
                 withMathJax(HTML("$$N = \\frac{d}{P(\\text{event})}$$")),
                 p(strong("Trong đó:")),
                 tags$ul(
                   withMathJax(tags$li("\\(HR\\) là Tỷ số rủi ro (Hazard Ratio) kỳ vọng. Có thể ước tính \\(HR \\approx \\frac{\\ln(S_1)}{\\ln(S_2)}\\).")),
                   withMathJax(tags$li("\\(p_1, p_2\\) là tỷ lệ phân bổ vào mỗi nhóm (ví dụ: 0.5 và 0.5).")),
                   withMathJax(tags$li("\\(P(\\text{event})\\) là xác suất trung bình một người tham gia sẽ có biến cố trong thời gian nghiên cứu. \\(P(\\text{event}) = 1 - (p_1 S_1 + p_2 S_2)\\)."))
                 ),
                 hr(),

                 h4("Ví dụ trong Y tế công cộng"),
                 p(strong("Bối cảnh nghiên cứu:")),
                 p("Một nhà nghiên cứu muốn thử nghiệm một loại thuốc mới (Nhóm 1) để kéo dài thời gian sống không tái phát của bệnh nhân ung thư so với phác đồ chuẩn (Nhóm 2). Thời gian theo dõi là 3 năm."),
                 tags$ul(
                   tags$li(strong("Tỷ lệ sống còn Nhóm 2 (chuẩn):"), withMathJax(HTML(" Dựa trên dữ liệu trước, tỷ lệ sống không tái phát sau 3 năm của phác đồ chuẩn là 30% (\\(S_2=0.3\\))."))),
                   tags$li(strong("Tỷ lệ sống còn Nhóm 1 (mới):"), withMathJax(HTML(" Nhà nghiên cứu kỳ vọng thuốc mới sẽ cải thiện tỷ lệ này lên 45% (\\(S_1=0.45\\))."))),
                   tags$li(strong("Effect size (HR):"), withMathJax(HTML(" HR ước tính là \\(\\frac{\\ln(0.45)}{\\ln(0.30)} \\approx 0.66\\)."))),
                   tags$li(strong("Xác suất biến cố (tái phát):"), withMathJax(HTML(" \\(P(\\text{event}) = 1 - (0.5 \\times 0.45 + 0.5 \\times 0.30) = 0.625\\)."))),
                   tags$li(strong("Power và \\(\\alpha\\):"), " Nghiên cứu cần 80% công suất và mức ý nghĩa 5%."),
                   tags$li(strong("Thiết kế:"), " Phân bổ đều bệnh nhân vào 2 nhóm (1:1).")
                 ),
                 p("Các giá trị này đã được đặt làm mặc định trong ứng dụng để bạn dễ hình dung.")
        )
      )
    )
  )
)


# ==============================================================================
# PHẦN 3: LOGIC MÁY CHỦ (SERVER)
# ==============================================================================
server <- function(input, output, session) {

  rv_surv <- reactiveValues()

  observeEvent(input$go_surv, {
    validate(
      need(input$s_a != input$s_b, "Tỷ lệ sống còn ở hai nhóm phải khác nhau.")
    )

    # Tính toán
    hr_est <- log(input$s_a) / log(input$s_b)
    d_req <- calc_events_logrank(hr_est, input$power_surv, input$alpha_surv, input$p_alloc_surv)

    prob_event_1 <- 1 - input$s_a
    prob_event_2 <- 1 - input$s_b
    avg_prob_event <- input$p_alloc_surv * prob_event_1 + (1 - input$p_alloc_surv) * prob_event_2

    n_total <- ceiling(d_req / avg_prob_event)
    n1 <- ceiling(n_total * input$p_alloc_surv)
    n2 <- n_total - n1

    rv_surv$hr <- hr_est
    rv_surv$d <- d_req
    rv_surv$N <- n_total
    rv_surv$n1 <- n1
    rv_surv$n2 <- n2

    # Dữ liệu cho biểu đồ
    if (!is.na(n_total)) {
        n_range_surv <- unique(round(seq(max(30, n_total * 0.5), n_total * 1.5, length.out = 15)))

        power_values_surv <- map_dbl(n_range_surv, ~simulate_logrank_power(
          n = .x,
          hr = hr_est,
          surv_prob_control = input$s_b,
          follow_up_time = input$follow_up,
          alpha = input$alpha_surv,
          p_alloc = input$p_alloc_surv,
          n_sim = 500
        ))
        rv_surv$plot_data <- tibble(SampleSize = n_range_surv, Power = power_values_surv)
    } else {
        rv_surv$plot_data <- NULL
    }
  })

  output$surv_result_text <- renderUI({
    if (input$go_surv == 0) {
      return(tags$div(class="alert alert-info", "Nhập các tham số và nhấn 'Tính toán & Phân tích' để xem kết quả."))
    }

    req(rv_surv$N)

    tagList(
      h4("Kết quả tính toán (Xấp xỉ Schoenfeld)"),
      p("Với Hazard Ratio (HR) ước tính là", tags$b(round(rv_surv$hr, 3)), "và các tham số đã cho, kết quả như sau:"),
      tags$div(class="alert alert-light", style="text-align: center;",
        p(style="font-size: 1.2em;", "Tổng số biến cố (events) cần quan sát: ", tags$strong(style="color: #dc3545;", rv_surv$d))
      ),
      h3(style = "color: #007bff; text-align: center; margin-top: 20px;", "Tổng Cỡ Mẫu (N) ≈ ", rv_surv$N),
      p(style = "text-align: center; font-size: 1.2em;",
        "Cỡ mẫu Nhóm 1 (Can thiệp): ", tags$b(rv_surv$n1), br(),
        "Cỡ mẫu Nhóm 2 (Chứng): ", tags$b(rv_surv$n2)
      ),
      hr(),
      tags$div(class = "alert alert-light",
         tags$b("Ghi chú:"), " Đây là kết quả dựa trên công thức. Biểu đồ trong tab 'Đồ thị Power & Mô phỏng' được tạo ra bằng mô phỏng và có thể cho kết quả chính xác hơn, đặc biệt với cỡ mẫu nhỏ.")
    )
  })

  output$surv_power_plot <- renderPlot({
    req(rv_surv$plot_data, rv_surv$N)
    ggplot(rv_surv$plot_data, aes(x = SampleSize, y = Power)) +
      geom_line(color = "#007bff", size = 1.2) +
      geom_point(color = "#007bff", size = 3) +
      geom_hline(yintercept = input$power_surv, linetype = "dashed", color = "red") +
      geom_vline(xintercept = rv_surv$N, linetype = "dotted", color = "darkorange", size=1.2) +
      labs(title = "Công suất thực nghiệm (Mô phỏng) vs. Cỡ mẫu", x = "Tổng Cỡ mẫu (N)", y = "Công suất (1 - \\(\\beta\\))") +
      annotate("text", x = rv_surv$N * 1.05, y = 0.1, label = paste("Formula Approx.\nN =", rv_surv$N), color = "darkorange", hjust = 0) +
      scale_y_continuous(labels = percent, limits = c(0, 1)) +
      theme_minimal(base_size = 14)
  })

  output$surv_effect_size_ui <- renderUI({
    req(rv_surv$hr)
    hr <- rv_surv$hr

    interpretation <- if (hr < 1) {
      paste0("rủi ro (hazard) xảy ra biến cố tại bất kỳ thời điểm nào ", tags$b("giảm đi ", round((1 - hr) * 100, 1), "%"), " so với nhóm chứng.")
    } else {
      paste0("rủi ro (hazard) xảy ra biến cố tại bất kỳ thời điểm nào ", tags$b("tăng lên ", round((hr - 1) * 100, 1), "%"), " so với nhóm chứng.")
    }

    tagList(
      h4("Phân tích Effect Size: Tỷ số rủi ro (Hazard Ratio - HR)"),
      p("Trong phân tích sống còn, effect size chính là Tỷ số rủi ro (HR). Nó đo lường hiệu quả tức thời của một can thiệp lên xác suất xảy ra biến cố."),
      p("Nó được ước tính từ tỷ lệ sống còn của hai nhóm (giả định rằng tỷ lệ rủi ro là hằng số theo thời gian): \\(HR \\approx \\ln(S_1) / \\ln(S_2)\\)."),
      hr(),
      p("Với các tỷ lệ sống còn bạn đã chọn, Tỷ số rủi ro ước tính là:"),
      tags$h3(style = "color: #007bff; text-align: center;", round(hr, 3)),
      tags$div(class = "alert alert-light",
         tags$b("Diễn giải:"),
         p("Một HR bằng ", tags$b(round(hr, 3)), " có nghĩa là nhóm can thiệp có ", interpretation)
      )
    )
  })

  observeEvent(input$run_sim_detail_surv, {
      pvals <- replicate(1000, {
        n <- input$n_for_sim_detail_surv
        hr <- log(input$s_a) / log(input$s_b)

        rate_control <- -log(input$s_b) / input$follow_up
        rate_treatment <- rate_control * hr

        n1 <- round(n * input$p_alloc_surv)
        n2 <- n - n1

        times1 <- rexp(n1, rate = rate_treatment)
        times2 <- rexp(n2, rate = rate_control)

        event1 <- ifelse(times1 < input$follow_up, 1, 0)
        event2 <- ifelse(times2 < input$follow_up, 1, 0)

        time_obs1 <- pmin(times1, input$follow_up)
        time_obs2 <- pmin(times2, input$follow_up)

        all_times <- c(time_obs1, time_obs2)
        all_events <- c(event1, event2)
        group <- factor(c(rep("Treatment", n1), rep("Control", n2)))

        sdf <- try(survdiff(Surv(all_times, all_events) ~ group), silent = TRUE)
        if (inherits(sdf, "try-error")) return(NA_real_)

        1 - pchisq(sdf$chisq, df = 1)
      })
      rv_surv$sim_detail_data <- tibble(p_value = pvals)
  })

  output$surv_sim_detail_plot <- renderPlot({
      req(rv_surv$sim_detail_data)
      power_est <- mean(rv_surv$sim_detail_data$p_value < input$alpha_surv, na.rm=TRUE)

      ggplot(rv_surv$sim_detail_data, aes(x = p_value)) +
        geom_histogram(bins = 30, fill = "#28a745", color = "black", boundary=0) +
        geom_vline(xintercept = input$alpha_surv, linetype = "dashed", color = "red", size = 1) +
        labs(
          title = paste0("Phân phối P-value từ mô phỏng (N=", input$n_for_sim_detail_surv, ")"),
          subtitle = paste0("Công suất thực nghiệm ước tính: ", percent(power_est, accuracy = 0.1)),
          x = "P-value (từ Kiểm định Log-rank)", y = "Tần suất"
        ) +
        theme_minimal(base_size = 14)
  })

}


# ==============================================================================
# PHẦN 4: CHẠY ỨNG DỤNG
# ==============================================================================
shinyApp(ui, server)