The first function process_ddPCR_data to read your amplitude (.csv) file exported from QX Manager

# Load dataset for each file
process_ddPCR_data <- function(file_path, channel_num1 = 1, channel_num2 = 2) {
  library(dplyr)
  
  # Read the dataset
  examplesheet <- read.table(sep=",", file_path, skip = 3, header = TRUE)
  
  # Construct column names based on input numbers
  ch1 <- paste0("Ch", channel_num1, "Amplitude")
  ch2 <- paste0("Ch", channel_num2, "Amplitude")
  x1 <- paste0("X", channel_num1)
  x2 <- paste0("X", channel_num2)
  
  # Select only the specified amplitude channels and their corresponding classification columns
  processed_data <- examplesheet %>%
    select(all_of(c(ch1, ch2, x1, x2))) %>%
    mutate(Population = case_when(
      X1 == 0 & X2 == 0 ~ "Double neg",
      X1 == 1 & X2 == 1 ~ "Double pos",
      X1 == 1 ~ "Ch1 pos",
      X2 == 1 ~ "Ch2 pos"))
  
  return(processed_data)
}

# Example usage:
# processed_data <- process_ddPCR_data("6-Color_SingleTargetPerChannel_Example_D01_Amplitude.csv", 1, 2)
The output produced by the first function will be a transformed data table that looks like this:
processed_data <- process_ddPCR_data("C:/6-Color_SingleTargetPerChannel_Example_D01_Amplitude.csv", 1, 2)

head(processed_data, 5)
##   Ch1Amplitude Ch2Amplitude X1 X2 Population
## 1     5185.150     1825.349  1  0    Ch1 pos
## 2     1083.530     1539.010  0  0 Double neg
## 3     5345.329     2048.549  1  0    Ch1 pos
## 4     1123.036     1395.804  0  0 Double neg
## 5     1007.840     6052.240  0  1    Ch2 pos

The second function apply_thresholds to modify the default (at the time of export) thresholds and plot the different droplet clusters

apply_thresholds <- function(processed_data, ch1_threshold = NULL, ch2_threshold = NULL, 
                                    ch1_col = "Ch1Amplitude", ch2_col = "Ch2Amplitude") {
  library(ggplot2)
  # Copy original Population to CustomPopulation
  processed_data <- processed_data %>%
    mutate(CustomPopulation = Population)
  
  # If no manual threshold given, calculate defaults
  if (is.null(ch1_threshold)) {
    ch1_threshold <- processed_data %>%
      filter(Population == "Ch1 pos" | Population == "Double pos") %>%
      summarise(min_amp = min(.data[[ch1_col]], na.rm = TRUE)) %>%
      pull(min_amp) - 1
  }
  
  if (is.null(ch2_threshold)) {
    ch2_threshold <- processed_data %>%
      filter(Population == "Ch2 pos" | Population == "Double pos") %>%
      summarise(min_amp = min(.data[[ch2_col]], na.rm = TRUE)) %>%
      pull(min_amp) - 1
  }
  
  # Improved thresholding (partition assignment) logic
  processed_data <- processed_data %>%
    mutate(CustomPopulation = case_when(
      .data[[ch1_col]] >= ch1_threshold & .data[[ch2_col]] >= ch2_threshold ~ "Double pos",
      .data[[ch1_col]] >= ch1_threshold & .data[[ch2_col]] <  ch2_threshold ~ "Ch1 pos",
      .data[[ch1_col]] <  ch1_threshold & .data[[ch2_col]] >= ch2_threshold ~ "Ch2 pos",
      .data[[ch1_col]] <  ch1_threshold & .data[[ch2_col]] <  ch2_threshold ~ "Double neg",
      TRUE ~ "Undefined"
    ))
  
  # Simple density scatterplot
  
  p<-ggplot(processed_data, aes(x = Ch2Amplitude, y = Ch1Amplitude, colour = CustomPopulation))+
    geom_point()+
    scale_colour_manual(values = c("Ch1 pos" = "#000666", "Ch2 pos" = "#006600",
                                   "Double pos"="#FF6600", "Double neg" ="#333333"))+
    geom_hex(alpha=0.3)+
    scale_fill_gradient(low="white", high="red")+
    geom_hline(yintercept = ch1_threshold, linetype = "dashed", color = "black") +   # Horizontal threshold
    geom_vline(xintercept = ch2_threshold, linetype = "dashed", color = "black") +   # Vertical threshold
    labs(colour = "Droplet population") +  
    theme_minimal()
  
  # Return both processed data and threshold values
  return(list(
    processed_data_thr = processed_data,
    ch1_threshold = ch1_threshold,
    ch2_threshold = ch2_threshold,
    p = p
  ))
}

# Example usage:
## processed_data <- process_ddPCR_data("yourfile.csv", 1, 2)

# With default thresholds:
## data_with_default <- apply_thresholds(processed_data)

# OR with manual thresholds:
## data_with_custom <- apply_thresholds(processed_data, ch1_threshold = 3000, ch2_threshold = 4000)

# Extract data from the returned list
## data_table <- data_with_custom$processed_data_thr
## ch1_line <- data_with_custom$ch1_threshold
## ch2_line <- data_with_custom$ch2_threshold

## Plot
## Plot <- data_with_default$p
## p
The output of the second function could look something like this:
result <- apply_thresholds(processed_data, ch1_threshold = 2000, ch2_threshold = 2500)

print(result[[4]])

The third function simulate_droplet_clusters to compare your experimental clusters with simulated (perfectly distributed) clusters

simulate_droplet_clusters <- function(processed_data, population_col = "Population") {
  
  library(dplyr)
  library(ggplot2)
  library(patchwork)
  
  # Helper to pull population (either Population or CustomPopulation - you choose) data
  pop_data <- processed_data %>% rename(PopulationUse = !!population_col)
  
  ### --- Ch1 pos cluster (e.g., FAM) ---
  ch1_cluster <- pop_data %>% filter(PopulationUse == "Ch1 pos")
  
  mean_ch1 <- mean(ch1_cluster$Ch1Amplitude, na.rm = TRUE)
  sd_ch1   <- sd(ch1_cluster$Ch1Amplitude, na.rm = TRUE)
  
  mean_ch2_in_ch1 <- mean(ch1_cluster$Ch2Amplitude, na.rm = TRUE)
  sd_ch2_in_ch1   <- sd(ch1_cluster$Ch2Amplitude, na.rm = TRUE)
  
  n_ch1 <- nrow(ch1_cluster)
  
  # Simulated Ch1 pos data
  sim_ch1 <- data.frame(
    Ch1 = rnorm(n_ch1, mean = mean_ch1, sd = sd_ch1),
    Ch2 = rnorm(n_ch1, mean = mean_ch2_in_ch1, sd = sd_ch2_in_ch1),
    Type = "simulated"
  )
  
  # Experimental Ch1 pos data
  exp_ch1 <- ch1_cluster %>%
    select(Ch1 = Ch1Amplitude, Ch2 = Ch2Amplitude) %>%
    mutate(Type = "experimental")
  
  data_Ch1_pos_both <- bind_rows(sim_ch1, exp_ch1)
  
  ### Ch2 pos cluster (e.g., HEX)
  ch2_cluster <- pop_data %>% filter(PopulationUse == "Ch2 pos")
  
  mean_ch2 <- mean(ch2_cluster$Ch2Amplitude, na.rm = TRUE)
  sd_ch2   <- sd(ch2_cluster$Ch2Amplitude, na.rm = TRUE)
  
  mean_ch1_in_ch2 <- mean(ch2_cluster$Ch1Amplitude, na.rm = TRUE)
  sd_ch1_in_ch2   <- sd(ch2_cluster$Ch1Amplitude, na.rm = TRUE)
  
  n_ch2 <- nrow(ch2_cluster)
  
  # Simulated Ch2 pos data
  sim_ch2 <- data.frame(
    Ch1 = rnorm(n_ch2, mean = mean_ch1_in_ch2, sd = sd_ch1_in_ch2),
    Ch2 = rnorm(n_ch2, mean = mean_ch2, sd = sd_ch2),
    Type = "simulated"
  )
  
  # Experimental Ch2 pos data
  exp_ch2 <- ch2_cluster %>%
    select(Ch1 = Ch1Amplitude, Ch2 = Ch2Amplitude) %>%
    mutate(Type = "experimental")
  
  data_Ch2_pos_both <- bind_rows(sim_ch2, exp_ch2)
  
  # Plotting direct comparisons
  
    # Extract the combined experimental + simulated clusters
    data_FAM_pos_both <- data_Ch1_pos_both
    data_HEX_pos_both <- data_Ch2_pos_both
  
    # FAM (Ch1 pos) plot
    fam_sim_exp <- ggplot(data_Ch1_pos_both, aes(x = Ch2, y = Ch1)) +
      geom_point(color = "#000666", alpha = 0.6,inherit.aes = TRUE) +
      facet_grid(~Type) +
      theme_minimal() +
      labs(
        title = "Ch1-positive Droplets (FAM)",
        x = "Ch2 Amplitude",
        y = "Ch1 Amplitude"
      )
    
    # HEX (Ch2 pos) plot
    hex_sim_exp <- ggplot(data_Ch2_pos_both, aes(x = Ch2, y = Ch1)) +
      geom_point(color = "#006600", alpha = 0.6,inherit.aes = TRUE) +
      facet_grid(~Type) +
      theme_minimal() +
      labs(
        title = "Ch2-positive Droplets (HEX)",
        x = "Ch2 Amplitude",
        y = "Ch1 Amplitude"
      )
  
    # Combine using patchwork
    combined_plot <- fam_sim_exp / hex_sim_exp +
      plot_annotation(title = "Simulated vs Experimental Droplet Clusters")
    
  return(list(
    Ch1_cluster = data_Ch1_pos_both,
    Ch2_cluster = data_Ch2_pos_both,
    combined_plot = combined_plot
  ))
}

# Example usage
## clusters <- simulate_droplet_clusters(data_with_custom, population_col = "CustomPopulation")
## OR
## clusters <- simulate_droplet_clusters(processed_data)
## ch1_cluster_data <- clusters$Ch1_cluster
## ch2_cluster_data <- clusters$Ch2_cluster
## combined_plot <- clusters$combined_plot
## combined_plot
An example of cluster comparison (experimental vs simulated) would look something like this:
simulated_clusters<-simulate_droplet_clusters(processed_data)

print(simulated_clusters[[3]])

The fourth and final function simulate_ddPCR_counts_plot plots the theoretical copies per each droplet (0, 1, >1…) together with 95% conf. intervals using experimentally derived CPD

simulate_ddPCR_counts_plot <- function(processed_data, population_col = "Population", num_simulations = 100) {
  library(dplyr)
  library(ggplot2)
  library(tidyr)
  
  # Dynamically extract correct population column
  pop_data <- processed_data %>% rename(PopulationUse = !!population_col)
  
  # Total droplets
  total_droplets <- nrow(pop_data)
  
  # Positive droplets for each channel
  positive_droplets_Ch1 <- nrow(filter(pop_data, PopulationUse %in% c("Ch1 pos", "Double pos")))
  positive_droplets_Ch2 <- nrow(filter(pop_data, PopulationUse %in% c("Ch2 pos", "Double pos")))
  
  # Proportions and Poisson lambda estimates
  lambda_estimated_Ch1 <- -log(1 - positive_droplets_Ch1 / total_droplets)
  lambda_estimated_Ch2 <- -log(1 - positive_droplets_Ch2 / total_droplets)
  
  # Run simulations
  simulated_data <- lapply(1:num_simulations, function(i) {
    x_vals <- rpois(total_droplets, lambda = lambda_estimated_Ch1)
    y_vals <- rpois(total_droplets, lambda = lambda_estimated_Ch2)
    data.frame(x = x_vals, y = "Ch1", sim = i) %>%
      bind_rows(data.frame(x = y_vals, y = "Ch2", sim = i))
  }) %>% bind_rows()
  
  # Count occurrences
  simulated_counts <- simulated_data %>%
    group_by(y, sim, x) %>%
    summarise(count = n(), .groups = "drop")
  
  # Compute means and 95% CI
  summary_data <- simulated_counts %>%
    group_by(y, x) %>%
    summarise(
      mean_count = mean(count),
      sd_count = ifelse(n() > 1, sd(count), 0),
      .groups = "drop"
    ) %>%
    mutate(
      ci_lower = mean_count - 1.96 * (sd_count / sqrt(num_simulations)),
      ci_upper = mean_count + 1.96 * (sd_count / sqrt(num_simulations))
    ) %>%
    replace_na(list(ci_lower = 0, ci_upper = 0))
  
  # Plot
  plot <- ggplot(summary_data, aes(x = x, y = mean_count, fill = y)) +
    geom_col(alpha = 0.6, position = "dodge") +
    geom_errorbar(aes(ymin = pmax(ci_lower, 0), ymax = ci_upper),
                  width = 0.2, position = position_dodge(0.9)) +
    geom_text(aes(label = round(mean_count, 1)),
              vjust = -0.5, position = position_dodge(0.9)) +
    labs(title = paste0("Simulated Frequencies of Droplets with 0, 1 and >1 target copies (n = ", num_simulations, ")"),
         x = "Number of DNA molecules per droplet",
         y = "Mean Frequency (± 95% CI)") +
    scale_fill_manual(values = c("Ch1" = "#000666", "Ch2" = "#006600")) +
    facet_wrap(~y) +
    theme_minimal() +
    theme(legend.position = "none")
  
  return(plot)
}

# Example usage 1
## plot <- simulate_ddPCR_counts_plot(processed_data = processed_data, population_col = "Population")
# plot

# Example usage 2
## plot_custom <- simulate_ddPCR_counts_plot(processed_data = data_with_custom$processed_data_thr, population_col = "CustomPopulation")
## plot_custom
The output of the fourth function would be a Poisson distribution of the # of target copies (0, 1, 2…) plotted against their frequency (i.e. number of droplets). By default the number of simulations is 100 and the means and confidence intervals are plotted, for both channels.
simulate_ddPCR_counts_plot(processed_data)