box::use(
  shiny[reactive],
  dplyr[filter, select, collect],
  data.table,  # Import the full package
  arrow[read_parquet, open_dataset],
  rlang,
  lubridate,
  future.apply[future_lapply],
  future[plan, multisession],
  cachem,
  memoise
)

# Register the multisession future plan as soon as this module is loaded
plan(multisession)

# In-memory cache (capped at 1 GiB) backing a memoised parquet reader
cache <- cachem::cache_mem(max_size = 1024^3)
data_cache <- memoise::memoise(read_parquet, cache = cache)

#' Resolve the location of a parquet data file
#'
#' @param data_file Base name of the parquet file (no directory, no extension).
#' @param check_exists When `TRUE` (the default), stop if the resolved path is
#'   not present on disk.
#' @return The path `./app/data/<data_file>.parquet`.
#' @export
get_file_path <- function(data_file, check_exists = TRUE) {
  parquet_path <- paste0("./app/data/", data_file, ".parquet")

  # Only touch the file system when the caller asked for verification
  is_missing <- check_exists && !file.exists(parquet_path)
  if (is_missing) {
    stop("File does not exist: ", parquet_path)
  }

  parquet_path
}

#' Read one site's rows within a date range from a parquet dataset
#'
#' The date column is the first column of the dataset whose name contains
#' "Date". Both filters are expressed on the arrow dataset before `collect()`.
#'
#' @param dataset Base name of the parquet file, resolved by `get_file_path()`.
#' @param start,end Inclusive lower/upper bounds compared with the date column.
#' @param site Value compared (with `==`) against the `Location` column.
#' @return A data.table holding the matching rows.
#' @export
filter_data <- function(dataset, start, end, site) {
  file_path <- get_file_path(dataset)
  ds <- open_dataset(file_path)

  # Pick the date column from the schema; fail with a clear message instead of
  # letting rlang::sym(NA) raise an obscure conversion error further down.
  date_cols <- grep("Date", names(ds$schema), value = TRUE)
  if (length(date_cols) == 0L) {
    stop("No column containing 'Date' found in: ", file_path)
  }
  date_sym <- rlang::sym(date_cols[1L])

  dt <- ds |>
    filter(Location == site) |>
    filter(!!date_sym >= start & !!date_sym <= end) |>
    collect()

  # collect() returned a fresh object, so converting it in place is safe
  data.table::setDT(dt)

  dt
}

#' Load a parquet file as a de-duplicated data.table
#'
#' @param data_file Base name of the parquet file, resolved by
#'   `get_file_path()`.
#' @param cols Optional character vector of columns to read. When `NULL` the
#'   whole file is read through the memoised reader `data_cache()`; otherwise
#'   only the requested columns are collected from the arrow dataset.
#' @return A data.table with duplicate rows removed.
#' @export
load_data <- function(data_file, cols = NULL) {
  parquet_path <- get_file_path(data_file)

  loaded <- if (!is.null(cols)) {
    # Column subset requested: collect just those columns
    open_dataset(parquet_path) |>
      select(all_of(cols)) |>
      collect()
  } else {
    # Whole file: go through the memoised reader
    data_cache(parquet_path)
  }

  data.table::setDT(loaded)
  unique(loaded)
}

#' Combine cross records with their open-field transfer records
#'
#' Reads a fixed set of columns from each file, removes duplicate rows from
#' both, and joins them on Location, Crossnumber, Female_Genotype and
#' Male_Genotype. Every row of the first file appears in the result.
#'
#' @param data_file1 Base name of the parquet file with the cross records.
#' @param data_file2 Base name of the parquet file with the open-field records.
#' @return A data.table with the joined rows.
#' @export
load_openfield <- function(data_file1, data_file2) {
  # Resolve (and existence-check) both files before reading either one
  crosses_path <- get_file_path(data_file1)
  openfield_path <- get_file_path(data_file2)

  crosses <- open_dataset(crosses_path) |>
    select(
      Location, Crossnumber, Cross_Type, Female_Genotype,
      Male_Genotype, First_Pollination_Date
    ) |>
    collect()

  openfield <- open_dataset(openfield_path) |>
    select(
      Location, Crossnumber, PlantletID, Female_Genotype,
      Male_Genotype, Openfield_Transfer_Date, Number_in_Openfield
    ) |>
    collect()

  data.table::setDT(crosses)
  data.table::setDT(openfield)
  crosses <- unique(crosses)
  openfield <- unique(openfield)

  # X[Y] join: look up each cross row in the open-field table
  join_keys <- c("Location", "Crossnumber", "Female_Genotype", "Male_Genotype")
  openfield[crosses, on = join_keys]
}

#' Prepare activity data for charting
#'
#' Renames `date_col` to `Date`, filters by site and date range, adds
#' `Yearly`/`Monthly`/`Daily` components and, where applicable, a `Number`
#' column. The input object is never modified.
#'
#' @param data A data.frame or data.table with a `Location` column.
#' @param stage Activity name. For "Crosses" and "Banana Bunches" every row
#'   counts as 1; for any other stage `Number` is taken from `num_col`.
#' @param cols Unused by this function; kept for compatibility with callers.
#' @param date_col Name of the column holding the activity date.
#' @param num_col Optional name of the count column. Rows where it is `NA` are
#'   dropped. If `NULL` for a stage that needs it, no `Number` column is added.
#' @param start_date,end_date Inclusive bounds applied to the date column.
#' @param site Optional vector of locations to keep. `NULL`, or any value
#'   containing "All", disables the site filter.
#' @return A data.table with the filtered rows and the added columns.
#' @export
highchart_data <- function(data, stage, cols, date_col, num_col = NULL, start_date, end_date, site = NULL) {
  # setDT()/setnames() work by reference. Operating on the caller's object
  # would rename its date column and break a second call on the same data,
  # so work on a private copy instead.
  data <- data.table::copy(data)
  data.table::setDT(data)

  data.table::setnames(data, date_col, "Date")

  # `site` may hold several locations, so test membership rather than `!=`
  if (!is.null(site) && !("All" %in% site)) {
    data <- data[Location %in% site]
  }

  data <- data[Date >= start_date & Date <= end_date]

  data[, c("Yearly", "Monthly", "Daily") := list(
    lubridate::year(Date),
    lubridate::month(Date),
    lubridate::day(Date)
  )]

  if (stage %in% c("Crosses", "Banana Bunches")) {
    data[, Number := 1]
  } else if (!is.null(num_col)) {
    # Look the column up by name instead of get(), which can be shadowed
    has_value <- !is.na(data[[num_col]])
    data <- data[has_value]
    data.table::set(data, j = "Number", value = data[[num_col]])
  }

  data
}

#' Sum `Number` per location and time bucket
#'
#' @param data A data.frame or data.table with `Location`, `Number` and the
#'   columns needed for the chosen bucket: `Date` for "daily", `Yearly` and
#'   `Monthly` for "monthly", `Yearly` for "yearly". It is not modified.
#' @param group One of "daily", "monthly" or "yearly".
#' @return A data.table with a `number` column (sum of `Number`, `NA`s
#'   ignored) and a `Time` column. The grouping columns are
#'   `Location` + `Date` (daily), `Location` + `Time` formatted "YYYY-MM"
#'   (monthly) or `Location` + `Yearly` (yearly).
#' @export
grouped_data <- function(data, group) {
  if (!is.character(group) || length(group) != 1L || is.na(group)) {
    stop("`group` must be one of \"daily\", \"monthly\" or \"yearly\"")
  }

  # Convert without touching the caller's object (setDT works by reference)
  if (!data.table::is.data.table(data)) {
    data <- data.table::as.data.table(data)
  }

  result <- switch(
    group,
    daily = {
      totals <- data[, list(number = sum(Number, na.rm = TRUE)), by = list(Location, Date)]
      totals[, Time := Date]
      totals
    },
    # Build the month label inside `by` so no column is added to the input
    monthly = data[
      , list(number = sum(Number, na.rm = TRUE)),
      by = list(Location, Time = sprintf("%d-%02d", Yearly, Monthly))
    ],
    yearly = {
      totals <- data[, list(number = sum(Number, na.rm = TRUE)), by = list(Location, Yearly)]
      totals[, Time := Yearly]
      totals
    },
    stop("Unknown group '", group, "': expected \"daily\", \"monthly\" or \"yearly\"")
  )

  result
}

#' Load activity rows within a date range from a parquet file
#'
#' The date filter is applied to the arrow dataset before any column
#' selection, so `date_col` does not have to be listed in `cols`.
#'
#' @param data_file Base name of the parquet file, resolved by
#'   `get_file_path()`.
#' @param cols Optional character vector of columns to return. `NULL` keeps
#'   all columns.
#' @param sub_ Optional character vector of columns; rows with `NA` in any of
#'   them are dropped after de-duplication. These columns must be present in
#'   the returned data.
#' @param date_col Name of the date column to filter on.
#' @param start_,end_ Inclusive bounds applied to `date_col`.
#' @return A de-duplicated data.table.
#' @export
load_activity_data <- function(data_file, cols = NULL, sub_ = NULL, date_col, start_, end_) {
  file_path <- get_file_path(data_file)
  date_sym <- rlang::sym(date_col)

  # Filter first: selecting `cols` beforehand would make the filter fail
  # whenever `date_col` is not among the selected columns.
  query <- open_dataset(file_path) |>
    filter(!!date_sym >= start_ & !!date_sym <= end_)

  if (!is.null(cols)) {
    query <- query |> select(all_of(cols))
  }

  data <- query |> collect()
  data.table::setDT(data)
  data <- unique(data)

  absent <- setdiff(sub_, names(data))
  if (length(absent) > 0L) {
    stop("Column(s) not found in data: ", paste(absent, collapse = ", "))
  }

  # Drop rows with NA in any of the `sub_` columns (no-op when sub_ is NULL)
  for (na_col in sub_) {
    has_value <- !is.na(data[[na_col]])
    data <- data[has_value]
  }

  data
}

#' Build label rows for embryo-rescue crosses
#'
#' Replicates the selected rows according to the labelling option and splits
#' `Crossnumber` on "_" into a prefix and a suffix (parentheses removed from
#' the suffix). The input table is never modified.
#'
#' @param dt A data.frame or data.table with a `Crossnumber` column and, for
#'   every option other than "single plant", `Number_of_Embryo_Rescued`.
#' @param number_per_tube One of "single plant", "3 plants/test tube",
#'   "6 plants/test tube" or "Equal to # Embryo Rescued".
#' @param number_of_copies For "single plant": copies per row (rows are
#'   replicated only when this is greater than 0), after which the rows are
#'   ordered by the first column. Ignored for the other options.
#' @param rows_selected Optional row indices to keep; `NULL` keeps all rows.
#' @return A data.table with columns `Crossnumber`, `Prefix` and `Suffix`.
#'   Parts missing from a cross number come back as `NA`.
#' @export
tc_embryo_data <- function(dt, number_per_tube, number_of_copies, rows_selected = NULL) {
  # Convert without touching the caller's object (setDT works by reference)
  if (!data.table::is.data.table(dt)) {
    dt <- data.table::as.data.table(dt)
  }
  if (!("Crossnumber" %in% names(dt))) {
    stop("Column 'Crossnumber' not found in data")
  }

  if (!is.null(rows_selected)) {
    dt <- dt[rows_selected]
  }

  if (number_per_tube == "single plant") {
    if (number_of_copies > 0) {
      row_index <- rep(seq_len(nrow(dt)), each = number_of_copies)
      dt <- dt[row_index]
      # setorder(dt, 1) does not mean "first column": setorder takes column
      # names, so order by the first column's name explicitly.
      data.table::setorderv(dt, names(dt)[1L])
    }
  } else {
    plants_per_tube <- switch(
      number_per_tube,
      "3 plants/test tube" = 3,
      "6 plants/test tube" = 6,
      "Equal to # Embryo Rescued" = 1,
      stop("Unknown number_per_tube option: ", number_per_tube)
    )

    # Labels per row, kept as a local vector so no helper column is ever
    # written into (and left behind in) the caller's table.
    rescued <- dt[["Number_of_Embryo_Rescued"]]
    labels_per_row <- if (plants_per_tube > 1) {
      pmin(ceiling(rescued / plants_per_tube), rescued)
    } else {
      rescued
    }
    row_index <- rep(seq_len(nrow(dt)), times = labels_per_row)
    dt <- dt[row_index]
  }

  # Split by hand so empty input or IDs without "_" give NA instead of a
  # subscript error.
  parts <- strsplit(as.character(dt[["Crossnumber"]]), "_", fixed = TRUE)
  prefix <- vapply(parts, function(p) p[1L], character(1))
  suffix <- vapply(parts, function(p) p[2L], character(1))

  data.table::data.table(
    Crossnumber = dt[["Crossnumber"]],
    Prefix = prefix,
    Suffix = gsub("[()]", "", suffix)
  )
}

#' Build label rows for germinating embryos
#'
#' Optionally replicates the selected rows and splits `PlantletID` on "_"
#' into prefix, suffix (parentheses removed) and embryo number. The input
#' table is never modified.
#'
#' @param dt A data.frame or data.table with a `PlantletID` column.
#' @param number_of_copies Copies per row; rows are replicated only when this
#'   is greater than 0.
#' @param rows_selected Optional row indices to keep; `NULL` keeps all rows.
#' @return A data.table with columns `PlantletID`, `Prefix`, `Suffix` and
#'   `EmbryoNo`. Parts missing from an ID come back as `NA`.
#' @export
tc_embryo_germination <- function(dt, number_of_copies, rows_selected = NULL) {
  # Convert without touching the caller's object (setDT works by reference)
  if (!data.table::is.data.table(dt)) {
    dt <- data.table::as.data.table(dt)
  }
  if (!("PlantletID" %in% names(dt))) {
    stop("Column 'PlantletID' not found in data")
  }

  if (!is.null(rows_selected)) {
    dt <- dt[rows_selected]
  }

  if (number_of_copies > 0) {
    row_index <- rep(seq_len(nrow(dt)), each = number_of_copies)
    dt <- dt[row_index]
  }

  # Build the result from vectors rather than with `:=` on `dt`: when no
  # subsetting happened above, `dt` is still the caller's table.
  parts <- strsplit(as.character(dt[["PlantletID"]]), "_", fixed = TRUE)
  nth_part <- function(k) vapply(parts, function(p) p[k], character(1))

  data.table::data.table(
    PlantletID = dt[["PlantletID"]],
    Prefix = nth_part(1L),
    Suffix = gsub("[()]", "", nth_part(2L)),
    EmbryoNo = nth_part(3L)
  )
}

#' Build label rows for plantlets
#'
#' Replicates the selected rows according to the labelling option and splits
#' `PlantletID` on "_" into prefix, suffix (parentheses removed) and embryo
#' number. The input table is never modified.
#'
#' @param dt A data.frame or data.table with a `PlantletID` column.
#' @param embryo_col Name of the count column that drives replication for the
#'   per-tube options.
#' @param number_per_tube Labelling option. "single plant" replicates each row
#'   `number_of_copies` times and orders by the first column;
#'   "3 plants/test tube" / "6 plants/test tube" produce
#'   `min(ceiling(count / k), count)` rows per input row;
#'   "Equal to # of embryo_col" produces `count` rows. Any other value leaves
#'   the rows as they are.
#' @param number_of_copies Copies per row for "single plant" (used only when
#'   greater than 0).
#' @param rows_selected Optional row indices to keep; `NULL` keeps all rows.
#' @return A data.table with columns `PlantletID`, `Prefix`, `Suffix` and
#'   `EmbryoNo`. Parts missing from an ID come back as `NA`.
#' @export
tc_plantlet_data <- function(dt, embryo_col, number_per_tube, number_of_copies, rows_selected = NULL) {
  # Convert without touching the caller's object (setDT works by reference)
  if (!data.table::is.data.table(dt)) {
    dt <- data.table::as.data.table(dt)
  }
  if (!("PlantletID" %in% names(dt))) {
    stop("Column 'PlantletID' not found in data")
  }

  if (!is.null(rows_selected)) {
    dt <- dt[rows_selected]
  }

  if (number_per_tube == "single plant" && number_of_copies > 0) {
    row_index <- rep(seq_len(nrow(dt)), each = number_of_copies)
    dt <- dt[row_index]
    # setorder(dt, 1) does not mean "first column": setorder takes column
    # names, so order by the first column's name explicitly.
    data.table::setorderv(dt, names(dt)[1L])
  } else if (number_per_tube %in% c("3 plants/test tube", "6 plants/test tube")) {
    plants_per_tube <- if (number_per_tube == "3 plants/test tube") 3 else 6

    # Keep the per-row label count local so no helper column is written
    # into (and left behind in) the caller's table.
    counts <- dt[[embryo_col]]
    row_index <- rep(
      seq_len(nrow(dt)),
      times = pmin(ceiling(counts / plants_per_tube), counts)
    )
    dt <- dt[row_index]
  } else if (number_per_tube == "Equal to # of embryo_col") {
    # NOTE(review): this literal looks like a search/replace artefact; the
    # sibling tc_embryo_data() matches "Equal to # Embryo Rescued". Confirm
    # against the UI choices before changing it.
    row_index <- rep(seq_len(nrow(dt)), times = dt[[embryo_col]])
    dt <- dt[row_index]
  }

  # Split by hand so empty input or short IDs give NA instead of a
  # subscript error.
  parts <- strsplit(as.character(dt[["PlantletID"]]), "_", fixed = TRUE)
  nth_part <- function(k) vapply(parts, function(p) p[k], character(1))

  data.table::data.table(
    PlantletID = dt[["PlantletID"]],
    Prefix = nth_part(1L),
    Suffix = gsub("\\(|\\)", "", nth_part(2L)),
    EmbryoNo = nth_part(3L)
  )
}

#' Build the per-cross summary table from a parquet file
#'
#' Reads the file without any column whose name contains "Plot" or "Cycle",
#' keeps rows with non-empty `Female_Genotype`, `Male_Genotype` and
#' `Location`, removes duplicate rows and adds:
#' * `Banana_Bunches`: 1L when `Bunch_Harvest_Date` is present, else 0L;
#' * `Year_of_Pollination` / `Month_of_Pollination`: derived from
#'   `First_Pollination_Date` (the month as a month-name factor).
#'
#' @param data_file Base name of the parquet file, resolved by
#'   `get_file_path()`.
#' @return A data.table restricted to the summary columns that exist in the
#'   data, with descriptive columns as factors and count columns as numeric.
#' @export
summary_table <- function(data_file) {
  # Shared helper: builds the path and stops if the file is missing
  file_path <- get_file_path(data_file)

  data <- arrow::open_dataset(file_path) |>
    dplyr::select(-contains("Plot"), -contains("Cycle")) |>
    dplyr::filter(Female_Genotype != "", Male_Genotype != "", Location != "") |>
    dplyr::collect()

  data.table::setDT(data)
  data <- unique(data)

  # fifelse must be namespace-qualified: this module imports data.table as a
  # whole, so the bare name is not in scope. The month label is computed
  # directly; joining a lookup table and renaming left two columns called
  # Month_of_Pollination.
  data[, `:=`(
    Banana_Bunches = data.table::fifelse(!is.na(Bunch_Harvest_Date), 1L, 0L),
    Year_of_Pollination = lubridate::year(First_Pollination_Date),
    Month_of_Pollination = factor(
      month.name[lubridate::month(First_Pollination_Date)],
      levels = month.name
    )
  )]

  # NOTE(review): "Number_in_hardening" and "Weaning_2_Plantlets" are spelled
  # differently from the names summarize_activity() looks up
  # ("Number_in_Hardening", "Number_in_Weaning_2") - confirm against the data.
  cols_to_keep <- c(
    "Location", "Crossnumber", "Female_Genotype", "Female_Ploidy", "Female_Sub_Group",
    "Male_Genotype", "Male_Ploidy", "Male_Sub_Group", "Cross_Type",
    "First_Pollination_Date", "Year_of_Pollination", "Month_of_Pollination",
    "Number_of_Repeats", "Banana_Bunches", "Total_Seeds", "Good_Seeds",
    "Number_of_Embryo_Rescued", "Number_of_Embryo_Germinating", "Number_of_Subcultures",
    "Number_Rooting", "Number_Sent_Out", "Weaning_2_Plantlets", "Number_in_Screenhouse",
    "Number_in_hardening", "Number_in_Openfield"
  )

  # Keep only the listed columns that exist. This also discards the harvest,
  # extraction, rescue and germination date columns, none of which is listed.
  cols_to_keep <- intersect(cols_to_keep, names(data))
  result <- data[, cols_to_keep, with = FALSE]

  factor_cols <- c(
    "Location", "Cross_Type", "Female_Genotype", "Female_Sub_Group",
    "Female_Ploidy", "Male_Genotype", "Male_Sub_Group", "Male_Ploidy",
    "Year_of_Pollination", "Month_of_Pollination"
  )
  for (col in intersect(factor_cols, names(result))) {
    data.table::set(result, j = col, value = as.factor(result[[col]]))
  }

  # Count columns are recognised by name
  num_cols <- grep("Number|Seeds|Weaning|Plantlets", names(result), value = TRUE)
  for (col in num_cols) {
    data.table::set(result, j = col, value = as.numeric(result[[col]]))
  }

  result
}

#' Summarise one activity per group
#'
#' For "first_pollination", "banana_bunches" and any unrecognised activity
#' the result is a row count per group; for the other activities it is the
#' sum (ignoring `NA`) of the activity's count column.
#'
#' @param data A data.frame or data.table; it is not modified.
#' @param group_by_col Character vector of grouping column names.
#' @param activity Activity key, e.g. "seed_extraction", "rooting".
#' @param embryo_col Name of the column summed for `activity = "subcultures"`.
#'   The default is an assumption taken from the column names used elsewhere
#'   in this file - confirm it matches the data.
#' @return A data.table with the grouping columns plus one column named
#'   `Number_of_<activity>`, or `NULL` (with a warning) when the required
#'   count column is absent from `data`.
#' @export
summarize_activity <- function(data, group_by_col, activity,
                               embryo_col = "Number_of_Subcultures") {
  # Convert without touching the caller's object (setDT works by reference)
  if (!data.table::is.data.table(data)) {
    data <- data.table::as.data.table(data)
  }

  # NOTE(review): "Number_in_Weaning_2" and "Number_in_Hardening" differ from
  # the spellings kept by summary_table() ("Weaning_2_Plantlets",
  # "Number_in_hardening") - confirm against the data.
  sum_col <- switch(
    activity,
    "first_pollination" = NULL, # row count
    "banana_bunches" = NULL, # row count
    "seed_extraction" = "Total_Seeds",
    "good_seeds" = "Good_Seeds",
    "embryo_rescue" = "Number_of_Embryo_Rescued",
    "embryo_germination" = "Number_of_Embryo_Germinating",
    # Previously referenced an undefined `embryo_col`, so this activity
    # always failed; it is now a defaulted parameter.
    "subcultures" = embryo_col,
    "rooting" = "Number_Rooting",
    "weaning1" = "Number_Sent_Out",
    "weaning2" = "Number_in_Weaning_2",
    "screenhouse" = "Number_in_Screenhouse",
    "hardening" = "Number_in_Hardening",
    "openfield" = "Number_in_Openfield",
    NULL
  )

  result_col_name <- paste0("Number_of_", gsub(" ", "_", activity))

  if (is.null(sum_col)) {
    result <- data[, list(temp = .N), by = group_by_col]
  } else {
    if (!(sum_col %in% names(data))) {
      warning("Column '", sum_col, "' not found in data. Returning NULL.")
      return(NULL)
    }
    # .SDcols selects the column by name, avoiding get() lookups
    result <- data[
      , list(temp = sum(.SD[[1L]], na.rm = TRUE)),
      by = group_by_col, .SDcols = sum_col
    ]
  }

  data.table::setnames(result, "temp", result_col_name)
  result
}
