From 74bef918eb74fd8e4f498b3819f6cce9d6343e8a Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Wed, 29 Jan 2025 13:28:48 +0100 Subject: [PATCH 01/11] [save] enable PUGIXML_COMPACT --- inst/include/pugixml/pugiconfig.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/include/pugixml/pugiconfig.hpp b/inst/include/pugixml/pugiconfig.hpp index c7c9af18e..0101acea4 100644 --- a/inst/include/pugixml/pugiconfig.hpp +++ b/inst/include/pugixml/pugiconfig.hpp @@ -18,7 +18,7 @@ // #define PUGIXML_WCHAR_MODE // Uncomment this to enable compact mode -// #define PUGIXML_COMPACT +#define PUGIXML_COMPACT // Uncomment this to disable XPath // #define PUGIXML_NO_XPATH From 745c6380fddb46f15af1a197c038d0a3ce159b2d Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Wed, 29 Jan 2025 19:41:30 +0100 Subject: [PATCH 02/11] [save] avoid memory blow up --- R/class-workbook.R | 31 +++++++++++++++++-------------- src/RcppExports.cpp | 4 ++-- src/write_file.cpp | 2 +- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/R/class-workbook.R b/R/class-workbook.R index 91da0d6b3..e5e68b0bb 100644 --- a/R/class-workbook.R +++ b/R/class-workbook.R @@ -9778,40 +9778,43 @@ wbWorkbook <- R6::R6Class( } } else { ## Write worksheets - ws <- self$worksheets[[i]] - hasHL <- length(ws$hyperlinks) > 0 + # ws <- self$worksheets[[i]] + hasHL <- length(self$worksheets[[i]]$hyperlinks) > 0 - prior <- ws$get_prior_sheet_data() - post <- ws$get_post_sheet_data() + prior <- self$worksheets[[i]]$get_prior_sheet_data() + post <- self$worksheets[[i]]$get_post_sheet_data() - if (!is.null(ws$sheet_data$cc)) { + if (!is.null(self$worksheets[[i]]$sheet_data$cc)) { - cc <- ws$sheet_data$cc - cc$r <- stringi::stri_join(cc$c_r, cc$row_r) + self$worksheets[[i]]$sheet_data$cc$r <- with( + self$worksheets[[i]]$sheet_data$cc, + stringi::stri_join(c_r, row_r) + ) + cc <- self$worksheets[[i]]$sheet_data$cc # prepare data for output # there can be files, where row_attr is incomplete because a row # is lacking any attributes (presumably was added before saving) # still row_attr is what we want! - rows_attr <- ws$sheet_data$row_attr - ws$sheet_data$row_attr <- rows_attr[order(as.numeric(rows_attr[, "r"])), ] + rows_attr <- self$worksheets[[i]]$sheet_data$row_attr + self$worksheets[[i]]$sheet_data$row_attr <- rows_attr[order(as.numeric(rows_attr[, "r"])), ] - cc_rows <- ws$sheet_data$row_attr$r + cc_rows <- self$worksheets[[i]]$sheet_data$row_attr$r # c("row_r", "c_r", "r", "v", "c_t", "c_s", "c_cm", "c_ph", "c_vm", "f", "f_attr", "is") cc <- cc[cc$row_r %in% cc_rows, ] - ws$sheet_data$cc <- cc[order(as.integer(cc[, "row_r"]), col2int(cc[, "c_r"])), ] + self$worksheets[[i]]$sheet_data$cc <- cc[order(as.integer(cc[, "row_r"]), col2int(cc[, "c_r"])), ] } else { - ws$sheet_data$row_attr <- NULL - ws$sheet_data$cc <- NULL + self$worksheets[[i]]$sheet_data$row_attr <- NULL + self$worksheets[[i]]$sheet_data$cc <- NULL } # create entire sheet prior to writing it sheet_xml <- write_worksheet( prior = prior, post = post, - sheet_data = ws$sheet_data + sheet_data = self$worksheets[[i]]$sheet_data ) ws_file <- file.path(xlworksheetsDir, sprintf("sheet%s.xml", i)) write_xmlPtr(doc = sheet_xml, fl = ws_file) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 87e042c7f..afab64d3f 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -854,14 +854,14 @@ BEGIN_RCPP END_RCPP } // write_worksheet -XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment sheet_data); +XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment& sheet_data); RcppExport SEXP _openxlsx2_write_worksheet(SEXP priorSEXP, SEXP postSEXP, SEXP sheet_dataSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::string >::type prior(priorSEXP); Rcpp::traits::input_parameter< std::string >::type post(postSEXP); - Rcpp::traits::input_parameter< Rcpp::Environment >::type sheet_data(sheet_dataSEXP); + Rcpp::traits::input_parameter< Rcpp::Environment& >::type sheet_data(sheet_dataSEXP); rcpp_result_gen = Rcpp::wrap(write_worksheet(prior, post, sheet_data)); return rcpp_result_gen; END_RCPP diff --git a/src/write_file.cpp b/src/write_file.cpp index 6cb2dcb90..7509307b3 100644 --- a/src/write_file.cpp +++ b/src/write_file.cpp @@ -202,7 +202,7 @@ void xml_sheet_data(pugi::xml_node &doc, Rcpp::DataFrame &row_attr, Rcpp::DataFr // create single xml rows of sheet_data. // // [[Rcpp::export]] -XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment sheet_data) { +XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment &sheet_data) { uint32_t pugi_parse_flags = pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_ws_pcdata | pugi::parse_eol; // sheet_data will be in order, just need to check for row_heights From 34e66d3c735393c79f51b8c16c1511c30d6920a3 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 20 Jul 2024 19:24:42 +0200 Subject: [PATCH 03/11] [writing] provide openxlsx2.export_with_pugi = FALSE option --- R/RcppExports.R | 4 + R/class-workbook.R | 77 +++++++++------ src/RcppExports.cpp | 14 +++ src/write_file.cpp | 186 ++++++++++++++++++++++++++++++++++++ tests/testthat/test-write.R | 22 +++++ 5 files changed, 276 insertions(+), 27 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 77566d09a..43a519d5c 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -340,6 +340,10 @@ set_sst <- function(sharedStrings) { .Call(`_openxlsx2_set_sst`, sharedStrings) } +write_worksheet_slim <- function(sheet_data, prior, post, fl) { + invisible(.Call(`_openxlsx2_write_worksheet_slim`, sheet_data, prior, post, fl)) +} + write_worksheet <- function(prior, post, sheet_data) { .Call(`_openxlsx2_write_worksheet`, prior, post, sheet_data) } diff --git a/R/class-workbook.R b/R/class-workbook.R index e5e68b0bb..2cc17a210 100644 --- a/R/class-workbook.R +++ b/R/class-workbook.R @@ -9784,41 +9784,64 @@ wbWorkbook <- R6::R6Class( prior <- self$worksheets[[i]]$get_prior_sheet_data() post <- self$worksheets[[i]]$get_post_sheet_data() - if (!is.null(self$worksheets[[i]]$sheet_data$cc)) { + use_pugixml_export <- getOption("openxlsx2.export_with_pugi", default = TRUE) - self$worksheets[[i]]$sheet_data$cc$r <- with( - self$worksheets[[i]]$sheet_data$cc, - stringi::stri_join(c_r, row_r) - ) - cc <- self$worksheets[[i]]$sheet_data$cc - # prepare data for output + if (use_pugixml_export) { + # failsaves. check that all rows and cells + # are available and in the correct order + if (!is.null(self$worksheets[[i]]$sheet_data$cc)) { + + self$worksheets[[i]]$sheet_data$cc$r <- with( + self$worksheets[[i]]$sheet_data$cc, + stringi::stri_join(c_r, row_r) + ) + cc <- self$worksheets[[i]]$sheet_data$cc + # prepare data for output - # there can be files, where row_attr is incomplete because a row - # is lacking any attributes (presumably was added before saving) - # still row_attr is what we want! + # there can be files, where row_attr is incomplete because a row + # is lacking any attributes (presumably was added before saving) + # still row_attr is what we want! - rows_attr <- self$worksheets[[i]]$sheet_data$row_attr - self$worksheets[[i]]$sheet_data$row_attr <- rows_attr[order(as.numeric(rows_attr[, "r"])), ] + rows_attr <- self$worksheets[[i]]$sheet_data$row_attr + self$worksheets[[i]]$sheet_data$row_attr <- rows_attr[order(as.numeric(rows_attr[, "r"])), ] - cc_rows <- self$worksheets[[i]]$sheet_data$row_attr$r - # c("row_r", "c_r", "r", "v", "c_t", "c_s", "c_cm", "c_ph", "c_vm", "f", "f_attr", "is") - cc <- cc[cc$row_r %in% cc_rows, ] + cc_rows <- self$worksheets[[i]]$sheet_data$row_attr$r + # c("row_r", "c_r", "r", "v", "c_t", "c_s", "c_cm", "c_ph", "c_vm", "f", "f_attr", "is") + cc <- cc[cc$row_r %in% cc_rows, ] - self$worksheets[[i]]$sheet_data$cc <- cc[order(as.integer(cc[, "row_r"]), col2int(cc[, "c_r"])), ] - } else { - self$worksheets[[i]]$sheet_data$row_attr <- NULL - self$worksheets[[i]]$sheet_data$cc <- NULL + self$worksheets[[i]]$sheet_data$cc <- cc[order(as.integer(cc[, "row_r"]), col2int(cc[, "c_r"])), ] + rm(cc) + } else { + self$worksheets[[i]]$sheet_data$row_attr <- NULL + self$worksheets[[i]]$sheet_data$cc <- NULL + } } - # create entire sheet prior to writing it - sheet_xml <- write_worksheet( - prior = prior, - post = post, - sheet_data = self$worksheets[[i]]$sheet_data - ) ws_file <- file.path(xlworksheetsDir, sprintf("sheet%s.xml", i)) - write_xmlPtr(doc = sheet_xml, fl = ws_file) - rm(sheet_xml) + + if (use_pugixml_export) { + + # create entire sheet prior to writing it + sheet_xml <- write_worksheet( + prior = prior, + post = post, + sheet_data = self$worksheets[[i]]$sheet_data + ) + write_xmlPtr(doc = sheet_xml, fl = ws_file) + + } else { + + if (grepl("", prior)) + prior <- substr(prior, 1, nchar(prior) - 13) # remove " " + + write_worksheet_slim( + sheet_data = self$worksheets[[i]]$sheet_data, + prior = prior, + post = post, + fl = ws_file + ) + + } ## write worksheet rels if (length(self$worksheets_rels[[i]])) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index afab64d3f..eee07da48 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -853,6 +853,19 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// write_worksheet_slim +void write_worksheet_slim(Rcpp::Environment sheet_data, std::string prior, std::string post, std::string fl); +RcppExport SEXP _openxlsx2_write_worksheet_slim(SEXP sheet_dataSEXP, SEXP priorSEXP, SEXP postSEXP, SEXP flSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::Environment >::type sheet_data(sheet_dataSEXP); + Rcpp::traits::input_parameter< std::string >::type prior(priorSEXP); + Rcpp::traits::input_parameter< std::string >::type post(postSEXP); + Rcpp::traits::input_parameter< std::string >::type fl(flSEXP); + write_worksheet_slim(sheet_data, prior, post, fl); + return R_NilValue; +END_RCPP +} // write_worksheet XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment& sheet_data); RcppExport SEXP _openxlsx2_write_worksheet(SEXP priorSEXP, SEXP postSEXP, SEXP sheet_dataSEXP) { @@ -1039,6 +1052,7 @@ static const R_CallMethodDef CallEntries[] = { {"_openxlsx2_read_colors", (DL_FUNC) &_openxlsx2_read_colors, 1}, {"_openxlsx2_write_colors", (DL_FUNC) &_openxlsx2_write_colors, 1}, {"_openxlsx2_set_sst", (DL_FUNC) &_openxlsx2_set_sst, 1}, + {"_openxlsx2_write_worksheet_slim", (DL_FUNC) &_openxlsx2_write_worksheet_slim, 4}, {"_openxlsx2_write_worksheet", (DL_FUNC) &_openxlsx2_write_worksheet, 3}, {"_openxlsx2_write_xmlPtr", (DL_FUNC) &_openxlsx2_write_xmlPtr, 2}, {"_openxlsx2_styles_bin", (DL_FUNC) &_openxlsx2_styles_bin, 3}, diff --git a/src/write_file.cpp b/src/write_file.cpp index 7509307b3..2f81d83ce 100644 --- a/src/write_file.cpp +++ b/src/write_file.cpp @@ -20,6 +20,192 @@ Rcpp::CharacterVector set_sst(Rcpp::CharacterVector sharedStrings) { return sst; } +// write xml by streaming to files. this takes whatever input we provide and +// dumps it into the file. no xml checking, no unicode checking +void xml_sheet_data_slim( + Rcpp::DataFrame row_attr, + Rcpp::DataFrame cc, + std::string prior, + std::string post, + std::string fl +) { + + bool has_cm = cc.containsElementNamed("c_cm"); + bool has_ph = cc.containsElementNamed("c_ph"); + bool has_vm = cc.containsElementNamed("c_vm"); + + std::ofstream file(fl); + + auto lastrow = 0; // integer value of the last row with column data + auto thisrow = 0; // integer value of the current row with column data + auto row_idx = 0; // the index of the row_attr file. this is != rowid + auto rowid = 0; // integer value of the r field in row_attr + + std::string xml_preserver = " "; + + file << "\n"; + file << prior; + + Rcpp::CharacterVector cc_c_cm, cc_c_ph, cc_c_vm; + + if (cc.nrow() && cc.ncol()) { + // we cannot access rows directly in the dataframe. + // Have to extract the columns and use these + Rcpp::CharacterVector cc_row_r = cc["row_r"]; // 1 + Rcpp::CharacterVector cc_r = cc["r"]; // A1 + Rcpp::CharacterVector cc_v = cc["v"]; + Rcpp::CharacterVector cc_c_t = cc["c_t"]; + Rcpp::CharacterVector cc_c_s = cc["c_s"]; + if (has_cm) cc_c_cm = cc["c_cm"]; + if (has_ph) cc_c_ph = cc["c_ph"]; + if (has_vm) cc_c_vm = cc["c_vm"]; + Rcpp::CharacterVector cc_f = cc["f"]; + Rcpp::CharacterVector cc_f_attr = cc["f_attr"]; + Rcpp::CharacterVector cc_is = cc["is"]; + + Rcpp::CharacterVector row_r = row_attr["r"]; + + + file << ""; + for (auto i = 0; i < cc.nrow(); ++i) { + + thisrow = std::stoi(Rcpp::as(cc_row_r[i])); + + if (lastrow < thisrow) { + + // there might be entirely empty rows in between. this is the case for + // loadExample. We check the rowid and write the line and skip until we + // have every row and only then continue writing the column + while (rowid < thisrow) { + + rowid = std::stoi(Rcpp::as( + row_r[row_idx] + )); + + if (row_idx) file << ""; + file << "(row_attr[j])[row_idx]; + + if (cv_s[0] != "") { + const std::string val_strl = Rcpp::as(cv_s); + file << " " << attrnams[j] << "=\"" << val_strl.c_str() << "\""; + } + } + file << ">"; // end + + // read the next row_idx when visiting again + ++row_idx; + } + } + + // create node + file << " + file << " r" << "=\"" << to_string(cc_r[i]).c_str() << "\""; + + if (!to_string(cc_c_s[i]).empty()) + file << " s" << "=\"" << to_string(cc_c_s[i]).c_str() << "\""; + + // assign type if not aka numeric + if (!to_string(cc_c_t[i]).empty()) + file << " t" << "=\"" << to_string(cc_c_t[i]).c_str() << "\""; + + // CellMetaIndex: suppress curly brackets in spreadsheet software + if (has_cm && !to_string(cc_c_cm[i]).empty()) + file << " cm" << "=\"" << to_string(cc_c_cm[i]).c_str() << "\""; + + // phonetics spelling + if (has_ph && !to_string(cc_c_ph[i]).empty()) + file << " ph" << "=\"" << to_string(cc_c_ph[i]).c_str() << "\""; + + // suppress curly brackets in spreadsheet software + if (has_vm && !to_string(cc_c_vm[i]).empty()) + file << " vm" << "=\"" << to_string(cc_c_vm[i]).c_str() << "\""; + + file << ">"; // end + + bool f_si = false; + + // ... + // f node: formula to be evaluated + if (!to_string(cc_f[i]).empty() || !to_string(cc_f_attr[i]).empty()) { + file << ""; + + file << to_string(cc_f[i]).c_str(); + + file << ""; + } + + // v node: value stored from evaluated formula + if (!to_string(cc_v[i]).empty()) { + if (!f_si & (to_string(cc_v[i]).compare(xml_preserver.c_str()) == 0)) { + // this looks strange + file << ""; + file << " "; + file << ""; + } else { + file << "" << to_string(cc_v[i]).c_str() << ""; + } + } + + // ... + if (to_string(cc_c_t[i]).compare("inlineStr") == 0) { + if (!to_string(cc_is[i]).empty()) { + file << to_string(cc_is[i]).c_str(); + } + } + + file << ""; + + // update lastrow + lastrow = thisrow; + } + + file << ""; + file << ""; + } else { + file << ""; + } + + + file << post; + file << ""; + + file.close(); + +} + +// export worksheet without pugixml +// this should be way quicker, uses far less memory, but also skips all of the checks pugi does +// +// [[Rcpp::export]] +void write_worksheet_slim( + Rcpp::Environment sheet_data, + std::string prior, + std::string post, + std::string fl +){ + // sheet_data will be in order, just need to check for row_heights + // CharacterVector cell_col = int_to_col(sheet_data.field("cols")); + Rcpp::DataFrame row_attr = Rcpp::as(sheet_data["row_attr"]); + Rcpp::DataFrame cc = Rcpp::as(sheet_data["cc"]); + + xml_sheet_data_slim(row_attr, cc, prior, post, fl); +} + // creates an xml row // data in xml is ordered row wise. therefore we need the row attributes and // the column data used in this row. This function uses both to create a single diff --git a/tests/testthat/test-write.R b/tests/testthat/test-write.R index e57d13d4a..8c94cc34e 100644 --- a/tests/testthat/test-write.R +++ b/tests/testthat/test-write.R @@ -1470,5 +1470,27 @@ test_that("guarding against overwriting shared formula reference works", { exp <- c("1", "2", "B1 + 1", "C1 + 1") got <- unname(unlist(wb$to_df(show_formula = TRUE, col_names = FALSE))) expect_equal(exp, got) +}) + +test_that("writing without pugixml works", { + + temp <- temp_xlsx() + expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb <- wb_load(temp)) + + temp <- temp_xlsx() + options("openxlsx2.export_with_pugi" = FALSE) + expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb <- wb_load(temp)) + + temp <- temp_xlsx() + options("openxlsx2.export_with_pugi" = TRUE) + expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb <- wb_load(temp)) + + temp <- temp_xlsx() + options("openxlsx2.export_with_pugi" = NULL) + expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb <- wb_load(temp)) }) From bb91cbcdd7ad79c56620bd570aff49dd27a7d9d8 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 3 Mar 2025 13:11:43 +0100 Subject: [PATCH 04/11] [save] provide `flush` argument --- R/class-workbook-wrappers.R | 18 ++++++++++++++++-- R/class-workbook.R | 11 ++++++----- R/write_xlsx.R | 11 +++++++++-- man/wbWorkbook.Rd | 4 +++- man/wb_save.Rd | 18 +++++++++++++++++- 5 files changed, 51 insertions(+), 11 deletions(-) diff --git a/R/class-workbook-wrappers.R b/R/class-workbook-wrappers.R index fe3272470..9f3bbc7ec 100644 --- a/R/class-workbook-wrappers.R +++ b/R/class-workbook-wrappers.R @@ -67,10 +67,24 @@ wb_workbook <- function( #' Save a workbook to file #' +#' @details When saving a `wbWorkbook` to a file, memory usage may spike +#' depending on the worksheet size. This happens because the entire XML +#' structure is created in memory before writing to disk. The memory +#' required depends on worksheet size, as XML files consist of character +#' data and include additional overhead for validity checks. +#' +#' The `flush` argument streams worksheet XML data directly to disk, +#' avoiding the need to build the full XML tree in memory. This reduces +#' memory usage but skips some XML validity checks. It also bypasses +#' the `pugixml` functions that `openxlsx2` uses, omitting certain +#' preliminary sanity checks before writing. As the name suggests, +#' the output is simply flushed to disk. +#' #' @param wb A `wbWorkbook` object to write to file #' @param file A path to save the workbook to #' @param overwrite If `FALSE`, will not overwrite when `file` already exists. #' @param path Deprecated argument. Please use `file` in new code. +#' @param flush Experimental, streams the worksheet file to disk #' #' @export #' @family workbook wrappers @@ -86,9 +100,9 @@ wb_workbook <- function( #' \donttest{ #' wb_save(wb, file = temp_xlsx(), overwrite = TRUE) #' } -wb_save <- function(wb, file = NULL, overwrite = TRUE, path = NULL) { +wb_save <- function(wb, file = NULL, overwrite = TRUE, path = NULL, flush = FALSE) { assert_workbook(wb) - wb$clone()$save(file = file, overwrite = overwrite, path = path) + wb$clone()$save(file = file, overwrite = overwrite, path = path, flush = flush) } # add data ---------------------------------------------------------------- diff --git a/R/class-workbook.R b/R/class-workbook.R index 2cc17a210..3022df014 100644 --- a/R/class-workbook.R +++ b/R/class-workbook.R @@ -2910,8 +2910,9 @@ wbWorkbook <- R6::R6Class( #' @param file The path to save the workbook to #' @param overwrite If `FALSE`, will not overwrite when `path` exists #' @param path Deprecated argument previously used for file. Please use file in new code. + #' @param flush Experimental, streams the worksheet file to disk #' @return The `wbWorkbook` object invisibly - save = function(file = self$path, overwrite = TRUE, path = NULL) { + save = function(file = self$path, overwrite = TRUE, path = NULL, flush = FALSE) { if (!is.null(path)) { .Deprecated(old = "wb_save(path)", new = "wb_save(file)", package = "openxlsx2") @@ -3435,7 +3436,8 @@ wbWorkbook <- R6::R6Class( xlchartsDir, xlchartsRelsDir, xlworksheetsDir, - xlworksheetsRelsDir + xlworksheetsRelsDir, + use_pugixml_export = isFALSE(flush) ) ## write sharedStrings.xml @@ -9643,7 +9645,8 @@ wbWorkbook <- R6::R6Class( xlchartsDir, xlchartsRelsDir, xlworksheetsDir, - xlworksheetsRelsDir + xlworksheetsRelsDir, + use_pugixml_export ) { ## write charts @@ -9784,8 +9787,6 @@ wbWorkbook <- R6::R6Class( prior <- self$worksheets[[i]]$get_prior_sheet_data() post <- self$worksheets[[i]]$get_post_sheet_data() - use_pugixml_export <- getOption("openxlsx2.export_with_pugi", default = TRUE) - if (use_pugixml_export) { # failsaves. check that all rows and cells # are available and in the correct order diff --git a/R/write_xlsx.R b/R/write_xlsx.R index 05b3ca5b3..3fac2ea44 100644 --- a/R/write_xlsx.R +++ b/R/write_xlsx.R @@ -58,7 +58,8 @@ write_xlsx <- function(x, file, as_table = FALSE, ...) { "table_name", "with_filter", "first_active_row", "first_active_col", "first_row", "first_col", "col_widths", "na.strings", "overwrite", "title", "subject", "category", - "font_size", "font_color", "font_name" + "font_size", "font_color", "font_name", + "flush" ) params <- list(...) @@ -296,6 +297,12 @@ write_xlsx <- function(x, file, as_table = FALSE, ...) { font_args$font_name <- params$font_name } + # Flush stream file to disk + flush <- FALSE + if ("flush" %in% names(params)) { + flush <- params$flush + } + ## create new Workbook object wb <- wb_workbook(creator = creator, title = title, subject = subject, category = category) @@ -479,7 +486,7 @@ write_xlsx <- function(x, file, as_table = FALSE, ...) { } if (!missing(file)) - wb_save(wb, file = file, overwrite = overwrite) + wb_save(wb, file = file, overwrite = overwrite, flush = flush) invisible(wb) } diff --git a/man/wbWorkbook.Rd b/man/wbWorkbook.Rd index 42fe1d887..d72d628de 100644 --- a/man/wbWorkbook.Rd +++ b/man/wbWorkbook.Rd @@ -1102,7 +1102,7 @@ The \code{wbWorkbook} object invisibly \subsection{Method \code{save()}}{ Save the workbook \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{wbWorkbook$save(file = self$path, overwrite = TRUE, path = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{wbWorkbook$save(file = self$path, overwrite = TRUE, path = NULL, flush = FALSE)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -1113,6 +1113,8 @@ Save the workbook \item{\code{overwrite}}{If \code{FALSE}, will not overwrite when \code{path} exists} \item{\code{path}}{Deprecated argument previously used for file. Please use file in new code.} + +\item{\code{flush}}{Experimental, streams the worksheet file to disk} } \if{html}{\out{}} } diff --git a/man/wb_save.Rd b/man/wb_save.Rd index 44beff524..dc591de23 100644 --- a/man/wb_save.Rd +++ b/man/wb_save.Rd @@ -4,7 +4,7 @@ \alias{wb_save} \title{Save a workbook to file} \usage{ -wb_save(wb, file = NULL, overwrite = TRUE, path = NULL) +wb_save(wb, file = NULL, overwrite = TRUE, path = NULL, flush = FALSE) } \arguments{ \item{wb}{A \code{wbWorkbook} object to write to file} @@ -14,6 +14,8 @@ wb_save(wb, file = NULL, overwrite = TRUE, path = NULL) \item{overwrite}{If \code{FALSE}, will not overwrite when \code{file} already exists.} \item{path}{Deprecated argument. Please use \code{file} in new code.} + +\item{flush}{Experimental, streams the worksheet file to disk} } \value{ the \code{wbWorkbook} object, invisibly @@ -21,6 +23,20 @@ the \code{wbWorkbook} object, invisibly \description{ Save a workbook to file } +\details{ +When saving a \code{wbWorkbook} to a file, memory usage may spike +depending on the worksheet size. This happens because the entire XML +structure is created in memory before writing to disk. The memory +required depends on worksheet size, as XML files consist of character +data and include additional overhead for validity checks. + +The \code{flush} argument streams worksheet XML data directly to disk, +avoiding the need to build the full XML tree in memory. This reduces +memory usage but skips some XML validity checks. It also bypasses +the \code{pugixml} functions that \code{openxlsx2} uses, omitting certain +preliminary sanity checks before writing. As the name suggests, +the output is simply flushed to disk. +} \examples{ ## Create a new workbook and add a worksheet wb <- wb_workbook("Creator of workbook") From 306a58c37f631fadae3d3e79408ba172b3231772 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 3 Mar 2025 13:11:51 +0100 Subject: [PATCH 05/11] Revert "[save] enable PUGIXML_COMPACT" This reverts commit 74bef918eb74fd8e4f498b3819f6cce9d6343e8a. --- inst/include/pugixml/pugiconfig.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/include/pugixml/pugiconfig.hpp b/inst/include/pugixml/pugiconfig.hpp index 0101acea4..c7c9af18e 100644 --- a/inst/include/pugixml/pugiconfig.hpp +++ b/inst/include/pugixml/pugiconfig.hpp @@ -18,7 +18,7 @@ // #define PUGIXML_WCHAR_MODE // Uncomment this to enable compact mode -#define PUGIXML_COMPACT +// #define PUGIXML_COMPACT // Uncomment this to disable XPath // #define PUGIXML_NO_XPATH From 8b7f60cc37eb5172f0a7d11507b7ceda35683e66 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 3 Mar 2025 13:14:08 +0100 Subject: [PATCH 06/11] [save] assert `flush` --- R/class-workbook.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/class-workbook.R b/R/class-workbook.R index 3022df014..23f931ed4 100644 --- a/R/class-workbook.R +++ b/R/class-workbook.R @@ -2921,6 +2921,7 @@ wbWorkbook <- R6::R6Class( assert_class(file, "character") assert_class(overwrite, "logical") + assert_class(flush, "logical") if (file.exists(file) & !overwrite) { stop("File already exists!") From bc409542297a439415707ef0aa020c3634b020cd Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 3 Mar 2025 13:14:46 +0100 Subject: [PATCH 07/11] [misc] update WORDLIST --- inst/WORDLIST | 1 - 1 file changed, 1 deletion(-) diff --git a/inst/WORDLIST b/inst/WORDLIST index 9c5ed25e1..e2862fd54 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -407,7 +407,6 @@ veryHidden vm vml wb -wbColor wbWorkbook wedgeEllipseCallout wedgeRectCallout From e5cc58a5c789f91e75b13afe2aff921fb2763cbb Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 3 Mar 2025 13:32:26 +0100 Subject: [PATCH 08/11] [test] fix tests for `flush = TRUE` --- src/write_file.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/write_file.cpp b/src/write_file.cpp index 2f81d83ce..777982cda 100644 --- a/src/write_file.cpp +++ b/src/write_file.cpp @@ -103,6 +103,21 @@ void xml_sheet_data_slim( } } + + if ( + cc_c_s[i].empty() && + cc_c_t[i].empty() && + (!has_cm || (has_cm && cc_c_cm[i].empty())) && + (!has_ph || (has_ph && cc_c_ph[i].empty())) && + (!has_vm || (has_vm && cc_c_vm[i].empty())) && + cc_v[i].empty() && + cc_f[i].empty() && + cc_f_attr[i].empty() && + cc_is[i].empty() + ) { + continue; + } + // create node file << ""; From ae316b032070334294c5676f95958f301ad395ab Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 3 Mar 2025 13:37:52 +0100 Subject: [PATCH 09/11] [save] clang-format --- src/write_file.cpp | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/src/write_file.cpp b/src/write_file.cpp index 777982cda..af89faf58 100644 --- a/src/write_file.cpp +++ b/src/write_file.cpp @@ -36,10 +36,10 @@ void xml_sheet_data_slim( std::ofstream file(fl); - auto lastrow = 0; // integer value of the last row with column data - auto thisrow = 0; // integer value of the current row with column data - auto row_idx = 0; // the index of the row_attr file. this is != rowid - auto rowid = 0; // integer value of the r field in row_attr + auto lastrow = 0; // integer value of the last row with column data + auto thisrow = 0; // integer value of the current row with column data + auto row_idx = 0; // the index of the row_attr file. this is != rowid + auto rowid = 0; // integer value of the r field in row_attr std::string xml_preserver = " "; @@ -68,11 +68,9 @@ void xml_sheet_data_slim( file << ""; for (auto i = 0; i < cc.nrow(); ++i) { - thisrow = std::stoi(Rcpp::as(cc_row_r[i])); if (lastrow < thisrow) { - // there might be entirely empty rows in between. this is the case for // loadExample. We check the rowid and write the line and skip until we // have every row and only then continue writing the column @@ -87,7 +85,6 @@ void xml_sheet_data_slim( Rcpp::CharacterVector attrnams = row_attr.names(); for (auto j = 0; j < row_attr.ncol(); ++j) { - Rcpp::CharacterVector cv_s = ""; cv_s = Rcpp::as(row_attr[j])[row_idx]; @@ -96,27 +93,26 @@ void xml_sheet_data_slim( file << " " << attrnams[j] << "=\"" << val_strl.c_str() << "\""; } } - file << ">"; // end + file << ">"; // end // read the next row_idx when visiting again ++row_idx; } } - - if ( - cc_c_s[i].empty() && - cc_c_t[i].empty() && - (!has_cm || (has_cm && cc_c_cm[i].empty())) && - (!has_ph || (has_ph && cc_c_ph[i].empty())) && - (!has_vm || (has_vm && cc_c_vm[i].empty())) && - cc_v[i].empty() && - cc_f[i].empty() && - cc_f_attr[i].empty() && - cc_is[i].empty() - ) { - continue; - } + if ( + cc_c_s[i].empty() && + cc_c_t[i].empty() && + (!has_cm || (has_cm && cc_c_cm[i].empty())) && + (!has_ph || (has_ph && cc_c_ph[i].empty())) && + (!has_vm || (has_vm && cc_c_vm[i].empty())) && + cc_v[i].empty() && + cc_f[i].empty() && + cc_f_attr[i].empty() && + cc_is[i].empty() + ) { + continue; + } // create node file << ""; // end + file << ">"; // end bool f_si = false; @@ -195,12 +191,10 @@ void xml_sheet_data_slim( file << ""; } - file << post; file << ""; file.close(); - } // export worksheet without pugixml @@ -231,7 +225,7 @@ void xml_sheet_data(pugi::xml_node &doc, Rcpp::DataFrame &row_attr, Rcpp::DataFr auto lastrow = 0; // integer value of the last row with column data auto thisrow = 0; // integer value of the current row with column data auto row_idx = 0; // the index of the row_attr file. this is != rowid - auto rowid = 0; // integer value of the r field in row_attr + auto rowid = 0; // integer value of the r field in row_attr pugi::xml_node row; From f3dc635323ba82a0e99b0b94c3ef5f92d3a18ac5 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 3 Mar 2025 13:48:34 +0100 Subject: [PATCH 10/11] [misc] update NEWS --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index 189875a6b..71a53fa0b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # openxlsx2 (development version) +## New features + +* A new experimental `flush` argument has been introduced to `wb_save()`, allowing a custom XML streaming function for worksheets to help prevent memory spikes. This feature has only been tested within `openxlsx2` and not extensively with spreadsheet software. Since it bypasses certain failsafe mechanisms, including XML validity checks, it should only be used as a last-resort solution. [1255](https://github.com/JanMarvin/openxlsx2/pull/1255) + ## Fixes * Input validation has been added to `fmt_txt()`, similar to how it has been added to the `create_*()` family a while ago. From 0b8633e7353f57ee2999832039becda63d5d42bf Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 3 Mar 2025 13:55:51 +0100 Subject: [PATCH 11/11] [misc] final cleanup --- src/RcppExports.cpp | 4 ++-- src/write_file.cpp | 8 ++++---- tests/testthat/test-write.R | 16 +++++----------- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index eee07da48..1ff451036 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -867,14 +867,14 @@ BEGIN_RCPP END_RCPP } // write_worksheet -XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment& sheet_data); +XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment sheet_data); RcppExport SEXP _openxlsx2_write_worksheet(SEXP priorSEXP, SEXP postSEXP, SEXP sheet_dataSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::string >::type prior(priorSEXP); Rcpp::traits::input_parameter< std::string >::type post(postSEXP); - Rcpp::traits::input_parameter< Rcpp::Environment& >::type sheet_data(sheet_dataSEXP); + Rcpp::traits::input_parameter< Rcpp::Environment >::type sheet_data(sheet_dataSEXP); rcpp_result_gen = Rcpp::wrap(write_worksheet(prior, post, sheet_data)); return rcpp_result_gen; END_RCPP diff --git a/src/write_file.cpp b/src/write_file.cpp index af89faf58..11c43ae51 100644 --- a/src/write_file.cpp +++ b/src/write_file.cpp @@ -35,11 +35,12 @@ void xml_sheet_data_slim( bool has_vm = cc.containsElementNamed("c_vm"); std::ofstream file(fl); + if (!file.is_open()) Rcpp::stop("could not open output file"); auto lastrow = 0; // integer value of the last row with column data auto thisrow = 0; // integer value of the current row with column data auto row_idx = 0; // the index of the row_attr file. this is != rowid - auto rowid = 0; // integer value of the r field in row_attr + auto rowid = 0; // integer value of the r field in row_attr std::string xml_preserver = " "; @@ -65,7 +66,6 @@ void xml_sheet_data_slim( Rcpp::CharacterVector row_r = row_attr["r"]; - file << ""; for (auto i = 0; i < cc.nrow(); ++i) { thisrow = std::stoi(Rcpp::as(cc_row_r[i])); @@ -225,7 +225,7 @@ void xml_sheet_data(pugi::xml_node &doc, Rcpp::DataFrame &row_attr, Rcpp::DataFr auto lastrow = 0; // integer value of the last row with column data auto thisrow = 0; // integer value of the current row with column data auto row_idx = 0; // the index of the row_attr file. this is != rowid - auto rowid = 0; // integer value of the r field in row_attr + auto rowid = 0; // integer value of the r field in row_attr pugi::xml_node row; @@ -397,7 +397,7 @@ void xml_sheet_data(pugi::xml_node &doc, Rcpp::DataFrame &row_attr, Rcpp::DataFr // create single xml rows of sheet_data. // // [[Rcpp::export]] -XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment &sheet_data) { +XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment sheet_data) { uint32_t pugi_parse_flags = pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_ws_pcdata | pugi::parse_eol; // sheet_data will be in order, just need to check for row_heights diff --git a/tests/testthat/test-write.R b/tests/testthat/test-write.R index 8c94cc34e..aa60db992 100644 --- a/tests/testthat/test-write.R +++ b/tests/testthat/test-write.R @@ -1475,22 +1475,16 @@ test_that("guarding against overwriting shared formula reference works", { test_that("writing without pugixml works", { temp <- temp_xlsx() - expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(write_xlsx(x = mtcars, file = temp, flush = TRUE)) expect_silent(wb <- wb_load(temp)) - temp <- temp_xlsx() - options("openxlsx2.export_with_pugi" = FALSE) - expect_silent(write_xlsx(x = mtcars, file = temp)) - expect_silent(wb <- wb_load(temp)) + wb <- wb_workbook()$add_worksheet()$add_data(x = mtcars) - temp <- temp_xlsx() - options("openxlsx2.export_with_pugi" = TRUE) - expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb_save(wb = wb, file = temp, flush = TRUE)) expect_silent(wb <- wb_load(temp)) + wb <- wb_workbook()$add_worksheet()$add_data(x = mtcars) - temp <- temp_xlsx() - options("openxlsx2.export_with_pugi" = NULL) - expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb$save(file = temp, flush = TRUE)) expect_silent(wb <- wb_load(temp)) })