From ebd152d585a02593ca67f3b8b7d87e3f0fa0c99c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 28 Oct 2025 18:09:00 +0100 Subject: [PATCH 01/27] modular optimization paths - init --- R/data.table.R | 610 +++++++++++++++++++++++++++++-------------------- 1 file changed, 360 insertions(+), 250 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index db74384c4..255c9aeac 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -147,6 +147,350 @@ replace_dot_alias = function(e) { } } +# Transform lapply(.SD, fun) or Map(fun, .SD) into list(fun(col1), fun(col2), ...) +.massageSD = function(jsub, sdvars, SDenv, funi) { + txt = as.list(jsub)[-1L] + if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 + # support Map instead of lapply #5336 + fun = if (jsub %iscall% "Map") txt[[1L]] else txt[[2L]] + if (fun %iscall% "function") { + # Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT + # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means + # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) + # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!) + thisfun = paste0("..FUN", funi) # Fix for #985 + assign(thisfun, eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD + lockBinding(thisfun, SDenv) + txt[[1L]] = as.name(thisfun) + } else { + if (is.character(fun)) fun = as.name(fun) + txt[[1L]] = fun + } + ans = vector("list", length(sdvars)+1L) + ans[[1L]] = as.name("list") + for (ii in seq_along(sdvars)) { + txt[[2L]] = as.name(sdvars[ii]) + ans[[ii+1L]] = as.call(txt) + } + jsub = as.call(ans) # important no names here + jvnames = sdvars # but here instead + list(jsub=jsub, jvnames=jvnames, funi=funi+1L) + # It may seem inefficient to construct a potentially long expression. But, consider calling + # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it + # is called, involving small memory allocations. + # The R level lapply calls as.list which needs a shallow copy. + # lapply also does a setAttib of names (duplicating the same names over and over again + # for each group) which is terrible for our needs. We replace all that with a + # (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol + # lookup), and the eval() inside dogroups hardly has to do anything. All this results in + # overhead minimised. We don't need to worry about the env passed to the eval in a possible + # lapply replacement, or how to pass ... efficiently to it. + # Plus we optimize lapply first, so that mean() can be optimized too as well, next. +} + +# Optimize .SD subsetting patterns like .SD[1], head(.SD), first(.SD) +# return NULL for no optimization possible +.optimize_sd_subset = function(jsub, sdvars, SDenv, envir) { + if (!is.call(jsub) || length(jsub) < 2L || !is.name(jsub[[2L]]) || jsub[[2L]] != ".SD") return(NULL) + + # g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612 + subopt = length(jsub) == 3L && + (jsub %iscall% "[" || + (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), SDenv$.SDall, envir))) && + (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") + headopt = jsub %iscall% c("head", "tail") + firstopt = jsub %iscall% c("first", "last") # fix for #2030 + if (subopt || headopt || firstopt) { + if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 + # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. + jsub_new = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub }))) + return(list(jsub=jsub_new, jvnames=sdvars)) + } + + NULL +} + +# Optimize c(...) expressions +.optimize_c_expr = function(jsub, jvnames, sdvars, SDenv, funi, envir) { + if (!jsub %iscall% "c" || length(jsub) <= 1L) { + return(list(jsub=jsub, jvnames=jvnames, funi=funi, optimized=FALSE)) + } + # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. + # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains + # 1) lapply(.SD, ...), 2) simply .SD or .SD[..], 3) .N, 4) list(...) and 5) functions that normally return a single value* + # On 5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always* + # return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT. + # One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output + # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations. + # For now, we optimise all functions mentioned in 'optfuns' below. + optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") + is_valid = TRUE + any_SD = FALSE + jsubl = as.list.default(jsub) + oldjvnames = jvnames + jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. + # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!! + for (i_ in 2L:length(jsubl)) { + this = jsub[[i_]] + # Case 1: Plain name (.SD or .N) + if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names + if (this == ".SD") { # optimise '.SD' alone + any_SD = TRUE + jsubl[[i_]] = lapply(sdvars, as.name) + jvnames = c(jvnames, sdvars) + } else if (this == ".N") { + # don't optimise .I in c(.SD, .I), it's length can be > 1 + # only c(.SD, list(.I)) should be optimised!! .N is always length 1. + jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) + } else { + # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) + is_valid = FALSE + break + } + } + # Case 2: Call expression + else if (is.call(this)) { + # Case 2a: lapply(.SD, ...) or Map(fun, .SD) + is_lapply = this[[1L]] == "lapply" && length(this) >= 2L && this[[2L]] == ".SD" + is_map = this[[1L]] == "Map" && length(this) >= 3L && this[[3L]] == ".SD" + if ((is_lapply || is_map) && length(sdvars)) { + any_SD = TRUE + massage_result = .massageSD(this, sdvars, SDenv, funi) + funi = massage_result$funi + jsubl[[i_]] = as.list(massage_result$jsub[-1L]) # just keep the '.' from list(.) + jn__ = massage_result$jvnames + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named arguments of c() to column names of .SD + # e.g. c(mean=lapply(.SD, mean)) or c(mean=Map(mean, .SD)) + jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1)) + } + jvnames = c(jvnames, jn__) + } + # Case 2b: list(...) + else if (this[[1L]] == "list") { + # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen + if (length(this) > 1L) { + jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* + njl__ = names(jl__) %||% rep("", length(jl__)) + njl__nonblank = nzchar(names(jl__)) + if (length(jl__) > 1L) { + jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) + } else { + jn__ = names(jsubl)[i_] + } + jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") + } else { + jn__ = names(jl__) %||% rep("", length(jl__)) + } + idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) + if (any(idx)) + jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && + jvnames = c(jvnames, jn__) + jsubl[[i_]] = jl__ + } + } + # Case 2c: Single-value functions like mean, sum, etc. + else if (this %iscall% optfuns && length(this)>1L) { + jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) + } + # Case 2d: .SD[1] or similar subsetting + else if (length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && + this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N")) { + # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. + any_SD = TRUE + jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) + jvnames = c(jvnames, sdvars) + } + # Case 2e: Complex .SD usage - can't optimize + else if (any(all.vars(this) == ".SD")) { + # TODO, TO DO: revisit complex cases (as illustrated below) + # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] + # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. + is_valid = FALSE + break + } + # Case 2f: Other cases - skip optimization + else { + # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any? + is_valid = FALSE + break + } + } + # Case 3: Other types - can't optimize + else { + is_valid = FALSE + break + } + } + + # Return result + if (!is_valid || !any_SD) { + # Can't optimize - return original + return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) + } else { + # Optimization successful + setattr(jsubl, 'names', NULL) + jsub_new = as.call(unlist(jsubl, use.names=FALSE)) + jsub_new[[1L]] = quote(list) + return(list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE)) + } +} + +# Optimize lapply(.SD, ...) expressions +# This function transforms lapply(.SD, fun) into list(fun(col1), fun(col2), ...) +# Returns: list(jsub=call/name, jvnames=character) +.optimize_lapply = function(jsub, jvnames, sdvars, SDenv, verbose, envir) { + oldjsub = jsub + funi = 1L # Fix for #985 + + # Try different optimization patterns in order + + # Pattern 1: Plain .SD -> list(col1, col2, ...) + if (is.name(jsub) && jsub == ".SD") { + jsub = as.call(c(quote(list), lapply(sdvars, as.name))) + jvnames = sdvars + } + # Pattern 2: .SD subsetting like .SD[1], head(.SD), first(.SD) + else if (!is.null(result <- .optimize_sd_subset(jsub, sdvars, SDenv, envir))) { + jsub = result$jsub + jvnames = result$jvnames + } + # Pattern 3a: lapply(.SD, fun) + else if (is.call(jsub) && jsub %iscall% "lapply" && length(jsub) >= 2L && jsub[[2L]] == ".SD" && length(sdvars)) { + massage_result = .massageSD(jsub, sdvars, SDenv, funi) + jsub = massage_result$jsub + jvnames = massage_result$jvnames + funi = massage_result$funi + } + # Pattern 3b: Map(fun, .SD) + else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars)) { + massage_result = .massageSD(jsub, sdvars, SDenv, funi) + jsub = massage_result$jsub + jvnames = massage_result$jvnames + funi = massage_result$funi + } + # Pattern 4: c(...) with .SD components + else if (is.call(jsub)) { + c_result = .optimize_c_expr(jsub, jvnames, sdvars, SDenv, funi, envir) + if (c_result$optimized) { + jsub = c_result$jsub + jvnames = c_result$jvnames + funi = c_result$funi + } + } + + # Verbose output + if (verbose) { + if (!identical(oldjsub, jsub)) + catf("lapply optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub,width.cutoff=200L, nlines=1L)) + else + catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L)) + } + + list(jsub=jsub, jvnames=jvnames) +} + +# Optimize expressions using GForce (C-level optimizations) +# This function replaces functions like mean() with gmean() for fast C implementations +.optimize_gforce = function(jsub, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) { + GForce = FALSE + + # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with + # nomatch=NULL even now.. but not switching it on yet, will deal it separately. + if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) { + if (!length(ansvars) && !use.I) { + GForce = FALSE + if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { + GForce = TRUE + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n",deparse(jsub, width.cutoff=200L, nlines=1L)) + } + } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N + GForce = FALSE + } else { + # Apply GForce + if (jsub %iscall% "list") { + GForce = TRUE + for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { + if (!.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break} + } + } else + GForce = .gforce_ok(jsub, SDenv$.SDall, envir) + if (GForce) { + if (jsub %iscall% "list") + for (ii in seq_along(jsub)[-1L]) { + if (is.N(jsub[[ii]])) next; # For #334 + jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x, envir) + } + else { + # adding argument to ghead/gtail if none is supplied to g-optimized head/tail + if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L + jsub = .gforce_jsub(jsub, names_x, envir) + } + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + } else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n"); + } + } + + list(GForce=GForce, jsub=jsub) +} + +# Old mean() optimization fallback when GForce is not used +.optimize_mean = function(jsub, SDenv, verbose, GForce) { + if (!GForce && !is.name(jsub)) { + # Still do the old speedup for mean, for now + nomeanopt=FALSE # to be set by .optmean() using <<- inside it + oldjsub = jsub + if (jsub %iscall% "list") { + # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() + # jsub[[1]]=="list" so the first item of todo will always be FALSE + todo = sapply(jsub, `%iscall%`, 'mean') + if (any(todo)) { + w = which(todo) + jsub[w] = lapply(jsub[w], .optmean) + } + } else if (jsub %iscall% "mean") { + jsub = .optmean(jsub) + } + if (nomeanopt) { + warningf("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.", immediate.=TRUE) + } + if (verbose) { + if (!identical(oldjsub, jsub)) + catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) + else + catf("Old mean optimization is on, left j unchanged.\n") + } + assign("Cfastmean", Cfastmean, SDenv) + # Old comments still here for now ... + # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow. + # Maybe change to : + # assign("mean", fastmean, SDenv) # neater than the hard work above, but slower + # when fastmean can do trim. + } + + jsub +} + +# attempts to optimize j expressions using lapply, GForce, and mean optimizations +.attempt_optimize = function(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) { + # Step 1: Apply lapply(.SD) optimization + lapply_result = .optimize_lapply(jsub, jvnames, sdvars, SDenv, verbose, envir) + jsub = lapply_result$jsub + jvnames = lapply_result$jvnames + + # Step 2: Apply GForce optimization + gforce_result = .optimize_gforce(jsub, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) + GForce = gforce_result$GForce + jsub = gforce_result$jsub + + # Step 3: Apply old mean optimization (fallback when GForce is not used) + jsub = .optimize_mean(jsub, SDenv, verbose, GForce) + + list(GForce=GForce, jsub=jsub, jvnames=jvnames) +} + "[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0.0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive())) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could @@ -1612,252 +1956,18 @@ replace_dot_alias = function(e) { SDenv$.NGRP = length(f__) lockBinding(".NGRP", SDenv) - GForce = FALSE - if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { # Ability to turn off if problems or to benchmark the benefit - # Optimization to reduce overhead of calling lapply over and over for each group - oldjsub = jsub - funi = 1L # Fix for #985 - # converted the lapply(.SD, ...) to a function and used below, easier to implement FR #2722 then. - .massageSD = function(jsub) { - txt = as.list(jsub)[-1L] - if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 - fun = txt[[2L]] - if (fun %iscall% "function") { - # Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT - # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means - # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) - # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!) - thisfun = paste0("..FUN", funi) # Fix for #985 - assign(thisfun,eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD - lockBinding(thisfun,SDenv) - txt[[1L]] = as.name(thisfun) - } else { - if (is.character(fun)) fun = as.name(fun) - txt[[1L]] = fun - } - ans = vector("list", length(sdvars)+1L) - ans[[1L]] = as.name("list") - for (ii in seq_along(sdvars)) { - txt[[2L]] = as.name(sdvars[ii]) - ans[[ii+1L]] = as.call(txt) - } - jsub = as.call(ans) # important no names here - jvnames = sdvars # but here instead - list(jsub, jvnames) - # It may seem inefficient to construct a potentially long expression. But, consider calling - # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it - # is called, involving small memory allocations. - # The R level lapply calls as.list which needs a shallow copy. - # lapply also does a setAttib of names (duplicating the same names over and over again - # for each group) which is terrible for our needs. We replace all that with a - # (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol - # lookup), and the eval() inside dogroups hardly has to do anything. All this results in - # overhead minimised. We don't need to worry about the env passed to the eval in a possible - # lapply replacement, or how to pass ... efficiently to it. - # Plus we optimize lapply first, so that mean() can be optimized too as well, next. - } - if (is.name(jsub)) { - if (jsub == ".SD") { - jsub = as.call(c(quote(list), lapply(sdvars, as.name))) - jvnames = sdvars - } - } else if (is.name(jsub[[1L]])) { # Else expect problems with - # g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612 - subopt = length(jsub) == 3L && - (jsub %iscall% "[" || - (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && - (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") - headopt = jsub %iscall% c("head", "tail") - firstopt = jsub %iscall% c("first", "last") # fix for #2030 - if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") && - (subopt || headopt || firstopt)) { - if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 - # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. - jsub = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub }))) - jvnames = sdvars - } else if (jsub %iscall% "lapply" && jsub[[2L]]==".SD" && length(xcols)) { - deparse_ans = .massageSD(jsub) - jsub = deparse_ans[[1L]] - jvnames = deparse_ans[[2L]] - } else if (jsub %iscall% "c" && length(jsub) > 1L) { - # TODO, TO DO: raise the checks for 'jvnames' earlier (where jvnames is set by checking 'jsub') and set 'jvnames' already. - # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. - # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains - # 1) lapply(.SD, ...), 2) simply .SD or .SD[..], 3) .N, 4) list(...) and 5) functions that normally return a single value* - # On 5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always* - # return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT. - # One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output - # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations. - # For now, we optimise all functions mentioned in 'optfuns' below. - optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") - is_valid = TRUE - any_SD = FALSE - jsubl = as.list.default(jsub) - oldjvnames = jvnames - jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. - # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!! - for (i_ in 2L:length(jsubl)) { - this = jsub[[i_]] - if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names - if (this == ".SD") { # optimise '.SD' alone - any_SD = TRUE - jsubl[[i_]] = lapply(sdvars, as.name) - jvnames = c(jvnames, sdvars) - } else if (this == ".N") { - # don't optimise .I in c(.SD, .I), it's length can be > 1 - # only c(.SD, list(.I)) should be optimised!! .N is always length 1. - jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) - } else { - # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) - is_valid=FALSE - break - } - } else if (is.call(this)) { - if (this[[1L]] == "lapply" && this[[2L]] == ".SD" && length(xcols)) { - any_SD = TRUE - deparse_ans = .massageSD(this) - funi = funi + 1L # Fix for #985 - jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.) - jn__ = deparse_ans[[2L]] - if (isTRUE(nzchar(names(jsubl)[i_]))) { - # Fix for #2311, prepend named arguments of c() to column names of .SD - # e.g. c(mean=lapply(.SD, mean)) - jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1)) - } - jvnames = c(jvnames, jn__) - } else if (this[[1L]] == "list") { - # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen - if (length(this) > 1L) { - jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) - if (isTRUE(nzchar(names(jsubl)[i_]))) { - # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* - njl__ = names(jl__) %||% rep("", length(jl__)) - njl__nonblank = nzchar(names(jl__)) - if (length(jl__) > 1L) { - jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) - } else { - jn__ = names(jsubl)[i_] - } - jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") - } else { - jn__ = names(jl__) %||% rep("", length(jl__)) - } - idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) - if (any(idx)) - jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && - jvnames = c(jvnames, jn__) - jsubl[[i_]] = jl__ - } - } else if (this %iscall% optfuns && length(this)>1L) { - jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) - } else if ( length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && - this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N") ) { - # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. - any_SD = TRUE - jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) - jvnames = c(jvnames, sdvars) - } else if (any(all.vars(this) == ".SD")) { - # TODO, TO DO: revisit complex cases (as illustrated below) - # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] - # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. - is_valid=FALSE - break - } else { # just to be sure that any other case (I've overlooked) runs smoothly, without optimisation - # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any? - is_valid=FALSE - break - } - } else { - is_valid = FALSE - break - } - } - if (!is_valid || !any_SD) { # restore if c(...) doesn't contain lapply(.SD, ..) or if it's just invalid - jvnames = oldjvnames # reset jvnames - jsub = oldjsub # reset jsub - jsubl = as.list.default(jsubl) # reset jsubl - } else { - setattr(jsubl, 'names', NULL) - jsub = as.call(unlist(jsubl, use.names=FALSE)) - jsub[[1L]] = quote(list) - } - } - } + # Call extracted GForce optimization function + if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { + gforce_result = .attempt_optimize(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, parent.frame()) + GForce = gforce_result$GForce + jsub = gforce_result$jsub + jvnames = gforce_result$jvnames + } else { + GForce = FALSE if (verbose) { - if (!identical(oldjsub, jsub)) - catf("lapply optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub,width.cutoff=200L, nlines=1L)) - else - catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L)) - } - # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with - # nomatch=NULL even now.. but not switching it on yet, will deal it separately. - if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) { - if (!length(ansvars) && !use.I) { - GForce = FALSE - if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { - GForce = TRUE - if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n",deparse(jsub, width.cutoff=200L, nlines=1L)) - } - } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N - GForce = FALSE - } else { - # Apply GForce - if (jsub %iscall% "list") { - GForce = TRUE - for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { - if (!.gforce_ok(jsub[[ii]], SDenv$.SDall)) {GForce = FALSE; break} - } - } else - GForce = .gforce_ok(jsub, SDenv$.SDall) - if (GForce) { - if (jsub %iscall% "list") - for (ii in seq_along(jsub)[-1L]) { - if (is.N(jsub[[ii]])) next; # For #334 - jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x) - } - else { - # adding argument to ghead/gtail if none is supplied to g-optimized head/tail - if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L - jsub = .gforce_jsub(jsub, names_x) - } - if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) - } else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n"); - } - } - if (!GForce && !is.name(jsub)) { - # Still do the old speedup for mean, for now - nomeanopt=FALSE # to be set by .optmean() using <<- inside it - oldjsub = jsub - if (jsub %iscall% "list") { - # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() - # jsub[[1]]=="list" so the first item of todo will always be FALSE - todo = sapply(jsub, `%iscall%`, 'mean') - if (any(todo)) { - w = which(todo) - jsub[w] = lapply(jsub[w], .optmean) - } - } else if (jsub %iscall% "mean") { - jsub = .optmean(jsub) - } - if (nomeanopt) { - warningf("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.", immediate.=TRUE) - } - if (verbose) { - if (!identical(oldjsub, jsub)) - catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) - else - catf("Old mean optimization is on, left j unchanged.\n") - } - assign("Cfastmean", Cfastmean, SDenv) - # Old comments still here for now ... - # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow. - # Maybe change to : - # assign("mean", fastmean, SDenv) # neater than the hard work above, but slower - # when fastmean can do trim. + if (getOption("datatable.optimize")<1L) catf("All optimizations are turned off\n") + else catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } - } else if (verbose) { - if (getOption("datatable.optimize")<1L) catf("All optimizations are turned off\n") - else catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } if (byjoin) { groups = i @@ -3181,11 +3291,11 @@ is_constantish = function(q, check_singleton=FALSE) { length(q) == 3L && is_constantish(q[[3L]], check_singleton = TRUE) } -`.g[_ok` = function(q, x) { +`.g[_ok` = function(q, x, envir=parent.frame(3L)) { length(q) == 3L && is_constantish(q[[3L]], check_singleton = TRUE) && (q[[1L]] != "[[" || eval(call('is.atomic', q[[2L]]), envir=x)) && - !(as.character(q[[3L]]) %chin% names(x)) && is.numeric(q3 <- eval(q[[3L]], parent.frame(3L))) && length(q3)==1L && q3>0L + !(as.character(q[[3L]]) %chin% names(x)) && is.numeric(q3 <- eval(q[[3L]], envir)) && length(q3)==1L && q3>0L } .gweighted.mean_ok = function(q, x) { #3977 q = match.call(gweighted.mean, q) @@ -3209,7 +3319,7 @@ is_constantish = function(q, check_singleton=FALSE) { # is robust to unnamed expr. Note that NA names are not possible here. .arg_is_narm = function(expr, which=3L) !is.null(nm <- names(expr)[which]) && startsWith(nm, "na") -.gforce_ok = function(q, x) { +.gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 q1 = .get_gcall(q) if (is.null(q1)) return(FALSE) @@ -3219,19 +3329,19 @@ is_constantish = function(q, check_singleton=FALSE) { "shift" = .gshift_ok(q), "weighted.mean" = .gweighted.mean_ok(q, x), "tail" = , "head" = .ghead_ok(q), - "[[" = , "[" = `.g[_ok`(q, x), + "[[" = , "[" = `.g[_ok`(q, x, envir), FALSE ) } -.gforce_jsub = function(q, names_x) { +.gforce_jsub = function(q, names_x, envir=parent.frame(2L)) { call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. q[[1L]] = as.name(paste0("g", call_name)) # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok # do not evaluate vars present as columns in x if (length(q) >= 3L) { for (i in 3:length(q)) { - if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], parent.frame(2L)) # tests 1187.2 & 1187.4 + if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], envir) # tests 1187.2 & 1187.4 } } q From 71b21ab4538e5a93570239917ba2a4bfa5ed3119 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 29 Oct 2025 09:17:11 +0100 Subject: [PATCH 02/27] make linter happy --- R/data.table.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 255c9aeac..aa184f5c4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -210,7 +210,7 @@ replace_dot_alias = function(e) { NULL } -# Optimize c(...) expressions +# Optimize c(...) expressions .optimize_c_expr = function(jsub, jvnames, sdvars, SDenv, funi, envir) { if (!jsub %iscall% "c" || length(jsub) <= 1L) { return(list(jsub=jsub, jvnames=jvnames, funi=funi, optimized=FALSE)) @@ -328,13 +328,13 @@ replace_dot_alias = function(e) { # Return result if (!is_valid || !any_SD) { # Can't optimize - return original - return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) + list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE) } else { # Optimization successful setattr(jsubl, 'names', NULL) jsub_new = as.call(unlist(jsubl, use.names=FALSE)) jsub_new[[1L]] = quote(list) - return(list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE)) + list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE) } } From 8a9e72734e63c3b8b0efde67755a9b94368476be Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Thu, 30 Oct 2025 16:02:25 +0100 Subject: [PATCH 03/27] move tests --- R/test.data.table.R | 32 +- inst/tests/benchmark.Rraw | 25 +- inst/tests/tests.Rraw | 896 ++++++-------------------------------- man/test.Rd | 3 +- 4 files changed, 167 insertions(+), 789 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 6e264c871..bfc61412a 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -361,7 +361,37 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,levels=NULL) { + # if levels is provided, test across multiple optimization levels + if (!is.null(levels)) { + cl = match.call() + cl$levels = NULL # Remove levels from the recursive call + + vector_params = c("error", "warning", "message", "output", "notOutput", "ignore.warning") + # Check if y was explicitly provided (not just the default) + y_provided = !missing(y) + compare = !y_provided && length(levels)>1L && !any(vapply_1b(vector_params, function(p) length(get(p, envir=environment())) > 0L)) + + for (i in seq_along(levels)) { + cl$num = num + (i - 1L) * 1e-6 + opt_level = list(datatable.optimize = levels[i]) + cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level + for (p in vector_params) { + val = get(p, envir=environment()) + if (length(val) > 0L) { + cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # ccycle through values if fewer than levels + } else if (p %in% names(cl)) { + cl[[p]] = NULL + } + } + + if (compare && i == 1L) cl$y = eval(cl$x, parent.frame()) + eval(cl, parent.frame()) + } + return(invisible()) + } + + # print(match.call()) if (!is.null(env)) { old = Sys.getenv(names(env), names=TRUE, unset=NA) to_unset = !lengths(env) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 62075dcf8..52a91c723 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -190,24 +190,13 @@ DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) test(301.1, nrow(DT[,sum(B),by=C])==100010) # Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. -local({ - old = options(datatable.optimize=0L); on.exit(options(old)) - set.seed(1) - DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") - test(637.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) - test(637.2, key(DT[J(43L), a:=99L]), NULL) - setkey(DT, a) - test(637.3, key(DT[, a:=99L, by=a]), NULL) -}) -local({ - options(datatable.optimize=2L); on.exit(options(old)) - set.seed(1) - DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") - test(638.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) - test(638.2, key(DT[J(43L), a:=99L]), NULL) - setkey(DT,a) - test(638.3, key(DT[, a:=99L, by=a]), NULL) -}) +set.seed(1) +DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") +opt = c(0L,2L) +test(637.1, levels=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) +test(637.2, levels=opt, key(copy(DT)[J(43L), a:=99L]), NULL) +setkey(DT, a) +test(637.3, levels=opt, key(copy(DT)[, a:=99L, by=a]), NULL) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7be4af235..5540b212d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1851,53 +1851,7 @@ x = sample(LETTERS,1000,replace=TRUE) test(610.3, chorder(x), base::order(x, method="radix")) test(610.4, unique(x[chgroup(x)]), unique(x)) -# := by group -options(datatable.optimize=0L) -DT = data.table(a=1:3,b=(1:9)/10) -test(611.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) -setkey(DT,a) -test(611.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) -# Combining := by group with i -test(611.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(611.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) -options(datatable.optimize=2L) -DT = data.table(a=1:3,b=(1:9)/10) -test(612.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) -setkey(DT,a) -test(612.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) -# Combining := by group with i -test(612.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(612.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) -# Assign to subset ok (NA initialized in the other items) ok : -test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) -test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) - -# Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 -ans = copy(DT)[,r:=NA_real_] -options(datatable.optimize=0L) -test(618.1, copy(DT)[a>3,r:=sum(b)], ans) -test(618.2, copy(DT)[J(-1),r:=sum(b)], ans) -test(618.3, copy(DT)[NA,r:=sum(b)], ans) -test(618.4, copy(DT)[0,r:=sum(b)], ans) -test(618.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) -options(datatable.optimize=2L) -test(619.1, copy(DT)[a>3,r:=sum(b)], ans) -test(619.2, copy(DT)[J(-1),r:=sum(b)], ans) -test(619.3, copy(DT)[NA,r:=sum(b)], ans) -test(619.4, copy(DT)[0,r:=sum(b)], ans) -test(619.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) - -DT = data.table(x=letters, key="x") -test(621, copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained -test(622, copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") - -set.seed(2) -DT = data.table(a=rnorm(5)*10, b=1:5) -options(datatable.optimize=0L) -test(623.1, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) -options(datatable.optimize=2L) -test(623.2, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +# tests 611-623 moved to optimize.Rraw # Tests on POSIXct attributes @@ -2018,20 +1972,10 @@ setnames(ans2,"x","V1") setnames(ans2,"y","V2") test(654, ans1, ans2) -options(datatable.optimize = 0L) -test(656.1, DT[ , mean(x), by=grp1, verbose=TRUE], output='(GForce FALSE)') -test(656.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="(GForce FALSE)") -test(656.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="(GForce FALSE)") -options(datatable.optimize = 1L) -test(657.1, DT[ , mean(x), by=grp1, verbose=TRUE], output='(GForce FALSE)') -test(657.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="(GForce FALSE)") -test(657.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="(GForce FALSE)") -options(datatable.optimize = 2L) -test(658.1, DT[ , mean(x), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") -test(658.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") -test(658.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") -tt = capture.output(DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE]) -test(659, !length(grep("Wrote less rows", tt))) # first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows. +# test 656-658 moved to optimize.Rraw +# test is not testing what it should since #2671 +# tt = capture.output(DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE]) +# test(659, !length(grep("Wrote less rows", tt))) # first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows. # Test .N for logical i subset DT = data.table(a=1:10, b=rnorm(10)) @@ -2059,33 +2003,7 @@ test(667, DT[a<3,sum(b),by=paste("a")], error='Otherwise, by=eval(paste("a")) sh test(668, DT[a<3,sum(b),by=eval(paste("a"))], DT[a<3,sum(b),by=a]) test(669, DT[a<3,sum(b),by=c(2)], error="must evaluate to 'character'") -# Test := keyby does setkey, #2065 -options(datatable.optimize=0L) -DT = data.table(x=1:2, y=1:6) -ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670.1, DT[,z:=sum(y),keyby=x], ans) -DT = data.table(x=1:2, y=1:6) -test(670.2, DT[,z:=sum(y),keyby="x"], ans) -DT = data.table(x=1:2, y=1:6) -test(670.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), - warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -DT = data.table(x=1:2, y=1:6) -test(670.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -DT = data.table(x=1:2, y=1:6) -test(670.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") -options(datatable.optimize=2L) -DT = data.table(x=1:2, y=1:6) -ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(671.1, DT[,z:=sum(y),keyby=x], ans) -DT = data.table(x=1:2, y=1:6) -test(671.2, DT[,z:=sum(y),keyby="x"], ans) -DT = data.table(x=1:2, y=1:6) -test(671.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), - warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -DT = data.table(x=1:2, y=1:6) -test(671.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -DT = data.table(x=1:2, y=1:6) -test(671.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +# tests 670+671 moved to optimize.Rraw # Test new .() DT = data.table(x=1:2, y=1:6, key="x") @@ -2270,23 +2188,7 @@ test(749, DT[,c("c","d","e"):=list(.N,sum(b),a*10L),by=a], data.table(a=rep(6:8, test(750.1, copy(DT)[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) test(750.2, copy(DT)[a<8,let(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) -# varname holding colnames, by group, linked from #2120. -options(datatable.optimize=0L) -DT = data.table(a=rep(1:3,1:3),b=1:6) -colname = "newcol" -test(751.1, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) -options(datatable.optimize=2L) -DT = data.table(a=rep(1:3,1:3),b=1:6) -colname = "newcol" -test(751.2, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) - -# Add tests for nested := in j by group, #1987 -options(datatable.optimize=0L) -DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752.1, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) -options(datatable.optimize=2L) -DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752.2, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +# tests 751, 752 moved to optimize.Rraw # Test duplicate() of recycled plonking RHS, #2298 DT = data.table(a=letters[3:1],x=1:3) @@ -2590,18 +2492,8 @@ test(864.3, rbindlist(list(data.table(logical(0),logical(0)), DT<-data.table(baz message="Column 1 [[]'baz'[]] of item 2 is missing in item 1.*Use fill=TRUE.*or use.names=FALSE.*v1.12.2") # Steve's find that setnames failed for numeric 'old' when pointing to duplicated names -DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) -options(datatable.optimize = 0L) -test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") -options(datatable.optimize = 1L) -test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") -options(datatable.optimize = 2L) -test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], - output="GForce optimized.*gsum[(]v[)], gsum[(]w[)]") # v1.9.7 treats wrapped {} better, so this is now optimized -options(datatable.optimize = Inf) -test(866, names(ans1), c("a","b","name1","name2")) -test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a","b","name1","name2")) # list names extracted here -test(868, ans1, ans2) +# tests 865-868 moved to optimize.Rraw + # and related to setnames, too DT = data.table(a=1:3,b=1:6,key="a") test(869, DT[J(2,42,84),print(.SD),by=.EACHI], output=" b\n.*1.*2\n2:.*5.*Empty data.table [(]0 rows and 3 cols[)]: a,V2,V3") # .* for when verbose mode @@ -3998,10 +3890,7 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) -options(datatable.optimize=0L) -test(1133.7, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) -options(datatable.optimize=2L) -test(1133.75, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +test(1133.7, levels=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -4104,10 +3993,8 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -options(datatable.optimize=0L) -test(1143.4, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.5, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -options(datatable.optimize=2L) +test(1143.4, levels=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5, levels=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) @@ -4382,40 +4269,7 @@ test(1181, forderv(INT(1,3,5000000,NA)), INT(4,1,2,3)) test(1182, forderv(INT(1,-1,5000000,NA)), INT(4,2,1,3)) test(1183, forderv(INT(-3,-7,1,-6000000,NA,3,5000000,NA,8)), INT(5,8,4,2,1,3,6,9,7)) -# tests of gsum and gmean with NA -DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) -set(DT,c(3L,8L),"y",NA) -set(DT,c(5L,9L),"v",NA) -set(DT,10:12,"y",NA) -set(DT,10:12,"v",NA) -options(datatable.optimize=1) # turn off GForce -test(1184.1, DT[, sum(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1184.2, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1185.1, DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], - data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) -options(datatable.optimize=0) # turn off fastmean optimization to get the answer to match to -test(1185.2, ans <- DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output="All optimizations.*off") -options(datatable.optimize=1) # turn on old fastmean optimization only -test(1185.3, DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], ans, output="Old mean.*changed j") -options(datatable.optimize=Inf) # turn on GForce -test(1185.4, DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], ans, output="GForce optimized j to") -test(1186, DT[, sum(v), by=x, verbose=TRUE], output="GForce optimized j to") -test(1187.1, DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], - data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) -MyVar = TRUE -test(1187.2, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output="GForce optimized j to", - DT[, list(sum(y,na.rm=TRUE), mean(y,na.rm=TRUE)), by=x]) -test(1187.3, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output="GForce optimized j to", - DT[, mean(y,na.rm=TRUE), by=x]) -MyVar = FALSE -test(1187.4, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output="GForce optimized j to", - DT[, list(sum(y,na.rm=FALSE), mean(y,na.rm=FALSE)), by=x]) -test(1187.5, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output="GForce optimized j to", - DT[, mean(y,na.rm=FALSE), by=x]) -# GForce should not turn on when the .ok function isn't triggered -test(1187.6, DT[, mean(y, trim=.2), by=x, verbose=TRUE], - data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), - output='j unchanged', warning="'trim' is not yet optimized") +# tests 1184-1187 moved to optimize.Rraw # test from Zach Mayer a <- c("\"\"\"\") \" \" \" \" \"", "\"\"\"\") \" \" \" \" \"", "\"\"\"\") \" \" \" \" ,\"") @@ -4610,18 +4464,17 @@ test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") -options(datatable.optimize=1) # turn off GForce, to test dogroups -test(1230, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +# turn off GForce, to test dogroups +test(1230, levels=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") setkey(DT,NULL) -test(1231, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") -test(1232, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") -test(1233, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") -test(1234, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized +test(1231, levels=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1232, levels=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") +test(1233, levels=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") +test(1234, levels=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized setkey(DT,a) -test(1235, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -test(1236, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") -test(1237, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -options(datatable.optimize=Inf) +test(1235, levels=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1236, levels=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") +test(1237, levels=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") # check that key is not preserved when length of fastorder is > 0 DT <- data.table(x=1:5, y=6:10, key="x") @@ -4887,44 +4740,40 @@ set.seed(45L) dt = data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d=sample(5,10,TRUE)) dt2 = data.table(x=c(1,1,1,2,2,2), y=1:6) -options(datatable.optimize=0L) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.02, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.03, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.04, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.05, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.01, levels=0L, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.02, levels=0L, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.03, levels=0L, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.04, levels=0L, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.05, levels=0L, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # newly added tests for #861 -- optimise, but no GForce -test(1268.06, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.06, levels=0L, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # don't optimise .I in c(...) -test(1268.07, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") - -options(datatable.optimize=1L) -test(1268.08, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") -test(1268.09, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.10, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') -test(1268.11, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.12, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.13, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.14, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") - -options(datatable.optimize=Inf) -test(1268.15, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, +test(1268.07, levels=0L, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") + +test(1268.08, levels=1L, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") +test(1268.09, levels=1L, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.10, levels=1L, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') +test(1268.11, levels=1L, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.12, levels=1L, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.13, levels=1L, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.14, levels=1L, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") + +test(1268.15, levels=Inf, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'") -test(1268.16, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, +test(1268.16, levels=Inf, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'") -test(1268.17, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, +test(1268.17, levels=Inf, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))") -test(1268.18, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, +test(1268.18, levels=Inf, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.19, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, +test(1268.19, levels=Inf, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.20, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, +test(1268.20, levels=Inf, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'") -test(1268.21, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, +test(1268.21, levels=Inf, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'") - -test(1268.22, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.22, levels=Inf, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") @@ -5267,21 +5116,7 @@ set(DT,1L,"b",3L) test(1302, 0L[1L], 3L-3L) test(1303, 0L, 3L-3L) -# FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. -set.seed(2L) -dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) -options(datatable.optimize = 1L) -ans1 <- dt[, list(.N, sum(y)), by=x] -options(datatable.optimize = 2L) -ans2 <- dt[, list(.N, sum(y)), by=x] -test(1304.1, ans1, ans2) - -dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") -options(datatable.optimize = 1L) -ans1 <- dt[, list(.N, sum(y)), by=x] -options(datatable.optimize = 2L) -ans2 <- dt[, list(.N, sum(y)), by=x] -test(1304.2, ans1, ans2) +# test 1304 moved to optimize.Rraw # FR #338 DT <- data.table(x=1:5, y=6:10) @@ -5330,59 +5165,7 @@ DT = data.table(a=1:3,b=6:1) test(1312, DT[,setkey(.SD),by=a], error="Setting a physical key on .SD is reserved for possible future use") # was warning "Already keyed by this key but had invalid row order" due to the key not being cleared after the previous group. A solution could have been to put back the original key on populating .SD for each group. But instead we reserve it for future use and push the user towards doing it a different more efficient way (see Arun's speedups in the datatable-help thread). -# gmin and gmax extensive testing (because there are tricky cases) -DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) -# make sure GForce is running -options(datatable.optimize=3L) - -# for integers -test(1313.01, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.02, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.03, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) -DT[x==6, y := INT(NA)] -test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) -test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) - -# for numeric -DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) -test(1313.09, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.10, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.11, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) -DT[x==6, y := NA_real_] -test(1313.13, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.14, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) -test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) - -# for date (attribute check.. especially after issues/689 !!!) -DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) -test(1313.17, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.18, DT[, list(y=max(y)), by=x], DT[c(5,10)]) -DT[c(1,6), y := NA] -test(1313.19, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.20, DT[, list(y=max(y)), by=x], DT[c(1,6)]) -test(1313.21, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) -test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) - -# for character -set.seed(1L) -DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) -DT[x==7, y := c("","b","c")] -test(1313.23, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.24, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -DT[x==6, y := NA_character_] -test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) -test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) +# test 1313 moved to optimze.Rraw # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = @@ -6495,10 +6278,7 @@ test(1437.17, DT[!a %chin% c("A", "B") & x == 2], DT[c(4, 5, 6)]) test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized subsetting") test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting") ## optimize option level 3 is required to get optimized subsetting -options(datatable.optimize = 2L) -test(1437.21, DT[x == 2, verbose = TRUE], output = "^ x y") -options(datatable.optimize = Inf) -test(1437.22, DT[x == 2, verbose = TRUE], output = "Optimized subsetting") +test(1437.21, levels=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) ## NaN on right hand side is treated correctly. NA on right hand side is not reaching .prepareFastSubset, so not tested here DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN)) test(1437.23, DT[y == NaN], DT[0]) @@ -6583,35 +6363,22 @@ if (.Machine$sizeof.pointer>4) { # temporarily disabled for 32bit, #2767 for(t in seq_len(nrow(all))){ ## test the query with missing j thisQuery <- all$query[t] - options("datatable.optimize" = 3L) - ansOpt <- DT[eval(parse(text = thisQuery))] - options("datatable.optimize" = 2L) - ansRef <- DT[eval(parse(text = thisQuery))] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, ansOpt, ansRef) + test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery))]) ## repeat the test with 'which = TRUE' - options("datatable.optimize" = 3L) - ansOpt <- DT[eval(parse(text = thisQuery)), which = TRUE] - options("datatable.optimize" = 2L) - ansRef <- DT[eval(parse(text = thisQuery)), which = TRUE] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, ansOpt, ansRef) + test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" for(thisBy in bys){ - options("datatable.optimize" = 3L) - ansOpt <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] - options("datatable.optimize" = 2L) - ansRef <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, ansOpt, ansRef) + test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) } } } } -options(datatable.optimize = Inf) # fread dec=',' e.g. France test(1439, fread("A;B\n1;2,34\n", dec="12"), error=base_messages$stopifnot("nchar(dec) == 1L")) @@ -8146,14 +7913,7 @@ test(1564.1, truelength(dt[, .SD]), 1025L) test(1564.2, truelength(dt[a==5, .SD]), 1025L) test(1564.3, dt[a==5, .SD][, b := 1L], data.table(a=5L, b=1L)) -# Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now -dt = data.table(a=sample(3,20,TRUE), b=1:10) -options(datatable.optimize = 0L) -test(1565.1, ans <- dt[, .N, by=a, verbose=TRUE], output="All optimizations are turned off") -options(datatable.optimize = 1L) -test(1565.2, dt[ , .N, by=a, verbose=TRUE], ans, output="lapply optimization is on, j unchanged") -options(datatable.optimize = Inf) -test(1565.3, dt[ , .N, by=a, verbose=TRUE], ans, output = "GForce optimized j to") +# test 1565 moved to optimize.Rraw # Fix for #1212 set.seed(123) @@ -8262,83 +8022,7 @@ test(1578.7, fread(f, skip=49L), data.table(V1=1:2, V2=3:4)) test(1578.8, fread(f, skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4)) test(1578.9, fread(f, skip=48L), data.table(V1=1:2, V2=3:4)) # start on blank line 49 and skip="auto" to first data row on line 50 -# gforce optimisations -dt = data.table(x = sample(letters, 300, TRUE), - i1 = sample(-10:10, 300, TRUE), - i2 = sample(c(-10:10, NA), 300, TRUE), - d1 = as.numeric(sample(-10:10, 300, TRUE)), - d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) -if (test_bit64) { - dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))] - dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))] -} - -# make sure gforce is on -options(datatable.optimize=2L) - -# testing gforce::gmedian -test(1579.01, dt[, lapply(.SD, median), by=x], - dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x]) -test(1579.02, dt[, lapply(.SD, median, na.rm=TRUE), by=x], - dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) -test(1579.03, dt[, lapply(.SD, median), keyby=x], - dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) -test(1579.04, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], - dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) -ans = capture.output(dt[, lapply(.SD, median), by=x, verbose=TRUE]) -test(1579.05, any(grepl("GForce optimized", ans)), TRUE) - -# testing gforce::ghead and gforce::gtail -# head(.SD, 1) and tail(.SD, 1) optimisation -test(1579.06, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x]) -test(1579.07, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x]) -test(1579.08, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x]) -test(1579.09, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x]) -test(1579.10, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x]) -test(1579.11, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x]) -test(1579.12, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x]) -test(1579.13, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x]) - -test(1579.14, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x]) -test(1579.15, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x]) -test(1579.16, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x]) -test(1579.17, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x]) -test(1579.18, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) -test(1579.19, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) -test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) -test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) -# 1579.22 tested gtail with n>1; now 1579.4+ below - -mysub <- function(x, n) x[n] -test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") -test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 -test(1579.28, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) - -# gforce head/tail for n>1, #5060 -set.seed(99) -DT = data.table(x = sample(letters[1:5], 20, TRUE), - y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly - i = sample(c(-2L,0L,3L,NA), 20, TRUE), - d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), - s = sample(c("foo","bar",NA), 20, TRUE), - l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) -if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] -options(datatable.optimize=2L) -test(1579.401, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize -test(1579.402, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") -test(1579.403, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") -test(1579.404, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") -test(1579.405, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") -test(1579.406, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") -test(1579.407, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") -test(1579.408, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") -test(1579.409, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(1579.410, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") - -options(datatable.optimize = Inf) +# test 1579 moved to optimize.Rraw # test for #1419, rleid doesn't remove names attribute x = c("a"=TRUE, "b"=FALSE) @@ -8346,104 +8030,12 @@ nx = copy(names(x)) r = rleid(x) test(1580, nx, names(x)) -# FR #971, partly addressed (only subsets in 'i') -# make sure GForce kicks in and the results are identical -dt = dt[, .(x, d1, d2)] -options(datatable.optimize=1L) - -test(1581.01, ans1 <- dt[x %in% letters[15:20], - c(.N, lapply(.SD, sum, na.rm=TRUE), - lapply(.SD, min, na.rm=TRUE), - lapply(.SD, max, na.rm=TRUE), - lapply(.SD, mean, na.rm=TRUE), - lapply(.SD, median, na.rm=TRUE) - ), by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=2L) -test(1581.02, ans2 <- dt[x %in% letters[15:20], - c(.N, lapply(.SD, sum, na.rm=TRUE), - lapply(.SD, min, na.rm=TRUE), - lapply(.SD, max, na.rm=TRUE), - lapply(.SD, mean, na.rm=TRUE), - lapply(.SD, median, na.rm=TRUE) - ), by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.03, ans1, ans2) - -# subsets in 'i' for head and tail -options(datatable.optimize=1L) -test(1581.04, ans1 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=2L) -test(1581.05, ans2 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.06, ans1, ans2) - -options(datatable.optimize=1L) -test(1581.07, ans1 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=2L) -test(1581.08, ans2 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.09, ans1, ans2) - -options(datatable.optimize=1L) -test(1581.10, ans1 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=2L) -test(1581.11, ans2 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.12, ans1, ans2) -options(datatable.optimize = Inf) - -# #3209 g[[ -options(datatable.optimize=1L) -test(1581.13, ans1 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=Inf) -test(1581.14, ans2 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.15, ans1, ans2) -# also, block for non-atomic input, #4159 -dt = data.table(a=1:3) -dt[ , l := .(list(1, 2, 3))] -test(1581.16, dt[ , .(l = l[[1L]]), by=a, verbose=TRUE], - dt[ , l := unlist(l)], output='(GForce FALSE)') -# make sure not to apply when `[[` is applied to a nested call, #4413 -DT = data.table(f1=c("a","b"), f2=c("x","y")) -l = list(a = c(x = "ax", y = "ay"), b = c(x = "bx", y = "by")) -test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")], - data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", "by"))) -test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")], - data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by"))) -# When the object being [[ is in parent.frame(), not x, -# need eval to have enclos=parent.frame(), #4612 -DT = data.table(id = c(1, 1, 2), value = c("a", "b", "c")) -DT0 = copy(DT) -fun = function(DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"] -fun(DT) -test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) +# test 1581 moved to optimize.Rraw # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) -# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level -# verbatim test from the original report: -options(datatable.optimize=Inf) # ensure gforce is on -DT = data.table( - C1 = c(rep("A", 4), rep("B",4), rep("C", 4)), - C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), - Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) -test(1583.1, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], - data.table(C1=c("A","A","B","B","C","C"), - C2=c("a","b","b","c","c","d"), - agg=c(1,4,5,8,9,10))) -# extra test with a size-1 group containing one NaN too -DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -test(1583.2, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) -test(1583.3, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) -test(1583.4, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) -test(1583.5, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +# test 1583 moved to optimize.Rraw # Fixed a minor bug in fread when blank.lines.skip=TRUE f1 <- function(x, f=TRUE, b=FALSE) fread(x, fill=f, blank.lines.skip=b, data.table=FALSE, logical01=FALSE) @@ -8563,32 +8155,7 @@ test(1592.2, names(setnames(DT, -1, c("m", "n"))), c("x", "m", "n")) # fix for #1513 test(1593, CJ(c(1,2,2), c(1,2,3)), data.table(V1=rep(c(1,2), c(3,6)), V2=c(1,2,3,1,1,2,2,3,3), key=c("V1", "V2"))) -# FR #523, var, sd and prod -options(datatable.optimize = Inf) # ensure gforce is on -DT = data.table(x=sample(5, 100, TRUE), - y1=sample(6, 100, TRUE), - y2=sample(c(1:10,NA), 100, TRUE), - z1=runif(100), - z2=sample(c(runif(10),NA,NaN), 100, TRUE)) -test(1594.01, DT[, lapply(.SD, var, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::var, na.rm=FALSE), by=x]) -test(1594.02, DT[, lapply(.SD, var, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::var, na.rm=TRUE), by=x]) -test(1594.03, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gvar") -# coverage: default group .N=1 case -idx=DT[ , .I[1L], by=x]$V1 -out=data.table(x=DT[(idx), x], V1=NA_real_) -test(1594.05, DT[(idx), var(y1), by=x], out) -test(1594.06, DT[(idx), var(y1, na.rm=TRUE), by=x], out) -test(1594.07, DT[(idx), var(z1), by=x], out) -test(1594.08, DT[(idx), var(z1, na.rm=TRUE), by=x], out) - -test(1594.09, DT[, lapply(.SD, sd, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::sd, na.rm=FALSE), by=x]) -test(1594.10, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) -test(1594.11, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gsd") - -test(1594.12, DT[, lapply(.SD, prod, na.rm=FALSE), by=x], DT[, lapply(.SD, base::prod, na.rm=FALSE), by=x]) -test(1594.13, DT[, lapply(.SD, prod, na.rm=TRUE), by=x], DT[, lapply(.SD, base::prod, na.rm=TRUE), by=x]) -test(1594.14, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gprod") - +# test 1594 moved to optimize.Rraw # FR #1517 dt1 = data.table(x=c(1,1,2), y=1:3) @@ -9211,24 +8778,20 @@ test(1629.07, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3]) dt2 = copy(dt) test(1629.08, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)]) # grouping operations -options(datatable.optimize = 1L) # no gforce -test(1629.09, dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) -ans1 = dt[, sum(v1), by=grp] +test(1629.09, levels=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) +ans1 = dt[, base::sum(v1), by=grp] ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3] -test(1629.10, dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) -test(1629.11, dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], +test(1629.10, levels=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) +test(1629.11, levels=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp]) -test(1629.12, dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) +test(1629.12, levels=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # gforce -options(datatable.optimize = Inf) # Inf -test(1629.13, dt[, c(v1=max(v1), lapply(.SD, min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # even more complex, shouldn't run any optimisation dt[, v4 := v1/2] test(1629.14, dt[, c(.(v1=v1*min(v4)), lapply(.SD, function(x) x*max(v4))), by=grp, .SDcols=v2:v3], dt[, .(v1=v1*min(v4), v2=v2*max(v4), v3=v3*max(v4)), by=grp]) test(1629.15, copy(dt)[, c("a", "b", "c") := c(min(v1), lapply(.SD, function(x) max(x)*min(v1))), by=grp, .SDcols=v3:v4], copy(dt)[, c("a", "b", "c") := .(min(v1), max(v3)*min(v1), max(v4)*min(v1)), by=grp]) -options(datatable.optimize = Inf) # by=.EACHI and operations with 'i' test(1629.16, dt[.(c(2,3)), c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=.EACHI, .SDcols=v2:v3, on="grp"], dt[grp %in% 2:3, c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=grp, .SDcols=v2:v3]) @@ -9311,11 +8874,7 @@ test(1637.3, dt[, data.table(a, .SD), by = a], data.table(a=1,a=1,b=1)) test(1637.4, dt[, data.table(b, .SD), by = cumsum(a)], data.table(cumsum=1, b=1, b=1)) test(1637.5, dt[, data.table(a, b), by = cumsum(a)], data.table(cumsum=1, a=1, b=1)) -# when datatable.optimize<1, no optimisation of j should take place: -options(datatable.optimize=0L) -dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) -test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") -options(datatable.optimize=Inf) +# test 1638 moved to optimize.Rraw rm_all() @@ -13197,9 +12756,8 @@ DT[ , V1:=as.ordered(V1)] test(1918.3, DT[, min(V1)], structure(1L, .Label = lev, class = c("ordered", "factor"))) test(1918.4, DT[, max(V1)], structure(5L, .Label = lev, class = c("ordered", "factor"))) ## make sure GForce is activated -options(datatable.optimize = Inf) -test(1918.5, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) -test(1918.6, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) +test(1918.5, levels=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) +test(1918.6, levels=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) # as.ITime.character bug for NA handling #2940 test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) @@ -14082,17 +13640,12 @@ suppressWarnings(rm(`___data.table_internal_test_1967.68___`)) test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot find symbol') ### [.data.table verbosity & non-equi-join tests -options(datatable.optimize = 0L) -verbose_output = capture.output(x[order(a), .N, verbose = TRUE]) -test(1967.69, !any(grepl('forder.c', verbose_output, fixed = TRUE))) -test(1967.70, any(grepl('[1] 5', verbose_output, fixed = TRUE))) -options('datatable.optimize' = 1L) -test(1967.71, x[order(a), .N, verbose = TRUE], 5L, +test(1967.69, levels=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') +test(1967.71, levels=1L, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") setkey(x) -test(1967.72, x[x, .N, on = 'a', verbose = TRUE], 5L, +test(1967.72, levels=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") -options(datatable.optimize = Inf) x = data.table( i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), @@ -14243,14 +13796,9 @@ x <- as.array(1:5) test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 -options(datatable.optimize=0L) DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.1, DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.2, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") -options(datatable.optimize=Inf) -DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.3, DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +test(1981.1, levels=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) +test(1981.2, levels=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # 1982 moved to benchmark.Rraw, #5517 @@ -14282,8 +13830,7 @@ test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*t test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error="The items in the 'by' or 'keyby' list have lengths [1, 2]. Each must be length 10; the same length as there are rows in x (after subsetting if i is provided).") -options('datatable.optimize' = Inf) -test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE], +test(1984.10, levels=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], data.table(a = c(1, 0), V1 = c(1, 1)), output = 'Optimization is on but left j unchanged') DT[ , f := rep(1:2, each = 5)] @@ -15168,16 +14715,11 @@ test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], # gforce wrongly applied to external variable; #875 DT = data.table(x=INT(1,1,1,2,2), y=1:5) z = 1:5 -options(datatable.optimize = Inf) -test(2043.1, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) -options(datatable.optimize = 1L) -test(2043.2, DT[, list(mean(z), mean(y)), by=x], ans) -options(datatable.optimize = 0L) -test(2043.3, DT[, list(mean(z), mean(y)), by=x], ans) -options(datatable.optimize = Inf) -test(2043.4, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) +opt = c(Inf,1L,0L) +test(2043.1, levels=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) +test(2043.4, levels=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) z = 1:4 -test(2043.5, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z +test(2043.5, levels=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z # test type coercion in joins, #2592 dt1 <- data.table(int = 1L:10L, @@ -16340,12 +15882,10 @@ test(2098.1, DT[do.call(order, mget(groups)), verbose=TRUE], ans<-data.table(id= test(2098.2, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, output=out) test(2098.3, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) test(2098.4, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -old = options(datatable.optimize=0L) -test(2098.5, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") -test(2098.6, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") -test(2098.7, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) -test(2098.8, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -options(old) +test(2098.5, levels=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") +test(2098.6, levels=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") +test(2098.7, levels=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) +test(2098.8, levels=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) # Error in update join when joining on factor, #3559 d1 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = c(1L, NA, 3L, NA)) @@ -17770,16 +17310,14 @@ test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i # gmin/gmax for integer64, #4444 if (test_bit64) { DT = data.table(grp=c(1L, 1L, 1L, 2L), i64=as.integer64(c(NA, 1:3))) - old = options(datatable.optimize=2L) - test(2208.1, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.2, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) - test(2208.3, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.4, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) + test(2208.1, levels=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.2, levels=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) + test(2208.3, levels=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.4, levels=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) # create an all-NA group DT[, i64:=rev(i64)] - test(2208.7, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) - test(2208.8, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) - options(old) + test(2208.7, levels=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) + test(2208.8, levels=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) } # when user supplies dec=',' don't try sep=',', #4483 @@ -17995,17 +17533,16 @@ if (test_bit64) test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data # gforce improve coverage DT = data.table(g=1:2, i=c(NA, 1:4, NA), f=factor(letters[1:6]), l=as.list(1:6)) -options(datatable.optimize = 2L) funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L - test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + test(2220.0 + testnum*0.01, levels=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") testnum = testnum + 1L - test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + test(2220.0 + testnum*0.01, levels=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } testnum = testnum + 1L -test(2220.0 + testnum*0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +test(2220.0 + testnum*0.01, levels=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -18021,7 +17558,6 @@ test(2223.1, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) test(2223.2, DT[.(4), nomatch=NA_character_], data.table(A=4L, key="A")) # gshift, #5205 -options(datatable.optimize = 2L) set.seed(123) DT = data.table(x = sample(letters[1:5], 20, TRUE), y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly @@ -18043,7 +17579,7 @@ for (col in names(DT)[-1]) { for (type in c('lag','lead','shift','cyclic')) { # fill is tested by group in tests 2218.*; see comments in #5205 # sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517 - test(2224.1+i/10000, # 192 tests here when test_bit64=TRUE; 168 when FALSE + test(2224.1+i/10000, levels=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), ans[[i]]) i = i+1L @@ -18132,86 +17668,8 @@ test(2230.12, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x test(2230.13, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, unk1=1L, unk2=2L, 3L, 4L), merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x`/`by.y`. `by` argument will be ignored.", "2 unnamed arguments.*2 unknown keyword arguments.*\\[unk1, unk2\\]")) +# tests 2231 moved to optimize.Rraw -# weighted.mean GForce optimized, #3977 -old = options(datatable.optimize=1L) -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE") -test(2231.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE") -test(2231.03, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce FALSE") -# multiple groups -DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.04, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") -test(2231.05, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") -test(2231.06, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce FALSE") -# (only x XOR w) containing NA -DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.07, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") -test(2231.08, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce FALSE") -test(2231.09, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") -test(2231.10, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -# (only x XOR w) containing NaN -DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.11, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce FALSE") -test(2231.12, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") -test(2231.13, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce FALSE") -test(2231.14, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -# (only x XOR w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.15, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") -test(2231.16, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") -test(2231.17, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") -test(2231.18, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -# (x and w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.19, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") -test(2231.20, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.21, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") -test(2231.22, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -# same as previous test cases but now GForce optimized -options(datatable.optimize=2L) -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.31, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce optimized j to") -test(2231.32, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce optimized j to") -test(2231.33, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") -# multiple groups -DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.34, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") -test(2231.35, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") -test(2231.36, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce optimized j to") -# (only x XOR w) containing NA -DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.37, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") -test(2231.38, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce optimized j to") -test(2231.39, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") -test(2231.40, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -# (only x XOR w) containing NaN -DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.41, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce optimized j to") -test(2231.42, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") -test(2231.43, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce optimized j to") -test(2231.44, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -# (only x XOR w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.45, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") -test(2231.46, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") -test(2231.47, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") -test(2231.48, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -# (x and w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.49, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") -test(2231.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") -test(2231.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -# let wrongly named arguments get lost in ellipsis #5543 -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.61, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output="GForce optimized j to") -test(2231.62, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") -test(2231.63, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) -test(2231.64, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) -options(old) # cols argument for unique.data.table, #5243 DT = data.table(g = rep(letters, 3), v1=1:78, v2=78:1) @@ -18223,17 +17681,18 @@ test(2232.3, unique(DT[1:26], by='g', cols='v1'), DT[1:26, !'v2']) test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") # support := with GForce #1414 -options(datatable.optimize = 2L) DT = data.table(a=1:3,b=(1:9)/10) -test(2233.01, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output="GForce optimized j to") +opt = 0:2 +out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") +test(2233.01, levels=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) # GForce returning full length -test(2233.02, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output="GForce optimized j to") +test(2233.02, levels=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) # GForce neither returning 1 per group nor full length -test(2233.03, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +test(2233.03, levels=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") # compare to non GForce version DT = data.table(a=1:3,b=(1:9)/10) -test(2233.04, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") -test(2233.05, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") +test(2233.04, levels=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) +test(2233.05, levels=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) # with key and grouping by key DT = data.table(a=1:3,b=(1:9)/10, key="a") @@ -18304,27 +17763,21 @@ test(2233.38, copy(DT)[, val:=v[1L], keyby=.(A,B), verbose=TRUE], data.table(A=I set.seed(10) n = 100 a = data.table(id1=1:n, id2=sample(1:900,n,replace=TRUE), flag=sample(c(0,0,0,1),n,replace=TRUE)) -for (opt in c(0,Inf)) { - options(datatable.optimize=opt) - out = if (opt) "GForce.*gsum" else "GForce FALSE" - B = copy(a) - A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle - num_bump = (opt>0)/100 - test(2233.39+num_bump+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= - setorder(A, id1) - test(2233.39+num_bump+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) - test(2233.39+num_bump+0.003, any(A[,t1!=t2]), FALSE) - test(2233.39+num_bump+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) - test(2233.39+num_bump+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) -} +opt = c(0,Inf) +out = c("GForce FALSE", "GForce.*gsum") +B = copy(a) +A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle +test(2233.391, levels=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= +setorder(A, id1) +test(2233.392, levels=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) +test(2233.393, levels=opt, any(A[,t1!=t2]), FALSE) +test(2233.394, levels=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) +test(2233.395, levels=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) # test from #5337 n=4; k=2 mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] -options(datatable.optimize=0) -test(2233.41, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce FALSE") -options(datatable.optimize=Inf) -test(2233.42, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce.*gsum") +test(2233.41, levels=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) # test from #5345 set.seed(1) DT = data.table( @@ -18335,32 +17788,20 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -test(2233.43, - options = list(datatable.verbose=TRUE, datatable.optimize=0), +test(2233.43, levels=c(0,Inf), options = list(datatable.verbose=TRUE), copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") ][, n_idT :=dim(.SD)[[1]], by=list(t, id) ][, sum_v2_id :=sum(v2), by=.(id) ][, sum_v1_idT:=sum(v1), by=c("id", "t") ][, sum_v1_id :=sum(v1), by=c("id")], ans, - output="GForce FALSE") -test(2233.44, - options = list(datatable.verbose=TRUE, datatable.optimize=Inf), - copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") - ][, n_idT :=dim(.SD)[[1]], by=list(t, id) - ][, sum_v2_id :=sum(v2), by=.(id) - ][, sum_v1_idT:=sum(v1), by=c("id", "t") - ][, sum_v1_id :=sum(v1), by=c("id")], - ans, - output="GForce.*gsum") + output=c("GForce FALSE", "GForce.*gsum")) # optimized := with gforce functions that can return lists #5403 -old = options(datatable.verbose=TRUE) DT = data.table(grp=1:2, x=1:4) out = "Making each group and running j (GForce TRUE)" -test(2233.45, copy(DT)[, c("y", "z") := .(shift(x, type="lag", n=1), shift(x, type="lead", n=1)), by=grp], data.table(grp=1:2, x=1:4, y=c(NA, NA, 1:2), z=c(3:4, NA, NA)), output=out) -test(2233.46, copy(DT)[, l := shift(x, n=c(0, 0)), by=grp], data.table(grp=1:2, x=1:4, l=list(INT(1, 1), INT(2, 2), INT(3, 3), INT(4, 4))), output=out) -test(2233.47, copy(DT)[, c("l1", "l2") := shift(x, n=c(-1, 1)), by=grp], data.table(grp=1:2, x=1:4, l1=c(3:4,NA,NA), l2=c(NA,NA,1:2)), output=out) -options(old) +test(2233.45, options=c(datatable.verbose=TRUE), copy(DT)[, c("y", "z") := .(shift(x, type="lag", n=1), shift(x, type="lead", n=1)), by=grp], data.table(grp=1:2, x=1:4, y=c(NA, NA, 1:2), z=c(3:4, NA, NA)), output=out) +test(2233.46, options=c(datatable.verbose=TRUE), copy(DT)[, l := shift(x, n=c(0, 0)), by=grp], data.table(grp=1:2, x=1:4, l=list(INT(1, 1), INT(2, 2), INT(3, 3), INT(4, 4))), output=out) +test(2233.47, options=c(datatable.verbose=TRUE), copy(DT)[, c("l1", "l2") := shift(x, n=c(-1, 1)), by=grp], data.table(grp=1:2, x=1:4, l1=c(3:4,NA,NA), l2=c(NA,NA,1:2)), output=out) # support by=.I; #1732 DT = data.table(V1=1:5, V2=3:7, V3=5:1) @@ -18497,20 +17938,14 @@ test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table( dt = data.table(x = c(2,2,1,1), y = 1:4, z=letters[1:4]) i=c(1,2) j=1L -test(2243.41, options=c(datatable.optimize=1L), dt[, .I[TRUE], x]$V1, 1:4) -test(2243.42, options=c(datatable.optimize=1L), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) -test(2243.51, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.52, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.54, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.55, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") +opt = c(1L,2L) +out = c("GForce FALSE", "GForce TRUE") +test(2243.41, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.42, levels=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) +test(2243.54, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) +test(2243.55, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") @@ -18531,11 +17966,9 @@ test(2245.3, dt[1], data.table(foo = 1L, bar = 4L)) # Default in this environmen # data.table:: doesn't turn off GForce, #5942 DT = data.table(a = rep(1:5, 2L), b = 1:10) -old = options(datatable.optimize=Inf, datatable.verbose=TRUE) -test(2246.1, DT[, data.table::shift(b), by=a], DT[, shift(b), by=a], output="GForce TRUE") -test(2246.2, DT[, data.table::first(b), by=a], DT[, first(b), by=a], output="GForce TRUE") -test(2246.3, DT[, data.table::last(b), by=a], DT[, last(b), by=a], output="GForce TRUE") -options(old) +test(2246.1, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::shift(b), by=a], DT[, shift(b), by=a], output="GForce TRUE") +test(2246.2, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::first(b), by=a], DT[, first(b), by=a], output="GForce TRUE") +test(2246.3, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::last(b), by=a], DT[, last(b), by=a], output="GForce TRUE") # 5392 split(x,f) works with formula f dt = data.table(x=1:4, y=factor(letters[1:2])) @@ -18833,14 +18266,7 @@ test(2262.6, set(null.data.table(), j=c("a","b"), value=list(1:2, 3:4)), dt3) test(2262.7, data.table(a=1, b=2)[, c("a", "b") := list(NULL, NULL)], null.data.table()) test(2262.8, data.table(a=1, b=2)[, c("a", "b") := list(NULL)], null.data.table()) -# GForce retains attributes in by arguments #5567 -dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) -test(2263.1, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce FALSE") -test(2263.2, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce FALSE") -test(2263.3, options=list(datatable.verbose=TRUE, datatable.optimize=0L), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce FALSE") -test(2263.4, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce optimized j to") -test(2263.5, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce optimized j to") -test(2263.6, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce optimized j to") +# test 2263 moved to optimize.Rraw # tests for printing indices alongside data.tables NN = 200 @@ -18976,7 +18402,7 @@ test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) # Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL DT = data.table(a = 1, b = 2) -test(2270, options=c(datatable.optimize=1L), DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") +test(2270, levels=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") # Missing newline in verbose output -> harder to read DT1 = data.table(a=1:2) @@ -19288,75 +18714,7 @@ test(2282.08, rowwiseDT(A=,B=,1,2,C=,4), error="Header must be the first N argum ncols = 1e6 test(2282.09, rowwiseDT(A=,ncols), data.table(A=ncols)) -# named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 - -M <- as.data.table(mtcars) -M[, " " := hp] -M[, "." := hp] - -sdnames <- setdiff(names(M), "cyl") -sdlist <- vector("list", length(sdnames)) -names(sdlist) <- sdnames - -for (opt in c(0, 1, 2)) { - test(2283 + opt/10 + 0.001, options=c(datatable.optimize=opt), - names(M[, c(m=lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(m=sdlist)))) - test(2283 + opt/10 + 0.002, options=c(datatable.optimize=opt), - names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", "Mpg", sdnames)) - test(2283 + opt/10 + 0.003, options=c(datatable.optimize=opt), - names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), - c("cyl", "Mpg", names(c(m=sdlist)))) - test(2283 + opt/10 + 0.004, options=c(datatable.optimize=opt), - names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), - c("cyl", "mpg", names(c(mpg=sdlist)))) - test(2283 + opt/10 + 0.005, options=c(datatable.optimize=opt), - names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", "V1", sdnames)) - test(2283 + opt/10 + 0.006, options=c(datatable.optimize=opt), - names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), - c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) - test(2283 + opt/10 + 0.007, options=c(datatable.optimize=opt), - names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), - c("cyl", sdnames, sdnames)) - test(2283 + opt/10 + 0.008, options=c(datatable.optimize=opt), - names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), - c("cyl", names(c(mean=sdlist, sum=sdlist)))) - test(2283 + opt/10 + 0.009, options=c(datatable.optimize=opt), - names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), - c("cyl", sdnames, names(c(sum=sdlist))) ) - test(2283 + opt/10 + 0.010, options=c(datatable.optimize=opt), - names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), - c("cyl", names(c(" "=sdlist, "."=sdlist)))) - test(2283 + opt/10 + 0.011, options=c(datatable.optimize=opt), - names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(a=0, b=0))), sdnames)) - test(2283 + opt/10 + 0.012, options=c(datatable.optimize=opt), - names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0, 0))), sdnames)) - test(2283 + opt/10 + 0.013, options=c(datatable.optimize=opt), - names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) - test(2283 + opt/10 + 0.014, options=c(datatable.optimize=opt), - names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0))), sdnames)) - test(2283 + opt/10 + 0.015, options=c(datatable.optimize=opt), - names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) - test(2283 + opt/10 + 0.016, options=c(datatable.optimize=opt), - names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) - test(2283 + opt/10 + 0.017, options=c(datatable.optimize=opt), - names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "b", "vs", "am")) - test(2283 + opt/10 + 0.018, options=c(datatable.optimize=opt), - names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "b", "vs", "am")) - test(2283 + opt/10 + 0.019, options=c(datatable.optimize=opt), - names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "V2", "b", "vs", "am")) -} +# test 2283 moved tests to optimize.Rraw # Confusing behavior with DT[, min(var):max(var)] #2069 DT = data.table(t = c(2L, 1L, 3L), a=0, b=1) diff --git a/man/test.Rd b/man/test.Rd index 594040aca..4a25a592c 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL, env = NULL) + options = NULL, env = NULL, levels = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -22,6 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } +\item{levels}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize.level=level)} set accordingly. All levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y for equality. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From 04e578230641f00646a297b9285d2ff74b499eac Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Thu, 30 Oct 2025 16:35:03 +0100 Subject: [PATCH 04/27] add lapply(list(col1, col2, ...), fun) pattern --- R/data.table.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/R/data.table.R b/R/data.table.R index aa184f5c4..ffe5cfc24 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -364,6 +364,18 @@ replace_dot_alias = function(e) { jvnames = massage_result$jvnames funi = massage_result$funi } + # Pattern 3a2: lapply(list(col1, col2, ...), fun) + else if (is.call(jsub) && jsub %iscall% "lapply" && length(jsub) >= 2L && + jsub[[2L]] %iscall% "list" && length(jsub[[2L]]) > 1L) { + cnames = as.list(jsub[[2L]])[-1L] + if (all(vapply_1b(cnames, is.name))) { + cnames = vapply_1c(cnames, as.character) + massage_result = .massageSD(jsub, cnames, SDenv, funi) + jsub = massage_result$jsub + jvnames = NULL # consistent with datatable.optimize=0L behavior + funi = massage_result$funi + } + } # Pattern 3b: Map(fun, .SD) else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars)) { massage_result = .massageSD(jsub, sdvars, SDenv, funi) From a8dde19a1aa0cca46eb9f9cfb318c57f8d79989e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 31 Oct 2025 14:51:43 +0100 Subject: [PATCH 05/27] turn on optimization --- R/data.table.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index ffe5cfc24..c7eed447a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -225,7 +225,7 @@ replace_dot_alias = function(e) { # For now, we optimise all functions mentioned in 'optfuns' below. optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") is_valid = TRUE - any_SD = FALSE + any_optimized = FALSE jsubl = as.list.default(jsub) oldjvnames = jvnames jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. @@ -235,7 +235,7 @@ replace_dot_alias = function(e) { # Case 1: Plain name (.SD or .N) if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names if (this == ".SD") { # optimise '.SD' alone - any_SD = TRUE + any_optimized = TRUE jsubl[[i_]] = lapply(sdvars, as.name) jvnames = c(jvnames, sdvars) } else if (this == ".N") { @@ -254,7 +254,7 @@ replace_dot_alias = function(e) { is_lapply = this[[1L]] == "lapply" && length(this) >= 2L && this[[2L]] == ".SD" is_map = this[[1L]] == "Map" && length(this) >= 3L && this[[3L]] == ".SD" if ((is_lapply || is_map) && length(sdvars)) { - any_SD = TRUE + any_optimized = TRUE massage_result = .massageSD(this, sdvars, SDenv, funi) funi = massage_result$funi jsubl[[i_]] = as.list(massage_result$jsub[-1L]) # just keep the '.' from list(.) @@ -289,6 +289,7 @@ replace_dot_alias = function(e) { jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && jvnames = c(jvnames, jn__) jsubl[[i_]] = jl__ + any_optimized = TRUE } } # Case 2c: Single-value functions like mean, sum, etc. @@ -299,7 +300,7 @@ replace_dot_alias = function(e) { else if (length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N")) { # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. - any_SD = TRUE + any_optimized = TRUE jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) jvnames = c(jvnames, sdvars) } @@ -326,7 +327,7 @@ replace_dot_alias = function(e) { } # Return result - if (!is_valid || !any_SD) { + if (!is_valid || !any_optimized) { # Can't optimize - return original list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE) } else { From 67f28749e1e18e7e90a76437b5c9b1139597043e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 1 Nov 2025 16:50:33 +0100 Subject: [PATCH 06/27] add type conversion support to GForce --- R/data.table.R | 20 ++++++++++++++++---- src/gsumm.c | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index c7eed447a..52686ed81 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -378,7 +378,9 @@ replace_dot_alias = function(e) { } } # Pattern 3b: Map(fun, .SD) - else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars)) { + # Only optimize if .SD appears exactly once to avoid cases like Map(rep, .SD, .SD) + else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars) && + sum(vapply_1b(as.list(jsub), function(x) identical(x, quote(.SD)))) == 1L) { massage_result = .massageSD(jsub, sdvars, SDenv, funi) jsub = massage_result$jsub jvnames = massage_result$jvnames @@ -3313,13 +3315,15 @@ is_constantish = function(q, check_singleton=FALSE) { .gweighted.mean_ok = function(q, x) { #3977 q = match.call(gweighted.mean, q) is_constantish(q[["na.rm"]]) && + !(is.symbol(q[["na.rm"]]) && q[["na.rm"]] %chin% names(x)) && (is.null(q[["w"]]) || eval(call('is.numeric', q[["w"]]), envir=x)) } # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD .get_gcall = function(q) { if (!is.call(q)) return(NULL) # is.symbol() is for #1369, #1974 and #2949 - if (!is.symbol(q[[2L]])) return(NULL) + if (!is.symbol(q[[2L]]) && !is.call(q[[2L]])) return(NULL) + if (is.call(q[[2L]]) && !.is_type_conversion(q[[2L]])) return(NULL) q1 = q[[1L]] if (is.symbol(q1)) return(if (q1 %chin% gfuns) q1) if (!q1 %iscall% "::") return(NULL) @@ -3332,12 +3336,20 @@ is_constantish = function(q, check_singleton=FALSE) { # is robust to unnamed expr. Note that NA names are not possible here. .arg_is_narm = function(expr, which=3L) !is.null(nm <- names(expr)[which]) && startsWith(nm, "na") +.is_type_conversion = function(expr) { + is.call(expr) && is.symbol(expr[[1L]]) && expr[[1L]] %chin% + c("as.numeric", "as.double", "as.integer", "as.character", "as.integer64", + "as.complex", "as.logical", "as.Date", "as.POSIXct", "as.factor") +} + .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 q1 = .get_gcall(q) if (is.null(q1)) return(FALSE) - if (!(q2 <- q[[2L]]) %chin% names(x) && q2 != ".I") return(FALSE) # 875 - if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]))) return(TRUE) + q2 = if (.is_type_conversion(q[[2L]]) && is.symbol(q[[2L]][[2L]])) q[[2L]][[2L]] else q[[2L]] + if (!q2 %chin% names(x) && q2 != ".I") return(FALSE) # 875 + if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]) && + !(is.symbol(q[[3L]]) && q[[3L]] %chin% names(x)))) return(TRUE) switch(as.character(q1), "shift" = .gshift_ok(q), "weighted.mean" = .gweighted.mean_ok(q, x), diff --git a/src/gsumm.c b/src/gsumm.c index 5970f5919..be8ce1611 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -410,7 +410,7 @@ SEXP gsum(SEXP x, SEXP narmArg) //Rprintf(_("gsum int took %.3f\n"), wallclock()-started); if (overflow) { UNPROTECT(1); // discard the result with overflow - warning(_("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience.")); + warning(_("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience. Consider using 'as.numeric' on the column beforehand to avoid this warning.")); ans = PROTECT(allocVector(REALSXP, ngrp)); double *restrict ansp = REAL(ans); memset(ansp, 0, ngrp*sizeof(double)); From 2876ebef76346c1f315c74c498e9f39e91f4a5f4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 1 Nov 2025 16:57:13 +0100 Subject: [PATCH 07/27] remove stale branch --- R/test.data.table.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index bfc61412a..decb600bb 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -380,8 +380,6 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no val = get(p, envir=environment()) if (length(val) > 0L) { cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # ccycle through values if fewer than levels - } else if (p %in% names(cl)) { - cl[[p]] = NULL } } From c445c3830ce83428181f6fe9559de8e059d95161 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 15:13:14 +0100 Subject: [PATCH 08/27] add tests --- inst/tests/optimize.Rraw | 455 +++++++++++++++++++++++++++++++++++++++ inst/tests/tests.Rraw | 16 ++ tests/optimize.R | 2 + 3 files changed, 473 insertions(+) create mode 100644 inst/tests/optimize.Rraw create mode 100644 tests/optimize.R diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw new file mode 100644 index 000000000..57d53f052 --- /dev/null +++ b/inst/tests/optimize.Rraw @@ -0,0 +1,455 @@ +require(methods) +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test + null.data.table = data.table:::null.data.table + INT = data.table:::INT +} + +sugg = c("bit64") +for (s in sugg) { + assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages( + library(s, character.only=TRUE, logical.return=TRUE, quietly=TRUE, warn.conflicts=FALSE, pos="package:base") # attach at the end for #5101 + ))) + if (!loaded) cat("\n**** Suggested package",s,"is not installed or has dependencies missing. Tests using it will be skipped.\n\n") +} + +# := by group +DT = data.table(a=1:3,b=(1:9)/10) +test(611.1, levels=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +setkey(DT,a) +test(611.2, levels=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +# Combining := by group with i +test(611.3, levels=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4, levels=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +# 612 was just level repetition of 611 +# Assign to subset ok (NA initialized in the other items) ok : +test(613, levels=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) +test(614, levels=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) +test(615, levels=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) + +# Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 +ans = copy(DT)[,r:=NA_real_] +test(618.1, levels=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) +test(618.2, levels=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3, levels=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) +test(618.4, levels=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) +test(618.5, levels=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) +# test 619 was level 2 of 618 + +DT = data.table(x=letters, key="x") +test(621, levels=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained +test(622, levels=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") + +set.seed(2) +DT = data.table(a=rnorm(5)*10, b=1:5) +test(623, levels=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) + +# Setup for test 656.x - gforce tests +set.seed(9) +n = 1e3 +DT = data.table(grp1=sample.int(150L, n, replace=TRUE), + grp2=sample.int(150L, n, replace=TRUE), + x=rnorm(n), + y=rnorm(n)) +opt = 0:2 +out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') +test(656.1, levels=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) +test(656.2, levels=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) +test(656.3, levels=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) + +# Test := keyby does setkey, #2065 +DT = data.table(x=1:2, y=1:6) +ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") +test(670.1, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) +test(670.2, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) +test(670.3, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), + warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") +test(670.4, levels=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +test(670.5, levels=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +# test 671 was level 2 of 670 + +# varname holding colnames, by group, linked from #2120. +DT = data.table(a=rep(1:3,1:3),b=1:6) +colname = "newcol" +test(751, levels=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) + +# Add tests for nested := in j by group, #1987 +DT = data.table(a=rep(1:3,2:4),b=1:9) +test(752, levels=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) + +DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) +opt = c(0:2, Inf) +out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') +# v1.9.7 treats wrapped {} better, so this is now optimized +test(865, levels=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) +test(867, levels=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here + +# tests of gsum and gmean with NA +DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) +set(DT,c(3L,8L),"y",NA) +set(DT,c(5L,9L),"v",NA) +set(DT,10:12,"y",NA) +set(DT,10:12,"v",NA) +opt = c(1L, 2L) +out = c("(GForce FALSE)", "GForce optimized j to") +test(1184.1, levels=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) +test(1184.2, levels=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1185.2, levels=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], + output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) +test(1187.1, levels=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], + data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) +MyVar = TRUE +test(1187.2, levels=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.3, levels=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +MyVar = FALSE +test(1187.4, levels=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.5, levels=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +# GForce should not turn on when the .ok function isn't triggered +test(1187.6, levels=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], + data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), + output='j unchanged', warning="'trim' is not yet optimized") + +# FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. +set.seed(2L) +dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) +test(1304.1, levels=0:2, dt[, list(.N, sum(y)), by=x]) +dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") +test(1304.2, levels=0:2, dt[, list(.N, sum(y)), by=x]) + +# gmin and gmax extensive testing (because there are tricky cases) +DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) +opts = 0:2 +# for integers +test(1313.01, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.02, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.03, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.04, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) +DT[x==6, y := INT(NA)] +test(1313.05, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.06, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.07, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) +# for numeric +DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) +test(1313.09, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.10, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.11, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.12, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) +DT[x==6, y := NA_real_] +test(1313.13, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.14, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.15, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) +# for date (attribute check.. especially after issues/689 !!!) +DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) +test(1313.17, levels=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.18, levels=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) +DT[c(1,6), y := NA] +test(1313.19, levels=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.20, levels=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) +test(1313.21, levels=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) +test(1313.22, levels=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) +# for character +set.seed(1L) +DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) +DT[x==7, y := c("","b","c")] +test(1313.23, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.24, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.25, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.26, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +DT[x==6, y := NA_character_] +test(1313.27, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.28, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.29, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) + +# Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now +dt = data.table(a=sample(3,20,TRUE), b=1:10) +test(1565.1, levels=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], + output=c("All optimizations are turned off", "lapply optimization is on, j unchanged", "GForce optimized j to")) + +# gforce optimisations +dt = data.table(x = sample(letters, 300, TRUE), + i1 = sample(-10:10, 300, TRUE), + i2 = sample(c(-10:10, NA), 300, TRUE), + d1 = as.numeric(sample(-10:10, 300, TRUE)), + d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) +if (test_bit64) { + dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))] + dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))] +} +opt = 0:2 +out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') +# make sure gforce is on +# testing gforce::gmedian +test(1579.01, levels=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], + dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x], output="GForce optimized") +test(1579.02, levels=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], + dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) +test(1579.03, levels=2L, dt[, lapply(.SD, median), keyby=x], + dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) +test(1579.04, levels=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], + dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) +# testing gforce::ghead and gforce::gtail +# head(.SD, 1) and tail(.SD, 1) optimisation +test(1579.06, levels=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.08, levels=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.10, levels=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.12, levels=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.14, levels=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.16, levels=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.18, levels=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.20, levels=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) +# 1579.22 tested gtail with n>1; now 1579.4+ below +mysub <- function(x, n) x[n] +test(1579.23, levels=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24, levels=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25, levels=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26, levels=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27, levels=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28, levels=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) +# gforce head/tail for n>1, #5060 +set.seed(99) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) +if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] +test(1579.401, levels=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402, levels=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403, levels=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404, levels=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405, levels=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406, levels=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407, levels=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408, levels=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409, levels=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410, levels=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") + + +# FR #971, partly addressed (only subsets in 'i') +# make sure GForce kicks in and the results are identical +dt = data.table(x = sample(letters, 300, TRUE), + d1 = as.numeric(sample(-10:10, 300, TRUE)), + d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) +opt = 1:2 +out = c("GForce FALSE","GForce optimized j") +test(1581.01, levels=opt, ans1 <- dt[x %in% letters[15:20], + c(.N, lapply(.SD, sum, na.rm=TRUE), + lapply(.SD, min, na.rm=TRUE), + lapply(.SD, max, na.rm=TRUE), + lapply(.SD, mean, na.rm=TRUE), + lapply(.SD, median, na.rm=TRUE) + ), by=x, verbose=TRUE], + output = out) + +# subsets in 'i' for head and tail +test(1581.04, levels=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.07, levels=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.10, levels=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) +# #3209 g[[ +test(1581.13, levels=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) +# also, block for non-atomic input, #4159 +dt = data.table(a=1:3) +dt[ , l := .(list(1, 2, 3))] +test(1581.16, dt[ , .(l = l[[1L]]), by=a, verbose=TRUE], + dt[ , l := unlist(l)], output='(GForce FALSE)') +# make sure not to apply when `[[` is applied to a nested call, #4413 +DT = data.table(f1=c("a","b"), f2=c("x","y")) +l = list(a = c(x = "ax", y = "ay"), b = c(x = "bx", y = "by")) +test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")], + data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", "by"))) +test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")], + data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by"))) +# When the object being [[ is in parent.frame(), not x, +# need eval to have enclos=parent.frame(), #4612 +DT = data.table(id = c(1, 1, 2), value = c("a", "b", "c")) +DT0 = copy(DT) +fun = function(DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"] +fun(DT) +test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) + +# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level +# verbatim test from the original report: +DT = data.table( + C1 = c(rep("A", 4), rep("B",4), rep("C", 4)), + C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), + Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) +opt = 0:2 +test(1583.1, levels=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], + data.table(C1=c("A","A","B","B","C","C"), + C2=c("a","b","b","c","c","d"), + agg=c(1,4,5,8,9,10))) +# extra test with a size-1 group containing one NaN too +DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) +test(1583.2, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4, levels=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5, levels=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) + +# FR #523, var, sd and prod +DT = data.table(x=sample(5, 100, TRUE), + y1=sample(6, 100, TRUE), + y2=sample(c(1:10,NA), 100, TRUE), + z1=runif(100), + z2=sample(c(runif(10),NA,NaN), 100, TRUE)) +opt = 0:2 +out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") +test(1594.01, levels=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) +test(1594.02, levels=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) +test(1594.03, levels=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) +# coverage: default group .N=1 case +idx=DT[ , .I[1L], by=x]$V1 +ans=data.table(x=DT[(idx), x], V1=NA_real_) +test(1594.05, levels=opt, DT[(idx), var(y1), by=x], ans) +test(1594.06, levels=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) +test(1594.07, levels=opt, DT[(idx), var(z1), by=x], ans) +test(1594.08, levels=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) + +test(1594.09, levels=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) +test(1594.10, levels=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) +test(1594.11, levels=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) + +test(1594.12, levels=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) +test(1594.13, levels=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) +test(1594.14, levels=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) + +# when datatable.optimize<1, no optimisation of j should take place: +dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) +test(1638, options=c(datatable.optimize=0L), dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") + +# weighted.mean GForce optimized, #3977 +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +opt = c(1L,2L) +out = c("GForce FALSE", "GForce optimized j to") +test(2231.01, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) +test(2231.02, levels=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) +test(2231.03, levels=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.04, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.05, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.06, levels=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.07, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.08, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) +test(2231.09, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.10, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.11, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) +test(2231.12, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.13, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) +test(2231.14, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.15, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.16, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.17, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.18, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.19, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.20, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.21, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.22, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# let wrongly named arguments get lost in ellipsis #5543 +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.61, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) +test(2231.62, levels=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.63, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) +test(2231.64, levels=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) + +# GForce retains attributes in by arguments #5567 +dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) +opt = c(0,Inf) +out = c("GForce FALSE", "GForce optimized j to") +test(2263.1, levels=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) +test(2263.2, levels=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) +test(2263.3, levels=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) + +# named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 +M <- as.data.table(mtcars) +M[, " " := hp] +M[, "." := hp] + +sdnames <- setdiff(names(M), "cyl") +sdlist <- vector("list", length(sdnames)) +names(sdlist) <- sdnames + +opts = 0:2 +test(2283 + 0.01, levels=opts, + names(M[, c(m=lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(m=sdlist)))) +test(2283 + 0.02, levels=opts, + names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", "Mpg", sdnames)) +test(2283 + 0.03, levels=opts, + names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), + c("cyl", "Mpg", names(c(m=sdlist)))) +test(2283 + 0.04, levels=opts, + names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), + c("cyl", "mpg", names(c(mpg=sdlist)))) +test(2283 + 0.05, levels=opts, + names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", "V1", sdnames)) +test(2283 + 0.06, levels=opts, + names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), + c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) +test(2283 + 0.07, levels=opts, + names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), + c("cyl", sdnames, sdnames)) +test(2283 + 0.08, levels=opts, + names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), + c("cyl", names(c(mean=sdlist, sum=sdlist)))) +test(2283 + 0.09, levels=opts, + names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), + c("cyl", sdnames, names(c(sum=sdlist))) ) +test(2283 + 0.10, levels=opts, + names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), + c("cyl", names(c(" "=sdlist, "."=sdlist)))) +test(2283 + 0.11, levels=opts, + names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(a=0, b=0))), sdnames)) +test(2283 + 0.12, levels=opts, + names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0, 0))), sdnames)) +test(2283 + 0.13, levels=opts, + names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) +test(2283 + 0.14, levels=opts, + names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0))), sdnames)) +test(2283 + 0.15, levels=opts, + names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) +test(2283 + 0.16, levels=opts, + names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) +test(2283 + 0.17, levels=opts, + names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "b", "vs", "am")) +test(2283 + 0.18, levels=opts, + names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "b", "vs", "am")) +test(2283 + 0.19, levels=opts, + names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "V2", "b", "vs", "am")) + +# Test lapply(list(...), fun) optimization +dt = as.data.table(iris) +opt = c(0L, 1L) +out = c('GForce FALSE', 'lapply optimization changed j') +test(2284.1, levels=opt, dt[, lapply(list(Petal.Length, Sepal.Length), sum), by=Species, verbose=TRUE], output=out) +test(2284.2, levels=opt, names(dt[, lapply(list(Petal.Length, Sepal.Length), sum), by=Species])) +test(2284.3, levels=opt, dt[, c(list(sum(Petal.Length), sum(Sepal.Length))), by=Species, verbose=TRUE], output=out) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5540b212d..6c191e91f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13676,6 +13676,7 @@ DT = data.table(A=1:5, B=-3i, C=2147483647L) test(1968.2, storage.mode(DT$C), "integer") test(1968.3, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294)), warning="sum.*integer column.*more than type 'integer' can hold.*coerced to 'numeric'") +test(1968.35, DT[, sum(as.numeric(C)), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294))) DT[3,C:=NA] test(1968.4, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(NA, 4294967294)), warning="coerced to 'numeric'") test(1968.5, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 4294967294)), warning="coerced to 'numeric'") @@ -21213,3 +21214,18 @@ test(2344.03, setkey(d1[, .(V1, label = c("one", "zero", "one"), V2)][data.table # keep sub-key in case of multiple keys, even with new columns and changing column order DT = data.table(V1 = 1:2, V2 = 3:4, V3 = 5:6, key = c("V1", "V2", "V3")) test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", "V2")) + +# gforce should also work with Map in j #5336 +# conversions should not turn gforce off #2934 +# lapply gforce should also work without .SD #5032 +dt = data.table(a=1:4, b=1:2) +out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") +test(2345.01, levels=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) +test(2345.02, levels=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) +test(2345.03, levels=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) +test(2345.04, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.05, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +dt = data.table(a=1:2, b=1, c=1:4) +test(2345.06, levels=1:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=c("lapply optimization changed j", "GForce TRUE")) +test(2345.07, levels=1:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=c("lapply optimization changed j", "GForce TRUE")) diff --git a/tests/optimize.R b/tests/optimize.R new file mode 100644 index 000000000..69dc4954c --- /dev/null +++ b/tests/optimize.R @@ -0,0 +1,2 @@ +require(data.table) +test.data.table(script="optimize.Rraw") From 5410e315bcf1dee46ba2b1de63ac1c381801ec9a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 15:26:27 +0100 Subject: [PATCH 09/27] update man --- man/test.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/test.Rd b/man/test.Rd index 4a25a592c..5a2f459f2 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -22,7 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } -\item{levels}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize.level=level)} set accordingly. All levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y for equality. } +\item{levels}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=level)} set accordingly. All levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From dece1c65b279a2196946947780b8f010bc1f4bbb Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 15:32:29 +0100 Subject: [PATCH 10/27] merge tests --- inst/tests/optimize.Rraw | 8 -------- inst/tests/tests.Rraw | 6 ++++-- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 57d53f052..16a6bb5a4 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -445,11 +445,3 @@ test(2283 + 0.18, levels=opts, test(2283 + 0.19, levels=opts, names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "V2", "b", "vs", "am")) - -# Test lapply(list(...), fun) optimization -dt = as.data.table(iris) -opt = c(0L, 1L) -out = c('GForce FALSE', 'lapply optimization changed j') -test(2284.1, levels=opt, dt[, lapply(list(Petal.Length, Sepal.Length), sum), by=Species, verbose=TRUE], output=out) -test(2284.2, levels=opt, names(dt[, lapply(list(Petal.Length, Sepal.Length), sum), by=Species])) -test(2284.3, levels=opt, dt[, c(list(sum(Petal.Length), sum(Sepal.Length))), by=Species, verbose=TRUE], output=out) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6c191e91f..9f22e624f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21227,5 +21227,7 @@ dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) test(2345.04, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") test(2345.05, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") dt = data.table(a=1:2, b=1, c=1:4) -test(2345.06, levels=1:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=c("lapply optimization changed j", "GForce TRUE")) -test(2345.07, levels=1:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=c("lapply optimization changed j", "GForce TRUE")) +out = c("GForce FALSE", "lapply optimization changed j", "GForce TRUE") +test(2345.06, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2345.07, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2345.08, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) From 5e1789dd3c9898879b937433d8be671cd84cb210 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 15:37:37 +0100 Subject: [PATCH 11/27] polish test fun --- R/test.data.table.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index decb600bb..645addcd8 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -379,17 +379,16 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no for (p in vector_params) { val = get(p, envir=environment()) if (length(val) > 0L) { - cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # ccycle through values if fewer than levels + cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than levels } } if (compare && i == 1L) cl$y = eval(cl$x, parent.frame()) - eval(cl, parent.frame()) + eval(cl, parent.frame()) # actual test call } return(invisible()) } - # print(match.call()) if (!is.null(env)) { old = Sys.getenv(names(env), names=TRUE, unset=NA) to_unset = !lengths(env) From 62f1c483115caa295525af56bfe3b8a839a2b476 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 16:40:56 +0100 Subject: [PATCH 12/27] add arithmetic --- R/data.table.R | 66 ++++++++++++++++++++++++++++++------------- inst/tests/tests.Rraw | 5 ++++ 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 52686ed81..8ac34f833 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3344,31 +3344,59 @@ is_constantish = function(q, check_singleton=FALSE) { .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 + if (!is.call(q)) return(FALSE) # plain columns are not gforce-able since they might not aggregate (see test 104.1) + q1 = .get_gcall(q) - if (is.null(q1)) return(FALSE) - q2 = if (.is_type_conversion(q[[2L]]) && is.symbol(q[[2L]][[2L]])) q[[2L]][[2L]] else q[[2L]] - if (!q2 %chin% names(x) && q2 != ".I") return(FALSE) # 875 - if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]) && - !(is.symbol(q[[3L]]) && q[[3L]] %chin% names(x)))) return(TRUE) - switch(as.character(q1), - "shift" = .gshift_ok(q), - "weighted.mean" = .gweighted.mean_ok(q, x), - "tail" = , "head" = .ghead_ok(q), - "[[" = , "[" = `.g[_ok`(q, x, envir), - FALSE - ) + if (!is.null(q1)) { + q2 = if (.is_type_conversion(q[[2L]]) && is.symbol(q[[2L]][[2L]])) q[[2L]][[2L]] else q[[2L]] + if (!q2 %chin% names(x) && q2 != ".I") return(FALSE) # 875 + if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]) && + !(is.symbol(q[[3L]]) && q[[3L]] %chin% names(x)))) return(TRUE) + return(switch(as.character(q1), + "shift" = .gshift_ok(q), + "weighted.mean" = .gweighted.mean_ok(q, x), + "tail" = , "head" = .ghead_ok(q), + "[[" = , "[" = `.g[_ok`(q, x, envir), + FALSE + )) + } + + # check if arithmetic operator -> recursively validate ALL branches (like in AST) + if (is.symbol(q[[1L]]) && q[[1L]] %chin% c("+", "-", "*", "/", "^", "%%", "%/%")) { + for (i in 2:length(q)) { + if (!.gforce_ok(q[[i]], x, envir)) return(FALSE) + } + return(TRUE) + } + + FALSE } .gforce_jsub = function(q, names_x, envir=parent.frame(2L)) { - call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. - q[[1L]] = as.name(paste0("g", call_name)) - # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok - # do not evaluate vars present as columns in x - if (length(q) >= 3L) { - for (i in 3:length(q)) { - if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], envir) # tests 1187.2 & 1187.4 + if (!is.call(q)) return(q) + + q1 = .get_gcall(q) + if (!is.null(q1)) { + call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. + q[[1L]] = as.name(paste0("g", call_name)) + # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok + # do not evaluate vars present as columns in x + if (length(q) >= 3L) { + for (i in 3:length(q)) { + if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], envir) # tests 1187.2 & 1187.4 + } + } + return(q) + } + + # if arithmetic operator, recursively substitute its operands. we know what branches are valid from .gforce_ok + if (is.symbol(q[[1L]]) && q[[1L]] %chin% c("+", "-", "*", "/", "^", "%%", "%/%")) { + for (i in 2:length(q)) { + q[[i]] = .gforce_jsub(q[[i]], names_x, envir) } + return(q) } + # should not reach here since .gforce_ok q } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9f22e624f..2a58d0ee2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21218,6 +21218,7 @@ test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", # gforce should also work with Map in j #5336 # conversions should not turn gforce off #2934 # lapply gforce should also work without .SD #5032 +# support arithmetic in j with gforce #3815 dt = data.table(a=1:4, b=1:2) out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") test(2345.01, levels=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) @@ -21231,3 +21232,7 @@ out = c("GForce FALSE", "lapply optimization changed j", "GForce TRUE") test(2345.06, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) test(2345.07, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) test(2345.08, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) +dt = data.table(a=1:4, b=1:2) +out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") +test(2345.09, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +dt = data.table(a=1:4, b=1:2) From c47ec27e7039a47f444913d040089235878263cc Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 17:32:11 +0100 Subject: [PATCH 13/27] add AST walker and update tests --- R/data.table.R | 11 +++++++++-- inst/tests/tests.Rraw | 22 +++++++++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 8ac34f833..88d34510d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3342,9 +3342,12 @@ is_constantish = function(q, check_singleton=FALSE) { "as.complex", "as.logical", "as.Date", "as.POSIXct", "as.factor") } +.gforce_ops = c("+", "-", "*", "/", "^", "%%", "%/%") + .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 if (!is.call(q)) return(FALSE) # plain columns are not gforce-able since they might not aggregate (see test 104.1) + if (q %iscall% "(") return(.gforce_ok(q[[2L]], x, envir)) q1 = .get_gcall(q) if (!is.null(q1)) { @@ -3362,7 +3365,7 @@ is_constantish = function(q, check_singleton=FALSE) { } # check if arithmetic operator -> recursively validate ALL branches (like in AST) - if (is.symbol(q[[1L]]) && q[[1L]] %chin% c("+", "-", "*", "/", "^", "%%", "%/%")) { + if (is.symbol(q[[1L]]) && q[[1L]] %chin% .gforce_ops) { for (i in 2:length(q)) { if (!.gforce_ok(q[[i]], x, envir)) return(FALSE) } @@ -3374,6 +3377,10 @@ is_constantish = function(q, check_singleton=FALSE) { .gforce_jsub = function(q, names_x, envir=parent.frame(2L)) { if (!is.call(q)) return(q) + if (q %iscall% "(") { + q[[2L]] = .gforce_jsub(q[[2L]], names_x, envir) + return(q) + } q1 = .get_gcall(q) if (!is.null(q1)) { @@ -3390,7 +3397,7 @@ is_constantish = function(q, check_singleton=FALSE) { } # if arithmetic operator, recursively substitute its operands. we know what branches are valid from .gforce_ok - if (is.symbol(q[[1L]]) && q[[1L]] %chin% c("+", "-", "*", "/", "^", "%%", "%/%")) { + if (is.symbol(q[[1L]]) && q[[1L]] %chin% .gforce_ops) { for (i in 2:length(q)) { q[[i]] = .gforce_jsub(q[[i]], names_x, envir) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2a58d0ee2..f8db7b2d3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21219,20 +21219,20 @@ test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", # conversions should not turn gforce off #2934 # lapply gforce should also work without .SD #5032 # support arithmetic in j with gforce #3815 -dt = data.table(a=1:4, b=1:2) out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") +dt = data.table(a=1:4, b=1:2) test(2345.01, levels=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) test(2345.02, levels=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) -test(2345.03, levels=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +dt = data.table(a=1:4, b=1:2) +test(2345.11, levels=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +test(2345.12, levels=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) -test(2345.04, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") -test(2345.05, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.13, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.14, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") dt = data.table(a=1:2, b=1, c=1:4) -out = c("GForce FALSE", "lapply optimization changed j", "GForce TRUE") -test(2345.06, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) -test(2345.07, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) -test(2345.08, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) -dt = data.table(a=1:4, b=1:2) -out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") -test(2345.09, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2345.21, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2345.22, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2345.23, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) +test(2345.31, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2345.32, levels=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) From 1d324d60b991547bbedfa944e0ceaeb3b894041f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 18:16:34 +0100 Subject: [PATCH 14/27] add tests --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 88d34510d..155e1f74e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3346,7 +3346,7 @@ is_constantish = function(q, check_singleton=FALSE) { .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 - if (!is.call(q)) return(FALSE) # plain columns are not gforce-able since they might not aggregate (see test 104.1) + if (!is.call(q)) return(is.numeric(q)) # plain columns are not gforce-able since they might not aggregate (see test 104.1) if (q %iscall% "(") return(.gforce_ok(q[[2L]], x, envir)) q1 = .get_gcall(q) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f8db7b2d3..2449e0cf9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21236,3 +21236,7 @@ test(2345.23, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) test(2345.31, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) test(2345.32, levels=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) +test(2345.33, levels=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) +test(2345.34, levels=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) +test(2345.35, levels=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) +test(2345.36, levels=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") From 22cf35eb5b897eda3c8e576589bd0e08b4c1d045 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 19:42:11 +0100 Subject: [PATCH 15/27] add NEWS --- NEWS.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 3ae0dc33f..943597125 100644 --- a/NEWS.md +++ b/NEWS.md @@ -296,7 +296,16 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T # user system elapsed # 0.028 0.000 0.005 ``` - 20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. + +20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. + +21. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. + + This rewrite also introduces several new optimizations: + - Enables Map instead of lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) + - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))` [#5032](https://github.com/Rdatatable/data.table/issues/5032) + - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))`) [#2934](https://github.com/Rdatatable/data.table/issues/2934) + - Arithmetic operation support in GForce (e.g., `max(x) - min(x)`) [#3815](https://github.com/Rdatatable/data.table/issues/3815) ### BUG FIXES From 25a7e2e4a2540e80f950ec87fec4a5b4319863d0 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 09:45:52 +0100 Subject: [PATCH 16/27] make function name in massageSD more expressive --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 155e1f74e..41d0b5246 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -158,7 +158,7 @@ replace_dot_alias = function(e) { # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!) - thisfun = paste0("..FUN", funi) # Fix for #985 + thisfun = paste0("..LAPPLY_FUN", funi) # Fix for #985 assign(thisfun, eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD lockBinding(thisfun, SDenv) txt[[1L]] = as.name(thisfun) From eb8056cb12fdf7319333b69eef626f6cda3478d7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 10:35:35 +0100 Subject: [PATCH 17/27] rename levels argument to optimization --- R/test.data.table.R | 16 +- inst/tests/benchmark.Rraw | 6 +- inst/tests/optimize.Rraw | 336 +++++++++++++++++++------------------- inst/tests/tests.Rraw | 200 +++++++++++------------ man/test.Rd | 2 +- 5 files changed, 280 insertions(+), 280 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 645addcd8..fa2cb426c 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -361,25 +361,25 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,levels=NULL) { - # if levels is provided, test across multiple optimization levels - if (!is.null(levels)) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimization=NULL) { + # if optimization is provided, test across multiple optimization levels + if (!is.null(optimization)) { cl = match.call() - cl$levels = NULL # Remove levels from the recursive call + cl$optimization = NULL # Remove optimization levels from the recursive call vector_params = c("error", "warning", "message", "output", "notOutput", "ignore.warning") # Check if y was explicitly provided (not just the default) y_provided = !missing(y) - compare = !y_provided && length(levels)>1L && !any(vapply_1b(vector_params, function(p) length(get(p, envir=environment())) > 0L)) + compare = !y_provided && length(optimization)>1L && !any(vapply_1b(vector_params, function(p) length(get(p, envir=environment())) > 0L)) - for (i in seq_along(levels)) { + for (i in seq_along(optimization)) { cl$num = num + (i - 1L) * 1e-6 - opt_level = list(datatable.optimize = levels[i]) + opt_level = list(datatable.optimize = optimization[i]) cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level for (p in vector_params) { val = get(p, envir=environment()) if (length(val) > 0L) { - cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than levels + cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels } } diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 52a91c723..cdc541559 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -193,10 +193,10 @@ test(301.1, nrow(DT[,sum(B),by=C])==100010) set.seed(1) DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") opt = c(0L,2L) -test(637.1, levels=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) -test(637.2, levels=opt, key(copy(DT)[J(43L), a:=99L]), NULL) +optimization=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) +optimization=opt, key(copy(DT)[J(43L), a:=99L]), NULL) setkey(DT, a) -test(637.3, levels=opt, key(copy(DT)[, a:=99L, by=a]), NULL) +optimization=opt, key(copy(DT)[, a:=99L, by=a]), NULL) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 16a6bb5a4..cade0c60c 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -19,34 +19,34 @@ for (s in sugg) { # := by group DT = data.table(a=1:3,b=(1:9)/10) -test(611.1, levels=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +test(611.1,optimization=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) setkey(DT,a) -test(611.2, levels=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +test(611.2,optimization=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) # Combining := by group with i -test(611.3, levels=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(611.4, levels=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +test(611.3,optimization=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4,optimization=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # 612 was just level repetition of 611 # Assign to subset ok (NA initialized in the other items) ok : -test(613, levels=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) -test(614, levels=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615, levels=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) +test(613,optimization=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) +test(614,optimization=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) +test(615,optimization=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) # Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 ans = copy(DT)[,r:=NA_real_] -test(618.1, levels=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) -test(618.2, levels=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) -test(618.3, levels=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) -test(618.4, levels=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) -test(618.5, levels=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) +test(618.1,optimization=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) +test(618.2,optimization=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3,optimization=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) +test(618.4,optimization=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) +test(618.5,optimization=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) # test 619 was level 2 of 618 DT = data.table(x=letters, key="x") -test(621, levels=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained -test(622, levels=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") +test(621,optimization=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained +test(622,optimization=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) -test(623, levels=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +test(623,optimization=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Setup for test 656.x - gforce tests set.seed(9) @@ -57,36 +57,36 @@ DT = data.table(grp1=sample.int(150L, n, replace=TRUE), y=rnorm(n)) opt = 0:2 out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') -test(656.1, levels=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) -test(656.2, levels=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) -test(656.3, levels=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) +test(656.1,optimization=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) +test(656.2,optimization=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) +test(656.3,optimization=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) # Test := keyby does setkey, #2065 DT = data.table(x=1:2, y=1:6) ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670.1, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) -test(670.2, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) -test(670.3, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), +test(670.1,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) +test(670.2,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) +test(670.3,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -test(670.4, levels=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -test(670.5, levels=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +test(670.4,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +test(670.5,optimization=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") # test 671 was level 2 of 670 # varname holding colnames, by group, linked from #2120. DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" -test(751, levels=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +test(751,optimization=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) # Add tests for nested := in j by group, #1987 DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752, levels=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +test(752,optimization=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) opt = c(0:2, Inf) out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') # v1.9.7 treats wrapped {} better, so this is now optimized -test(865, levels=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) -test(867, levels=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here +test(865,optimization=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) +test(867,optimization=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here # tests of gsum and gmean with NA DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) @@ -96,82 +96,82 @@ set(DT,10:12,"y",NA) set(DT,10:12,"v",NA) opt = c(1L, 2L) out = c("(GForce FALSE)", "GForce optimized j to") -test(1184.1, levels=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) -test(1184.2, levels=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1185.2, levels=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], +test(1184.1,optimization=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) +test(1184.2,optimization=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1185.2,optimization=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) -test(1187.1, levels=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], +test(1187.1,optimization=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) MyVar = TRUE -test(1187.2, levels=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.3, levels=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +test(1187.2,optimization=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.3,optimization=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) MyVar = FALSE -test(1187.4, levels=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.5, levels=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +test(1187.4,optimization=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.5,optimization=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) # GForce should not turn on when the .ok function isn't triggered -test(1187.6, levels=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], +test(1187.6,optimization=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), output='j unchanged', warning="'trim' is not yet optimized") # FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. set.seed(2L) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) -test(1304.1, levels=0:2, dt[, list(.N, sum(y)), by=x]) +test(1304.1,optimization=0:2, dt[, list(.N, sum(y)), by=x]) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") -test(1304.2, levels=0:2, dt[, list(.N, sum(y)), by=x]) +test(1304.2,optimization=0:2, dt[, list(.N, sum(y)), by=x]) # gmin and gmax extensive testing (because there are tricky cases) DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) opts = 0:2 # for integers -test(1313.01, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.02, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.03, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.04, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.01,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.02,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.03,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.04,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := INT(NA)] -test(1313.05, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.06, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) -test(1313.08, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) +test(1313.05,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.06,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.07,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) # for numeric DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) -test(1313.09, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.10, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.11, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.12, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.09,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.10,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.11,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.12,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := NA_real_] -test(1313.13, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.14, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) -test(1313.16, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) +test(1313.13,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.14,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.15,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) # for date (attribute check.. especially after issues/689 !!!) DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) -test(1313.17, levels=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.18, levels=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) +test(1313.17,optimization=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.18,optimization=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) DT[c(1,6), y := NA] -test(1313.19, levels=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.20, levels=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) -test(1313.21, levels=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) -test(1313.22, levels=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) +test(1313.19,optimization=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.20,optimization=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) +test(1313.21,optimization=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) +test(1313.22,optimization=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) # for character set.seed(1L) DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) DT[x==7, y := c("","b","c")] -test(1313.23, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.24, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.25, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.26, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.23,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.24,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.25,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.26,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) DT[x==6, y := NA_character_] -test(1313.27, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.28, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) -test(1313.30, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) +test(1313.27,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.28,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.29,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) # Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now dt = data.table(a=sample(3,20,TRUE), b=1:10) -test(1565.1, levels=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], +test(1565.1,optimization=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], output=c("All optimizations are turned off", "lapply optimization is on, j unchanged", "GForce optimized j to")) # gforce optimisations @@ -188,32 +188,32 @@ opt = 0:2 out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') # make sure gforce is on # testing gforce::gmedian -test(1579.01, levels=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], +test(1579.01,optimization=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x], output="GForce optimized") -test(1579.02, levels=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], +test(1579.02,optimization=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) -test(1579.03, levels=2L, dt[, lapply(.SD, median), keyby=x], +test(1579.03,optimization=2L, dt[, lapply(.SD, median), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) -test(1579.04, levels=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], +test(1579.04,optimization=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) # testing gforce::ghead and gforce::gtail # head(.SD, 1) and tail(.SD, 1) optimisation -test(1579.06, levels=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.08, levels=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.10, levels=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.12, levels=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) -test(1579.14, levels=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.16, levels=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.18, levels=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.20, levels=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.06,optimization=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.08,optimization=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.10,optimization=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.12,optimization=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.14,optimization=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.16,optimization=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.18,optimization=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.20,optimization=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) # 1579.22 tested gtail with n>1; now 1579.4+ below mysub <- function(x, n) x[n] -test(1579.23, levels=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") -test(1579.24, levels=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.25, levels=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.26, levels=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27, levels=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 -test(1579.28, levels=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) +test(1579.23,optimization=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24,optimization=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25,optimization=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26,optimization=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27,optimization=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28,optimization=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) # gforce head/tail for n>1, #5060 set.seed(99) DT = data.table(x = sample(letters[1:5], 20, TRUE), @@ -223,16 +223,16 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE), s = sample(c("foo","bar",NA), 20, TRUE), l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] -test(1579.401, levels=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize -test(1579.402, levels=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") -test(1579.403, levels=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") -test(1579.404, levels=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") -test(1579.405, levels=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") -test(1579.406, levels=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") -test(1579.407, levels=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") -test(1579.408, levels=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") -test(1579.409, levels=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(1579.410, levels=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") +test(1579.401,optimization=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402,optimization=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403,optimization=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404,optimization=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405,optimization=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406,optimization=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407,optimization=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408,optimization=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409,optimization=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410,optimization=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") # FR #971, partly addressed (only subsets in 'i') @@ -242,7 +242,7 @@ dt = data.table(x = sample(letters, 300, TRUE), d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) opt = 1:2 out = c("GForce FALSE","GForce optimized j") -test(1581.01, levels=opt, ans1 <- dt[x %in% letters[15:20], +test(1581.01,optimization=opt, ans1 <- dt[x %in% letters[15:20], c(.N, lapply(.SD, sum, na.rm=TRUE), lapply(.SD, min, na.rm=TRUE), lapply(.SD, max, na.rm=TRUE), @@ -252,11 +252,11 @@ test(1581.01, levels=opt, ans1 <- dt[x %in% letters[15:20], output = out) # subsets in 'i' for head and tail -test(1581.04, levels=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) -test(1581.07, levels=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1581.10, levels=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) +test(1581.04,optimization=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.07,optimization=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.10,optimization=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) # #3209 g[[ -test(1581.13, levels=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) +test(1581.13,optimization=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) # also, block for non-atomic input, #4159 dt = data.table(a=1:3) dt[ , l := .(list(1, 2, 3))] @@ -284,16 +284,16 @@ DT = data.table( C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) opt = 0:2 -test(1583.1, levels=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], +test(1583.1,optimization=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], data.table(C1=c("A","A","B","B","C","C"), C2=c("a","b","b","c","c","d"), agg=c(1,4,5,8,9,10))) # extra test with a size-1 group containing one NaN too DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -test(1583.2, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) -test(1583.3, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) -test(1583.4, levels=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) -test(1583.5, levels=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.2,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4,optimization=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5,optimization=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) # FR #523, var, sd and prod DT = data.table(x=sample(5, 100, TRUE), @@ -303,24 +303,24 @@ DT = data.table(x=sample(5, 100, TRUE), z2=sample(c(runif(10),NA,NaN), 100, TRUE)) opt = 0:2 out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(1594.01, levels=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) -test(1594.02, levels=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) -test(1594.03, levels=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.01,optimization=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) +test(1594.02,optimization=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) +test(1594.03,optimization=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) # coverage: default group .N=1 case idx=DT[ , .I[1L], by=x]$V1 ans=data.table(x=DT[(idx), x], V1=NA_real_) -test(1594.05, levels=opt, DT[(idx), var(y1), by=x], ans) -test(1594.06, levels=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) -test(1594.07, levels=opt, DT[(idx), var(z1), by=x], ans) -test(1594.08, levels=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) +test(1594.05,optimization=opt, DT[(idx), var(y1), by=x], ans) +test(1594.06,optimization=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) +test(1594.07,optimization=opt, DT[(idx), var(z1), by=x], ans) +test(1594.08,optimization=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) -test(1594.09, levels=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) -test(1594.10, levels=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) -test(1594.11, levels=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.09,optimization=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) +test(1594.10,optimization=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) +test(1594.11,optimization=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) -test(1594.12, levels=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) -test(1594.13, levels=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) -test(1594.14, levels=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.12,optimization=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) +test(1594.13,optimization=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) +test(1594.14,optimization=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) # when datatable.optimize<1, no optimisation of j should take place: dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) @@ -330,53 +330,53 @@ test(1638, options=c(datatable.optimize=0L), dt[, .SD, by=z, verbose=TRUE], outp DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) opt = c(1L,2L) out = c("GForce FALSE", "GForce optimized j to") -test(2231.01, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) -test(2231.02, levels=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) -test(2231.03, levels=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.01,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) +test(2231.02,optimization=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) +test(2231.03,optimization=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) # multiple groups DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.04, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.05, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.06, levels=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) +test(2231.04,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.05,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.06,optimization=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) # (only x XOR w) containing NA DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.07, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.08, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) -test(2231.09, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.10, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.07,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.08,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) +test(2231.09,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.10,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (only x XOR w) containing NaN DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.11, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) -test(2231.12, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.13, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) -test(2231.14, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.11,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) +test(2231.12,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.13,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) +test(2231.14,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (only x XOR w) containing NA and NaN DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.15, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.16, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.17, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.18, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.15,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.16,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.17,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.18,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (x and w) containing NA and NaN DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.19, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.20, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.19,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.20,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.21, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.22, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.21,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.22,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # let wrongly named arguments get lost in ellipsis #5543 DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.61, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) -test(2231.62, levels=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) -test(2231.63, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) -test(2231.64, levels=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) +test(2231.61,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) +test(2231.62,optimization=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.63,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) +test(2231.64,optimization=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) # GForce retains attributes in by arguments #5567 dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) opt = c(0,Inf) out = c("GForce FALSE", "GForce optimized j to") -test(2263.1, levels=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) -test(2263.2, levels=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) -test(2263.3, levels=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) +test(2263.1,optimization=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) +test(2263.2,optimization=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) +test(2263.3,optimization=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) # named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 M <- as.data.table(mtcars) @@ -388,60 +388,60 @@ sdlist <- vector("list", length(sdnames)) names(sdlist) <- sdnames opts = 0:2 -test(2283 + 0.01, levels=opts, +test(2283 + 0.01, optimization=opts, names(M[, c(m=lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(m=sdlist)))) -test(2283 + 0.02, levels=opts, +test(2283 + 0.02, optimization=opts, names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", "Mpg", sdnames)) -test(2283 + 0.03, levels=opts, +test(2283 + 0.03, optimization=opts, names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), c("cyl", "Mpg", names(c(m=sdlist)))) -test(2283 + 0.04, levels=opts, +test(2283 + 0.04, optimization=opts, names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), c("cyl", "mpg", names(c(mpg=sdlist)))) -test(2283 + 0.05, levels=opts, +test(2283 + 0.05, optimization=opts, names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", "V1", sdnames)) -test(2283 + 0.06, levels=opts, +test(2283 + 0.06, optimization=opts, names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) -test(2283 + 0.07, levels=opts, +test(2283 + 0.07, optimization=opts, names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), c("cyl", sdnames, sdnames)) -test(2283 + 0.08, levels=opts, +test(2283 + 0.08, optimization=opts, names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), c("cyl", names(c(mean=sdlist, sum=sdlist)))) -test(2283 + 0.09, levels=opts, +test(2283 + 0.09, optimization=opts, names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), c("cyl", sdnames, names(c(sum=sdlist))) ) -test(2283 + 0.10, levels=opts, +test(2283 + 0.10, optimization=opts, names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), c("cyl", names(c(" "=sdlist, "."=sdlist)))) -test(2283 + 0.11, levels=opts, +test(2283 + 0.11, optimization=opts, names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(a=0, b=0))), sdnames)) -test(2283 + 0.12, levels=opts, +test(2283 + 0.12, optimization=opts, names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0, 0))), sdnames)) -test(2283 + 0.13, levels=opts, +test(2283 + 0.13, optimization=opts, names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) -test(2283 + 0.14, levels=opts, +test(2283 + 0.14, optimization=opts, names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0))), sdnames)) -test(2283 + 0.15, levels=opts, +test(2283 + 0.15, optimization=opts, names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) -test(2283 + 0.16, levels=opts, +test(2283 + 0.16, optimization=opts, names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) -test(2283 + 0.17, levels=opts, +test(2283 + 0.17, optimization=opts, names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "b", "vs", "am")) -test(2283 + 0.18, levels=opts, +test(2283 + 0.18, optimization=opts, names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "b", "vs", "am")) -test(2283 + 0.19, levels=opts, +test(2283 + 0.19, optimization=opts, names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "V2", "b", "vs", "am")) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2449e0cf9..869616111 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3890,7 +3890,7 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) -test(1133.7, levels=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +test(1133.7,optimization=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -3993,8 +3993,8 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -test(1143.4, levels=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.5, levels=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.4,optimization=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5,optimization=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) @@ -4465,16 +4465,16 @@ test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") # turn off GForce, to test dogroups -test(1230, levels=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1230,optimization=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") setkey(DT,NULL) -test(1231, levels=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") -test(1232, levels=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") -test(1233, levels=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") -test(1234, levels=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized +test(1231,optimization=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1232,optimization=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") +test(1233,optimization=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") +test(1234,optimization=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized setkey(DT,a) -test(1235, levels=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -test(1236, levels=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") -test(1237, levels=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1235,optimization=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1236,optimization=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") +test(1237,optimization=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") # check that key is not preserved when length of fastorder is > 0 DT <- data.table(x=1:5, y=6:10, key="x") @@ -4741,39 +4741,39 @@ dt = data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d dt2 = data.table(x=c(1,1,1,2,2,2), y=1:6) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01, levels=0L, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.02, levels=0L, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.03, levels=0L, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.04, levels=0L, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.05, levels=0L, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.01,optimization=0L, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.02,optimization=0L, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.03,optimization=0L, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.04,optimization=0L, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.05,optimization=0L, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # newly added tests for #861 -- optimise, but no GForce -test(1268.06, levels=0L, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.06,optimization=0L, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # don't optimise .I in c(...) -test(1268.07, levels=0L, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") +test(1268.07,optimization=0L, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") -test(1268.08, levels=1L, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") -test(1268.09, levels=1L, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.10, levels=1L, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') -test(1268.11, levels=1L, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.12, levels=1L, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.13, levels=1L, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.14, levels=1L, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.08,optimization=1L, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") +test(1268.09,optimization=1L, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.10,optimization=1L, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') +test(1268.11,optimization=1L, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.12,optimization=1L, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.13,optimization=1L, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.14,optimization=1L, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.15, levels=Inf, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, +test(1268.15,optimization=Inf, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'") -test(1268.16, levels=Inf, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, +test(1268.16,optimization=Inf, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'") -test(1268.17, levels=Inf, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, +test(1268.17,optimization=Inf, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))") -test(1268.18, levels=Inf, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, +test(1268.18,optimization=Inf, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.19, levels=Inf, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, +test(1268.19,optimization=Inf, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.20, levels=Inf, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, +test(1268.20,optimization=Inf, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'") -test(1268.21, levels=Inf, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, +test(1268.21,optimization=Inf, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'") -test(1268.22, levels=Inf, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.22,optimization=Inf, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") @@ -6278,7 +6278,7 @@ test(1437.17, DT[!a %chin% c("A", "B") & x == 2], DT[c(4, 5, 6)]) test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized subsetting") test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting") ## optimize option level 3 is required to get optimized subsetting -test(1437.21, levels=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) +test(1437.21,optimization=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) ## NaN on right hand side is treated correctly. NA on right hand side is not reaching .prepareFastSubset, so not tested here DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN)) test(1437.23, DT[y == NaN], DT[0]) @@ -6364,16 +6364,16 @@ for(t in seq_len(nrow(all))){ ## test the query with missing j thisQuery <- all$query[t] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery))]) + test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery))]) ## repeat the test with 'which = TRUE' test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) + test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" for(thisBy in bys){ test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) + test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) } } } @@ -8778,13 +8778,13 @@ test(1629.07, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3]) dt2 = copy(dt) test(1629.08, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)]) # grouping operations -test(1629.09, levels=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) +test(1629.09,optimization=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) ans1 = dt[, base::sum(v1), by=grp] ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3] -test(1629.10, levels=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) -test(1629.11, levels=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], +test(1629.10,optimization=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) +test(1629.11,optimization=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp]) -test(1629.12, levels=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) +test(1629.12,optimization=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # gforce # even more complex, shouldn't run any optimisation dt[, v4 := v1/2] @@ -12756,8 +12756,8 @@ DT[ , V1:=as.ordered(V1)] test(1918.3, DT[, min(V1)], structure(1L, .Label = lev, class = c("ordered", "factor"))) test(1918.4, DT[, max(V1)], structure(5L, .Label = lev, class = c("ordered", "factor"))) ## make sure GForce is activated -test(1918.5, levels=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) -test(1918.6, levels=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) +test(1918.5,optimization=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) +test(1918.6,optimization=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) # as.ITime.character bug for NA handling #2940 test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) @@ -13640,11 +13640,11 @@ suppressWarnings(rm(`___data.table_internal_test_1967.68___`)) test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot find symbol') ### [.data.table verbosity & non-equi-join tests -test(1967.69, levels=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') -test(1967.71, levels=1L, x[order(a), .N, verbose = TRUE], 5L, +test(1967.69,optimization=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') +test(1967.71,optimization=1L, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") setkey(x) -test(1967.72, levels=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, +test(1967.72,optimization=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") x = data.table( @@ -13798,8 +13798,8 @@ test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.1, levels=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.2, levels=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +test(1981.1,optimization=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) +test(1981.2,optimization=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # 1982 moved to benchmark.Rraw, #5517 @@ -13831,7 +13831,7 @@ test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*t test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error="The items in the 'by' or 'keyby' list have lengths [1, 2]. Each must be length 10; the same length as there are rows in x (after subsetting if i is provided).") -test(1984.10, levels=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], +test(1984.10,optimization=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], data.table(a = c(1, 0), V1 = c(1, 1)), output = 'Optimization is on but left j unchanged') DT[ , f := rep(1:2, each = 5)] @@ -14717,10 +14717,10 @@ test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], DT = data.table(x=INT(1,1,1,2,2), y=1:5) z = 1:5 opt = c(Inf,1L,0L) -test(2043.1, levels=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) -test(2043.4, levels=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) +test(2043.1,optimization=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) +test(2043.4,optimization=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) z = 1:4 -test(2043.5, levels=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z +test(2043.5,optimization=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z # test type coercion in joins, #2592 dt1 <- data.table(int = 1L:10L, @@ -15883,10 +15883,10 @@ test(2098.1, DT[do.call(order, mget(groups)), verbose=TRUE], ans<-data.table(id= test(2098.2, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, output=out) test(2098.3, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) test(2098.4, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -test(2098.5, levels=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") -test(2098.6, levels=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") -test(2098.7, levels=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) -test(2098.8, levels=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) +test(2098.5,optimization=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") +test(2098.6,optimization=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") +test(2098.7,optimization=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) +test(2098.8,optimization=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) # Error in update join when joining on factor, #3559 d1 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = c(1L, NA, 3L, NA)) @@ -17311,14 +17311,14 @@ test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i # gmin/gmax for integer64, #4444 if (test_bit64) { DT = data.table(grp=c(1L, 1L, 1L, 2L), i64=as.integer64(c(NA, 1:3))) - test(2208.1, levels=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.2, levels=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) - test(2208.3, levels=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.4, levels=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) + test(2208.1,optimization=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.2,optimization=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) + test(2208.3,optimization=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.4,optimization=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) # create an all-NA group DT[, i64:=rev(i64)] - test(2208.7, levels=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) - test(2208.8, levels=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) + test(2208.7,optimization=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) + test(2208.8,optimization=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) } # when user supplies dec=',' don't try sep=',', #4483 @@ -17538,12 +17538,12 @@ funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L - test(2220.0 + testnum*0.01, levels=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + test(2220.0 + testnum*0.01, optimization=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") testnum = testnum + 1L - test(2220.0 + testnum*0.01, levels=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + test(2220.0 + testnum*0.01, optimization=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } testnum = testnum + 1L -test(2220.0 + testnum*0.01, levels=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +test(2220.0 + testnum*0.01, optimization=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -17580,7 +17580,7 @@ for (col in names(DT)[-1]) { for (type in c('lag','lead','shift','cyclic')) { # fill is tested by group in tests 2218.*; see comments in #5205 # sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517 - test(2224.1+i/10000, levels=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE + test(2224.1+i/10000, optimization=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), ans[[i]]) i = i+1L @@ -17685,15 +17685,15 @@ test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") DT = data.table(a=1:3,b=(1:9)/10) opt = 0:2 out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(2233.01, levels=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) +test(2233.01,optimization=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) # GForce returning full length -test(2233.02, levels=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) +test(2233.02,optimization=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) # GForce neither returning 1 per group nor full length -test(2233.03, levels=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") +test(2233.03,optimization=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") # compare to non GForce version DT = data.table(a=1:3,b=(1:9)/10) -test(2233.04, levels=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) -test(2233.05, levels=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) +test(2233.04,optimization=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) +test(2233.05,optimization=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) # with key and grouping by key DT = data.table(a=1:3,b=(1:9)/10, key="a") @@ -17768,17 +17768,17 @@ opt = c(0,Inf) out = c("GForce FALSE", "GForce.*gsum") B = copy(a) A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle -test(2233.391, levels=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= +test(2233.391,optimization=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= setorder(A, id1) -test(2233.392, levels=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) -test(2233.393, levels=opt, any(A[,t1!=t2]), FALSE) -test(2233.394, levels=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) -test(2233.395, levels=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) +test(2233.392,optimization=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) +test(2233.393,optimization=opt, any(A[,t1!=t2]), FALSE) +test(2233.394,optimization=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) +test(2233.395,optimization=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) # test from #5337 n=4; k=2 mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] -test(2233.41, levels=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) +test(2233.41,optimization=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) # test from #5345 set.seed(1) DT = data.table( @@ -17789,7 +17789,7 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -test(2233.43, levels=c(0,Inf), options = list(datatable.verbose=TRUE), +test(2233.43,optimization=c(0,Inf), options = list(datatable.verbose=TRUE), copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") ][, n_idT :=dim(.SD)[[1]], by=list(t, id) ][, sum_v2_id :=sum(v2), by=.(id) @@ -17941,12 +17941,12 @@ i=c(1,2) j=1L opt = c(1L,2L) out = c("GForce FALSE", "GForce TRUE") -test(2243.41, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.42, levels=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) -test(2243.54, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) -test(2243.55, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") +test(2243.41,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.42,optimization=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) +test(2243.54,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) +test(2243.55,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") @@ -18403,7 +18403,7 @@ test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) # Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL DT = data.table(a = 1, b = 2) -test(2270, levels=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") +test(2270,optimization=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") # Missing newline in verbose output -> harder to read DT1 = data.table(a=1:2) @@ -21221,22 +21221,22 @@ test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", # support arithmetic in j with gforce #3815 out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") dt = data.table(a=1:4, b=1:2) -test(2345.01, levels=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) -test(2345.02, levels=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) +test(2345.01,optimization=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) +test(2345.02,optimization=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) dt = data.table(a=1:4, b=1:2) -test(2345.11, levels=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) -test(2345.12, levels=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") +test(2345.11,optimization=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +test(2345.12,optimization=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) -test(2345.13, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") -test(2345.14, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.13,optimization=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.14,optimization=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") dt = data.table(a=1:2, b=1, c=1:4) -test(2345.21, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) -test(2345.22, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) -test(2345.23, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) +test(2345.21,optimization=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2345.22,optimization=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2345.23,optimization=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) -test(2345.31, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) -test(2345.32, levels=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) -test(2345.33, levels=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) -test(2345.34, levels=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) -test(2345.35, levels=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) -test(2345.36, levels=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") +test(2345.31,optimization=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2345.32,optimization=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) +test(2345.33,optimization=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) +test(2345.34,optimization=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) +test(2345.35,optimization=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) +test(2345.36,optimization=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") diff --git a/man/test.Rd b/man/test.Rd index 5a2f459f2..f9d3e4efa 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -22,7 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } -\item{levels}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=level)} set accordingly. All levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } +\item{optimization}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimization)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From 4544398e0b5ff8f7e8b3a4ef55c32fd242cfee63 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 11:22:00 +0100 Subject: [PATCH 18/27] update docs --- man/test.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/test.Rd b/man/test.Rd index f9d3e4efa..2e216041c 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL, env = NULL, levels = NULL) + options = NULL, env = NULL, optimization = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } From d40edb82c4c23da141a2bcc797fbc85208f6d51e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 11:46:16 +0100 Subject: [PATCH 19/27] restore test nums --- inst/tests/benchmark.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index cdc541559..0132e2995 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -193,10 +193,10 @@ test(301.1, nrow(DT[,sum(B),by=C])==100010) set.seed(1) DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") opt = c(0L,2L) -optimization=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) -optimization=opt, key(copy(DT)[J(43L), a:=99L]), NULL) +test(637.1, optimization=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) +test(637.2, optimization=opt, key(copy(DT)[J(43L), a:=99L]), NULL) setkey(DT, a) -optimization=opt, key(copy(DT)[, a:=99L, by=a]), NULL) +test(637.3, optimization=opt, key(copy(DT)[, a:=99L, by=a]), NULL) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes From 5e7efb73faf7ce4d83c3ec0790434eb3508cd770 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 11:49:47 +0100 Subject: [PATCH 20/27] remove double tests --- inst/tests/tests.Rraw | 2 -- 1 file changed, 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 869616111..70cbf36ac 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3995,8 +3995,6 @@ DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.1 test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) test(1143.4,optimization=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.5,optimization=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 From 3826927933e72c718237ebf16238422889f93800 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 12:02:12 +0100 Subject: [PATCH 21/27] simplify tests --- inst/tests/tests.Rraw | 45 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 70cbf36ac..db7ed84e6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4738,43 +4738,24 @@ set.seed(45L) dt = data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d=sample(5,10,TRUE)) dt2 = data.table(x=c(1,1,1,2,2,2), y=1:6) +gf_out = c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "GForce optimized j to") +lp_out = c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization changed j") +opt = c(0L, 1L, Inf) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01,optimization=0L, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.02,optimization=0L, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.03,optimization=0L, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.04,optimization=0L, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.05,optimization=0L, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.01,optimization=opt, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = gf_out) +test(1268.02,optimization=opt, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = lp_out) +test(1268.03,optimization=opt, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.04,optimization=opt, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output= gf_out) +test(1268.05,optimization=opt, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) # newly added tests for #861 -- optimise, but no GForce -test(1268.06,optimization=0L, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.06,optimization=opt, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) # don't optimise .I in c(...) -test(1268.07,optimization=0L, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") - -test(1268.08,optimization=1L, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") -test(1268.09,optimization=1L, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.10,optimization=1L, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') -test(1268.11,optimization=1L, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.12,optimization=1L, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.13,optimization=1L, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.14,optimization=1L, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") - -test(1268.15,optimization=Inf, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, - output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'") -test(1268.16,optimization=Inf, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, - output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'") -test(1268.17,optimization=Inf, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, - output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))") -test(1268.18,optimization=Inf, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, - output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.19,optimization=Inf, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, - output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.20,optimization=Inf, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, - output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'") -test(1268.21,optimization=Inf, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, - output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'") -test(1268.22,optimization=Inf, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.07,optimization=opt, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], + output= c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization is on, j unchanged as")) +# tests .08-.21 were different optimization levels +test(1268.22,optimization=opt, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") - ### FR #2722 tests end here ### # Wide range numeric and integer64, to test all bits From 982343fa6f4df114f413c643de1dbc15b08ef0ee Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 4 Nov 2025 08:09:14 +0100 Subject: [PATCH 22/27] phrasing Co-authored-by: Michael Chirico --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 943597125..771ea9e3c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -302,7 +302,7 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 21. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. This rewrite also introduces several new optimizations: - - Enables Map instead of lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) + - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))` [#5032](https://github.com/Rdatatable/data.table/issues/5032) - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))`) [#2934](https://github.com/Rdatatable/data.table/issues/2934) - Arithmetic operation support in GForce (e.g., `max(x) - min(x)`) [#3815](https://github.com/Rdatatable/data.table/issues/3815) From 1e6ad035b3ec0b67e200eccf150f1d4c5aafefb8 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:22:56 +0100 Subject: [PATCH 23/27] use mget for all vector params --- R/test.data.table.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index fa2cb426c..95a3cb444 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -367,19 +367,19 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no cl = match.call() cl$optimization = NULL # Remove optimization levels from the recursive call - vector_params = c("error", "warning", "message", "output", "notOutput", "ignore.warning") # Check if y was explicitly provided (not just the default) y_provided = !missing(y) - compare = !y_provided && length(optimization)>1L && !any(vapply_1b(vector_params, function(p) length(get(p, envir=environment())) > 0L)) + vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment()) + compare = !y_provided && length(optimization)>1L && !any(lengths(vector_params)) for (i in seq_along(optimization)) { cl$num = num + (i - 1L) * 1e-6 opt_level = list(datatable.optimize = optimization[i]) cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level - for (p in vector_params) { - val = get(p, envir=environment()) + for (param in names(vector_params)) { + val = vector_params[[param]] if (length(val) > 0L) { - cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels + cl[[param]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels } } From 9e1297e558c8c1fa270abbc92cf232b1f217ea88 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:25:24 +0100 Subject: [PATCH 24/27] rename optimization parameter --- R/test.data.table.R | 4 +- inst/tests/benchmark.Rraw | 6 +- inst/tests/optimize.Rraw | 336 +++++++++++++++++++------------------- inst/tests/tests.Rraw | 172 +++++++++---------- man/test.Rd | 4 +- 5 files changed, 261 insertions(+), 261 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 95a3cb444..af39e3a09 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -361,11 +361,11 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimization=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimize=NULL) { # if optimization is provided, test across multiple optimization levels if (!is.null(optimization)) { cl = match.call() - cl$optimization = NULL # Remove optimization levels from the recursive call + cl$optimize= NULL # Remove optimization levels from the recursive call # Check if y was explicitly provided (not just the default) y_provided = !missing(y) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 0132e2995..1cb35a557 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -193,10 +193,10 @@ test(301.1, nrow(DT[,sum(B),by=C])==100010) set.seed(1) DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") opt = c(0L,2L) -test(637.1, optimization=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) -test(637.2, optimization=opt, key(copy(DT)[J(43L), a:=99L]), NULL) +test(637.1, optimize=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) +test(637.2, optimize=opt, key(copy(DT)[J(43L), a:=99L]), NULL) setkey(DT, a) -test(637.3, optimization=opt, key(copy(DT)[, a:=99L, by=a]), NULL) +test(637.3, optimize=opt, key(copy(DT)[, a:=99L, by=a]), NULL) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index cade0c60c..cb096e94c 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -19,34 +19,34 @@ for (s in sugg) { # := by group DT = data.table(a=1:3,b=(1:9)/10) -test(611.1,optimization=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +test(611.1,optimize=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) setkey(DT,a) -test(611.2,optimization=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +test(611.2,optimize=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) # Combining := by group with i -test(611.3,optimization=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(611.4,optimization=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +test(611.3,optimize=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4,optimize=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # 612 was just level repetition of 611 # Assign to subset ok (NA initialized in the other items) ok : -test(613,optimization=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) -test(614,optimization=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615,optimization=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) +test(613,optimize=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) +test(614,optimize=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) +test(615,optimize=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) # Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 ans = copy(DT)[,r:=NA_real_] -test(618.1,optimization=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) -test(618.2,optimization=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) -test(618.3,optimization=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) -test(618.4,optimization=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) -test(618.5,optimization=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) +test(618.1,optimize=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) +test(618.2,optimize=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3,optimize=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) +test(618.4,optimize=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) +test(618.5,optimize=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) # test 619 was level 2 of 618 DT = data.table(x=letters, key="x") -test(621,optimization=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained -test(622,optimization=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") +test(621,optimize=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained +test(622,optimize=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) -test(623,optimization=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +test(623,optimize=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Setup for test 656.x - gforce tests set.seed(9) @@ -57,36 +57,36 @@ DT = data.table(grp1=sample.int(150L, n, replace=TRUE), y=rnorm(n)) opt = 0:2 out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') -test(656.1,optimization=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) -test(656.2,optimization=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) -test(656.3,optimization=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) +test(656.1,optimize=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) +test(656.2,optimize=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) +test(656.3,optimize=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) # Test := keyby does setkey, #2065 DT = data.table(x=1:2, y=1:6) ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670.1,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) -test(670.2,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) -test(670.3,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), +test(670.1,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) +test(670.2,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) +test(670.3,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -test(670.4,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -test(670.5,optimization=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +test(670.4,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +test(670.5,optimize=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") # test 671 was level 2 of 670 # varname holding colnames, by group, linked from #2120. DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" -test(751,optimization=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +test(751,optimize=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) # Add tests for nested := in j by group, #1987 DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752,optimization=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +test(752,optimize=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) opt = c(0:2, Inf) out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') # v1.9.7 treats wrapped {} better, so this is now optimized -test(865,optimization=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) -test(867,optimization=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here +test(865,optimize=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) +test(867,optimize=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here # tests of gsum and gmean with NA DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) @@ -96,82 +96,82 @@ set(DT,10:12,"y",NA) set(DT,10:12,"v",NA) opt = c(1L, 2L) out = c("(GForce FALSE)", "GForce optimized j to") -test(1184.1,optimization=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) -test(1184.2,optimization=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1185.2,optimization=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], +test(1184.1,optimize=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) +test(1184.2,optimize=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1185.2,optimize=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) -test(1187.1,optimization=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], +test(1187.1,optimize=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) MyVar = TRUE -test(1187.2,optimization=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.3,optimization=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +test(1187.2,optimize=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.3,optimize=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) MyVar = FALSE -test(1187.4,optimization=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.5,optimization=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +test(1187.4,optimize=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.5,optimize=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) # GForce should not turn on when the .ok function isn't triggered -test(1187.6,optimization=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], +test(1187.6,optimize=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), output='j unchanged', warning="'trim' is not yet optimized") # FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. set.seed(2L) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) -test(1304.1,optimization=0:2, dt[, list(.N, sum(y)), by=x]) +test(1304.1,optimize=0:2, dt[, list(.N, sum(y)), by=x]) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") -test(1304.2,optimization=0:2, dt[, list(.N, sum(y)), by=x]) +test(1304.2,optimize=0:2, dt[, list(.N, sum(y)), by=x]) # gmin and gmax extensive testing (because there are tricky cases) DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) opts = 0:2 # for integers -test(1313.01,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.02,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.03,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.04,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.01,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.02,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.03,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.04,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := INT(NA)] -test(1313.05,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.06,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) -test(1313.08,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) +test(1313.05,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.06,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.07,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) # for numeric DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) -test(1313.09,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.10,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.11,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.12,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.09,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.10,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.11,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.12,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := NA_real_] -test(1313.13,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.14,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) -test(1313.16,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) +test(1313.13,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.14,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.15,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) # for date (attribute check.. especially after issues/689 !!!) DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) -test(1313.17,optimization=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.18,optimization=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) +test(1313.17,optimize=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.18,optimize=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) DT[c(1,6), y := NA] -test(1313.19,optimization=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.20,optimization=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) -test(1313.21,optimization=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) -test(1313.22,optimization=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) +test(1313.19,optimize=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.20,optimize=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) +test(1313.21,optimize=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) +test(1313.22,optimize=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) # for character set.seed(1L) DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) DT[x==7, y := c("","b","c")] -test(1313.23,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.24,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.25,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.26,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.23,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.24,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.25,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.26,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) DT[x==6, y := NA_character_] -test(1313.27,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.28,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) -test(1313.30,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) +test(1313.27,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.28,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.29,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) # Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now dt = data.table(a=sample(3,20,TRUE), b=1:10) -test(1565.1,optimization=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], +test(1565.1,optimize=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], output=c("All optimizations are turned off", "lapply optimization is on, j unchanged", "GForce optimized j to")) # gforce optimisations @@ -188,32 +188,32 @@ opt = 0:2 out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') # make sure gforce is on # testing gforce::gmedian -test(1579.01,optimization=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], +test(1579.01,optimize=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x], output="GForce optimized") -test(1579.02,optimization=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], +test(1579.02,optimize=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) -test(1579.03,optimization=2L, dt[, lapply(.SD, median), keyby=x], +test(1579.03,optimize=2L, dt[, lapply(.SD, median), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) -test(1579.04,optimization=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], +test(1579.04,optimize=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) # testing gforce::ghead and gforce::gtail # head(.SD, 1) and tail(.SD, 1) optimisation -test(1579.06,optimization=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.08,optimization=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.10,optimization=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.12,optimization=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) -test(1579.14,optimization=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.16,optimization=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.18,optimization=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.20,optimization=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.06,optimize=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.08,optimize=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.10,optimize=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.12,optimize=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.14,optimize=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.16,optimize=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.18,optimize=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.20,optimize=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) # 1579.22 tested gtail with n>1; now 1579.4+ below mysub <- function(x, n) x[n] -test(1579.23,optimization=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") -test(1579.24,optimization=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.25,optimization=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.26,optimization=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27,optimization=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 -test(1579.28,optimization=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) +test(1579.23,optimize=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24,optimize=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25,optimize=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26,optimize=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27,optimize=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28,optimize=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) # gforce head/tail for n>1, #5060 set.seed(99) DT = data.table(x = sample(letters[1:5], 20, TRUE), @@ -223,16 +223,16 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE), s = sample(c("foo","bar",NA), 20, TRUE), l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] -test(1579.401,optimization=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize -test(1579.402,optimization=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") -test(1579.403,optimization=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") -test(1579.404,optimization=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") -test(1579.405,optimization=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") -test(1579.406,optimization=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") -test(1579.407,optimization=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") -test(1579.408,optimization=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") -test(1579.409,optimization=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(1579.410,optimization=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") +test(1579.401,optimize=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402,optimize=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403,optimize=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404,optimize=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405,optimize=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406,optimize=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407,optimize=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408,optimize=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409,optimize=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410,optimize=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") # FR #971, partly addressed (only subsets in 'i') @@ -242,7 +242,7 @@ dt = data.table(x = sample(letters, 300, TRUE), d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) opt = 1:2 out = c("GForce FALSE","GForce optimized j") -test(1581.01,optimization=opt, ans1 <- dt[x %in% letters[15:20], +test(1581.01,optimize=opt, ans1 <- dt[x %in% letters[15:20], c(.N, lapply(.SD, sum, na.rm=TRUE), lapply(.SD, min, na.rm=TRUE), lapply(.SD, max, na.rm=TRUE), @@ -252,11 +252,11 @@ test(1581.01,optimization=opt, ans1 <- dt[x %in% letters[15:20], output = out) # subsets in 'i' for head and tail -test(1581.04,optimization=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) -test(1581.07,optimization=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1581.10,optimization=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) +test(1581.04,optimize=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.07,optimize=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.10,optimize=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) # #3209 g[[ -test(1581.13,optimization=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) +test(1581.13,optimize=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) # also, block for non-atomic input, #4159 dt = data.table(a=1:3) dt[ , l := .(list(1, 2, 3))] @@ -284,16 +284,16 @@ DT = data.table( C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) opt = 0:2 -test(1583.1,optimization=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], +test(1583.1,optimize=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], data.table(C1=c("A","A","B","B","C","C"), C2=c("a","b","b","c","c","d"), agg=c(1,4,5,8,9,10))) # extra test with a size-1 group containing one NaN too DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -test(1583.2,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) -test(1583.3,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) -test(1583.4,optimization=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) -test(1583.5,optimization=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.2,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4,optimize=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5,optimize=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) # FR #523, var, sd and prod DT = data.table(x=sample(5, 100, TRUE), @@ -303,24 +303,24 @@ DT = data.table(x=sample(5, 100, TRUE), z2=sample(c(runif(10),NA,NaN), 100, TRUE)) opt = 0:2 out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(1594.01,optimization=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) -test(1594.02,optimization=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) -test(1594.03,optimization=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.01,optimize=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) +test(1594.02,optimize=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) +test(1594.03,optimize=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) # coverage: default group .N=1 case idx=DT[ , .I[1L], by=x]$V1 ans=data.table(x=DT[(idx), x], V1=NA_real_) -test(1594.05,optimization=opt, DT[(idx), var(y1), by=x], ans) -test(1594.06,optimization=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) -test(1594.07,optimization=opt, DT[(idx), var(z1), by=x], ans) -test(1594.08,optimization=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) +test(1594.05,optimize=opt, DT[(idx), var(y1), by=x], ans) +test(1594.06,optimize=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) +test(1594.07,optimize=opt, DT[(idx), var(z1), by=x], ans) +test(1594.08,optimize=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) -test(1594.09,optimization=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) -test(1594.10,optimization=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) -test(1594.11,optimization=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.09,optimize=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) +test(1594.10,optimize=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) +test(1594.11,optimize=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) -test(1594.12,optimization=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) -test(1594.13,optimization=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) -test(1594.14,optimization=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.12,optimize=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) +test(1594.13,optimize=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) +test(1594.14,optimize=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) # when datatable.optimize<1, no optimisation of j should take place: dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) @@ -330,53 +330,53 @@ test(1638, options=c(datatable.optimize=0L), dt[, .SD, by=z, verbose=TRUE], outp DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) opt = c(1L,2L) out = c("GForce FALSE", "GForce optimized j to") -test(2231.01,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) -test(2231.02,optimization=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) -test(2231.03,optimization=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.01,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) +test(2231.02,optimize=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) +test(2231.03,optimize=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) # multiple groups DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.04,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.05,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.06,optimization=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) +test(2231.04,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.05,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.06,optimize=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) # (only x XOR w) containing NA DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.07,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.08,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) -test(2231.09,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.10,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.07,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.08,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) +test(2231.09,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.10,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (only x XOR w) containing NaN DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.11,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) -test(2231.12,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.13,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) -test(2231.14,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.11,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) +test(2231.12,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.13,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) +test(2231.14,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (only x XOR w) containing NA and NaN DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.15,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.16,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.17,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.18,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.15,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.16,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.17,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.18,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (x and w) containing NA and NaN DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.19,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.20,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.19,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.20,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.21,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.22,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.21,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.22,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # let wrongly named arguments get lost in ellipsis #5543 DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.61,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) -test(2231.62,optimization=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) -test(2231.63,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) -test(2231.64,optimization=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) +test(2231.61,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) +test(2231.62,optimize=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.63,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) +test(2231.64,optimize=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) # GForce retains attributes in by arguments #5567 dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) opt = c(0,Inf) out = c("GForce FALSE", "GForce optimized j to") -test(2263.1,optimization=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) -test(2263.2,optimization=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) -test(2263.3,optimization=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) +test(2263.1,optimize=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) +test(2263.2,optimize=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) +test(2263.3,optimize=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) # named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 M <- as.data.table(mtcars) @@ -388,60 +388,60 @@ sdlist <- vector("list", length(sdnames)) names(sdlist) <- sdnames opts = 0:2 -test(2283 + 0.01, optimization=opts, +test(2283 + 0.01, optimize=opts, names(M[, c(m=lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(m=sdlist)))) -test(2283 + 0.02, optimization=opts, +test(2283 + 0.02, optimize=opts, names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", "Mpg", sdnames)) -test(2283 + 0.03, optimization=opts, +test(2283 + 0.03, optimize=opts, names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), c("cyl", "Mpg", names(c(m=sdlist)))) -test(2283 + 0.04, optimization=opts, +test(2283 + 0.04, optimize=opts, names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), c("cyl", "mpg", names(c(mpg=sdlist)))) -test(2283 + 0.05, optimization=opts, +test(2283 + 0.05, optimize=opts, names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", "V1", sdnames)) -test(2283 + 0.06, optimization=opts, +test(2283 + 0.06, optimize=opts, names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) -test(2283 + 0.07, optimization=opts, +test(2283 + 0.07, optimize=opts, names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), c("cyl", sdnames, sdnames)) -test(2283 + 0.08, optimization=opts, +test(2283 + 0.08, optimize=opts, names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), c("cyl", names(c(mean=sdlist, sum=sdlist)))) -test(2283 + 0.09, optimization=opts, +test(2283 + 0.09, optimize=opts, names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), c("cyl", sdnames, names(c(sum=sdlist))) ) -test(2283 + 0.10, optimization=opts, +test(2283 + 0.10, optimize=opts, names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), c("cyl", names(c(" "=sdlist, "."=sdlist)))) -test(2283 + 0.11, optimization=opts, +test(2283 + 0.11, optimize=opts, names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(a=0, b=0))), sdnames)) -test(2283 + 0.12, optimization=opts, +test(2283 + 0.12, optimize=opts, names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0, 0))), sdnames)) -test(2283 + 0.13, optimization=opts, +test(2283 + 0.13, optimize=opts, names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) -test(2283 + 0.14, optimization=opts, +test(2283 + 0.14, optimize=opts, names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0))), sdnames)) -test(2283 + 0.15, optimization=opts, +test(2283 + 0.15, optimize=opts, names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) -test(2283 + 0.16, optimization=opts, +test(2283 + 0.16, optimize=opts, names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) -test(2283 + 0.17, optimization=opts, +test(2283 + 0.17, optimize=opts, names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "b", "vs", "am")) -test(2283 + 0.18, optimization=opts, +test(2283 + 0.18, optimize=opts, names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "b", "vs", "am")) -test(2283 + 0.19, optimization=opts, +test(2283 + 0.19, optimize=opts, names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "V2", "b", "vs", "am")) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index db7ed84e6..d8395a80e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3890,7 +3890,7 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) -test(1133.7,optimization=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +test(1133.7,optimize=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -3993,8 +3993,8 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -test(1143.4,optimization=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.5,optimization=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.4,optimize=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5,optimize=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 @@ -4463,16 +4463,16 @@ test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") # turn off GForce, to test dogroups -test(1230,optimization=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1230,optimize=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") setkey(DT,NULL) -test(1231,optimization=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") -test(1232,optimization=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") -test(1233,optimization=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") -test(1234,optimization=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized +test(1231,optimize=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1232,optimize=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") +test(1233,optimize=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") +test(1234,optimize=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized setkey(DT,a) -test(1235,optimization=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -test(1236,optimization=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") -test(1237,optimization=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1235,optimize=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1236,optimize=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") +test(1237,optimize=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") # check that key is not preserved when length of fastorder is > 0 DT <- data.table(x=1:5, y=6:10, key="x") @@ -4742,18 +4742,18 @@ gf_out = c("All optimizations are turned off", "Old mean optimization.*(GForce F lp_out = c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization changed j") opt = c(0L, 1L, Inf) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01,optimization=opt, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = gf_out) -test(1268.02,optimization=opt, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = lp_out) -test(1268.03,optimization=opt, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) -test(1268.04,optimization=opt, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output= gf_out) -test(1268.05,optimization=opt, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.01,optimize=opt, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = gf_out) +test(1268.02,optimize=opt, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = lp_out) +test(1268.03,optimize=opt, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.04,optimize=opt, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output= gf_out) +test(1268.05,optimize=opt, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) # newly added tests for #861 -- optimise, but no GForce -test(1268.06,optimization=opt, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.06,optimize=opt, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) # don't optimise .I in c(...) -test(1268.07,optimization=opt, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], +test(1268.07,optimize=opt, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output= c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization is on, j unchanged as")) # tests .08-.21 were different optimization levels -test(1268.22,optimization=opt, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.22,optimize=opt, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") ### FR #2722 tests end here ### @@ -6257,7 +6257,7 @@ test(1437.17, DT[!a %chin% c("A", "B") & x == 2], DT[c(4, 5, 6)]) test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized subsetting") test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting") ## optimize option level 3 is required to get optimized subsetting -test(1437.21,optimization=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) +test(1437.21,optimize=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) ## NaN on right hand side is treated correctly. NA on right hand side is not reaching .prepareFastSubset, so not tested here DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN)) test(1437.23, DT[y == NaN], DT[0]) @@ -6343,16 +6343,16 @@ for(t in seq_len(nrow(all))){ ## test the query with missing j thisQuery <- all$query[t] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery))]) + test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery))]) ## repeat the test with 'which = TRUE' test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) + test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" for(thisBy in bys){ test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) + test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) } } } @@ -8757,13 +8757,13 @@ test(1629.07, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3]) dt2 = copy(dt) test(1629.08, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)]) # grouping operations -test(1629.09,optimization=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) +test(1629.09,optimize=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) ans1 = dt[, base::sum(v1), by=grp] ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3] -test(1629.10,optimization=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) -test(1629.11,optimization=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], +test(1629.10,optimize=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) +test(1629.11,optimize=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp]) -test(1629.12,optimization=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) +test(1629.12,optimize=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # gforce # even more complex, shouldn't run any optimisation dt[, v4 := v1/2] @@ -12735,8 +12735,8 @@ DT[ , V1:=as.ordered(V1)] test(1918.3, DT[, min(V1)], structure(1L, .Label = lev, class = c("ordered", "factor"))) test(1918.4, DT[, max(V1)], structure(5L, .Label = lev, class = c("ordered", "factor"))) ## make sure GForce is activated -test(1918.5,optimization=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) -test(1918.6,optimization=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) +test(1918.5,optimize=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) +test(1918.6,optimize=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) # as.ITime.character bug for NA handling #2940 test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) @@ -13619,11 +13619,11 @@ suppressWarnings(rm(`___data.table_internal_test_1967.68___`)) test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot find symbol') ### [.data.table verbosity & non-equi-join tests -test(1967.69,optimization=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') -test(1967.71,optimization=1L, x[order(a), .N, verbose = TRUE], 5L, +test(1967.69,optimize=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') +test(1967.71,optimize=1L, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") setkey(x) -test(1967.72,optimization=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, +test(1967.72,optimize=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") x = data.table( @@ -13777,8 +13777,8 @@ test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.1,optimization=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.2,optimization=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +test(1981.1,optimize=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) +test(1981.2,optimize=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # 1982 moved to benchmark.Rraw, #5517 @@ -13810,7 +13810,7 @@ test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*t test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error="The items in the 'by' or 'keyby' list have lengths [1, 2]. Each must be length 10; the same length as there are rows in x (after subsetting if i is provided).") -test(1984.10,optimization=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], +test(1984.10,optimize=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], data.table(a = c(1, 0), V1 = c(1, 1)), output = 'Optimization is on but left j unchanged') DT[ , f := rep(1:2, each = 5)] @@ -14696,10 +14696,10 @@ test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], DT = data.table(x=INT(1,1,1,2,2), y=1:5) z = 1:5 opt = c(Inf,1L,0L) -test(2043.1,optimization=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) -test(2043.4,optimization=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) +test(2043.1,optimize=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) +test(2043.4,optimize=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) z = 1:4 -test(2043.5,optimization=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z +test(2043.5,optimize=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z # test type coercion in joins, #2592 dt1 <- data.table(int = 1L:10L, @@ -15862,10 +15862,10 @@ test(2098.1, DT[do.call(order, mget(groups)), verbose=TRUE], ans<-data.table(id= test(2098.2, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, output=out) test(2098.3, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) test(2098.4, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -test(2098.5,optimization=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") -test(2098.6,optimization=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") -test(2098.7,optimization=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) -test(2098.8,optimization=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) +test(2098.5,optimize=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") +test(2098.6,optimize=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") +test(2098.7,optimize=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) +test(2098.8,optimize=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) # Error in update join when joining on factor, #3559 d1 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = c(1L, NA, 3L, NA)) @@ -17290,14 +17290,14 @@ test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i # gmin/gmax for integer64, #4444 if (test_bit64) { DT = data.table(grp=c(1L, 1L, 1L, 2L), i64=as.integer64(c(NA, 1:3))) - test(2208.1,optimization=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.2,optimization=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) - test(2208.3,optimization=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.4,optimization=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) + test(2208.1,optimize=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.2,optimize=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) + test(2208.3,optimize=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.4,optimize=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) # create an all-NA group DT[, i64:=rev(i64)] - test(2208.7,optimization=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) - test(2208.8,optimization=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) + test(2208.7,optimize=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) + test(2208.8,optimize=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) } # when user supplies dec=',' don't try sep=',', #4483 @@ -17517,12 +17517,12 @@ funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L - test(2220.0 + testnum*0.01, optimization=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") testnum = testnum + 1L - test(2220.0 + testnum*0.01, optimization=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } testnum = testnum + 1L -test(2220.0 + testnum*0.01, optimization=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +test(2220.0 + testnum*0.01, optimize=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -17559,7 +17559,7 @@ for (col in names(DT)[-1]) { for (type in c('lag','lead','shift','cyclic')) { # fill is tested by group in tests 2218.*; see comments in #5205 # sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517 - test(2224.1+i/10000, optimization=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE + test(2224.1+i/10000, optimize=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), ans[[i]]) i = i+1L @@ -17664,15 +17664,15 @@ test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") DT = data.table(a=1:3,b=(1:9)/10) opt = 0:2 out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(2233.01,optimization=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) +test(2233.01,optimize=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) # GForce returning full length -test(2233.02,optimization=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) +test(2233.02,optimize=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) # GForce neither returning 1 per group nor full length -test(2233.03,optimization=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") +test(2233.03,optimize=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") # compare to non GForce version DT = data.table(a=1:3,b=(1:9)/10) -test(2233.04,optimization=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) -test(2233.05,optimization=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) +test(2233.04,optimize=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) +test(2233.05,optimize=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) # with key and grouping by key DT = data.table(a=1:3,b=(1:9)/10, key="a") @@ -17747,17 +17747,17 @@ opt = c(0,Inf) out = c("GForce FALSE", "GForce.*gsum") B = copy(a) A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle -test(2233.391,optimization=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= +test(2233.391,optimize=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= setorder(A, id1) -test(2233.392,optimization=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) -test(2233.393,optimization=opt, any(A[,t1!=t2]), FALSE) -test(2233.394,optimization=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) -test(2233.395,optimization=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) +test(2233.392,optimize=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) +test(2233.393,optimize=opt, any(A[,t1!=t2]), FALSE) +test(2233.394,optimize=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) +test(2233.395,optimize=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) # test from #5337 n=4; k=2 mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] -test(2233.41,optimization=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) +test(2233.41,optimize=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) # test from #5345 set.seed(1) DT = data.table( @@ -17768,7 +17768,7 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -test(2233.43,optimization=c(0,Inf), options = list(datatable.verbose=TRUE), +test(2233.43,optimize=c(0,Inf), options = list(datatable.verbose=TRUE), copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") ][, n_idT :=dim(.SD)[[1]], by=list(t, id) ][, sum_v2_id :=sum(v2), by=.(id) @@ -17920,12 +17920,12 @@ i=c(1,2) j=1L opt = c(1L,2L) out = c("GForce FALSE", "GForce TRUE") -test(2243.41,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.42,optimization=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) -test(2243.54,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) -test(2243.55,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") +test(2243.41,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.42,optimize=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) +test(2243.54,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) +test(2243.55,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") @@ -18382,7 +18382,7 @@ test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) # Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL DT = data.table(a = 1, b = 2) -test(2270,optimization=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") +test(2270,optimize=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") # Missing newline in verbose output -> harder to read DT1 = data.table(a=1:2) @@ -21200,22 +21200,22 @@ test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", # support arithmetic in j with gforce #3815 out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") dt = data.table(a=1:4, b=1:2) -test(2345.01,optimization=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) -test(2345.02,optimization=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) +test(2345.01,optimize=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) +test(2345.02,optimize=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) dt = data.table(a=1:4, b=1:2) -test(2345.11,optimization=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) -test(2345.12,optimization=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") +test(2345.11,optimize=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +test(2345.12,optimize=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) -test(2345.13,optimization=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") -test(2345.14,optimization=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.13,optimize=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.14,optimize=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") dt = data.table(a=1:2, b=1, c=1:4) -test(2345.21,optimization=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) -test(2345.22,optimization=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) -test(2345.23,optimization=0:2, names(dt[, lapply(list(b, c), sum), by=a])) +test(2345.21,optimize=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2345.22,optimize=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2345.23,optimize=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) -test(2345.31,optimization=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) -test(2345.32,optimization=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) -test(2345.33,optimization=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) -test(2345.34,optimization=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) -test(2345.35,optimization=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) -test(2345.36,optimization=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") +test(2345.31,optimize=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2345.32,optimize=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) +test(2345.33,optimize=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) +test(2345.34,optimize=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) +test(2345.35,optimize=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) +test(2345.36,optimize=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") diff --git a/man/test.Rd b/man/test.Rd index 2e216041c..db30064c0 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL, env = NULL, optimization = NULL) + options = NULL, env = NULL, optimize = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -22,7 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } -\item{optimization}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimization)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } +\item{optimize}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimize)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From f6981d6db8d5582535f0d7541ee2c5d0476536d9 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:27:42 +0100 Subject: [PATCH 25/27] rename optimization parameter also in test --- R/test.data.table.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index af39e3a09..32f3a1403 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -363,18 +363,18 @@ gc_mem = function() { test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimize=NULL) { # if optimization is provided, test across multiple optimization levels - if (!is.null(optimization)) { + if (!is.null(optimize)) { cl = match.call() cl$optimize= NULL # Remove optimization levels from the recursive call # Check if y was explicitly provided (not just the default) y_provided = !missing(y) vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment()) - compare = !y_provided && length(optimization)>1L && !any(lengths(vector_params)) + compare = !y_provided && length(optimize)>1L && !any(lengths(vector_params)) - for (i in seq_along(optimization)) { + for (i in seq_along(optimize)) { cl$num = num + (i - 1L) * 1e-6 - opt_level = list(datatable.optimize = optimization[i]) + opt_level = list(datatable.optimize = optimize[i]) cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level for (param in names(vector_params)) { val = vector_params[[param]] From 9fc4734d4dc7214d0e5da3463c7e62a37d81277c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:42:48 +0100 Subject: [PATCH 26/27] add optimize param checks --- R/test.data.table.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/test.data.table.R b/R/test.data.table.R index 32f3a1403..ca1e22e79 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -364,6 +364,8 @@ gc_mem = function() { test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimize=NULL) { # if optimization is provided, test across multiple optimization levels if (!is.null(optimize)) { + if (!is.numeric(optimize) || length(optimize) < 1L || anyNA(optimize) || any(optimize < 0L)) + stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) cl = match.call() cl$optimize= NULL # Remove optimization levels from the recursive call From c07999ac76e74cd79df42ac96a78f4364414ddf4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:53:15 +0100 Subject: [PATCH 27/27] remove trailing ws --- R/test.data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index ca1e22e79..7b8341944 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -372,7 +372,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # Check if y was explicitly provided (not just the default) y_provided = !missing(y) vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment()) - compare = !y_provided && length(optimize)>1L && !any(lengths(vector_params)) + compare = !y_provided && length(optimize)>1L && !any(lengths(vector_params)) for (i in seq_along(optimize)) { cl$num = num + (i - 1L) * 1e-6