make aggregation of empty GroupedDataFrame correct with AsTable (#3222)

bkamins · web-flow · commit 39358888fe71 · 2022-11-12T22:52:50.000+01:00
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,9 @@
 
 * Fix incorrect handling of column metadata in `insertcols!` and `insertcols`
   ([#3220](https://github.com/JuliaData/DataFrames.jl/pull/3220))
+* Correctly handle `GroupedDataFrame` with no groups in multi-column
+  operation specification syntax
+  ([#3122](https://github.com/JuliaData/DataFrames.jl/issues/3122))
 
 ## Display improvements
 
diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
@@ -30,15 +30,24 @@ object from your data frame using the `groupby` function that takes two argument
 (1) a data frame to be grouped, and (2) a set of columns to group by.
 
 Operations can then be applied on each group using one of the following functions:
-* `combine`: does not put restrictions on number of rows returned, the order of rows
-  is specified by the order of groups in `GroupedDataFrame`; it is typically used
-  to compute summary statistics by group;
+* `combine`: does not put restrictions on number of rows returned per group;
+  the returned values are vertically concatenated following the order of groups in
+  `GroupedDataFrame`; it is typically used to compute summary statistics by group;
+  for `GroupedDataFrame` if grouping columns are kept they are put as first columns
+  in the result;
 * `select`: return a data frame with the number and order of rows exactly the same
   as the source data frame, including only new calculated columns;
   `select!` is an in-place version of `select`;
 * `transform`: return a data frame with the number and order of rows exactly the same
   as the source data frame, including all columns from the source and new calculated columns;
-  `transform!` is an in-place version of `transform`.
+  `transform!` is an in-place version of `transform`;
+  existing columns in the source data frame are put as first columns in the result;
+
+As a special case, if a `GroupedDataFrame` that has zero groups is passed then
+the result of the operation is determined by performing a single call to the
+transformation function with a 0-row argument passed to it. The output of this
+operation is only used to identify the number and type of produced columns, but
+the result has zero rows.
 
 All these functions take a specification of one or more functions to apply to
 each subset of the `DataFrame`. This specification can be of the following forms:
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
@@ -37,19 +37,26 @@ const TRANSFORMATION_COMMON_RULES =
     (1) a data frame to be grouped, and (2) a set of columns to group by.
 
     Operations can then be applied on each group using one of the following functions:
-    * `combine`: does not put restrictions on number of rows returned, the order of rows
-      is specified by the order of groups in `GroupedDataFrame`; it is typically used
-      to compute summary statistics by group; for `GroupedDataFrame` if grouping columns
-      are kept they are put as first columns in the result;
+    * `combine`: does not put restrictions on number of rows returned per group;
+      the returned values are vertically concatenated following the order of groups in
+      `GroupedDataFrame`; it is typically used to compute summary statistics by group;
+      for `GroupedDataFrame` if grouping columns are kept they are put as first columns
+      in the result;
     * `select`: return a data frame with the number and order of rows exactly the same
       as the source data frame, including only new calculated columns;
       `select!` is an in-place version of `select`; for `GroupedDataFrame` if grouping columns
       are kept they are put as first columns in the result;
     * `transform`: return a data frame with the number and order of rows exactly the same
       as the source data frame, including all columns from the source and new calculated columns;
-      `transform!` is an in-place version of `transform`; for `GroupedDataFrame`
+      `transform!` is an in-place version of `transform`;
       existing columns in the source data frame are put as first columns in the result;
 
+    As a special case, if a `GroupedDataFrame` that has zero groups is passed then
+    the result of the operation is determined by performing a single call to the
+    transformation function with a 0-row argument passed to it. The output of this
+    operation is only used to identify the number and type of produced columns, but
+    the result has zero rows.
+
     All these functions take a specification of one or more functions to apply to
     each subset of the `DataFrame`. This specification can be of the following forms:
     1. standard column selectors (integers, `Symbol`s, strings, vectors of integers,
diff --git a/src/groupeddataframe/complextransforms.jl b/src/groupeddataframe/complextransforms.jl
@@ -28,7 +28,6 @@ function _combine_with_first((first,)::Ref{Any},
     @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame}
     @assert f isa Base.Callable
     @assert incols isa Union{Nothing, AbstractVector, Tuple, NamedTuple}
-    @assert first isa Union{NamedTuple, DataFrameRow, AbstractDataFrame}
     extrude = false
 
     lgd = length(gd)
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
@@ -486,6 +486,23 @@ function _combine_process_pair_symbol(optional_i::Bool,
     end
 end
 
+@noinline function expand_res_astable(res, kp1, emptyres::Bool)
+    prepend = all(x -> x isa Integer, kp1)
+    if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1))
+        throw(ArgumentError("keys of the returned elements must be " *
+                            "`Symbol`s, strings or integers"))
+    end
+    if any(x -> !isequal(keys(x), kp1), res)
+        throw(ArgumentError("keys of the returned elements must be equal"))
+    end
+    outcols = [[x[n] for x in res] for n in kp1]
+    # when `emptyres` is true `res` was only used to infer column names and element
+    # types, so drop its values to make every output column have zero length
+    emptyres && foreach(empty!, outcols)
+    nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1]
+    return outcols, nms
+end
+
 # perform a transformation specified using the Pair notation with multiple output columns
 function _combine_process_pair_astable(optional_i::Bool,
                                        gd::GroupedDataFrame,
@@ -506,19 +523,15 @@ function _combine_process_pair_astable(optional_i::Bool,
                                                  firstmulticol, NOTHING_IDX_AGG, threads)
         @assert length(outcol_vec) == 1
         res = outcol_vec[1]
-        @assert length(res) > 0
-
-        kp1 = keys(res[1])
-        prepend = all(x -> x isa Integer, kp1)
-        if !(prepend || all(x -> x isa Symbol, kp1) || all(x -> x isa AbstractString, kp1))
-            throw(ArgumentError("keys of the returned elements must be " *
-                                "`Symbol`s, strings or integers"))
-        end
-        if any(x -> !isequal(keys(x), kp1), res)
-            throw(ArgumentError("keys of the returned elements must be identical"))
+        if isempty(res)
+            emptyres = true
+            res = firstres
+        else
+            emptyres = false
         end
-        outcols = [[x[n] for x in res] for n in kp1]
-        nms = [prepend ? Symbol("x", n) : Symbol(n) for n in kp1]
+        kp1 = isempty(res) ? () : keys(res[1])
+
+        outcols, nms = expand_res_astable(res, kp1, emptyres)
     else
         if !firstmulticol
             firstres = Tables.columntable(firstres)
@@ -527,9 +540,8 @@ function _combine_process_pair_astable(optional_i::Bool,
         end
         idx, outcols, nms = _combine_multicol(Ref{Any}(firstres), Ref{Any}(fun), gd,
                                               wincols, threads)
-
         if !(firstres isa Union{AbstractVecOrMat, AbstractDataFrame,
-            NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
+             NamedTuple{<:Any, <:Tuple{Vararg{AbstractVector}}}})
             lock(gd.lazy_lock) do
                 # if idx_agg was not computed yet it is nothing
                 # in this case if we are not passed a vector compute it.
@@ -541,8 +553,8 @@ function _combine_process_pair_astable(optional_i::Bool,
                 idx = idx_agg[]
             end
         end
-        @assert length(outcols) == length(nms)
     end
+    @assert length(outcols) == length(nms)
     if out_col_name isa AbstractVector{Symbol}
         if length(out_col_name) != length(nms)
             throw(ArgumentError("Number of returned columns is $(length(nms)) " *
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -4312,4 +4312,70 @@ end
     @test_throws ArgumentError gdf[Not([true true true true])]
 end
 
+@testset "aggregation of empty GroupedDataFrame with table output" begin
+    df = DataFrame(:a => Int[])
+    gdf = groupby(df, :a)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x=Int[], y=String[], b=Int[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[]))
+    @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[]))
+    # test below errors because keys for strings do not support == comparison
+    @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b)
+    @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b),
+                        DataFrame(a=Int[], b=Int[]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable),
+                        DataFrame(a=Int[], x=Int[], y=Any[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable),
+                        DataFrame(a=Int[], x=Vector{Int}[], y=Any[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]),
+                        DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3])
+
+    df = DataFrame(:a => [1, 2])
+    gdf = groupby(df, :a)[2:1]
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x=Int[], y=String[], b=Int[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x1=Int[], x2=String[], b=Int[]))
+    @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b),
+                        DataFrame(a=Int[], x1=Char[], x2=Char[], b=Int[]))
+    # test below errors because keys for strings do not support == comparison
+    @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b)
+    @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b),
+                        DataFrame(a=Int[], b=Int[]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable),
+                        DataFrame(a=Int[], x=Int[], y=Any[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable),
+                        DataFrame(a=Int[], x=Vector{Int}[], y=Any[]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]),
+                        DataFrame(a=Int[], z1=Vector{Int}[], z2=Any[]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3])
+
+    df = DataFrame(:a => [1, 2])
+    gdf = groupby(df, :a)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y="a")]) => AsTable, :a => :b),
+                        DataFrame(a=1:2, x=[1, 1], y=["a", "a"], b=1:2))
+    @test isequal_typed(combine(gdf, :a => (x -> [(1, "a")]) => AsTable, :a => :b),
+                        DataFrame(a=1:2, x1=[1, 1], x2=["a", "a"], b=1:2))
+    @test isequal_typed(combine(gdf, :a => (x -> ["ab"]) => AsTable, :a => :b),
+                        DataFrame(a=1:2, x1=['a', 'a'], x2=['b', 'b'], b=1:2))
+    # test below errors because keys for strings do not support == comparison
+    @test_throws ArgumentError combine(gdf, :a => (x -> ["ab", "cd"]) => AsTable, :a => :b)
+    @test isequal_typed(combine(gdf, :a => (x -> []) => AsTable, :a => :b),
+                        DataFrame(a=1:2, b=1:2))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(a=x, b=x), (a=x, c=x)]) => AsTable)
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=1, y=2), (x=3, y="a")]) => AsTable),
+                        DataFrame(a=[1, 1, 2, 2], x=[1, 3, 1, 3], y=Any[2, "a", 2, "a"]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => AsTable),
+                        DataFrame(a=[1, 1, 2, 2], x=[[1], [3], [1], [3]], y=Any[2, "a", 2, "a"]))
+    @test isequal_typed(combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2]),
+                        DataFrame(a=[1, 1, 2, 2], z1=[[1], [3], [1], [3]], z2=Any[2, "a", 2, "a"]))
+    @test_throws ArgumentError combine(gdf, :a => (x -> [(x=[1], y=2), (x=[3], y="a")]) => [:z1, :z2, :z3])
+    @test_throws ArgumentError combine(gdf, :a => (x -> [Dict('x' => 1)]) => AsTable)
+end
+
 end # module