Add scalar keyword argument to flatten (#3283)

bkamins · web-flow · commit ec7b123376f1 · 2023-02-05T21:05:53.000+01:00
diff --git a/NEWS.md b/NEWS.md
@@ -26,6 +26,8 @@
 * Add `haskey` and `get` methods to `DataFrameColumns`
   to make it support dictionary interface more completely
   ([#3282](https://github.com/JuliaData/DataFrames.jl/pull/3282))
+* Allow passing the `scalar` keyword argument to `flatten`
+  ([#3283](https://github.com/JuliaData/DataFrames.jl/pull/3283))
 
 ## Bug fixes
 
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -2259,7 +2259,7 @@ function Missings.allowmissing(df::AbstractDataFrame,
 end
 
 """
-    flatten(df::AbstractDataFrame, cols)
+    flatten(df::AbstractDataFrame, cols; scalar::Type=Union{})
 
 When columns `cols` of data frame `df` have iterable elements that define
 `length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
@@ -2273,6 +2273,11 @@ returned `DataFrame` will affect `df`.
 
 `cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
 
+If `scalar` is passed, then values of this type in flattened columns
+are treated as scalars and broadcast as many times as needed to match the
+lengths of values stored in other columns. If all values in a row are scalars,
+a single row is produced.
+
 $METADATA_FIXED
 
 # Examples
@@ -2334,10 +2339,33 @@ julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
 
 julia> flatten(df3, [:b, :c])
 ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
+
+julia> df4 = DataFrame(a=[1, 2, 3],
+                       b=[[1, 2], missing, missing],
+                       c=[[5, 6], missing, [7, 8]])
+3×3 DataFrame
+ Row │ a      b        c
+     │ Int64  Array…?  Array…?
+─────┼─────────────────────────
+   1 │     1  [1, 2]   [5, 6]
+   2 │     2  missing  missing
+   3 │     3  missing  [7, 8]
+
+julia> flatten(df4, [:b, :c], scalar=Missing)
+5×3 DataFrame
+ Row │ a      b        c
+     │ Int64  Int64?   Int64?
+─────┼─────────────────────────
+   1 │     1        1        5
+   2 │     1        2        6
+   3 │     2  missing  missing
+   4 │     3  missing        7
+   5 │     3  missing        8
 ```
 """
 function flatten(df::AbstractDataFrame,
-                 cols::Union{ColumnIndex, MultiColumnIndex})
+                 cols::Union{ColumnIndex, MultiColumnIndex};
+                 scalar::Type=Union{})
     _check_consistency(df)
 
     idxcols = index(df)[cols]
@@ -2348,15 +2376,16 @@ function flatten(df::AbstractDataFrame,
     end
 
     col1 = first(idxcols)
-    lengths = length.(df[!, col1])
-    for col in idxcols
-        v = df[!, col]
-        if any(x -> length(x[1]) != x[2], zip(v, lengths))
-            r = findfirst(x -> x != 0, length.(v) .- lengths)
-            colnames = _names(df)
-            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
-                                "and :$(colnames[col]) are not the same in row $r"))
-        end
+    lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]]
+    for (i, coli) in enumerate(idxcols)
+        i == 1 && continue
+        update_lengths!(lengths, df[!, coli], scalar, df, col1, coli)
+    end
+
+    # a row in which every flattened column holds a scalar still has length -1;
+    # such a row is kept exactly once
+    for i in 1:length(lengths)
+        lengths[i] == -1 && (lengths[i] = 1)
     end
 
     new_df = similar(df[!, Not(cols)], sum(lengths))
@@ -2366,18 +2395,38 @@ function flatten(df::AbstractDataFrame,
     length(idxcols) > 1 && sort!(idxcols)
     for col in idxcols
         col_to_flatten = df[!, col]
-        fast_path = eltype(col_to_flatten) isa AbstractVector &&
+        fast_path = eltype(col_to_flatten) <: AbstractVector &&
                     !isempty(col_to_flatten)
-        flattened_col = fast_path ?
-            reduce(vcat, col_to_flatten) :
-            collect(Iterators.flatten(col_to_flatten))
+        flattened_col = if fast_path
+                reduce(vcat, col_to_flatten)
+            elseif scalar === Union{}
+                collect(Iterators.flatten(col_to_flatten))
+            else
+                collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v
+                                          for (l, v) in zip(lengths, col_to_flatten)))
+            end
         insertcols!(new_df, col, _names(df)[col] => flattened_col)
     end
 
     _copy_all_note_metadata!(new_df, df)
     return new_df
 end
 
+function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type,
+                         df::AbstractDataFrame, col1::Integer, coli::Integer)
+    for (i, v) in enumerate(col)  # merge the element lengths of `col` into `lengths`
+        v isa scalar && continue  # a scalar puts no constraint on the row length
+        lv = length(v)
+        if lengths[i] == -1  # -1 means no length is recorded for row i yet
+            lengths[i] = lv
+        elseif lengths[i] != lv  # conflicts with the length already recorded
+            colnames = _names(df)  # `df`, `col1`, `coli` only feed the error message
+            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
+                                "and :$(colnames[coli]) are not the same in row $i"))
+        end
+    end
+end
+
 function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
                          lengths::AbstractVector{Int})
     counter = 1
diff --git a/test/reshape.jl b/test/reshape.jl
@@ -431,6 +431,71 @@ end
     @test flatten(DataFrame(), All()) == DataFrame()
 end
 
+@testset "flatten with scalar" begin  # exercises the `scalar` keyword of `flatten`
+    df = DataFrame(a=[1, 2, 3],
+                   b=[[1, 2], missing, [3, 4]],
+                   c=[[5, 6], missing, missing])
+    @test flatten(df, :a) ≅ df  # Int values have length 1
+    @test_throws MethodError flatten(df, :b)  # length(missing) is not defined
+    @test flatten(df, :b, scalar=Missing) ≅
+          DataFrame(a=[1, 1, 2, 3, 3],
+                    b=[1, 2, missing, 3, 4],
+                    c=[[5, 6], [5, 6], missing, missing, missing])
+    @test flatten(df, [:b, :c], scalar=Missing) ≅
+          DataFrame(a=[1, 1, 2, 3, 3],
+                    b=[1, 2, missing, 3, 4],
+                    c=[5, 6, missing, missing, missing])
+    @test flatten(df, [:b, :c], scalar=Any) ≅ df  # all values scalar: rows kept once
+
+    df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]])
+    @test_throws ArgumentError flatten(df, All(), scalar=Missing)  # :b has 1, :d has 2
+    @test flatten(df, Not(:d), scalar=Missing) ≅
+        DataFrame(a=missing, b=1, c=missing, d=[[1, 2]])
+    @test flatten(df, Not(:b), scalar=Missing) ≅
+        DataFrame(a=[missing, missing], b=[1, 1], c=[missing, missing], d=[1, 2])
+
+    df = DataFrame(a="xy", b=[[1, 2]])
+    @test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2])  # String -> Chars
+    @test flatten(df, [:a, :b], scalar=String) ==
+          DataFrame(a=["xy", "xy"], b=[1, 2])
+
+    df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4)
+    @test flatten(df, [:a, :b], scalar=Missing) ≅  # the empty vector in row 2 drops id 2
+          DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4])
+    df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10])
+    df.y = [iseven(last(v)) ? missing : v for v in df.x]
+    @test flatten(df, [:x, :y], scalar=Missing) ≅
+          DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]),
+                    x=reduce(vcat, [1:i for i in 1:9]),
+                    y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9]))
+
+    # Strings are scalars only when `scalar` covers them; otherwise they iterate as Chars
+    df = DataFrame(id=1:5,
+                   col1=["a", missing, 1:2, 3:4, 5:6],
+                   col2=[11:12, 111:112, 1111:1112, missing, "b"])
+    @test flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}) ≅
+          DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
+                    col1=["a", "a", missing, missing, 1, 2, 3, 4, 5, 6],
+                    col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "b", "b"])
+    @test_throws MethodError flatten(df, [:col1, :col2])  # length(missing) is not defined
+    @test_throws ArgumentError flatten(df, [:col1, :col2], scalar=Missing)  # row 1: 1 vs 2
+    @test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString)
+
+    df = DataFrame(id=1:5,
+                   col1=["ab", missing, 1:2, 3:4, 5:6],
+                   col2=[11:12, 111:112, 1111:1112, missing, "cd"])
+    @test flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}) ≅
+          DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
+                    col1=["ab", "ab", missing, missing, 1, 2, 3, 4, 5, 6],
+                    col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "cd", "cd"])
+    @test_throws MethodError flatten(df, [:col1, :col2])
+    @test flatten(df, [:col1, :col2], scalar=Missing) ≅  # "ab" and 11:12 both have length 2
+          DataFrame(id=[1 ,1, 2, 2, 3, 3, 4, 4, 5, 5],
+                    col1=['a', 'b', missing, missing, 1, 2, 3, 4, 5, 6],
+                    col2=[11, 12, 111, 112, 1111, 1112, missing, missing, 'c', 'd'])
+    @test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString)
+end
+
 @testset "stack categorical test" begin
     Random.seed!(1234)
     d1 = DataFrame(a=repeat([1:3;], inner=[4]),