JuliaData · bkamins · Jun 19, 2023 · Apr 7, 2023 · Apr 8, 2023 · Apr 8, 2023
diff --git a/NEWS.md b/NEWS.md
@@ -6,6 +6,10 @@
   treated as if they were wrapped in `Cols` and does not throw an error
   when a vector of duplicate indices is passed when doing column selection
   ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302))
+* Added the `checkunique` keyword argument to the sorting-related functions
+  (`issorted`, `sort`, `sort!`, and `sortperm`); when it is `true`, an error is
+  thrown if duplicate elements in the sorting columns make multiple sort orders valid
+  ([#2159](https://github.com/JuliaData/DataFrames.jl/issues/2159))
 
 # DataFrames.jl v1.5 Release Notes
 

diff --git a/src/abstractdataframe/sort.jl b/src/abstractdataframe/sort.jl
@@ -357,7 +357,8 @@ column in the corresponding position in `cols`.
              lt::Union{Function, AbstractVector{<:Function}}=isless,
              by::Union{Function, AbstractVector{<:Function}}=identity,
              rev::Union{Bool, AbstractVector{Bool}}=false,
-             order::Union{Ordering, AbstractVector{<:Ordering}}=Forward)
+             order::Union{Ordering, AbstractVector{<:Ordering}}=Forward,
+             checkunique::Bool=false)
 
 Test whether data frame `df` is sorted by column(s) `cols`. Checking against
 multiple columns is done lexicographically.
@@ -397,14 +398,26 @@ function Base.issorted(df::AbstractDataFrame, cols=All();
                        lt::Union{Function, AbstractVector{<:Function}}=isless,
                        by::Union{Function, AbstractVector{<:Function}}=identity,
                        rev::Union{Bool, AbstractVector{Bool}}=false,
-                       order::Union{Ordering, AbstractVector{<:Ordering}}=Forward)
+                       order::Union{Ordering, AbstractVector{<:Ordering}}=Forward,
+                       checkunique::Bool=false)
     to_scalar(x::AbstractVector) = only(x)
     to_scalar(x::Any) = x
 
     # exclude AbstractVector as in that case cols can contain order(...) clauses
     if cols isa MultiColumnIndex && !(cols isa AbstractVector)
         cols = index(df)[cols]
     end
+    if checkunique
+        # `cols` may be a single selector (e.g. `:a` or `order(:a)`) rather than a
+        # collection; a `Symbol` is not iterable, so wrap a lone selector first
+        selectors = cols isa Union{AbstractVector, Tuple} ? cols : (cols,)
+        # map each selector to its column number (via `_getcol`, then `index(df)`)
+        newcols = Int[index(df)[_getcol(col)] for col in selectors]
+        if !allunique(df, newcols)
+            throw(ArgumentError("Non-unique elements found. Multiple orders " *
+                                "are valid"))
+        end
+    end
     if cols isa ColumnIndex
         return issorted(df[!, cols], lt=to_scalar(lt), by=to_scalar(by),
                         rev=to_scalar(rev), order=to_scalar(order))
@@ -427,7 +440,8 @@ Base.issorted(::AbstractDataFrame, ::Base.Order.Ordering) =
          by::Union{Function, AbstractVector{<:Function}}=identity,
          rev::Union{Bool, AbstractVector{Bool}}=false,
          order::Union{Ordering, AbstractVector{<:Ordering}}=Forward,
-         view::Bool=false)
+         view::Bool=false,
+         checkunique::Bool=false)
 
 Return a data frame containing the rows in `df` sorted by column(s) `cols`.
 Sorting on multiple columns is done lexicographically.
@@ -506,8 +520,9 @@ julia> sort(df, [:x, order(:y, rev=true)])
                            by::Union{Function, AbstractVector{<:Function}}=identity,
                            rev::Union{Bool, AbstractVector{Bool}}=false,
                            order::Union{Ordering, AbstractVector{<:Ordering}}=Forward,
-                           view::Bool=false)
-    rowidxs = sortperm(df, cols, alg=alg, lt=lt, by=by, rev=rev, order=order)
+                           view::Bool=false,
+                           checkunique::Bool=false)
+    rowidxs = sortperm(df, cols, alg=alg, lt=lt, by=by, rev=rev, order=order, checkunique=checkunique)
     return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
 end
 
@@ -517,7 +532,8 @@ end
              lt::Union{Function, AbstractVector{<:Function}}=isless,
              by::Union{Function, AbstractVector{<:Function}}=identity,
              rev::Union{Bool, AbstractVector{Bool}}=false,
-             order::Union{Ordering, AbstractVector{<:Ordering}}=Forward)
+             order::Union{Ordering, AbstractVector{<:Ordering}}=Forward,
+             checkunique::Bool=false)
 
 Return a permutation vector of row indices of data frame `df` that puts them in
 sorted order according to column(s) `cols`.
@@ -579,13 +595,25 @@ function Base.sortperm(df::AbstractDataFrame, cols=All();
                        lt::Union{Function, AbstractVector{<:Function}}=isless,
                        by::Union{Function, AbstractVector{<:Function}}=identity,
                        rev::Union{Bool, AbstractVector{Bool}}=false,
-                       order::Union{Ordering, AbstractVector{<:Ordering}}=Forward)
+                       order::Union{Ordering, AbstractVector{<:Ordering}}=Forward,
+                       checkunique::Bool=false)
     # exclude AbstractVector as in that case cols can contain order(...) clauses
     if cols isa MultiColumnIndex && !(cols isa AbstractVector)
         cols = index(df)[cols]
     end
     ord = ordering(df, cols, lt, by, rev, order)
     _alg = Sort.defalg(df, ord; alg=alg, cols=cols)
+    if checkunique
+        # `cols` may be a single selector (e.g. `:a` or `order(:a)`) rather than a
+        # collection; a `Symbol` is not iterable, so wrap a lone selector first
+        selectors = cols isa Union{AbstractVector, Tuple} ? cols : (cols,)
+        # map each selector to its column number (via `_getcol`, then `index(df)`)
+        newcols = Int[index(df)[_getcol(col)] for col in selectors]
+        if !allunique(df, newcols)
+            throw(ArgumentError("Non-unique elements found. Multiple orders " *
+                                "are valid"))
+        end
+    end
     return _sortperm(df, _alg, ord)
 end
 
@@ -601,7 +629,8 @@ _sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) =
           lt::Union{Function, AbstractVector{<:Function}}=isless,
           by::Union{Function, AbstractVector{<:Function}}=identity,
           rev::Union{Bool, AbstractVector{Bool}}=false,
-          order::Union{Ordering, AbstractVector{<:Ordering}}=Forward)
+          order::Union{Ordering, AbstractVector{<:Ordering}}=Forward,
+          checkunique::Bool=false)
 
 Sort data frame `df` by column(s) `cols` by permuting its rows in-place.
 Sorting on multiple columns is done lexicographically.
@@ -682,16 +711,36 @@ function Base.sort!(df::AbstractDataFrame, cols=All();
                     lt::Union{Function, AbstractVector{<:Function}}=isless,
                     by::Union{Function, AbstractVector{<:Function}}=identity,
                     rev::Union{Bool, AbstractVector{Bool}}=false,
-                    order::Union{Ordering, AbstractVector{<:Ordering}}=Forward)
+                    order::Union{Ordering, AbstractVector{<:Ordering}}=Forward,
+                    checkunique::Bool=false)
 
     # exclude AbstractVector as in that case cols can contain order(...) clauses
     if cols isa MultiColumnIndex && !(cols isa AbstractVector)
         cols = index(df)[cols]
     end
     ord = ordering(df, cols, lt, by, rev, order)
     _alg = Sort.defalg(df, ord; alg=alg, cols=cols)
+    if checkunique
+        # `cols` may be a single selector (e.g. `:a` or `order(:a)`) rather than a
+        # collection; a `Symbol` is not iterable, so wrap a lone selector first
+        selectors = cols isa Union{AbstractVector, Tuple} ? cols : (cols,)
+        # map each selector to its column number (via `_getcol`, then `index(df)`)
+        newcols = Int[index(df)[_getcol(col)] for col in selectors]
+        if !allunique(df, newcols)
+            throw(ArgumentError("Non-unique elements found. Multiple orders " *
+                                "are valid"))
+        end
+    end
     return sort!(df, _alg, ord)
 end
 
-Base.sort!(df::AbstractDataFrame, a::Base.Sort.Algorithm, o::Base.Sort.Ordering) =
+# Permute the rows of `df` in place into the order given by `a` and `o`.
+# NOTE(review): `checkunique` tests whole rows, not just the sort columns - confirm.
+function Base.sort!(df::AbstractDataFrame, a::Base.Sort.Algorithm,
+                    o::Base.Sort.Ordering, checkunique::Bool=false)
+    if checkunique && !allunique(df)
+        throw(ArgumentError("Non-unique elements found. Multiple orders " *
+                            "are valid"))
+    end
     permute!(df, _sortperm(df, a, o))
+end
diff --git a/test/sort.jl b/test/sort.jl
@@ -16,6 +16,8 @@ using DataFrames, Random, Test, CategoricalArrays
     @test issorted(DataFrame())
     @test sortperm(d) == sortperm(dv1)
     @test sortperm(d[:, [:dv3, :dv1]]) == sortperm(dv3)
+    @test_throws ArgumentError sortperm(d, :cv1, checkunique=true)
+    @test_throws ArgumentError sortperm(d, [:cv1, :dv1], checkunique=true)
     @test sort(d, :dv1)[!, :dv3] == sort(d, "dv1")[!, "dv3"] == sortperm(dv1)
     @test sort(d, :dv2)[!, :dv3] == sortperm(dv1)
     @test sort(d, :cv1)[!, :dv3] == sortperm(dv1)
@@ -30,6 +32,7 @@ using DataFrames, Random, Test, CategoricalArrays
     @test issorted(sort(df, rev=true), rev=true)
     @test issorted(sort(df, [:chrom, :pos])[:, [:chrom, :pos]])
     @test issorted(sort(df, ["chrom", "pos"])[:, ["chrom", "pos"]])
+    @test_throws ArgumentError issorted(sort(df), :rank, checkunique=true)
 
     ds = sort(df, [order(:rank, rev=true), :chrom, :pos])
     @test issorted(ds, [order(:rank, rev=true), :chrom, :pos])