I have a data frame like the following:
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
<int> <int> <int> <int> <int> <int>
1 1 1 1 1 1 1
2 1 1 1 1 1 2
3 1 1 1 1 1 3
4 1 1 1 1 1 4
5 1 2 1 1 1 5
6 1 1 1 1 1 6
7 1 3 4 5 6 7
8 1 1 1 1 1 8
9 1 1 9 1 1 9
10 1 3 5 7 9 10
I'd like to filter this dataset to keep only the rows whose values from COL_1 to COL_6 are strictly increasing, so it would look like the following:
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
<int> <int> <int> <int> <int> <int>
7 1 3 4 5 6 7
10 1 3 5 7 9 10
EDIT: The code will be used in a function with a dynamic number of columns (named COL_1 to COL_N), so "basic" code such as
df %>% filter(COL_6 > COL_5 & ... & COL_2 > COL_1)
will not work in my situation. Thank you very much.
7 Answers
Method 1
df[!colSums(apply(df, 1, diff) <= 0), ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
The trick is that applying ! to a numeric vector converts non-zeros to FALSE and zeros to TRUE:
!(-3:3)
# [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE
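For intuition, here is the intermediate result on the question's df: apply(df, 1, diff) transposes, so each column of the diff matrix corresponds to one row of df, and colSums() counts the non-increasing steps per row; a count of zero means strictly increasing.
colSums(apply(df, 1, diff) <= 0)
#  1  2  3  4  5  6  7  8  9 10
#  5  4  4  4  3  4  0  4  3  0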
Method 2 (Faster)
df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ]
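For intuition: Map(`>`, df[-1], df[-ncol(df)]) compares each column with the one before it, giving a list of logical vectors, and Reduce(`&`, ...) ANDs them element-wise, so a row survives only if every step increases. A quick check on the question's df:
Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)]))
# [1] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE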
Benchmark on a larger dataset
library(microbenchmark)
bm <- microbenchmark(
MrFlick = df[Reduce(function(x, y) { list(y, x[[2]] & (x[[1]] < y)) }, df, init = list(df[[1]]-1, TRUE))[[2]], ],
Darren_1 = df[!colSums(apply(df, 1, diff) <= 0), ],
Darren_2 = df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ],
zx8754_1 = df[ apply(df, 1, function(i) !is.unsorted(i, strictly = TRUE)), ],
zx8754_2 = df[ apply(df, 1, function(i) all(rank(i) == seq.int(ncol(df)))), ],
Thomas_1 = df[rowMeans(df[-1] > df[-ncol(df)]) == 1, ],
Thomas_2 = df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ],
setup = {
df <- as.data.frame(matrix(runif(1e6, 0, 100), 1e4, 1e2))
# pick 100 rows to sort
ind <- sample(1:1e4, 100)
df[ind, ] <- t(apply(df[ind, ], 1, sort))
}
)
Unit: milliseconds
expr min lq mean median uq max neval
MrFlick 7.470067 8.731615 12.28941 9.275596 14.16288 101.26516 100
Darren_1 218.653701 239.837480 280.04091 264.328409 318.58590 451.99662 100
Darren_2 9.142422 10.514188 13.94889 10.982153 16.88846 36.41603 100
zx8754_1 51.888907 61.551056 75.29113 65.437290 73.57925 224.47821 100
zx8754_2 388.489916 411.315632 448.10555 422.858870 475.91631 732.39909 100
Thomas_1 12.699829 14.744726 19.51238 15.957306 20.82237 94.52955 100
Thomas_2 12.592598 14.836320 18.21198 15.572325 20.95979 40.23583 100
impressive benchmarking, +1 already! I found that ncol is faster than length, so I updated my solution 😛 – ThomasIsCoding
With a bit of work, you could use Reduce for this. For example:
keep <- Reduce(function(x, y) {
list(y, x[[2]] & (x[[1]] < y))
}, dd, init=list(dd[[1]]-1, TRUE))[[2]]
which(keep)
# [1] 7 10
dd[keep, ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
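To unpack the fold: the accumulator is a pair of (previous column, running "still increasing" flag), and init = list(dd[[1]] - 1, TRUE) makes the first comparison trivially TRUE. An annotated restatement of the same logic (a sketch, equivalent to the one-liner above):
step <- function(acc, col) {
  list(col,                          # current column becomes "previous" next time
       acc[[2]] & (acc[[1]] < col))  # stays TRUE only while every step increases
}
keep <- Reduce(step, dd, init = list(dd[[1]] - 1, TRUE))[[2]]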
Tested with
dd <- read.table(text="
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
1 1 1 1 1 1 1
2 1 1 1 1 1 2
3 1 1 1 1 1 3
4 1 1 1 1 1 4
5 1 2 1 1 1 5
6 1 1 1 1 1 6
7 1 3 4 5 6 7
8 1 1 1 1 1 8
9 1 1 9 1 1 9
10 1 3 5 7 9 10", header=TRUE)
Try the following base R options
rowMeans
> df[rowMeans(df[-1] - df[-ncol(df)] > 0) == 1, ]
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7 1 3 4 5 6 7
10 1 3 5 7 9 10
rowSums
> df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ]
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7 1 3 4 5 6 7
10 1 3 5 7 9 10
NA + &
> df[complete.cases(NA & (df[-1] <= df[-ncol(df)])), ]
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7 1 3 4 5 6 7
10 1 3 5 7 9 10
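A brief aside on why the NA trick works (my reading of it): NA & TRUE is NA while NA & FALSE is FALSE, so every non-increasing pair turns into an NA and complete.cases() drops that row.
NA & c(TRUE, FALSE)
# [1]    NA FALSE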
Benchmarking (borrowed from @Darren Tsai)
bm <- microbenchmark(
MrFlick = df[Reduce(function(x, y) {
list(y, x[[2]] & (x[[1]] < y))
}, df, init = list(df[[1]] - 1, TRUE))[[2]], ],
Darren_1 = df[!colSums(apply(df, 1, diff) <= 0), ],
Darren_2 = df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ],
zx8754_1 = df[apply(df, 1, function(i) !is.unsorted(i, strictly = TRUE)), ],
zx8754_2 = df[apply(df, 1, function(i) all(rank(i) == seq.int(ncol(df)))), ],
tic1 = df[rowMeans(df[-1] > df[-ncol(df)]) == 1, ],
tic2 = df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ],
tic3 = df[complete.cases(NA & (df[-1] <= df[-ncol(df)])), ],
setup = {
df <- as.data.frame(matrix(runif(1e6, 0, 100), 1e4, 1e2))
ind <- sample(1:1e4, 1e2)
df[ind, ] <- t(apply(df[ind, ], 1, sort))
},
times = 10L,
unit = "relative"
)
which gives
> bm
Unit: relative
expr min lq mean median uq max neval
MrFlick 0.9080191 0.9144448 1.028867 1.127537 1.0952573 1.0503769 10
Darren_1 16.5282125 17.7915946 19.161257 19.417784 20.7691135 19.3924344 10
Darren_2 1.0000000 1.0000000 1.000000 1.000000 1.0000000 1.0000000 10
zx8754_1 4.3833846 4.5916794 4.958092 4.617921 4.1234763 9.0226479 10
zx8754_2 27.4681979 27.4405513 25.276613 26.550560 22.5151429 24.6191662 10
tic1 1.0823147 1.3835146 1.997294 1.511849 1.5489377 6.0274525 10
tic2 1.0455388 1.0989379 1.016991 1.069731 0.9690896 0.9463357 10
tic3 2.1156887 2.1455595 2.289243 2.501887 2.1517687 2.5138369 10
One base R approach:
df[colSums(apply(df, 1, diff) > 0) == ncol(df) - 1, ]
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7 1 3 4 5 6 7
10 1 3 5 7 9 10
library(dplyr)
df |>
rowwise() |>
filter(all(diff(c_across(starts_with('COL'))) > 0)) |>
ungroup()
Output
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
<int> <int> <int> <int> <int> <int>
1 1 3 4 5 6 7
2 1 3 5 7 9 10
Data
df <- structure(list(COL_1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), COL_2 = c(1L, 1L, 1L, 1L, 2L, 1L, 3L, 1L, 1L, 3L), COL_3 = c(1L,
1L, 1L, 1L, 1L, 1L, 4L, 1L, 9L, 5L), COL_4 = c(1L, 1L, 1L, 1L,
1L, 1L, 5L, 1L, 1L, 7L), COL_5 = c(1L, 1L, 1L, 1L, 1L, 1L, 6L,
1L, 1L, 9L), COL_6 = 1:10), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
Using is.unsorted, check if the vector is unsorted, then negate:
dd[ apply(dd, 1, function(i) !is.unsorted(i, strictly = TRUE)), ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
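Note that strictly = TRUE also rejects ties, which is exactly what is needed here:
is.unsorted(c(1, 1, 2), strictly = TRUE)  # TRUE: a tie breaks strict ordering
is.unsorted(c(1, 2, 3), strictly = TRUE)  # FALSE: strictly increasing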
Using rank, check if the ranking is the same as 1:(number of columns):
dd[ apply(dd, 1, function(i) all(rank(i) == seq.int(ncol(dd)))), ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
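This works because rank() resolves ties as averages by default, so a row with any tie or decrease can never match 1:ncol(dd) exactly:
rank(c(1, 1, 2))
# [1] 1.5 1.5 3.0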
With collapse, you can combine the very efficient fdiff with fsum or fmean to make a vectorized solution over all columns:
library(collapse)
dd[!fsum(fdiff(t(dd)) <= 0), ]
#or
dd[fmean(fdiff(t(dd)) > 0) == 1, ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
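One detail worth noting (my understanding of collapse's defaults): fdiff() pads the first difference with NA, and the fast statistical functions use na.rm = TRUE by default, so fsum()/fmean() simply skip that padding.
fdiff(c(1, 3, 5))
# [1] NA  2  2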
Another solution with matrixStats:
library(matrixStats)
dd[!rowSums(rowDiffs(as.matrix(dd)) <= 0), ]
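Here rowDiffs() returns an nrow x (ncol - 1) matrix of consecutive-column differences with no NA padding, so a row is kept when none of its differences is <= 0. A quick illustration:
rowDiffs(matrix(c(1, 3, 5, 7, 9, 10), nrow = 1))
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    2    2    2    2    1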