I have a data frame like the following:
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
<int> <int> <int> <int> <int> <int>
1 1 1 1 1 1 1
2 1 1 1 1 1 2
3 1 1 1 1 1 3
4 1 1 1 1 1 4
5 1 2 1 1 1 5
6 1 1 1 1 1 6
7 1 3 4 5 6 7
8 1 1 1 1 1 8
9 1 1 9 1 1 9
10 1 3 5 7 9 10
I'd like to filter this dataset to keep only the rows whose values from COL_1 to COL_6 are strictly increasing, so it would look like the following:
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
<int> <int> <int> <int> <int> <int>
7 1 3 4 5 6 7
10 1 3 5 7 9 10
EDIT: The code will be used in a function with a dynamic number of columns (named COL_1 to COL_N), so "basic" code such as
df %>% filter(COL_6 > COL_5 & ... & COL_2 > COL_1)
will not work in my situation. Thank you very much.
7 Answers
Method 1
df[!colSums(apply(df, 1, diff) <= 0), ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
The trick is that applying ! to a numeric vector converts non-zeros to FALSE and zeros to TRUE:
!(-3:3)
# [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE
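For intuition, here is the intermediate result on the question's df: apply(df, 1, diff) transposes, so each column of the diff matrix corresponds to one row of df, and colSums() counts the non-increasing steps per row; a count of zero means strictly increasing.
colSums(apply(df, 1, diff) <= 0)
#  1  2  3  4  5  6  7  8  9 10
#  5  4  4  4  3  4  0  4  3  0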
Method 2 (Faster)
df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ]
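For intuition: Map(`>`, df[-1], df[-ncol(df)]) compares each column with the one before it, giving a list of logical vectors, and Reduce(`&`, ...) ANDs them element-wise, so a row survives only if every step increases. A quick check on the question's df:
Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)]))
# [1] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE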
Benchmark on a larger dataset
library(microbenchmark)
bm <- microbenchmark(
MrFlick = df[Reduce(function(x, y) { list(y, x[[2]] & (x[[1]] < y)) }, df, init = list(df[[1]]-1, TRUE))[[2]], ],
Darren_1 = df[!colSums(apply(df, 1, diff) <= 0), ],
Darren_2 = df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ],
zx8754_1 = df[ apply(df, 1, function(i) !is.unsorted(i, strictly = TRUE)), ],
zx8754_2 = df[ apply(df, 1, function(i) all(rank(i) == seq.int(ncol(df)))), ],
Thomas_1 = df[rowMeans(df[-1] > df[-ncol(df)]) == 1, ],
Thomas_2 = df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ],
setup = {
df <- as.data.frame(matrix(runif(1e6, 0, 100), 1e4, 1e2))
# pick 100 rows to sort
ind <- sample(1:1e4, 100)
df[ind, ] <- t(apply(df[ind, ], 1, sort))
}
)
Unit: milliseconds
expr min lq mean median uq max neval
MrFlick 7.470067 8.731615 12.28941 9.275596 14.16288 101.26516 100
Darren_1 218.653701 239.837480 280.04091 264.328409 318.58590 451.99662 100
Darren_2 9.142422 10.514188 13.94889 10.982153 16.88846 36.41603 100
zx8754_1 51.888907 61.551056 75.29113 65.437290 73.57925 224.47821 100
zx8754_2 388.489916 411.315632 448.10555 422.858870 475.91631 732.39909 100
Thomas_1 12.699829 14.744726 19.51238 15.957306 20.82237 94.52955 100
Thomas_2 12.592598 14.836320 18.21198 15.572325 20.95979 40.23583 100
impressive benchmarking, +1 already! I found that ncol is faster than length, so I updated my solution 😛 – ThomasIsCoding
With a bit of work, you could use Reduce for this. For example:
keep <- Reduce(function(x, y) {
list(y, x[[2]] & (x[[1]] < y))
}, dd, init=list(dd[[1]]-1, TRUE))[[2]]
which(keep)
# [1] 7 10
dd[keep, ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
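To unpack the fold: the accumulator is a pair of (previous column, running "still increasing" flag), and init = list(dd[[1]] - 1, TRUE) makes the first comparison trivially TRUE. An annotated restatement of the same logic (a sketch, equivalent to the one-liner above):
step <- function(acc, col) {
  list(col,                          # current column becomes "previous" next time
       acc[[2]] & (acc[[1]] < col))  # stays TRUE only while every step increases
}
keep <- Reduce(step, dd, init = list(dd[[1]] - 1, TRUE))[[2]]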
Tested with
dd <- read.table(text="
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
1 1 1 1 1 1 1
2 1 1 1 1 1 2
3 1 1 1 1 1 3
4 1 1 1 1 1 4
5 1 2 1 1 1 5
6 1 1 1 1 1 6
7 1 3 4 5 6 7
8 1 1 1 1 1 8
9 1 1 9 1 1 9
10 1 3 5 7 9 10", header=TRUE)
Try the following base R options
rowMeans
> df[rowMeans(df[-1] - df[-ncol(df)] > 0) == 1, ]
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7 1 3 4 5 6 7
10 1 3 5 7 9 10
rowSums
> df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ]
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7 1 3 4 5 6 7
10 1 3 5 7 9 10
NA + &
> df[complete.cases(NA & (df[-1] <= df[-ncol(df)])), ]
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7 1 3 4 5 6 7
10 1 3 5 7 9 10
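A brief aside on why the NA trick works (my reading of it): NA & TRUE is NA while NA & FALSE is FALSE, so every non-increasing pair turns into an NA and complete.cases() drops that row.
NA & c(TRUE, FALSE)
# [1]    NA FALSE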
Benchmarking (borrowed from @Darren Tsai)
bm <- microbenchmark(
MrFlick = df[Reduce(function(x, y) {
list(y, x[[2]] & (x[[1]] < y))
}, df, init = list(df[[1]] - 1, TRUE))[[2]], ],
Darren_1 = df[!colSums(apply(df, 1, diff) <= 0), ],
Darren_2 = df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ],
zx8754_1 = df[apply(df, 1, function(i) !is.unsorted(i, strictly = TRUE)), ],
zx8754_2 = df[apply(df, 1, function(i) all(rank(i) == seq.int(ncol(df)))), ],
tic1 = df[rowMeans(df[-1] > df[-ncol(df)]) == 1, ],
tic2 = df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ],
tic3 = df[complete.cases(NA & (df[-1] <= df[-ncol(df)])), ],
setup = {
df <- as.data.frame(matrix(runif(1e6, 0, 100), 1e4, 1e2))
ind <- sample(1:1e4, 1e2)
df[ind, ] <- t(apply(df[ind, ], 1, sort))
},
times = 10L,
unit = "relative"
)
which gives
> bm
Unit: relative
expr min lq mean median uq max neval
MrFlick 0.9080191 0.9144448 1.028867 1.127537 1.0952573 1.0503769 10
Darren_1 16.5282125 17.7915946 19.161257 19.417784 20.7691135 19.3924344 10
Darren_2 1.0000000 1.0000000 1.000000 1.000000 1.0000000 1.0000000 10
zx8754_1 4.3833846 4.5916794 4.958092 4.617921 4.1234763 9.0226479 10
zx8754_2 27.4681979 27.4405513 25.276613 26.550560 22.5151429 24.6191662 10
tic1 1.0823147 1.3835146 1.997294 1.511849 1.5489377 6.0274525 10
tic2 1.0455388 1.0989379 1.016991 1.069731 0.9690896 0.9463357 10
tic3 2.1156887 2.1455595 2.289243 2.501887 2.1517687 2.5138369 10
One base R approach:
df[colSums(apply(df, 1, diff) > 0) == ncol(df) - 1, ]
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7 1 3 4 5 6 7
10 1 3 5 7 9 10
library(dplyr)
df |>
rowwise() |>
filter(all(diff(c_across(starts_with('COL'))) > 0)) |>
ungroup()
Output
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
<int> <int> <int> <int> <int> <int>
1 1 3 4 5 6 7
2 1 3 5 7 9 10
Data
df <- structure(list(COL_1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), COL_2 = c(1L, 1L, 1L, 1L, 2L, 1L, 3L, 1L, 1L, 3L), COL_3 = c(1L,
1L, 1L, 1L, 1L, 1L, 4L, 1L, 9L, 5L), COL_4 = c(1L, 1L, 1L, 1L,
1L, 1L, 5L, 1L, 1L, 7L), COL_5 = c(1L, 1L, 1L, 1L, 1L, 1L, 6L,
1L, 1L, 9L), COL_6 = 1:10), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
Using is.unsorted, check if the vector is unsorted, then negate:
dd[ apply(dd, 1, function(i) !is.unsorted(i, strictly = TRUE)), ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
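Note that strictly = TRUE also rejects ties, which is exactly what is needed here:
is.unsorted(c(1, 1, 2), strictly = TRUE)  # TRUE: a tie breaks strict ordering
is.unsorted(c(1, 2, 3), strictly = TRUE)  # FALSE: strictly increasing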
Using rank, check if the ranking is the same as 1:(number of columns):
dd[ apply(dd, 1, function(i) all(rank(i) == seq.int(ncol(dd)))), ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
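This works because rank() resolves ties as averages by default, so a row with any tie or decrease can never match 1:ncol(dd) exactly:
rank(c(1, 1, 2))
# [1] 1.5 1.5 3.0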
With collapse, you can combine the very efficient fdiff with fsum or fmean to make a vectorized solution over all columns:
library(collapse)
dd[!fsum(fdiff(t(dd)) <= 0), ]
#or
dd[fmean(fdiff(t(dd)) > 0) == 1, ]
# COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7 1 3 4 5 6 7
# 10 1 3 5 7 9 10
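One detail worth noting (my understanding of collapse's defaults): fdiff() pads the first difference with NA, and the fast statistical functions use na.rm = TRUE by default, so fsum()/fmean() simply skip that padding.
fdiff(c(1, 3, 5))
# [1] NA  2  2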
Another solution with matrixStats:
library(matrixStats)
dd[!rowSums(rowDiffs(as.matrix(dd)) <= 0), ]
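Here rowDiffs() returns an nrow x (ncol - 1) matrix of consecutive-column differences with no NA padding, so a row is kept when none of its differences is <= 0. A quick illustration:
rowDiffs(matrix(c(1, 3, 5, 7, 9, 10), nrow = 1))
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    2    2    2    2    1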