Check monotonically increasing per row

Check monotonically increasing per row

10

I have a dataframe as the following :

   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
   <int> <int> <int> <int> <int> <int>
 1     1     1     1     1     1     1
 2     1     1     1     1     1     2
 3     1     1     1     1     1     3
 4     1     1     1     1     1     4
 5     1     2     1     1     1     5
 6     1     1     1     1     1     6
 7     1     3     4     5     6     7
 8     1     1     1     1     1     8
 9     1     1     9     1     1     9
10     1     3     5     7     9    10

I’d like to filter this dataset to keep only the rows whose values are strictly increasing from COL_1 to COL_6, so the result would be the following:

   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
   <int> <int> <int> <int> <int> <int>
 7     1     3     4     5     6     7
10     1     3     5     7     9    10

EDIT : The code should be used in a function with a dynamic number of columns (which will be named from COL_1 to COL_N). A "basic" code such as

df %>% filter(COL_6 > COL_5 & ... & COL_2 > COL_1)

will not work in my situation. Thank you very much

Share

7 Answers
7

Reset to default

Highest score (default)

Trending (recent votes count more)

Date modified (newest first)

Date created (oldest first)

10

Method 1

# A row survives only when no consecutive difference is <= 0,
# i.e. every step COL_k -> COL_(k+1) strictly increases.
df[colSums(apply(df, 1, diff) <= 0) == 0, ]

#    COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7      1     3     4     5     6     7
# 10     1     3     5     7     9    10

The trick is that applying `!` to a numeric vector converts every non-zero value to FALSE and every zero to TRUE.

# Demo: logical negation of a numeric vector — each non-zero
# element becomes FALSE, each zero becomes TRUE.
!(-3:3)
# [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE

Method 2 (Faster)

# Pair each column with its left neighbour, compare element-wise,
# then AND the resulting logical vectors across all column pairs.
lhs <- df[-1]          # columns 2..N
rhs <- df[-ncol(df)]   # columns 1..N-1
df[Reduce(`&`, Map(`>`, lhs, rhs)), ]

Benchmark on a larger dataset

library(microbenchmark)

# Time every proposed answer on a 1e4 x 1e2 numeric data frame.
# `setup` is re-evaluated before each measurement, so every
# expression sees freshly generated data.
bm <- microbenchmark(
  MrFlick = df[Reduce(function(x, y) { list(y, x[[2]] & (x[[1]] < y)) }, df, init = list(df[[1]]-1, TRUE))[[2]], ],
  Darren_1 = df[!colSums(apply(df, 1, diff) <= 0), ],
  Darren_2 = df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ],
  zx8754_1 = df[ apply(df, 1, function(i) !is.unsorted(i, strictly = TRUE)), ],
  zx8754_2 = df[ apply(df, 1, function(i) all(rank(i) == seq.int(ncol(df)))), ],
  Thomas_1 = df[rowMeans(df[-1] > df[-ncol(df)]) == 1, ],
  Thomas_2 = df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ],
  setup = {
    df <- as.data.frame(matrix(runif(1e6, 0, 100), 1e4, 1e2))
    # pick 100 rows to sort (so some rows are strictly increasing)
    ind <- sample(1:1e4, 100)
    df[ind, ] <- t(apply(df[ind, ], 1, sort))
  }
)

Unit: milliseconds
     expr        min         lq      mean     median        uq       max neval
  MrFlick   7.470067   8.731615  12.28941   9.275596  14.16288 101.26516   100
 Darren_1 218.653701 239.837480 280.04091 264.328409 318.58590 451.99662   100
 Darren_2   9.142422  10.514188  13.94889  10.982153  16.88846  36.41603   100
 zx8754_1  51.888907  61.551056  75.29113  65.437290  73.57925 224.47821   100
 zx8754_2 388.489916 411.315632 448.10555 422.858870 475.91631 732.39909   100
 Thomas_1  12.699829  14.744726  19.51238  15.957306  20.82237  94.52955   100
 Thomas_2  12.592598  14.836320  18.21198  15.572325  20.95979  40.23583   100

Share

1

  • 1

    impressive benchmarking, +1 already! I found that ncol is faster than length, so I updated my solution 😛

    – ThomasIsCoding

    yesterday

9

With a bit of work, you could use Reduce for this. For example

# Fold over the columns, threading a pair through the reduction:
# element 1 = the previous column, element 2 = a running logical
# flag that stays TRUE only while each column strictly exceeds the
# one before it. init seeds the fold with COL_1 - 1 so the first
# comparison always passes.
keep <- Reduce(function(acc, col) {
  list(col, acc[[2]] & (acc[[1]] < col))
}, dd, init = list(dd[[1]] - 1, TRUE))[[2]]
which(keep)
# [1]  7 10
dd[keep, ]
#    COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7      1     3     4     5     6     7
# 10     1     3     5     7     9    10

Tested with

# Reconstruct the example data. read.table() uses the first line of
# the embedded text as column names and the leading integers 1..10
# as row names.
dd <- read.table(text="
COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
1     1     1     1     1     1     1
2     1     1     1     1     1     2
3     1     1     1     1     1     3
4     1     1     1     1     1     4
5     1     2     1     1     1     5
6     1     1     1     1     1     6
7     1     3     4     5     6     7
8     1     1     1     1     1     8
9     1     1     9     1     1     9
10     1     3     5     7     9    10", header=TRUE)

Share

Recognized by R Language Collective

0

5

Try the following base R options

  • rowMeans
> df[rowMeans(df[-1] - df[-ncol(df)] > 0) == 1, ]
   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7      1     3     4     5     6     7
10     1     3     5     7     9    10
  • rowSums
> df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ]
   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7      1     3     4     5     6     7
10     1     3     5     7     9    10
  • NA + &
> df[complete.cases(NA & (df[-1] <= df[-ncol(df)])), ]
   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7      1     3     4     5     6     7
10     1     3     5     7     9    10

Benchmarking (borrowed from @Darren Tsai)

# Benchmark borrowed from @Darren Tsai, extended with the three
# options above (tic1..tic3); requires the microbenchmark package.
# Results are reported relative to the fastest expression.
bm <- microbenchmark(
    MrFlick = df[Reduce(function(x, y) {
        list(y, x[[2]] & (x[[1]] < y))
    }, df, init = list(df[[1]] - 1, TRUE))[[2]], ],
    Darren_1 = df[!colSums(apply(df, 1, diff) <= 0), ],
    Darren_2 = df[Reduce(`&`, Map(`>`, df[-1], df[-ncol(df)])), ],
    zx8754_1 = df[apply(df, 1, function(i) !is.unsorted(i, strictly = TRUE)), ],
    zx8754_2 = df[apply(df, 1, function(i) all(rank(i) == seq.int(ncol(df)))), ],
    tic1 = df[rowMeans(df[-1] > df[-ncol(df)]) == 1, ],
    tic2 = df[rowSums(df[-1] > df[-ncol(df)]) == ncol(df) - 1, ],
    tic3 = df[complete.cases(NA & (df[-1] <= df[-ncol(df)])), ],
    setup = {
        # fresh 1e4 x 1e2 data, with 100 rows sorted to be increasing
        df <- as.data.frame(matrix(runif(1e6, 0, 100), 1e4, 1e2))
        ind <- sample(1:1e4, 1e2)
        df[ind, ] <- t(apply(df[ind, ], 1, sort))
    },
    times = 10L,
    unit = "relative"
)

which gives

> bm
Unit: relative
     expr        min         lq      mean    median         uq        max neval
  MrFlick  0.9080191  0.9144448  1.028867  1.127537  1.0952573  1.0503769    10
 Darren_1 16.5282125 17.7915946 19.161257 19.417784 20.7691135 19.3924344    10
 Darren_2  1.0000000  1.0000000  1.000000  1.000000  1.0000000  1.0000000    10
 zx8754_1  4.3833846  4.5916794  4.958092  4.617921  4.1234763  9.0226479    10
 zx8754_2 27.4681979 27.4405513 25.276613 26.550560 22.5151429 24.6191662    10
     tic1  1.0823147  1.3835146  1.997294  1.511849  1.5489377  6.0274525    10
     tic2  1.0455388  1.0989379  1.016991  1.069731  0.9690896  0.9463357    10
     tic3  2.1156887  2.1455595  2.289243  2.501887  2.1517687  2.5138369    10

Share

Recognized by R Language Collective

4

One base R approach

# Count the positive consecutive differences in each row and keep
# the row when all ncol(df) - 1 of them are positive.
df[colSums(apply(df, 1, diff) > 0) == ncol(df) - 1, ]

   COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
7      1     3     4     5     6     7
10     1     3     5     7     9    10

Share

4

library(dplyr)

# Row-wise: keep rows whose successive differences are all positive,
# i.e. COL_1 < COL_2 < ... < COL_N for any number of columns.
df %>%
  rowwise() %>%
  filter(all(diff(c_across(starts_with('COL'))) > 0)) %>%
  ungroup()

Output

  COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
  <int> <int> <int> <int> <int> <int>
1     1     3     4     5     6     7
2     1     3     5     7     9    10

Data

# Input data (dput output): the 10 x 6 integer data frame from the
# question, with character row names "1".."10".
df <- structure(list(COL_1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L), COL_2 = c(1L, 1L, 1L, 1L, 2L, 1L, 3L, 1L, 1L, 3L), COL_3 = c(1L, 
1L, 1L, 1L, 1L, 1L, 4L, 1L, 9L, 5L), COL_4 = c(1L, 1L, 1L, 1L, 
1L, 1L, 5L, 1L, 1L, 7L), COL_5 = c(1L, 1L, 1L, 1L, 1L, 1L, 6L, 
1L, 1L, 9L), COL_6 = 1:10), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10"))

Share

3

Using is.unsorted, check whether each row (as a vector) is unsorted, then negate the result:

# Keep a row when it is NOT unsorted under strict ordering,
# i.e. when it is strictly increasing.
dd[apply(dd, 1, \(row) !is.unsorted(row, strictly = TRUE)), ]
#    COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7      1     3     4     5     6     7
# 10     1     3     5     7     9    10

Using rank, check if the ranking same as 1:number of columns:

# Keep a row when its within-row ranks are exactly 1..ncol(dd),
# which only happens when the values strictly increase (ties
# produce fractional ranks and fail the comparison).
dd[apply(dd, 1, \(row) all(rank(row) == seq.int(ncol(dd)))), ]
#    COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7      1     3     4     5     6     7
# 10     1     3     5     7     9    10

Share

Recognized by R Language Collective

2

With collapse, you can combine the very efficient fdiff with fsum or fmean to make a vectorized solution over all columns:

library(collapse)
# fdiff(t(dd)) takes column-wise differences of the transposed data,
# i.e. row-wise differences of dd; a row is kept when none of its
# differences is <= 0 (fsum of the violations is zero) ...
dd[!fsum(fdiff(t(dd)) <= 0), ]
#or
# ... equivalently, when the proportion of positive differences is 1.
dd[fmean(fdiff(t(dd)) > 0) == 1, ]

#    COL_1 COL_2 COL_3 COL_4 COL_5 COL_6
# 7      1     3     4     5     6     7
# 10     1     3     5     7     9    10

Another solution with matrixStats:

library(matrixStats)
# rowDiffs requires a matrix input; keep rows whose consecutive
# differences are all positive (no non-positive difference found).
dd[!rowSums(rowDiffs(as.matrix(dd)) <= 0), ]

Share

Recognized by R Language Collective

Your Answer

Draft saved
Draft discarded

Post as a guest

Required, but never shown


By clicking “Post Your Answer”, you agree to our terms of service and acknowledge that you have read and understand our privacy policy and code of conduct.

Not the answer you're looking for? Browse other questions tagged

or ask your own question.

Leave a Reply

Your email address will not be published. Required fields are marked *