Remove elements from dataset by factors

3

Given the following dataset:

# Reproducible example data: 30 rows, one grouping column and two numeric ones.
set.seed(1)
# Draw 30 group labels from 1..12 with replacement, so level counts vary.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
fatores <- sample(1:12, 30, replace = TRUE)
# 60 standard-normal draws arranged as a 30 x 2 matrix.
x <- matrix(rnorm(60), ncol = 2)
# cbind() leaves the matrix columns unnamed, so data.frame() names them
# V2 and V3.
dataset <- cbind(fatores, x)
dataset <- data.frame(dataset)
# The labels are categories, not quantities: store them as a factor.
dataset$fatores <- as.factor(dataset$fatores)

I want to remove the rows belonging to factor levels that occur 2 times or fewer. In the example above, level 1 occurs only in rows 10 and 27, so both of those rows would be removed. How can I do this?

    
asked by anonymous 09.05.2016 / 16:23

3 answers

3

You do not need anything complicated for this: just count the occurrences of each level using table, and then remove the rows whose level occurs no more often than the limit. For example:

# Number of rows for each level of `fatores`.
tb <- table(dataset$fatores)
# TRUE for the rows to keep: those whose level is not among the levels
# that occur 2 times or fewer.
rem <- is.na(match(dataset$fatores, names(tb)[tb <= 2]))
dataset[rem, ]

#    fatores          V2          V3
# 2        5 -0.01619026  0.36458196
# 4       11  0.82122120 -0.11234621
# 5        3  0.59390132  0.88110773
# 6       11  0.91897737  0.39810588
# 7       12  0.78213630 -0.61202639
# 8        8  0.07456498  0.34111969
# 9        8 -1.98935170 -1.12936310
# 11       3 -0.05612874  1.98039990
# 12       3 -0.15579551 -0.36722148
# 14       5 -0.47815006  0.56971963
# 18      12  0.38767161  0.68973936
# 19       5 -0.05380504  0.02800216
# 21      12 -0.41499456  0.18879230
# 22       3 -0.39428995 -1.80495863
# 23       8 -0.05931340  1.46555486
# 26       5 -0.16452360  0.47550953
# 28       5  0.69696338  0.61072635
# 29      11  0.55666320 -0.93409763
# 30       5 -0.68875569 -1.25363340

In this case, all rows belonging to the factor levels c(1, 2, 4, 6, 7, 9, 10) have been removed.

You can apply the same logic in other ways. Using sapply to create a vector with the count, and then filter through it:

# For each row, count the rows that share its level of `fatores`;
# a row is kept when that count is greater than 2.
rem <- sapply(
  seq_len(nrow(dataset)),
  function(i) sum(dataset$fatores == dataset$fatores[i])
) > 2
dataset[rem, ]

Or use dplyr, counting row by row how many times that row's factor level occurs and using this count as the filter criterion.

library(dplyr)
# Evaluate the filter one row at a time: `fatores` is the current row's
# level, while `.$fatores` is the whole column of the piped-in data.
dataset %>%
  rowwise() %>%
  filter(sum(.$fatores == fatores) > 2)

A tip: when creating random variables that are not meant to represent numbers, it is better to use letters, which makes the results easier to interpret. In your case, the levels could be letters[1:12].

    
09.05.2016 / 19:01
1

Thinking about it, I came up with the following function. It's not very fast, but it does the job.

# Drop every row whose level in the first column occurs `numero_minimo`
# times or fewer (numero_minimo is 2 in the example).
#
# Arguments:
#   dataset1       data frame (or matrix) whose first column holds the
#                  grouping factor.
#   numero_minimo  levels with this many rows or fewer are removed.
#
# Returns `dataset1` restricted to the rows whose level occurs more than
# `numero_minimo` times, in their original order. Rows with NA in the
# first column are kept, because table() does not count NA as a level.
remove_fatores <- function(dataset1, numero_minimo) {
  fatores <- dataset1[, 1]
  # Count every level once, up front. Removing rows inside a loop while
  # still indexing with positions taken from the original column deletes
  # the wrong rows as soon as the data has shrunk.
  contagens <- table(fatores)
  niveis_raros <- names(contagens)[as.vector(contagens) <= numero_minimo]
  manter <- !(as.character(fatores) %in% niveis_raros)
  # drop = FALSE keeps the input's two-dimensional shape even when only
  # one row or one column remains.
  dataset1[manter, , drop = FALSE]
}
    
09.05.2016 / 16:58
1

You can do this too:

> dataset %>% 
+   group_by(fatores) %>%
+   summarise(n = n()) %>%
+   filter(n > 2) %>%
+   left_join(dataset, by = "fatores") %>%
+   select(-n)
Source: local data frame [19 x 3]

   fatores          V2          V3
    (fctr)       (dbl)       (dbl)
1        3  0.59390132  0.88110773
2        3 -0.05612874  1.98039990
3        3 -0.15579551 -0.36722148
4        3 -0.39428995 -1.80495863
5        5 -0.01619026  0.36458196
6        5 -0.47815006  0.56971963
7        5 -0.05380504  0.02800216
8        5 -0.16452360  0.47550953
9        5  0.69696338  0.61072635
10       5 -0.68875569 -1.25363340
11       8  0.07456498  0.34111969
12       8 -1.98935170 -1.12936310
13       8 -0.05931340  1.46555486
14      11  0.82122120 -0.11234621
15      11  0.91897737  0.39810588
16      11  0.55666320 -0.93409763
17      12  0.78213630 -0.61202639
18      12  0.38767161  0.68973936
19      12 -0.41499456  0.18879230

I found it interesting to benchmark all solutions. The order I put here is:

  • sol1: first @molx solution, using only base
  • sol2: second solution of @molx, using sapply
  • sol3: third solution of @molx, using dplyr
  • sol4: @Wagner Jorge solution
  • sol5: my solution using dplyr

The results are below:

Unit: microseconds
 expr      min        lq      mean   median       uq      max neval   cld
 sol1  214.468  241.1875  259.3642  255.521  273.039  381.158   100 a    
 sol2 2437.349 2586.5195 2770.6785 2665.087 2733.745 4688.896   100   c  
 sol3 3200.015 3373.2525 3574.7309 3505.259 3658.677 5173.747   100     e
 sol4 3072.610 3234.3445 3386.9734 3349.010 3432.178 4997.856   100    d 
 sol5 1526.396 1640.8840 1742.2747 1721.752 1807.574 3616.563   100  b   
Below is the complete code to reproduce the benchmark.

library(microbenchmark)
# Time the five solutions against the same `dataset`. The pipelines need
# dplyr attached, and sol4 needs `remove_fatores` to be defined already.
# Every solution uses 2 as the threshold.
microbenchmark(
  # sol1: base R -- count the levels with table(), then drop the rows whose
  # level occurs 2 times or fewer.
  sol1 = {tb <- table(dataset$fatores)
  rem <- !(dataset$fatores %in% names(tb[tb <= 2]))
  dataset[rem, ]},
  # sol2: base R -- for each row, count the rows sharing its level via sapply.
  sol2 = {rem <- sapply(seq_len(nrow(dataset)), function(i) {
    sum(dataset$fatores[i] == dataset$fatores)
  }) > 2
  dataset[rem, ]},
  # sol3: dplyr -- row-by-row filter comparing each row's level with the
  # whole column.
  sol3 = {dataset %>% rowwise() %>% filter(sum(fatores == .$fatores) > 2)},
  # sol4: the remove_fatores() function, called with numero_minimo = 2.
  sol4 = {remove_fatores(dataset,2)},
  # sol5: dplyr -- count rows per level, keep levels with more than 2 rows,
  # join back to `dataset` to recover their rows, then drop the count column.
  sol5 = {dataset %>% 
      group_by(fatores) %>%
      summarise(n = n()) %>%
      filter(n > 2) %>%
      left_join(dataset, by = "fatores") %>%
      select(-n)}
)
    
09.05.2016 / 16:51