Replacing missing data (NA) with "something else"

4

When I import a .sav file, I do not want <NA> to appear in my mydata1. Instead of NA, I would like some other value to appear, for example "something else".

mydata = read.spss('mydata.sav', use.value.labels = TRUE, to.data.frame = TRUE,
               max.value.labels = Inf, trim.factor.names = FALSE,
               trim_values = FALSE, reencode = "UTF-8")



(mydata1<- mydata[10:20,25:31])
   Q_16_O3 Q_16_O4 Q_16_O5 Q_16_O6 Q_16_O7 Q_16_O8 Q_16_O9
10    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
11    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
12    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
13    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
14    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
15    Trem    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
16    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
17    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
18    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
19    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>
20    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>

Here is the output of str and dput:

str(mydata1)

'data.frame':   11 obs. of  7 variables:

 $ Q_16_O3: Factor w/ 10 levels "Ônibus","Vans",..: NA NA NA NA NA 4 NA NA NA NA ...

 $ Q_16_O4: Factor w/ 10 levels "Ônibus","Vans",..: NA NA NA NA NA NA NA NA NA NA ...

 $ Q_16_O5: Factor w/ 10 levels "Ônibus","Vans",..: NA NA NA NA NA NA NA NA NA NA ...

 $ Q_16_O6: Factor w/ 10 levels "Ônibus","Vans",..: NA NA NA NA NA NA NA NA NA NA ...

 $ Q_16_O7: Factor w/ 10 levels "Ônibus","Vans",..: NA NA NA NA NA NA NA NA NA NA ...

 $ Q_16_O8: Factor w/ 10 levels "Ônibus","Vans",..: NA NA NA NA NA NA NA NA NA NA ...

 $ Q_16_O9: Factor w/ 10 levels "Ônibus","Vans",..: NA NA NA NA NA NA NA NA NA NA ...


dput(head(mydata1))


    structure(list(Q_16_O3 = structure(c(NA, NA, NA, NA, NA, 4L), .Label = c("Ônibus", 
    "Vans", "Metrô", "Trem", "BRT", "Barca", "Catamarã", "Fretados", 
    "VLT/Monotrilho", "Lotação (micro-ônibus especial)"), class = "factor"), 
    Q_16_O4 = structure(c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), .Label = c("Ônibus", 
    "Vans", "Metrô", "Trem", "BRT", "Barca", "Catamarã", "Fretados", 
    "VLT/Monotrilho", "Lotação (micro-ônibus especial)"), class = "factor"), 
    Q_16_O5 = structure(c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), .Label = c("Ônibus", 
    "Vans", "Metrô", "Trem", "BRT", "Barca", "Catamarã", "Fretados", 
    "VLT/Monotrilho", "Lotação (micro-ônibus especial)"), class = "factor"), 
    Q_16_O6 = structure(c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), .Label = c("Ônibus", 
    "Vans", "Metrô", "Trem", "BRT", "Barca", "Catamarã", "Fretados", 
    "VLT/Monotrilho", "Lotação (micro-ônibus especial)"), class = "factor"), 
    Q_16_O7 = structure(c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), .Label = c("Ônibus", 
    "Vans", "Metrô", "Trem", "BRT", "Barca", "Catamarã", "Fretados", 
    "VLT/Monotrilho", "Lotação (micro-ônibus especial)"), class = "factor"), 
    Q_16_O8 = structure(c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), .Label = c("Ônibus", 
    "Vans", "Metrô", "Trem", "BRT", "Barca", "Catamarã", "Fretados", 
    "VLT/Monotrilho", "Lotação (micro-ônibus especial)"), class = "factor"), 
    Q_16_O9 = structure(c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), .Label = c("Ônibus", 
    "Vans", "Metrô", "Trem", "BRT", "Barca", "Catamarã", "Fretados", 
    "VLT/Monotrilho", "Lotação (micro-ônibus especial)"), class = "factor")), .Names = c("Q_16_O3", 
    "Q_16_O4", "Q_16_O5", "Q_16_O6", "Q_16_O7", "Q_16_O8", "Q_16_O9"
     ), row.names = 10:15, class = "data.frame")
    
asked by anonymous 15.05.2014 / 23:24

1 answer

2

The default of read.spss is to transform categorical variables into factors (categories, factors).

When a variable is a factor, it only accepts the values you defined as its levels. So when you try to run mydata[is.na(mydata)]<- "Outra coisa", R will give you the following message:

 Warning messages:
1: In '[<-.factor'('*tmp*', thisvar, value = "outra coisa") :
  invalid factor level, NA generated
That is, it is warning you that the factor has no level "Outra coisa" ("something else"), and therefore R is putting NA in its place.

The first thing you have to keep in mind is this: why would you replace an NA with another category? In general, NA means that the observation does not exist, so perhaps the most appropriate choice would be to leave it as NA, because R knows how to deal with missing values.

For example, if you want to make a frequency table of the first column of mydata1, you can use the table command and it will omit the NAs (here I am using the data you provided in dput(head(mydata1)), i.e. only the first 6 observations):

table(mydata1[,1])
                         Ônibus                            Vans                           Metrô 
                              0                               0                               0 
                           Trem                             BRT                           Barca 
                              1                               0                               0 
                       Catamarã                        Fretados                  VLT/Monotrilho 
                              0                               0                               0 
Lotação (micro-ônibus especial) 
                              0 

If you want it to also count the NA's, just put the argument useNA="always" :

 table(mydata1[,1], useNA="always")

                          Ônibus                            Vans                           Metrô                            Trem                             BRT 
                              0                               0                               0                               1                               0 
                          Barca                        Catamarã                        Fretados                  VLT/Monotrilho Lotação (micro-ônibus especial) 
                              0                               0                               0                               0                               0 
                           <NA> 
                              5

Notice that a field now appeared with the 5 observations that are NA.

But assuming you really want to change the NA to something else, then I think the easiest way would be this. First make the factors of your data.frame into characters and then replace the NA with something else.

For example, with the command below you select all columns of mydata1 that are factors and transform them into character :

mydata1[sapply(mydata1, is.factor)] <- lapply(mydata1[sapply(mydata1, is.factor)], as.character)

You can now run mydata1[is.na(mydata1)] <- "Outra coisa", and it will no longer generate the warning message.

Running table on the first column, notice that we now have 5 "Outra coisa" ("something else") and 1 "Trem" ("train"):

table(mydata1[,1])
Outra coisa        Trem 
          5           1 
    
16.05.2014 / 17:59