Creating BoxPlot in ggplot2

2

I'm running some simulations in R. When I generate the data frame and plot boxplots with a small sample size and a small number of samples, the boxplots are generated perfectly. The problem is that when I increase the sample size and the number of samples, the boxplots no longer match the data. For example, I exported my data to Excel spreadsheets and generated the boxplots from those spreadsheets, but I keep getting wrong boxplots, as you can see below. I cannot find the problem!

## Box plot of the simulated parameter values.

library(ggplot2)
library(dplyr)
library(readr)
# Read the full simulation output (semicolon-separated file).
df1 <- read.csv(
  url("https://cdn.rawgit.com/fsbmat/StackOverflow/58046aff/d1.csv"),
  sep = ";"
)
# NOTE(review): attach() is kept to mirror the original session, but nothing
# below relies on it; ggplot() resolves the columns through its data argument.
attach(df1)

# Fix the order in which the parameters appear on the x axis.
df1$parametros <- factor(
  df1$parametros,
  levels = c(
    "gamma0", "gamma1", "gamma2",
    "beta0", "beta1", "beta2",
    "phi1", "rho"
  ),
  ordered = TRUE
)

# Box plot per parameter, with error-bar whiskers and the raw points on top.
# NOTE(review): scale limits discard values outside [-0.5, 2] before the box
# statistics are computed; coord_cartesian(ylim = ...) would zoom instead.
g1 <- ggplot(df1, aes(x = parametros, y = val_Sim)) +
  geom_boxplot(aes(fill = parametros), alpha = 0.6, size = 0.5) +
  stat_boxplot(geom = "errorbar") +
  geom_point() +
  guides(fill = FALSE) +
  labs(title = "Boxplot com os valores estimados", x = "Parâmetros") +
  scale_y_continuous(
    name = "Valores Estimados",
    limits = c(-0.5, 2),
    breaks = seq(-0.5, 2, by = 0.5)
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Read the separately exported file for the gamma parameters.
df2 <- read.csv(
  url("https://cdn.rawgit.com/fsbmat/StackOverflow/58046aff/d2.csv"),
  sep = ";"
)

# NOTE(review): attach() is not required by the plotting code below.
attach(df2)

# Fix the order in which the gamma parameters appear on the x axis.
df2$parametros <- factor(
  df2$parametros,
  levels = c("gamma0", "gamma1", "gamma2"),
  ordered = TRUE
)

# Box plot per gamma parameter, with error-bar whiskers and the raw points.
g2 <- ggplot(df2, aes(x = parametros, y = val_Sim)) +
  geom_boxplot(aes(fill = parametros), alpha = 0.6, size = 0.5) +
  stat_boxplot(geom = "errorbar") +
  geom_point() +
  guides(fill = FALSE) +
  labs(title = "Boxplot com os valores estimados", x = "Parâmetros") +
  scale_y_continuous(
    name = "Valores Estimados",
    limits = c(-0.5, 2),
    breaks = seq(-0.5, 2, by = 0.5)
  ) +
  theme(plot.title = element_text(hjust = 0.5))


# Read the separately exported file for the beta parameters.
df3 <- read.csv(
  url("https://cdn.rawgit.com/fsbmat/StackOverflow/58046aff/d3.csv"),
  sep = ";"
)
# NOTE(review): attach() is not required by the plotting code below.
attach(df3)

# Fix the order in which the beta parameters appear on the x axis.
df3$parametros <- factor(
  df3$parametros,
  levels = c("beta0", "beta1", "beta2"),
  ordered = TRUE
)

# Box plot per beta parameter, with error-bar whiskers and the raw points.
g3 <- ggplot(df3, aes(x = parametros, y = val_Sim)) +
  geom_boxplot(aes(fill = parametros), alpha = 0.6, size = 0.5) +
  stat_boxplot(geom = "errorbar") +
  geom_point() +
  guides(fill = FALSE) +
  labs(title = "Boxplot com os valores estimados", x = "Parâmetros") +
  scale_y_continuous(
    name = "Valores Estimados",
    limits = c(-0.5, 2),
    breaks = seq(-0.5, 2, by = 0.5)
  ) +
  theme(plot.title = element_text(hjust = 0.5))


# Read the separately exported file for phi1 and rho.
df4 <- read.csv(
  url("https://cdn.rawgit.com/fsbmat/StackOverflow/58046aff/d4.csv"),
  sep = ";"
)
# NOTE(review): attach() is not required by the plotting code below.
attach(df4)

# Fix the order in which the two parameters appear on the x axis.
df4$parametros <- factor(
  df4$parametros,
  levels = c("phi1", "rho"),
  ordered = TRUE
)

# Box plot for phi1 and rho; this panel uses finer y-axis breaks (0.25).
g4 <- ggplot(df4, aes(x = parametros, y = val_Sim)) +
  geom_boxplot(aes(fill = parametros), alpha = 0.6, size = 0.5) +
  stat_boxplot(geom = "errorbar") +
  geom_point() +
  guides(fill = FALSE) +
  labs(title = "Boxplot com os valores estimados", x = "Parâmetros") +
  scale_y_continuous(
    name = "Valores Estimados",
    limits = c(-0.5, 2),
    breaks = seq(-0.5, 2, by = 0.25)
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Draw the four plots together in a single grid layout.
library(gridExtra)
grid.arrange(g1, g2, g3, g4)

    
asked by anonymous 04.03.2017 / 18:32

1 answer

1

For some reason that I do not know, your files d2.csv, d3.csv and d4.csv (which you read into df2, df3 and df4) are wrong. See, for example, what happens when I calculate the median for each group using df1, df2_original (which is your df2) and df2_correto (which I created from df1 by selecting only the rows with gamma values):

# Median of the simulated values for each parameter in the full data set.
df1 <- read.csv(
  url("https://cdn.rawgit.com/fsbmat/StackOverflow/58046aff/d1.csv"),
  sep = ";"
)
df1 %>%
  group_by(parametros) %>%
  summarise(mediana = median(val_Sim, na.rm = TRUE))
# A tibble: 8 × 2
  parametros   mediana
      <fctr>     <dbl>
1      beta0 0.9406746
2      beta1 0.8604181
3      beta2 0.2352544
4     gamma0 0.7085335
5     gamma1 0.8904442
6     gamma2 0.9441261
7       phi1 0.2305419
8        rho 0.2301348
# Same per-parameter medians, computed from the asker's separate gamma file.
df2_original <- read.csv(
  url("https://cdn.rawgit.com/fsbmat/StackOverflow/58046aff/d2.csv"),
  sep = ";"
)
df2_original %>%
  group_by(parametros) %>%
  summarise(mediana = median(val_Sim, na.rm = TRUE))
# A tibble: 3 × 2
  parametros   mediana
      <fctr>     <dbl>
1     gamma0 0.7068794
2     gamma1 0.9087855
3     gamma2 0.8771044
# Gamma rows taken directly from df1, then the same per-parameter medians.
df2_correto <- filter(df1, parametros %in% c("gamma0", "gamma1", "gamma2"))
df2_correto %>%
  group_by(parametros) %>%
  summarise(mediana = median(val_Sim, na.rm = TRUE))
# A tibble: 3 × 2
  parametros   mediana
      <fctr>     <dbl>
1     gamma0 0.7085335
2     gamma1 0.8904442
3     gamma2 0.9441261

Note that the medians of df2_correto are identical to those of df1. They are even in ascending order, as the boxplot with all parameters suggests. So I suspect there was some error in creating your original subsets, i.e. the files d2.csv, d3.csv and d4.csv have a problem.

Also, note that the syntax of the filter function is fairly simple. It was trivial to select the rows I was interested in and build a data frame containing only the values of gamma0, gamma1 and gamma2. I did not need to create separate files for this, which leaves the code a bit more organized and easier to debug.

So, by doing this, I was able to get the desired plots without having to read any .csv file other than the original d1.csv. For this, I used the filter function from the dplyr package:

library(ggplot2)
library(dplyr)
library(readr)
library(gridExtra)

# Read the full simulation output; every subset below is derived from it.
df1 <- read.csv(
  url("https://cdn.rawgit.com/fsbmat/StackOverflow/58046aff/d1.csv"),
  sep = ";"
)

# Fix the order in which the parameters appear on the x axis.
df1$parametros <- factor(
  df1$parametros,
  levels = c(
    "gamma0", "gamma1", "gamma2",
    "beta0", "beta1", "beta2",
    "phi1", "rho"
  ),
  ordered = TRUE
)

# Box plot per parameter, with error-bar whiskers and the raw points on top.
# NOTE(review): scale limits discard values outside [-0.5, 2] before the box
# statistics are computed; coord_cartesian(ylim = ...) would zoom instead.
g1 <- ggplot(df1, aes(x = parametros, y = val_Sim)) +
  geom_boxplot(aes(fill = parametros), alpha = 0.6, size = 0.5) +
  stat_boxplot(geom = "errorbar") +
  geom_point() +
  guides(fill = FALSE) +
  labs(title = "Boxplot com os valores estimados", x = "Parâmetros") +
  scale_y_continuous(
    name = "Valores Estimados",
    limits = c(-0.5, 2),
    breaks = seq(-0.5, 2, by = 0.5)
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Gamma rows taken directly from df1 instead of from a separate file.
df2_correto <- filter(df1, parametros %in% c("gamma0", "gamma1", "gamma2"))

# Box plot per gamma parameter, with error-bar whiskers and the raw points.
# The plot must use df2_correto: no `df2` is created in this script, so
# ggplot(df2, ...) would fail or pick up a stale object from an earlier session.
g2 <- ggplot(df2_correto, aes(x = parametros, y = val_Sim)) +
  geom_boxplot(aes(fill = parametros), alpha = 0.6, size = 0.5) +
  stat_boxplot(geom = "errorbar") +
  guides(fill = FALSE) +
  geom_point() +
  ggtitle("Boxplot com os valores estimados") +
  xlab("Parâmetros") +
  scale_y_continuous(
    name = "Valores Estimados",
    breaks = seq(-0.5, 2, 0.5),
    limits = c(-0.5, 2)
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Beta rows taken directly from df1.
df3 <- filter(df1, parametros %in% c("beta0", "beta1", "beta2"))

# Box plot per beta parameter, with error-bar whiskers and the raw points.
g3 <- ggplot(df3, aes(x = parametros, y = val_Sim)) +
  geom_boxplot(aes(fill = parametros), alpha = 0.6, size = 0.5) +
  stat_boxplot(geom = "errorbar") +
  geom_point() +
  guides(fill = FALSE) +
  labs(title = "Boxplot com os valores estimados", x = "Parâmetros") +
  scale_y_continuous(
    name = "Valores Estimados",
    limits = c(-0.5, 2),
    breaks = seq(-0.5, 2, by = 0.5)
  ) +
  theme(plot.title = element_text(hjust = 0.5))


# phi1 and rho rows taken directly from df1.
df4 <- filter(df1, parametros %in% c("phi1", "rho"))

# Box plot for phi1 and rho; this panel uses finer y-axis breaks (0.25).
g4 <- ggplot(df4, aes(x = parametros, y = val_Sim)) +
  geom_boxplot(aes(fill = parametros), alpha = 0.6, size = 0.5) +
  stat_boxplot(geom = "errorbar") +
  geom_point() +
  guides(fill = FALSE) +
  labs(title = "Boxplot com os valores estimados", x = "Parâmetros") +
  scale_y_continuous(
    name = "Valores Estimados",
    limits = c(-0.5, 2),
    breaks = seq(-0.5, 2, by = 0.25)
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Draw the four plots together in a single grid layout.
grid.arrange(g1, g2, g3, g4)

Also, note that you do not have to reorder the parametros levels for each new data frame, because they were already ordered in the original data frame. And, finally, it is not necessary to use attach, which is redundant given the syntax of ggplot and dplyr.

    
04.03.2017 / 20:13