I have downloaded some processed Microarray data from ArrayExpress (Affymetrix GeneChip Human Genome U133 Plus 2.0). This is normalised data and it looks like below in a dataframe df:

enter image description here

This is the dput(df)

structure(list(Samples = structure(1:9, .Label = c("H_106.CD.act", 
    "H_106.CD.nact", "H_107.CD.act", "H_107.CD.nact", "H_340.normal", 
    "H_404.CD.act", "H_404.CD.nact", "H_738.normal", "H_755.normal"
    ), class = "factor"), Type = structure(c(1L, 2L, 1L, 2L, 3L, 
    1L, 2L, 3L, 3L), .Label = c("Active CD", "Non-Active CD", "Normal"
    ), class = "factor"), PGAM5 = structure(c(H_106.CD.act = 6L, 
    H_106.CD.nact = 4L, H_107.CD.act = 8L, H_107.CD.nact = 1L, H_340.normal = 3L, 
    H_404.CD.act = 7L, H_404.CD.nact = 9L, H_738.normal = 5L, H_755.normal = 2L
    ), .Label = c("4.571231311", "4.755115729", "4.887622107", "4.891329464", 
    "4.912189399", "5.46180878", "5.49774779", "5.612888254", "5.880677067"
    ), class = "factor"), NME1 = structure(c(H_106.CD.act = 1L, H_106.CD.nact = 9L, 
    H_107.CD.act = 3L, H_107.CD.nact = 7L, H_340.normal = 5L, H_404.CD.act = 2L, 
    H_404.CD.nact = 4L, H_738.normal = 6L, H_755.normal = 8L), .Label = c("10.02692043", 
    "10.04369937", "10.57609398", "10.65706982", "8.221264698", "8.906353951", 
    "9.395091983", "9.533567976", "9.676355234"), class = "factor"), 
        LHPP = structure(c(H_106.CD.act = 4L, H_106.CD.nact = 5L, 
        H_107.CD.act = 1L, H_107.CD.nact = 6L, H_340.normal = 7L, 
        H_404.CD.act = 2L, H_404.CD.nact = 3L, H_738.normal = 9L, 
        H_755.normal = 8L), .Label = c("6.344182108", "6.48823957", 
        "6.514741929", "6.562740787", "6.831723902", "7.071119084", 
        "7.188415855", "7.243049713", "7.290671656"), class = "factor"), 
        PHPT1 = structure(c(H_106.CD.act = 5L, H_106.CD.nact = 2L, 
        H_107.CD.act = 7L, H_107.CD.nact = 8L, H_340.normal = 4L, 
        H_404.CD.act = 6L, H_404.CD.nact = 3L, H_738.normal = 1L, 
        H_755.normal = 9L), .Label = c("10.04890824", "10.08906847", 
        "10.215382", "10.30426286", "9.59467692", "9.610542319", 
        "9.787960611", "9.821975201", "9.893869572"), class = "factor")), row.names = c(NA, 
    -9L), class = "data.frame")

I tried making a box plot out of the above data and wanted to check the significance between each Type.

library(reshape2)
library(ggplot2)
df.n <- melt(final6, c("Samples", "Type"))

positions <- c("Normal", "Active CD", "Non-Active CD")
library(ggplot2)
library(ggsignif)
library(EnvStats)
library(ggpubr)
library(forcats)

r <- ggplot(data = df.n, aes(x=fct_reorder(Type, value), y=value)) + 
  geom_boxplot() + facet_wrap(~variable) +
  geom_signif(comparisons = list(c("Normal","Active CD"),
                                 c("Normal","Non-Active CD"), c("Active CD","Non-Active CD")),
              map_signif_level = TRUE, y_position = c(8,9,10)) + 
  theme_bw(base_size = 14) + xlab("")+
  theme(axis.text=element_text(size=15, face = "bold", color = "black"),
        axis.title=element_text(size=15, face = "bold", color = "black"),
        strip.text = element_text(size=15, face = "bold", color = "black"))
r + stat_n_text(size = 4) + scale_x_discrete(limits = positions) + ylab("Normalized Expression")

This gave me an output like below:

enter image description here

May I know why the data looks like that in the box plot? Do I need to normalise this data again? Any solution to make the box plot look better?



Source link