filtering dataframe using multiple columns in R

2

Hi guys,

So, I have this dataframe (for example):

   CGDid           Mass      Source.File        pepSeq
 C1_00060W_A    5117.5552   T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00060W_A    5117.5552   T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00060W_A    5117.5552   T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00061W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00063W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00062W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A

I want to filter for the rows where the columns CGDid, Mass and pepSeq are equal but Source.File is different. I am trying to figure out the best way to do this, but I am stuck.
I would really appreciate it if someone could help me!

All the best,
Andreia


filter


R


dataframe

• 35 views

Using aggregate:

# For every distinct CGDid / Mass / pepSeq combination, list the unique
# Source.File values and count them; a count above 1 marks a combination
# that was seen in more than one source file.
aggregate(
  Source.File ~ CGDid + Mass + pepSeq,
  data = d,
  FUN = function(files) {
    distinct_files <- unique(files)
    c(names = toString(distinct_files), count = length(distinct_files))
  }
)
#         CGDid     Mass                                                       pepSeq       Source.File.names Source.File.count
# 1 C1_00060W_A 6305.149 K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A T0_4_excl.raw, T0_4.raw                 2
# 2 C1_00062W_A 6305.149 K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A                T0_4.raw                 1
# 3 C1_00060W_A 5117.555               NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK T0_4_excl.raw, T0_4.raw                 2
# 4 C1_00060W_A 6304.165     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR T0_4.raw, T0_4_excl.raw                 2
# 5 C1_00061W_A 6304.165     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR                T0_4.raw                 1
# 6 C1_00063W_A 6304.165     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR                T0_4.raw                 1


# Example data: the whitespace-separated table from the question, parsed
# into the data frame `d` with the first line used as column names.
d <- read.table(
  header = TRUE,
  text = "   CGDid           Mass      Source.File        pepSeq
 C1_00060W_A    5117.5552   T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00060W_A    5117.5552   T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00060W_A    5117.5552   T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00061W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00063W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00062W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A"
)

# Read the tab-separated table from "./1.t".
t1 <- readr::read_tsv("./1.t")
# Keep one row per distinct Source.File within each CGDid / Mass / pepSeq
# group. Written as a nested call rather than with `%>%`: the pipe operator
# is not in scope when dplyr is only used via `dplyr::` and never attached.
dplyr::distinct(dplyr::group_by(t1, CGDid, Mass, pepSeq), Source.File)

output:

# A tibble: 9 x 4
# Groups:   CGDid, Mass, pepSeq [6]
  CGDid       Mass Source.File   pepSeq                                         
  <chr>      <dbl> <chr>         <chr>                                          
1 C1_00060W… 5118. T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK 
2 C1_00060W… 5118. T0_4.raw      NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK 
3 C1_00061W… 6304. T0_4.raw      NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
4 C1_00060W… 6304. T0_4.raw      NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
5 C1_00060W… 6304. T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
6 C1_00063W… 6304. T0_4.raw      NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
7 C1_00060W… 6305. T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…
8 C1_00060W… 6305. T0_4.raw      K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…
9 C1_00062W… 6305. T0_4.raw      K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…


Login
before adding your answer.

Traffic: 2422 users visited in the last hour



Source link