How do I use regex to match a substring before/after the colon?

I want to split info into Refseq ID, cDNA level change and Protein level change, where Refseq ID represents the substring from the start to the first colon :, cDNA level change is the substring between the first and second colon, and Protein level change is the substring after the second colon.

library(stringr)

# Asker's attempt (does not produce the desired columns).
# Read the tab-separated file "variant_calls.txt" into a data frame.
df=read.csv("variant_calls.txt", sep="\t")
# Single-bracket indexing keeps the selection as a one-column data frame
# (see the dput() output below), not a character vector.
info=df["AAChange.refGene"]
# Pattern: literal "SRR", any one character, then six digits. None of the
# sample values shown below contain "SRR".
# NOTE(review): `info` is a data frame here; str_extract() is presumably meant
# to receive the column as a character vector -- confirm.
id=stringr::str_extract(info, "SRR.(\\d{6})")
# NOTE(review): `!` is applied to the (character) result of str_extract(),
# which does not look like a valid logical index -- likely the failing step.
aa=info[!id]

> dput(info)
structure(list(AAChange.refGene = c("NM_002725:c.C301T:p.P101S", 
"NM_001024940:c.T1054A:p.Y352N", "NM_001098209:c.T109C:p.S37P", 
"NM_152539:c.G955A:p.E319K", "NM_032421:c.A2422G:p.T808A", "NM_003141:c.G431A:p.G144E", 
"NM_006645:c.C749T:p.S250L", "NM_206927:c.C778A:p.P260T", "NM_012240:c.G209A:p.G70E", 
"NM_152336:c.A382C:p.K128Q", "NM_002773:c.G750C:p.W250C", "NM_001797:c.C2125T:p.R709W", 
"NM_058216:c.C797A:p.A266D", "NM_198977:c.C1543T:p.R515W", "NM_000307:c.C356T:p.A119V"
)), row.names = c(NA, -15L), class = "data.frame")

Expected output:

Refseq ID cDNA level change Protein level change
NM_001024940 c.T1054A p.Y352N
NM_001098209 c.T109C p.S37P
NM_152539 c.G955A p.E319K
NM_032421 c.A2422G p.T808A


Solution 1:[1]

Using base R with read.csv after replacing the first : with ,

# Turn only the first ":" of each string into "," so that read.csv() sees two
# comma-separated fields: the RefSeq ID and the remaining "cDNA:protein" part.
utils::read.csv(
  text = sub(pattern = ":", replacement = ",",
             x = df$AAChange.refGene, fixed = TRUE),
  col.names = c("id", "aa"),
  header = FALSE
)
             id               aa
1     NM_002725  c.C301T:p.P101S
2  NM_001024940 c.T1054A:p.Y352N
3  NM_001098209   c.T109C:p.S37P
4     NM_152539  c.G955A:p.E319K
5     NM_032421 c.A2422G:p.T808A
6     NM_003141  c.G431A:p.G144E
7     NM_006645  c.C749T:p.S250L
8     NM_206927  c.C778A:p.P260T
9     NM_012240   c.G209A:p.G70E
10    NM_152336  c.A382C:p.K128Q
11    NM_002773  c.G750C:p.W250C
12    NM_001797 c.C2125T:p.R709W
13    NM_058216  c.C797A:p.A266D
14    NM_198977 c.C1543T:p.R515W
15    NM_000307  c.C356T:p.A119V

If we don't need the last part after the :

# Remove the final ":field" from each string, then parse what is left as two
# ":"-separated columns (RefSeq ID and cDNA change).
read.csv(
  text = sub(":[^:]+$", "", df$AAChange.refGene, perl = TRUE),
  sep = ":",
  header = FALSE,
  col.names = c("id", "aa")
)
             id       aa
1     NM_002725  c.C301T
2  NM_001024940 c.T1054A
3  NM_001098209  c.T109C
4     NM_152539  c.G955A
5     NM_032421 c.A2422G
6     NM_003141  c.G431A
7     NM_006645  c.C749T
8     NM_206927  c.C778A
9     NM_012240  c.G209A
10    NM_152336  c.A382C
11    NM_002773  c.G750C
12    NM_001797 c.C2125T
13    NM_058216  c.C797A
14    NM_198977 c.C1543T
15    NM_000307  c.C356T

Solution 2:[2]

You could use sub for a base R option:

# Split each "RefSeq:cDNA:protein" string at its first colon.
# id: everything before the first ":" (the RefSeq accession, e.g. "NM_002725").
# sub() replaces the leftmost match, so ":.*" removes from the first colon on.
df$id <- sub(":.*", "", df$AAChange.refGene)
# aa: everything after the first ":" (e.g. "c.C301T:p.P101S"). The lazy ".*?"
# stops at the first colon, so only the leading ID and its colon are removed.
df$aa <- sub(".*?:", "", df$AAChange.refGene)

Solution 3:[3]

In this case, you can try using separate() instead of a regex.

library(tidyr)

# Split the annotation at ":" into its three parts. extra = "merge" keeps
# everything after the second colon in the last column; "drop" would silently
# discard any additional colon-separated pieces.
info %>%
  separate(AAChange.refGene,
           into = c("Refseq ID", "cDNA level change", "Protein level change"),
           sep = ":", extra = "merge")
      Refseq ID cDNA level change Protein level change
1     NM_002725           c.C301T              p.P101S
2  NM_001024940          c.T1054A              p.Y352N
3  NM_001098209           c.T109C               p.S37P
4     NM_152539           c.G955A              p.E319K
5     NM_032421          c.A2422G              p.T808A
6     NM_003141           c.G431A              p.G144E
7     NM_006645           c.C749T              p.S250L
8     NM_206927           c.C778A              p.P260T
9     NM_012240           c.G209A               p.G70E
10    NM_152336           c.A382C              p.K128Q
11    NM_002773           c.G750C              p.W250C
12    NM_001797          c.C2125T              p.R709W
13    NM_058216           c.C797A              p.A266D
14    NM_198977          c.C1543T              p.R515W
15    NM_000307           c.C356T              p.A119V

Solution 4:[4]

My favorite method for this kind of task is extract:

library(tidyr)
# Capture the three colon-delimited fields. "[^:]*" cannot cross a colon, so
# the first two groups end at the first and second colon respectively; a
# greedy ".*" would instead pull any extra colons into the first group.
info %>%
  extract(AAChange.refGene,
          into = c("Refseq ID", "cDNA level change", "Protein level change"),
          regex = "^([^:]*):([^:]*):(.*)$")
      Refseq ID cDNA level change Protein level change
1     NM_002725           c.C301T              p.P101S
2  NM_001024940          c.T1054A              p.Y352N
3  NM_001098209           c.T109C               p.S37P
4     NM_152539           c.G955A              p.E319K
5     NM_032421          c.A2422G              p.T808A
6     NM_003141           c.G431A              p.G144E
7     NM_006645           c.C749T              p.S250L
8     NM_206927           c.C778A              p.P260T
9     NM_012240           c.G209A               p.G70E
10    NM_152336           c.A382C              p.K128Q
11    NM_002773           c.G750C              p.W250C
12    NM_001797          c.C2125T              p.R709W
13    NM_058216           c.C797A              p.A266D
14    NM_198977          c.C1543T              p.R515W
15    NM_000307           c.C356T              p.A119V

Solution 5:[5]

For the sake of completeness, the fread() function from the data.table package is quite handy for cases like this one:

# Parse the annotation strings as ":"-delimited text; the input has no header
# row, so the three column names are supplied explicitly.
data.table::fread(
  text = info[["AAChange.refGene"]],
  sep = ":",
  header = FALSE,
  col.names = c("Refseq ID", "cDNA level change", "Protein level change")
)
       Refseq ID cDNA level change Protein level change
 1:    NM_002725           c.C301T              p.P101S
 2: NM_001024940          c.T1054A              p.Y352N
 3: NM_001098209           c.T109C               p.S37P
 4:    NM_152539           c.G955A              p.E319K
 5:    NM_032421          c.A2422G              p.T808A
 6:    NM_003141           c.G431A              p.G144E
 7:    NM_006645           c.C749T              p.S250L
 8:    NM_206927           c.C778A              p.P260T
 9:    NM_012240           c.G209A               p.G70E
10:    NM_152336           c.A382C              p.K128Q
11:    NM_002773           c.G750C              p.W250C
12:    NM_001797          c.C2125T              p.R709W
13:    NM_058216           c.C797A              p.A266D
14:    NM_198977          c.C1543T              p.R515W
15:    NM_000307           c.C356T              p.A119V

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1
Solution 2
Solution 3
Solution 4
Solution 5 Uwe