How do I use regex to match a substring before/after the colon?
I want to split `info` into `Refseq ID`, `cDNA level change`, and `Protein level change`, where `Refseq ID` is the substring from the start up to the first colon (`:`), `cDNA level change` is the substring between the first and second colons, and `Protein level change` is the substring after the second colon.
# The asker's original attempt, kept exactly as posted.
library(stringr)
# Read the tab-separated variant table.
df=read.csv("variant_calls.txt", sep="\t")
# Single-bracket indexing keeps `info` as a one-column data frame (see the
# dput() output below), not a character vector.
info=df["AAChange.refGene"]
# Pattern: "SRR", any one character, then six digits.
# NOTE(review): none of the sample values in the dput() output contain "SRR",
# so this pattern is not expected to match them.
id=stringr::str_extract(info, "SRR.(\\d{6})")
# NOTE(review): `id` is the result of str_extract(), not a logical vector, so
# `!id` cannot serve as a subsetting mask here.
aa=info[!id]
> dput(info)
structure(list(AAChange.refGene = c("NM_002725:c.C301T:p.P101S",
"NM_001024940:c.T1054A:p.Y352N", "NM_001098209:c.T109C:p.S37P",
"NM_152539:c.G955A:p.E319K", "NM_032421:c.A2422G:p.T808A", "NM_003141:c.G431A:p.G144E",
"NM_006645:c.C749T:p.S250L", "NM_206927:c.C778A:p.P260T", "NM_012240:c.G209A:p.G70E",
"NM_152336:c.A382C:p.K128Q", "NM_002773:c.G750C:p.W250C", "NM_001797:c.C2125T:p.R709W",
"NM_058216:c.C797A:p.A266D", "NM_198977:c.C1543T:p.R515W", "NM_000307:c.C356T:p.A119V"
)), row.names = c(NA, -15L), class = "data.frame")
Expected output:
Refseq ID | cDNA level change | Protein level change |
---|---|---|
NM_001024940 | c.T1054A | p.Y352N |
NM_001098209 | c.T109C | p.S37P |
NM_152539 | c.G955A | p.E319K |
NM_032421 | c.A2422G | p.T808A |
Solution 1:[1]
Using base R with `read.csv`, after replacing the first `:` with `,`:
# sub() rewrites only the first ":" of each entry as ",", producing two-field
# CSV text ("<id>,<rest>") that read.csv() parses into the columns id and aa.
read.csv(
  text = sub(":", ",", df[["AAChange.refGene"]], fixed = TRUE),
  header = FALSE,
  col.names = c("id", "aa")
)
id aa
1 NM_002725 c.C301T:p.P101S
2 NM_001024940 c.T1054A:p.Y352N
3 NM_001098209 c.T109C:p.S37P
4 NM_152539 c.G955A:p.E319K
5 NM_032421 c.A2422G:p.T808A
6 NM_003141 c.G431A:p.G144E
7 NM_006645 c.C749T:p.S250L
8 NM_206927 c.C778A:p.P260T
9 NM_012240 c.G209A:p.G70E
10 NM_152336 c.A382C:p.K128Q
11 NM_002773 c.G750C:p.W250C
12 NM_001797 c.C2125T:p.R709W
13 NM_058216 c.C797A:p.A266D
14 NM_198977 c.C1543T:p.R515W
15 NM_000307 c.C356T:p.A119V
If we don't need the last part after the :
# Remove the trailing ":<last field>" from each entry, then parse what is left
# ("<id>:<cDNA change>") as two ":"-separated columns.
read.csv(
  text = sub(":[^:]+$", "", df$AAChange.refGene, perl = TRUE),
  sep = ":",
  header = FALSE,
  col.names = c("id", "aa")
)
id aa
1 NM_002725 c.C301T
2 NM_001024940 c.T1054A
3 NM_001098209 c.T109C
4 NM_152539 c.G955A
5 NM_032421 c.A2422G
6 NM_003141 c.G431A
7 NM_006645 c.C749T
8 NM_206927 c.C778A
9 NM_012240 c.G209A
10 NM_152336 c.A382C
11 NM_002773 c.G750C
12 NM_001797 c.C2125T
13 NM_058216 c.C797A
14 NM_198977 c.C1543T
15 NM_000307 c.C356T
Solution 2:[2]
You could use `sub` for a base R option:
# id: the Refseq ID, i.e. everything before the first ":". The match starts at
# the first ":" and the greedy ".*" removes the rest of the string.
# (A pattern such as ".*?:([^:]+):.*" would capture the middle field instead.)
df$id <- sub(":.*", "", df$AAChange.refGene)
# aa: everything after the first ":"; the cDNA and protein changes stay joined
# by ":" (e.g. "c.C301T:p.P101S").
df$aa <- sub(".*?:", "", df$AAChange.refGene)
Solution 3:[3]
In this case, you can try using `separate()` instead of a regex.
library(tidyr)
# Split the column at every ":" into the three named columns; extra = "drop"
# silently discards any pieces beyond the third.
separate(
  info,
  col = AAChange.refGene,
  into = c("Refseq ID", "cDNA level change", "Protein level change"),
  sep = ":",
  extra = "drop"
)
Refseq ID cDNA level change Protein level change
1 NM_002725 c.C301T p.P101S
2 NM_001024940 c.T1054A p.Y352N
3 NM_001098209 c.T109C p.S37P
4 NM_152539 c.G955A p.E319K
5 NM_032421 c.A2422G p.T808A
6 NM_003141 c.G431A p.G144E
7 NM_006645 c.C749T p.S250L
8 NM_206927 c.C778A p.P260T
9 NM_012240 c.G209A p.G70E
10 NM_152336 c.A382C p.K128Q
11 NM_002773 c.G750C p.W250C
12 NM_001797 c.C2125T p.R709W
13 NM_058216 c.C797A p.A266D
14 NM_198977 c.C1543T p.R515W
15 NM_000307 c.C356T p.A119V
Solution 4:[4]
My favorite method for this kind of task is `extract`:
library(tidyr)
# Capture the three colon-delimited fields into separate columns.
# The first two groups use [^:]* so they stop at the first and second ":"
# respectively; a greedy (.*) would instead split at the last two colons if an
# entry ever contained more than two. The final group takes the remainder.
info %>%
  extract(AAChange.refGene,
          into = c("Refseq ID", "cDNA level change", "Protein level change"),
          regex = "([^:]*):([^:]*):(.*)")
Refseq ID cDNA level change Protein level change
1 NM_002725 c.C301T p.P101S
2 NM_001024940 c.T1054A p.Y352N
3 NM_001098209 c.T109C p.S37P
4 NM_152539 c.G955A p.E319K
5 NM_032421 c.A2422G p.T808A
6 NM_003141 c.G431A p.G144E
7 NM_006645 c.C749T p.S250L
8 NM_206927 c.C778A p.P260T
9 NM_012240 c.G209A p.G70E
10 NM_152336 c.A382C p.K128Q
11 NM_002773 c.G750C p.W250C
12 NM_001797 c.C2125T p.R709W
13 NM_058216 c.C797A p.A266D
14 NM_198977 c.C1543T p.R515W
15 NM_000307 c.C356T p.A119V
Solution 5:[5]
For the sake of completeness, the `fread()` function from the data.table package is quite handy for cases like this one:
# Parse the column's strings as ":"-delimited text. There is no header row, so
# the three output column names are supplied explicitly.
data.table::fread(
  text = info[["AAChange.refGene"]],
  sep = ":",
  header = FALSE,
  col.names = c("Refseq ID", "cDNA level change", "Protein level change")
)
       Refseq ID cDNA level change Protein level change
 1:    NM_002725           c.C301T              p.P101S
 2: NM_001024940          c.T1054A              p.Y352N
 3: NM_001098209           c.T109C               p.S37P
 4:    NM_152539           c.G955A              p.E319K
 5:    NM_032421          c.A2422G              p.T808A
 6:    NM_003141           c.G431A              p.G144E
 7:    NM_006645           c.C749T              p.S250L
 8:    NM_206927           c.C778A              p.P260T
 9:    NM_012240           c.G209A               p.G70E
10:    NM_152336           c.A382C              p.K128Q
11:    NM_002773           c.G750C              p.W250C
12:    NM_001797          c.C2125T              p.R709W
13:    NM_058216           c.C797A              p.A266D
14:    NM_198977          c.C1543T              p.R515W
15:    NM_000307           c.C356T              p.A119V
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | |
Solution 2 | |
Solution 3 | |
Solution 4 | |
Solution 5 | Uwe |