Placing Commas Between Names
I am trying to find out if certain patterns appear within a data frame.
Suppose I have the following "dictionary of patterns" (notice "james" vs "jamesj"):
patterns <- c("john", "jack", "james", "jamesj", "jason")
The actual data frame ("data_frame") I have looks like this:
id names
1 1 johnjack jameS
2 2 john/james, jasonjames
3 3 peter_jackjason
4 4 jamesjasonj jack
5 5 jamesjjason, johnjasonjohn , jason-jack sam _ peter
The final result I am trying to produce should look like this:
id names
1 1 john, jack, james
2 2 john, james, jason, james
3 3 peter, jack, jason
4 4 jamesj, asonj, jack
5 5 jamesj, jason, john, jason, john , jason, jack, sam , peter
I tried looking at this post here (R: insert comma after each element from the output) and tried the answer provided there:
> data_frame$parsed_names = dput(data_frame$names)
id names parsed_names
1 1 john, jack, james john, jack, james
2 2 john, james, jason, james john, james, jason, james
3 3 peter, jack, jason peter, jack, jason
4 4 jamesj, asonj, jack jamesj, asonj, jack
5 5 jamesj, jason, john, jason, john , jason, jack, sam , peter jamesj, jason, john, jason, john , jason, jack, sam , peter
But this is not corresponding to what I wanted.
I then tried this post over here (insert commas in text string after certain words in r) and tried the answer provided there:
library(gsubfn)
data_frame$parsed_names = gsubfn("\\w+", as.list(setNames(paste0(patterns, ","), patterns)),
format(data_frame$names))
data_frame
id names parsed_names
1 1 john, jack, james john,, jack,, james,
2 2 john, james, jason, james john,, james,, jason,, james,
3 3 peter, jack, jason peter, jack,, jason,
4 4 jamesj, asonj, jack jamesj,, asonj, jack,
5 5 jamesj, jason, john, jason, john , jason, jack, sam , peter jamesj,, jason,, john,, jason,, john, , jason,, jack,, sam , peter
- Can someone please show me how to fix this?
Thank you!
Solution 1:[1]
Here's a somewhat ad-hoc answer but it meets your requirements (no change in the patterns
vector):
library(tidyverse)
patterns <- c("john", "jack", "james", "jamesj", "jason")

# Split every token right after a known pattern, then glue any stray "j"
# piece back onto the piece before it (the "jamesj" / "asonj" cases).
data_frame %>%
  # one row per token, using separate_rows()' default separator
  separate_rows(names) %>%
  mutate(
    # zero-width split: any position immediately preceded by a pattern
    name = str_split(
      tolower(names),
      paste0("(?<=(", paste(patterns, collapse = "|"), "))")
    )
  ) %>%
  unnest(name) %>%
  filter(nzchar(name)) %>%
  # the counter only advances on pieces other than "j", so a lone "j"
  # lands in the same group as the piece preceding it
  group_by(j = cumsum(name != "j")) %>%
  summarise(
    name = paste(name, collapse = ""),
    id = unique(id)
  ) %>%
  # collapse back to one comma-separated string per id
  group_by(id) %>%
  summarise(name = paste(name, collapse = ", "))
## A tibble: 5 × 2
# id name
# <dbl> <chr>
#1 1 john, jack, james
#2 2 john, james, jason, james
#3 3 peter, jack, jason
#4 4 jamesj, asonj, jack
#5 5 jamesj, jason, john, jason, john, jason, jack, sam, peter
Previous answer:
Adding the other possible names to the patterns vector, and reordering the vector so that jamesj is preferred over james, you can then use str_extract_all.
library(stringr)
library(dplyr)

# Every name that may occur, including those missing from the original
# dictionary ("asonj", "peter", "sam").
patterns <- c(
  "john", "jack", "jamesj", "james", "jason", "asonj", "peter", "sam"
)
# Longest entries first, so "jamesj" precedes "james" in the alternation.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
patterns <- patterns[order(nchar(patterns), decreasing = TRUE)]

data_frame %>%
  mutate(
    # Extract all dictionary matches from the lower-cased text and join them
    # with ", ". vapply() returns a plain character column (one string per
    # row); lapply() would leave `names` as a list column.
    names = vapply(
      str_extract_all(tolower(names), paste(patterns, collapse = "|")),
      toString,
      character(1)
    )
  )
# id names
#1 1 john, jack, james
#2 2 john, james, jason, james
#3 3 peter, jack, jason
#4 4 jamesj, asonj, jack
#5 5 jamesj, jason, john, jason, john, jason, jack, sam, peter
data
# Sample input: five ids, each with a free-text string of run-together names.
data_frame <- tibble(
  id = c(1, 2, 3, 4, 5),
  names = c(
    "johnjack jameS",
    "john/james, jasonjames",
    "peter_jackjason",
    "jamesjasonj jack",
    "jamesjjason, johnjasonjohn , jason-jack sam _ peter"
  )
)
Solution 2:[2]
Updated to retain whole names not in the pattern:
library(tidyverse)

# Sample input: one free-text `names` string per id.
data_frame <- tribble(
  ~id, ~names,
  1, "johnjack jameS",
  2, "john/james, jasonjames",
  3, "peter_jackjason",
  4, "jamesjasonj jack",
  5, "jamesjjason, johnjasonjohn , jason-jack sam _ peter"
)

# Dictionary of known names; "jamesj" is listed ahead of "james".
patterns <- c("john", "jack", "jamesj", "james", "jason")

data_frame |>
  mutate(
    names = map_chr(names, \(raw) {
      # dictionary entries first, then a fallback for any other run of
      # 3 to 10 lower-case letters (keeps whole names not in the dictionary)
      found <- str_extract_all(
        str_to_lower(raw),
        str_c(c(patterns, "[a-z]{3,10}"), collapse = "|")
      )
      kept <- stringi::stri_remove_empty(unlist(found))
      str_c(kept, collapse = ", ")
    })
  )
#> # A tibble: 5 × 2
#> id names
#> <dbl> <chr>
#> 1 1 john, jack, james
#> 2 2 john, james, jason, james
#> 3 3 peter, jack, jason
#> 4 4 jamesj, asonj, jack
#> 5 5 jamesj, jason, john, jason, john, jason, jack, sam, peter
Created on 2022-05-14 by the reprex package (v2.0.1)
Solution 3:[3]
It is unclear what rules you are supposed to follow to generate the final output since there seems to be a lot going on.
Here is what I assumed; these are the rules I coded into the regular expressions
and the patterns they are replaced with (let me know if I'm wrong):
- Add a `,` after matched words if they occur immediately before any alphabetic character, using regex `"jack(?=[:alpha:])"` (this is for the words in the middle of the text, followed by other words)
- Replace special characters other than a `,` with a `, ` using regex `'(?!,)[:punct:]'` (a negative-lookahead-style regular expression taken from another source)
- Add a `,` before spaces in between words, so `'(?<=[:alpha:]) (?=[:alpha:])'`
- Account for `jamesj` by ignoring the `james` that is followed by a `j`, using regex `'james(?!j)(?=[:alpha:])'`
These regex are paired with their replacement texts in a named vector and passed into str_replace_all
to do the substitutions.
I like the named vector approach to text substitution because you can print the vector and see at a glance what is going to be replaced with what.
Here's the full code as a reproducible example-
# Reproducible example: build a named vector of regex -> replacement pairs and
# apply it to the lower-cased `names` column with str_replace_all().
library(tidyverse)
# Load the data frame
# Thanks to @Mael for the code
.df <- tribble(
~id, ~names,
1, "johnjack jameS",
2, "john/james, jasonjames",
3, "peter_jackjason",
4, "jamesjasonj jack",
5, "jamesjjason, johnjasonjohn , jason-jack sam _ peter"
)
# Load the pattern to place commas after;
# Note jamesj comes before james, which is a sub-pattern of (james)j
patterns <- c("john", "jack", "jamesj", "james", "jason")
# Create a named vector for the string substitutions, format : c('regex pattern' = 'replacement', ..)
# Each element's name is the regex to look for; its value is the text to put in its place.
sub_pattern <- setNames(object = paste0(patterns, ', '), # append comma and space
nm = paste0(patterns, '(?=[:alpha:])')) # for words occurring immediately before any alphabet
# Address james and jamesj double matching
# 'james(?!j).*' only matches the name that starts with "james" not followed by "j",
# i.e. the plain james entry; the jamesj entry is left as it is.
names(sub_pattern) <-
str_replace(names(sub_pattern),
'james(?!j).*', # replace the james matcher with
'james(?!j)(?=[:alpha:])') # ensures james is not followed by a j
# additional substitutions
# These two pairs are appended after the per-name pairs built above.
sub_pattern <- append(sub_pattern,
c('(?!,)[:punct:]' = ', ', # replace non comma punctuations with a comma and space
'(?<=[:alpha:]) (?=[:alpha:])' = ', ')) # insert comma for spaces between words
# '[:space:],' = ',' # remove spaces before comma if needed
# Perform the string substitutions to the names column
# The result goes into a new column, names_with_comma; the original names column is kept.
newdf <- mutate(.df, names_with_comma = str_replace_all(tolower(names), sub_pattern))
# converting all the text to lower case (for the S in first column, if that's not a typo..)
# Print the resulting character vector.
newdf$names_with_comma
#> [1] "john, jack, james"
#> [2] "john, james, jason, james"
#> [3] "peter, jack, jason"
#> [4] "jamesj, asonj, jack"
#> [5] "jamesj, jason, john, jason, john , jason, jack, sam , peter"
Created on 2022-05-14 by the reprex package (v2.0.1)
And full credits to my eternal regular expression support from the StringR cheatsheet and thanks to @Maël for the code for the data frame
Solution 4:[4]
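Paste `|` between the patterns and compare the result with the tolower of data_frame$names; on each match, add a space on both sides of the match using gsub.
Then replace every run of `/`, `_`, `,`, space or `-` with `, ` using gsub.
Finally, apply trimws with whitespace set to `, ` to strip the leading and trailing separators.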
# Rewrite data_frame$names as a ", "-separated list of names.
# The intermediate steps live inside local() so no helper variables leak out.
data_frame$names <- local({
  # "(john|jack|...)" capture group over all dictionary entries.
  # NOTE(review): the shown output ("jamesj" winning over "james") appears to
  # rely on gsub()'s default regex engine preferring the longest alternative.
  alternation <- paste0("(", paste(patterns, collapse = "|"), ")")

  # 1. lower-case the text and pad every dictionary match with spaces
  padded <- gsub(alternation, " \\1 ", tolower(data_frame$names))

  # 2. turn each run of "/", "_", ",", space or "-" into one ", " separator
  separated <- gsub("[/_, -]+", ", ", padded)

  # 3. strip the leading / trailing separators left over from the padding
  trimws(separated, whitespace = ", ")
})
data_frame
# id names
#1 1 john, jack, james
#2 2 john, james, jason, james
#3 3 peter, jack, jason
#4 4 jamesj, asonj, jack
#5 5 jamesj, jason, john, jason, john, jason, jack, sam, peter
Data:
# Dictionary of known names; note that "james" precedes "jamesj".
patterns <- c("john", "jack", "james", "jamesj", "jason")

# Example input: five ids, each with a free-text string of run-together names.
data_frame <- data.frame(
  id = seq_len(5L),
  names = c(
    "johnjack jameS",
    "john/james, jasonjames",
    "peter_jackjason",
    "jamesjasonj jack",
    "jamesjjason, johnjasonjohn , jason-jack sam _ peter"
  )
)
Solution 5:[5]
You can use crossing() from tidyr in combination with str_detect() from stringr to find each of the patterns by id.
# Pair every row of data_frame with every dictionary entry, keep the pairs
# where the entry occurs in that row's text, and return the matched entry
# as `names` alongside its id.
names_from_pattern <- tidyr::crossing(data_frame, patterns) |>
  dplyr::rowwise() |>
  dplyr::filter(stringr::str_detect(names, patterns)) |>
  dplyr::select(id, "names" = patterns)
Then find all other names not in the pattern
# Alternation of all dictionary entries, i.e. "or"-separated.
find.string <- paste(patterns, collapse = "|")

# Blank out every dictionary match, then tokenise the text that is left over
# into the `names` column, keeping the id each token came from.
other_names <- tidytext::unnest_tokens(
  tibble(
    id = data_frame$id,
    other_names = gsub(find.string, replacement = " ", x = data_frame$names)
  ),
  input = other_names,
  output = names
)

# Drop any remaining non-letters (e.g. "peter", not "peter_").
other_names$names <- gsub("[^a-z]", "", other_names$names)
Row-bind the names found from the pattern with all the other names:
# Stack the dictionary matches and the leftover names into one table.
df <- rbind(names_from_pattern, other_names)
Then, to format the output to your specification, use pivot_wider() in combination with unite(), both from tidyr.
# Collapse the long table into one comma-separated `names` string per id.
df <- local({
  # one column per distinct name, holding that name (NA where absent)
  wide <- pivot_wider(df, id_cols = id, names_from = names, values_from = names)
  # merge every column after the first into `names`, skipping the NAs
  unite(wide, col = "names", 2:length(wide), sep = ", ", na.rm = TRUE)
})
Output:
# A tibble: 5 x 2
id names
<int> <chr>
1 1 jack, james, john
2 2 james, john, jason
3 3 jack, jason, peter
4 4 jack, james, jason, jamesj, asonj
5 5 jack, james, john, jason, jamesj, peter, sam
libraries:
library(dplyr)
library(stringr)
library(tidyr)
library(tidytext)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | |
Solution 2 | |
Solution 3 | |
Solution 4 | |
Solution 5 |