'data.table join with date
hello im trying to extract some id with a group and Date in range
> d1
id group Date
1: 1 A 2017-07-02
2: 2 A 2017-07-04
3: 3 A 2017-05-15
4: 4 A 2017-08-02
5: 5 B 2017-12-28
6: 6 B 2015-07-02
7: 7 B 2012-07-02
8: 8 B 2018-07-02
9: 9 C 2017-07-02
10: 10 C 2017-07-02
11: 11 C 2017-07-02
12: 12 C 2017-07-04
13: 13 D 2017-05-15
14: 14 D 2017-08-02
15: 15 D 2017-12-28
16: 16 D 2015-07-02
17: 17 E 2012-07-02
18: 18 E 2018-07-02
19: 19 E 2017-07-02
20: 20 E 2017-07-02
> d2
group timestamp1 timestamp2
1: A 2015-07-01 2017-07-20
2: A 2020-07-12 2017-07-15
3: B 2017-05-15 2020-05-22
and i want the id from d1 matching d2 date range and group
group timestamp1 timestamp2 id
1: A 2017-07-02 2017-07-02 1
2: A 2017-07-04 2017-07-04 2
3: A 2017-05-15 2017-05-15 3
4: B 2017-12-28 2017-12-28 5
5: B 2018-07-02 2018-07-02 8
i checked this How to perform join over date ranges using data.table? i think it's the solution but ican't make it works.
Date, timestamp, timestamp2 are in POSIXct
please help :)
Solution 1:[1]
The OP has requested to do a non-equi inner join using data.table
:
library(data.table)
d2[d1, on = .(group, timestamp1 <= Date, timestamp2 >= Date), nomatch = 0L]
group timestamp1 timestamp2 id 1: A 2017-07-02 2017-07-02 1 2: A 2017-07-04 2017-07-04 2 3: A 2017-05-15 2017-05-15 3 4: B 2017-12-28 2017-12-28 5 5: B 2018-07-02 2018-07-02 8
Data
library(data.table)
d1 <- fread(
"rn id group Date
1: 1 A 2017-07-02
2: 2 A 2017-07-04
3: 3 A 2017-05-15
4: 4 A 2017-08-02
5: 5 B 2017-12-28
6: 6 B 2015-07-02
7: 7 B 2012-07-02
8: 8 B 2018-07-02
9: 9 C 2017-07-02
10: 10 C 2017-07-02
11: 11 C 2017-07-02
12: 12 C 2017-07-04
13: 13 D 2017-05-15
14: 14 D 2017-08-02
15: 15 D 2017-12-28
16: 16 D 2015-07-02
17: 17 E 2012-07-02
18: 18 E 2018-07-02
19: 19 E 2017-07-02
20: 20 E 2017-07-02", drop = 1L)[
, Date := as.POSIXct(Date)]
d2 <- fread(
"rn group timestamp1 timestamp2
1: A 2015-07-01 2017-07-20
2: A 2020-07-12 2017-07-15
3: B 2017-05-15 2020-05-22", drop = 1L)
cols = c("timestamp1", "timestamp2")
d2[, (cols) := lapply(.SD, as.POSIXct), .SDcols = cols]
Solution 2:[2]
Using data.table
df2$timestamp1 <- as.Date(df2$timestamp1, format = "%Y-%m-%d")
df2$timestamp2 <- as.Date(df2$timestamp2, format = "%Y-%m-%d")
df1$Date <- as.Date(df1$Date, format = "%Y-%m-%d")
df1T <- data.table(df1, key = "group")
df2T <- data.table(df2, key = "group")
df3f <- df1T[df2T]
df3f[df3f$timestamp1 < df3f$Date & df3f$Date < df3f$timestamp2 , ]
id group Date timestamp1 timestamp2
1: 1 A 2017-07-02 2015-07-01 2017-07-20
2: 2 A 2017-07-04 2015-07-01 2017-07-20
3: 3 A 2017-05-15 2015-07-01 2017-07-20
4: 5 B 2017-12-28 2017-05-15 2020-05-22
5: 8 B 2018-07-02 2017-05-15 2020-05-22
You can also use a left join and filter in dplyr like so:
df3 <- df2%>%left_join(df1, by = "group")%>%
mutate(timestamp1 = as.Date(timestamp1, format = "%Y-%m-%d"),
timestamp2 = as.Date(timestamp2, format = "%Y-%m-%d"),
Date = as.Date(Date, format = "%Y-%m-%d"))%>%
filter(timestamp1<Date&Date<timestamp2)%>%print()
group timestamp1 timestamp2 id Date
1 A 2015-07-01 2017-07-20 1 2017-07-02
2 A 2015-07-01 2017-07-20 2 2017-07-04
3 A 2015-07-01 2017-07-20 3 2017-05-15
4 B 2017-05-15 2020-05-22 5 2017-12-28
5 B 2017-05-15 2020-05-22 8 2018-07-02
Solution 3:[3]
Another option with sqldf
:
library(sqldf)
sqldf("select df1.id, df1.Date, df2.* from df1
inner join df2 on df1.'group' = df2.'group'
where df1.Date between df2.timestamp1 and df2.timestamp2")
Result:
id Date group timestamp1 timestamp2
1 1 2017-07-02 A 2015-07-01 2017-07-20
2 2 2017-07-04 A 2015-07-01 2017-07-20
3 3 2017-05-15 A 2015-07-01 2017-07-20
4 5 2017-12-28 B 2017-05-15 2020-05-22
5 8 2018-07-02 B 2017-05-15 2020-05-22
Data:
df1 = read.table(text = " id group Date
1: 1 A 2017-07-02
2: 2 A 2017-07-04
3: 3 A 2017-05-15
4: 4 A 2017-08-02
5: 5 B 2017-12-28
6: 6 B 2015-07-02
7: 7 B 2012-07-02
8: 8 B 2018-07-02
9: 9 C 2017-07-02
10: 10 C 2017-07-02
11: 11 C 2017-07-02
12: 12 C 2017-07-04
13: 13 D 2017-05-15
14: 14 D 2017-08-02
15: 15 D 2017-12-28
16: 16 D 2015-07-02
17: 17 E 2012-07-02
18: 18 E 2018-07-02
19: 19 E 2017-07-02
20: 20 E 2017-07-02", header = TRUE, row.names = 1)
df2 = read.table(text = " group timestamp1 timestamp2
1: A 2015-07-01 2017-07-20
2: A 2020-07-12 2017-07-15
3: B 2017-05-15 2020-05-22", header = TRUE, row.names = 1)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | |
Solution 2 | |
Solution 3 | acylam |