'data.table join with date

hello im trying to extract some id with a group and Date in range

> d1
    id group       Date
 1:  1     A 2017-07-02
 2:  2     A 2017-07-04
 3:  3     A 2017-05-15
 4:  4     A 2017-08-02
 5:  5     B 2017-12-28
 6:  6     B 2015-07-02
 7:  7     B 2012-07-02
 8:  8     B 2018-07-02
 9:  9     C 2017-07-02
10: 10     C 2017-07-02
11: 11     C 2017-07-02
12: 12     C 2017-07-04
13: 13     D 2017-05-15
14: 14     D 2017-08-02
15: 15     D 2017-12-28
16: 16     D 2015-07-02
17: 17     E 2012-07-02
18: 18     E 2018-07-02
19: 19     E 2017-07-02
20: 20     E 2017-07-02

> d2
   group timestamp1 timestamp2
1:     A 2015-07-01 2017-07-20
2:     A 2020-07-12 2017-07-15
3:     B 2017-05-15 2020-05-22

and i want the id from d1 matching d2 date range and group

   group timestamp1 timestamp2 id
1:     A 2017-07-02 2017-07-02  1
2:     A 2017-07-04 2017-07-04  2
3:     A 2017-05-15 2017-05-15  3
4:     B 2017-12-28 2017-12-28  5
5:     B 2018-07-02 2018-07-02  8

i checked this How to perform join over date ranges using data.table? i think it's the solution but ican't make it works.

Date, timestamp, timestamp2 are in POSIXct

please help :)



Solution 1:[1]

The OP has requested to do a non-equi inner join using data.table:

library(data.table)
d2[d1, on = .(group, timestamp1 <= Date, timestamp2 >= Date), nomatch = 0L]
   group timestamp1 timestamp2 id
1:     A 2017-07-02 2017-07-02  1
2:     A 2017-07-04 2017-07-04  2
3:     A 2017-05-15 2017-05-15  3
4:     B 2017-12-28 2017-12-28  5
5:     B 2018-07-02 2018-07-02  8

Data

library(data.table)
d1 <- fread(
"rn id group       Date
 1:  1     A 2017-07-02
 2:  2     A 2017-07-04
 3:  3     A 2017-05-15
 4:  4     A 2017-08-02
 5:  5     B 2017-12-28
 6:  6     B 2015-07-02
 7:  7     B 2012-07-02
 8:  8     B 2018-07-02
 9:  9     C 2017-07-02
10: 10     C 2017-07-02
11: 11     C 2017-07-02
12: 12     C 2017-07-04
13: 13     D 2017-05-15
14: 14     D 2017-08-02
15: 15     D 2017-12-28
16: 16     D 2015-07-02
17: 17     E 2012-07-02
18: 18     E 2018-07-02
19: 19     E 2017-07-02
20: 20     E 2017-07-02", drop = 1L)[
  , Date := as.POSIXct(Date)]

d2 <- fread(
  "rn    group timestamp1 timestamp2
1:     A 2015-07-01 2017-07-20
2:     A 2020-07-12 2017-07-15
3:     B 2017-05-15 2020-05-22", drop = 1L)
cols = c("timestamp1", "timestamp2")
d2[, (cols) := lapply(.SD, as.POSIXct), .SDcols = cols]

Solution 2:[2]

Using data.table

df2$timestamp1 <- as.Date(df2$timestamp1, format = "%Y-%m-%d")
df2$timestamp2 <- as.Date(df2$timestamp2, format = "%Y-%m-%d")
df1$Date <- as.Date(df1$Date, format = "%Y-%m-%d")

df1T <- data.table(df1, key = "group")
df2T <- data.table(df2, key = "group")
df3f <- df1T[df2T]
df3f[df3f$timestamp1 < df3f$Date & df3f$Date < df3f$timestamp2 , ]

   id group       Date timestamp1 timestamp2
1:  1     A 2017-07-02 2015-07-01 2017-07-20
2:  2     A 2017-07-04 2015-07-01 2017-07-20
3:  3     A 2017-05-15 2015-07-01 2017-07-20
4:  5     B 2017-12-28 2017-05-15 2020-05-22
5:  8     B 2018-07-02 2017-05-15 2020-05-22

You can also use a left join and filter in dplyr like so:

df3 <- df2%>%left_join(df1, by  = "group")%>%
  mutate(timestamp1 = as.Date(timestamp1, format = "%Y-%m-%d"),
         timestamp2 = as.Date(timestamp2, format = "%Y-%m-%d"),
         Date = as.Date(Date, format = "%Y-%m-%d"))%>%
         filter(timestamp1<Date&Date<timestamp2)%>%print()

  group timestamp1 timestamp2 id       Date
1     A 2015-07-01 2017-07-20  1 2017-07-02
2     A 2015-07-01 2017-07-20  2 2017-07-04
3     A 2015-07-01 2017-07-20  3 2017-05-15
4     B 2017-05-15 2020-05-22  5 2017-12-28
5     B 2017-05-15 2020-05-22  8 2018-07-02

Solution 3:[3]

Another option with sqldf:

library(sqldf)

sqldf("select df1.id, df1.Date, df2.* from df1
      inner join df2 on df1.'group' = df2.'group'
      where df1.Date between df2.timestamp1 and df2.timestamp2")

Result:

  id       Date group timestamp1 timestamp2
1  1 2017-07-02     A 2015-07-01 2017-07-20
2  2 2017-07-04     A 2015-07-01 2017-07-20
3  3 2017-05-15     A 2015-07-01 2017-07-20
4  5 2017-12-28     B 2017-05-15 2020-05-22
5  8 2018-07-02     B 2017-05-15 2020-05-22

Data:

df1 = read.table(text = "    id group       Date
                  1:  1     A 2017-07-02
                 2:  2     A 2017-07-04
                 3:  3     A 2017-05-15
                 4:  4     A 2017-08-02
                 5:  5     B 2017-12-28
                 6:  6     B 2015-07-02
                 7:  7     B 2012-07-02
                 8:  8     B 2018-07-02
                 9:  9     C 2017-07-02
                 10: 10     C 2017-07-02
                 11: 11     C 2017-07-02
                 12: 12     C 2017-07-04
                 13: 13     D 2017-05-15
                 14: 14     D 2017-08-02
                 15: 15     D 2017-12-28
                 16: 16     D 2015-07-02
                 17: 17     E 2012-07-02
                 18: 18     E 2018-07-02
                 19: 19     E 2017-07-02
                 20: 20     E 2017-07-02", header = TRUE, row.names = 1)

df2 = read.table(text = "   group timestamp1 timestamp2
1:     A 2015-07-01 2017-07-20
2:     A 2020-07-12 2017-07-15
3:     B 2017-05-15 2020-05-22", header = TRUE, row.names = 1)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1
Solution 2
Solution 3 acylam