다음은 중복 레코드를 감지하는 데 data.table
사용 하는 솔루션입니다 foverlaps
(@GenesRus에서 이미 언급 한대로). 겹치는 레코드는 그룹에 할당되어 레코드를 최대로 필터링합니다. 그룹의 우선 순위. 이 절차가 3 개 이상의 겹치는 레코드에 대해서도 작동 함을 보여주기 위해 예제 데이터에 두 개 이상의 레코드를 추가했습니다.
편집 :data.table
더 빠른 코드를 제공하는 @pgcudahy의 솔루션을 수정하고 번역했습니다 .
library(data.table)
library(lubridate)
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
resultDT <- setDT(times_df, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
# old approach ------------------------------------------------------------
# times_dt <- as.data.table(times_df)
# setkey(times_dt, start, stop)[, index := .I]
# overlaps_dt <- foverlaps(times_dt, times_dt, type = "any", which = TRUE)[xid != yid][, group := fifelse(xid > yid, yes = paste0(yid, "_", xid), no = paste0(xid, "_", yid))]
# overlaps_merged <- merge(times_dt, overlaps_dt, by.x = "index", by.y = "xid")[, .(delete_index = index[priority == max(priority)]), by = "group"]
# result_dt <- times_dt[!unique(overlaps_merged$delete_index)][, index := NULL]
자세한 내용은 다음을 참조하십시오 ?foverlaps
- maxgap
, minoverlap
또는 type
(임의의 내부, 시작, 종료 및 동일과 같은) 겹침으로 간주되는 것을 제어하기 위해 구현 된 더 유용한 기능이 있습니다 .
업데이트-새로운 벤치 마크
Unit: microseconds
expr min lq mean median uq max neval
Paul 25572.550 26105.2710 30183.930 26514.342 29614.272 153810.600 100
MKa 5100.447 5276.8350 6508.333 5401.275 5832.270 23137.879 100
pgcudahy 3330.243 3474.4345 4284.640 3556.802 3748.203 21241.260 100
ismirsehregal 711.084 913.3475 1144.829 1013.096 1433.427 2316.159 100
벤치 마크 코드 :
#### library ----
library(dplyr)
library(lubridate)
library(igraph)
library(data.table)
library(microbenchmark)
#### data ----
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
times_tib <- as_tibble(times_df)
times_dt <- as.data.table(times_df)
#### group_interval function ----
# buffer to take a form similar to: days(1), weeks(2), etc.
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
# apply buffer period to intervals
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
# Find groups via graph theory See igraph package
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
# create a 2 column df with row (index) and group number, arrange on row number and return distinct values
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
# returns
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
#### benchmark ----
library(igraph)
library(data.table)
library(dplyr)
library(lubridate)
library(microbenchmark)
df_Paul <- df_MKa <- df_pgcudahy <- df_ismirsehregal <- times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-07 06:00:00",
"2019-10-07 06:10:00",
"2019-10-07 06:20:00",
"2019-10-08 06:00:00",
"2019-10-08 06:10:00",
"2019-10-08 06:20:00",
"2019-10-09 03:00:00",
"2019-10-09 03:10:00",
"2019-10-10 03:00:00",
"2019-10-10 03:10:00",
"2019-10-11 05:00:00",
"2019-10-11 05:00:00")
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-07 06:18:00",
"2019-10-07 06:28:00",
"2019-10-07 06:38:00",
"2019-10-08 06:18:00",
"2019-10-08 06:28:00",
"2019-10-08 06:38:00",
"2019-10-09 03:30:00",
"2019-10-09 03:20:00",
"2019-10-10 03:30:00",
"2019-10-10 03:20:00",
"2019-10-11 05:40:00",
"2019-10-11 05:40:00")
),
priority = c(5, 3, 4, 3, 4, 5, 4, 3, 4, 3, 4, 3, 1, 2, 2, 1, 3, 4)
)
benchmarks <- microbenchmark(Paul = {
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
times_tib <- as_tibble(df_Paul)
mutate(times_tib, group = group_interval(start, stop)) %>%
group_by(group) %>%
top_n(1, desc(priority)) %>%
ungroup() %>%
select(-group)
},
MKa = {
df_MKa$id <- 1:nrow(df_MKa)
# Create consolidated df which we will use to check if stop date is in between start and stop
my_df <- bind_rows(replicate(n = nrow(df_MKa), expr = df_MKa, simplify = FALSE))
my_df$stop_chk <- rep(df_MKa$stop, each = nrow(df_MKa))
# Flag if stop date sits in between start and stop
my_df$chk <- my_df$stop_chk >= my_df$start & my_df$stop_chk <= my_df$stop
my_df$chk_id <- df_MKa[match(my_df$stop_chk, df_MKa$stop), "id"]
# Using igrpah to cluster ids to create unique groups
# this will identify any overlapping groups
library(igraph)
g <- graph.data.frame(my_df[my_df$chk == TRUE, c("id", "chk_id")])
df_g <- data.frame(clusters(g)$membership)
df_g$chk_id <- row.names(df_g)
# copy the unique groups to the df
my_df$new_id <- df_g[match(my_df$chk_id, df_g$chk_id), "clusters.g..membership"]
my_df %>%
filter(chk == TRUE) %>%
arrange(priority) %>%
filter(!duplicated(new_id)) %>%
select(start, stop, priority) %>%
arrange(start)
}, pgcudahy = {
df_pgcudahy %>%
arrange(start) %>%
mutate(remove1 = ifelse((stop >= lead(start, default=FALSE)) &
(priority > lead(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
mutate(remove2 = ifelse((start <= lag(stop, default=FALSE)) &
(priority > lag(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
filter(remove1 == FALSE & remove2 == FALSE) %>%
select(1:3)
}, ismirsehregal = {
setDT(df_ismirsehregal, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
})
benchmarks
combn
있지만 행이 많은 경우 비용이 많이들 수 있습니다.times_df %>% mutate(interval = interval(start, stop)) %>% {combn(nrow(.), 2, function(x) if (int_overlaps(.$interval[x[1]], .$interval[x[2]])) x[which.min(.$priority[x])], simplify = FALSE)} %>% unlist() %>% {slice(times_df, -.)}