An R package to build and curate cohorts in the OMOP Common Data Model







1) Create base cohorts: Cohorts defined using clinical concepts (e.g., asthma diagnoses) or demographics (e.g., females aged >18)
2) Cohort curation: Transform base cohorts to meet study-specific inclusion criteria.


Base cohorts Cohort construction based on clinical concepts or demographics.
Requirements and Filtering Demographic restrictions, event presence/absence conditions, and filtering specific records.
Time Manipulation Adjusting entry and exit dates to align with study periods, observation windows, or key events.
Transformation and Combination Merging, stratifying, collapsing, matching, or intersecting cohorts.


We will assume all participants are familiar with OMOP cohorts in R: the <cohort_table> class and its attributes.
We will assume all participants know how to create base cohorts. This includes the use of CodelistGenerator to define a concept set of interest and conceptCohort to build a base cohort.
During the session we will revise cohort curation functions in detail, and conclude with a hands-on practical session.


On demographics
On cohort entries
Require presence or absence based on other cohorts, concepts, and tables


Main function


Specific functions
requireAge(
cohort,
ageRange,
cohortId = NULL,
indexDate = "cohort_start_date",
atFirst = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)
requireSex(
cohort,
sex,
cohortId = NULL,
atFirst = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)
requirePriorObservation(
cohort,
minPriorObservation,
cohortId = NULL,
indexDate = "cohort_start_date",
atFirst = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)
requireFutureObservation(
cohort,
minFutureObservation,
cohortId = NULL,
indexDate = "cohort_start_date",
atFirst = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)

Any entry
requireIsEntry(
cohort,
entryRange,
cohortId = NULL,
name = tableName(cohort),
.softValidation = TRUE
)
First entry
requireIsFirstEntry(
cohort,
cohortId = NULL,
name = tableName(cohort),
.softValidation = TRUE
)
Last entry
requireIsLastEntry(
cohort,
cohortId = NULL,
name = tableName(cohort),
.softValidation = TRUE
)

Intersections based on other cohorts
requireCohortIntersect(
cohort,
targetCohortTable,
window,
intersections = c(1, Inf),
cohortId = NULL,
targetCohortId = NULL,
indexDate = "cohort_start_date",
targetStartDate = "cohort_start_date",
targetEndDate = "cohort_end_date",
censorDate = NULL,
atFirst = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)







Based on concept sets
requireConceptIntersect(
cohort,
conceptSet,
window,
intersections = c(1, Inf),
cohortId = NULL,
indexDate = "cohort_start_date",
targetStartDate = "event_start_date",
targetEndDate = "event_end_date",
inObservation = TRUE,
censorDate = NULL,
atFirst = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)
Based on tables in the CDM
requireTableIntersect(
cohort,
tableName,
window,
intersections = c(1, Inf),
cohortId = NULL,
indexDate = "cohort_start_date",
targetStartDate = startDateColumn(tableName),
targetEndDate = endDateColumn(tableName),
inObservation = TRUE,
censorDate = NULL,
atFirst = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)







In date range
requireInDateRange(
cohort,
dateRange,
cohortId = NULL,
indexDate = "cohort_start_date",
atFirst = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)
A minimum number of counts
requireMinCohortCount(
cohort,
minCohortCount,
cohortId = NULL,
updateSettings = FALSE,
name = tableName(cohort)
)

Cohort entry
Trim start and end dates
Pad start and end dates


exitAtDeath(
cohort,
cohortId = NULL,
requireDeath = FALSE,
name = tableName(cohort),
.softValidation = FALSE
)


exitAtDeath(
cohort,
cohortId = NULL,
requireDeath = FALSE,
name = tableName(cohort),
.softValidation = FALSE
)


exitAtObservationEnd(
cohort,
cohortId = NULL,
limitToCurrentPeriod = TRUE,
name = tableName(cohort),
.softValidation = FALSE
)


exitAtObservationEnd(
cohort,
cohortId = NULL,
limitToCurrentPeriod = TRUE,
name = tableName(cohort),
.softValidation = FALSE
)


At first date
exitAtFirstDate(
cohort,
dateColumns,
cohortId = NULL,
returnReason = FALSE,
keepDateColumns = TRUE,
name = tableName(cohort),
.softValidation = FALSE
)
At last date
exitAtLastDate(
cohort,
dateColumns,
cohortId = NULL,
returnReason = FALSE,
keepDateColumns = TRUE,
name = tableName(cohort),
.softValidation = FALSE
)

Use case
# Use case: end follow-up at the earliest of death, the next COVID-19
# vaccination, or the end of observation.
# Each candidate exit date is first added as a column; exitAtFirstDate() then
# sets cohort_end_date to the earliest of them.
cdm$study_population <- cdm$study_population |>
  # Request the end of observation as a date; the default type is a number of
  # days, which cannot be used as an exit date column.
  PatientProfiles::addFutureObservation(
    futureObservationName = "observation_end",
    futureObservationType = "date"
  ) |>
  # Expected to add the "date_of_death" column referenced below.
  PatientProfiles::addDeathDate() |>
  # First vaccination strictly after the index date (window starts at day 1).
  PatientProfiles::addCohortIntersectDate(
    targetCohortTable = "covid_vaccines",
    targetCohortId = "any_covid_vaccine",
    window = c(1, Inf),
    nameStyle = "next_covid_vaccine"
  ) |>
  exitAtFirstDate(
    dateColumns = c("date_of_death", "next_covid_vaccine", "observation_end"),
    keepDateColumns = FALSE, # drop the helper date columns afterwards
    returnReason = TRUE      # record which date triggered the exit
  )


At first date
entryAtFirstDate(
cohort,
dateColumns,
cohortId = NULL,
returnReason = FALSE,
keepDateColumns = TRUE,
name = tableName(cohort),
.softValidation = FALSE
)
At last date
entryAtLastDate(
cohort,
dateColumns,
cohortId = NULL,
returnReason = FALSE,
keepDateColumns = TRUE,
name = tableName(cohort),
.softValidation = FALSE
)

Based on demographics
trimDemographics(
cohort,
cohortId = NULL,
ageRange = NULL,
sex = NULL,
minPriorObservation = NULL,
minFutureObservation = NULL,
name = tableName(cohort),
.softValidation = TRUE
)
Based on date range
trimToDateRange(
cohort,
dateRange,
cohortId = NULL,
startDate = "cohort_start_date",
endDate = "cohort_end_date",
name = tableName(cohort),
.softValidation = FALSE
)







padCohortDate(
cohort,
days,
cohortDate = "cohort_start_date",
indexDate = "cohort_start_date",
collapse = TRUE,
padObservation = TRUE,
cohortId = NULL,
name = tableName(cohort),
.softValidation = FALSE
)
padCohortStart(
cohort,
days,
collapse = TRUE,
padObservation = TRUE,
cohortId = NULL,
name = tableName(cohort),
.softValidation = FALSE
)
padCohortEnd(
cohort,
days,
collapse = TRUE,
padObservation = TRUE,
cohortId = NULL,
name = tableName(cohort),
.softValidation = FALSE
)

Use case: apply wash-out
# Wash-out: push every cohort end date 90 days later, with collapsing and
# observation padding both switched on.
cdm$cardiac_events <- padCohortEnd(
  cdm$cardiac_events,
  days = 90,
  collapse = TRUE,
  padObservation = TRUE
)


Based on years
yearCohorts(
cohort,
years,
cohortId = NULL,
name = tableName(cohort),
.softValidation = FALSE
)
Based on other columns
stratifyCohorts(
cohort,
strata,
cohortId = NULL,
removeStrata = TRUE,
name = tableName(cohort),
.softValidation = TRUE
)

unionCohorts(
cohort,
cohortId = NULL,
gap = 0,
cohortName = NULL,
keepOriginalCohorts = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)


unionCohorts(
cohort,
cohortId = NULL,
gap = 0,
cohortName = NULL,
keepOriginalCohorts = FALSE,
name = tableName(cohort),
.softValidation = TRUE
)


intersectCohorts(
cohort,
cohortId = NULL,
gap = 0,
returnNonOverlappingCohorts = FALSE,
keepOriginalCohorts = FALSE,
name = tableName(cohort),
.softValidation = FALSE
)


intersectCohorts(
cohort,
cohortId = NULL,
gap = 0,
returnNonOverlappingCohorts = FALSE,
keepOriginalCohorts = FALSE,
name = tableName(cohort),
.softValidation = FALSE
)


intersectCohorts(
cohort,
cohortId = NULL,
gap = 0,
returnNonOverlappingCohorts = FALSE,
keepOriginalCohorts = FALSE,
name = tableName(cohort),
.softValidation = FALSE
)


Sample
sampleCohorts(
cohort,
n,
cohortId = NULL,
name = tableName(cohort)
)
Subset
subsetCohorts(
cohort,
cohortId,
name = tableName(cohort),
.softValidation = TRUE
)

collapseCohorts(
cohort,
cohortId = NULL,
gap = 0,
name = tableName(cohort),
.softValidation = FALSE
)

matchCohorts(
cohort,
cohortId = NULL,
matchSex = TRUE,
matchYearOfBirth = TRUE,
ratio = 1,
keepOriginalCohorts = FALSE,
name = tableName(cohort),
.softValidation = FALSE
)
In our study we will characterise recipients of pneumococcal (concept 40213201) and varicella (concept 40213251) vaccines. Recipients of the pneumococcal vaccine must be aged between 60 and 80, while recipients of the varicella vaccine must be under 5. We will only include each individual's first ever vaccination. We will exclude anyone with a prior history of acute bronchitis (concept 260139). To summarise our results we will create a table summarising attrition and another table summarising patient demographics (age and sex).
# Good luck
library(dplyr)
library(CDMConnector)
library(CodelistGenerator)
library(CohortConstructor)
library(CohortCharacteristics)
con <- DBI::dbConnect(duckdb::duckdb(),
eunomiaDir(datasetName = "GiBleed"))
cdm <- cdmFromCon(con,
cdmSchema = "main",
writeSchema = "main")
# vaccine codes
vaccine_codes <- list(pneumococcal_vaccine = 40213201L,
varicella_vaccine = 40213251L) |>
newCodelist()
# acute bronchitis
acute_bronchitis_codes <- list(acute_bronchitis = 260139L) |>
newCodelist()
# do people have more than one vaccine record?
# does mapping from source look sensible for vaccines?
summariseCodeUse(
vaccine_codes, cdm) |>
tableCodeUse()
# Create base cohorts
cdm$vaccine_cohorts <- cdm |>
conceptCohort(
.....)
cdm$acute_bronchitis_cohort <- cdm |>
conceptCohort(
conceptSet = acute_bronchitis_codes,
.....)
# keep only each person's first ever vaccination record
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
requireIsFirst....
# for pneumococcal_vaccine keep those aged between 60 to 80
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
requireAge(
....
cohortId = "pneumococcal_vaccine")
# for varicella_vaccine keep those aged under 5
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
requireAge(
....
cohortId = "varicella_vaccine")
# exclude anyone in one of the exclusion cohorts
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
requireCohortIntersect( .....)
# use cohort characteristics to make an attrition table
summariseCohortAttrition(cdm$vaccine_cohorts) |>
tableCohortAttrition()
# use cohort characteristics to make a characteristics table
summariseCharacteristics(cdm$vaccine_cohorts) |>
tableCharacteristics()
library(dplyr)
library(CDMConnector)
library(CodelistGenerator)
library(CohortConstructor)
library(CohortCharacteristics)
# Solution: build and curate the pneumococcal and varicella vaccine cohorts.

# Connect to the GiBleed example dataset through duckdb and create the cdm
# reference, reading from and writing to the "main" schema.
con <- DBI::dbConnect(
  duckdb::duckdb(),
  eunomiaDir(datasetName = "GiBleed")
)
cdm <- cdmFromCon(
  con,
  cdmSchema = "main",
  writeSchema = "main"
)

# vaccine codes
vaccine_codes <- list(
  pneumococcal_vaccine = 40213201L,
  varicella_vaccine = 40213251L
) |>
  newCodelist()

# acute bronchitis
acute_bronchitis_codes <- list(acute_bronchitis = 260139L) |>
  newCodelist()

# do people have more than one vaccine record?
# does mapping from source look sensible for vaccines?
summariseCodeUse(vaccine_codes, cdm) |>
  tableCodeUse()

# Create base cohorts
cdm$vaccine_cohorts <- cdm |>
  conceptCohort(
    conceptSet = vaccine_codes,
    name = "vaccine_cohorts",
    exit = "event_start_date",
    useSourceFields = TRUE
  )
cdm$acute_bronchitis_cohort <- cdm |>
  conceptCohort(
    conceptSet = acute_bronchitis_codes,
    name = "acute_bronchitis_cohort",
    exit = "event_start_date",
    subsetCohort = "vaccine_cohorts"
  )

# keep only each person's first ever vaccination record
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireIsFirstEntry()

# for pneumococcal_vaccine keep those aged between 60 to 80
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireAge(
    name = "vaccine_cohorts",
    ageRange = c(60, 80),
    cohortId = "pneumococcal_vaccine"
  )

# for varicella_vaccine keep those aged under 5
# (the upper bound is 4 so that 5-year-olds are not included)
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireAge(
    name = "vaccine_cohorts",
    ageRange = c(0, 4),
    cohortId = "varicella_vaccine"
  )

# exclude anyone in one of the exclusion cohorts
# intersections = 0 requires NO acute bronchitis record before the index date;
# the default, c(1, Inf), would instead keep only people who have one.
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireCohortIntersect(
    targetCohortTable = "acute_bronchitis_cohort",
    window = c(-Inf, -1),
    intersections = 0
  )

# use cohort characteristics to make an attrition table
summariseCohortAttrition(cdm$vaccine_cohorts) |>
  tableCohortAttrition()

# use cohort characteristics to make a characteristics table
summariseCharacteristics(cdm$vaccine_cohorts) |>
  tableCharacteristics()


HDS internal training