CohortConstructor

An R package to build and curate cohorts in the OMOP Common Data Model

Introduction

  • CohortConstructor package is designed to support cohort building pipelines in R.
  • CohortConstructor v0.5.0 is available on CRAN.
  • Currently working towards version 1.0.0 (time for changes, if any!).

CohortConstructor pipeline

1) Create base cohorts: Cohorts defined using clinical concepts (e.g., asthma diagnoses) or demographics (e.g., females aged >18)

2) Cohort-curation: Transform base cohorts to meet study-specific inclusion criteria.

Function sets

 

Base cohorts Cohort construction based on clinical concepts or demographics.

 

Requirements and Filtering Demographic restrictions, event presence/absence conditions, and filtering specific records.

 

Time Manipulation Adjusting entry and exit dates to align with study periods, observation windows, or key events.

 

Transformation and Combination Merging, stratifying, collapsing, matching, or intersecting cohorts.

Training Objectives

  • We will assume all participants are familiar with OMOP cohorts in R: the <cohort_table> class and its attributes.

  • We will assume all participants know how to create base cohorts. This includes the use of CodelistGenerator to define a concept set of interest and conceptCohort to build a base cohort.

  • During the session we will revise cohort curation functions in detail, and conclude with a hands-on practical session.

Requirements and Filtering

Functions to apply requirements and filter

Require demographics

Main function

requireDemographics(
  cohort,
  cohortId = NULL, 
  indexDate = "cohort_start_date",
  ageRange = c(0, 150), 
  sex = c("Both"), 
  minPriorObservation = 0, 
  minFutureObservation = 0,
  atFirst = FALSE, 
  name = tableName(cohort),
  .softValidation = TRUE
)

Require demographics

Specific functions

requireAge(
  cohort,
  ageRange,
  cohortId = NULL,
  indexDate = "cohort_start_date",
  atFirst = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)
requireSex(
  cohort,
  sex,
  cohortId = NULL,
  atFirst = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)
requirePriorObservation(
  cohort,
  minPriorObservation,
  cohortId = NULL,
  indexDate = "cohort_start_date",
  atFirst = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)
requireFutureObservation(
  cohort,
  minFutureObservation,
  cohortId = NULL,
  indexDate = "cohort_start_date",
  atFirst = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)

Require entries

Any entry

requireIsEntry(
  cohort,
  entryRange,
  cohortId = NULL,
  name = tableName(cohort),
  .softValidation = TRUE
)

First entry

requireIsFirstEntry(
  cohort,
  cohortId = NULL,
  name = tableName(cohort),
  .softValidation = TRUE
)

Last entry

requireIsLastEntry(
  cohort,
  cohortId = NULL,
  name = tableName(cohort),
  .softValidation = TRUE
)

Require intersections

Intersections based on other cohorts

requireCohortIntersect(
  cohort, 
  targetCohortTable, 
  window, 
  intersections = c(1, Inf),
  cohortId = NULL,
  targetCohortId = NULL,
  indexDate = "cohort_start_date",
  targetStartDate = "cohort_start_date",
  targetEndDate = "cohort_end_date",
  censorDate = NULL,
  atFirst = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)

Require intersections based on other cohorts

Require intersections based on other cohorts

Require intersections

Based on concept sets

requireConceptIntersect(
  cohort,
  conceptSet,
  window,
  intersections = c(1, Inf),
  cohortId = NULL,
  indexDate = "cohort_start_date",
  targetStartDate = "event_start_date",
  targetEndDate = "event_end_date",
  inObservation = TRUE,
  censorDate = NULL,
  atFirst = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)

Based on tables in the CDM

requireTableIntersect(
  cohort,
  tableName,
  window,
  intersections = c(1, Inf),
  cohortId = NULL,
  indexDate = "cohort_start_date",
  targetStartDate = startDateColumn(tableName),
  targetEndDate = endDateColumn(tableName),
  inObservation = TRUE,
  censorDate = NULL,
  atFirst = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)

Require intersections based on concepts

Require intersections based on concepts

Other require functions

In date range

requireInDateRange(
  cohort,
  dateRange,
  cohortId = NULL,
  indexDate = "cohort_start_date",
  atFirst = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)

A minimum number of counts

requireMinCohortCount(
  cohort,
  minCohortCount,
  cohortId = NULL,
  updateSettings = FALSE,
  name = tableName(cohort)
)

Time Manipulation

Functions to update cohort start and end dates

Exit at death date

exitAtDeath(
  cohort,
  cohortId = NULL,
  requireDeath = FALSE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Exit at death date

exitAtDeath(
  cohort,
  cohortId = NULL,
  requireDeath = FALSE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Exit at observation end date

exitAtObservationEnd(
  cohort,
  cohortId = NULL,
  limitToCurrentPeriod = TRUE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Exit at observation end date

exitAtObservationEnd(
  cohort,
  cohortId = NULL,
  limitToCurrentPeriod = TRUE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Exit at first/last date

At first date

exitAtFirstDate(
  cohort,
  dateColumns,
  cohortId = NULL,
  returnReason = FALSE,
  keepDateColumns = TRUE,
  name = tableName(cohort),
  .softValidation = FALSE
)

At last date

exitAtLastDate(
  cohort,
  dateColumns,
  cohortId = NULL,
  returnReason = FALSE,
  keepDateColumns = TRUE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Exit at first/last date

Use case

# Use case: end follow-up at the earliest of death, the next COVID-19
# vaccination, or the end of observation, and record which one applied.
cdm$study_population <- cdm$study_population |>
  # exitAtFirstDate() compares date columns, so ask for the end of observation
  # as a date instead of the default number of days.
  PatientProfiles::addFutureObservation(
    futureObservationName = "observation_end",
    futureObservationType = "date"
  ) |>
  # Adds the death date; the default column name is `date_of_death`.
  PatientProfiles::addDeathDate() |>
  # Date of a COVID-19 vaccine record from the day after the index date onwards.
  PatientProfiles::addCohortIntersectDate(
    targetCohortTable = "covid_vaccines",
    targetCohortId = "any_covid_vaccine",
    window = c(1, Inf),
    nameStyle = "next_covid_vaccine"
  ) |>
  exitAtFirstDate(
    dateColumns = c("date_of_death", "next_covid_vaccine", "observation_end"),
    keepDateColumns = FALSE, # drop the helper date columns afterwards
    returnReason = TRUE      # keep which date determined the exit
  )

Entry at first/last date

At first date

entryAtFirstDate(
  cohort,
  dateColumns,
  cohortId = NULL,
  returnReason = FALSE,
  keepDateColumns = TRUE,
  name = tableName(cohort),
  .softValidation = FALSE
)

At last date

entryAtLastDate(
  cohort,
  dateColumns,
  cohortId = NULL,
  returnReason = FALSE,
  keepDateColumns = TRUE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Trim functions

Based on demographics

trimDemographics(
  cohort,
  cohortId = NULL,
  ageRange = NULL,
  sex = NULL,
  minPriorObservation = NULL,
  minFutureObservation = NULL,
  name = tableName(cohort),
  .softValidation = TRUE
)

Based on date range

trimToDateRange(
  cohort,
  dateRange,
  cohortId = NULL,
  startDate = "cohort_start_date",
  endDate = "cohort_end_date",
  name = tableName(cohort),
  .softValidation = FALSE
)

Trim based on demographics

Trim based on demographics

Pad dates functions

padCohortDate(
  cohort,
  days,
  cohortDate = "cohort_start_date",
  indexDate = "cohort_start_date",
  collapse = TRUE,
  padObservation = TRUE,
  cohortId = NULL,
  name = tableName(cohort),
  .softValidation = FALSE
)
padCohortStart(
  cohort,
  days,
  collapse = TRUE,
  padObservation = TRUE,
  cohortId = NULL,
  name = tableName(cohort),
  .softValidation = FALSE
)
padCohortEnd(
  cohort,
  days,
  collapse = TRUE,
  padObservation = TRUE,
  cohortId = NULL,
  name = tableName(cohort),
  .softValidation = FALSE
)

Pad dates functions

Use case: apply wash-out

# Wash-out use case: pad the end of every cardiac event entry by 90 days and
# overwrite the cohort table with the result.
# NOTE(review): collapse = TRUE presumably merges entries that overlap after
# padding, and padObservation = TRUE presumably keeps the padded dates within
# observation -- confirm both against the padCohortEnd() documentation.
cdm$cardiac_events <- cdm$cardiac_events |>
  padCohortEnd(
    days = 90,
    collapse = TRUE,
    padObservation = TRUE
  )

Transformation and Combination

Split cohorts

Based on years

yearCohorts(
  cohort,
  years,
  cohortId = NULL,
  name = tableName(cohort),
  .softValidation = FALSE
)

Based on other columns

stratifyCohorts(
  cohort,
  strata,
  cohortId = NULL,
  removeStrata = TRUE,
  name = tableName(cohort),
  .softValidation = TRUE
)

Combine cohorts - Union cohorts

unionCohorts(
  cohort,
  cohortId = NULL,
  gap = 0,
  cohortName = NULL,
  keepOriginalCohorts = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)

Combine cohorts - Union cohorts

unionCohorts(
  cohort,
  cohortId = NULL,
  gap = 0,
  cohortName = NULL,
  keepOriginalCohorts = FALSE,
  name = tableName(cohort),
  .softValidation = TRUE
)

Combine cohorts - Intersect cohorts

intersectCohorts(
  cohort,
  cohortId = NULL,
  gap = 0,
  returnNonOverlappingCohorts = FALSE,
  keepOriginalCohorts = FALSE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Combine cohorts - Intersect cohorts

intersectCohorts(
  cohort,
  cohortId = NULL,
  gap = 0,
  returnNonOverlappingCohorts = FALSE,
  keepOriginalCohorts = FALSE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Combine cohorts - Intersect cohorts

intersectCohorts(
  cohort,
  cohortId = NULL,
  gap = 0,
  returnNonOverlappingCohorts = FALSE,
  keepOriginalCohorts = FALSE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Filter cohorts

Sample

sampleCohorts(
  cohort, 
  n, 
  cohortId = NULL, 
  name = tableName(cohort)
)

Subset

subsetCohorts(
  cohort,
  cohortId,
  name = tableName(cohort),
  .softValidation = TRUE
)

Collapse cohorts

collapseCohorts(
  cohort,
  cohortId = NULL,
  gap = 0,
  name = tableName(cohort),
  .softValidation = FALSE
)

Matched cohorts

matchCohorts(
  cohort,
  cohortId = NULL,
  matchSex = TRUE,
  matchYearOfBirth = TRUE,
  ratio = 1,
  keepOriginalCohorts = FALSE,
  name = tableName(cohort),
  .softValidation = FALSE
)

Your turn

Cohorts to create

In our study we will characterise recipients of pneumococcal (concept 40213201) and varicella vaccines (concept 40213251). Recipients of the pneumococcal vaccine must be aged between 60 and 80, while recipients of varicella vaccine must be under 5. We will only include each individual's first-ever vaccination. We will exclude anyone with a prior history of acute bronchitis (concept 260139). To summarise our results we will create a table summarising attrition and another table summarising patient demographics (age and sex).

Minimal help

# Good luck

A lot of help

library(dplyr)
library(CDMConnector)
library(CodelistGenerator)
library(CohortConstructor)
library(CohortCharacteristics)

con <- DBI::dbConnect(duckdb::duckdb(), 
                      eunomiaDir(datasetName = "GiBleed"))
cdm <- cdmFromCon(con,
                  cdmSchema = "main",
                  writeSchema = "main")

# vaccine codes
vaccine_codes <- list(pneumococcal_vaccine = 40213201L,
                      varicella_vaccine = 40213251L) |> 
  newCodelist()

# acute bronchitis
acute_bronchitis_codes <- list(acute_bronchitis = 260139L) |> 
  newCodelist()

# do people have more than one vaccine record?
# does mapping from source look sensible for vaccines?
summariseCodeUse(
  vaccine_codes, cdm) |> 
  tableCodeUse()

# Create base cohorts
cdm$vaccine_cohorts <- cdm |>
  conceptCohort(
    .....)

cdm$acute_bronchitis_cohort <- cdm |>
  conceptCohort(
    conceptSet = acute_bronchitis_codes,
    .....)

# keep only the first ever vaccination record
# (the first cohort entry per person in each cohort)
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireIsFirst....

# for pneumococcal_vaccine keep those aged between 60 and 80
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireAge(
    ....
    cohortId = "pneumococcal_vaccine")

# for varicella_vaccine keep those aged under 5
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireAge(
    .... 
    cohortId = "varicella_vaccine")

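# exclude anyone with a prior record in the exclusion cohort
# (hint: the default intersections = c(1, Inf) REQUIRES a record; use intersections = 0)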
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireCohortIntersect( .....)

# use cohort characteristics to make an attrition table
summariseCohortAttrition(cdm$vaccine_cohorts) |> 
  tableCohortAttrition()

# use cohort characteristics to make a characteristics table
summariseCharacteristics(cdm$vaccine_cohorts) |> 
  tableCharacteristics()

Have done it for you

# Worked solution: build first-ever pneumococcal and varicella vaccination
# cohorts, apply age restrictions, exclude people with a prior history of
# acute bronchitis, and summarise attrition and characteristics.
library(dplyr)
library(CDMConnector)
library(CodelistGenerator)
library(CohortConstructor)
library(CohortCharacteristics)

# Connect to the GiBleed Eunomia example dataset in a local duckdb database
con <- DBI::dbConnect(
  duckdb::duckdb(),
  eunomiaDir(datasetName = "GiBleed")
)
cdm <- cdmFromCon(
  con,
  cdmSchema = "main",
  writeSchema = "main"
)

# vaccine codes
vaccine_codes <- list(
  pneumococcal_vaccine = 40213201L,
  varicella_vaccine = 40213251L
) |>
  newCodelist()

# acute bronchitis
acute_bronchitis_codes <- list(acute_bronchitis = 260139L) |>
  newCodelist()

# do people have more than one vaccine record?
# does mapping from source look sensible for vaccines?
summariseCodeUse(vaccine_codes, cdm) |>
  tableCodeUse()

# Create base cohorts
cdm$vaccine_cohorts <- cdm |>
  conceptCohort(
    conceptSet = vaccine_codes,
    name = "vaccine_cohorts",
    exit = "event_start_date",
    useSourceFields = TRUE
  )

cdm$acute_bronchitis_cohort <- cdm |>
  conceptCohort(
    conceptSet = acute_bronchitis_codes,
    name = "acute_bronchitis_cohort",
    exit = "event_start_date",
    subsetCohort = "vaccine_cohorts"
  )

# keep only the first ever vaccination record
# (the first cohort entry per person in each cohort)
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireIsFirstEntry()

# for pneumococcal_vaccine keep those aged between 60 and 80
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireAge(
    name = "vaccine_cohorts",
    ageRange = c(60, 80),
    cohortId = "pneumococcal_vaccine"
  )

# for varicella_vaccine keep those aged under 5
# (ageRange bounds are inclusive, so the upper bound is 4)
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireAge(
    name = "vaccine_cohorts",
    ageRange = c(0, 4),
    cohortId = "varicella_vaccine"
  )

# exclude anyone with a prior history of acute bronchitis:
# intersections = 0 keeps only people with NO record in the window
# (the default c(1, Inf) would instead require at least one record)
cdm$vaccine_cohorts <- cdm$vaccine_cohorts |>
  requireCohortIntersect(
    targetCohortTable = "acute_bronchitis_cohort",
    window = c(-Inf, -1),
    intersections = 0
  )

# use cohort characteristics to make an attrition table
summariseCohortAttrition(cdm$vaccine_cohorts) |>
  tableCohortAttrition()

# use cohort characteristics to make a characteristics table
summariseCharacteristics(cdm$vaccine_cohorts) |>
  tableCharacteristics()

Changes

  • What would you change if you wanted to use the cohort for prevalence?
  • What would you change if you wanted to exclude people who had a record of acute bronchitis in the prior year?
  • What would you change if you wanted to exclude people who had an ongoing acute bronchitis event in the last year?
  • How could you define a second dose cohort?

Thank you!