IncidencePrevalence

Working with data in the OMOP CDM format

Standardisation of the data format

The OMOP Common Data Model

Creating a reference to the OMOP common data model

We’re going to use a modified version of the Eunomia synthetic data. The CDMConnector package creates a reference to the OMOP CDM data as a whole.

library(DBI)
library(dplyr)
library(duckdb)
library(CDMConnector)
library(here)

# Open a DuckDB connection to the database file at `dbPath`.
# NOTE(review): `dbPath` is not defined in the code shown here; it has to be
# set beforehand to the location of the .duckdb file.
db <- dbConnect(duckdb(), dbdir = dbPath)

# Create the CDM reference from the connection, naming "public" as the CDM
# schema and "results" as the write schema.
cdm <- cdmFromCon(con = db,  cdmSchema = "public", writeSchema = "results")
# Printing the reference lists the tables it contains.
cdm

── # OMOP CDM reference (duckdb) of Synthea ────────────────────────────────────

• omop tables: person, observation_period, visit_occurrence, visit_detail,
condition_occurrence, drug_exposure, procedure_occurrence, device_exposure,
measurement, observation, death, note, note_nlp, specimen, fact_relationship,
location, care_site, provider, payer_plan_period, cost, drug_era, dose_era,
condition_era, metadata, cdm_source, concept, vocabulary, domain,
concept_class, concept_relationship, relationship, concept_synonym,
concept_ancestor, source_to_concept_map, drug_strength, cohort_definition,
attribute_definition

• cohort tables: -

• achilles tables: -

• other tables: -

CDM clinical tables

# Preview the person table: column names, types and the first few values.
cdm$person |> glimpse()

Rows: ??
Columns: 18
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ person_id                   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
$ gender_concept_id           <int> 8532, 8532, 8532, 8507, 8532, 8507, 8507, …
$ year_of_birth               <int> 1970, 1929, 1970, 1966, 1936, 1996, 1923, …
$ month_of_birth              <int> 4, 3, 4, 2, 6, 5, 11, 8, 2, 3, 3, 3, 3, 5,…
$ day_of_birth                <int> 24, 18, 4, 26, 10, 29, 14, 20, 11, 7, 11, …
$ birth_datetime              <dttm> 1970-04-24, 1929-03-18, 1970-04-04, 1966-…
$ race_concept_id             <int> 8527, 8527, 8527, 8527, 8527, 8516, 8527, …
$ ethnicity_concept_id        <int> 38003564, 38003564, 38003564, 38003564, 38…
$ location_id                 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ provider_id                 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ care_site_id                <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ person_source_value         <chr> "00019d62-30d1-e285-e01c-68b371598db0", "0…
$ gender_source_value         <chr> "F", "F", "F", "M", "F", "M", "M", "M", "F…
$ gender_source_concept_id    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ race_source_value           <chr> "white", "white", "white", "white", "white…
$ race_source_concept_id      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ ethnicity_source_value      <chr> "nonhispanic", "nonhispanic", "nonhispanic…
$ ethnicity_source_concept_id <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

CDM clinical tables

# Preview the observation_period table (start and end dates per person).
cdm$observation_period |> glimpse()

Rows: ??
Columns: 5
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ observation_period_id         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
$ person_id                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
$ observation_period_start_date <date> 2014-05-09, 1977-04-11, 2014-04-19, 201…
$ observation_period_end_date   <date> 2023-05-12, 1986-09-15, 2023-04-22, 202…
$ period_type_concept_id        <int> 44814724, 44814724, 44814724, 44814724, …

CDM clinical tables

# Preview the condition_occurrence table.
cdm$condition_occurrence |> glimpse()

Rows: ??
Columns: 16
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ condition_occurrence_id       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
$ person_id                     <int> 2, 6, 7, 8, 8, 8, 8, 16, 16, 18, 18, 25,…
$ condition_concept_id          <int> 381316, 321042, 381316, 37311061, 437663…
$ condition_start_date          <date> 1986-09-08, 2021-06-23, 2021-04-07, 202…
$ condition_start_datetime      <dttm> 1986-09-08, 2021-06-23, 2021-04-07, 202…
$ condition_end_date            <date> 1986-09-08, 2021-06-23, 2021-04-07, 202…
$ condition_end_datetime        <dttm> 1986-09-08, 2021-06-23, 2021-04-07, 202…
$ condition_type_concept_id     <int> 38000175, 38000175, 38000175, 38000175, …
$ condition_status_concept_id   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ stop_reason                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ provider_id                   <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ visit_occurrence_id           <int> 19, 55, 67, 79, 79, 79, 79, 168, 171, 19…
$ visit_detail_id               <int> 1000019, 1000055, 1000067, 1000079, 1000…
$ condition_source_value        <chr> "230690007", "410429000", "230690007", "…
$ condition_source_concept_id   <int> 381316, 321042, 381316, 37311061, 437663…
$ condition_status_source_value <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

CDM clinical tables

# Preview the procedure_occurrence table.
cdm$procedure_occurrence |> glimpse()

Rows: ??
Columns: 14
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ procedure_occurrence_id     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
$ person_id                   <int> 2, 2, 6, 6, 6, 7, 7, 16, 16, 16, 16, 18, 1…
$ procedure_concept_id        <int> 4202451, 4230911, 4057420, 40492359, 42309…
$ procedure_date              <date> 1986-09-08, 1986-09-08, 2021-06-23, 2021-…
$ procedure_datetime          <dttm> 1986-09-08 22:07:35, 1986-09-08 22:07:35,…
$ procedure_type_concept_id   <int> 38000267, 38000267, 38000267, 38000267, 38…
$ modifier_concept_id         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ quantity                    <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ provider_id                 <int> 30, 30, 10, 10, 10, 26, 26, 3, 14089, 3, 1…
$ visit_occurrence_id         <int> 19, 19, 55, 55, 55, 67, 67, 168, 170, 168,…
$ visit_detail_id             <int> 1000019, 1000019, 1000055, 1000055, 100005…
$ procedure_source_value      <chr> "433112001", "40701008", "18286008", "4473…
$ procedure_source_concept_id <int> 4202451, 4230911, 4057420, 40492359, 42309…
$ modifier_source_value       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

CDM clinical tables

# Preview the drug_exposure table.
cdm$drug_exposure |> glimpse()

Rows: ??
Columns: 23
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ drug_exposure_id             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
$ person_id                    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ drug_concept_id              <int> 40213260, 40213260, 40213260, 40213260, 4…
$ drug_exposure_start_date     <date> 2021-04-30, 2020-04-24, 2021-04-30, 2020…
$ drug_exposure_start_datetime <dttm> 2021-04-30 16:49:39, 2020-04-24 16:49:39…
$ drug_exposure_end_date       <date> 2021-04-30, 2020-04-24, 2021-04-30, 2020…
$ drug_exposure_end_datetime   <dttm> 2021-04-30 16:49:39, 2020-04-24 16:49:39…
$ verbatim_end_date            <date> 2021-04-30, 2020-04-24, 2021-04-30, 2020…
$ drug_type_concept_id         <int> 32869, 32869, 32869, 32869, 32869, 32869,…
$ stop_reason                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ refills                      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ quantity                     <dbl> 30, 1, 30, 5, 30, 5, 1, 10, 30, 0, 5, 0, …
$ days_supply                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ sig                          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ route_concept_id             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ lot_number                   <chr> "0", "0", "0", "0", "0", "0", "0", "0", "…
$ provider_id                  <int> 12357, 12357, 12356, 12356, 12357, 12356,…
$ visit_occurrence_id          <int> 6, 8, 6, 8, 6, 6, 7, 2, 6, 8, 9, 1, 7, 2,…
$ visit_detail_id              <int> 1000006, 1000008, 1000006, 1000008, 10000…
$ drug_source_value            <chr> "121", "121", "121", "121", "113", "113",…
$ drug_source_concept_id       <int> 40213260, 40213260, 40213260, 40213260, 4…
$ route_source_value           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ dose_unit_source_value       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

CDM vocabulary tables

# Preview the concept vocabulary table.
cdm$concept |> glimpse()

Rows: ??
Columns: 10
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ concept_id       <int> 44909219, 45028194, 44892033, 45318169, 45113390, 453…
$ concept_name     <chr> "albuterol sulfate 2.5mg/.5mL RESPIRATORY (INHALATION…
$ domain_id        <chr> "Drug", "Drug", "Drug", "Drug", "Drug", "Drug", "Drug…
$ vocabulary_id    <chr> "NDC", "NDC", "NDC", "NDC", "NDC", "NDC", "NDC", "NDC…
$ concept_class_id <chr> "9-digit NDC", "11-digit NDC", "11-digit NDC", "11-di…
$ standard_concept <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ concept_code     <chr> "004879901", "00487990102", "00487990110", "004879901…
$ valid_start_date <date> 2001-06-26, 2008-06-01, 2008-06-01, 2008-06-01, 2007…
$ valid_end_date   <date> 2099-12-31, 2019-02-01, 2010-02-01, 2099-12-31, 2015…
$ invalid_reason   <chr> NA, "D", "D", NA, "D", "D", "D", "D", "D", "D", "D", …

# Look up the concepts whose name is exactly "Atrial fibrillation".
cdm$concept |> 
  filter(concept_name == "Atrial fibrillation") |> 
  glimpse()

Rows: ??
Columns: 10
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ concept_id       <int> 313217, 44821957
$ concept_name     <chr> "Atrial fibrillation", "Atrial fibrillation"
$ domain_id        <chr> "Condition", "Condition"
$ vocabulary_id    <chr> "SNOMED", "ICD9CM"
$ concept_class_id <chr> "Clinical Finding", "5-dig billing code"
$ standard_concept <chr> "S", NA
$ concept_code     <chr> "49436004", "427.31"
$ valid_start_date <date> 2002-01-31, 1970-01-01
$ valid_end_date   <date> 2099-12-31, 2099-12-31
$ invalid_reason   <chr> NA, NA

CDM vocabulary tables

# Preview the concept_ancestor table (ancestor/descendant concept pairs).
cdm$concept_ancestor |> glimpse()

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ ancestor_concept_id      <int> 824838, 1125315, 830312, 820402, 757996, 8209…
$ descendant_concept_id    <int> 1829903, 40926860, 2055095, 21081826, 4102887…
$ min_levels_of_separation <int> 2, 1, 1, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 1, …
$ max_levels_of_separation <int> 2, 1, 1, 3, 2, 3, 3, 4, 2, 2, 3, 1, 3, 3, 1, …

CDM vocabulary tables

# Keep the rows where concept 313217 (the SNOMED "Atrial fibrillation" concept
# found by the name lookup) is the ancestor, i.e. list its descendants.
cdm$concept_ancestor |> 
  filter(ancestor_concept_id == 313217) |> 
  glimpse()

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ ancestor_concept_id      <int> 313217, 313217, 313217, 313217, 313217, 31321…
$ descendant_concept_id    <int> 4232691, 4199501, 1340258, 313217, 4119601, 4…
$ min_levels_of_separation <int> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1
$ max_levels_of_separation <int> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1

# Look up one of those descendant concepts (4232691) by its concept id.
cdm$concept |> 
  filter(concept_id == 4232691) |> 
  glimpse()

Rows: ??
Columns: 10
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ concept_id       <int> 4232691
$ concept_name     <chr> "Permanent atrial fibrillation"
$ domain_id        <chr> "Condition"
$ vocabulary_id    <chr> "SNOMED"
$ concept_class_id <chr> "Clinical Finding"
$ standard_concept <chr> "S"
$ concept_code     <chr> "440028005"
$ valid_start_date <date> 2009-01-31
$ valid_end_date   <date> 2099-12-31
$ invalid_reason   <chr> NA

CDM vocabulary tables

# Preview the concept_relationship table.
cdm$concept_relationship |> glimpse()

Rows: ??
Columns: 6
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ concept_id_1     <int> 2063711, 35762727, 41163454, 41319147, 43683052, 2072…
$ concept_id_2     <int> 2016204, 21017486, 43566451, 40819996, 40817322, 2015…
$ relationship_id  <chr> "Has brand name", "Has brand name", "Has brand name",…
$ valid_start_date <date> 2020-07-31, 2017-08-02, 2017-07-18, 2017-07-18, 2017…
$ valid_end_date   <date> 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099…
$ invalid_reason   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

# Look up the concept with id 4068155.
cdm$concept |>
  filter(concept_id == 4068155) |>
  glimpse()

Rows: ??
Columns: 10
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ concept_id       <int> 4068155
$ concept_name     <chr> "Atrial arrhythmia"
$ domain_id        <chr> "Condition"
$ vocabulary_id    <chr> "SNOMED"
$ concept_class_id <chr> "Clinical Finding"
$ standard_concept <chr> "S"
$ concept_code     <chr> "17366009"
$ valid_start_date <date> 2002-01-31
$ valid_end_date   <date> 2099-12-31
$ invalid_reason   <chr> NA

Creating study cohorts

# Build a cohort table called "atrial_fibrillation" from one concept set
# ("afib", concept id 313217), using limit = "all" and ending each cohort
# entry at the event end date.
cdm <- generateConceptCohortSet(
  cdm = cdm,
  name = "atrial_fibrillation",
  conceptSet = list(afib = 313217),
  limit = "all",
  end = "event_end_date"
)

# Preview the resulting cohort table.
cdm$atrial_fibrillation |>
  glimpse()

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/C:\Users\eburn\Documents\EMA training november 2024\Presentations\CdmIncPrev\darwinTutorialTest.duckdb]
$ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ subject_id           <int> 2190, 3403, 4584, 6199, 3926, 5982, 6751, 8964, 9…
$ cohort_start_date    <date> 1995-03-12, 2012-08-13, 2009-04-27, 2013-02-15, …
$ cohort_end_date      <date> 1995-03-12, 2012-08-13, 2009-04-27, 2013-02-15, …

Creating study cohorts

# Number of records and subjects in each cohort of the table.
cohortCount(cdm$atrial_fibrillation)

# A tibble: 1 × 3
  cohort_definition_id number_records number_subjects
                 <int>          <int>           <int>
1                    1            782             782

# Settings recorded for each cohort (name and the arguments used to build it).
settings(cdm$atrial_fibrillation)

# A tibble: 1 × 6
  cohort_definition_id cohort_name limit prior_observation future_observation
                 <int> <chr>       <chr>             <dbl>              <dbl>
1                    1 afib        all                   0                  0
# ℹ 1 more variable: end <chr>

The IncidencePrevalence R package

Denominator population

Observation periods

Denominator population

Observation periods + study period

Denominator population

Observation periods + study period + prior history requirement

Denominator population

Observation periods + study period + prior history requirement + age (and sex) restriction

generateDenominatorCohortSet()

library(IncidencePrevalence)
library(ggplot2)
# Replace the CDM reference with a mock one (sampleSize = 50000) for the
# IncidencePrevalence examples.
cdm <- mockIncidencePrevalenceRef(sampleSize = 50000)

# Denominator cohort built with every option left at its default.
cdm <- generateDenominatorCohortSet(cdm = cdm, name = "dpop")

cdm$dpop |> glimpse()

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/:memory:]
$ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ subject_id           <int> 1, 3, 7, 11, 14, 21, 29, 32, 33, 36, 37, 41, 42, …
$ cohort_start_date    <date> 1949-10-29, 1977-03-02, 1994-07-10, 1945-08-21, …
$ cohort_end_date      <date> 1957-03-20, 1982-06-05, 1996-11-16, 1951-02-11, …

generateDenominatorCohortSet()

# Rebuild "dpop", now restricting follow-up to the study period
# 2008-01-01 to 2012-01-01.
cdm <- generateDenominatorCohortSet(
  cdm = cdm,
  name = "dpop",
  cohortDateRange = as.Date(c("2008-01-01", "2012-01-01"))
)

cdm$dpop |> glimpse()

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/:memory:]
$ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ subject_id           <int> 33, 41, 52, 59, 69, 76, 79, 89, 122, 141, 151, 16…
$ cohort_start_date    <date> 2008-01-01, 2008-01-01, 2008-01-01, 2008-01-01, …
$ cohort_end_date      <date> 2008-08-04, 2012-01-01, 2011-10-24, 2012-01-01, …

generateDenominatorCohortSet()

# Number of records and subjects in the denominator cohort.
cohortCount(cdm$dpop)

# A tibble: 1 × 3
  cohort_definition_id number_records number_subjects
                 <int>          <int>           <int>
1                    1           3575            3575

# Settings of the denominator cohort (age group, sex, prior observation, ...).
settings(cdm$dpop)

# A tibble: 1 × 10
  cohort_definition_id cohort_name        age_group sex   days_prior_observation
                 <int> <chr>              <chr>     <chr>                  <dbl>
1                    1 denominator_cohor… 0 to 150  Both                       0
# ℹ 5 more variables: start_date <date>, end_date <date>, time_at_risk <chr>,
#   target_cohort_definition_id <int>, target_cohort_name <chr>

generateDenominatorCohortSet()

# Rebuild "dpop" with the same study period, now also requesting two age
# groups (0-49, 50-100) and sex-specific cohorts.
cdm <- generateDenominatorCohortSet(
  cdm = cdm,
  name = "dpop",
  cohortDateRange = as.Date(c("2008-01-01", "2012-01-01")),
  ageGroup = list(c(0, 49), c(50, 100)),
  sex = c("Male", "Female")
)

cdm$dpop |> glimpse()

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/:memory:]
$ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ subject_id           <int> 59, 89, 200, 208, 396, 501, 579, 581, 615, 642, 6…
$ cohort_start_date    <date> 2008-01-01, 2008-01-01, 2008-01-01, 2008-01-01, …
$ cohort_end_date      <date> 2012-01-01, 2009-08-06, 2010-12-23, 2012-01-01, …

generateDenominatorCohortSet()

# Same age and sex specification as before, additionally requesting two
# prior-observation requirements (0 and 180 days).
cdm <- generateDenominatorCohortSet(
  cdm = cdm,
  name = "dpop",
  cohortDateRange = as.Date(c("2008-01-01", "2012-01-01")),
  ageGroup = list(c(0, 49), c(50, 100)),
  sex = c("Male", "Female"),
  daysPriorObservation = c(0, 180)
)

cdm$dpop |> glimpse()

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/:memory:]
$ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ subject_id           <int> 59, 89, 200, 208, 396, 501, 579, 581, 615, 642, 6…
$ cohort_start_date    <date> 2008-01-01, 2008-01-01, 2008-01-01, 2008-01-01, …
$ cohort_end_date      <date> 2012-01-01, 2009-08-06, 2010-12-23, 2012-01-01, …

generateDenominatorCohortSet()

# One settings row per denominator cohort that was generated.
settings(cdm$dpop)

# A tibble: 8 × 10
  cohort_definition_id cohort_name        age_group sex   days_prior_observation
                 <int> <chr>              <chr>     <chr>                  <dbl>
1                    1 denominator_cohor… 0 to 49   Male                       0
2                    2 denominator_cohor… 0 to 49   Male                     180
3                    3 denominator_cohor… 0 to 49   Fema…                      0
4                    4 denominator_cohor… 0 to 49   Fema…                    180
5                    5 denominator_cohor… 50 to 100 Male                       0
6                    6 denominator_cohor… 50 to 100 Male                     180
7                    7 denominator_cohor… 50 to 100 Fema…                      0
8                    8 denominator_cohor… 50 to 100 Fema…                    180
# ℹ 5 more variables: start_date <date>, end_date <date>, time_at_risk <chr>,
#   target_cohort_definition_id <int>, target_cohort_name <chr>

generateDenominatorCohortSet()

# Records and subjects in each of the denominator cohorts.
cohortCount(cdm$dpop)

# A tibble: 8 × 3
  cohort_definition_id number_records number_subjects
                 <int>          <int>           <int>
1                    1            973             973
2                    2            959             959
3                    3            902             902
4                    4            890             890
5                    5            933             933
6                    6            925             925
7                    7            922             922
8                    8            900             900

Adding (time-invariant) variables for stratification

If we later want to estimate incidence or prevalence stratified by some time-invariant characteristic, we will need to add a variable to our denominator cohort table.

# Add a fixed "group" column to the denominator cohort, derived from the
# subject id ("first" for ids below 20, otherwise "second").
cdm$dpop <- cdm$dpop |>
  mutate(
    group = if_else(as.numeric(subject_id) < 20, "first", "second")
  )

cdm$dpop |> glimpse()

Rows: ??
Columns: 5
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/:memory:]
$ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ subject_id           <int> 59, 89, 200, 208, 396, 501, 579, 581, 615, 642, 6…
$ cohort_start_date    <date> 2008-01-01, 2008-01-01, 2008-01-01, 2008-01-01, …
$ cohort_end_date      <date> 2012-01-01, 2009-08-06, 2010-12-23, 2012-01-01, …
$ group                <chr> "second", "second", "second", "second", "second",…

generateTargetDenominatorCohortSet()

When we want to stratify on a time-varying characteristic, we will do this by first creating a cohort for it. Once we have the cohort, we will use it when creating our denominator cohort.

generateTargetDenominatorCohortSet()

# Create a denominator cohort based on the cohort held in the "target" table.
cdm <- generateTargetDenominatorCohortSet(
  cdm = cdm,
  targetCohortTable = "target",
  name = "denominator_acute_asthma"
)

dplyr::glimpse(cdm$denominator_acute_asthma)

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/:memory:]
$ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ subject_id           <int> 15732, 47567, 4033, 501, 9766, 31427, 40502, 2126…
$ cohort_start_date    <date> 1994-07-26, 2007-06-10, 2006-05-03, 2004-03-14, …
$ cohort_end_date      <date> 2003-12-04, 2008-02-07, 2008-04-06, 2008-12-05, …

generateTargetDenominatorCohortSet()

We can add demographic requirements like before. But it is important to note that these are applied at the cohort start date of the target cohort.

# Target-based denominator with demographic requirements: females aged 11 to
# 15 and no prior observation requirement.
cdm <- generateTargetDenominatorCohortSet(
  cdm = cdm,
  targetCohortTable = "target",
  name = "denominator_acute_asthma_2",
  ageGroup = list(c(11, 15)),
  sex = "Female",
  daysPriorObservation = 0
)

dplyr::glimpse(cdm$denominator_acute_asthma_2)

Rows: ??
Columns: 4
Database: DuckDB v1.0.0 [eburn@Windows 10 x64:R 4.4.0/:memory:]
$ cohort_definition_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ subject_id           <int> 19607, 2874, 32933, 8264, 45743, 29239, 47281, 44…
$ cohort_start_date    <date> 1993-10-05, 1981-10-21, 2000-10-21, 1935-10-04, …
$ cohort_end_date      <date> 1994-08-15, 1982-11-21, 2000-12-20, 1937-12-15, …

Incidence rates

Washout all history, no repetitive events

Incidence rates

No washout, no repetitive events

Incidence rates

Some washout, no repetitive events

Incidence rates

Some washout, repetitive events

estimateIncidence()

# Fresh mock CDM for the incidence examples.
# NOTE(review): outPre presumably sets the proportion of people with the
# outcome; confirm against the mockIncidencePrevalenceRef documentation.
cdm <- mockIncidencePrevalenceRef(sampleSize = 50000, outPre = 0.5)

# Denominator cohorts for four age bands over the 2008-2012 study period.
cdm <- generateDenominatorCohortSet(
  cdm = cdm,
  name = "denominator",
  cohortDateRange = as.Date(c("2008-01-01", "2012-01-01")),
  ageGroup = list(c(0, 30), c(31, 50), c(51, 70), c(71, 100))
)

# Yearly incidence with all prior history used as washout and no repeated
# events.
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  interval = "years",
  outcomeWashout = Inf,
  repeatedEvents = FALSE
)

estimateIncidence()

# Preview the incidence result (one row per estimate, in long format).
inc |>
  glimpse()

Rows: 288
Columns: 13
$ result_id        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ cdm_name         <chr> "mock", "mock", "mock", "mock", "mock", "mock", "mock…
$ group_name       <chr> "denominator_cohort_name", "denominator_cohort_name",…
$ group_level      <chr> "denominator_cohort_1", "denominator_cohort_1", "deno…
$ strata_name      <chr> "overall", "overall", "overall", "overall", "overall"…
$ strata_level     <chr> "overall", "overall", "overall", "overall", "overall"…
$ variable_name    <chr> "outcome_cohort_name", "outcome_cohort_name", "outcom…
$ variable_level   <chr> "cohort_1", "cohort_1", "cohort_1", "cohort_1", "coho…
$ estimate_name    <chr> "denominator_count", "outcome_count", "person_days", …
$ estimate_type    <chr> "integer", "integer", "numeric", "numeric", "numeric"…
$ estimate_value   <chr> "662", "63", "189075", "517.659", "12170.174", "9351.…
$ additional_name  <chr> "incidence_start_date &&& incidence_end_date", "incid…
$ additional_level <chr> "2008-01-01 &&& 2008-12-31", "2008-01-01 &&& 2008-12-…

estimateIncidence()

# Monthly incidence under two washout settings (0 and 365 days), without
# repeated events.
inc <- estimateIncidence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  # Lower-case scalar form, consistent with the other calls in this tutorial
  # and with the "months" value reported in the result settings.
  interval = "months",
  outcomeWashout = c(0, 365),
  repeatedEvents = FALSE
)

inc |>
  glimpse()

Rows: 3,040
Columns: 13
$ result_id        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ cdm_name         <chr> "mock", "mock", "mock", "mock", "mock", "mock", "mock…
$ group_name       <chr> "denominator_cohort_name", "denominator_cohort_name",…
$ group_level      <chr> "denominator_cohort_1", "denominator_cohort_1", "deno…
$ strata_name      <chr> "overall", "overall", "overall", "overall", "overall"…
$ strata_level     <chr> "overall", "overall", "overall", "overall", "overall"…
$ variable_name    <chr> "outcome_cohort_name", "outcome_cohort_name", "outcom…
$ variable_level   <chr> "cohort_1", "cohort_1", "cohort_1", "cohort_1", "coho…
$ estimate_name    <chr> "denominator_count", "outcome_count", "person_days", …
$ estimate_type    <chr> "integer", "integer", "numeric", "numeric", "numeric"…
$ estimate_value   <chr> "766", NA, NA, NA, NA, NA, NA, "761", NA, NA, NA, NA,…
$ additional_name  <chr> "incidence_start_date &&& incidence_end_date", "incid…
$ additional_level <chr> "2008-01-01 &&& 2008-01-31", "2008-01-01 &&& 2008-01-…

estimateIncidence()

# Settings behind each result_id: analysis options plus denominator and
# outcome cohort details.
settings(inc) |>
  glimpse()

Rows: 16
Columns: 17
$ result_id                            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
$ result_type                          <chr> "incidence", "incidence", "incide…
$ package_name                         <chr> "IncidencePrevalence", "Incidence…
$ package_version                      <chr> "0.8.0.900", "0.8.0.900", "0.8.0.…
$ analysis_outcome_washout             <chr> "0", "365", "0", "365", "0", "365…
$ analysis_repeated_events             <chr> "FALSE", "FALSE", "FALSE", "FALSE…
$ analysis_interval                    <chr> "months", "months", "months", "mo…
$ analysis_complete_database_intervals <chr> "TRUE", "TRUE", "TRUE", "TRUE", "…
$ denominator_age_group                <chr> "0 to 30", "0 to 30", "31 to 50",…
$ denominator_sex                      <chr> "Both", "Both", "Both", "Both", "…
$ denominator_days_prior_observation   <chr> "0", "0", "0", "0", "0", "0", "0"…
$ denominator_start_date               <chr> "2008-01-01", "2008-01-01", "2008…
$ denominator_end_date                 <chr> "2012-01-01", "2012-01-01", "2012…
$ denominator_time_at_risk             <chr> "0 to Inf", "0 to Inf", "0 to Inf…
$ denominator_target_cohort_name       <chr> "None", "None", "None", "None", "…
$ outcome_cohort_name                  <chr> "cohort_1", "cohort_1", "cohort_1…
$ min_cell_count                       <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …

estimateIncidence()

# Plot the incidence estimates, one panel per denominator age group.
plot <- plotIncidence(
  inc,
  facet = "denominator_age_group"
)

estimateIncidence()

# Display the incidence plot object stored in `plot`.
plot

Prevalence

Point prevalence

Prevalence

Period prevalence

estimatePointPrevalence() and estimatePeriodPrevalence()

# Fresh mock CDM for the prevalence examples.
# NOTE(review): outPre presumably sets the proportion of people with the
# outcome; confirm against the mockIncidencePrevalenceRef documentation.
cdm <- mockIncidencePrevalenceRef(
  sampleSize = 50000,
  outPre = 0.5
)

# Denominator cohorts for four age bands over the 2008-2012 study period.
cdm <- generateDenominatorCohortSet(
  cdm = cdm, name = "denominator",
  cohortDateRange = as.Date(c("2008-01-01", "2012-01-01")),
  ageGroup = list(
    c(0, 30),
    c(31, 50),
    c(51, 70),
    c(71, 100)
  )
)

# Yearly point prevalence of the outcome in each denominator cohort.
prev <- estimatePointPrevalence(
  cdm = cdm,
  denominatorTable = "denominator",
  outcomeTable = "outcome",
  # Lower-case form, consistent with the other calls in this tutorial and
  # with the "years" value reported in the result settings.
  interval = "years"
)

estimatePointPrevalence() and estimatePeriodPrevalence()

# Preview the prevalence result (one row per estimate, in long format).
prev |>
  glimpse()

Rows: 276
Columns: 13
$ result_id        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ cdm_name         <chr> "mock", "mock", "mock", "mock", "mock", "mock", "mock…
$ group_name       <chr> "denominator_cohort_name", "denominator_cohort_name",…
$ group_level      <chr> "denominator_cohort_1", "denominator_cohort_1", "deno…
$ strata_name      <chr> "overall", "overall", "overall", "overall", "overall"…
$ strata_level     <chr> "overall", "overall", "overall", "overall", "overall"…
$ variable_name    <chr> "outcome_cohort_name", "outcome_cohort_name", "outcom…
$ variable_level   <chr> "cohort_1", "cohort_1", "cohort_1", "cohort_1", "coho…
$ estimate_name    <chr> "denominator_count", "outcome_count", "prevalence", "…
$ estimate_type    <chr> "integer", "integer", "numeric", "numeric", "numeric"…
$ estimate_value   <chr> "755", NA, NA, NA, NA, "708", NA, NA, NA, NA, "670", …
$ additional_name  <chr> "prevalence_start_date &&& prevalence_end_date", "pre…
$ additional_level <chr> "2008-01-01 &&& 2008-01-01", "2008-01-01 &&& 2008-01-…

estimatePointPrevalence() and estimatePeriodPrevalence()

# Settings behind each result_id of the prevalence result.
settings(prev) |>
  glimpse()

Rows: 8
Columns: 17
$ result_id                            <int> 1, 2, 3, 4, 5, 6, 7, 8
$ result_type                          <chr> "prevalence", "prevalence", "prev…
$ package_name                         <chr> "IncidencePrevalence", "Incidence…
$ package_version                      <chr> "0.8.0.900", "0.8.0.900", "0.8.0.…
$ analysis_type                        <chr> "point prevalence", "point preval…
$ analysis_interval                    <chr> "years", "years", "years", "years…
$ analysis_complete_database_intervals <chr> "FALSE", "FALSE", "FALSE", "FALSE…
$ analysis_full_contribution           <chr> "FALSE", "FALSE", "FALSE", "FALSE…
$ denominator_age_group                <chr> "0 to 30", "31 to 50", "51 to 70"…
$ denominator_sex                      <chr> "Both", "Both", "Both", "Both", "…
$ denominator_days_prior_observation   <chr> "0", "0", "0", "0", "0", "0", "0"…
$ denominator_start_date               <chr> "2008-01-01", "2008-01-01", "2008…
$ denominator_end_date                 <chr> "2012-01-01", "2012-01-01", "2012…
$ denominator_time_at_risk             <chr> "0 to Inf", "0 to Inf", "0 to Inf…
$ denominator_target_cohort_name       <chr> "None", "None", "None", "None", "…
$ outcome_cohort_name                  <chr> NA, NA, NA, NA, "cohort_1", "coho…
$ min_cell_count                       <int> 5, 5, 5, 5, 5, 5, 5, 5

estimatePointPrevalence() and estimatePeriodPrevalence()

# Monthly period prevalence of the outcome in each denominator cohort.
prev <- estimatePeriodPrevalence(
  cdm = cdm,
  outcomeTable = "outcome",
  denominatorTable = "denominator",
  interval = "months"
)

prev |> glimpse()

Rows: 1,136
Columns: 13
$ result_id        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ cdm_name         <chr> "mock", "mock", "mock", "mock", "mock", "mock", "mock…
$ group_name       <chr> "denominator_cohort_name", "denominator_cohort_name",…
$ group_level      <chr> "denominator_cohort_1", "denominator_cohort_1", "deno…
$ strata_name      <chr> "overall", "overall", "overall", "overall", "overall"…
$ strata_level     <chr> "overall", "overall", "overall", "overall", "overall"…
$ variable_name    <chr> "outcome_cohort_name", "outcome_cohort_name", "outcom…
$ variable_level   <chr> "cohort_1", "cohort_1", "cohort_1", "cohort_1", "coho…
$ estimate_name    <chr> "denominator_count", "outcome_count", "prevalence", "…
$ estimate_type    <chr> "integer", "integer", "numeric", "numeric", "numeric"…
$ estimate_value   <chr> "766", NA, NA, NA, NA, "764", NA, NA, NA, NA, "764", …
$ additional_name  <chr> "prevalence_start_date &&& prevalence_end_date", "pre…
$ additional_level <chr> "2008-01-01 &&& 2008-01-31", "2008-01-01 &&& 2008-01-…

estimatePointPrevalence() and estimatePeriodPrevalence()

# Plot the prevalence estimates, one panel per denominator age group.
plot <- plotPrevalence(
  prev,
  facet = "denominator_age_group"
)

estimatePointPrevalence() and estimatePeriodPrevalence()

# Display the prevalence plot object stored in `plot`.
plot

Package paper

https://onlinelibrary.wiley.com/doi/10.1002/pds.5717