proper support for multiple subgroup variables

wjchulme · wjchulme · commit a6e156bc4cf8 · 2025-04-23T15:04:23.000+01:00
This allows multiple subgroup variables to be specified, separated by a dash, for example "age_group-sex".

If more than one subgroup is specified, analyses will be multiplicatively stratified, for example exposure * (subgroup1 * subgroup2), rather than exposure * (subgroup1 + subgroup2).
diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ follows:
             [default: NULL] character. The name of an exposure variable in the input dataset. Must be binary or not given. All outputs will be stratified by this variable. This could be an exposure in the usual sense, or it could be (mis)used to show different types of events (as long as the censoring structure is the same). If not specified, no stratification will occur.
 
         --subgroups=SUBGROUP_VARNAME
-            [default: NULL] The name of a subgroup variable or list of variable names. If a subgroup variable is used, analyses will be stratified as exposure * ( subgroup1, subgroup2, ...). If not specified, no stratification will occur.
+            [default: NULL] The name(s) of the subgroup variable(s). If using multiple subgroup variables, delimit with a dash (-), for example 'age_group-sex'. If subgroup variables are used, analyses will be stratified as exposure * subgroup1 * subgroup2 * ... (multiplicatively, not additively). If not specified, no stratification will occur.
 
         --origin_date=ORIGIN_VARNAME
             [default: must be specified] The name of a date variable (or name of a variable that is coercible to a date, e.g. 'YYYY-MM-DD') in the input dataset that represents the start of follow-up.
diff --git a/analysis/dataset_definition.py b/analysis/dataset_definition.py
@@ -66,7 +66,7 @@
 )
 
 
-# grouping variables
+# example exposure / stratification variables
 
 dataset.sex = patients.sex
 
@@ -79,6 +79,8 @@
   otherwise="unknown",
 )
 
+dataset.region = registered_patients.practice_nuts1_region_name
+
 
 # start of follow up variable
 
diff --git a/analysis/km.R b/analysis/km.R
@@ -29,7 +29,7 @@ if(length(args)==0){
   df_input <- "output/extract.arrow"
   dir_output <- "output/km_estimates/"
   exposure <- c("sex")
-  subgroups <- c("age_group")
+  subgroups <- c("age_group-region")
   origin_date <- "first_vax_date"
   event_date <- "second_vax_date"
   censor_date <- character() # "censor_date"
@@ -40,7 +40,7 @@ if(length(args)==0){
   smooth <- as.logical("FALSE")
   smooth_df <- as.integer("4")
   concise <- as.logical("TRUE")
-  plot <- as.logical("FALSE")
+  plot <- as.logical("TRUE")
   contrast <- as.logical("TRUE")
   filename_suffix <- as.character("")
 } else {
@@ -58,7 +58,7 @@ if(length(args)==0){
                 help = "[default: NULL] character. The name of an exposure variable in the input dataset. Must be binary or not given. All outputs will be stratified by this variable. This could be an exposure in the usual sense, or it could (mis)used to show different types of events (as long as the censoring structure is the same). If not specified, no stratification will occur.",
                 metavar = "exposure_varname"),
     make_option("--subgroups", type = "character", default = character(),
-                help = "[default: NULL] The name of a subgroup variable or list of variable names. If a subgroup variable is used, analyses will be stratified as exposure * ( subgroup1, subgroup2, ...). If not specified, no stratification will occur.",
+                help = "[default: NULL] The name(s) of the subgroup variable(s). If using multiple subgroup variables, delimit with a dash (-), for example 'age_group-sex'. If subgroup variables are used, analyses will be stratified as exposure * subgroup1 * subgroup2 * ... (multiplicatively, not additively). If not specified, no stratification will occur.",
                 metavar = "subgroup_varname"),
     make_option("--origin_date", type = "character",
                 help = "[default: must be specified] The name of a date variable (or name of a variable that is coercable to a date eg 'YYYY-MM-DD') in the input dataset that represents the start of follow-up.",
@@ -118,6 +118,11 @@ if(length(args)==0){
 # the quasiquotation still works inside ggplot, transmute, etc
 
 exposure_syms <- syms(exposure)
+
+
+if(length(subgroups)>0) {
+  subgroups <- strsplit(subgroups, "-")[[1]]
+}
 subgroup_syms <- syms(subgroups)
 
 # Create output directory ----
diff --git a/project.yaml b/project.yaml
@@ -17,7 +17,7 @@ actions:
       --df_input output/extract.arrow
       --dir_output output/km_estimates/
       --exposure sex
-      --subgroups age_group
+      --subgroups age_group-region
       --origin_date first_vax_date
       --event_date second_vax_date
       --censor_date censor_date

Original file line number	Diff line number	Diff line change
`@@ -66,7 +66,7 @@`
`66`	`66`	`)`
`67`	`67`
`68`	`68`
`69`		`-# grouping variables`
	`69`	`+# example exposure / stratification variables`
`70`	`70`
`71`	`71`	`dataset.sex = patients.sex`
`72`	`72`
`@@ -79,6 +79,8 @@`
`79`	`79`	`otherwise="unknown",`
`80`	`80`	`)`
`81`	`81`
	`82`	`+dataset.region = registered_patients.practice_nuts1_region_name`
	`83`	`+`
`82`	`84`
`83`	`85`	`# start of follow up variable`
`84`	`86`