From 766e01e49191cefa41d4e614f728d762a12ff7a6 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 30 Oct 2025 19:00:42 +0000 Subject: [PATCH 01/15] update availability of consumables at level 1b to reflect the weighted average of availability at levels 1b and 2 --- ...rceFile_Consumables_availability_small.csv | 4 +- .../consumables_availability_estimation.py | 109 ++++++++++++++++++ 2 files changed, 111 insertions(+), 2 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index ed46fac7cb..481c80b383 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f945b15a98e571464b6931f0a3a071c1c90be93d8ba0bd9d1eca751caab34793 -size 55657974 +oid sha256:eb58d8cea48bcea0d515ac160415a8ac0c694472ef82d7d2eeae62c39bb48a2d +size 6269825 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index c19114402b..587c15e517 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -878,6 +878,115 @@ def interpolate_missing_with_mean(_ser): full_set_interpolated = full_set_interpolated.reset_index() #full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') +def update_level1b_availability( + 
full_set_interpolated: pd.DataFrame, + facilities_by_level: dict, + resourcefilepath: Path, + district_to_city_dict: dict, +) -> pd.DataFrame: + """ + Updates the availability of Level 1b facilities to be the weighted average + of availability at Level 1b and 2 facilities, since these levels are merged + together in simulations. + """ + # Load and prepare base weights (facility counts) + # --------------------------------------------------------------------- + weight = ( + pd.read_csv(resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Master_Facilities_List.csv') + [["District", "Facility_Level", "Facility_ID", "Facility_Count"]] + ) + + # Keep only Level 1b and 2 facilities + lvl1b2_weights = weight[weight["Facility_Level"].isin(["1b", "2"])].copy() + + # Replace city names with their parent districts (temporarily for grouping) + city_to_district_dict = {v: k for k, v in district_to_city_dict.items()} + lvl1b2_weights["District"] = lvl1b2_weights["District"].replace(city_to_district_dict) + + # Aggregate counts per (District, Level) + lvl1b2_weights = ( + lvl1b2_weights + .groupby(["District", "Facility_Level"], as_index=False)["Facility_Count"] + .sum() + ) + + # Compute total and proportional weights within each district + lvl1b2_weights["total_facilities"] = lvl1b2_weights.groupby("District")["Facility_Count"].transform("sum") + lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Count"] / lvl1b2_weights["total_facilities"] + + # Add back city districts (reverse mapping) + for source, destination in copy_source_to_destination.items(): + new_rows = lvl1b2_weights.loc[lvl1b2_weights.District == source].copy() + new_rows.District = destination + lvl1b2_weights = pd.concat([lvl1b2_weights, new_rows], axis=0, ignore_index=True) + + # Merge Facility_ID back + lvl1b2_weights = lvl1b2_weights.merge( + weight.loc[weight["Facility_Level"].isin(["1b", "2"]), ["District", "Facility_Level", "Facility_ID"]], + on=["District", "Facility_Level"], + how="left", 
+ validate="1:1" + ) + + # Subset Level 1b and 2 facilities and apply weights + # --------------------------------------------------------------------- + lvl1b2_ids = list(facilities_by_level.get("1b", [])) + list(facilities_by_level.get("2", [])) + full_set_interpolated_levels1b2 = full_set_interpolated[ + full_set_interpolated["Facility_ID"].isin(lvl1b2_ids) + ].copy() + + full_set_interpolated_levels1b2 = full_set_interpolated_levels1b2.merge( + lvl1b2_weights[["District", "Facility_Level", "Facility_ID", "weight"]], + on="Facility_ID", + how="left", + validate="m:1" + ) + + # Apply weighting + full_set_interpolated_levels1b2["available_prop"] *= full_set_interpolated_levels1b2["weight"] + + # Aggregate to district-month-item level + full_set_interpolated_levels1b2 = ( + full_set_interpolated_levels1b2 + .groupby(["District", "month", "item_code"], as_index=False)["available_prop"] + .sum() + ) + full_set_interpolated_levels1b2["Facility_Level"] = "1b" + + # Reattach Facility_IDs for level 1b + full_set_interpolated_levels1b2 = full_set_interpolated_levels1b2.merge( + lvl1b2_weights.query("Facility_Level == '1b'")[["District", "Facility_Level", "Facility_ID", "weight"]], + on=["District", "Facility_Level"], + how="left", + validate="m:1" + ) + + # Replace old level 1b facilities and recompute weighted availability + # --------------------------------------------------------------------- + # Drop old Level 1b facilities + full_set_interpolated = full_set_interpolated[ + ~full_set_interpolated["Facility_ID"].isin(facilities_by_level.get("1b", [])) + ] + + # Append new 1b facility data + full_set_interpolated = pd.concat( + [ + full_set_interpolated, + full_set_interpolated_levels1b2[["Facility_ID", "month", "item_code", "available_prop"]] + ], + axis=0, + ignore_index=True + ) + + return full_set_interpolated + +full_set_interpolated = update_level1b_availability( + full_set_interpolated=full_set_interpolated, + facilities_by_level=facilities_by_level, + 
resourcefilepath=resourcefilepath, + district_to_city_dict=copy_source_to_destination, +) + # --- Check that the exported file has the properties required of it by the model code. --- # check_format_of_consumables_file(df=full_set_interpolated, fac_ids=fac_ids) From 176b8b5254a7771fc2e824a73dc4041036a90fa4 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 30 Oct 2025 19:34:51 +0000 Subject: [PATCH 02/15] allow for different weights when calculating average --- .../consumables_availability_estimation.py | 62 +++++++++++++++---- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index 587c15e517..dc7a71091c 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -883,11 +883,18 @@ def update_level1b_availability( facilities_by_level: dict, resourcefilepath: Path, district_to_city_dict: dict, + weighting: str = "district_1b_to_2_ratio" ) -> pd.DataFrame: """ Updates the availability of Level 1b facilities to be the weighted average of availability at Level 1b and 2 facilities, since these levels are merged together in simulations. + + weighting : {'level2', 'national_1b_to_2_ratio', 'district_1b_to_2_ratio'}, default 'district_1b_to_2_ratio' + Weighting strategy: + - 'level2': Replace 1b availability entirely with level 2 values. + - 'national_1b_to_2_ratio': Apply a single national 1b:2 ratio to all districts. + - 'district_1b_to_2_ratio': (default) Use district-specific 1b:2 ratios. 
""" # Load and prepare base weights (facility counts) # --------------------------------------------------------------------- @@ -899,20 +906,48 @@ def update_level1b_availability( # Keep only Level 1b and 2 facilities lvl1b2_weights = weight[weight["Facility_Level"].isin(["1b", "2"])].copy() - # Replace city names with their parent districts (temporarily for grouping) - city_to_district_dict = {v: k for k, v in district_to_city_dict.items()} - lvl1b2_weights["District"] = lvl1b2_weights["District"].replace(city_to_district_dict) - - # Aggregate counts per (District, Level) - lvl1b2_weights = ( - lvl1b2_weights - .groupby(["District", "Facility_Level"], as_index=False)["Facility_Count"] - .sum() - ) + # Compute weights depending on strategy + # --------------------------------------------------------------------- + if weighting == "level2": + # Force all weight on level 2 + lvl1b2_weights = lvl1b2_weights[~lvl1b2_weights.District.str.contains("City")] + lvl1b2_weights["weight"] = (lvl1b2_weights["Facility_Level"] == "2").astype(float) + lvl1b2_weights = lvl1b2_weights.drop(columns = 'Facility_ID') + + elif weighting == "national_1b_to_2_ratio": + lvl1b2_weights = lvl1b2_weights[~lvl1b2_weights.District.str.contains("City")] + # National total counts + national_counts = ( + lvl1b2_weights.groupby("Facility_Level")["Facility_Count"].sum().to_dict() + ) + total_fac = national_counts.get("1b", 0) + national_counts.get("2", 0) + if total_fac == 0: + raise ValueError("No facilities found at levels 1b or 2.") + lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Level"].map( + {lvl: cnt / total_fac for lvl, cnt in national_counts.items()} + ) + lvl1b2_weights = lvl1b2_weights.drop(columns='Facility_ID') + + elif weighting == "district_1b_to_2_ratio": + # Replace city names with their parent districts (temporarily for grouping) + city_to_district_dict = {v: k for k, v in district_to_city_dict.items()} + lvl1b2_weights["District"] = 
lvl1b2_weights["District"].replace(city_to_district_dict) + + # District-level weighting (default) + lvl1b2_weights = ( + lvl1b2_weights + .groupby(["District", "Facility_Level"], as_index=False)["Facility_Count"] + .sum() + ) + + lvl1b2_weights["total_facilities"] = lvl1b2_weights.groupby("District")["Facility_Count"].transform("sum") + lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Count"] / lvl1b2_weights["total_facilities"] - # Compute total and proportional weights within each district - lvl1b2_weights["total_facilities"] = lvl1b2_weights.groupby("District")["Facility_Count"].transform("sum") - lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Count"] / lvl1b2_weights["total_facilities"] + else: + raise ValueError( + f"Invalid weighting '{weighting}'. Choose from " + "'level2', 'national_1b_to_2_ratio', or 'district_1b_to_2_ratio'." + ) # Add back city districts (reverse mapping) for source, destination in copy_source_to_destination.items(): @@ -985,6 +1020,7 @@ def update_level1b_availability( facilities_by_level=facilities_by_level, resourcefilepath=resourcefilepath, district_to_city_dict=copy_source_to_destination, + weighting = 'district_1b_to_2_ratio', ) # --- Check that the exported file has the properties required of it by the model code. 
--- # From 7415afebd55224eb15ec652abceb206238fe791d Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 30 Oct 2025 19:42:41 +0000 Subject: [PATCH 03/15] revert original resource file --- .../ResourceFile_Consumables_availability_small.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index 481c80b383..70127334c5 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb58d8cea48bcea0d515ac160415a8ac0c694472ef82d7d2eeae62c39bb48a2d -size 6269825 +oid sha256:b355a102f70912825c22eb41ab806fa16b52d30ebc930bf8bb115a28f3d3b1ee +size 6234141 From 2b5a1f94273d79689eb5469f1fa551b1fff2d69e Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Thu, 30 Oct 2025 19:45:25 +0000 Subject: [PATCH 04/15] add new resource files with different ways of handling level 1b availability --- ...e_Consumables_availability_small_district_1b_to_2_ratio.csv | 3 +++ .../ResourceFile_Consumables_availability_small_level2.csv | 3 +++ ...e_Consumables_availability_small_national_1b_to_2_ratio.csv | 3 +++ 3 files changed, 9 insertions(+) create mode 100644 resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv create mode 100644 resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv create mode 100644 resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv new 
file mode 100644 index 0000000000..481c80b383 --- /dev/null +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb58d8cea48bcea0d515ac160415a8ac0c694472ef82d7d2eeae62c39bb48a2d +size 6269825 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv new file mode 100644 index 0000000000..739883e6a3 --- /dev/null +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e957ee6a5fa548284d8d4bb2dc855b8ee26311f4b12739f56d28e0091402c21f +size 5956337 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv new file mode 100644 index 0000000000..b6ee5b35ea --- /dev/null +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc707785d81234361edad9a8a0ccac75ab1801af06b939b0bddcc828089bedf2 +size 6505026 From 88abd1ca10ce7dc0836417ae18b9cd88a98769b4 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 31 Oct 2025 12:49:51 +0000 Subject: [PATCH 05/15] merge in xpert update from master --- ...rceFile_Consumables_availability_small.csv | 4 +-- ...ilability_small_district_1b_to_2_ratio.csv | 4 +-- ..._Consumables_availability_small_level2.csv | 4 +-- ...ilability_small_national_1b_to_2_ratio.csv | 4 +-- .../consumables_availability_estimation.py | 28 ++++++++++++++++++- 5 files changed, 35 insertions(+), 9 deletions(-) diff --git 
a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index 70127334c5..a8cdcde0fe 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b355a102f70912825c22eb41ab806fa16b52d30ebc930bf8bb115a28f3d3b1ee -size 6234141 +oid sha256:3884d44c70e83abf753ae7bfe43e208e5e5c8c34704f7136d6bf9cb2642eddb0 +size 6245740 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv index 481c80b383..093e694227 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb58d8cea48bcea0d515ac160415a8ac0c694472ef82d7d2eeae62c39bb48a2d -size 6269825 +oid sha256:2dca9728a126e5cfdfc9536b5a6a25b49370efa1fcf69b9e123e81723a3e79d2 +size 6280722 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv index 739883e6a3..e85bcb4f1f 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e957ee6a5fa548284d8d4bb2dc855b8ee26311f4b12739f56d28e0091402c21f -size 5956337 +oid 
sha256:77860b34c463ed9b045d0e117a7fb9c5eb5ea95f4be6d669b390dee789351515 +size 5967523 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv index b6ee5b35ea..68ad830085 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc707785d81234361edad9a8a0ccac75ab1801af06b939b0bddcc828089bedf2 -size 6505026 +oid sha256:600197dda8d1a7c50661ac4e565e3aa97cc329eb557f6da82d858b2ea7a56274 +size 6514451 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index dc7a71091c..da53e66c2e 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -765,7 +765,33 @@ def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): mwanza_1b = sf.loc[(sf.district_std == 'Mwanza') & (sf.fac_type_tlo == '1a')].copy().assign(fac_type_tlo='1b') sf = pd.concat([sf, mwanza_1b], axis=0, ignore_index=True) -# 4) Copy all the results to create a level 0 with an availability equal to half that in the respective 1a +# 4) Update the availability Xpert (item_code = 187) +# First add rows for Xpert at level 1b by cloning rows for level 2 -> only if not already present +xpert_item = sf['item_code'].eq(187) +level_2 = 
sf['fac_type_tlo'].eq('2') +level_1b = sf['fac_type_tlo'].eq('1b') + +# Clone rows from level 2 +base = sf.loc[level_2 & xpert_item].copy() +new_rows = base.copy() +new_rows['fac_type_tlo'] = '1b' + +# Add rows to main availability dataframe and drop duplicates, if any +sf = pd.concat([sf, new_rows], ignore_index=True) +id_cols = [c for c in sf.columns if c != 'available_prop'] +dupe_mask = sf.duplicated(subset=id_cols, keep=False) +dupes = sf.loc[dupe_mask].sort_values(id_cols) +sf = sf.drop_duplicates(subset=id_cols, keep='first').reset_index(drop=True) + +# Compute the average availability Sep–Dec (months >= 9) for level 2, item 187 +sep_to_dec = sf['month'].ge(9) +new_xpert_availability = sf.loc[level_2 & xpert_item & sep_to_dec, 'available_prop'].mean() +# Assign new availability to relevant facility levels +levels_1b_2_or_3 = sf['fac_type_tlo'].isin(['1b', '2', '3']) +xpert_item = sf['item_code'].eq(187) +sf.loc[levels_1b_2_or_3 & xpert_item, 'available_prop'] = new_xpert_availability + +# 5) Copy all the results to create a level 0 with an availability equal to half that in the respective 1a all_1a = sf.loc[sf.fac_type_tlo == '1a'] all_0 = sf.loc[sf.fac_type_tlo == '1a'].copy().assign(fac_type_tlo='0') all_0.available_prop *= 0.5 From fce280e45a2834c2767fdc65927672bd445f34e3 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 2 Jan 2026 14:38:36 +0000 Subject: [PATCH 06/15] apply weighted average of level 1b and 2 facilities to level 1b probability across alternative consumable scenarios - set up so that `consumable_availability_estimation.py` produces the final RF using functions from `generate_consumable_availability_scenarios_for_impact_analysis.py` --- ...rceFile_Consumables_availability_small.csv | 4 +- .../consumables_availability_estimation.py | 898 +++++---- ...ilability_scenarios_for_impact_analysis.py | 1651 +++++++---------- 3 files changed, 1223 insertions(+), 1330 deletions(-) diff --git 
a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index a8cdcde0fe..2cf60fec13 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3884d44c70e83abf753ae7bfe43e208e5e5c8c34704f7136d6bf9cb2642eddb0 -size 6245740 +oid sha256:2ab824924a4c0e798eeec6c2514c50577ba83185120ec257d5065c583fe1dfa6 +size 57341233 diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index da53e66c2e..867733d62f 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -1,13 +1,21 @@ """ This script generates estimates of availability of consumables used by disease modules: +OUTPUTS: * ResourceFile_Consumables_availability_small.csv (estimate of consumable available - file for use in the simulation). * ResourceFile_Consumables_Inflow_Outflow_Ratio.csv (a file that gives the ratio of inflow of consumables to outflow to * capture the extent of wastage as a proportion of use for each consumable by month, district and level. - -N.B. The file uses `ResourceFile_Consumables_matched.csv` as an input. +INPUTS: +* `ResourceFile_Consumables_matched.csv` - matches consumable names in OpenLMIS to those in TLO model +* `ResourceFile_LMIS_2018.csv` - consumable availability in OpenLMIS 2018. 
Data from OpenLMIS includes closing balance, +quantity received, quantity dispensed, and average monthly consumption for each month by facility. +* `ResourceFile_hhfa_consumables.xlsx` - provides consumable availability from other sources, mainly Harmonised Health + Facility Assessment 2018-19 (to fill gaps in Open LMIS data +* `ResourceFile_Consumables_Item_Designations.csv` to categorise consumables into disease/public health programs +* `ResourceFile_Master_Facilities_List.csv` - to obtain the Facility_Level associated with each Facility_ID +* `ResourceFile_Population_2010.csv` - to get the list of districts It creates one row for each consumable for availability at a specific facility and month when the data is extracted from the OpenLMIS dataset and one row for each consumable for availability aggregated across all facilities when the data is @@ -15,9 +23,15 @@ Consumable availability is measured as probability of stockout at any point in time. -Data from OpenLMIS includes closing balance, quantity received, quantity dispensed, and average monthly consumption -for each month by facility. - +Steps: +1. Prepare OpenLMIS data (A. Import, B. Reshape, C. Interpolate, D. Summarise by month and facility) +2. Match with TLO Model consumable names +3. Add data from other sources where OpenLMIS data is missing +4. Interpolate missing data +5. Add alternative availability scenarios +6. Check format and save as resourcefile +7. Produce validation plots +8. 
Plot summary of availability across scenarios """ import calendar @@ -29,8 +43,10 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from typing import Optional, List from tlo.methods.consumables import check_format_of_consumables_file +from scripts.data_file_processing.healthsystem.consumables.generating_consumable_scenarios import generate_alternative_availability_scenarios, generate_descriptive_consumable_availability_plots # Set local shared folder source path_to_share = Path( # <-- point to the shared folder @@ -51,8 +67,8 @@ resourcefilepath = Path("./resources") path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" - # Define necessary functions +# Functions to clean LMIS data def change_colnames(df, NameChangeList): # Change column names ColNames = df.columns ColNames2 = ColNames @@ -62,12 +78,417 @@ def change_colnames(df, NameChangeList): # Change column names df.columns = ColNames2 return df +def rename_items_to_address_inconsistentencies(_df, item_dict): + """Return a dataframe with rows for the same item with inconsistent names collapsed into one""" + # Recode item names appearing from Jan to Aug to the new names adopted from September onwards + old_unique_item_count = _df.item.nunique() + for item in item_dict: + print(len(_df[_df.item == item_dict[item]]), ''' instances of "''', item_dict[item], '''"''' + ''' changed to "''', item, + '''"''') + # row_newname = _df.item == item + row_oldname = _df.item == item_dict[item] + _df.loc[row_oldname, 'item'] = item -# %% -# 1. 
DATA IMPORT AND CLEANING ## -######################################################################################### + # Make a list of column names to be collapsed using different methods + columns_to_sum = [col for col in _df.columns if + col[0].startswith(('amc', 'closing_bal', 'dispensed', 'received', 'stkout_days'))] + columns_to_preserve = [col for col in _df.columns if + col[0].startswith(('data_source'))] + + # Define aggregation function to be applied to collapse data by item + def custom_agg(x): + if x.name in columns_to_sum: + return x.sum(skipna=True) if np.any( + x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained + # , i.e. not changed to 0, when the corresponding data for both item name variations are NaN, and when there + # is a 0 or positive value for one or both item name variation, the sum is taken. + elif x.name in columns_to_preserve: + return x.str.cat( + sep='') # for the data_source column, this function concatenates the string values + + # Collapse dataframe + _collapsed_df = _df.groupby(['program', 'item', 'district', 'fac_type_tlo', 'fac_name']).agg( + {col: custom_agg for col in columns_to_preserve + columns_to_sum} + ).reset_index() + + # Test that all items in the dictionary have been found in the dataframe + new_unique_item_count = _collapsed_df.item.nunique() + assert len(item_dict) == old_unique_item_count - new_unique_item_count + return _collapsed_df + +def replace_old_item_names_in_lmis_data(_df, item_dict): + """Return a dataframe with old LMIS consumable names replaced with the new name""" + for item in item_dict: + cond_oldname = _df.item == item_dict[item] + _df.loc[cond_oldname, 'item'] = item + return _df + +def recategorize_modules_into_consumable_categories(_df): + _df['item_category'] = _df['module_name'].str.lower() + cond_RH = (_df['item_category'].str.contains('care_of_women_during_pregnancy')) | \ + (_df['item_category'].str.contains('labour')) + cond_newborn = 
(_df['item_category'].str.contains('newborn')) + cond_newborn[cond_newborn.isna()] = False + cond_childhood = (_df['item_category'] == 'acute lower respiratory infections') | \ + (_df['item_category'] == 'measles') | \ + (_df['item_category'] == 'diarrhoea') + cond_rti = _df['item_category'] == 'road traffic injuries' + cond_cancer = _df['item_category'].str.contains('cancer') + cond_cancer[cond_cancer.isna()] = False + cond_ncds = (_df['item_category'] == 'epilepsy') | \ + (_df['item_category'] == 'depression') + _df.loc[cond_RH, 'item_category'] = 'reproductive_health' + _df.loc[cond_cancer, 'item_category'] = 'cancer' + _df.loc[cond_newborn, 'item_category'] = 'neonatal_health' + _df.loc[cond_childhood, 'item_category'] = 'other_childhood_illnesses' + _df.loc[cond_rti, 'item_category'] = 'road_traffic_injuries' + _df.loc[cond_ncds, 'item_category'] = 'ncds' + cond_condom = _df['item_code'] == 2 + _df.loc[cond_condom, 'item_category'] = 'contraception' + + # Create a general consumables category + general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, + 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960] + cond_general = _df['item_code'].isin(general_cons_list) + _df.loc[cond_general, 'item_category'] = 'general' + + # Fill gaps in categories + dict_for_missing_categories = {292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections', + 307: 'reproductive_health', 2019: 'reproductive_health', + 2678: 'tb', 1171: 'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer'} + # Use map to create a new series from item_code to fill missing values in category + mapped_categories = _df['item_code'].map(dict_for_missing_categories) + # Use fillna on the 'item_category' column to fill missing values using the mapped_categories + _df['item_category'] = _df['item_category'].fillna(mapped_categories) + + return _df + +# Function to extract inflow to outflow ratio 
def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df):
    """Return the annual inflow:outflow ratio per (item_category, item_code, district, fac_type_tlo).

    Inflow  = opening balance in January + everything received during the year
              - stock carried into the next year (December closing balance).
    Outflow = everything dispensed during the year.
    Ratios below 1 are clipped to 1, since outflow cannot physically exceed inflow.
    """
    df_by_item_level_month = \
        _df.groupby(['item_category', 'item_code', 'district', 'fac_type_tlo', 'month'])[
            ['closing_bal', 'dispensed', 'received']].sum()
    # Drop the 'Aggregate' pseudo-month so annual sums are not double-counted
    df_by_item_level_month = df_by_item_level_month.loc[
        df_by_item_level_month.index.get_level_values('month') != "Aggregate"]
    months = df_by_item_level_month.index.get_level_values('month')

    # Opening balance in January is the January closing balance minus what was
    # received during the month plus what was dispensed
    opening_bal_january = df_by_item_level_month.loc[months == 'January', 'closing_bal'] + \
                          df_by_item_level_month.loc[months == 'January', 'dispensed'] - \
                          df_by_item_level_month.loc[months == 'January', 'received']
    closing_bal_december = df_by_item_level_month.loc[months == 'December', 'closing_bal']

    annual_grouping = ['item_category', 'item_code', 'district', 'fac_type_tlo']
    # Inflow during the year = January opening balance + receipts - stock transferred
    # to the next year (i.e. the December closing balance)
    total_consumables_inflow_during_the_year = \
        df_by_item_level_month['received'].groupby(level=annual_grouping).sum() + \
        opening_bal_january.reset_index(level='month', drop=True) - \
        closing_bal_december.reset_index(level='month', drop=True)
    total_consumables_outflow_during_the_year = \
        df_by_item_level_month['dispensed'].groupby(level=annual_grouping).sum()

    # fill_value=1 so that a missing outflow does not produce NaN/inf ratios
    inflow_to_outflow_ratio = total_consumables_inflow_during_the_year.div(
        total_consumables_outflow_during_the_year, fill_value=1)
    inflow_to_outflow_ratio.loc[inflow_to_outflow_ratio < 1] = 1  # Ratio can't be less than 1
    return inflow_to_outflow_ratio


def update_level1b_availability(
    availability_df: pd.DataFrame,
    facilities_by_level: dict,
    resourcefilepath: Path,
    district_to_city_dict: dict,
    weighting: str = "district_1b_to_2_ratio",
) -> pd.DataFrame:
    """
    Updates the availability of Level 1b facilities to be the weighted average
    of availability at Level 1b and 2 facilities, since these levels are merged
    together in simulations.

    Parameters
    ----------
    availability_df : dataframe with one row per (Facility_ID, month, item_code)
        and one or more 'available_prop*' columns.
    facilities_by_level : dict mapping facility level (e.g. '1b', '2') to the set
        of Facility_IDs at that level.
    resourcefilepath : root of the resource-file tree (used to load the Master
        Facilities List for facility counts).
    district_to_city_dict : mapping of parent district -> city district
        (e.g. 'Mzimba' -> 'Mzuzu City'); used both to pool city facilities with
        their parent district and to copy the pooled result back to the city.
    weighting : {'level2', 'national_1b_to_2_ratio', 'district_1b_to_2_ratio'}, default 'district_1b_to_2_ratio'
        Weighting strategy:
        - 'level2': Replace 1b availability entirely with level 2 values.
        - 'national_1b_to_2_ratio': Apply a single national 1b:2 ratio to all districts.
        - 'district_1b_to_2_ratio': (default) Use district-specific 1b:2 ratios.

    Returns
    -------
    pd.DataFrame with the level-1b rows replaced by the weighted 1b/2 average.
    """
    # Load and prepare base weights (facility counts)
    # ---------------------------------------------------------------------
    weight = (
        pd.read_csv(resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Master_Facilities_List.csv')
        [["District", "Facility_Level", "Facility_ID", "Facility_Count"]]
    )

    # Keep only Level 1b and 2 facilities
    lvl1b2_weights = weight[weight["Facility_Level"].isin(["1b", "2"])].copy()

    # Compute weights depending on strategy
    # ---------------------------------------------------------------------
    if weighting == "level2":
        # Force all weight on level 2
        lvl1b2_weights = lvl1b2_weights[~lvl1b2_weights.District.str.contains("City")]
        lvl1b2_weights["weight"] = (lvl1b2_weights["Facility_Level"] == "2").astype(float)
        lvl1b2_weights = lvl1b2_weights.drop(columns='Facility_ID')

    elif weighting == "national_1b_to_2_ratio":
        lvl1b2_weights = lvl1b2_weights[~lvl1b2_weights.District.str.contains("City")]
        # National total counts per level
        national_counts = (
            lvl1b2_weights.groupby("Facility_Level")["Facility_Count"].sum().to_dict()
        )
        total_fac = national_counts.get("1b", 0) + national_counts.get("2", 0)
        if total_fac == 0:
            raise ValueError("No facilities found at levels 1b or 2.")
        lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Level"].map(
            {lvl: cnt / total_fac for lvl, cnt in national_counts.items()}
        )
        lvl1b2_weights = lvl1b2_weights.drop(columns='Facility_ID')

    elif weighting == "district_1b_to_2_ratio":
        # Replace city names with their parent districts (temporarily for grouping)
        city_to_district_dict = {v: k for k, v in district_to_city_dict.items()}
        lvl1b2_weights["District"] = lvl1b2_weights["District"].replace(city_to_district_dict)

        # District-level weighting (default): aggregate counts per (District, Level)
        lvl1b2_weights = (
            lvl1b2_weights
            .groupby(["District", "Facility_Level"], as_index=False)["Facility_Count"]
            .sum()
        )
        lvl1b2_weights["total_facilities"] = \
            lvl1b2_weights.groupby("District")["Facility_Count"].transform("sum")
        lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Count"] / lvl1b2_weights["total_facilities"]

    else:
        raise ValueError(
            f"Invalid weighting '{weighting}'. Choose from "
            "'level2', 'national_1b_to_2_ratio', or 'district_1b_to_2_ratio'."
        )

    # Add back city districts (reverse mapping).
    # FIX: iterate the district_to_city_dict parameter instead of the
    # module-level `copy_source_to_destination` global (same district->city
    # mapping) so the function does not depend on hidden module state.
    for source, destination in district_to_city_dict.items():
        new_rows = lvl1b2_weights.loc[lvl1b2_weights.District == source].copy()
        new_rows.District = destination
        lvl1b2_weights = pd.concat([lvl1b2_weights, new_rows], axis=0, ignore_index=True)

    # Merge Facility_ID back; validate='1:1' assumes exactly one Facility_ID
    # per (District, Facility_Level) in the MFL — TODO confirm this holds
    lvl1b2_weights = lvl1b2_weights.merge(
        weight.loc[weight["Facility_Level"].isin(["1b", "2"]), ["District", "Facility_Level", "Facility_ID"]],
        on=["District", "Facility_Level"],
        how="left",
        validate="1:1"
    )

    # Subset Level 1b and 2 facilities and apply weights
    # ---------------------------------------------------------------------
    lvl1b2_ids = list(facilities_by_level.get("1b", [])) + list(facilities_by_level.get("2", []))
    availability_levels1b2 = availability_df[
        availability_df["Facility_ID"].isin(lvl1b2_ids)
    ].copy()

    availability_levels1b2 = availability_levels1b2.merge(
        lvl1b2_weights[["District", "Facility_Level", "Facility_ID", "weight"]],
        on="Facility_ID",
        how="left",
        validate="m:1"
    )

    # Apply weighting to every availability column at once
    available_cols = [c for c in availability_levels1b2.columns if c.startswith("available_prop")]
    availability_levels1b2[available_cols] *= availability_levels1b2["weight"].values[:, None]

    # Aggregate to district-month-item level (weights sum to 1 within district,
    # so the sum is the weighted average)
    availability_levels1b2 = (
        availability_levels1b2
        .groupby(["District", "month", "item_code"], as_index=False)[available_cols]
        .sum()
    )

    # The pooled values are assigned to level 1b
    availability_levels1b2["Facility_Level"] = "1b"

    # Reattach level-1b Facility_IDs (the 'weight' column is no longer needed here)
    full_set_interpolated_levels1b2 = availability_levels1b2.merge(
        lvl1b2_weights.query("Facility_Level == '1b'")[["District", "Facility_Level", "Facility_ID"]],
        on=["District", "Facility_Level"],
        how="left",
        validate="m:1"
    )

    # Replace old level 1b facilities with the recomputed weighted availability
    # ---------------------------------------------------------------------
    availability_df = availability_df[
        ~availability_df["Facility_ID"].isin(facilities_by_level.get("1b", []))
    ]
    availability_df = pd.concat(
        [
            availability_df,
            full_set_interpolated_levels1b2[["Facility_ID", "month", "item_code", *available_cols]]
        ],
        axis=0,
        ignore_index=True
    )

    return availability_df


# Function to compute average availability by a chosen grouping variable
def compute_avg_availability_by_var(df: pd.DataFrame = None,  # TLO availability dataframe with each row representing one Facility_ID, item, month
                                    mfl: Optional[pd.DataFrame] = None,  # Master Facility list mapping Facility_Level to Facility_ID
                                    program_item_mapping: Optional[pd.DataFrame] = None,
                                    groupby_var: str = 'month',
                                    available_cols: Optional[List[str]] = None,  # List of availability columns to summarise
                                    label: str = "Average"):
    """Return a long-format dataframe of mean availability per `groupby_var`
    and per availability column ('Scenario'), tagged with `label` in 'Dataset'.

    FIX: `available_cols` default changed from a mutable list literal to None
    (resolved to ['available_prop'] inside) — same behaviour, no shared state.
    """
    if available_cols is None:
        available_cols = ['available_prop']
    if groupby_var == 'Facility_Level':
        # Merge Facility_Level in from the MFL
        df = (df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], on=['Facility_ID'], how='left'))
    if groupby_var == 'item_category':
        # Merge programme (item_category) in from the item mapping
        program_item_mapping = program_item_mapping.rename(columns={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()]
        df = df.merge(program_item_mapping, on=['item_code'], how='left')

    out = (
        df
        .groupby(groupby_var)[available_cols]
        .mean()
        .reset_index()
        .melt(
            id_vars=groupby_var,
            value_vars=available_cols,
            var_name="Scenario",
            value_name="Average_Availability"
        )
    )
    out["Dataset"] = label
    return out


def plot_availability_before_and_after_level1b_fix(old_df: pd.DataFrame = None,
                                                   new_df: pd.DataFrame = None,
                                                   mfl: pd.DataFrame = None,
                                                   available_cols: Optional[List[str]] = None,  # List of availability columns to summarise
                                                   save_figure_as: Path = None):
    """Plot grouped bar charts of average availability by scenario, one panel
    per facility level, comparing the Original and Updated dataframes.

    FIX: mutable-list default removed; figure is only saved when
    `save_figure_as` is provided (previously `plt.savefig(None)` would raise).
    """
    if available_cols is None:
        available_cols = ['available_prop']
    avg_old = compute_avg_availability_by_var(df=old_df,
                                              mfl=mfl,
                                              groupby_var='Facility_Level',
                                              available_cols=available_cols,
                                              label="Original")
    avg_new = compute_avg_availability_by_var(df=new_df,
                                              mfl=mfl,
                                              groupby_var='Facility_Level',
                                              available_cols=available_cols,
                                              label="Updated")

    plot_df = pd.concat([avg_old, avg_new], ignore_index=True)
    facility_levels = plot_df["Facility_Level"].unique()
    scenarios = plot_df["Scenario"].unique()

    x = np.arange(len(scenarios))
    width = 0.35

    fig, axes = plt.subplots(
        nrows=len(facility_levels),
        figsize=(14, 5 * len(facility_levels)),
        sharey=True
    )
    if len(facility_levels) == 1:
        axes = [axes]  # make iterable when there is a single panel

    for ax, fl in zip(axes, facility_levels):
        sub = plot_df[plot_df["Facility_Level"] == fl]
        # Align both datasets on the same scenario order
        orig = sub[sub["Dataset"] == "Original"].set_index("Scenario").loc[scenarios]
        new = sub[sub["Dataset"] == "Updated"].set_index("Scenario").loc[scenarios]

        ax.bar(x - width / 2, orig["Average_Availability"], width, label="Original")
        ax.bar(x + width / 2, new["Average_Availability"], width, label="Updated")

        ax.set_title(f"Average Availability by Scenario – Facility Level {fl}")
        ax.set_xticks(x)
        ax.set_xticklabels(scenarios, rotation=45, ha="right")
        ax.set_ylabel("Average Availability")
        ax.legend()

    plt.tight_layout()
    if save_figure_as is not None:
        plt.savefig(save_figure_as)


def collapse_stockout_data(_df, groupby_list, var):
    """Return a dataframe with rows for the same TLO model item code collapsed into 1.

    `var` (the stockout/availability proportion) is combined by product
    (OR-of-stockouts across substitutes); balances are summed; metadata columns
    keep their first value. NaNs are retained when a whole group is NaN.
    """
    # Define column lists based on the aggregation function to be applied
    columns_to_multiply = [var]
    columns_to_sum = ['closing_bal', 'amc', 'dispensed', 'received']
    columns_to_preserve = ['data_source', 'consumable_reporting_freq', 'consumables_reported_in_mth']

    # Define aggregation function to be applied to collapse data by item
    def custom_agg_stkout(x):
        if x.name in columns_to_multiply:
            return x.prod(skipna=True) if np.any(
                x.notnull() & (x >= 0)) else np.nan  # this ensures that the NaNs are retained
        elif x.name in columns_to_sum:
            return x.sum(skipna=True) if np.any(
                x.notnull() & (x >= 0)) else np.nan  # this ensures that the NaNs are retained
            # , i.e. not changed to 1, when the corresponding data for both item name variations are NaN, and when there
            # is a 0 or positive value for one or both item name variation, the sum is taken.
        elif x.name in columns_to_preserve:
            return x.iloc[0]  # this function extracts the first value

    # Collapse dataframe
    _collapsed_df = _df.groupby(groupby_list).agg(
        {col: custom_agg_stkout for col in columns_to_multiply + columns_to_sum + columns_to_preserve}
    ).reset_index()

    return _collapsed_df


# Functions for interpolation
def get_other_facilities_of_same_level(_fac_id):
    """Return a set of facility_id for other facilities that are of the same level as that provided.

    NOTE(review): relies on the module-level `facilities_by_level` dict being
    populated before this is called.
    """
    for v in facilities_by_level.values():
        if _fac_id in v:
            return v - {_fac_id}


def interpolate_missing_with_mean(_ser):
    """Return a series in which any values that are null are replaced with the mean of the non-missing.

    Raises ValueError when every value is null (no mean to impute from).
    """
    if pd.isnull(_ser).all():
        raise ValueError
    return _ser.fillna(_ser.mean())


# Function to draw calibration plots at different levels of disaggregation (comparing final TLO data to HHFA)
def comparison_plot(level_of_disaggregation, group_by_var, colour):
    """Scatter TLO availability vs HHFA availability, aggregated by `group_by_var`.

    NOTE(review): uses the module-level `comparison_df`, `size` and
    `outputfilepath` globals; figure is written under comparison_plots/.
    """
    comparison_df_agg = comparison_df.groupby([group_by_var],
                                              as_index=False).agg({'available_prop': 'mean',
                                                                   'available_prop_hhfa': 'mean',
                                                                   'Facility_Level': 'first',
                                                                   'consumable_labels': 'first'})
    comparison_df_agg['labels'] = comparison_df_agg[level_of_disaggregation]

    ax = comparison_df_agg.plot.scatter('available_prop', 'available_prop_hhfa', c=colour)
    ax.axline([0, 0], [1, 1])  # 45-degree line of perfect calibration
    for i, label in enumerate(comparison_df_agg['labels']):
        plt.annotate(label,
                     (comparison_df_agg['available_prop'][i] + 0.005,
                      comparison_df_agg['available_prop_hhfa'][i] + 0.005),
                     fontsize=6, rotation=38)
    if level_of_disaggregation != 'aggregate':
        plt.title('Disaggregated by ' + level_of_disaggregation, fontsize=size, weight="bold")
    else:
        plt.title('Aggregate', fontsize=size, weight="bold")
    plt.xlabel('Pr(drug available) as per TLO model')
    plt.ylabel('Pr(drug available) as per HHFA')
    save_name = 'comparison_plots/calibration_to_hhfa_' + level_of_disaggregation + '.png'
    plt.savefig(outputfilepath / save_name)


def comparison_plot_by_level(fac_type):
    """Scatter TLO vs HHFA availability for a single facility level.

    NOTE(review): uses the module-level `comparison_df` and `size` globals.
    """
    cond_fac_type = comparison_df['Facility_Level'] == fac_type
    comparison_df_by_level = comparison_df[cond_fac_type].reset_index()
    plt.scatter(comparison_df_by_level['available_prop'],
                comparison_df_by_level['available_prop_hhfa'])
    plt.axline([0, 0], [1, 1])  # 45-degree line of perfect calibration
    for i, label in enumerate(comparison_df_by_level['consumable_labels']):
        plt.annotate(label, (comparison_df_by_level['available_prop'][i] + 0.005,
                             comparison_df_by_level['available_prop_hhfa'][i] + 0.005),
                     fontsize=6, rotation=27)
    plt.title(fac_type, fontsize=size, weight="bold")
    plt.xlabel('Pr(drug available) as per TLO model')
    plt.ylabel('Pr(drug available) as per HHFA')
RESHAPE AND REORDER ## -######################################################################################### - +# 1B. RESHAPE AND REORDER # Reshape dataframe so that each row represent a unique consumable and facility lmis_df_wide = lmis_df.pivot_table(index=['district', 'fac_type_tlo', 'fac_name', 'program', 'item'], columns='month', values=['closing_bal', 'dispensed', 'received', 'stkout_days', 'amc'], @@ -160,8 +579,7 @@ def change_colnames(df, NameChangeList): # Change column names num = lmis_df_wide._get_numeric_data() lmis_df_wide[num < 0] = np.nan -# 3. INTERPOLATE MISSING VALUES ## -######################################################################################### +# 1C. INTERPOLATE MISSING VALUES ## # When stkout_days is empty but closing balance, dispensed and received entries are available lmis_df_wide_flat = lmis_df_wide.reset_index() count_stkout_entries = lmis_df_wide_flat['stkout_days'].count(axis=1).sum() @@ -237,45 +655,6 @@ def change_colnames(df, NameChangeList): # Change column names # TODO check whether there is any issue with the above items_introduced_in_september which only show up from September # onwards -def rename_items_to_address_inconsistentencies(_df, item_dict): - """Return a dataframe with rows for the same item with inconsistent names collapsed into one""" - # Recode item names appearing from Jan to Aug to the new names adopted from September onwards - old_unique_item_count = _df.item.nunique() - for item in item_dict: - print(len(_df[_df.item == item_dict[item]]), ''' instances of "''', item_dict[item], '''"''' - ''' changed to "''', item, - '''"''') - # row_newname = _df.item == item - row_oldname = _df.item == item_dict[item] - _df.loc[row_oldname, 'item'] = item - - # Make a list of column names to be collapsed using different methods - columns_to_sum = [col for col in _df.columns if - col[0].startswith(('amc', 'closing_bal', 'dispensed', 'received', 'stkout_days'))] - columns_to_preserve = [col for col in 
_df.columns if - col[0].startswith(('data_source'))] - - # Define aggregation function to be applied to collapse data by item - def custom_agg(x): - if x.name in columns_to_sum: - return x.sum(skipna=True) if np.any( - x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained - # , i.e. not changed to 0, when the corresponding data for both item name variations are NaN, and when there - # is a 0 or positive value for one or both item name variation, the sum is taken. - elif x.name in columns_to_preserve: - return x.str.cat( - sep='') # for the data_source column, this function concatenates the string values - - # Collapse dataframe - _collapsed_df = _df.groupby(['program', 'item', 'district', 'fac_type_tlo', 'fac_name']).agg( - {col: custom_agg for col in columns_to_preserve + columns_to_sum} - ).reset_index() - - # Test that all items in the dictionary have been found in the dataframe - new_unique_item_count = _collapsed_df.item.nunique() - assert len(item_dict) == old_unique_item_count - new_unique_item_count - return _collapsed_df - # Hold out the dataframe with no naming inconsistencies list_of_items_with_inconsistent_names_zipped = set(zip(inconsistent_item_names_mapping.keys(), inconsistent_item_names_mapping.values())) list_of_items_with_inconsistent_names = [item for sublist in list_of_items_with_inconsistent_names_zipped for item in sublist] @@ -288,7 +667,7 @@ def custom_agg(x): lmis_df_wide_flat = pd.concat([df_without_consistent_item_names_corrected, df_with_consistent_item_names], ignore_index=True) -# --- 3.1 RULE: 1.If i) stockout is missing, ii) closing_bal, amc and received are not missing , and iii) amc !=0 and, +# 1. 
--- RULE: 1.If i) stockout is missing, ii) closing_bal, amc and received are not missing , and iii) amc !=0 and, # then stkout_days[m] = (amc[m] - closing_bal[m-1] - received)/amc * number of days in the month --- # (Note that the number of entries for closing balance, dispensed and received is always the same) for m in range(2, 13): @@ -324,7 +703,7 @@ def custom_agg(x): count_stkout_entries = lmis_df_wide_flat['stkout_days'].count(axis=1).sum() print(count_stkout_entries, "stockout entries after first interpolation") -# 3.2 --- If any stockout_days < 0 after the above interpolation, update to stockout_days = 0 --- +# 2. --- If any stockout_days < 0 after the above interpolation, update to stockout_days = 0 --- # RULE: If closing balance[previous month] - dispensed[this month] + received[this month] > 0, stockout == 0 for m in range(1, 13): cond1 = lmis_df_wide_flat['stkout_days', months_dict[m]] < 0 @@ -342,7 +721,7 @@ def custom_agg(x): # Flatten multilevel columns lmis_df_wide_flat.columns = [' '.join(col).strip() for col in lmis_df_wide_flat.columns.values] -# 3.3 --- If the consumable was previously reported and during a given month, if any consumable was reported, assume +# 3. --- If the consumable was previously reported and during a given month, if any consumable was reported, assume # 100% days of stckout --- # RULE: If the balance on a consumable is ever reported and if any consumables are reported during the month, stkout_ # days = number of days of the month @@ -370,9 +749,7 @@ def custom_agg(x): count_stkout_entries = count_stkout_entries + lmis_df_wide_flat['stkout_days ' + months_dict[m]].count().sum() print(count_stkout_entries, "stockout entries after third interpolation") -# 4. CALCULATE STOCK OUT RATES BY MONTH and FACILITY ## -######################################################################################### - +# 1D. 
CALCULATE STOCK OUT RATES BY MONTH and FACILITY lmis = lmis_df_wide_flat # choose dataframe # Generate variables denoting the stockout proportion in each month @@ -391,10 +768,9 @@ def custom_agg(x): sep=' ', suffix=r'\w+') lmis = lmis.reset_index() -# 5. LOAD CLEANED MATCHED CONSUMABLE LIST FROM TLO MODEL AND MERGE WITH LMIS DATA ## -######################################################################################### - -# 5.1 --- Load and clean data --- +# 2. LOAD CLEANED MATCHED CONSUMABLE LIST FROM TLO MODEL AND MERGE WITH LMIS DATA +######################################################################################################################## +# 1. --- Load and clean data --- # Import matched list of consumanbles consumables_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', low_memory=False, encoding="ISO-8859-1") @@ -408,53 +784,16 @@ def custom_agg(x): # Rename columns NameChangeList = [('consumable_name_lmis', 'item'), ] -change_colnames(consumables_df, NameChangeList) -change_colnames(matched_consumables, NameChangeList) - - -# Update matched consumable name where the name in the OpenLMIS data was updated in September -def replace_old_item_names_in_lmis_data(_df, item_dict): - """Return a dataframe with old LMIS consumable names replaced with the new name""" - for item in item_dict: - cond_oldname = _df.item == item_dict[item] - _df.loc[cond_oldname, 'item'] = item - return _df - - -matched_consumables = replace_old_item_names_in_lmis_data(matched_consumables, inconsistent_item_names_mapping) - -# 5.2 --- Merge data with LMIS data --- -lmis_matched_df = pd.merge(lmis, matched_consumables, how='inner', on=['item']) -lmis_matched_df = lmis_matched_df.sort_values('data_source') - - -def collapse_stockout_data(_df, groupby_list, var): - """Return a dataframe with rows for the same TLO model item code collapsed into 1""" - # Define column lists based on the aggregation function to be applied - columns_to_multiply 
= [var] - columns_to_sum = ['closing_bal', 'amc', 'dispensed', 'received'] - columns_to_preserve = ['data_source', 'consumable_reporting_freq', 'consumables_reported_in_mth'] - - # Define aggregation function to be applied to collapse data by item - def custom_agg_stkout(x): - if x.name in columns_to_multiply: - return x.prod(skipna=True) if np.any( - x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained - elif x.name in columns_to_sum: - return x.sum(skipna=True) if np.any( - x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained - # , i.e. not changed to 1, when the corresponding data for both item name variations are NaN, and when there - # is a 0 or positive value for one or both item name variation, the sum is taken. - elif x.name in columns_to_preserve: - return x.iloc[0] # this function extracts the first value +change_colnames(consumables_df, NameChangeList) +change_colnames(matched_consumables, NameChangeList) - # Collapse dataframe - _collapsed_df = _df.groupby(groupby_list).agg( - {col: custom_agg_stkout for col in columns_to_multiply + columns_to_sum + columns_to_preserve} - ).reset_index() - return _collapsed_df +# Update matched consumable name where the name in the OpenLMIS data was updated in September +matched_consumables = replace_old_item_names_in_lmis_data(matched_consumables, inconsistent_item_names_mapping) +# 2. --- Merge data with LMIS data --- +lmis_matched_df = pd.merge(lmis, matched_consumables, how='inner', on=['item']) +lmis_matched_df = lmis_matched_df.sort_values('data_source') # 2.i. For substitable drugs (within drug category), collapse by taking the product of stkout_prop (OR condition) # This represents Pr(all substitutes with the item code are stocked out) @@ -503,10 +842,9 @@ def custom_agg_stkout(x): 'available_prop', 'closing_bal', 'amc', 'dispensed', 'received', 'data_source', 'consumable_reporting_freq', 'consumables_reported_in_mth']] -# 6. 
ADD STOCKOUT DATA FROM OTHER SOURCES TO COMPLETE STOCKOUT DATAFRAME ## -######################################################################################### - -# --- 6.1. Generate a dataframe of stock availability for consumables which were not found in the OpenLMIS data but +# 3. ADD STOCKOUT DATA FROM OTHER SOURCES TO COMPLETE STOCKOUT DATAFRAME +######################################################################################################################## +# --- 1. Generate a dataframe of stock availability for consumables which were not found in the OpenLMIS data but # available in the HHFA 2018/19 --- # # Save the list of items for which a match was not found in the OpenLMIS data unmatched_consumables = consumables_df.drop_duplicates(['item_code']) @@ -582,13 +920,13 @@ def custom_agg_stkout(x): ('available_prop_hhfa', 'available_prop')] change_colnames(unmatched_consumables_df, NameChangeList) -# --- 6.2 Append OpenLMIS stockout dataframe with HHFA stockout dataframe and Extract in .csv format --- # +# --- 2. Append OpenLMIS stockout dataframe with HHFA stockout dataframe and Extract in .csv format --- # # Append common consumables stockout dataframe with the main dataframe cond = unmatched_consumables_df['available_prop'].notna() unmatched_consumables_df.loc[~cond, 'data_source'] = 'Not available' stkout_df = pd.concat([stkout_df, unmatched_consumables_df], axis=0, ignore_index=True) -# --- 6.3 Append stockout rate for facility level 0 from HHFA --- # +# --- 3. 
Append stockout rate for facility level 0 from HHFA --- # cond = hhfa_df['item_code'].notna() hhfa_fac0 = hhfa_df[cond][ ['item_code', 'consumable_name_tlo', 'fac_count_Facility_level_0', 'available_prop_hhfa_Facility_level_0']] @@ -605,47 +943,7 @@ def custom_agg_stkout(x): stkout_df = stkout_df[~cond] stkout_df = pd.concat([stkout_df, hhfa_fac0], axis=0, ignore_index=True) -# --- 6.4 Generate new category variable for analysis --- # -def recategorize_modules_into_consumable_categories(_df): - _df['item_category'] = _df['module_name'].str.lower() - cond_RH = (_df['item_category'].str.contains('care_of_women_during_pregnancy')) | \ - (_df['item_category'].str.contains('labour')) - cond_newborn = (_df['item_category'].str.contains('newborn')) - cond_newborn[cond_newborn.isna()] = False - cond_childhood = (_df['item_category'] == 'acute lower respiratory infections') | \ - (_df['item_category'] == 'measles') | \ - (_df['item_category'] == 'diarrhoea') - cond_rti = _df['item_category'] == 'road traffic injuries' - cond_cancer = _df['item_category'].str.contains('cancer') - cond_cancer[cond_cancer.isna()] = False - cond_ncds = (_df['item_category'] == 'epilepsy') | \ - (_df['item_category'] == 'depression') - _df.loc[cond_RH, 'item_category'] = 'reproductive_health' - _df.loc[cond_cancer, 'item_category'] = 'cancer' - _df.loc[cond_newborn, 'item_category'] = 'neonatal_health' - _df.loc[cond_childhood, 'item_category'] = 'other_childhood_illnesses' - _df.loc[cond_rti, 'item_category'] = 'road_traffic_injuries' - _df.loc[cond_ncds, 'item_category'] = 'ncds' - cond_condom = _df['item_code'] == 2 - _df.loc[cond_condom, 'item_category'] = 'contraception' - - # Create a general consumables category - general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, - 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960] - cond_general = _df['item_code'].isin(general_cons_list) - _df.loc[cond_general, 
'item_category'] = 'general' - - # Fill gaps in categories - dict_for_missing_categories = {292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections', - 307: 'reproductive_health', 2019: 'reproductive_health', - 2678: 'tb', 1171: 'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer'} - # Use map to create a new series from item_code to fill missing values in category - mapped_categories = _df['item_code'].map(dict_for_missing_categories) - # Use fillna on the 'item_category' column to fill missing values using the mapped_categories - _df['item_category'] = _df['item_category'].fillna(mapped_categories) - - return _df - +# --- 4. Generate new category variable for analysis --- # stkout_df = recategorize_modules_into_consumable_categories(stkout_df) item_code_category_mapping = stkout_df[['item_category', 'item_code']].drop_duplicates() @@ -655,12 +953,12 @@ def recategorize_modules_into_consumable_categories(_df): item_designations = item_designations.merge(item_code_category_mapping, left_on = 'Item_Code', right_on = 'item_code', how = 'left', validate = '1:1') item_designations.drop(columns = 'item_code').to_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv', index = False) -# --- 6.5 Replace district/fac_name/month entries where missing --- # +# --- 5. Replace district/fac_name/month entries where missing --- # for var in ['district', 'fac_name', 'month']: cond = stkout_df[var].isna() stkout_df.loc[cond, var] = 'Aggregate' -# --- 6.6 Export final stockout dataframe --- # +# --- 6. Export final stockout dataframe --- # # stkout_df.to_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv") # <-- this line commented out as the file is very large. 
@@ -670,26 +968,6 @@ def recategorize_modules_into_consumable_categories(_df): lmis_consumable_usage = stkout_df.copy() # TODO Generate a smaller version of this file # Collapse individual facilities -def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): - df_by_item_level_month = \ - _df.groupby(['item_category', 'item_code', 'district', 'fac_type_tlo', 'month'])[ - ['closing_bal', 'dispensed', 'received']].sum() - df_by_item_level_month = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') != "Aggregate"] - # Opening balance in January is the closing balance for the month minus what was received during the month plus what was dispensed - opening_bal_january = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'closing_bal'] + \ - df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'dispensed'] - \ - df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'received'] - closing_bal_december = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'December', 'closing_bal'] - # the consumable inflow during the year is the opening balance in January + what was received throughout the year - what was transferred to the next year (i.e. 
closing bal of December) - total_consumables_inflow_during_the_year = df_by_item_level_month['received'].groupby(level=['item_category', 'item_code', 'district', 'fac_type_tlo']).sum() +\ - opening_bal_january.reset_index(level='month', drop=True) -\ - closing_bal_december.reset_index(level='month', drop=True) - total_consumables_outflow_during_the_year = df_by_item_level_month['dispensed'].groupby(level=['item_category', 'item_code', 'district', 'fac_type_tlo']).sum() - inflow_to_outflow_ratio = total_consumables_inflow_during_the_year.div(total_consumables_outflow_during_the_year, fill_value=1) - inflow_to_outflow_ratio.loc[inflow_to_outflow_ratio < 1] = 1 # Ratio can't be less than 1 - - return inflow_to_outflow_ratio - inflow_to_outflow_ratio = get_inflow_to_outflow_ratio_by_item_and_facilitylevel(lmis_consumable_usage) # Clean values for analysis inflow_to_outflow_ratio.loc[inflow_to_outflow_ratio < 1] = 1 # Ratio can't be less than 1 @@ -711,7 +989,6 @@ def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): # the Master Facilities List. # unify the set within each facility_id - mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) fac_levels = {'0', '1a', '1b', '2', '3', '4'} @@ -737,7 +1014,8 @@ def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): # Take averages (now that 'Mzimba' is mapped-to by both 'Mzimba South' and 'Mzimba North'.) sf = sf.groupby(by=['district_std', 'fac_type_tlo', 'month', 'item_code'])['available_prop'].mean().reset_index() -# Fill in missing data: +# 4. 
INTERPOLATE MISSING DATA TO ENSURE DATA IS AVAILABLE FOR ALL ITEMS, MONTHS, LEVELS, DISTRICTS +######################################################################################################################## # 1) Cities to get same results as their respective regions copy_source_to_destination = { 'Mzimba': 'Mzuzu City', @@ -837,26 +1115,10 @@ def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): full_set = full_set.combine_first(sf_final.set_index(['Facility_ID', 'month', 'item_code'])['available_prop']) # Fill in the blanks with rules for interpolation. - facilities_by_level = defaultdict(set) for ix, row in mfl.iterrows(): facilities_by_level[row['Facility_Level']].add(row['Facility_ID']) - -def get_other_facilities_of_same_level(_fac_id): - """Return a set of facility_id for other facilities that are of the same level as that provided.""" - for v in facilities_by_level.values(): - if _fac_id in v: - return v - {_fac_id} - - -def interpolate_missing_with_mean(_ser): - """Return a series in which any values that are null are replaced with the mean of the non-missing.""" - if pd.isnull(_ser).all(): - raise ValueError - return _ser.fillna(_ser.mean()) - - # Create new dataset that include the interpolations (The operation is not done "in place", because the logic is based # on what results are missing before the interpolations in other facilities). 
full_set_interpolated = full_set * np.nan @@ -904,164 +1166,46 @@ def interpolate_missing_with_mean(_ser): full_set_interpolated = full_set_interpolated.reset_index() #full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') -def update_level1b_availability( - full_set_interpolated: pd.DataFrame, - facilities_by_level: dict, - resourcefilepath: Path, - district_to_city_dict: dict, - weighting: str = "district_1b_to_2_ratio" -) -> pd.DataFrame: - """ - Updates the availability of Level 1b facilities to be the weighted average - of availability at Level 1b and 2 facilities, since these levels are merged - together in simulations. - - weighting : {'level2', 'national_1b_to_2_ratio', 'district_1b_to_2_ratio'}, default 'district_1b_to_2_ratio' - Weighting strategy: - - 'level2': Replace 1b availability entirely with level 2 values. - - 'national_1b_to_2_ratio': Apply a single national 1b:2 ratio to all districts. - - 'district_1b_to_2_ratio': (default) Use district-specific 1b:2 ratios. 
- """ - # Load and prepare base weights (facility counts) - # --------------------------------------------------------------------- - weight = ( - pd.read_csv(resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Master_Facilities_List.csv') - [["District", "Facility_Level", "Facility_ID", "Facility_Count"]] - ) - - # Keep only Level 1b and 2 facilities - lvl1b2_weights = weight[weight["Facility_Level"].isin(["1b", "2"])].copy() - - # Compute weights depending on strategy - # --------------------------------------------------------------------- - if weighting == "level2": - # Force all weight on level 2 - lvl1b2_weights = lvl1b2_weights[~lvl1b2_weights.District.str.contains("City")] - lvl1b2_weights["weight"] = (lvl1b2_weights["Facility_Level"] == "2").astype(float) - lvl1b2_weights = lvl1b2_weights.drop(columns = 'Facility_ID') - - elif weighting == "national_1b_to_2_ratio": - lvl1b2_weights = lvl1b2_weights[~lvl1b2_weights.District.str.contains("City")] - # National total counts - national_counts = ( - lvl1b2_weights.groupby("Facility_Level")["Facility_Count"].sum().to_dict() - ) - total_fac = national_counts.get("1b", 0) + national_counts.get("2", 0) - if total_fac == 0: - raise ValueError("No facilities found at levels 1b or 2.") - lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Level"].map( - {lvl: cnt / total_fac for lvl, cnt in national_counts.items()} - ) - lvl1b2_weights = lvl1b2_weights.drop(columns='Facility_ID') - - elif weighting == "district_1b_to_2_ratio": - # Replace city names with their parent districts (temporarily for grouping) - city_to_district_dict = {v: k for k, v in district_to_city_dict.items()} - lvl1b2_weights["District"] = lvl1b2_weights["District"].replace(city_to_district_dict) - - # District-level weighting (default) - lvl1b2_weights = ( - lvl1b2_weights - .groupby(["District", "Facility_Level"], as_index=False)["Facility_Count"] - .sum() - ) - - lvl1b2_weights["total_facilities"] = 
lvl1b2_weights.groupby("District")["Facility_Count"].transform("sum") - lvl1b2_weights["weight"] = lvl1b2_weights["Facility_Count"] / lvl1b2_weights["total_facilities"] - - else: - raise ValueError( - f"Invalid weighting '{weighting}'. Choose from " - "'level2', 'national_1b_to_2_ratio', or 'district_1b_to_2_ratio'." - ) - - # Add back city districts (reverse mapping) - for source, destination in copy_source_to_destination.items(): - new_rows = lvl1b2_weights.loc[lvl1b2_weights.District == source].copy() - new_rows.District = destination - lvl1b2_weights = pd.concat([lvl1b2_weights, new_rows], axis=0, ignore_index=True) - - # Merge Facility_ID back - lvl1b2_weights = lvl1b2_weights.merge( - weight.loc[weight["Facility_Level"].isin(["1b", "2"]), ["District", "Facility_Level", "Facility_ID"]], - on=["District", "Facility_Level"], - how="left", - validate="1:1" - ) - - # Subset Level 1b and 2 facilities and apply weights - # --------------------------------------------------------------------- - lvl1b2_ids = list(facilities_by_level.get("1b", [])) + list(facilities_by_level.get("2", [])) - full_set_interpolated_levels1b2 = full_set_interpolated[ - full_set_interpolated["Facility_ID"].isin(lvl1b2_ids) - ].copy() - - full_set_interpolated_levels1b2 = full_set_interpolated_levels1b2.merge( - lvl1b2_weights[["District", "Facility_Level", "Facility_ID", "weight"]], - on="Facility_ID", - how="left", - validate="m:1" - ) - - # Apply weighting - full_set_interpolated_levels1b2["available_prop"] *= full_set_interpolated_levels1b2["weight"] - - # Aggregate to district-month-item level - full_set_interpolated_levels1b2 = ( - full_set_interpolated_levels1b2 - .groupby(["District", "month", "item_code"], as_index=False)["available_prop"] - .sum() - ) - full_set_interpolated_levels1b2["Facility_Level"] = "1b" - - # Reattach Facility_IDs for level 1b - full_set_interpolated_levels1b2 = full_set_interpolated_levels1b2.merge( - lvl1b2_weights.query("Facility_Level == 
'1b'")[["District", "Facility_Level", "Facility_ID", "weight"]], - on=["District", "Facility_Level"], - how="left", - validate="m:1" - ) - - # Replace old level 1b facilities and recompute weighted availability - # --------------------------------------------------------------------- - # Drop old Level 1b facilities - full_set_interpolated = full_set_interpolated[ - ~full_set_interpolated["Facility_ID"].isin(facilities_by_level.get("1b", [])) - ] - - # Append new 1b facility data - full_set_interpolated = pd.concat( - [ - full_set_interpolated, - full_set_interpolated_levels1b2[["Facility_ID", "month", "item_code", "available_prop"]] - ], - axis=0, - ignore_index=True - ) - - return full_set_interpolated +# 5. ADD ALTERNATIVE AVAILABILITY SCENARIOS +######################################################################################################################## +# Add alternative availability scenarios to represent improved or reduce consumable availability +full_set_interpolated_with_scenarios = generate_alternative_availability_scenarios(full_set_interpolated) -full_set_interpolated = update_level1b_availability( - full_set_interpolated=full_set_interpolated, +full_set_interpolated_with_scenarios_level1b_fixed = update_level1b_availability( + availability_df=full_set_interpolated_with_scenarios, facilities_by_level=facilities_by_level, resourcefilepath=resourcefilepath, district_to_city_dict=copy_source_to_destination, weighting = 'district_1b_to_2_ratio', ) +# Compare availability averages by Facility_Level before and after the 1b fix +available_cols = [c for c in full_set_interpolated_with_scenarios.columns if c.startswith("available_prop")] +level1b_fix_plots_path = outputfilepath / 'comparison_plots' +figurespath_scenarios = outputfilepath / 'consumable_scenarios' +if not os.path.exists(level1b_fix_plots_path): + os.makedirs(level1b_fix_plots_path) +plot_availability_before_and_after_level1b_fix(old_df = full_set_interpolated_with_scenarios, + new_df = 
full_set_interpolated_with_scenarios_level1b_fixed, + mfl = mfl, + available_cols = available_cols, # List of availability columns to summarise + save_figure_as = level1b_fix_plots_path / 'availability_before_and_after_level1b_fix.png') + +# 6. CHECK FORMAT AND SAVE AS RESOURCEFILE +######################################################################################################################## # --- Check that the exported file has the properties required of it by the model code. --- # -check_format_of_consumables_file(df=full_set_interpolated, fac_ids=fac_ids) +check_format_of_consumables_file(df=full_set_interpolated_with_scenarios_level1b_fixed, fac_ids=fac_ids) # %% # Save -full_set_interpolated.to_csv( +full_set_interpolated_with_scenarios_level1b_fixed.to_csv( path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", index=False ) # %% -# 7. COMPARISON WITH HHFA DATA, 2018/19 ## -######################################################################################### +# 7. COMPARISON WITH HHFA DATA, 2018/19 +######################################################################################################################## # --- 7.1 Prepare comparison dataframe --- ## # Note that this only plot consumables for which data is available in the HHFA # i. Prepare data from HHFA @@ -1076,7 +1220,7 @@ def update_level1b_availability( hhfa_comparison_df = hhfa_comparison_df.rename({'fac_type_tlo': 'Facility_Level'}, axis=1) # ii. 
Collapse final model availability data by facility level -final_availability_df = full_set_interpolated +final_availability_df = full_set_interpolated_with_scenarios_level1b_fixed mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") final_availability_df = pd.merge(final_availability_df, mfl[['District', 'Facility_Level', 'Facility_ID']], how="left", on=['Facility_ID'], @@ -1098,39 +1242,7 @@ def update_level1b_availability( size = 10 comparison_df['consumable_labels'] = comparison_df['consumable_name_tlo'].str[:10] -# Define function to draw calibration plots at different levels of disaggregation -def comparison_plot(level_of_disaggregation, group_by_var, colour): - comparison_df_agg = comparison_df.groupby([group_by_var], - as_index=False).agg({'available_prop': 'mean', - 'available_prop_hhfa': 'mean', - 'Facility_Level': 'first', - 'consumable_labels': 'first'}) - comparison_df_agg['labels'] = comparison_df_agg[level_of_disaggregation] - - ax = comparison_df_agg.plot.scatter('available_prop', 'available_prop_hhfa', c=colour) - ax.axline([0, 0], [1, 1]) - for i, label in enumerate(comparison_df_agg['labels']): - plt.annotate(label, - (comparison_df_agg['available_prop'][i] + 0.005, - comparison_df_agg['available_prop_hhfa'][i] + 0.005), - fontsize=6, rotation=38) - if level_of_disaggregation != 'aggregate': - plt.title('Disaggregated by ' + level_of_disaggregation, fontsize=size, weight="bold") - else: - plt.title('Aggregate', fontsize=size, weight="bold") - plt.xlabel('Pr(drug available) as per TLO model') - plt.ylabel('Pr(drug available) as per HHFA') - save_name = 'comparison_plots/calibration_to_hhfa_' + level_of_disaggregation + '.png' - plt.savefig(outputfilepath / save_name) - - # 7.2.1 Aggregate plot -# First create folder in which to store the plots - -if not os.path.exists(outputfilepath / 'comparison_plots'): - os.makedirs(outputfilepath / 'comparison_plots') - print("folder to store 
Model-HHFA comparison plots created") - comparison_df['aggregate'] = 'aggregate' level_of_disaggregation = 'aggregate' colour = 'red' @@ -1149,23 +1261,7 @@ def comparison_plot(level_of_disaggregation, group_by_var, colour): colour = 'yellow' comparison_plot(level_of_disaggregation, group_by_var, colour) - # 7.2.4 Plot by item and facility level -def comparison_plot_by_level(fac_type): - cond_fac_type = comparison_df['Facility_Level'] == fac_type - comparison_df_by_level = comparison_df[cond_fac_type].reset_index() - plt.scatter(comparison_df_by_level['available_prop'], - comparison_df_by_level['available_prop_hhfa']) - plt.axline([0, 0], [1, 1]) - for i, label in enumerate(comparison_df_by_level['consumable_labels']): - plt.annotate(label, (comparison_df_by_level['available_prop'][i] + 0.005, - comparison_df_by_level['available_prop_hhfa'][i] + 0.005), - fontsize=6, rotation=27) - plt.title(fac_type, fontsize=size, weight="bold") - plt.xlabel('Pr(drug available) as per TLO model') - plt.ylabel('Pr(drug available) as per HHFA') - - fig = plt.figure(figsize=(22, 22)) plt.subplot(421) comparison_plot_by_level(comparison_df['Facility_Level'].unique()[1]) @@ -1176,3 +1272,33 @@ def comparison_plot_by_level(fac_type): plt.subplot(424) comparison_plot_by_level(comparison_df['Facility_Level'].unique()[4]) plt.savefig(outputfilepath / 'comparison_plots/calibration_to_hhfa_fac_type_and_consumable.png') + +# %% +# 8. 
PLOT SCENARIO SUMMARIES +######################################################################################################################## +# Create the directory if it doesn't exist +figurespath_scenarios = outputfilepath / 'consumable_scenarios' +if not os.path.exists(figurespath_scenarios): + os.makedirs(figurespath_scenarios) + +chosen_availability_columns = [c for c in full_set_interpolated_with_scenarios_level1b_fixed.columns if c.startswith("available_prop")] +scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'Non-therapeutic \n consumables', 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist-\n managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', + 'available_prop_scenario6': '75th percentile\n facility', 'available_prop_scenario7': '90th percentile \n facility', 'available_prop_scenario8': 'Best \n facility', + 'available_prop_scenario9': 'Best facility \n (including DHO)','available_prop_scenario10': 'HIV supply \n chain', 'available_prop_scenario11': 'EPI supply \n chain', + 'available_prop_scenario12': 'HIV moved to \n Govt supply chain \n (Avg by Level)', 'available_prop_scenario13': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)', + 'available_prop_scenario14': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)', + 'available_prop_scenario15': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)'} + +# Generate descriptive plots of consumable availability +program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] +program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] +generate_descriptive_consumable_availability_plots(tlo_availability_df = full_set_interpolated_with_scenarios_level1b_fixed, + figurespath = figurespath_scenarios, + mfl = 
mfl, + program_item_mapping = program_item_mapping, + chosen_availability_columns = None, + scenario_names_dict = scenario_names_dict,) + + + diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 9ca0554650..32281bbe96 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -45,7 +45,7 @@ import numpy as np import pandas as pd import seaborn as sns -from plotnine import aes, element_text, geom_bar, ggplot, labs, theme, ylim # for ggplots from R +#from plotnine import aes, element_text, geom_bar, ggplot, labs, theme, ylim # for ggplots from R from tlo.methods.consumables import check_format_of_consumables_file @@ -61,637 +61,659 @@ resourcefilepath = Path("./resources") path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" -# 1. 
Import and clean data files -#********************************** -# 1.1 Import TLO model availability data -#------------------------------------------------------ -tlo_availability_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") -# Drop any scenario data previously included in the resourcefile -tlo_availability_df = tlo_availability_df[['Facility_ID', 'month','item_code', 'available_prop']] - -# Import item_category -program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] -program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] - -# 1.1.1 Attach district, facility level and item_category to this dataset -#---------------------------------------------------------------- -# Get TLO Facility_ID for each district and facility level -mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") -districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) -fac_levels = {'0', '1a', '1b', '2', '3', '4'} -tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - on = ['Facility_ID'], how='left') - -tlo_availability_df = tlo_availability_df.merge(program_item_mapping, - on = ['item_code'], how='left') - -# 1.2 Import scenario data -#------------------------------------------------------ -scenario_availability_df = pd.read_csv(outputfilepath / "regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv") -scenario_availability_df = scenario_availability_df.drop(['Unnamed: 0'], axis=1) -scenario_availability_df = scenario_availability_df.rename({'item': 'item_hhfa'}, axis=1) - -# Prepare scenario data to be merged to TLO model availability based on TLO model features -# 1.2.1 Level of care 
-#------------------------------------------------------ -scenario_availability_df['fac_type'] = scenario_availability_df['fac_type_original'].str.replace("Facility_level_", "") - -# 1.2.2 District -#------------------------------------------------------ -# Do some mapping to make the Districts in the scenario file line-up with the definition of Districts in the model -rename_and_collapse_to_model_districts = { - 'Mzimba South': 'Mzimba', - 'Mzimba North': 'Mzimba', -} -scenario_availability_df = scenario_availability_df.rename({'district': 'district_original'}, axis=1) -scenario_availability_df['District'] = scenario_availability_df['district_original'].replace(rename_and_collapse_to_model_districts) - -# Cities to get same results as their respective regions -copy_source_to_destination = { - 'Mzimba': 'Mzuzu City', - 'Lilongwe': 'Lilongwe City', - 'Zomba': 'Zomba City', - 'Blantyre': 'Blantyre City', - 'Nkhata Bay': 'Likoma' # based on anecdotal evidence, assume that they experience the same change in avaiability as a result of interventions based on regression results -} -for source, destination in copy_source_to_destination.items(): - new_rows = scenario_availability_df.loc[scenario_availability_df.District == source].copy() # standardised district names - new_rows.District = destination - scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index = True) - -assert sorted(set(districts)) == sorted(set(pd.unique(scenario_availability_df.District))) - -# 1.2.3 Facility_ID -# #------------------------------------------------------ -# Merge-in facility_id -scenario_availability_df = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - left_on=['District', 'fac_type'], - right_on=['District', 'Facility_Level'], how='left', indicator=True) -scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_facid'}, axis=1) - -# Extract list of District X Facility Level combinations for which there 
is no HHFA data -df_to_check_prediction_completeness = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - left_on=['District', 'Facility_Level'], - right_on=['District', 'Facility_Level'], how='right', indicator=True) -cond_no_1b = (df_to_check_prediction_completeness['Facility_Level'].isin(['1b'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') -cond_no_1a = (df_to_check_prediction_completeness['Facility_Level'].isin(['1a'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') -districts_with_no_scenario_data_for_1b = df_to_check_prediction_completeness[cond_no_1b]['District'].unique() -districts_with_no_scenario_data_for_1a = df_to_check_prediction_completeness[cond_no_1a]['District'].unique() -districts_with_no_scenario_data_for_1b_only = np.setdiff1d(districts_with_no_scenario_data_for_1b, districts_with_no_scenario_data_for_1a) - -# According to HHFA data, Balaka, Machinga, Mwanza, Ntchisi and Salima do not have level 1b facilities -# Likoma was not included in the regression because of the limited variation within the district - only 4 facilities - we have assumed that the change of consumable -# availability in Likoma is equal to that predicted for Nkhata Bay - -# 1.2.4 Program -#------------------------------------------------------ -scenario_availability_df.loc[scenario_availability_df.program_plot == 'infection_prev', 'program_plot'] = 'general' # there is no separate infection_prevention category in the TLO availability data -map_model_programs_to_hhfa = { - 'contraception': 'contraception', - 'general': 'general', - 'reproductive_health': 'obs&newb', - 'road_traffic_injuries': 'surgical', - 'epi': 'epi', - 'neonatal_health': 'obs&newb', - 'other_childhood_illnesses': 'alri', - 'malaria': 'malaria', - 'tb': 'tb', - 'hiv': 'hiv', - 'undernutrition': 'child', - 'ncds': 'ncds', - 'cardiometabolicdisorders': 'ncds', - 'cancer': 'ncds', -} -# Reverse the map_model_programs_to_hhfa dictionary 
-hhfa_to_model_programs = {v: k for k, v in map_model_programs_to_hhfa.items()} - -scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(hhfa_to_model_programs) # TODO this might not be relevant - -# 1.2.5 Consumable/Item code and Category -#------------------------------------------------------ -# Load TLO - HHFA consumable name crosswalk -consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['item_code', 'consumable_name_tlo', -'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation']] - -# Keep only item_codes in the availability dataframe -consumable_crosswalk_df = consumable_crosswalk_df.merge(tlo_availability_df[['item_code']], how = 'right', on = 'item_code') -# TODO is module_name used? -# TODO add new consumables Rifapentine to this? - -# Now merge in TLO item codes -scenario_availability_df = scenario_availability_df.reset_index(drop = True) -scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation', 'consumable_name_tlo']], - left_on = ['item_hhfa'], right_on = ['item_hhfa_for_scenario_generation'], how='right', indicator=True, validate = "m:m") -scenario_availability_df = scenario_availability_df.drop_duplicates(['Facility_ID', 'item_code']) -scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_itemcode'}, axis=1) -print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ", scenario_availability_df.merge_itemcode.value_counts()[1]) - -# Before merging the above dataframe with tlo_availability_df, and apply a general interpolation rule to fill any gaps, -# we need to make sure that any specific interpolation rules are applied to the scenario dataframe - -# Further a row needs to be added for 1b level under 
Balaka, Machinga, Mwanza, Ntchisi and Salima -print("Number of unique facility IDs: \n - TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(), - "\n - Scenario based on regression = ", scenario_availability_df.Facility_ID.nunique(), - "\nNumber of unique item codes: \n - TLO consumable availability data = ", tlo_availability_df.item_code.nunique(), - "\n - TLO consumable availability repository = ", consumable_crosswalk_df.item_code.nunique(), - "\n - Scenario based on regression = ", scenario_availability_df.item_code.nunique()) - -# Extract list of TLO consumables which weren't matched with the availability prediction dataframe -items_not_matched = scenario_availability_df['merge_itemcode'] == 'right_only' - -# Get average availability_change_prop value by facility_ID and category_tlo -scenario_availability_df = scenario_availability_df.merge(program_item_mapping, - on = ['item_code'], validate = "m:1", - how = "left") - -# 1.3 Initial interpolation -#------------------------------------------------------ -# 1.3.1 Items not relevant to the regression analysis -items_not_relevant_to_regression = (items_not_matched) & (scenario_availability_df['hhfa_mapping_rationale_for_scenario_generation'] == 'not relevant to logistic regression analysis') -# For category 3, replace availability_change_prop with 1, since we assume that the system-level intervention does not change availability -list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2', - 'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5'] -for var in list_of_scenario_variables: - scenario_availability_df.loc[items_not_relevant_to_regression,var] = 1 - -# 1.3.2 For level 1b for the districts where this level was not present in the regression analysis/HHFA dataset, assume -# that the change is equal to the product of the (ratio of average change across districts for level 1b to -# average change across districts for level 1a) 
and change for each item_code for level 1a for that district -#------------------------------------------------------------------------------------------------------------ -average_change_across_districts = scenario_availability_df.groupby(['Facility_Level','item_code'])[list_of_scenario_variables].mean().reset_index() - -# Generate the ratio of the proportional changes to availability of level 1b to 1a in the districts for which level 1b data is available -new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} -new_colnames_1b = {col: col + '_1b' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} -average_change_across_districts_for_1a = average_change_across_districts[average_change_across_districts.Facility_Level == "1a"].rename(new_colnames_1a, axis = 1).drop('Facility_Level', axis = 1) -average_change_across_districts_for_1b = average_change_across_districts[average_change_across_districts.Facility_Level == "1b"].rename(new_colnames_1b, axis = 1).drop('Facility_Level', axis = 1) -ratio_of_change_across_districts_1b_to_1a = average_change_across_districts_for_1a.merge(average_change_across_districts_for_1b, - how = "left", on = ['item_code']) -for var in list_of_scenario_variables: - var_ratio = 'ratio_' + var - var_1a = var + '_1a' - var_1b = var + '_1b' - ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b])/(ratio_of_change_across_districts_1b_to_1a[var_1a]) -ratio_of_change_across_districts_1b_to_1a.reset_index(drop = True) -# TODO check if this ratio should be of the proportions minus 1 - -# For districts with no level 1b data in the HHFA, use the ratio of change in level 1b facilities to level 1a facilities to generate the expected proportional change in availability -# for level 1b facilities in those districts -scenario_availability_df = 
scenario_availability_df.reset_index(drop = True) -cond_districts_with_1b_missing = scenario_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only) -cond_1a = scenario_availability_df.Facility_Level == '1a' -cond_1b = scenario_availability_df.Facility_Level == '1b' -df_1a = scenario_availability_df[cond_districts_with_1b_missing & cond_1a] - -ratio_vars = ['ratio_' + item for item in list_of_scenario_variables] # create columns to represent the ratio of change in 1b facilities to level 1a facilities - -item_var = ['item_code'] - -# First merge the dataframe with changes at level 1a with the ratio of 1b to 1a -df_missing_1b_imputed = df_1a.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars], - on = item_var, - how = 'left', validate = "m:1") - -# Then multiply the ratio of 1b to 1a with the change at level 1a to get the expected change at level 1b -for var in list_of_scenario_variables: - df_missing_1b_imputed[var] = df_missing_1b_imputed[var] * df_missing_1b_imputed['ratio_' + var] -# Update columns so the dataframe in fact refers to level 1b facilities -df_missing_1b_imputed.Facility_Level = '1b' # Update facility level to 1 -# Replace Facility_IDs -df_missing_1b_imputed = df_missing_1b_imputed.drop('Facility_ID', axis = 1).merge(mfl[['District', 'Facility_Level', 'Facility_ID']], - on =['District', 'Facility_Level'], - how = 'left') -# Append the new imputed level 1b dataframe to the original dataframe -df_without_districts_with_no_1b_facilities = scenario_availability_df[~(cond_districts_with_1b_missing & cond_1b)] -scenario_availability_df = pd.concat([df_without_districts_with_no_1b_facilities, df_missing_1b_imputed], ignore_index = True) - -# 2. 
Merge TLO model availability data with scenario data using crosswalk -#************************************************************************* -# 2.1 Merge the two datasets -#------------------------------------------------------ -id_variables = ['item_code','Facility_ID'] - -full_scenario_df = tlo_availability_df.merge(scenario_availability_df[id_variables + list_of_scenario_variables], - how='left', on=['Facility_ID', 'item_code'], indicator = True) -full_scenario_df = full_scenario_df.rename({'_merge': 'merge_scenario'}, axis=1) -full_scenario_df = full_scenario_df.drop_duplicates(['Facility_ID', 'item_code', 'month']) - -# Check that level 1b values are currently imputed -#full_scenario_df[full_scenario_df.District == 'Balaka'].groupby(['District', 'Facility_Level'])['change_proportion_scenario1'].mean() - -# 2.2 Further imputation -#------------------------------------------------------ -# 2.2.1 For all levels other than 1a and 1b, there will be no change in consumable availability -#------------------------------------------------------------------------------------------------------------ -fac_levels_not_relevant_to_regression = full_scenario_df.Facility_Level.isin(['0', '2', '3', '4']) - -for var in list_of_scenario_variables: - full_scenario_df.loc[fac_levels_not_relevant_to_regression, var] = 1 - -# 2.3 Final checks -#------------------------------------------------------ -# 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total -# number of rows as the original small availability resource file -#--------------------------------------------------------------------------------------------------------- -assert(full_scenario_df.item_code.nunique() == tlo_availability_df.item_code.nunique()) # the number of items in the new dataframe is the same at the original availability dataframe -assert(full_scenario_df.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) # the number of Facility IDs in the new 
dataframe is the same at the original availability dataframe -assert(len(full_scenario_df) == len(tlo_availability_df)) # the number of rows in the new dataframe is the same at the original availability dataframe - -# 2.3.2 Construct dataset that conforms to the principles expected by the simulation: i.e. that there is an entry for every -# facility_id and for every month for every item_code. -#----------------------------------------------------------------------------------------------------------------------- -# Generate the dataframe that has the desired size and shape -fac_ids = set(mfl.loc[mfl.Facility_Level != '5'].Facility_ID) -item_codes = set(tlo_availability_df.item_code.unique()) -months = range(1, 13) -all_availability_columns = ['available_prop'] + list_of_scenario_variables - -# Create a MultiIndex from the product of fac_ids, months, and item_codes -index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code']) - -# Initialize a DataFrame with the MultiIndex and columns, filled with NaN -full_set = pd.DataFrame(index=index, columns=all_availability_columns) -full_set = full_set.astype(float) # Ensure all columns are float type and filled with NaN - -# Insert the data, where it is available. -full_set = full_set.combine_first(full_scenario_df.set_index(['Facility_ID', 'month', 'item_code'])[all_availability_columns]) - -# Fill in the blanks with rules for interpolation. 
-facilities_by_level = defaultdict(set) -for ix, row in mfl.iterrows(): - facilities_by_level[row['Facility_Level']].add(row['Facility_ID']) - -items_by_category = defaultdict(set) -for ix, row in program_item_mapping.iterrows(): - items_by_category[row['item_category']].add(row['item_code']) - -def get_other_facilities_of_same_level(_fac_id): - """Return a set of facility_id for other facilities that are of the same level as that provided.""" - for v in facilities_by_level.values(): - if _fac_id in v: - return v - {_fac_id} - -def get_other_items_of_same_category(_item_code): - """Return a set of item_codes for other items that are in the same category/program as that provided.""" - for v in items_by_category.values(): - if _item_code in v: - return v - {_item_code} -def interpolate_missing_with_mean(_ser): - """Return a series in which any values that are null are replaced with the mean of the non-missing.""" - if pd.isnull(_ser).all(): - raise ValueError - return _ser.fillna(_ser.mean()) - -# Create new dataset that include the interpolations (The operation is not done "in place", because the logic is based -# on what results are missing before the interpolations in other facilities). -full_set_interpolated = full_set * np.nan -full_set_interpolated.available_prop = full_set.available_prop - -for fac in fac_ids: - for item in item_codes: - for col in list_of_scenario_variables: - print(f"Now doing: fac={fac}, item={item}, column={col}") - - # Get records of the availability of this item in this facility. - _monthly_records = full_set.loc[(fac, slice(None), item), col].copy() - - if pd.notnull(_monthly_records).any(): - # If there is at least one record of this item at this facility, then interpolate the missing months from - # the months for there are data on this item in this facility. (If none are missing, this has no effect). 
- _monthly_records = interpolate_missing_with_mean(_monthly_records) - - else: - # If there is no record of this item at this facility, check to see if it's available at other facilities - # of the same level - # Or if there is no record of item at other facilities at this level, check to see if other items of this category - # are available at this facility level - facilities = list(get_other_facilities_of_same_level(fac)) - - other_items = get_other_items_of_same_category(item) - items = list(other_items) if other_items else other_items - - recorded_at_other_facilities_of_same_level = pd.notnull( - full_set.loc[(facilities, slice(None), item), col] - ).any() - - if not items: - category_recorded_at_other_facilities_of_same_level = False - else: - # Filter only items that exist in the MultiIndex at this facility - valid_items = [ - itm for itm in items - if any((fac, m, itm) in full_set.index for m in months) - ] - - category_recorded_at_other_facilities_of_same_level = pd.notnull( - full_set.loc[(fac, slice(None), valid_items), col] - ).any() +def generate_alternative_availability_scenarios(tlo_availability_df: pd.DataFrame = None) -> pd.DataFrame: + # 1. 
Import and clean data files + #********************************** + # 1.1 Import TLO model availability data + # ------------------------------------------------------ + # Drop any scenario data previously included in the resourcefile + tlo_availability_df = tlo_availability_df[['Facility_ID', 'month','item_code', 'available_prop']] + + # Import item_category + program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] + program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] + + # 1.1.1 Attach district, facility level and item_category to this dataset + #---------------------------------------------------------------- + # Get TLO Facility_ID for each district and facility level + mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") + districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) + fac_levels = {'0', '1a', '1b', '2', '3', '4'} + tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + on = ['Facility_ID'], how='left') + + tlo_availability_df = tlo_availability_df.merge(program_item_mapping, + on = ['item_code'], how='left') + + # 1.2 Import scenario data + #------------------------------------------------------ + scenario_availability_df = pd.read_csv(outputfilepath / "regression_analysis/predictions/predicted_consumable_availability_regression_scenarios.csv") + scenario_availability_df = scenario_availability_df.drop(['Unnamed: 0'], axis=1) + scenario_availability_df = scenario_availability_df.rename({'item': 'item_hhfa'}, axis=1) + + # Prepare scenario data to be merged to TLO model availability based on TLO model features + # 1.2.1 Level of care + #------------------------------------------------------ + 
scenario_availability_df['fac_type'] = scenario_availability_df['fac_type_original'].str.replace("Facility_level_", "") + + # 1.2.2 District + #------------------------------------------------------ + # Do some mapping to make the Districts in the scenario file line-up with the definition of Districts in the model + rename_and_collapse_to_model_districts = { + 'Mzimba South': 'Mzimba', + 'Mzimba North': 'Mzimba', + } + scenario_availability_df = scenario_availability_df.rename({'district': 'district_original'}, axis=1) + scenario_availability_df['District'] = scenario_availability_df['district_original'].replace(rename_and_collapse_to_model_districts) + + # Cities to get same results as their respective regions + copy_source_to_destination = { + 'Mzimba': 'Mzuzu City', + 'Lilongwe': 'Lilongwe City', + 'Zomba': 'Zomba City', + 'Blantyre': 'Blantyre City', + 'Nkhata Bay': 'Likoma' # based on anecdotal evidence, assume that they experience the same change in avaiability as a result of interventions based on regression results + } + for source, destination in copy_source_to_destination.items(): + new_rows = scenario_availability_df.loc[scenario_availability_df.District == source].copy() # standardised district names + new_rows.District = destination + scenario_availability_df = pd.concat([scenario_availability_df, new_rows], ignore_index = True) + + assert sorted(set(districts)) == sorted(set(pd.unique(scenario_availability_df.District))) + + # 1.2.3 Facility_ID + # #------------------------------------------------------ + # Merge-in facility_id + scenario_availability_df = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + left_on=['District', 'fac_type'], + right_on=['District', 'Facility_Level'], how='left', indicator=True) + scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_facid'}, axis=1) + + # Extract list of District X Facility Level combinations for which there is no HHFA data + 
df_to_check_prediction_completeness = scenario_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + left_on=['District', 'Facility_Level'], + right_on=['District', 'Facility_Level'], how='right', indicator=True) + cond_no_1b = (df_to_check_prediction_completeness['Facility_Level'].isin(['1b'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') + cond_no_1a = (df_to_check_prediction_completeness['Facility_Level'].isin(['1a'])) & (df_to_check_prediction_completeness['_merge'] == 'right_only') + districts_with_no_scenario_data_for_1b = df_to_check_prediction_completeness[cond_no_1b]['District'].unique() + districts_with_no_scenario_data_for_1a = df_to_check_prediction_completeness[cond_no_1a]['District'].unique() + districts_with_no_scenario_data_for_1b_only = np.setdiff1d(districts_with_no_scenario_data_for_1b, districts_with_no_scenario_data_for_1a) + + # According to HHFA data, Balaka, Machinga, Mwanza, Ntchisi and Salima do not have level 1b facilities + # Likoma was not included in the regression because of the limited variation within the district - only 4 facilities - we have assumed that the change of consumable + # availability in Likoma is equal to that predicted for Nkhata Bay + + # 1.2.4 Program + #------------------------------------------------------ + scenario_availability_df.loc[scenario_availability_df.program_plot == 'infection_prev', 'program_plot'] = 'general' # there is no separate infection_prevention category in the TLO availability data + map_model_programs_to_hhfa = { + 'contraception': 'contraception', + 'general': 'general', + 'reproductive_health': 'obs&newb', + 'road_traffic_injuries': 'surgical', + 'epi': 'epi', + 'neonatal_health': 'obs&newb', + 'other_childhood_illnesses': 'alri', + 'malaria': 'malaria', + 'tb': 'tb', + 'hiv': 'hiv', + 'undernutrition': 'child', + 'ncds': 'ncds', + 'cardiometabolicdisorders': 'ncds', + 'cancer': 'ncds', + } + # Reverse the map_model_programs_to_hhfa dictionary + 
hhfa_to_model_programs = {v: k for k, v in map_model_programs_to_hhfa.items()} + + scenario_availability_df['category_tlo'] = scenario_availability_df['program_plot'].replace(hhfa_to_model_programs) # TODO this might not be relevant + + # 1.2.5 Consumable/Item code and Category + #------------------------------------------------------ + # Load TLO - HHFA consumable name crosswalk + consumable_crosswalk_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', encoding='ISO-8859-1')[['item_code', 'consumable_name_tlo', + 'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation']] + + # Keep only item_codes in the availability dataframe + consumable_crosswalk_df = consumable_crosswalk_df.merge(tlo_availability_df[['item_code']], how = 'right', on = 'item_code') + # TODO is module_name used? + # TODO add new consumables Rifapentine to this? + + # Now merge in TLO item codes + scenario_availability_df = scenario_availability_df.reset_index(drop = True) + scenario_availability_df = scenario_availability_df.merge(consumable_crosswalk_df[['item_code', 'item_hhfa_for_scenario_generation', 'hhfa_mapping_rationale_for_scenario_generation', 'consumable_name_tlo']], + left_on = ['item_hhfa'], right_on = ['item_hhfa_for_scenario_generation'], how='right', indicator=True, validate = "m:m") + scenario_availability_df = scenario_availability_df.drop_duplicates(['Facility_ID', 'item_code']) + scenario_availability_df = scenario_availability_df.rename({'_merge': 'merge_itemcode'}, axis=1) + print("Number of item codes from the TLO model for which no match was found in the regression-based scenario data = ", scenario_availability_df.merge_itemcode.value_counts()[1]) + + # Before merging the above dataframe with tlo_availability_df, and apply a general interpolation rule to fill any gaps, + # we need to make sure that any specific interpolation rules are applied to the scenario dataframe + + # Further a row needs to be added for 
1b level under Balaka, Machinga, Mwanza, Ntchisi and Salima + print("Number of unique facility IDs: \n - TLO consumable data = ", tlo_availability_df.Facility_ID.nunique(), + "\n - Scenario based on regression = ", scenario_availability_df.Facility_ID.nunique(), + "\nNumber of unique item codes: \n - TLO consumable availability data = ", tlo_availability_df.item_code.nunique(), + "\n - TLO consumable availability repository = ", consumable_crosswalk_df.item_code.nunique(), + "\n - Scenario based on regression = ", scenario_availability_df.item_code.nunique()) + + # Extract list of TLO consumables which weren't matched with the availability prediction dataframe + items_not_matched = scenario_availability_df['merge_itemcode'] == 'right_only' + + # Get average availability_change_prop value by facility_ID and category_tlo + scenario_availability_df = scenario_availability_df.merge(program_item_mapping, + on = ['item_code'], validate = "m:1", + how = "left") + + # 1.3 Initial interpolation + #------------------------------------------------------ + # 1.3.1 Items not relevant to the regression analysis + items_not_relevant_to_regression = (items_not_matched) & (scenario_availability_df['hhfa_mapping_rationale_for_scenario_generation'] == 'not relevant to logistic regression analysis') + # For category 3, replace availability_change_prop with 1, since we assume that the system-level intervention does not change availability + list_of_scenario_variables = ['change_proportion_scenario1', 'change_proportion_scenario2', + 'change_proportion_scenario3', 'change_proportion_scenario4', 'change_proportion_scenario5'] + for var in list_of_scenario_variables: + scenario_availability_df.loc[items_not_relevant_to_regression,var] = 1 + + # 1.3.2 For level 1b for the districts where this level was not present in the regression analysis/HHFA dataset, assume + # that the change is equal to the product of the (ratio of average change across districts for level 1b to + # average change 
across districts for level 1a) and change for each item_code for level 1a for that district + #------------------------------------------------------------------------------------------------------------ + average_change_across_districts = scenario_availability_df.groupby(['Facility_Level','item_code'])[list_of_scenario_variables].mean().reset_index() + + # Generate the ratio of the proportional changes to availability of level 1b to 1a in the districts for which level 1b data is available + new_colnames_1a = {col: col + '_1a' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} + new_colnames_1b = {col: col + '_1b' if col in list_of_scenario_variables else col for col in average_change_across_districts.columns} + average_change_across_districts_for_1a = average_change_across_districts[average_change_across_districts.Facility_Level == "1a"].rename(new_colnames_1a, axis = 1).drop('Facility_Level', axis = 1) + average_change_across_districts_for_1b = average_change_across_districts[average_change_across_districts.Facility_Level == "1b"].rename(new_colnames_1b, axis = 1).drop('Facility_Level', axis = 1) + ratio_of_change_across_districts_1b_to_1a = average_change_across_districts_for_1a.merge(average_change_across_districts_for_1b, + how = "left", on = ['item_code']) + for var in list_of_scenario_variables: + var_ratio = 'ratio_' + var + var_1a = var + '_1a' + var_1b = var + '_1b' + ratio_of_change_across_districts_1b_to_1a[var_ratio] = (ratio_of_change_across_districts_1b_to_1a[var_1b])/(ratio_of_change_across_districts_1b_to_1a[var_1a]) + ratio_of_change_across_districts_1b_to_1a.reset_index(drop = True) + # TODO check if this ratio should be of the proportions minus 1 + + # For districts with no level 1b data in the HHFA, use the ratio of change in level 1b facilities to level 1a facilities to generate the expected proportional change in availability + # for level 1b facilities in those districts + 
scenario_availability_df = scenario_availability_df.reset_index(drop = True) + cond_districts_with_1b_missing = scenario_availability_df.District.isin(districts_with_no_scenario_data_for_1b_only) + cond_1a = scenario_availability_df.Facility_Level == '1a' + cond_1b = scenario_availability_df.Facility_Level == '1b' + df_1a = scenario_availability_df[cond_districts_with_1b_missing & cond_1a] + + ratio_vars = ['ratio_' + item for item in list_of_scenario_variables] # create columns to represent the ratio of change in 1b facilities to level 1a facilities + + item_var = ['item_code'] + + # First merge the dataframe with changes at level 1a with the ratio of 1b to 1a + df_missing_1b_imputed = df_1a.merge(ratio_of_change_across_districts_1b_to_1a[item_var + ratio_vars], + on = item_var, + how = 'left', validate = "m:1") + + # Then multiply the ratio of 1b to 1a with the change at level 1a to get the expected change at level 1b + for var in list_of_scenario_variables: + df_missing_1b_imputed[var] = df_missing_1b_imputed[var] * df_missing_1b_imputed['ratio_' + var] + # Update columns so the dataframe in fact refers to level 1b facilities + df_missing_1b_imputed.Facility_Level = '1b' # Update facility level to 1 + # Replace Facility_IDs + df_missing_1b_imputed = df_missing_1b_imputed.drop('Facility_ID', axis = 1).merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + on =['District', 'Facility_Level'], + how = 'left') + # Append the new imputed level 1b dataframe to the original dataframe + df_without_districts_with_no_1b_facilities = scenario_availability_df[~(cond_districts_with_1b_missing & cond_1b)] + scenario_availability_df = pd.concat([df_without_districts_with_no_1b_facilities, df_missing_1b_imputed], ignore_index = True) + + # 2. 
Merge TLO model availability data with scenario data using crosswalk + #************************************************************************* + # 2.1 Merge the two datasets + #------------------------------------------------------ + id_variables = ['item_code','Facility_ID'] + + full_scenario_df = tlo_availability_df.merge(scenario_availability_df[id_variables + list_of_scenario_variables], + how='left', on=['Facility_ID', 'item_code'], indicator = True) + full_scenario_df = full_scenario_df.rename({'_merge': 'merge_scenario'}, axis=1) + full_scenario_df = full_scenario_df.drop_duplicates(['Facility_ID', 'item_code', 'month']) + + # Check that level 1b values are currently imputed + #full_scenario_df[full_scenario_df.District == 'Balaka'].groupby(['District', 'Facility_Level'])['change_proportion_scenario1'].mean() + + # 2.2 Further imputation + #------------------------------------------------------ + # 2.2.1 For all levels other than 1a and 1b, there will be no change in consumable availability + #------------------------------------------------------------------------------------------------------------ + fac_levels_not_relevant_to_regression = full_scenario_df.Facility_Level.isin(['0', '2', '3', '4']) + + for var in list_of_scenario_variables: + full_scenario_df.loc[fac_levels_not_relevant_to_regression, var] = 1 + + # 2.3 Final checks + #------------------------------------------------------ + # 2.3.1 Check that the merged dataframe has the same number of unique items, facility IDs, and total + # number of rows as the original small availability resource file + #--------------------------------------------------------------------------------------------------------- + assert(full_scenario_df.item_code.nunique() == tlo_availability_df.item_code.nunique()) # the number of items in the new dataframe is the same at the original availability dataframe + assert(full_scenario_df.Facility_ID.nunique() == tlo_availability_df.Facility_ID.nunique()) # the number of 
Facility IDs in the new dataframe is the same at the original availability dataframe + assert(len(full_scenario_df) == len(tlo_availability_df)) # the number of rows in the new dataframe is the same at the original availability dataframe + + # 2.3.2 Construct dataset that conforms to the principles expected by the simulation: i.e. that there is an entry for every + # facility_id and for every month for every item_code. + #----------------------------------------------------------------------------------------------------------------------- + # Generate the dataframe that has the desired size and shape + fac_ids = set(mfl.loc[mfl.Facility_Level != '5'].Facility_ID) + item_codes = set(tlo_availability_df.item_code.unique()) + months = range(1, 13) + all_availability_columns = ['available_prop'] + list_of_scenario_variables + + # Create a MultiIndex from the product of fac_ids, months, and item_codes + index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code']) + + # Initialize a DataFrame with the MultiIndex and columns, filled with NaN + full_set = pd.DataFrame(index=index, columns=all_availability_columns) + full_set = full_set.astype(float) # Ensure all columns are float type and filled with NaN + + # Insert the data, where it is available. + full_set = full_set.combine_first(full_scenario_df.set_index(['Facility_ID', 'month', 'item_code'])[all_availability_columns]) + + # Fill in the blanks with rules for interpolation. 
+ facilities_by_level = defaultdict(set) + for ix, row in mfl.iterrows(): + facilities_by_level[row['Facility_Level']].add(row['Facility_ID']) + + items_by_category = defaultdict(set) + for ix, row in program_item_mapping.iterrows(): + items_by_category[row['item_category']].add(row['item_code']) + + def get_other_facilities_of_same_level(_fac_id): + """Return a set of facility_id for other facilities that are of the same level as that provided.""" + for v in facilities_by_level.values(): + if _fac_id in v: + return v - {_fac_id} + + def get_other_items_of_same_category(_item_code): + """Return a set of item_codes for other items that are in the same category/program as that provided.""" + for v in items_by_category.values(): + if _item_code in v: + return v - {_item_code} + def interpolate_missing_with_mean(_ser): + """Return a series in which any values that are null are replaced with the mean of the non-missing.""" + if pd.isnull(_ser).all(): + raise ValueError + return _ser.fillna(_ser.mean()) + + # Create new dataset that include the interpolations (The operation is not done "in place", because the logic is based + # on what results are missing before the interpolations in other facilities). + full_set_interpolated = full_set * np.nan + full_set_interpolated.available_prop = full_set.available_prop + + for fac in fac_ids: + for item in item_codes: + for col in list_of_scenario_variables: + print(f"Now doing: fac={fac}, item={item}, column={col}") + + # Get records of the availability of this item in this facility. + _monthly_records = full_set.loc[(fac, slice(None), item), col].copy() - if recorded_at_other_facilities_of_same_level: - # If it recorded at other facilities of same level, find the average availability of the item at other - # facilities of the same level. 
- print("Data for facility ", fac, " extrapolated from other facilities within level - ", facilities) + if pd.notnull(_monthly_records).any(): + # If there is at least one record of this item at this facility, then interpolate the missing months from + # the months for there are data on this item in this facility. (If none are missing, this has no effect). + _monthly_records = interpolate_missing_with_mean(_monthly_records) + + else: + # If there is no record of this item at this facility, check to see if it's available at other facilities + # of the same level + # Or if there is no record of item at other facilities at this level, check to see if other items of this category + # are available at this facility level facilities = list(get_other_facilities_of_same_level(fac)) - _monthly_records = interpolate_missing_with_mean( - full_set.loc[(facilities, slice(None), item), col].groupby(level=1).mean() - ) - elif category_recorded_at_other_facilities_of_same_level and valid_items: - # If it recorded at other facilities of same level, find the average availability of the item at other - # facilities of the same level. 
- print("Data for item ", item, " extrapolated from other items within category - ", valid_items) + other_items = get_other_items_of_same_category(item) + items = list(other_items) if other_items else other_items - _monthly_records = interpolate_missing_with_mean( - full_set.loc[(fac, slice(None), valid_items), col].groupby(level=1).mean() - ) + recorded_at_other_facilities_of_same_level = pd.notnull( + full_set.loc[(facilities, slice(None), item), col] + ).any() + + if not items: + category_recorded_at_other_facilities_of_same_level = False + else: + # Filter only items that exist in the MultiIndex at this facility + valid_items = [ + itm for itm in items + if any((fac, m, itm) in full_set.index for m in months) + ] + + category_recorded_at_other_facilities_of_same_level = pd.notnull( + full_set.loc[(fac, slice(None), valid_items), col] + ).any() + + if recorded_at_other_facilities_of_same_level: + # If it recorded at other facilities of same level, find the average availability of the item at other + # facilities of the same level. + print("Data for facility ", fac, " extrapolated from other facilities within level - ", facilities) + facilities = list(get_other_facilities_of_same_level(fac)) + _monthly_records = interpolate_missing_with_mean( + full_set.loc[(facilities, slice(None), item), col].groupby(level=1).mean() + ) + + elif category_recorded_at_other_facilities_of_same_level and valid_items: + # If it recorded at other facilities of same level, find the average availability of the item at other + # facilities of the same level. 
+ print("Data for item ", item, " extrapolated from other items within category - ", valid_items) + + _monthly_records = interpolate_missing_with_mean( + full_set.loc[(fac, slice(None), valid_items), col].groupby(level=1).mean() + ) + + else: + # If it is not recorded at other facilities of same level, then assume that there is no change + print("No interpolation worked") + _monthly_records = _monthly_records.fillna(1.0) + + # Insert values (including corrections) into the resulting dataset. + full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values + # temporary code + assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0 + + # 3. Generate regression-based scenario data on consumable availablity + #************************************************************************* + # Create new consumable availability estimates for TLO model consumables using + # estimates of proportional change from the regression analysis based on HHFA data + #------------------------------------------------------ + prefix = 'change_proportion_' + list_of_scenario_suffixes = [s.replace(prefix, '') for s in list_of_scenario_variables] + + for scenario in list_of_scenario_suffixes: + full_set_interpolated['available_prop_' + scenario] = full_set_interpolated['available_prop'] * full_set_interpolated['change_proportion_' + scenario] + availability_greater_than_1 = full_set_interpolated['available_prop_' + scenario] > 1 + full_set_interpolated.loc[availability_greater_than_1, 'available_prop_' + scenario] = 1 + + assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) == + sum(full_set_interpolated['change_proportion_' + scenario].isna())) # make sure that there is an entry for every row in which there was previously data + + # 4. 
Generate best performing facility-based scenario data on consumable availability + #*************************************************************************************** + df = full_set_interpolated.reset_index().copy() + + # Try updating the avaiability to represent the 75th percentile by consumable + facility_levels = ['1a', '1b', '2'] + target_percentiles = [75, 90, 99] + + best_performing_facilities = {} + # Populate the dictionary + for level in facility_levels: + # Create an empty dictionary for the current level + best_performing_facilities[level] = {} - else: - # If it is not recorded at other facilities of same level, then assume that there is no change - print("No interpolation worked") - _monthly_records = _monthly_records.fillna(1.0) - - # Insert values (including corrections) into the resulting dataset. - full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values - # temporary code - assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0 - -# 3. 
Generate regression-based scenario data on consumable availablity -#************************************************************************* -# Create new consumable availability estimates for TLO model consumables using -# estimates of proportional change from the regression analysis based on HHFA data -#------------------------------------------------------ -prefix = 'change_proportion_' -list_of_scenario_suffixes = [s.replace(prefix, '') for s in list_of_scenario_variables] - -for scenario in list_of_scenario_suffixes: - full_set_interpolated['available_prop_' + scenario] = full_set_interpolated['available_prop'] * full_set_interpolated['change_proportion_' + scenario] - availability_greater_than_1 = full_set_interpolated['available_prop_' + scenario] > 1 - full_set_interpolated.loc[availability_greater_than_1, 'available_prop_' + scenario] = 1 - - assert(sum(full_set_interpolated['available_prop_' + scenario].isna()) == - sum(full_set_interpolated['change_proportion_' + scenario].isna())) # make sure that there is an entry for every row in which there was previously data - -# 4. 
Generate best performing facility-based scenario data on consumable availability -#*************************************************************************************** -df = full_set_interpolated.reset_index().copy() - -# Try updating the avaiability to represent the 75th percentile by consumable -facility_levels = ['1a', '1b', '2'] -target_percentiles = [75, 90, 99] - -best_performing_facilities = {} -# Populate the dictionary -for level in facility_levels: - # Create an empty dictionary for the current level - best_performing_facilities[level] = {} - - for item in item_codes: - best_performing_facilities[level][item] = {} - # Get the mean availability by Facility for the current level - mean_consumable_availability = pd.DataFrame(df[(df.Facility_ID.isin(facilities_by_level[level])) & (df.item_code == item)] - .groupby('Facility_ID')['available_prop'].mean()).reset_index() - - # Calculate the percentile rank of each row for 'available_prop' - mean_consumable_availability['percentile_rank'] = mean_consumable_availability['available_prop'].rank(pct=True) * 100 - - # Find the row which is closest to the nth percentile rank for each target percentile - for target_perc in target_percentiles: - # Calculate the difference to target percentile - mean_consumable_availability['diff_to_target_' + str(target_perc)] = np.abs( - mean_consumable_availability['percentile_rank'] - target_perc) - - # Find the row with the minimum difference to the target percentile - closest_row = mean_consumable_availability.loc[ - mean_consumable_availability['diff_to_target_' + str(target_perc)].idxmin()] - - # Store the Facility_ID of the closest row in the dictionary for the current level - best_performing_facilities[level][item][str(target_perc) + 'th percentile'] = closest_row['Facility_ID'] - -print("Reference facilities at each level for each item: ", best_performing_facilities) - -# Obtain the updated availability estimates for level 1a for scenarios 6-8 -updated_availability_1a = 
df[['item_code', 'month']].drop_duplicates() -updated_availability_1b = df[['item_code', 'month']].drop_duplicates() -updated_availability_2 = df[['item_code', 'month']].drop_duplicates() -temporary_df = pd.DataFrame([]) -availability_dataframes = [updated_availability_1a, updated_availability_1b, updated_availability_2] - -i = 6 # start scenario counter -j = 0 # start level counter -for level in facility_levels: - for target_perc in target_percentiles: for item in item_codes: + best_performing_facilities[level][item] = {} + # Get the mean availability by Facility for the current level + mean_consumable_availability = pd.DataFrame(df[(df.Facility_ID.isin(facilities_by_level[level])) & (df.item_code == item)] + .groupby('Facility_ID')['available_prop'].mean()).reset_index() + + # Calculate the percentile rank of each row for 'available_prop' + mean_consumable_availability['percentile_rank'] = mean_consumable_availability['available_prop'].rank(pct=True) * 100 + + # Find the row which is closest to the nth percentile rank for each target percentile + for target_perc in target_percentiles: + # Calculate the difference to target percentile + mean_consumable_availability['diff_to_target_' + str(target_perc)] = np.abs( + mean_consumable_availability['percentile_rank'] - target_perc) + + # Find the row with the minimum difference to the target percentile + closest_row = mean_consumable_availability.loc[ + mean_consumable_availability['diff_to_target_' + str(target_perc)].idxmin()] + + # Store the Facility_ID of the closest row in the dictionary for the current level + best_performing_facilities[level][item][str(target_perc) + 'th percentile'] = closest_row['Facility_ID'] + + print("Reference facilities at each level for each item: ", best_performing_facilities) + + # Obtain the updated availability estimates for level 1a for scenarios 6-8 + updated_availability_1a = df[['item_code', 'month']].drop_duplicates() + updated_availability_1b = df[['item_code', 
'month']].drop_duplicates() + updated_availability_2 = df[['item_code', 'month']].drop_duplicates() + temporary_df = pd.DataFrame([]) + availability_dataframes = [updated_availability_1a, updated_availability_1b, updated_availability_2] + + i = 6 # start scenario counter + j = 0 # start level counter + for level in facility_levels: + for target_perc in target_percentiles: + for item in item_codes: + + print("Running level ", level, "; Running scenario ", str(i), "; Running item ", item) + reference_facility = df['Facility_ID'] == best_performing_facilities[level][item][str(target_perc) + 'th percentile'] + current_item = df['item_code'] == item + availability_at_reference_facility = df[reference_facility & current_item][['item_code', 'month', 'available_prop']] - print("Running level ", level, "; Running scenario ", str(i), "; Running item ", item) - reference_facility = df['Facility_ID'] == best_performing_facilities[level][item][str(target_perc) + 'th percentile'] - current_item = df['item_code'] == item - availability_at_reference_facility = df[reference_facility & current_item][['item_code', 'month', 'available_prop']] - - if temporary_df.empty: - temporary_df = availability_at_reference_facility - else: - temporary_df = pd.concat([temporary_df,availability_at_reference_facility], ignore_index = True) - - column_name = 'available_prop_scenario' + str(i) - temporary_df = temporary_df.rename(columns = {'available_prop': column_name }) - availability_dataframes[j] = availability_dataframes[j].merge(temporary_df, on = ['item_code', 'month'], how = 'left', validate = '1:1') - temporary_df = pd.DataFrame([]) - i = i + 1 - i = 6 # restart scenario counter - j = j + 1 # move to the next level - -# Merge the above scenario data to the full availability scenario dataframe -# 75, 90 and 99th percentile availability data for level 1a -df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(availability_dataframes[0],on = ['item_code', 'month'], - how = 
'left', - validate = "m:1") -# 75, 90 and 99th percentile availability data for level 1b -df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(availability_dataframes[1],on = ['item_code', 'month'], - how = 'left', - validate = "m:1") - -# For scenarios 6-8, replace the new availability probabilities by the max(original availability, availability at target percentile) -for scen in [6,7,8]: - df_new_1a['available_prop_scenario' + str(scen)] = df_new_1a.apply( - lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) - df_new_1b['available_prop_scenario' + str(scen)] = df_new_1b.apply( - lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) - -# 75, 90 and 99th percentile availability data for level 2 -df_new_2 = df[df['Facility_ID'].isin(facilities_by_level['2'])].merge(availability_dataframes[2],on = ['item_code', 'month'], - how = 'left', - validate = "m:1") - -# Generate scenarios 6-8 -#------------------------ -# scenario 6: only levels 1a and 1b changed to availability at 75th percentile for the corresponding level -# scenario 7: only levels 1a and 1b changed to availability at 90th percentile for the corresponding level -# scenario 8: only levels 1a and 1b changed to availability at 99th percentile for the corresponding level -# Scenario 6-8 availability data for other levels -df_new_otherlevels = df[~df['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b'])] -new_scenario_columns = ['available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] -for col in new_scenario_columns: - df_new_otherlevels[col] = df_new_otherlevels['available_prop'] -# Append the above dataframes -df_facility_level_benchmark_scenarios = pd.concat([df_new_1a, df_new_1b, df_new_otherlevels], ignore_index = True) - - -# Generate scenario 9 -#------------------------ -# scenario 9: levels 1a, 1b and 2 changed to availability at 99th percentile for the 
corresponding level -df_new_otherlevels = df_facility_level_benchmark_scenarios[~df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b']|facilities_by_level['2'])].reset_index(drop = True) -df_new_1a_scenario9 = df_facility_level_benchmark_scenarios[df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1a'])].reset_index(drop = True) -df_new_1b_scenario9 = df_facility_level_benchmark_scenarios[df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1b'])].reset_index(drop = True) -df_new_2_scenario9 = df_new_2[df_new_2['Facility_ID'].isin(facilities_by_level['2'])].reset_index(drop = True) -new_scenario_columns = ['available_prop_scenario9'] -for col in new_scenario_columns: - df_new_otherlevels[col] = df_new_otherlevels['available_prop'] - df_new_1a_scenario9[col] = df_new_1a_scenario9['available_prop_scenario8'] - df_new_1b_scenario9[col] = df_new_1b_scenario9['available_prop_scenario8'] - df_new_2_scenario9[col] = df_new_2_scenario9.apply(lambda row: max(row['available_prop_scenario8'], row['available_prop']), axis=1) - -# Append the above dataframes -df_new_scenarios9 = pd.concat([df_new_1a_scenario9, df_new_1b_scenario9, df_new_2_scenario9, df_new_otherlevels], ignore_index = True) - -# 6. 
Generate scenarios based on the performance of vertical programs -#*************************************************************************************** -df_program_benchmark_scenarios = tlo_availability_df.copy() -# Define common conditions needed for the next set of scenarios -cond_levels1a1b = df_program_benchmark_scenarios.Facility_Level.isin(['1a', '1b']) -cond_hiv = df_program_benchmark_scenarios.item_category == 'hiv' -cond_epi = df_program_benchmark_scenarios.item_category == 'epi' -cond_cancer = df_program_benchmark_scenarios.item_category == 'cancer' -cond_malaria = df_program_benchmark_scenarios.item_category == 'malaria' -cond_tb = df_program_benchmark_scenarios.item_category == 'tb' -cond_non_epi_hiv_cancer = ~(cond_epi | cond_hiv | cond_cancer) - -# Calculate average availabilities -avg_hiv = df_program_benchmark_scenarios[cond_hiv & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() -avg_epi = df_program_benchmark_scenarios[cond_epi & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() -avg_other_by_level = df_program_benchmark_scenarios[cond_non_epi_hiv_cancer & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() -avg_other_by_facility = df_program_benchmark_scenarios[~(cond_epi | cond_hiv | cond_cancer | cond_malaria | cond_tb)].groupby('Facility_ID')['available_prop'].mean() - -# Initialize scenario columns with baseline values -for i in range(10, 16): - df_program_benchmark_scenarios[f'available_prop_scenario{i}'] = df_program_benchmark_scenarios['available_prop'] - -# Define update logic using a loop -scenario_logic = { - 10: lambda row: max(row['available_prop'], avg_hiv[row['Facility_Level']]) - if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] else row['available_prop'], - 11: lambda row: max(row['available_prop'], avg_epi[row['Facility_Level']]) - if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] else 
row['available_prop'], - 12: lambda row: min(row['available_prop'], avg_other_by_level[row['Facility_Level']]) - if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] == 'hiv' else row['available_prop'], - 13: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']]) - if row['item_category'] == 'hiv' else row['available_prop'], - 14: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']] * 1.25) - if row['item_category'] == 'hiv' else row['available_prop'], - 15: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']] * 0.75) - if row['item_category'] == 'hiv' else row['available_prop'], -} - -# Apply logic to each scenario -for scen, func in scenario_logic.items(): - df_program_benchmark_scenarios[f'available_prop_scenario{scen}'] = df_program_benchmark_scenarios.apply(func, axis=1) - -# Add scenarios 6 to 12 to the original dataframe -#------------------------------------------------------ -# Combine all scenario suffixes into a single list -scenario_suffixes = list_of_scenario_suffixes + [f'scenario{i}' for i in range(6, 16)] -scenario_vars = [f'available_prop_{s}' for s in scenario_suffixes] -old_vars = ['Facility_ID', 'month', 'item_code'] - -# Prepare the full base dataframe from scenarios 6–8 -full_df_with_scenario = df_facility_level_benchmark_scenarios[ - old_vars + ['available_prop'] + [f'available_prop_scenario{i}' for i in range(1, 9)] -].reset_index(drop=True) - -# Add scenario 9 -full_df_with_scenario = full_df_with_scenario.merge( - df_new_scenarios9[old_vars + ['available_prop_scenario9']], - on=old_vars, how='left', validate='1:1' -) - -# Add scenarios 10–15 from the program benchmark dataframe -full_df_with_scenario = full_df_with_scenario.merge( - df_program_benchmark_scenarios[old_vars + [f'available_prop_scenario{i}' for i in range(10, 16)]], - on=old_vars, how='left', validate='1:1' -) - -# --- Check that the scenarios 6-11 always have higher prob of 
availability than baseline --- # -for scen in range(6,12): - assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] < full_df_with_scenario['available_prop']) == 0 - assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] >= full_df_with_scenario['available_prop']) == len(full_df_with_scenario) -# Check that scenarios 12-15 always have equal to or lower prob of availability than baselin --- # -for scen in range(12,16): - assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] > full_df_with_scenario['available_prop']) == 0 - assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] <= full_df_with_scenario['available_prop']) == len(full_df_with_scenario) - -# --- Check that the exported file has the properties required of it by the model code. --- # -check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids) - -# Save updated consumable availability resource file with scenario data -full_df_with_scenario.to_csv( - path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", - index=False -) -# TODO: Create a column providing the source of scenario data - -# 8. 
Plot new availability estimates by scenario -#********************************************************************************************* -full_df_with_scenario = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv") - -# Create the directory if it doesn't exist -figurespath = outputfilepath / 'consumable_scenario_analysis' -if not os.path.exists(figurespath): - os.makedirs(figurespath) - -# Prepare the availability dataframe for descriptive plots -df_for_plots = full_df_with_scenario.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") -df_for_plots = df_for_plots.merge(program_item_mapping, on = 'item_code', how = 'left', validate = "m:1") -scenario_list = [1,2,3,6,7,8,10,11,12,13,14,15] -chosen_availability_columns = ['available_prop'] + [f'available_prop_scenario{i}' for i in - scenario_list] -scenario_names_dict = {'available_prop': 'Actual', 'available_prop_scenario1': 'Non-therapeutic \n consumables', 'available_prop_scenario2': 'Vital medicines', - 'available_prop_scenario3': 'Pharmacist-\n managed', 'available_prop_scenario4': 'Level 1b', 'available_prop_scenario5': 'CHAM', - 'available_prop_scenario6': '75th percentile\n facility', 'available_prop_scenario7': '90th percentile \n facility', 'available_prop_scenario8': 'Best \n facility', - 'available_prop_scenario9': 'Best facility \n (including DHO)','available_prop_scenario10': 'HIV supply \n chain', 'available_prop_scenario11': 'EPI supply \n chain', - 'available_prop_scenario12': 'HIV moved to \n Govt supply chain \n (Avg by Level)', 'available_prop_scenario13': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)', - 'available_prop_scenario14': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)', - 'available_prop_scenario15': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)'} -# recreate the chosen columns list based on the mapping above -chosen_availability_columns = 
[scenario_names_dict[col] for col in chosen_availability_columns] -df_for_plots = df_for_plots.rename(columns = scenario_names_dict) - -# Generate a bar plot of average availability under each scenario by item_category and Facility_Level -def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): - df_for_bar_plot = _df.groupby([_x_axis_var])[chosen_availability_columns].mean() - df_for_bar_plot = df_for_bar_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=chosen_availability_columns, - var_name='Scenario', value_name='Value') - plot = (ggplot(df_for_bar_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill = 'Scenario')) - + geom_bar(stat='identity', position='dodge') - + ylim(0, 1) - + labs(title = "Probability of availability across scenarios", - x=_x_axis_var, - y='Probability of availability') - + theme(axis_text_x=element_text(angle=45, hjust=1)) - ) - - plot.save(filename= figurespath / _filename, dpi=300, width=10, height=8, units='in') -generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'item_category', _filename = 'availability_by_category.png') -generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') - -# Create heatmaps by Facility_Level of average availability by item_category across chosen scenarios -for level in fac_levels: - # Generate a heatmap + if temporary_df.empty: + temporary_df = availability_at_reference_facility + else: + temporary_df = pd.concat([temporary_df,availability_at_reference_facility], ignore_index = True) + + column_name = 'available_prop_scenario' + str(i) + temporary_df = temporary_df.rename(columns = {'available_prop': column_name }) + availability_dataframes[j] = availability_dataframes[j].merge(temporary_df, on = ['item_code', 'month'], how = 'left', validate = '1:1') + temporary_df = pd.DataFrame([]) + i = i + 1 + i = 6 # restart scenario counter + j = j + 1 # move to the next level + + # Merge the above scenario data to the full 
availability scenario dataframe + # 75, 90 and 99th percentile availability data for level 1a + df_new_1a = df[df['Facility_ID'].isin(facilities_by_level['1a'])].merge(availability_dataframes[0],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + # 75, 90 and 99th percentile availability data for level 1b + df_new_1b = df[df['Facility_ID'].isin(facilities_by_level['1b'])].merge(availability_dataframes[1],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + + # For scenarios 6-8, replace the new availability probabilities by the max(original availability, availability at target percentile) + for scen in [6,7,8]: + df_new_1a['available_prop_scenario' + str(scen)] = df_new_1a.apply( + lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) + df_new_1b['available_prop_scenario' + str(scen)] = df_new_1b.apply( + lambda row: max(row['available_prop_scenario' + str(scen) ], row['available_prop']), axis=1) + + # 75, 90 and 99th percentile availability data for level 2 + df_new_2 = df[df['Facility_ID'].isin(facilities_by_level['2'])].merge(availability_dataframes[2],on = ['item_code', 'month'], + how = 'left', + validate = "m:1") + + # Generate scenarios 6-8 + #------------------------ + # scenario 6: only levels 1a and 1b changed to availability at 75th percentile for the corresponding level + # scenario 7: only levels 1a and 1b changed to availability at 90th percentile for the corresponding level + # scenario 8: only levels 1a and 1b changed to availability at 99th percentile for the corresponding level + # Scenario 6-8 availability data for other levels + df_new_otherlevels = df[~df['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b'])] + new_scenario_columns = ['available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8'] + for col in new_scenario_columns: + df_new_otherlevels[col] = df_new_otherlevels['available_prop'] + # Append the above dataframes + 
df_facility_level_benchmark_scenarios = pd.concat([df_new_1a, df_new_1b, df_new_otherlevels], ignore_index = True) + + + # Generate scenario 9 + #------------------------ + # scenario 9: levels 1a, 1b and 2 changed to availability at 99th percentile for the corresponding level + df_new_otherlevels = df_facility_level_benchmark_scenarios[~df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1a']|facilities_by_level['1b']|facilities_by_level['2'])].reset_index(drop = True) + df_new_1a_scenario9 = df_facility_level_benchmark_scenarios[df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1a'])].reset_index(drop = True) + df_new_1b_scenario9 = df_facility_level_benchmark_scenarios[df_facility_level_benchmark_scenarios['Facility_ID'].isin(facilities_by_level['1b'])].reset_index(drop = True) + df_new_2_scenario9 = df_new_2[df_new_2['Facility_ID'].isin(facilities_by_level['2'])].reset_index(drop = True) + new_scenario_columns = ['available_prop_scenario9'] + for col in new_scenario_columns: + df_new_otherlevels[col] = df_new_otherlevels['available_prop'] + df_new_1a_scenario9[col] = df_new_1a_scenario9['available_prop_scenario8'] + df_new_1b_scenario9[col] = df_new_1b_scenario9['available_prop_scenario8'] + df_new_2_scenario9[col] = df_new_2_scenario9.apply(lambda row: max(row['available_prop_scenario8'], row['available_prop']), axis=1) + + # Append the above dataframes + df_new_scenarios9 = pd.concat([df_new_1a_scenario9, df_new_1b_scenario9, df_new_2_scenario9, df_new_otherlevels], ignore_index = True) + + # 6. 
Generate scenarios based on the performance of vertical programs + #*************************************************************************************** + df_program_benchmark_scenarios = tlo_availability_df.copy() + # Define common conditions needed for the next set of scenarios + cond_levels1a1b = df_program_benchmark_scenarios.Facility_Level.isin(['1a', '1b']) + cond_hiv = df_program_benchmark_scenarios.item_category == 'hiv' + cond_epi = df_program_benchmark_scenarios.item_category == 'epi' + cond_cancer = df_program_benchmark_scenarios.item_category == 'cancer' + cond_malaria = df_program_benchmark_scenarios.item_category == 'malaria' + cond_tb = df_program_benchmark_scenarios.item_category == 'tb' + cond_non_epi_hiv_cancer = ~(cond_epi | cond_hiv | cond_cancer) + + # Calculate average availabilities + avg_hiv = df_program_benchmark_scenarios[cond_hiv & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() + avg_epi = df_program_benchmark_scenarios[cond_epi & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() + avg_other_by_level = df_program_benchmark_scenarios[cond_non_epi_hiv_cancer & cond_levels1a1b].groupby('Facility_Level')['available_prop'].mean() + avg_other_by_facility = df_program_benchmark_scenarios[~(cond_epi | cond_hiv | cond_cancer | cond_malaria | cond_tb)].groupby('Facility_ID')['available_prop'].mean() + + # Initialize scenario columns with baseline values + for i in range(10, 16): + df_program_benchmark_scenarios[f'available_prop_scenario{i}'] = df_program_benchmark_scenarios['available_prop'] + + # Define update logic using a loop + scenario_logic = { + 10: lambda row: max(row['available_prop'], avg_hiv[row['Facility_Level']]) + if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] else row['available_prop'], + 11: lambda row: max(row['available_prop'], avg_epi[row['Facility_Level']]) + if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] not in ['hiv', 'epi'] 
else row['available_prop'], + 12: lambda row: min(row['available_prop'], avg_other_by_level[row['Facility_Level']]) + if row['Facility_Level'] in ['1a', '1b'] and row['item_category'] == 'hiv' else row['available_prop'], + 13: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']]) + if row['item_category'] == 'hiv' else row['available_prop'], + 14: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']] * 1.25) + if row['item_category'] == 'hiv' else row['available_prop'], + 15: lambda row: min(row['available_prop'], avg_other_by_facility[row['Facility_ID']] * 0.75) + if row['item_category'] == 'hiv' else row['available_prop'], + } + + # Apply logic to each scenario + for scen, func in scenario_logic.items(): + df_program_benchmark_scenarios[f'available_prop_scenario{scen}'] = df_program_benchmark_scenarios.apply(func, axis=1) + + # Add scenarios 6 to 12 to the original dataframe + #------------------------------------------------------ + # Combine all scenario suffixes into a single list + scenario_suffixes = list_of_scenario_suffixes + [f'scenario{i}' for i in range(6, 16)] + scenario_vars = [f'available_prop_{s}' for s in scenario_suffixes] + old_vars = ['Facility_ID', 'month', 'item_code'] + + # Prepare the full base dataframe from scenarios 6–8 + full_df_with_scenario = df_facility_level_benchmark_scenarios[ + old_vars + ['available_prop'] + [f'available_prop_scenario{i}' for i in range(1, 9)] + ].reset_index(drop=True) + + # Add scenario 9 + full_df_with_scenario = full_df_with_scenario.merge( + df_new_scenarios9[old_vars + ['available_prop_scenario9']], + on=old_vars, how='left', validate='1:1' + ) + + # Add scenarios 10–15 from the program benchmark dataframe + full_df_with_scenario = full_df_with_scenario.merge( + df_program_benchmark_scenarios[old_vars + [f'available_prop_scenario{i}' for i in range(10, 16)]], + on=old_vars, how='left', validate='1:1' + ) + + # --- Check that the scenarios 6-11 always 
have higher prob of availability than baseline --- # + for scen in range(6,12): + assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] < full_df_with_scenario['available_prop']) == 0 + assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] >= full_df_with_scenario['available_prop']) == len(full_df_with_scenario) + # Check that scenarios 12-15 always have equal to or lower prob of availability than baselin --- # + for scen in range(12,16): + assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] > full_df_with_scenario['available_prop']) == 0 + assert sum(full_df_with_scenario['available_prop_scenario' + str(scen)] <= full_df_with_scenario['available_prop']) == len(full_df_with_scenario) + + # --- Check that the exported file has the properties required of it by the model code. --- # + check_format_of_consumables_file(df=full_df_with_scenario, fac_ids=fac_ids) + + # Return updated consumable availability resource file with scenario data + return full_df_with_scenario + +def generate_descriptive_consumable_availability_plots(tlo_availability_df: pd.DataFrame = None, + figurespath: str = None, + mfl: pd.DataFrame = None, + program_item_mapping: pd.DataFrame = None, + chosen_availability_columns:list = None, + scenario_names_dict:dict = None,): + # Prepare the availability dataframe for descriptive plots + df_for_plots = tlo_availability_df.merge(mfl[['Facility_ID', 'Facility_Level']], on = 'Facility_ID', how = 'left', validate = "m:1") + df_for_plots = df_for_plots.merge(program_item_mapping, on = 'item_code', how = 'left', validate = "m:1") + if chosen_availability_columns is None: + # All availability columns + chosen_availability_columns = [c for c in df_for_plots.columns if c.startswith("available_prop")] + + # recreate the chosen columns list based on the mapping above + if scenario_names_dict is not None: + chosen_availability_columns = [scenario_names_dict[col] for col in chosen_availability_columns] + 
df_for_plots = df_for_plots.rename(columns = scenario_names_dict) + + ''' + # Generate a bar plot of average availability under each scenario by item_category and Facility_Level + def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): + df_for_bar_plot = _df.groupby([_x_axis_var])[chosen_availability_columns].mean() + df_for_bar_plot = df_for_bar_plot.reset_index().melt(id_vars=[_x_axis_var], value_vars=chosen_availability_columns, + var_name='Scenario', value_name='Value') + plot = (ggplot(df_for_bar_plot.reset_index(), aes(x=_x_axis_var, y='Value', fill = 'Scenario')) + + geom_bar(stat='identity', position='dodge') + + ylim(0, 1) + + labs(title = "Probability of availability across scenarios", + x=_x_axis_var, + y='Probability of availability') + + theme(axis_text_x=element_text(angle=45, hjust=1)) + ) + + plot.save(filename= figurespath / _filename, dpi=300, width=10, height=8, units='in') + generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'item_category', _filename = 'availability_by_category.png') + generate_barplot_of_scenarios(_df = df_for_plots, _x_axis_var = 'Facility_Level', _filename = 'availability_by_level.png') + ''' + + # Plot 1 + # Create heatmaps by Facility_Level of average availability by item_category across chosen scenarios + fac_levels = {'0', '1a', '1b', '2', '3', '4'} + for level in fac_levels: + # Generate a heatmap + # Pivot the DataFrame + aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[chosen_availability_columns].mean().reset_index() + aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin([level])] + heatmap_data = aggregated_df.set_index('item_category').drop(columns = 'Facility_Level') + + # Calculate the aggregate row and column + aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), chosen_availability_columns].mean() + #overall_aggregate = aggregate_col.mean() + + # Add aggregate row and column + #heatmap_data['Average'] = aggregate_row + 
#aggregate_col['Average'] = overall_aggregate + heatmap_data.loc['Average'] = aggregate_col + + # Generate the heatmap + sns.set(font_scale=0.8) + plt.figure(figsize=(10, 8)) + sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) + + # Customize the plot + plt.title(f'Facility Level {level}') + plt.xlabel('Scenarios') + plt.ylabel('Disease/Public health \n program') + plt.xticks(rotation=90, fontsize=8) + plt.yticks(rotation=0, fontsize=8) + + plt.savefig(figurespath /f'consumable_availability_heatmap_{level}.png', dpi=300, bbox_inches='tight') + plt.close() + + # Plot 2 + # Create heatmap of average availability by Facility_Level across chosen scenarios # Pivot the DataFrame - aggregated_df = df_for_plots.groupby(['item_category', 'Facility_Level'])[chosen_availability_columns].mean().reset_index() - aggregated_df = aggregated_df[aggregated_df.Facility_Level.isin([level])] - heatmap_data = aggregated_df.set_index('item_category').drop(columns = 'Facility_Level') + aggregated_df = df_for_plots.groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index() + heatmap_data = aggregated_df.set_index('Facility_Level') # Calculate the aggregate row and column - aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), chosen_availability_columns].mean() + aggregate_col= df_for_plots[chosen_availability_columns].mean() #overall_aggregate = aggregate_col.mean() # Add aggregate row and column @@ -705,346 +727,91 @@ def generate_barplot_of_scenarios(_df, _x_axis_var, _filename): sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) # Customize the plot - plt.title(f'Facility Level {level}') + plt.title('Availability across scenarios') plt.xlabel('Scenarios') - plt.ylabel('Disease/Public health \n program') + plt.ylabel('Facility Level') plt.xticks(rotation=90, fontsize=8) plt.yticks(rotation=0, 
fontsize=8) - plt.savefig(figurespath /f'consumable_availability_heatmap_{level}.png', dpi=300, bbox_inches='tight') + plt.savefig(figurespath /'consumable_availability_heatmap_alllevels.png', dpi=300, bbox_inches='tight') plt.close() -# Create heatmap of average availability by Facility_Level across chosen scenarios -# Pivot the DataFrame -aggregated_df = df_for_plots.groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index() -heatmap_data = aggregated_df.set_index('Facility_Level') - -# Calculate the aggregate row and column -aggregate_col= df_for_plots[chosen_availability_columns].mean() -#overall_aggregate = aggregate_col.mean() - -# Add aggregate row and column -#heatmap_data['Average'] = aggregate_row -#aggregate_col['Average'] = overall_aggregate -heatmap_data.loc['Average'] = aggregate_col - -# Generate the heatmap -sns.set(font_scale=0.8) -plt.figure(figsize=(10, 8)) -sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) - -# Customize the plot -plt.title('Availability across scenarios') -plt.xlabel('Scenarios') -plt.ylabel('Facility Level') -plt.xticks(rotation=90, fontsize=8) -plt.yticks(rotation=0, fontsize=8) - -plt.savefig(figurespath /'consumable_availability_heatmap_alllevels.png', dpi=300, bbox_inches='tight') -plt.close() - -# Create heatmap of average availability by Facility_Level and program for actual and 75th percentile (Costing paper) -clean_category_names = {'cancer': 'Cancer', 'cardiometabolicdisorders': 'Cardiometabolic Disorders', - 'contraception': 'Contraception', 'general': 'General', 'hiv': 'HIV', 'malaria': 'Malaria', - 'ncds': 'Non-communicable Diseases', 'neonatal_health': 'Neonatal Health', - 'other_childhood_illnesses': 'Other Childhood Illnesses', 'reproductive_health': 'Reproductive Health', - 'road_traffic_injuries': 'Road Traffic Injuries', 'tb': 'Tuberculosis', - 'undernutrition': 'Undernutrition', 'epi': 'Expanded programme on 
immunization'} -df_with_cleaned_item_category = df_for_plots.copy() -df_with_cleaned_item_category['item_category'] = df_for_plots['item_category'].map(clean_category_names) - -# Actual -aggregated_df = df_with_cleaned_item_category.groupby(['Facility_Level', 'item_category'])['Actual'].mean().reset_index() -heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='Actual') - -# Calculate the aggregate row and column -aggregate_col= df_with_cleaned_item_category.groupby('item_category')['Actual'].mean() -overall_aggregate = df_with_cleaned_item_category['Actual'].mean() -aggregate_row = df_with_cleaned_item_category.groupby('Facility_Level')['Actual'].mean() - -# Add aggregate row and column -heatmap_data['Average'] = aggregate_col -aggregate_row['Average'] = overall_aggregate -heatmap_data.loc['Average'] = aggregate_row - -# Generate the heatmap -sns.set(font_scale=1.2) -plt.figure(figsize=(10, 8)) -sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) - -# Customize the plot -plt.xlabel('Facility Level') -plt.ylabel('Program') -plt.xticks(rotation=90) -plt.yticks(rotation=0) - -plt.savefig(figurespath /'heatmap_program_and_level_actual.png', dpi=300, bbox_inches='tight') -plt.show() -plt.close() - -# 75th percentile -aggregated_df = df_with_cleaned_item_category.groupby(['Facility_Level', 'item_category'])['75th percentile\n facility'].mean().reset_index() -heatmap_data = aggregated_df.pivot(index='item_category', columns='Facility_Level', values='75th percentile\n facility') - -# Calculate the aggregate row and column -aggregate_col= df_with_cleaned_item_category.groupby('item_category')['75th percentile\n facility'].mean() -overall_aggregate = df_with_cleaned_item_category['75th percentile\n facility'].mean() -aggregate_row = df_with_cleaned_item_category.groupby('Facility_Level')['75th percentile\n facility'].mean() - - -# Add aggregate row and column 
-heatmap_data['Average'] = aggregate_col -aggregate_row['Average'] = overall_aggregate -heatmap_data.loc['Average'] = aggregate_row - -# Generate the heatmap -sns.set(font_scale=1.2) -plt.figure(figsize=(10, 8)) -sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) - -# Customize the plot -plt.xlabel('Facility Level') -plt.ylabel('Program') -plt.xticks(rotation=90) -plt.yticks(rotation=0) - -plt.savefig(figurespath /'heatmap_program_and_level_75perc.png', dpi=300, bbox_inches='tight') -plt.show() -plt.close() - - -# Create a heatmap of average availability by item_category and scenario -# Base scenario list -base_scenarios = [['Actual']] -# Additional scenarios to add iteratively -additional_scenarios = [ - ['Non-therapeutic \n consumables', 'Vital medicines', 'Pharmacist-\n managed'], - ['75th percentile\n facility', '90th percentile \n facility', 'Best \n facility'], - ['HIV supply \n chain', 'EPI supply \n chain', 'HIV moved to \n Govt supply chain \n (Avg by Level)'] -] -# Generate iteratively chosen availability columns -iteratively_chosen_availability_columns = [ - base_scenarios[0] + sum(additional_scenarios[:i], []) for i in range(len(additional_scenarios) + 1) -] - -i = 1 -for column_list in iteratively_chosen_availability_columns: - # Create heatmap of average availability by item_category across chosen scenarios + # Plot 3 + # Create a barplot of average consumable availability based on the colours used in analysis_impact_of_consumable_scenarios + average_availability = df_for_plots[chosen_availability_columns].mean().reset_index() + average_availability.columns = ['scenario', 'average_availability'] + new_row = pd.DataFrame([['Perfect', 1]], columns=['scenario', 'average_availability']) # new row for perfect availability + average_availability = pd.concat([average_availability, new_row], axis=0, ignore_index=True) # Concatenate the new row with the existing DataFrame + + # Define 
color mapping for each scenario + color_mapping = { + 'Actual': '#1f77b4', + 'Non-therapeutic \n consumables': '#ff7f0e', + 'Vital medicines': '#2ca02c', + 'Pharmacist-\n managed': '#d62728', + '75th percentile\n facility': '#9467bd', + '90th percentile \n facility': '#8c564b', + 'Best \n facility': '#e377c2', + 'Best facility \n (including DHO)': '#7f7f7f', + 'HIV supply \n chain': '#bcbd22', + 'EPI supply \n chain': '#17becf', + 'HIV moved to \n Govt supply chain \n (Avg by Level)': '#ff6347', + 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)': '#ff7f50', + 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)': '#fa8072', + 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)': '#cd5c5c', + 'Perfect': '#31a354' + } + + # Create a color list for the bars + colors = [ + color_mapping.get(scenario, '#808080') # default to grey if scenario not in mapping + for scenario in average_availability['scenario'] + ] + # Create the bar plot and capture the bars + plt.figure(figsize=(10, 6)) + bars = plt.bar(average_availability['scenario'], average_availability['average_availability'], color=colors) + plt.title('Average Availability by Scenario') + plt.xlabel('Scenario') + plt.ylabel('Average Availability') + plt.xticks(rotation=90, fontsize=8) + plt.ylim(0, 1) # Adjust based on your data range + plt.grid(axis='y') + # Add data labels + for bar in bars: + yval = bar.get_height() + plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.02, round(yval, 2), ha='center', va='bottom') + + # Save the plot + plt.tight_layout() + plt.tight_layout() + plt.savefig(figurespath / 'scenarios_average_availability.png', dpi=300, bbox_inches='tight') + + # Plot 4 + # Create heatmap of average availability by item for HIV program across chosen scenarios # Pivot the DataFrame - chosen_levels = ['1a', '1b'] - aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin(chosen_levels)] - aggregated_df = 
aggregated_df.groupby(['item_category'])[column_list].mean().reset_index() - heatmap_data = aggregated_df.set_index('item_category') + aggregated_df = df_for_plots[df_for_plots.item_category == 'hiv'].groupby(['item_code'])[chosen_availability_columns].mean().reset_index() + heatmap_data = aggregated_df.set_index('item_code') # Calculate the aggregate row and column - aggregate_col= df_for_plots.loc[df_for_plots.Facility_Level.isin(chosen_levels), column_list].mean() + aggregate_col= df_for_plots[chosen_availability_columns].mean() #overall_aggregate = aggregate_col.mean() # Add aggregate row and column - heatmap_data['Perfect'] = 1 # Add a column representing the perfect scenario - aggregate_col['Perfect'] = 1 + #heatmap_data['Average'] = aggregate_row + #aggregate_col['Average'] = overall_aggregate heatmap_data.loc['Average'] = aggregate_col - # Ensure all scenarios are represented for consistent column widths - all_scenarios = chosen_availability_columns + ['Perfect'] - heatmap_data = heatmap_data.reindex(columns=all_scenarios) - - # Update column names for x-axis labels # Generate the heatmap - sns.set(font_scale=1.2) + sns.set(font_scale=0.8) plt.figure(figsize=(10, 8)) sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) # Customize the plot plt.title('Availability across scenarios') plt.xlabel('Scenarios') - plt.ylabel('Facility Level') - plt.xticks(rotation=90) - plt.yticks(rotation=0) + plt.ylabel('Item Code') + plt.xticks(rotation=90, fontsize=8) + plt.yticks(rotation=0, fontsize=8) - plt.savefig(figurespath /f'consumable_availability_heatmap_bycategory_iter{i}.png', dpi=300, bbox_inches='tight') + plt.savefig(figurespath /'consumable_availability_heatmap_hiv_alllevels_byconsumable.png', dpi=300, bbox_inches='tight') plt.close() - i = i + 1 - -# Create a barplot of average consumable availability based on the colours used in analysis_impact_of_consumable_scenarios 
-average_availability = df_for_plots[chosen_availability_columns].mean().reset_index() -average_availability.columns = ['scenario', 'average_availability'] -new_row = pd.DataFrame([['Perfect', 1]], columns=['scenario', 'average_availability']) # new row for perfect availability -average_availability = pd.concat([average_availability, new_row], axis=0, ignore_index=True) # Concatenate the new row with the existing DataFrame - -# Define color mapping for each scenario -color_mapping = { - 'Actual': '#1f77b4', - 'Non-therapeutic \n consumables': '#ff7f0e', - 'Vital medicines': '#2ca02c', - 'Pharmacist-\n managed': '#d62728', - '75th percentile\n facility': '#9467bd', - '90th percentile \n facility': '#8c564b', - 'Best \n facility': '#e377c2', - 'Best facility \n (including DHO)': '#7f7f7f', - 'HIV supply \n chain': '#bcbd22', - 'EPI supply \n chain': '#17becf', - 'HIV moved to \n Govt supply chain \n (Avg by Level)': '#ff6347', # original tomato - 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)': '#ff7f50', # coral - 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)': '#fa8072', # salmon - 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)': '#cd5c5c', # indian red - 'Perfect': '#31a354' -} - -# Create a color list for the bars -colors = [color_mapping[scenario] for scenario in average_availability['scenario']] -# Create the bar plot and capture the bars -plt.figure(figsize=(10, 6)) -bars = plt.bar(average_availability['scenario'], average_availability['average_availability'], color=colors) -plt.title('Average Availability by Scenario') -plt.xlabel('Scenario') -plt.ylabel('Average Availability') -plt.xticks(rotation=90, fontsize=8) -plt.ylim(0, 1) # Adjust based on your data range -plt.grid(axis='y') -# Add data labels -for bar in bars: - yval = bar.get_height() - plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.02, round(yval, 2), ha='center', va='bottom') - -# Save the plot -plt.tight_layout() -plt.tight_layout() 
-plt.savefig(figurespath / 'scenarios_average_availability.png', dpi=300, bbox_inches='tight') - - -# Create the directory if it doesn't exist -roi_plots_path = outputfilepath / 'horizontal_v_vertical/roi/' -if not os.path.exists(roi_plots_path): - os.makedirs(roi_plots_path) - -# Create a combined plot of heatmaps of average availability for levels 1a and 1b under actual, 75th percentile, HIV and EPI scenarios -# Scenario list -scenarios_for_roi_paper = ['Actual', '75th percentile\n facility', 'HIV supply \n chain', 'EPI supply \n chain'] -# Define facility levels -chosen_levels = ['1a', '1b'] - -# Create a figure with subplots for each level -fig, axes = plt.subplots(nrows=1, ncols=len(chosen_levels), figsize=(20, 8), sharex=True, sharey=True) -# Create a single colorbar axis -cbar_ax = fig.add_axes([.91, .3, .02, .4]) # Position of the colorbar - -for ax, level in zip(axes, chosen_levels): - # Filter data for the current facility level - aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin([level])] - aggregated_df = aggregated_df.groupby(['item_category'])[scenarios_for_roi_paper].mean().reset_index() - heatmap_data = aggregated_df.set_index('item_category') - - # Calculate the aggregate row - aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), scenarios_for_roi_paper].mean() - heatmap_data.loc['Average'] = aggregate_col - heatmap_data = heatmap_data.rename(columns = {'75th percentile\n facility': "Consumables increased \n to 75th percentile", - 'HIV supply \n chain': "Consuambles increased \n to HIV level", - 'EPI supply \n chain': "Consumables increased \n to EPI level"}) - - # Generate the heatmap on the current subplot - sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', vmin = 0, vmax = 1, - ax=ax, cbar=(ax == axes[-1]), cbar_ax=(cbar_ax if ax == axes[-1] else None), - annot_kws={"size": 14}) - - # Set labels - ax.set_title(f'Level {level}') - ax.set_xlabel('Scenarios') - ax.set_ylabel('Program' if ax == axes[0] else "") 
- -cbar_ax.set_ylabel('Proportion of days consumable is available') -# Save the combined heatmap -plt.savefig(roi_plots_path / 'combined_consumable_availability_heatmap_1a_1b.png', dpi=300, bbox_inches='tight') -plt.close() - -# Create a combined plot of heatmaps of average availability for all levels under actual, 75th percentile, HIV and EPI scenarios -chosen_levels = ['0', '1a', '1b', '2', '3'] -# Create a figure with subplots -fig, axes = plt.subplots(nrows=1, ncols=len(chosen_levels), figsize=(20, 8), sharex=True, sharey=True) - -# Create a single colorbar axis -cbar_ax = fig.add_axes([.91, .3, .02, .4]) # Position of the colorbar - -for ax, level in zip(axes, chosen_levels): - # Filter data for the current facility level - aggregated_df = df_for_plots[df_for_plots.Facility_Level.isin([level])] - aggregated_df = aggregated_df.groupby(['item_category'])[scenarios_for_roi_paper].mean().reset_index() - heatmap_data = aggregated_df.set_index('item_category') - - # Calculate the aggregate row - aggregate_col = df_for_plots.loc[df_for_plots.Facility_Level.isin([level]), scenarios_for_roi_paper].mean() - heatmap_data.loc['Average'] = aggregate_col - - # Generate the heatmap on the current subplot - sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', ax=ax, cbar=(ax == axes[-1]), cbar_ax=(cbar_ax if ax == axes[-1] else None)) - - # Set labels - ax.set_title(f'Level {level}') - ax.set_xlabel('Scenarios') - ax.set_ylabel('Program' if ax == axes[0] else "") - -# Adjust layout -cbar_ax.set_ylabel('Proportion of days consumable is available') -# Save the combined heatmap -plt.savefig(roi_plots_path / 'combined_consumable_availability_heatmap_all_levels.png', dpi=300, bbox_inches='tight') -plt.close() - - -# Create heatmap of average availability by Facility_Level across chosen scenarios -# Pivot the DataFrame -aggregated_df = df_for_plots[df_for_plots.item_category == 'hiv'].groupby(['Facility_Level'])[chosen_availability_columns].mean().reset_index() -heatmap_data = 
aggregated_df.set_index('Facility_Level') - -# Calculate the aggregate row and column -aggregate_col= df_for_plots[chosen_availability_columns].mean() -#overall_aggregate = aggregate_col.mean() - -# Add aggregate row and column -#heatmap_data['Average'] = aggregate_row -#aggregate_col['Average'] = overall_aggregate -heatmap_data.loc['Average'] = aggregate_col - -# Generate the heatmap -sns.set(font_scale=0.8) -plt.figure(figsize=(10, 8)) -sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) - -# Customize the plot -plt.title('Availability across scenarios') -plt.xlabel('Scenarios') -plt.ylabel('Facility Level') -plt.xticks(rotation=90, fontsize=8) -plt.yticks(rotation=0, fontsize=8) - -plt.savefig(figurespath /'consumable_availability_heatmap_hiv_alllevels.png', dpi=300, bbox_inches='tight') -plt.close() - - -# Create heatmap of average availability by Facility_Level across chosen scenarios -# Pivot the DataFrame -aggregated_df = df_for_plots[df_for_plots.item_category == 'hiv'].groupby(['item_code'])[chosen_availability_columns].mean().reset_index() -heatmap_data = aggregated_df.set_index('item_code') - -# Calculate the aggregate row and column -aggregate_col= df_for_plots[chosen_availability_columns].mean() -#overall_aggregate = aggregate_col.mean() - -# Add aggregate row and column -#heatmap_data['Average'] = aggregate_row -#aggregate_col['Average'] = overall_aggregate -heatmap_data.loc['Average'] = aggregate_col - -# Generate the heatmap -sns.set(font_scale=0.8) -plt.figure(figsize=(10, 8)) -sns.heatmap(heatmap_data, annot=True, cmap='RdYlGn', cbar_kws={'label': 'Proportion of days on which consumable is available'}) - -# Customize the plot -plt.title('Availability across scenarios') -plt.xlabel('Scenarios') -plt.ylabel('Item Code') -plt.xticks(rotation=90, fontsize=8) -plt.yticks(rotation=0, fontsize=8) - -plt.savefig(figurespath 
/'consumable_availability_heatmap_hiv_alllevels_byconsumable.png', dpi=300, bbox_inches='tight') -plt.close() From bed835c887f44c67ff5fdddde9fa69b2f50b6196 Mon Sep 17 00:00:00 2001 From: sakshimohan Date: Fri, 2 Jan 2026 14:41:19 +0000 Subject: [PATCH 07/15] remove test ResourceFiles applying different level 1b-2 weights. --- ...e_Consumables_availability_small_district_1b_to_2_ratio.csv | 3 --- .../ResourceFile_Consumables_availability_small_level2.csv | 3 --- ...e_Consumables_availability_small_national_1b_to_2_ratio.csv | 3 --- 3 files changed, 9 deletions(-) delete mode 100644 resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv delete mode 100644 resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv delete mode 100644 resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv deleted file mode 100644 index 093e694227..0000000000 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_district_1b_to_2_ratio.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2dca9728a126e5cfdfc9536b5a6a25b49370efa1fcf69b9e123e81723a3e79d2 -size 6280722 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv deleted file mode 100644 index e85bcb4f1f..0000000000 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_level2.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77860b34c463ed9b045d0e117a7fb9c5eb5ea95f4be6d669b390dee789351515 -size 5967523 
diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv deleted file mode 100644 index 68ad830085..0000000000 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_national_1b_to_2_ratio.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:600197dda8d1a7c50661ac4e565e3aa97cc329eb557f6da82d858b2ea7a56274 -size 6514451 From c446bda97276c82ff2de715ad44ba13bc4df4fcc Mon Sep 17 00:00:00 2001 From: Tim Hallett <39991060+tbhallett@users.noreply.github.com> Date: Wed, 7 Jan 2026 12:39:03 +0000 Subject: [PATCH 08/15] update logical check for changes in consumables availabiltity in routine `update_consumables_availability_to_represent_merging_of_levels_1b_and_2` --- src/tlo/methods/healthsystem.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index 98c07aa4df..e03d8078b0 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -1129,7 +1129,7 @@ def update_consumables_availability_to_represent_merging_of_levels_1b_and_2(self assert (df_updated.columns == df_original.columns).all() assert (df_updated.dtypes == df_original.dtypes).all() - # check values the same for everything apart from the facility level '2' facilities + # check values the same for everything apart from the facility level 'LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2' facilities_with_any_differences = set( df_updated.loc[ ~( @@ -1137,10 +1137,10 @@ def update_consumables_availability_to_represent_merging_of_levels_1b_and_2(self ).all(axis=1), 'Facility_ID'] ) - level2_facilities = set( - mfl.loc[mfl['Facility_Level'] == '2', 'Facility_ID'] + updated_facilities = set( + mfl.loc[mfl['Facility_Level'] == LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2, 
'Facility_ID'] ) - assert facilities_with_any_differences.issubset(level2_facilities) + assert facilities_with_any_differences.issubset(updated_facilities) return df_updated From d65e610222af7293b7d87e2a6012d1b740abf84d Mon Sep 17 00:00:00 2001 From: Tim Hallett <39991060+tbhallett@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:08:48 +0000 Subject: [PATCH 09/15] Add note --- src/tlo/methods/healthsystem.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index 393d015c08..f11b20786a 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -52,7 +52,8 @@ # those of '2' and have more overall capacity, so # probably account for the majority of the # interactions. - +# Note that, as of PR #1743, this should not be changed, as the availability of consumables at level 1b is now +# encoded to reflect a (weighted) average of the availability of levels '1b' and '2'. def pool_capabilities_at_levels_1b_and_2(df_original: pd.DataFrame) -> pd.DataFrame: """Return a modified version of the imported capabilities DataFrame to reflect that the capabilities of level 1b From ea3a335c5ed8659c3651e4668292920d57c4e774 Mon Sep 17 00:00:00 2001 From: Tim Hallett <39991060+tbhallett@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:09:33 +0000 Subject: [PATCH 10/15] linting --- .../consumables_availability_estimation.py | 7 +++++-- ...onsumable_availability_scenarios_for_impact_analysis.py | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index 867733d62f..a5862a02a9 100644 --- 
a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -39,14 +39,17 @@ import os from collections import defaultdict from pathlib import Path +from typing import List, Optional import matplotlib.pyplot as plt import numpy as np import pandas as pd -from typing import Optional, List +from scripts.data_file_processing.healthsystem.consumables.generating_consumable_scenarios import ( + generate_alternative_availability_scenarios, + generate_descriptive_consumable_availability_plots, +) from tlo.methods.consumables import check_format_of_consumables_file -from scripts.data_file_processing.healthsystem.consumables.generating_consumable_scenarios import generate_alternative_availability_scenarios, generate_descriptive_consumable_availability_plots # Set local shared folder source path_to_share = Path( # <-- point to the shared folder diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 32281bbe96..7d1ebdf4c8 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -45,10 +45,12 @@ import numpy as np import pandas as pd import seaborn as sns -#from plotnine import aes, element_text, geom_bar, ggplot, labs, theme, ylim # for ggplots from R from tlo.methods.consumables import 
check_format_of_consumables_file +#from plotnine import aes, element_text, geom_bar, ggplot, labs, theme, ylim # for ggplots from R + + # define a timestamp for script outputs timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") From ca12578b7f744931e4bb4cc965045da28dd60b70 Mon Sep 17 00:00:00 2001 From: Tim Hallett <39991060+tbhallett@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:26:26 +0000 Subject: [PATCH 11/15] introduce switch to allow use of the original consumables dataset --- .../ResourceFile_HealthSystem_parameters.csv | 4 ++-- ...Consumables_availability_small_original.csv | 3 +++ src/tlo/methods/healthsystem.py | 18 +++++++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv diff --git a/resources/healthsystem/ResourceFile_HealthSystem_parameters.csv b/resources/healthsystem/ResourceFile_HealthSystem_parameters.csv index ad23bc747f..172381eac6 100644 --- a/resources/healthsystem/ResourceFile_HealthSystem_parameters.csv +++ b/resources/healthsystem/ResourceFile_HealthSystem_parameters.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff3be5d065013bc7c5cffd982f04fa3b37a923ebd3f37e5dbb39542703dd2310 -size 914 +oid sha256:b96e35b518fb33fe0c89b9e47d6b623ec9eb30d454b404554312787a5937404e +size 967 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv new file mode 100644 index 0000000000..6afeff00bb --- /dev/null +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc044868ad9f1c618fc9b3fd6fc0ed23f32b153c8438f52fd4b72e40d8f503f +size 55831644 diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index 
f11b20786a..a33b17195f 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -201,6 +201,12 @@ class HealthSystem(Module): Types.DATA_FRAME, "Look-up table for the designations of consumables (whether diagnostic, medicine, or other", ), + "data_source_for_cons_availability_estimates": Parameter( + Types.STRING, "Source of data on consumable availability. Options are: `original` or `updated`." + "The original source was used in the calibration and presented in the overview paper. The " + "updated source introduced in PR #1743 and better reflects the average availability of " + "consumables in the merged 1b/2 facility level." + ), "availability_estimates": Parameter( Types.DATA_FRAME, "Estimated availability of consumables in the LMIS dataset." ), @@ -656,10 +662,20 @@ def read_parameters(self, resourcefilepath: Optional[Path] = None): path_to_resourcefiles_for_healthsystem / "consumables" / "ResourceFile_Consumables_Item_Designations.csv", dtype={"Item_Code": int, "is_diagnostic": bool, "is_medicine": bool, "is_other": bool}, ).set_index("Item_Code") + + # Choose to read-in the updated availabilty estimates or the legacy availability estimates + if self.parameters["data_source_for_cons_availability_estimates"] == 'original': + filename_for_cons_availability_estimates = "ResourceFile_Consumables_availability_small_original.csv" + elif self.parameters["data_source_for_cons_availability_estimates"] == 'updated': + filename_for_cons_availability_estimates = "ResourceFile_Consumables_availability_small_updated.csv" + else: + raise ValueError("data_source_for_cons_availability_estimates should be either 'original' or 'updated'") + self.parameters["availability_estimates"] = pd.read_csv( - path_to_resourcefiles_for_healthsystem / "consumables" / "ResourceFile_Consumables_availability_small.csv" + path_to_resourcefiles_for_healthsystem / "consumables" / filename_for_cons_availability_estimates ) + # Data on the number of beds available of 
each type by facility_id self.parameters["BedCapacity"] = pd.read_csv( path_to_resourcefiles_for_healthsystem / "infrastructure_and_equipment" / "ResourceFile_Bed_Capacity.csv" From 168fd3a72bafd3a822bbb016f73771ff67389501 Mon Sep 17 00:00:00 2001 From: Tim Hallett <39991060+tbhallett@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:31:53 +0000 Subject: [PATCH 12/15] correct filename --- src/tlo/methods/healthsystem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index a33b17195f..9ee321dca6 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -667,7 +667,7 @@ def read_parameters(self, resourcefilepath: Optional[Path] = None): if self.parameters["data_source_for_cons_availability_estimates"] == 'original': filename_for_cons_availability_estimates = "ResourceFile_Consumables_availability_small_original.csv" elif self.parameters["data_source_for_cons_availability_estimates"] == 'updated': - filename_for_cons_availability_estimates = "ResourceFile_Consumables_availability_small_updated.csv" + filename_for_cons_availability_estimates = "ResourceFile_Consumables_availability_small.csv" else: raise ValueError("data_source_for_cons_availability_estimates should be either 'original' or 'updated'") From 5596451743287594327e475d4810608be733e1db Mon Sep 17 00:00:00 2001 From: Tim Hallett <39991060+tbhallett@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:32:21 +0000 Subject: [PATCH 13/15] check both consumables data files --- tests/test_consumables.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_consumables.py b/tests/test_consumables.py index a8114aa8d8..5ddc1117aa 100644 --- a/tests/test_consumables.py +++ b/tests/test_consumables.py @@ -536,6 +536,11 @@ def test_check_format_of_consumables_file(): resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_availability_small.csv'), fac_ids=fac_ids ) + 
check_format_of_consumables_file( + pd.read_csv( + resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_availability_small_original.csv'), + fac_ids=fac_ids + ) @pytest.mark.slow From 05cb10512feaaa398ebd97522b8e2332b47ebfd6 Mon Sep 17 00:00:00 2001 From: Tim Hallett <39991060+tbhallett@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:32:52 +0000 Subject: [PATCH 14/15] update old scripts to point to original consumbales file to preserve consistency. --- src/scripts/contraception/Items_InclAvailabilityData.py | 2 +- src/scripts/malaria/malaria_cons_availability.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/contraception/Items_InclAvailabilityData.py b/src/scripts/contraception/Items_InclAvailabilityData.py index 5854e2fb6c..77a2a599f6 100644 --- a/src/scripts/contraception/Items_InclAvailabilityData.py +++ b/src/scripts/contraception/Items_InclAvailabilityData.py @@ -27,7 +27,7 @@ last_line_interv_pkg_name = 'Female Condom' name_containing = 'sutur' what_to_check = 'lines' # 'lines' or 'item_name' -avail_data_filename = 'ResourceFile_Consumables_availability_small.csv' +avail_data_filename = 'ResourceFile_Consumables_availability_small_original.csv' ##### avail_data = pd.read_csv(Path( diff --git a/src/scripts/malaria/malaria_cons_availability.py b/src/scripts/malaria/malaria_cons_availability.py index 221e0c7047..799a6b6f89 100644 --- a/src/scripts/malaria/malaria_cons_availability.py +++ b/src/scripts/malaria/malaria_cons_availability.py @@ -15,7 +15,7 @@ # get consumables spreadsheet cons_availability = pd.read_csv( - resourcefilepath / "healthsystem/consumables/ResourceFile_Consumables_availability_small.csv") + resourcefilepath / "healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv") items_list = pd.read_csv( resourcefilepath / "healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv") From 61fb89412db63958a4b227c3f7d8fdcf2772c297 Mon Sep 17 00:00:00 
2001 From: Tim Hallett <39991060+tbhallett@users.noreply.github.com> Date: Wed, 7 Jan 2026 14:00:46 +0000 Subject: [PATCH 15/15] linting --- ...onsumable_availability_scenarios_for_impact_analysis.py | 5 ++--- tests/test_consumables.py | 7 +++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py index 7d1ebdf4c8..6af7387301 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py @@ -37,7 +37,6 @@ Consumable availability is measured as probability of consumable being available at any point in time. 
""" import datetime -import os from collections import defaultdict from pathlib import Path @@ -80,7 +79,7 @@ def generate_alternative_availability_scenarios(tlo_availability_df: pd.DataFram # Get TLO Facility_ID for each district and facility level mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) - fac_levels = {'0', '1a', '1b', '2', '3', '4'} + fac_levels = {'0', '1a', '1b', '2', '3', '4'} # noqa: F841 tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], on = ['Facility_ID'], how='left') @@ -601,7 +600,7 @@ def interpolate_missing_with_mean(_ser): #------------------------------------------------------ # Combine all scenario suffixes into a single list scenario_suffixes = list_of_scenario_suffixes + [f'scenario{i}' for i in range(6, 16)] - scenario_vars = [f'available_prop_{s}' for s in scenario_suffixes] + scenario_vars = [f'available_prop_{s}' for s in scenario_suffixes] # noqa: F841 old_vars = ['Facility_ID', 'month', 'item_code'] # Prepare the full base dataframe from scenarios 6–8 diff --git a/tests/test_consumables.py b/tests/test_consumables.py index 5ddc1117aa..a754af2b11 100644 --- a/tests/test_consumables.py +++ b/tests/test_consumables.py @@ -531,14 +531,13 @@ def schedule_hsi_that_will_request_consumables(sim): def test_check_format_of_consumables_file(): """Run the check on the file used by default for the Consumables data""" + path_to_file = resourcefilepath / 'healthsystem' / 'consumables' check_format_of_consumables_file( - pd.read_csv( - resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_availability_small.csv'), + pd.read_csv(path_to_file / 'ResourceFile_Consumables_availability_small.csv'), fac_ids=fac_ids ) check_format_of_consumables_file( - pd.read_csv( - resourcefilepath / 'healthsystem' / 
'consumables' / 'ResourceFile_Consumables_availability_small_original.csv'), + pd.read_csv(path_to_file / 'ResourceFile_Consumables_availability_small_original.csv'), fac_ids=fac_ids )