-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path4B_Cluster_Analysis_Groups.R
More file actions
138 lines (84 loc) · 2.89 KB
/
4B_Cluster_Analysis_Groups.R
File metadata and controls
138 lines (84 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Load Packages & Data
# NOTE: removed a duplicate library(h2o) call that appeared twice in the
# original load list; every other package load is preserved in order.
library(readr)
library(tidycensus)
library(tigris)
library(purrr)
library(foreach)
library(doParallel)
library(dplyr)
library(tidyverse)
library(magrittr)
library(sf)
library(readxl)
library(httr)
library(skimr)
library(corrr)
library(reshape2)
library(ggraph)
library(viridis)
library(summarytools)
library(cluster)
library(janitor)
library(caret)
library(e1071)
library(h2o)
library(gtools)
library(arrow)
# Load model inputs produced by earlier pipeline steps (block-group data,
# supporting BG table, and the variable-selection lookup).
usa.bg <- readRDS("./data/All_data_1.5.rds")
data <- readRDS("./data/data_BG_1.5.rds")
vars_new <- readRDS("./data/vars_new_1.5.rds")
# Variables flagged (`New Variables` == 1) for use in the classification.
v_used <- vars_new %>%
  filter(`New Variables` == 1) %>%
  pull(MEASURE)
# Convert percentage measures (0-100) to proportions (0-1) ahead of the
# logit transform applied later in the script.
divide100 <- function(x, na.rm = FALSE) x / 100
# Keep only the ID plus selected measures; `across(where(...))` replaces
# the superseded mutate_if(), and the explicit `<-` replaces magrittr's
# compound-assignment pipe for readability.
usa.bg <- usa.bg %>%
  select(all_of(c("GEOID", v_used))) %>%
  mutate(across(where(is.numeric), divide100))
################################################################################################
# Cluster Analysis
#################################################################################################
# Constrained Logit
# Logit-transform a single proportion, clamping the result to [-m, m] so
# that proportions at or near 0 / 1 do not produce infinite values.
#
# p: one proportion in [0, 1]; NA is passed through as NA.
# m: symmetric bound on the transformed value (default 6).
#
# Returns a single numeric in [-m, m] (or NA).
constrained_logit <- function(p, m = 6) {
  if (is.na(p)) {
    return(NA)
  }
  if (p <= 0) {
    return(-m)
  }
  if (p >= 1) {
    return(m)
  }
  # Bug fix: the interior clamp previously hard-coded ±6, silently
  # ignoring the m argument; it now respects the caller-supplied bound.
  min(max(log(p / (1 - p)), -m), m)
}
# Logit-transform every selected proportion for clustering.
# constrained_logit() is scalar, so it is mapped element-wise; map_dbl()
# returns a numeric vector directly, avoiding the original's list-column
# map() followed by a blanket unlist() of every list column.
usa.bg.transform <- usa.bg %>%
  select(all_of(c("GEOID", v_used))) %>%
  mutate(across(where(is.numeric), ~ map_dbl(.x, constrained_logit)))
# EXTRACT COMPLETE CASES (all GEOID checked and present in boundary data)
# Compute the complete-case mask once and reuse it for both subsets.
cc_rows <- complete.cases(usa.bg.transform)
usa.bg.cc <- usa.bg.transform[cc_rows, ]
# INCOMPLETE CASES (and count missing / problematic GEOID)
usa.bg.ic <- usa.bg.transform[!cc_rows, ]
# Write Parquet File (input data)
# write_parquet(usa.bg.cc, "data/usa.bg.cc.parquet") # Not stored in the repo as too large (https://pcwww.liv.ac.uk/~ucfnale/us_geodemographic_lfs/usa.bg.cc.parquet)
write_parquet(usa.bg.ic, "data/usa.bg.ic.parquet")
#### Clustering
# Start a local H2O cluster; 50 GB heap is sized for the national
# block-group matrix. NOTE(review): adjust max_mem_size to the host.
h2o.init(max_mem_size="50G")
# Push the complete-case input frame into the H2O cluster.
aa_h20 <- as.h2o(usa.bg.cc)
# H2O test
# Seed the search with a single-iteration k-means fit so `results` and
# `wss` hold a valid baseline before the restart loop below.
results <- h2o.kmeans(training_frame = aa_h20, k = 7, x = v_used, init = "Random",max_iterations=1,standardize = FALSE)
wss <- h2o.tot_withinss(results)
ptm <- proc.time()
# Random-restart k-means: 1000 independent randomly-initialised fits
# (up to 1000 iterations each), keeping the model with the lowest total
# within-cluster sum of squares. Data are pre-scaled, hence
# standardize = FALSE.
for(i in 1:1000) {
results_run <- h2o.kmeans(training_frame = aa_h20, k = 7, x = v_used, init = "Random",max_iterations=1000,standardize = FALSE)
wss_run <- h2o.tot_withinss(results_run)
# Keep the best (lowest-WSS) model seen so far.
if(wss_run < wss) {
wss <- wss_run
results <- results_run
}
print(i)
}
# Elapsed wall-clock time for the restart loop.
proc.time() - ptm
# Save clustering results from 10k run ( hours)
# NOTE(review): the comment above says "10k run" but the loop performs
# 1000 restarts — confirm which count was intended.
#saveRDS(results,"./data/results_10k_7C_Logit_BG.rds")
# Output Complete Case Lookup
# Rebuild the GEOID list for the complete-case rows (same filter that
# produced usa.bg.cc) so cluster labels can be joined back to geography.
ID <- usa.bg %>%
  select(all_of(c(v_used, "GEOID"))) %>%
  filter(complete.cases(.)) %>%
  select(GEOID)
# Score every complete-case block group; h2o predictions are 0-based
# cluster indices, so shift to 1-based cluster numbers for output.
clusters <- h2o.predict(results, aa_h20) %>% as_tibble()
usa.bg.cl <- tibble(
  GEOID = ID$GEOID,
  cluster = clusters$predict + 1
)
write_parquet(usa.bg.cl, "./data/usa.bg.cl.group.parquet")