-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path4B_Cluster_Analysis_Groups.R
More file actions
138 lines (84 loc) · 2.89 KB
/
4B_Cluster_Analysis_Groups.R
File metadata and controls
138 lines (84 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Load Packages & Data
# NOTE: removed a duplicate library(h2o) call that appeared twice in the
# original load list; every other package load is preserved in order.
library(readr)
library(tidycensus)
library(tigris)
library(purrr)
library(foreach)
library(doParallel)
library(dplyr)
library(tidyverse)
library(magrittr)
library(sf)
library(readxl)
library(httr)
library(skimr)
library(corrr)
library(reshape2)
library(ggraph)
library(viridis)
library(summarytools)
library(cluster)
library(janitor)
library(caret)
library(e1071)
library(h2o)
library(gtools)
library(arrow)
# Load model inputs produced by earlier pipeline steps (block-group data,
# supporting BG table, and the variable-selection lookup).
usa.bg <- readRDS("./data/All_data_1.5.rds")
data <- readRDS("./data/data_BG_1.5.rds")
vars_new <- readRDS("./data/vars_new_1.5.rds")
# Variables flagged (`New Variables` == 1) for use in the classification.
v_used <- vars_new %>%
  filter(`New Variables` == 1) %>%
  pull(MEASURE)
# Convert percentage measures (0-100) to proportions (0-1) ahead of the
# logit transform applied later in the script.
divide100 <- function(x, na.rm = FALSE) x / 100
# Keep only the ID plus selected measures; `across(where(...))` replaces
# the superseded mutate_if(), and the explicit `<-` replaces magrittr's
# compound-assignment pipe for readability.
usa.bg <- usa.bg %>%
  select(all_of(c("GEOID", v_used))) %>%
  mutate(across(where(is.numeric), divide100))
################################################################################################
# Cluster Analysis
#################################################################################################
# Constrained Logit
# Logit-transform a single proportion, clamping the result to [-m, m] so
# that proportions at or near 0 / 1 do not produce infinite values.
#
# p: one proportion in [0, 1]; NA is passed through as NA.
# m: symmetric bound on the transformed value (default 6).
#
# Returns a single numeric in [-m, m] (or NA).
constrained_logit <- function(p, m = 6) {
  if (is.na(p)) {
    return(NA)
  }
  if (p <= 0) {
    return(-m)
  }
  if (p >= 1) {
    return(m)
  }
  # Bug fix: the interior clamp previously hard-coded ±6, silently
  # ignoring the m argument; it now respects the caller-supplied bound.
  min(max(log(p / (1 - p)), -m), m)
}
# Logit-transform every selected proportion for clustering.
# constrained_logit() is scalar, so it is mapped element-wise; map_dbl()
# returns a numeric vector directly, avoiding the original's list-column
# map() followed by a blanket unlist() of every list column.
usa.bg.transform <- usa.bg %>%
  select(all_of(c("GEOID", v_used))) %>%
  mutate(across(where(is.numeric), ~ map_dbl(.x, constrained_logit)))
# EXTRACT COMPLETE CASES (all GEOID checked and present in boundary data)
# Compute the complete-case mask once and reuse it for both subsets.
cc_rows <- complete.cases(usa.bg.transform)
usa.bg.cc <- usa.bg.transform[cc_rows, ]
# INCOMPLETE CASES (and count missing / problematic GEOID)
usa.bg.ic <- usa.bg.transform[!cc_rows, ]
# Write Parquet File (input data)
# write_parquet(usa.bg.cc, "data/usa.bg.cc.parquet") # Not stored in the repo as too large (https://pcwww.liv.ac.uk/~ucfnale/us_geodemographic_lfs/usa.bg.cc.parquet)
write_parquet(usa.bg.ic, "data/usa.bg.ic.parquet")
#### Clustering
# Start a local H2O cluster; 50 GB heap is sized for the national
# block-group matrix. NOTE(review): adjust max_mem_size to the host.
h2o.init(max_mem_size="50G")
# Push the complete-case input frame into the H2O cluster.
aa_h20 <- as.h2o(usa.bg.cc)
# H2O test
# Seed the search with a single-iteration k-means fit so `results` and
# `wss` hold a valid baseline before the restart loop below.
results <- h2o.kmeans(training_frame = aa_h20, k = 7, x = v_used, init = "Random",max_iterations=1,standardize = FALSE)
wss <- h2o.tot_withinss(results)
ptm <- proc.time()
# Random-restart k-means: 1000 independent randomly-initialised fits
# (up to 1000 iterations each), keeping the model with the lowest total
# within-cluster sum of squares. Data are pre-scaled, hence
# standardize = FALSE.
for(i in 1:1000) {
results_run <- h2o.kmeans(training_frame = aa_h20, k = 7, x = v_used, init = "Random",max_iterations=1000,standardize = FALSE)
wss_run <- h2o.tot_withinss(results_run)
# Keep the best (lowest-WSS) model seen so far.
if(wss_run < wss) {
wss <- wss_run
results <- results_run
}
print(i)
}
# Elapsed wall-clock time for the restart loop.
proc.time() - ptm
# Save clustering results from 10k run ( hours)
# NOTE(review): the comment above says "10k run" but the loop performs
# 1000 restarts — confirm which count was intended.
#saveRDS(results,"./data/results_10k_7C_Logit_BG.rds")
# Output Complete Case Lookup
# Rebuild the GEOID list for the complete-case rows (same filter that
# produced usa.bg.cc) so cluster labels can be joined back to geography.
ID <- usa.bg %>%
  select(all_of(c(v_used, "GEOID"))) %>%
  filter(complete.cases(.)) %>%
  select(GEOID)
# Score every complete-case block group; h2o predictions are 0-based
# cluster indices, so shift to 1-based cluster numbers for output.
clusters <- h2o.predict(results, aa_h20) %>% as_tibble()
usa.bg.cl <- tibble(
  GEOID = ID$GEOID,
  cluster = clusters$predict + 1
)
write_parquet(usa.bg.cl, "./data/usa.bg.cl.group.parquet")