-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path3_Correlation_Final_Variable_Selection.R
More file actions
92 lines (74 loc) · 2.3 KB
/
3_Correlation_Final_Variable_Selection.R
File metadata and controls
92 lines (74 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Load Packages & Data
library(readr)
library(tidycensus)
library(tigris)
library(purrr)
library(foreach)
library(doParallel)
library(dplyr)
library(tidyverse)
library(tidygraph)
library(magrittr)
library(sf)
library(readxl)
library(httr)
library(skimr)
library(corrr)
library(reshape2)
library(ggraph)
library(viridis)
library(summarytools)
library(cluster)
library(janitor)
library(caret)
library(e1071)
# Read the three saved inputs (file names carry the 1.5 version suffix)
All_data <- readRDS("./data/All_data_1.5.rds")
data <- readRDS("./data/data_BG_1.5.rds")
vars_new <- readRDS("./data/vars_new_1.5.rds")

# Add a state ID: the first two characters of GEOID
All_data <- All_data %>%
  mutate(state = str_sub(GEOID, start = 1, end = 2))
################################################################################################
# Correlation / Input Refinement
#
# This code produces correlation descriptions and attribute summaries, and was used to refine
# the input measure list, which is versioned between 1.0 and 1.5
#################################################################################################
# Proposed input measures: the MEASURE values of the vars_new rows whose
# `New Variables` flag equals 1, returned as a plain vector
v_used <- vars_new %>%
  filter(`New Variables` == 1) %>%
  pull(MEASURE)
############################
# Check Correlations 1
############################
# Pairwise correlations among the proposed measures (columns named in v_used),
# computed by correlate() with its default settings
data_corr <- correlate(
  select(All_data, all_of(v_used))
)
# Keep only the strongly correlated pairs (|r| > 0.6) for inspection and assign
# each pair to a correlation band.
#   shave()   - blanks one triangle of the correlation table so that each
#               pair of measures is kept only once
#   stretch() - reshapes the table to long form (one row per pair, with r)
#   drop_na() - removes the blanked entries and any pair without a usable r
#
# The band labels are written out explicitly rather than left to cut()'s
# auto-generated label formatting: the "Fix level Order" step further down
# matches on these exact strings (including the space in "(0.7, Inf]"), and any
# value that fails to match there would silently become NA.
# After the filter the two middle bands, (-0.6,0] and (0,0.6], are never
# occupied; they are kept only so the break points stay unchanged.
g <- data_corr %>%
  shave() %>%
  stretch() %>%
  drop_na() %>%
  filter(abs(r) > 0.6) %>%
  mutate(
    band = cut(
      r,
      breaks = c(-Inf, -0.7, -0.6, 0, 0.6, 0.7, Inf),
      labels = c(
        "(-Inf,-0.7]", "(-0.7,-0.6]", "(-0.6,0]",
        "(0,0.6]", "(0.6,0.7]", "(0.7, Inf]"
      )
    )
  )
# Build the correlation graph from the filtered pair table, then append the
# vars_new columns as node attributes by matching node `name` to MEASURE
graph_corr <- g %>%
  as_tbl_graph() %>%
  left_join(vars_new, by = c("name" = "MEASURE"))
# Fix the level order of the edge attribute `band`: recode it as a factor whose
# levels run from the most negative band to the most positive one.
# Any value not listed in `levels` becomes NA, so these strings need to match
# the band labels created when the correlations were binned.
graph_corr <- graph_corr %>%
  activate(edges) %>%
  mutate(
    band = factor(
      band,
      levels = c("(-Inf,-0.7]", "(-0.7,-0.6]", "(0.6,0.7]", "(0.7, Inf]")
    )
  )
# Plot the correlation graph using the "kk" layout:
#   edges  - diagonal curves coloured by correlation band (plasma palette)
#   nodes  - small points coloured by CONCEPT, labelled with the measure name;
#            overlapping labels are not drawn
ggraph(graph_corr, layout = "kk") +
  geom_edge_diagonal(aes(colour = band)) +
  geom_node_point(aes(colour = CONCEPT), size = 0.7) +
  geom_node_text(aes(label = name), size = 1, check_overlap = TRUE) +
  scale_edge_colour_viridis(option = "plasma", alpha = 0.5, discrete = TRUE)
######################################
# Summary of variable characteristics
######################################
#view(dfSummary(All_data, graph.col = FALSE), file = "Summary_Inputs.html")