-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_augmentation
More file actions
executable file
·116 lines (86 loc) · 4.35 KB
/
data_augmentation
File metadata and controls
executable file
·116 lines (86 loc) · 4.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#! /bin/Rscript
library(ggplot2)
library(data.table)
library(dplyr)
library(condMVNorm) # Conditional normal
library(VGAM) # Zipf distribution
library(configr) # Read json
library(RJSONIO) # Write json
# Every relative path used further down (config/, ./data/...) is resolved
# against this build directory.
setwd(dir = "/home/work/cpp/pubsub/cmake-build-debug/")
# Start from a clean slate: recursively delete the augmented-data directory
# left behind by a previous run (a no-op when it does not exist).
unlink(x = "./data/augmented", recursive = TRUE, force = FALSE)
# Generate the augmented data set described by config/augmentation.json.
#
# Reads the first CSV file found in the configured input directory, fills
# missing values with the last observation carried forward, estimates the
# column means and the covariance matrix, and writes into the configured
# output directory:
#   * <events_filename>          events sampled from the fitted multivariate
#                                normal distribution,
#   * subscriptions/<id>.json    one parameter file per combination of the
#                                configured sweeps,
#   * subscriptions/<id>.csv     sampled subscriptions, only for combinations
#                                whose distance from the mean is zero,
#   * standard_deviations.csv    per-column standard deviations of the data.
#
# Takes no arguments; all paths are resolved against the current working
# directory. Returns NULL invisibly. Stops with an error when the config
# cannot be read, a sweep entry is malformed, or no CSV file is found.
main <- function() {
  # Expand a [from, to, by] triple from the config into the swept values.
  sweep_values <- function(spec, name) {
    if (length(spec) < 3) {
      stop("Config entry '", name, "' must be [from, to, by]", call. = FALSE)
    }
    seq(from = spec[[1]], to = spec[[2]], by = spec[[3]])
  }

  print("Reading config file")
  config_file <- read.config(file = "config/augmentation.json")
  # NOTE(review): configr::read.config is documented to return FALSE when the
  # file cannot be parsed -- confirm against the installed version.
  if (!is.list(config_file)) {
    stop("Could not read config/augmentation.json", call. = FALSE)
  }

  subscription_counts <- sweep_values(
    config_file$number_of_subscriptions, "number_of_subscriptions"
  )
  widths <- sweep_values(config_file$width, "width")
  distances_from_mean <- sweep_values(
    config_file$distance_from_mean, "distance_from_mean"
  )
  constraint_counts <- sweep_values(
    config_file$number_of_constraints, "number_of_constraints"
  )
  alphas <- sweep_values(config_file$alpha, "alpha")

  input_dir <- config_file$input_data
  output_dir <- config_file$output_data
  subscriptions_dir <- file.path(output_dir, "subscriptions")

  if (!dir.exists(input_dir)) {
    print("Raw data directory does not exist! Exiting")
    # `return` must be called; a bare `return` is a no-op and the script
    # would carry on without any input data.
    return(invisible(NULL))
  }
  # Creating the nested directory recursively also creates output_dir.
  if (!dir.exists(subscriptions_dir)) {
    dir.create(subscriptions_dir, recursive = TRUE)
  }

  print("Checking data folder for raw data")
  # `pattern` is a regular expression, not a glob.
  csv_files <- list.files(path = input_dir, pattern = "\\.csv$")
  if (length(csv_files) == 0) {
    stop("No .csv file found in ", input_dir, call. = FALSE)
  }
  raw_data_filename <- file.path(input_dir, csv_files[1])
  df <- fread(raw_data_filename, header = TRUE, stringsAsFactors = FALSE)
  # nafill() returns one filled vector per column; rebuild a plain numeric
  # matrix with the same number of columns.
  df <- nafill(df, type = "locf") %>%
    unlist() %>%
    matrix(ncol = ncol(df), byrow = FALSE)
  print("Successfully imported")

  print("Calculating mean vector and covariance matrix")
  mu <- colMeans(df)
  covariance <- cov(df)

  print("Generating events")
  events <- rmvnorm(
    n = config_file$number_of_events, mean = mu, sigma = covariance
  )
  write.table(
    events,
    file = file.path(output_dir, config_file$events_filename),
    row.names = FALSE, col.names = FALSE
  )

  print("Generating subscriptions")
  # Files are named after the epoch second at which they are written. Many
  # combinations are produced within one second, so the id is bumped past the
  # previous one to keep every name unique instead of overwriting files.
  last_id <- -Inf
  zero_tolerance <- sqrt(.Machine$double.eps)
  for (n in subscription_counts) {
    for (width in widths) {
      for (distance_from_mean in distances_from_mean) {
        for (number_of_constraints in constraint_counts) {
          for (alpha in alphas) {
            print(paste("n=", n, ", width=", width, "number of constraints=", number_of_constraints, ", alpha=", alpha))

            file_id <- max(floor(as.numeric(Sys.time())), last_id + 1)
            last_id <- file_id
            file_stem <- sprintf("%.0f", file_id)

            parameters <- list(
              number_of_subscriptions = n,
              width = width,
              distance_from_mean = distance_from_mean,
              number_of_constraints = number_of_constraints,
              maxAtts = length(mu),
              alpha = alpha
            )
            write(
              toJSON(parameters),
              file = file.path(subscriptions_dir, paste0(file_stem, ".json"))
            )

            # Test the current sweep value, not the [from, to, by] triple in
            # the config. A tolerance is used because seq() accumulates
            # floating-point error.
            if (abs(distance_from_mean) < zero_tolerance) {
              subscriptions <- rmvnorm(n = n, mean = mu, sigma = covariance)
              write.table(
                subscriptions,
                file = file.path(subscriptions_dir, paste0(file_stem, ".csv")),
                row.names = FALSE, col.names = FALSE
              )
            }
          }
        }
      }
    }
  }

  print("Exporting standard deviations")
  write.table(
    apply(df, 2, sd),
    file = file.path(output_dir, "standard_deviations.csv"),
    row.names = FALSE, col.names = FALSE
  )
  print("Done")
  invisible(NULL)
}
# Script entry point: run the augmentation as soon as the file is evaluated.
main()