-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscripts.R
More file actions
127 lines (105 loc) · 3.28 KB
/
scripts.R
File metadata and controls
127 lines (105 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
library(tidyverse)
library(readxl)
# Indonesian numeral words accepted in the numeral (CD) slot; used further
# down to filter the `cd` column. Several spelling variants of "miliar"
# are listed on purpose.
included_cd <- c(
  "satu", "dua", "tiga", "lima", "empat", "enam",
  "delapan", "tujuh", "sepuluh", "belas", "sembilan", "puluh",
  "seribu", "sejuta", "sebelas", "seratus", "ratus", "triliun",
  "ribu", "juta",
  "miliar", "milliar", "semiliar", "semilliar", "milyar", "semilyar"
)
# Load the annotated Excel data (sheet "all buah") and keep only the rows
# whose SearchKeyword differs from "se".
# NOTE(review): the earlier comment read 'filter out observations that are
# NOT "se-"', which is ambiguous; the code below removes the "se" rows (and
# rows where SearchKeyword is NA) -- confirm this is the intended direction.
coll_df <- filter(
  read_xlsx(
    "data/Coll1R1R_WithSemanticsForPaperYear2.xlsx",
    sheet = "all buah"
  ),
  SearchKeyword != "se"
)
# Collocation data for the search term CD + buah.
## The collocates in this file are those tagged as NN (1R & 1L).
## The file is tab-separated; the first three lines are skipped and the next
## line is read as the header.
cd_buah_df <- as_tibble(
  read.table(
    file = "data/collocations-2.txt",
    sep = "\t",
    skip = 3,
    header = TRUE
  )
)
# Frequency list for the search pattern CD + NN.
## Goal: find the NN that can occur with a numeral (CD) without "buah".
wfreq <- read_xlsx(path = "data/query-freq-breakdown.xlsx")

# Split each search result into its numeral (cd) and noun (nn), then total
# the occurrences per lower-cased (cd, nn) pair, most frequent first.
wfreq1 <- wfreq |>
  separate_wider_delim(
    cols = Search_result,
    delim = " ",
    names = c("cd", "nn"),
    # "debug" keeps short rows and flags them in Search_result_ok
    too_few = "debug"
  ) |>
  # drop the rows that did not split cleanly
  filter(Search_result_ok) |>
  ## harmonise both words to lowercase so case variants are counted together
  mutate(across(c(cd, nn), tolower)) |>
  group_by(cd, nn) |>
  summarise(No_of_occurrences = sum(No_of_occurrences)) |>
  ungroup() |>
  arrange(desc(No_of_occurrences))
# wfreq2: the [CD + NN] pairs from wfreq1, minus every row whose CD-tagged
# item contains a non-word character.
wfreq2 <- filter(wfreq1, !str_detect(cd, "\\W"))
# wfreq3: rows of wfreq2 (the CD + NN search pattern) whose noun is also
# listed, case-insensitively, in the Word column of Karlina's Excel sheet
# (Coll1R1R_With..., loaded as coll_df).
# In other words, wfreq3 contains nouns that can AND cannot appear with clf.
wfreq3 <- filter(wfreq2, nn %in% tolower(coll_df$Word))
wfreq3
# Distinct nouns in wfreq3 whose numeral slot either contains a digit or is
# one of the words in included_cd (printed, not stored).
wfreq3 |>
  filter(str_detect(cd, "[0-9]+") | cd %in% included_cd) |>
  distinct(nn) |>
  pull(nn)
# Number of unique noun types (case-insensitive) that can and cannot appear
# with clf. in CD + (clf.) + N
length(unique(tolower(wfreq3$nn)))

# Number of unique collocate types (case-insensitive) in the CD + buah data
length(unique(tolower(cd_buah_df$Word)))
# Case-insensitive set comparison between the nouns of the bare pattern
# [Num/CD + nn] (wfreq3$nn) and those of the pattern [Num/CD + class + nn]
# (cd_buah_df$Word). Each expression prints a single count of unique nouns.

# 1. noun in [Num/CD + nn] BUT NOT in [Num/CD + class + nn]
length(setdiff(tolower(wfreq3$nn), tolower(cd_buah_df$Word)))
# 167

# 2. noun in [Num + class + nn] BUT NOT in [Num + nn]
length(setdiff(tolower(cd_buah_df$Word), tolower(wfreq3$nn)))
# 44

# 3. noun in [Num + class + nn] AND in [Num + nn]
length(intersect(tolower(cd_buah_df$Word), tolower(wfreq3$nn)))
# 63