clf-optionality/scripts.R at main · complexico/clf-optionality · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
library(tidyverse)
library(readxl)

# Word forms accepted as numerals (CD) when the CD + NN frequency list is
# restricted to numeral contexts. Several spelling variants of the same word
# are listed on purpose (e.g. "miliar" / "milliar" / "milyar").
included_cd <- c(
  "satu", "dua", "tiga", "lima", "empat", "enam", "delapan", "tujuh",
  "sepuluh", "belas", "sembilan", "puluh", "seribu", "sejuta", "sebelas",
  "seratus", "ratus", "triliun", "ribu", "juta",
  "miliar", "milliar", "semiliar", "semilliar", "milyar", "semilyar"
)

# Load the annotated Excel data (sheet "all buah").
coll_df <- read_xlsx(
  path = "data/Coll1R1R_WithSemanticsForPaperYear2.xlsx",
  sheet = "all buah"
)
# Drop the observations whose SearchKeyword is exactly "se"; every other
# keyword is kept. filter() also discards rows where the comparison is NA.
coll_df <- filter(coll_df, SearchKeyword != "se")

# Collocation data for the search term CD + buah.
## The collocates kept in this file are those tagged as NN (1R & 1L).
## The first three lines of the file are skipped; the next line is the header.
## NOTE(review): read.table() keeps its default quote characters (" and ')
## here, so a collocate containing an apostrophe would be read as the start of
## a quoted field -- confirm the file contains none.
cd_buah_df <- as_tibble(
  read.table(
    file = "data/collocations-2.txt",
    skip = 3,
    header = TRUE,
    sep = "\t"
  )
)

# Frequency list for the query pattern CD + NN.
## Goal: find the NN that can occur with a numeral (CD) without "buah".
wfreq <- read_xlsx(path = "data/query-freq-breakdown.xlsx")

# wfreq1: one row per lowercased (cd, nn) pair with its summed frequency,
# ordered from most to least frequent.
wfreq1 <- wfreq |>
  ## split the search result on a single space into the CD and the NN part;
  ## too_few = "debug" adds diagnostic columns (incl. Search_result_ok)
  ## instead of raising an error for results with fewer than two pieces
  separate_wider_delim(
    cols = Search_result,
    delim = " ",
    names = c("cd", "nn"),
    too_few = "debug"
  ) |>
  ## keep only the rows flagged as having split correctly
  filter(Search_result_ok) |>
  ## lowercase both words so that case variants are counted together
  mutate(across(c(cd, nn), tolower)) |>
  group_by(cd, nn) |>
  summarise(No_of_occurrences = sum(No_of_occurrences)) |>
  ungroup() |>
  ## arrange() ignores grouping by default, so sorting after ungroup() gives
  ## the same row order as sorting before it
  arrange(desc(No_of_occurrences))

# wfreq2: the [CD + NN] rows of wfreq1 whose CD-tagged item contains no
# non-word character, i.e. rows where cd matches "\\W" are removed.
wfreq2 <- filter(wfreq1, !str_detect(cd, "\\W"))

# wfreq3: the rows of wfreq2 (CD + NN search pattern) whose noun is also
# found, after lowercasing, in the Word column of Karlina's annotated Excel
# sheet (coll_df, loaded from Coll1R1R_With...).
# In other words, wfreq3 contains nouns that can AND cannot appear with clf.
wfreq3 <- filter(wfreq2, nn %in% tolower(coll_df$Word))
wfreq3

# Unique nouns of wfreq3 whose CD item is either one of the listed numeral
# words (included_cd) or contains at least one digit.
wfreq3 |>
  filter(str_detect(cd, "[0-9]+") | cd %in% included_cd) |>
  distinct(nn) |>
  pull(nn)


# Number of unique noun types that can and cannot appear with clf. in
# CD + (clf.) + N: the distinct lowercased nn values of wfreq3.
wfreq3 |>
  pull(nn) |>
  tolower() |>
  n_distinct()

# Number of distinct lowercased collocate words in the CD + buah data.
cd_buah_df |>
  pull(Word) |>
  tolower() |>
  n_distinct()

# The three counts below compare two sets of lowercased noun types:
#   - the nn values of wfreq3          ([Num/CD + nn])
#   - the Word values of cd_buah_df    ([Num/CD + class + nn])

# 1. Nouns in [Num/CD + nn] BUT NOT in [Num/CD + class + nn]
length(
  base::setdiff(tolower(wfreq3$nn), tolower(cd_buah_df$Word))
)
# 167

# 2. Nouns in [Num + class + nn] BUT NOT in [Num + nn]
length(
  base::setdiff(tolower(cd_buah_df$Word), tolower(wfreq3$nn))
)
# 44

# 3. Nouns in [Num + class + nn] AND in [Num + nn]
length(
  base::intersect(tolower(cd_buah_df$Word), tolower(wfreq3$nn))
)
# 63