forked from HarunYOzturk/StringMatching
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreAnalysis.java
More file actions
216 lines (182 loc) · 7.46 KB
/
PreAnalysis.java
File metadata and controls
216 lines (182 loc) · 7.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/**
* PreAnalysis interface for students to implement their algorithm selection logic
*
* Students should analyze the characteristics of the text and pattern to determine
* which algorithm would be most efficient for the given input.
*
* The system will automatically use this analysis if the chooseAlgorithm method
* returns a non-null value.
*/
public abstract class PreAnalysis {
/**
* Analyze the text and pattern to choose the best algorithm
*
* @param text The text to search in
* @param pattern The pattern to search for
* @return The name of the algorithm to use (e.g., "Naive", "KMP", "RabinKarp", "BoyerMoore", "GoCrazy")
* Return null if you want to skip pre-analysis and run all algorithms
*
* Tips for students:
* - Consider the length of the text and pattern
* - Consider the characteristics of the pattern (repeating characters, etc.)
* - Consider the alphabet size
* - Think about which algorithm performs best in different scenarios
*/
public abstract String chooseAlgorithm(String text, String pattern);
/**
* Get a description of your analysis strategy
* This will be displayed in the output
*/
public abstract String getStrategyDescription();
}
/**
* Default implementation that students should modify
* This is where students write their pre-analysis logic
*/
class StudentPreAnalysis extends PreAnalysis {
@Override
public String chooseAlgorithm(String text, String pattern) {
if (text == null) text = "";
if (pattern == null) pattern = "";
int n = text.length();
int m = pattern.length();
// Edge cases (benchmark'lerinde Empty Pattern'da KMP iyi çıkabiliyor)
if (m == 0) return "KMP";
if (n == 0) return "Naive";
if (m > n) return "Naive";
// Features (cheap + stable)
int uniq = uniqueCharCount(pattern, 128);
double uniqRatio = (double) uniq / (double) m; // 0..1
double runRatio = maxRunRatio(pattern); // 0..1
double overlapRatio = borderRatio(pattern); // 0..1 (KMP-ish)
// Score each algorithm (not a decision tree)
double scoreNaive = 0;
double scoreKMP = 0;
double scoreRK = 0;
double scoreBM = 0;
double scoreGoCrazy = 0;
// Naive: zero preprocess, short patterns & moderate inputs
scoreNaive += 8.0;
scoreNaive += (m <= 4 ? 6.0 : 0.0);
scoreNaive += (n <= 200 ? 2.0 : 0.0);
scoreNaive -= (n > 5000 ? 3.0 : 0.0);
// KMP: benefits with self-overlap / repetitiveness
scoreKMP += 4.0;
scoreKMP += 10.0 * overlapRatio; // strong boost if big border
scoreKMP += 6.0 * runRatio; // boost if long runs (aaaaa.. or abab..)
scoreKMP -= (m <= 3 ? 3.0 : 0.0); // overhead not worth it for tiny patterns
// Rabin-Karp: good when text is large; hash amortization
scoreRK += 3.0;
scoreRK += (n > 1500 ? 5.0 : 0.0);
scoreRK += (m >= 8 ? 2.0 : -1.0);
scoreRK += (uniqRatio >= 0.6 ? 1.0 : 0.0); // less repetitive patterns are fine
// Boyer-Moore: shines with larger alphabet & longer pattern
scoreBM += 3.0;
scoreBM += (m >= 10 ? 4.0 : -1.0);
scoreBM += 8.0 * uniqRatio; // more unique chars -> better skipping chance
scoreBM += (n >= 800 ? 2.0 : 0.0);
scoreBM -= (uniq <= 3 ? 3.0 : 0.0); // tiny alphabet hurts BM
// GoCrazy: your custom hybrid -> we treat it as "small alphabet + mid patterns"
scoreGoCrazy += 2.0;
scoreGoCrazy += (uniq <= 4 ? 5.0 : 0.0);
scoreGoCrazy += (m >= 5 && m <= 20 ? 2.0 : 0.0);
scoreGoCrazy += (n >= 200 ? 1.5 : 0.0);
// Pick max score
String best = "Naive";
double bestScore = scoreNaive;
if (scoreKMP > bestScore) { bestScore = scoreKMP; best = "KMP"; }
if (scoreRK > bestScore) { bestScore = scoreRK; best = "RabinKarp"; }
if (scoreBM > bestScore) { bestScore = scoreBM; best = "BoyerMoore"; }
if (scoreGoCrazy > bestScore) { bestScore = scoreGoCrazy; best = "GoCrazy"; }
return best;
}
@Override
public String getStrategyDescription() {
return "Score-based pre-analysis: compute pattern features (unique ratio, runs, border overlap) + sizes (n,m), then choose highest-scoring algorithm.";
}
// ---- feature helpers ----
private int uniqueCharCount(String s, int cap) {
java.util.HashSet<Character> set = new java.util.HashSet<>();
for (int i = 0; i < s.length() && set.size() < cap; i++) {
set.add(s.charAt(i));
}
return set.size();
}
// max run length / m (e.g. "aaaaab" -> 5/6)
private double maxRunRatio(String s) {
int m = s.length();
int best = 1, cur = 1;
for (int i = 1; i < m; i++) {
if (s.charAt(i) == s.charAt(i - 1)) cur++;
else { best = Math.max(best, cur); cur = 1; }
}
best = Math.max(best, cur);
return (double) best / (double) m;
}
// longest proper prefix which is also suffix (KMP border) / m
// computed via prefix-function (pi array)
private double borderRatio(String p) {
int m = p.length();
if (m <= 1) return 0.0;
int[] pi = new int[m];
int j = 0;
for (int i = 1; i < m; i++) {
while (j > 0 && p.charAt(i) != p.charAt(j)) j = pi[j - 1];
if (p.charAt(i) == p.charAt(j)) j++;
pi[i] = j;
}
int border = pi[m - 1];
return (double) border / (double) m;
}
}
/**
* Example implementation showing how pre-analysis could work
* This is for demonstration purposes
*/
class ExamplePreAnalysis extends PreAnalysis {
@Override
public String chooseAlgorithm(String text, String pattern) {
int textLen = text.length();
int patternLen = pattern.length();
// Simple heuristic example
if (patternLen <= 3) {
return "Naive"; // For very short patterns, naive is often fastest
} else if (hasRepeatingPrefix(pattern)) {
return "KMP"; // KMP is good for patterns with repeating prefixes
} else if (patternLen > 10 && textLen > 1000) {
return "RabinKarp"; // RabinKarp can be good for long patterns in long texts
} else {
return "Naive"; // Default to naive for other cases
}
}
private boolean hasRepeatingPrefix(String pattern) {
if (pattern.length() < 2) return false;
// Check if first character repeats
char first = pattern.charAt(0);
int count = 0;
for (int i = 0; i < Math.min(pattern.length(), 5); i++) {
if (pattern.charAt(i) == first) count++;
}
return count >= 3;
}
@Override
public String getStrategyDescription() {
return "Example strategy: Choose based on pattern length and characteristics";
}
}
/**
* Instructor's pre-analysis implementation (for testing purposes only)
* Students should NOT modify this class
*/
class InstructorPreAnalysis extends PreAnalysis {
@Override
public String chooseAlgorithm(String text, String pattern) {
// This is a placeholder for instructor testing
// Students should focus on implementing StudentPreAnalysis
return null;
}
@Override
public String getStrategyDescription() {
return "Instructor's testing implementation";
}
}