Skip to content

Commit c4aff6f

Browse files
save work
1 parent 4be8bee commit c4aff6f

File tree

6 files changed

+287328
-1
lines changed

6 files changed

+287328
-1
lines changed

scripts/generate_unicode.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
# Check that UnicodeData.txt exists or else bail early
2+
import os
3+
4+
# Accumulators filled while parsing the Unicode data files and later
# serialised to YAML.

# Each entry: {'code_point': int, 'name': str, 'value': str}
unicode_common_properties = []

# Each entry: {'code_point': int, 'name': str}; an entry is only added when
# the property is true for that code point.
unicode_boolean_properties = []

# Each entry: {'code_point': int, 'name': str, 'value': float}
unicode_numeric_properties = []
7+
8+
def add_common_property(code_point, name, value):
    """Record a string-valued property for a code point.

    Only property names in the allow-list below are kept, and entries whose
    value is the empty string are dropped; every other call is a no-op.

    Args:
        code_point: Integer code point the property applies to.
        name: Unicode property name, e.g. ``'NFC_QC'``.
        value: Property value as read from the data file.
    """
    allowed_names = ('NFC_QC',)
    if name in allowed_names and value != '':
        unicode_common_properties.append(
            dict(code_point=code_point, name=name, value=value)
        )
18+
19+
def add_boolean_property(code_point, name, value):
    """Record that a boolean property is true for a code point.

    Only property names in the allow-list below are kept, and only when
    ``value`` is exactly ``'Y'``; every other call is a no-op.

    Args:
        code_point: Integer code point the property applies to.
        name: Unicode property name, e.g. ``'XID_Start'``.
        value: ``'Y'`` when the property holds; anything else is ignored.
    """
    allowed_names = ('XID_Start', 'XID_Continue')
    if name in allowed_names and value == 'Y':
        unicode_boolean_properties.append(
            dict(code_point=code_point, name=name)
        )
28+
29+
def add_numeric_property(code_point, name, value):
    """Record a numeric property for a code point (currently switched off).

    The function returns immediately, so nothing is ever appended to
    ``unicode_numeric_properties``.  The conversion logic after the early
    return is kept for when numeric output is turned back on.

    Args:
        code_point: Integer code point the property applies to.
        name: Unicode property name, e.g. ``'Numeric_Value'``.
        value: Numeric value as text; either a plain number or a fraction
            written as ``'numerator/denominator'`` such as ``'1/5'``.
    """
    # Numeric properties are not collected at the moment.
    return

    if '/' in value:
        # Fractions such as '1/5' are converted to a float.
        numerator, denominator = value.split('/')[:2]
        numeric_value = float(numerator) / float(denominator)
    else:
        numeric_value = float(value)

    unicode_numeric_properties.append(
        dict(code_point=code_point, name=name, value=numeric_value)
    )
43+
44+
def parse_line(line):
    """Return ``line`` without its ``#`` comment and surrounding whitespace.

    Everything from the first ``#`` onwards is discarded; a line that holds
    only a comment or only whitespace therefore yields the empty string.
    """
    content, _, _ = line.partition('#')
    return content.strip()
47+
48+
def parse_unicode_data_txt():
    """Parse ``scripts/unicode/UnicodeData.txt`` and record its properties.

    Each non-empty line is split on ``;`` and the fields are passed to
    ``add_common_property``, ``add_boolean_property`` and
    ``add_numeric_property``, which decide what is actually kept.

    Raises:
        FileNotFoundError: If the data file does not exist.
    """
    path = 'scripts/unicode/UnicodeData.txt'
    if not os.path.exists(path):
        raise FileNotFoundError("UnicodeData.txt not found in scripts/unicode directory.")

    with open(path, 'r', encoding='utf-8') as f:
        raw_lines = f.readlines()

    for raw_line in raw_lines:
        record = parse_line(raw_line)
        if not record:
            continue

        fields = record.split(';')
        code_point = int(fields[0].strip(), 16)
        name = fields[1].strip()
        general_category = fields[2].strip()
        combining_class = fields[3].strip()
        bidi_class = fields[4].strip()
        # Field 5 (decomposition) is not used.
        # Fields 6-8 are the three numeric columns of the record.
        numeric_a = fields[6].strip()
        numeric_b = fields[7].strip()
        numeric_c = fields[8].strip()
        bidi_mirrored = fields[9].strip()
        # Fields 10 and 11 (Unicode_1_Name, ISO_Comment) are not captured.
        uppercase = fields[12].strip()
        lowercase = fields[13].strip()
        titlecase = fields[14].strip()

        # '<label>' placeholders are not real character names.
        if not (name.startswith('<') or name.endswith('>')):
            add_common_property(code_point, 'Name', name)

        add_common_property(code_point, 'General_Category', general_category)
        add_common_property(code_point, 'Canonical_Combining_Class', combining_class)
        add_common_property(code_point, 'Bidi_Class', bidi_class)

        if numeric_c != '':
            add_numeric_property(code_point, 'Numeric_Value', numeric_c)
            # The numeric type is derived from which column matches the
            # final numeric column.
            if numeric_a == numeric_c:
                numeric_type = 'Decimal'
            elif numeric_b == numeric_c:
                numeric_type = 'Digit'
            else:
                numeric_type = 'Numeric'
            add_common_property(code_point, 'Numeric_Type', numeric_type)

        add_boolean_property(code_point, 'Bidi_Mirrored', bidi_mirrored)

        case_mappings = (
            ('Simple_Uppercase_Mapping', uppercase),
            ('Simple_Lowercase_Mapping', lowercase),
            ('Simple_Titlecase_Mapping', titlecase),
        )
        for mapping_name, mapping_value in case_mappings:
            add_common_property(code_point, mapping_name, mapping_value)
95+
96+
def parse_derived_normalization_props_txt():
    """Parse ``scripts/unicode/DerivedNormalizationProps.txt``.

    Every ``code point(s) ; property ; value`` record whose property is one
    of the handled normalization properties is expanded to one
    ``add_common_property`` call per code point in the record's range.

    Raises:
        FileNotFoundError: If the data file does not exist.
    """
    path = 'scripts/unicode/DerivedNormalizationProps.txt'
    if not os.path.exists(path):
        raise FileNotFoundError("DerivedNormalizationProps.txt not found in scripts/unicode directory.")

    with open(path, 'r', encoding='utf-8') as f:
        raw_lines = f.readlines()

    # Not handling properties Full_Composition_Exclusion (redundant),
    # Expands_On_* (deprecated), FC_NFKC_Closure (deprecated),
    # Changes_When_NFKC_Casefolded (redundant).
    handled = ('NFD_QC', 'NFKD_QC', 'NFC_QC', 'NFKC_QC', 'NFKC_CF', 'NFKC_SCF')

    for raw_line in raw_lines:
        record = parse_line(raw_line)
        if not record:
            continue

        fields = record.split(';')
        if len(fields) < 3:
            # Two-field records are boolean properties, which are all either
            # deprecated or redundant.
            continue

        # A record covers either one code point ('00A0') or an inclusive
        # range ('00A0..00A7').
        first_hex, range_sep, last_hex = fields[0].strip().partition('..')
        first = int(first_hex, 16)
        last = int(last_hex, 16) if range_sep else first

        prop = fields[1].strip()
        value = fields[2].strip()
        if prop not in handled:
            continue

        for code_point in range(first, last + 1):
            add_common_property(code_point, prop, value)
128+
129+
def parse_derived_core_properties_txt():
    """Parse ``scripts/unicode/DerivedCoreProperties.txt``.

    Every record names a code point or an inclusive range of code points and
    a property that holds for them.  Each code point is passed to
    ``add_boolean_property`` with the value ``'Y'``, except for the skipped
    properties listed below.

    Raises:
        FileNotFoundError: If the data file does not exist.
    """
    path = 'scripts/unicode/DerivedCoreProperties.txt'
    if not os.path.exists(path):
        raise FileNotFoundError("DerivedCoreProperties.txt not found in scripts/unicode directory.")

    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Skip Grapheme_Link (deprecated) and Indic_Conjunct_Break (not binary,
    # skipped for simplicity).  The data file writes the latter with its
    # short alias 'InCB', so both spellings are listed.
    skipped = ('Grapheme_Link', 'Indic_Conjunct_Break', 'InCB')

    for line in lines:
        line = parse_line(line)
        if not line:
            continue

        parts = line.split(';')
        code_point_hex_pair = parts[0].strip()
        if '..' not in code_point_hex_pair:
            code_point_start = code_point_end = int(code_point_hex_pair, 16)
        else:
            # handle ranges like '00A0..00A7'
            code_point_hex_start, code_point_hex_end = code_point_hex_pair.split('..')
            code_point_start = int(code_point_hex_start, 16)
            code_point_end = int(code_point_hex_end, 16)

        prop = parts[1].strip()
        if prop in skipped:
            continue

        for code_point in range(code_point_start, code_point_end + 1):
            add_boolean_property(code_point, prop, 'Y')
156+
157+
def write_unicode_data_yaml():
    """Write the collected properties to ``src/qtil/strings/generated/unicode.yaml``.

    The file holds three data-extension sections, one per accumulator:
    ``unicodeHasProperty`` (string values), ``unicodeHasBooleanProperty`` and
    ``unicodeHasNumericProperty``.  Each collected entry becomes one row.
    """

    def quoted(text):
        # Inside a YAML single-quoted scalar a literal ' must be doubled;
        # without this a value containing a quote would corrupt the file.
        return "'" + str(text).replace("'", "''") + "'"

    with open('src/qtil/strings/generated/unicode.yaml', 'w', encoding='utf-8') as f:
        f.write(
'''extensions:
  - addsTo:
      pack: advanced-security/qtil
      extensible: unicodeHasProperty
    data:''')
        for entry in unicode_common_properties:
            f.write(f"""
      - [{entry['code_point']}, {quoted(entry['name'])}, {quoted(entry['value'])}]""")

        f.write('''
  - addsTo:
      pack: advanced-security/qtil
      extensible: unicodeHasBooleanProperty
    data:''')
        for entry in unicode_boolean_properties:
            f.write(f"""
      - [{entry['code_point']}, {quoted(entry['name'])}]""")

        f.write('''
  - addsTo:
      pack: advanced-security/qtil
      extensible: unicodeHasNumericProperty
    data:''')
        for entry in unicode_numeric_properties:
            f.write(f"""
      - [{entry['code_point']}, {quoted(entry['name'])}, {entry['value']}]""")
186+
187+
if __name__ == "__main__":
    # Explain the required inputs up front: the parse functions raise
    # FileNotFoundError when a data file is missing.  The banner names the
    # file that write_unicode_data_yaml() actually produces.
    print("""
This script generates src/qtil/strings/generated/unicode.yaml from UnicodeData.txt,
DerivedNormalizationProps.txt, and DerivedCoreProperties.txt files in the scripts/unicode directory.

Download the latest Unicode data files from:
https://www.unicode.org/Public/UCD/latest/ucd/

Place the downloaded files in the scripts/unicode directory before running this script.

Running....
""")

    # Parse every input first so the accumulators are complete before writing.
    parse_unicode_data_txt()
    parse_derived_normalization_props_txt()
    parse_derived_core_properties_txt()
    write_unicode_data_yaml()

    print("SUCCESS! Unicode data has been successfully generated in 'src/qtil/strings/generated/unicode.yaml'.")

src/qlpack.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,6 @@ warnOnImplicitThis: false
44
version: 0.0.1
55
license: MIT
66
dependencies:
7-
codeql/util: "1.0.12"
7+
codeql/util: "1.0.12"
8+
dataExtensions:
9+
- qtil/strings/generated/unicode.yaml

src/qtil/strings/Unicode.qll

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/**
2+
* Holds if `version` is the Unicode version used by the current QL environment.
3+
*/
4+
extensible predicate unicodeVersion(string version);
5+
6+
/**
7+
* Provides properties of a Unicode code point, where the property is of 'enumeration', 'catalog',
8+
* or 'string-valued' type.
9+
*
10+
* For example, `Line_Break` is an enumeration property, `Block` is a catalog property, and
11+
* `Uppercase_Mapping` is a string-valued property.
12+
*
13+
* For boolean properties, see `unicodeHasBooleanProperty`, and for numeric properties, see
14+
* `unicodeHasNumericProperty`.
15+
*/
16+
extensible predicate unicodeHasProperty(int codePoint, string propertyName, string propertyValue);
17+
18+
/**
19+
* Holds when the Unicode code point's boolean property of the given name is true.
20+
*
21+
* For example, `Alphabetic` is a boolean property that can be true or false for a code point.
22+
*
23+
* For other types of properties, see `unicodeHasProperty`.
24+
*/
25+
extensible predicate unicodeHasBooleanProperty(int codePoint, string propertyName);
26+
27+
/**
28+
* Provides the numeric value of a Unicode code point's numeric property.
29+
*
30+
* For example, `Numeric_Value` is a numeric property that can be an integer or a decimal value
31+
* for a code point.
32+
*
33+
* For other types of properties, see `unicodeHasProperty` and `unicodeHasBooleanProperty`.
34+
*/
35+
extensible predicate unicodeHasNumericProperty(
36+
int codePoint, string propertyName, float numericValue
37+
);

0 commit comments

Comments
 (0)