Skip to content

Commit d02924f

Browse files
Implement general unicode property support, and some specific features.
Adds UAX #44 identifier checking, and NFC quick check support, along with a few helpers like `isAscii` and `unescapeUnicode`.
1 parent 9f1097a commit d02924f

File tree

8 files changed

+1118386
-1
lines changed

8 files changed

+1118386
-1
lines changed

scripts/generate_unicode.py

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
### This script generates src/qtil/strings/generated/unicode.yaml from UnicodeData.txt, DerivedNormalizationProps.txt,
2+
### and DerivedCoreProperties.txt files in the scripts/unicode directory.
3+
###
4+
### Usage:
5+
### python generate_unicode.py ${unicode_version}
6+
###
7+
### Requires that the latest Unicode data files from https://www.unicode.org/Public/UCD/latest/ucd/
8+
### are downloaded and placed in the scripts/unicode directory before running this script.
9+
10+
import os
11+
import sys
12+
13+
# The Unicode version is the script's single required command-line argument;
# it is written verbatim into the generated YAML.
if len(sys.argv) != 2:
    print("Usage: python generate_unicode.py ${unicode_version}")
    sys.exit(1)

unicode_version = sys.argv[1]

# Accumulators filled through the add_*_property helpers while parsing and
# read back by write_unicode_data_yaml().
unicode_common_properties = []   # dicts with 'code_point', 'name', 'value'
unicode_boolean_properties = []  # dicts with 'code_point', 'name'
unicode_numeric_properties = []  # dicts with 'code_point', 'name', 'value' (float)
23+
24+
25+
26+
def add_common_property(code_point, name, value):
    """Record a string-valued property of a code point.

    Appends a ``{'code_point', 'name', 'value'}`` dict to the module-level
    ``unicode_common_properties`` list.

    Args:
        code_point: Integer code point the property applies to.
        name: Property name, e.g. ``'General_Category'``.
        value: Property value as text. An empty string means "no value" and
            is silently skipped.
    """
    if value == '':
        return
    record = {
        'code_point': code_point,
        'name': name,
        'value': value,
    }
    unicode_common_properties.append(record)
33+
34+
def add_boolean_property(code_point, name, value):
    """Record that a code point has a boolean property.

    Appends a ``{'code_point', 'name'}`` dict to the module-level
    ``unicode_boolean_properties`` list. Only properties that hold are
    stored; absence from the list means the property is false.

    Args:
        code_point: Integer code point the property applies to.
        name: Property name, e.g. ``'Bidi_Mirrored'``.
        value: ``'Y'`` when the property holds; any other value is ignored.
    """
    if value != 'Y':
        return
    unicode_boolean_properties.append({
        'code_point': code_point,
        'name': name,
    })
40+
41+
def add_numeric_property(code_point, name, value):
    """Record a numeric property of a code point as a float.

    Appends a ``{'code_point', 'name', 'value'}`` dict to the module-level
    ``unicode_numeric_properties`` list.

    Args:
        code_point: Integer code point the property applies to.
        name: Property name, e.g. ``'Numeric_Value'``.
        value: Number as text, either plain (``'7'``) or a fraction written
            ``'numerator/denominator'`` (``'1/5'``, ``'-1/2'``).

    Raises:
        ValueError: If ``value`` is not numeric text.
    """
    if '/' in value:
        # Fractions such as '1/5' are stored as their float quotient.
        fraction_parts = value.split('/')
        numeric_value = float(fraction_parts[0]) / float(fraction_parts[1])
    else:
        numeric_value = float(value)

    unicode_numeric_properties.append({
        'code_point': code_point,
        'name': name,
        'value': numeric_value,
    })
54+
55+
# Trim comments (starting with '#') and whitespace from the line
def parse_line(line):
    """Return ``line`` without its ``#`` comment and surrounding whitespace.

    Blank and comment-only lines come back as the empty string, so callers
    can skip them with a simple truthiness check.
    """
    content, _, _ = line.partition('#')
    return content.strip()
58+
59+
def _add_unicode_data_record(code_point, fields, is_label):
    """Register the properties of one UnicodeData.txt record for ``code_point``."""
    numeric_part_a, numeric_part_b, numeric_part_c = fields[6], fields[7], fields[8]

    # <label> entries (e.g. '<control>') are placeholders, not character names.
    if not is_label:
        add_common_property(code_point, 'Name', fields[1])
    add_common_property(code_point, 'General_Category', fields[2])
    add_common_property(code_point, 'Canonical_Combining_Class', fields[3])
    add_common_property(code_point, 'Bidi_Class', fields[4])
    if numeric_part_c != '':
        add_numeric_property(code_point, 'Numeric_Value', numeric_part_c)
        # The numeric type is inferred from which of the three numeric
        # fields repeats the value held in the last one.
        if numeric_part_a == numeric_part_c:
            add_common_property(code_point, 'Numeric_Type', 'Decimal')
        elif numeric_part_b == numeric_part_c:
            add_common_property(code_point, 'Numeric_Type', 'Digit')
        else:
            add_common_property(code_point, 'Numeric_Type', 'Numeric')
    add_boolean_property(code_point, 'Bidi_Mirrored', fields[9])
    # Fields 10 and 11 (Unicode_1_Name, ISO_Comment) are not captured.
    add_common_property(code_point, 'Simple_Uppercase_Mapping', fields[12])
    add_common_property(code_point, 'Simple_Lowercase_Mapping', fields[13])
    add_common_property(code_point, 'Simple_Titlecase_Mapping', fields[14])


def parse_unicode_data_txt():
    """Load per-code-point properties from scripts/unicode/UnicodeData.txt.

    Each record is a ';'-separated line. The properties recorded are Name,
    General_Category, Canonical_Combining_Class, Bidi_Class, Numeric_Value,
    Numeric_Type, Bidi_Mirrored and the three simple case mappings.

    Large blocks (CJK ideographs, Hangul syllables, private use, ...) are
    listed only as a pair of records named '<..., First>' and '<..., Last>'.
    Every code point in such a range receives the properties of the pair;
    previously only the two endpoints did.

    Raises:
        FileNotFoundError: If UnicodeData.txt is missing.
    """
    path = 'scripts/unicode/UnicodeData.txt'
    if not os.path.exists(path):
        raise FileNotFoundError("UnicodeData.txt not found in scripts/unicode directory.")

    pending_range_first = None  # code point of an unmatched '<..., First>' record
    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = parse_line(raw_line)
            if not line:
                continue

            fields = [field.strip() for field in line.split(';')]
            code_point = int(fields[0], 16)
            name = fields[1]
            is_label = name.startswith('<') and name.endswith('>')

            # The First record is emitted on its own line; the matching Last
            # record then fills in everything after it up to itself.
            first_to_emit = code_point
            if is_label and name.endswith(', Last>') and pending_range_first is not None:
                first_to_emit = pending_range_first + 1
            if is_label and name.endswith(', First>'):
                pending_range_first = code_point
            else:
                pending_range_first = None

            for member in range(first_to_emit, code_point + 1):
                _add_unicode_data_record(member, fields, is_label)
106+
107+
def parse_derived_normalization_props_txt():
    """Load normalization properties from DerivedNormalizationProps.txt.

    Records have the form ``code_point_or_range ; property ; value``. Only
    the quick-check properties (NFD_QC, NFKD_QC, NFC_QC, NFKC_QC) and the
    case-folding mappings (NFKC_CF, NFKC_SCF) are recorded, once per code
    point in the record's range.

    Raises:
        FileNotFoundError: If DerivedNormalizationProps.txt is missing.
    """
    path = 'scripts/unicode/DerivedNormalizationProps.txt'
    if not os.path.exists(path):
        raise FileNotFoundError("DerivedNormalizationProps.txt not found in scripts/unicode directory.")

    # Not handling properties Full_Composition_Exclusion (redundant), Expands_On_* (deprecated),
    # FC_NFKC_Closure (deprecated), Changes_When_NFKC_Casefolded (redundant).
    wanted_properties = ('NFD_QC', 'NFKD_QC', 'NFC_QC', 'NFKC_QC', 'NFKC_CF', 'NFKC_SCF')

    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = parse_line(raw_line)
            if not line:
                continue

            fields = [field.strip() for field in line.split(';')]
            if len(fields) < 3:
                # Skip boolean properties, which have only two parts, but are all either
                # deprecated or redundant.
                continue

            # The first field is a single code point or a range like '00A0..00A7'.
            first_hex, range_marker, last_hex = fields[0].partition('..')
            first = int(first_hex, 16)
            last = int(last_hex, 16) if range_marker else first

            prop, value = fields[1], fields[2]
            if prop not in wanted_properties:
                continue

            # add_common_property ignores empty values, so records whose third
            # field is empty are not recorded.
            for code_point in range(first, last + 1):
                add_common_property(code_point, prop, value)
139+
140+
def parse_derived_core_properties_txt():
    """Load binary derived properties from DerivedCoreProperties.txt.

    Records have the form ``code_point_or_range ; property``. Each listed
    property is recorded as a boolean property for every code point in the
    record's range.

    Grapheme_Link (deprecated) and Indic_Conjunct_Break (enumerated, not
    binary) are skipped. The data file labels the latter with its short
    alias ``InCB`` and adds a third value field, so both the alias and any
    record carrying a value field are excluded; matching only on the long
    name let those records through as a bogus boolean property.

    Raises:
        FileNotFoundError: If DerivedCoreProperties.txt is missing.
    """
    path = 'scripts/unicode/DerivedCoreProperties.txt'
    if not os.path.exists(path):
        raise FileNotFoundError("DerivedCoreProperties.txt not found in scripts/unicode directory.")

    # skip properties Grapheme_Link (deprecated), Indic_Conjunct_Break (for simplicity, not binary)
    skipped_properties = ('Grapheme_Link', 'Indic_Conjunct_Break', 'InCB')

    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = parse_line(raw_line)
            if not line:
                continue

            fields = [field.strip() for field in line.split(';')]

            # The first field is a single code point or a range like '00A0..00A7'.
            first_hex, range_marker, last_hex = fields[0].partition('..')
            first = int(first_hex, 16)
            last = int(last_hex, 16) if range_marker else first

            prop = fields[1]
            has_value_field = len(fields) > 2 and fields[2] != ''
            if prop in skipped_properties or has_value_field:
                continue

            for code_point in range(first, last + 1):
                add_boolean_property(code_point, prop, 'Y')
167+
168+
def write_unicode_data_yaml():
    """Write the collected properties to src/qtil/strings/generated/unicode.yaml.

    The file is a data extension for the ``advanced-security/qtil`` pack with
    four extensible predicates: ``unicodeVersion`` (one row holding the
    version given on the command line), ``unicodeHasProperty``,
    ``unicodeHasBooleanProperty`` and ``unicodeHasNumericProperty`` (one row
    per entry in the corresponding module-level list).
    """
    # NOTE(review): the template's indentation is reconstructed to the usual
    # CodeQL data-extension layout - confirm against the committed generator.

    def single_quoted(text):
        # Inside a YAML single-quoted scalar a quote is escaped by doubling it.
        return "'" + str(text).replace("'", "''") + "'"

    # Inside a YAML double-quoted scalar, backslash and double quote need escaping.
    version = unicode_version.replace('\\', '\\\\').replace('"', '\\"')

    with open('src/qtil/strings/generated/unicode.yaml', 'w', encoding='utf-8') as f:
        f.write(
'''extensions:
  - addsTo:
      pack: advanced-security/qtil
      extensible: unicodeVersion
    data:
      - ["''' + version + '''"]
  - addsTo:
      pack: advanced-security/qtil
      extensible: unicodeHasProperty
    data:''')
        for entry in unicode_common_properties:
            f.write(
                f"\n      - [{entry['code_point']}, "
                f"{single_quoted(entry['name'])}, {single_quoted(entry['value'])}]"
            )

        f.write('''
  - addsTo:
      pack: advanced-security/qtil
      extensible: unicodeHasBooleanProperty
    data:''')
        for entry in unicode_boolean_properties:
            f.write(f"\n      - [{entry['code_point']}, {single_quoted(entry['name'])}]")

        f.write('''
  - addsTo:
      pack: advanced-security/qtil
      extensible: unicodeHasNumericProperty
    data:''')
        for entry in unicode_numeric_properties:
            f.write(
                f"\n      - [{entry['code_point']}, "
                f"{single_quoted(entry['name'])}, {entry['value']}]"
            )
202+
203+
if __name__ == "__main__":
    # Entry point: show what is about to happen, parse the three UCD source
    # files into the module-level accumulators, then emit the YAML file.
    print("""
This script generates src/qtil/strings/generated/unicode.yaml from UnicodeData.txt,
DerivedNormalizationProps.txt, and DerivedCoreProperties.txt files in the scripts/unicode directory.

Download the latest Unicode data files from:
https://www.unicode.org/Public/UCD/latest/ucd/

Place the downloaded files in the scripts/unicode directory before running this script.

Running....
""")

    parse_unicode_data_txt()
    parse_derived_normalization_props_txt()
    parse_derived_core_properties_txt()
    write_unicode_data_yaml()

    print("SUCCESS! Unicode data has been successfully generated in 'src/qtil/strings/generated/unicode.yaml'.")

src/qlpack.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,6 @@ warnOnImplicitThis: false
44
version: 0.0.1
55
license: MIT
66
dependencies:
7-
codeql/util: "1.0.12"
7+
codeql/util: "1.0.12"
8+
dataExtensions:
9+
- qtil/strings/generated/unicode.yaml

src/qtil/Qtil.qll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ module Qtil {
2626
import qtil.strings.Join
2727
import qtil.strings.Other
2828
import qtil.strings.Plural
29+
import qtil.strings.Unicode
2930
import qtil.tuple.Pair
3031
import qtil.tuple.Product
3132
import qtil.tuple.StringTuple

0 commit comments

Comments
 (0)