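# Generates the unicode.yaml data extensions from Unicode Character Database files.
# Each parser below checks that its input file exists and bails early if it does not.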
2+ import os
3+
# Module-level accumulators, filled by the add_*_property helpers while the
# Unicode data files are parsed and serialized by write_unicode_data_yaml().

# Entries: {'code_point': int, 'name': str, 'value': str}
unicode_common_properties = []

# Entries: {'code_point': int, 'name': str}
unicode_boolean_properties = []

# Entries: {'code_point': int, 'name': str, 'value': float}
unicode_numeric_properties = []
7+
def add_common_property(code_point, name, value):
    """Record a string-valued property for one code point.

    Only properties named in the allow-list below are kept, and empty
    values are dropped; every other call is a silent no-op.

    Args:
        code_point: Integer code point the property applies to.
        name: Property name (e.g. 'NFC_QC').
        value: Property value; '' means "no value" and is not recorded.
    """
    enabled_properties = ('NFC_QC',)
    if name in enabled_properties and value != '':
        record = {'code_point': code_point, 'name': name, 'value': value}
        unicode_common_properties.append(record)
18+
def add_boolean_property(code_point, name, value):
    """Record that a boolean property holds for one code point.

    A record is stored only when the property is in the allow-list below
    and ``value`` is exactly 'Y'; every other call is a silent no-op.

    Args:
        code_point: Integer code point the property applies to.
        name: Property name (e.g. 'XID_Start').
        value: 'Y' when the property holds for the code point.
    """
    enabled_properties = ('XID_Start', 'XID_Continue')
    if name not in enabled_properties:
        return
    if value != 'Y':
        return

    unicode_boolean_properties.append({'code_point': code_point, 'name': name})
28+
def add_numeric_property(code_point, name, value):
    """Record a numeric property for one code point, if that property is enabled.

    The allow-list below is empty, so no numeric property is recorded at
    present. This replaces an unconditional early ``return`` that left the
    conversion code unreachable, and mirrors the allow-list pattern used by
    the other add_*_property helpers.

    Args:
        code_point: Integer code point the property applies to.
        name: Property name (e.g. 'Numeric_Value').
        value: Numeric value as text, either a plain number ('7', '-0.5')
            or a fraction such as '1/5'.
    """
    # Add property names here (e.g. 'Numeric_Value') to start emitting them.
    enabled_properties = ()
    if name not in enabled_properties:
        return

    if '/' in value:
        # Convert a fraction such as '1/5' to a float.
        numerator, denominator = value.split('/', 1)
        numeric_value = float(numerator) / float(denominator)
    else:
        numeric_value = float(value)

    unicode_numeric_properties.append({
        'code_point': code_point,
        'name': name,
        'value': numeric_value,
    })
43+
def parse_line(line):
    """Return ``line`` without its '#' comment and surrounding whitespace.

    Everything from the first '#' onwards is discarded; a line that holds
    only a comment or only whitespace yields the empty string.
    """
    content, _, _ = line.partition('#')
    return content.strip()
47+
def _add_unicode_data_record(code_point, fields):
    """Forward the properties of one UnicodeData.txt record for ``code_point``.

    ``fields`` is the list of stripped, ';'-separated fields of the record.
    """
    name = fields[1]
    numeric_part_a, numeric_part_b, numeric_part_c = fields[6], fields[7], fields[8]

    # exclude <label> entries (e.g. '<control>' or range markers): they are not real names
    if not name.startswith('<') and not name.endswith('>'):
        add_common_property(code_point, 'Name', name)
    add_common_property(code_point, 'General_Category', fields[2])
    add_common_property(code_point, 'Canonical_Combining_Class', fields[3])
    add_common_property(code_point, 'Bidi_Class', fields[4])
    if numeric_part_c != '':
        add_numeric_property(code_point, 'Numeric_Value', numeric_part_c)
        # The numeric type follows from which of the three numeric fields agree.
        if numeric_part_a == numeric_part_c:
            add_common_property(code_point, 'Numeric_Type', 'Decimal')
        elif numeric_part_b == numeric_part_c:
            add_common_property(code_point, 'Numeric_Type', 'Digit')
        else:
            add_common_property(code_point, 'Numeric_Type', 'Numeric')
    add_boolean_property(code_point, 'Bidi_Mirrored', fields[9])
    # Not captured: decomposition (field 5), Unicode_1_Name (10), ISO_Comment (11)
    add_common_property(code_point, 'Simple_Uppercase_Mapping', fields[12])
    add_common_property(code_point, 'Simple_Lowercase_Mapping', fields[13])
    add_common_property(code_point, 'Simple_Titlecase_Mapping', fields[14])


def parse_unicode_data_txt():
    """Parse scripts/unicode/UnicodeData.txt and record each code point's properties.

    Every record is split on ';' and its fields are passed to the
    add_*_property helpers. Large blocks are listed in the file as a pair of
    records named '<..., First>' and '<..., Last>'; the properties of such a
    pair are applied to every code point from First to Last inclusive, not
    just to the two endpoints.

    Raises:
        FileNotFoundError: If scripts/unicode/UnicodeData.txt does not exist.
    """
    if not os.path.exists('scripts/unicode/UnicodeData.txt'):
        raise FileNotFoundError("UnicodeData.txt not found in scripts/unicode directory.")

    with open('scripts/unicode/UnicodeData.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()

    range_start = None  # code point of a pending '<..., First>' record, if any
    for line in lines:
        line = parse_line(line)
        if not line:
            continue

        fields = [field.strip() for field in line.split(';')]
        code_point = int(fields[0], 16)
        name = fields[1]

        if name.endswith(', First>'):
            # Emit the first code point now; the rest follow on the Last record.
            range_start = code_point
            code_points = (code_point,)
        elif name.endswith(', Last>') and range_start is not None:
            code_points = range(range_start + 1, code_point + 1)
            range_start = None
        else:
            code_points = (code_point,)

        for covered_code_point in code_points:
            _add_unicode_data_record(covered_code_point, fields)
95+
def parse_derived_normalization_props_txt():
    """Parse scripts/unicode/DerivedNormalizationProps.txt into common properties.

    Each valued record ('code_or_range ; property ; value') whose property is
    one of the handled normalization properties is forwarded to
    add_common_property once per code point in its range.

    Raises:
        FileNotFoundError: If the data file does not exist.
    """
    if not os.path.exists('scripts/unicode/DerivedNormalizationProps.txt'):
        raise FileNotFoundError("DerivedNormalizationProps.txt not found in scripts/unicode directory.")

    with open('scripts/unicode/DerivedNormalizationProps.txt', 'r', encoding='utf-8') as f:
        raw_lines = f.readlines()

    # Not handling properties Full_Composition_Exclusion (redundant), Expands_On_* (deprecated),
    # FC_NFKC_Closure (deprecated), Changes_When_NFKC_Casefolded (redundant).
    handled_properties = ('NFD_QC', 'NFKD_QC', 'NFC_QC', 'NFKC_QC', 'NFKC_CF', 'NFKC_SCF')

    for raw_line in raw_lines:
        record = parse_line(raw_line)
        if not record:
            continue

        fields = record.split(';')
        if len(fields) < 3:
            # Boolean properties have only two fields; all of them are either
            # deprecated or redundant, so they are skipped.
            continue

        # A record covers either one code point or a range like '00A0..00A7'.
        first_hex, range_marker, last_hex = fields[0].strip().partition('..')
        first = int(first_hex, 16)
        last = int(last_hex, 16) if range_marker else first

        prop = fields[1].strip()
        value = fields[2].strip()
        if prop not in handled_properties:
            continue

        for code_point in range(first, last + 1):
            add_common_property(code_point, prop, value)
128+
def parse_derived_core_properties_txt():
    """Parse scripts/unicode/DerivedCoreProperties.txt into boolean properties.

    Each two-field record ('code_or_range ; property') is forwarded to
    add_boolean_property with value 'Y', once per code point in its range.
    Records that carry a third, non-empty value field are enumerated rather
    than boolean and are skipped.

    Raises:
        FileNotFoundError: If the data file does not exist.
    """
    if not os.path.exists('scripts/unicode/DerivedCoreProperties.txt'):
        raise FileNotFoundError("DerivedCoreProperties.txt not found in scripts/unicode directory.")

    with open('scripts/unicode/DerivedCoreProperties.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Skip Grapheme_Link (deprecated) and Indic_Conjunct_Break (for simplicity,
    # not binary). The data file spells the latter with its alias 'InCB', so
    # both spellings are listed.
    skipped_properties = ('Grapheme_Link', 'Indic_Conjunct_Break', 'InCB')

    for line in lines:
        line = parse_line(line)
        if not line:
            continue

        parts = [part.strip() for part in line.split(';')]
        code_point_hex_pair = parts[0]
        if '..' not in code_point_hex_pair:
            code_point_start = code_point_end = int(code_point_hex_pair, 16)
        else:
            # handle ranges like '00A0..00A7'
            code_point_hex_start, code_point_hex_end = code_point_hex_pair.split('..')
            code_point_start, code_point_end = int(code_point_hex_start, 16), int(code_point_hex_end, 16)

        prop = parts[1]
        # A non-empty third field is a property value, so the record is not boolean.
        has_value = len(parts) > 2 and parts[2] != ''
        if prop in skipped_properties or has_value:
            continue

        for code_point in range(code_point_start, code_point_end + 1):
            add_boolean_property(code_point, prop, 'Y')
156+
def _yaml_single_quoted(text):
    """Return ``text`` as a YAML single-quoted scalar, doubling embedded quotes."""
    return "'" + str(text).replace("'", "''") + "'"


def write_unicode_data_yaml():
    """Write the collected properties to src/qtil/strings/generated/unicode.yaml.

    The file holds one data-extension entry per extensible predicate
    (unicodeHasProperty, unicodeHasBooleanProperty, unicodeHasNumericProperty),
    each with one flow-sequence row per collected record. String columns are
    single-quoted with embedded quotes doubled; numeric values are written
    bare. The output has no trailing newline.
    """
    quote = _yaml_single_quoted
    sections = (
        ('unicodeHasProperty', [
            f"[{entry['code_point']}, {quote(entry['name'])}, {quote(entry['value'])}]"
            for entry in unicode_common_properties
        ]),
        ('unicodeHasBooleanProperty', [
            f"[{entry['code_point']}, {quote(entry['name'])}]"
            for entry in unicode_boolean_properties
        ]),
        ('unicodeHasNumericProperty', [
            f"[{entry['code_point']}, {quote(entry['name'])}, {entry['value']}]"
            for entry in unicode_numeric_properties
        ]),
    )

    lines = ['extensions:']
    for extensible, rows in sections:
        # 'data' is a sibling of 'addsTo' inside each list item, so it sits
        # two columns to the right of the '-' that opens the item.
        lines += [
            '  - addsTo:',
            '      pack: advanced-security/qtil',
            f'      extensible: {extensible}',
            '    data:',
        ]
        lines += [f'      - {row}' for row in rows]

    with open('src/qtil/strings/generated/unicode.yaml', 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
186+
if __name__ == "__main__":
    # Script entry point: explain the expected inputs, parse the three data
    # files in turn, then write the combined YAML. The banner names the same
    # output file (unicode.yaml) that write_unicode_data_yaml() creates.
    print("""
This script generates unicode.yaml from UnicodeData.txt, DerivedNormalizationProps.txt,
and DerivedCoreProperties.txt files in the scripts/unicode directory.

Download the latest Unicode data files from:
https://www.unicode.org/Public/UCD/latest/ucd/

Place the downloaded files in the scripts/unicode directory before running this script.

Running....
""")

    parse_unicode_data_txt()
    parse_derived_normalization_props_txt()
    parse_derived_core_properties_txt()
    write_unicode_data_yaml()

    print("SUCCESS! Unicode data has been successfully generated in 'src/qtil/strings/generated/unicode.yaml'.")
0 commit comments