### This script generates src/qtil/strings/generated/unicode.yaml from the UnicodeData.txt,
### DerivedNormalizationProps.txt, and DerivedCoreProperties.txt files in the scripts/unicode
### directory.
###
### Usage:
### python generate_unicode.py ${unicode_version}
###
### Requires that the latest Unicode data files from https://www.unicode.org/Public/UCD/latest/ucd/
### have been downloaded and placed in the scripts/unicode directory before running this script.
9+
10+ import os
11+ import sys
12+
# The script takes exactly one command-line argument: the Unicode version
# string. Anything else prints the usage line and exits with status 1.
_cli_args = sys.argv[1:]
if len(_cli_args) != 1:
    print("Usage: python generate_unicode.py ${unicode_version}")
    sys.exit(1)

(unicode_version,) = _cli_args

# Module-level accumulators filled by the add_*_property helpers below.
unicode_common_properties = []   # dicts with 'code_point', 'name', 'value' (string value)
unicode_boolean_properties = []  # dicts with 'code_point', 'name'
unicode_numeric_properties = []  # dicts with 'code_point', 'name', 'value' (float value)
23+
24+
25+
def add_common_property(code_point, name, value):
    """Record a valued property for a code point.

    Appends a dict with the keys 'code_point', 'name' and 'value' to the
    module-level ``unicode_common_properties`` list. A value equal to the
    empty string is treated as "property absent" and nothing is recorded.
    """
    if value == '':
        return
    record = dict(code_point=code_point, name=name, value=value)
    unicode_common_properties.append(record)
33+
def add_boolean_property(code_point, name, value):
    """Record that a code point has a boolean property.

    Only the flag value 'Y' counts as "property present"; any other value is
    ignored. Present properties are appended to the module-level
    ``unicode_boolean_properties`` list as dicts with 'code_point' and 'name'.
    """
    if value != 'Y':
        return
    record = dict(code_point=code_point, name=name)
    unicode_boolean_properties.append(record)
40+
def add_numeric_property(code_point, name, value):
    """Record a numeric property for a code point as a float.

    ``value`` is either a plain number ('5') or a fraction written as
    'numerator/denominator' ('1/5'); fractions are evaluated to a float. The
    result is appended to the module-level ``unicode_numeric_properties`` list
    as a dict with 'code_point', 'name' and 'value'.
    """
    pieces = value.split('/')
    if len(pieces) > 1:
        # Fractional form: divide numerator by denominator.
        numeric_value = float(pieces[0]) / float(pieces[1])
    else:
        numeric_value = float(value)

    record = dict(code_point=code_point, name=name, value=numeric_value)
    unicode_numeric_properties.append(record)
54+
def parse_line(line):
    """Return the content of a data-file line without its comment.

    Everything from the first '#' onwards is discarded and the remainder is
    stripped of surrounding whitespace, so blank and comment-only lines yield
    the empty string.
    """
    content, _, _ = line.partition('#')
    return content.strip()
58+
def _add_unicode_data_record(code_point, fields):
    """Record the properties of one UnicodeData.txt record for one code point.

    ``fields`` is the ';'-separated record with every field already stripped.
    Decomposition data (field 5), Unicode_1_Name (10) and ISO_Comment (11) are
    not captured.
    """
    name = fields[1]
    general_category = fields[2]
    canonical_combining_class = fields[3]
    bidi_class = fields[4]
    decimal_value = fields[6]
    digit_value = fields[7]
    numeric_value = fields[8]
    bidi_mirrored = fields[9]
    simple_uppercase_mapping = fields[12]
    simple_lowercase_mapping = fields[13]
    simple_titlecase_mapping = fields[14]

    # Bracketed entries such as <control> or <CJK Ideograph, First> are labels,
    # not character names, so they are not recorded as Name.
    is_label = name.startswith('<') and name.endswith('>')
    if not is_label:
        add_common_property(code_point, 'Name', name)
    add_common_property(code_point, 'General_Category', general_category)
    add_common_property(code_point, 'Canonical_Combining_Class', canonical_combining_class)
    add_common_property(code_point, 'Bidi_Class', bidi_class)
    if numeric_value != '':
        add_numeric_property(code_point, 'Numeric_Value', numeric_value)
        # The numeric type follows from which of the three numeric fields
        # repeat the numeric value: field 6 -> Decimal, field 7 -> Digit,
        # otherwise plain Numeric.
        if decimal_value == numeric_value:
            add_common_property(code_point, 'Numeric_Type', 'Decimal')
        elif digit_value == numeric_value:
            add_common_property(code_point, 'Numeric_Type', 'Digit')
        else:
            add_common_property(code_point, 'Numeric_Type', 'Numeric')
    add_boolean_property(code_point, 'Bidi_Mirrored', bidi_mirrored)
    add_common_property(code_point, 'Simple_Uppercase_Mapping', simple_uppercase_mapping)
    add_common_property(code_point, 'Simple_Lowercase_Mapping', simple_lowercase_mapping)
    add_common_property(code_point, 'Simple_Titlecase_Mapping', simple_titlecase_mapping)


def parse_unicode_data_txt():
    """Load scripts/unicode/UnicodeData.txt into the module-level property lists.

    Each record contributes Name, General_Category, Canonical_Combining_Class,
    Bidi_Class, Numeric_Value/Numeric_Type, Bidi_Mirrored and the simple case
    mappings (empty fields are skipped by the add_* helpers).

    UnicodeData.txt abbreviates large blocks as a pair of records named
    ``<..., First>`` and ``<..., Last>``. Such a pair is expanded so that every
    code point from First to Last inclusive receives the pair's properties,
    rather than only the two boundary code points. Derived character names for
    those ranges are not generated.

    Raises:
        FileNotFoundError: if UnicodeData.txt is not in scripts/unicode.
    """
    path = 'scripts/unicode/UnicodeData.txt'
    if not os.path.exists(path):
        raise FileNotFoundError("UnicodeData.txt not found in scripts/unicode directory.")

    # (code_point, fields) of a "<..., First>" record whose "<..., Last>"
    # partner has not been seen yet.
    pending_first = None

    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = parse_line(raw_line)
            if not line:
                continue

            fields = [field.strip() for field in line.split(';')]
            code_point = int(fields[0], 16)
            name = fields[1]
            is_range_first = name.startswith('<') and name.endswith(', First>')
            is_range_last = name.startswith('<') and name.endswith(', Last>')

            if pending_first is not None and not is_range_last:
                # A First record without a matching Last: keep it as a single
                # code point instead of silently dropping it.
                _add_unicode_data_record(*pending_first)
                pending_first = None

            if is_range_first:
                pending_first = (code_point, fields)
                continue

            range_start = code_point
            if is_range_last and pending_first is not None:
                range_start = pending_first[0]
                pending_first = None

            for covered_code_point in range(range_start, code_point + 1):
                _add_unicode_data_record(covered_code_point, fields)

    if pending_first is not None:
        _add_unicode_data_record(*pending_first)
106+
def parse_derived_normalization_props_txt():
    """Load scripts/unicode/DerivedNormalizationProps.txt into unicode_common_properties.

    Only records with at least three ';'-separated fields are considered
    (two-field records are boolean properties, all deprecated or redundant),
    and of those only the quick-check and case-folding properties listed in
    ``wanted`` are kept. A record for a 'XXXX..YYYY' range is recorded once
    per code point in the range.

    Raises:
        FileNotFoundError: if the data file is not in scripts/unicode.
    """
    source_path = 'scripts/unicode/DerivedNormalizationProps.txt'
    if not os.path.exists(source_path):
        raise FileNotFoundError("DerivedNormalizationProps.txt not found in scripts/unicode directory.")

    with open(source_path, 'r', encoding='utf-8') as handle:
        raw_lines = list(handle)

    # Not handled: Full_Composition_Exclusion (redundant), Expands_On_* (deprecated),
    # FC_NFKC_Closure (deprecated), Changes_When_NFKC_Casefolded (redundant).
    wanted = ('NFD_QC', 'NFKD_QC', 'NFC_QC', 'NFKC_QC', 'NFKC_CF', 'NFKC_SCF')

    for raw in raw_lines:
        record = parse_line(raw)
        if not record:
            continue

        fields = record.split(';')
        if len(fields) < 3:
            continue

        span = fields[0].strip()
        if '..' in span:
            # A range such as '00A0..00A7' (both ends inclusive).
            low_hex, high_hex = span.split('..')
            low, high = int(low_hex, 16), int(high_hex, 16)
        else:
            low = high = int(span, 16)

        prop = fields[1].strip()
        value = fields[2].strip()
        if prop not in wanted:
            continue

        for code_point in range(low, high + 1):
            add_common_property(code_point, prop, value)
139+
def parse_derived_core_properties_txt():
    """Load scripts/unicode/DerivedCoreProperties.txt into unicode_boolean_properties.

    Every binary property record ('XXXX ; Prop' or 'XXXX..YYYY ; Prop') is
    recorded once per code point in its range. Skipped records:

    * Grapheme_Link, which is deprecated.
    * Indic_Conjunct_Break, which is enumerated rather than binary. The data
      file lists it under its short alias ``InCB`` with a value field
      ('094D ; InCB; Linker'), so it is excluded both by name and by the
      presence of a value field; otherwise it would be emitted as a bogus
      boolean property named 'InCB'.

    Raises:
        FileNotFoundError: if the data file is not in scripts/unicode.
    """
    path = 'scripts/unicode/DerivedCoreProperties.txt'
    if not os.path.exists(path):
        raise FileNotFoundError("DerivedCoreProperties.txt not found in scripts/unicode directory.")

    skipped_properties = {'Grapheme_Link', 'Indic_Conjunct_Break', 'InCB'}

    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = parse_line(raw_line)
            if not line:
                continue

            parts = [part.strip() for part in line.split(';')]
            code_point_hex_pair = parts[0]
            if '..' not in code_point_hex_pair:
                code_point_start = code_point_end = int(code_point_hex_pair, 16)
            else:
                # handle ranges like '00A0..00A7'
                code_point_hex_start, code_point_hex_end = code_point_hex_pair.split('..')
                code_point_start = int(code_point_hex_start, 16)
                code_point_end = int(code_point_hex_end, 16)

            prop = parts[1]
            # A non-empty third field means the record carries a property
            # value, i.e. the property is not binary.
            has_value_field = any(parts[2:])
            if prop in skipped_properties or has_value_field:
                continue

            for code_point in range(code_point_start, code_point_end + 1):
                add_boolean_property(code_point, prop, 'Y')
167+
def _yaml_single_quoted(text):
    """Return ``text`` as a YAML single-quoted scalar ('' escapes a quote)."""
    return "'" + str(text).replace("'", "''") + "'"


def _extension_header(extensible):
    """Return the 'addsTo'/'data' header of one data-extension entry.

    The text ends right after 'data:' with no trailing newline; each row
    written afterwards starts with its own newline.
    """
    return (
        '  - addsTo:\n'
        '      pack: advanced-security/qtil\n'
        f'      extensible: {extensible}\n'
        '    data:'
    )


def write_unicode_data_yaml():
    """Write the collected properties to src/qtil/strings/generated/unicode.yaml.

    The file is a data-extension document with four entries for the pack
    advanced-security/qtil: unicodeVersion (the version passed on the command
    line), unicodeHasProperty (code point, name, string value),
    unicodeHasBooleanProperty (code point, name) and unicodeHasNumericProperty
    (code point, name, float value). Nested keys are indented so that the
    document is valid YAML, and rows are emitted without padding inside the
    quotes. The file is written without a trailing newline.
    """
    row_indent = '\n      - '

    with open('src/qtil/strings/generated/unicode.yaml', 'w', encoding='utf-8') as f:
        f.write('extensions:\n' + _extension_header('unicodeVersion'))
        f.write(row_indent + '["' + unicode_version + '"]')

        f.write('\n' + _extension_header('unicodeHasProperty'))
        f.writelines(
            f"{row_indent}[{entry['code_point']}, "
            f"{_yaml_single_quoted(entry['name'])}, "
            f"{_yaml_single_quoted(entry['value'])}]"
            for entry in unicode_common_properties
        )

        f.write('\n' + _extension_header('unicodeHasBooleanProperty'))
        f.writelines(
            f"{row_indent}[{entry['code_point']}, "
            f"{_yaml_single_quoted(entry['name'])}]"
            for entry in unicode_boolean_properties
        )

        f.write('\n' + _extension_header('unicodeHasNumericProperty'))
        f.writelines(
            f"{row_indent}[{entry['code_point']}, "
            f"{_yaml_single_quoted(entry['name'])}, "
            f"{entry['value']}]"
            for entry in unicode_numeric_properties
        )
202+
if __name__ == "__main__":
    # Banner shown before any work starts; reminds the user where the input
    # files come from and where they must be placed.
    banner = """
This script generates unicode_data.yaml from UnicodeData.txt, DerivedNormalizationProps.txt,
and DerivedCoreProperties.txt files in the scripts/unicode directory.

Download the latest Unicode data files from:
https://www.unicode.org/Public/UCD/latest/ucd/

Place the downloaded files in the scripts/unicode directory before running this script.

Running....
"""
    print(banner)

    # Parse the three input files in order, then write the combined output.
    pipeline = (
        parse_unicode_data_txt,
        parse_derived_normalization_props_txt,
        parse_derived_core_properties_txt,
        write_unicode_data_yaml,
    )
    for step in pipeline:
        step()

    print("SUCCESS! Unicode data has been successfully generated in 'src/qtil/strings/generated/unicode.yaml'.")
0 commit comments