From 2e0ffdbb57a74bc263175ea32f46280d6eb535d3 Mon Sep 17 00:00:00 2001 From: Mike Fairhurst Date: Mon, 21 Jul 2025 10:45:41 -0700 Subject: [PATCH] Implement general unicode property support, and some specific features. Adds UAX #44 identifier checking, and NFC quick check support, along with a few helpers like `isAscii` and `unescapeUnicode`. --- scripts/generate_unicode.py | 221 + src/qlpack.yml | 4 +- src/qtil/Qtil.qll | 1 + src/qtil/strings/Unicode.qll | 202 + src/qtil/strings/generated/unicode.yaml | 1117660 ++++++++++++++++++++ test/qtil/strings/UnicodeTest.expected | 1 + test/qtil/strings/UnicodeTest.ql | 188 + 7 files changed, 1118276 insertions(+), 1 deletion(-) create mode 100644 scripts/generate_unicode.py create mode 100644 src/qtil/strings/Unicode.qll create mode 100644 src/qtil/strings/generated/unicode.yaml create mode 100644 test/qtil/strings/UnicodeTest.expected create mode 100644 test/qtil/strings/UnicodeTest.ql diff --git a/scripts/generate_unicode.py b/scripts/generate_unicode.py new file mode 100644 index 0000000..aa17992 --- /dev/null +++ b/scripts/generate_unicode.py @@ -0,0 +1,221 @@ +### This script generates unicode_data.yaml from the UnicodeData.txt, DerivedNormalizationProps.txt, +### and DerivedCoreProperties.txt files in the scripts/unicode directory. +### +### Usage: +### python generate_unicode.py ${unicode_version} +### +### Requires that the latest Unicode data files from https://www.unicode.org/Public/UCD/latest/ucd/ +### be downloaded and placed in the scripts/unicode directory before running this script. 
import os
import sys

# Check that the user provided a Unicode version argument.
# The check only fires when this file is executed as a script, so importing
# the module (e.g. to exercise the helpers below) no longer terminates the
# interpreter via sys.exit().
if __name__ == "__main__" and len(sys.argv) != 2:
    print("Usage: python generate_unicode.py ${unicode_version}")
    sys.exit(1)

# Version string taken from the command line. When run as a script the check
# above guarantees exactly one argument; otherwise fall back to None.
unicode_version = sys.argv[1] if len(sys.argv) == 2 else None

# Accumulators filled by the add_*_property helpers below; every entry is a
# dict keyed by 'code_point' and 'name' (plus 'value' where applicable).
unicode_common_properties = []
unicode_boolean_properties = []
unicode_numeric_properties = []


def add_common_property(code_point: int, name: str, value: str) -> None:
    """Record a string-valued property for *code_point*.

    Empty values are skipped so that absent fields produce no entry.
    """
    if value != '':
        unicode_common_properties.append({
            'code_point': code_point,
            'name': name,
            'value': value
        })


def add_boolean_property(code_point: int, name: str, value: str) -> None:
    """Record a boolean property for *code_point* when *value* is 'Y'.

    Any other value (including the empty string) produces no entry.
    """
    if value == 'Y':
        unicode_boolean_properties.append({
            'code_point': code_point,
            'name': name
        })


def add_numeric_property(code_point: int, name: str, value: str) -> None:
    """Record a numeric property for *code_point* as a float.

    *value* is either a plain number ('5') or a fraction ('1/5'). An empty
    value is skipped, mirroring add_common_property, instead of raising
    ValueError from float('').

    Raises:
        ValueError: if a non-empty *value* is not a parseable number.
        ZeroDivisionError: if a fraction has a zero denominator.
    """
    if value == '':
        return

    # handle fractional values like '1/5'
    if '/' in value:
        # Convert '1/5' to a float
        parts = value.split('/')
        numeric_value = float(parts[0]) / float(parts[1])
    else:
        numeric_value = float(value)

    unicode_numeric_properties.append({
        'code_point': code_point,
        'name': name,
        'value': numeric_value
    })


# Trim comments (starting with '#') and whitespace from the line
def parse_line(line: str) -> str:
    """Return *line* without its trailing '#' comment and surrounding whitespace."""
    # Only the text before the first '#' matters, so split at most once.
    return line.split('#', 1)[0].strip()


def parse_unicode_data_txt():
    """Parse scripts/unicode/UnicodeData.txt, one semicolon-separated record per line.

    Raises:
        FileNotFoundError: if the data file is not present.
    """
    if not os.path.exists('scripts/unicode/UnicodeData.txt'):
        raise FileNotFoundError("UnicodeData.txt not found in scripts/unicode directory.")

    with open('scripts/unicode/UnicodeData.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()

    for line in lines:
        line = parse_line(line)
        if not line:
            continue

        parts = line.split(';')
        code_point_hex = parts[0].strip()
        code_point = int(code_point_hex, 16)
        name = parts[1].strip()
        general_category = parts[2].strip()
        canonical_combining_class = parts[3].strip()
        bidi_class = parts[4].strip()
        decomposition_data = parts[5].strip()
        numeric_part_a = parts[6].strip()
        numeric_part_b = parts[7].strip()
        numeric_part_c = parts[8].strip()
        bidi_mirrored = parts[9].strip()
        # Not captured: Unicode_1_Name, ISO_Comment
simple_uppercase_mapping = parts[12].strip() + simple_lowercase_mapping = parts[13].strip() + simple_titlecase_mapping = parts[14].strip() + + # exclude