1717# - confusables.txt
1818# - ReadMe.txt
1919# This script also uses the following Unicode UCD data:
20+ # - DerivedCoreProperties.txt
2021# - Scripts.txt
2122#
2223# Since this should not require frequent updates, we just store this
@@ -53,6 +54,8 @@ def fetch(f):
5354 sys .stderr .write ("cannot load %s\n " % f )
5455 exit (1 )
5556
57+ return f
58+
5659# Download a UCD table file
5760def fetch_unidata (f ):
5861 if not os .path .exists (os .path .basename (f )):
@@ -63,14 +66,14 @@ def fetch_unidata(f):
6366 sys .stderr .write ("cannot load %s" % f )
6467 exit (1 )
6568
66- # Loads code point data from IdentifierStatus.txt and
67- # IdentifierType.txt
68- # Implementation from unicode-segmentation
69+ return f
70+
71+ # Loads code point data from provided filename f
72+ # Implementation adapted from unicode-segmentation
6973def load_properties (f , interestingprops = None ):
70- fetch (f )
7174 props = {}
72- re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+) " )
73- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) " )
75+ re1 = re .compile (r"^ *([0-9A-F]+) *; *([^#\s]+) *# " )
76+ re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *# " )
7477
7578 for line in fileinput .input (os .path .basename (f ), openhook = fileinput .hook_encoded ("utf-8" )):
7679 prop = None
@@ -99,42 +102,6 @@ def load_properties(f, interestingprops = None):
99102
100103 return props
101104
# Loads script data from Scripts.txt
def load_script_properties(f, interestingprops):
    """Parse a UCD-style property file into per-property code point ranges.

    Calls fetch_unidata(f) first, then reads the file named
    os.path.basename(f) from the current directory.

    Args:
        f: name of the data file (e.g. "Scripts.txt").
        interestingprops: collection of property names to keep; when it is
            empty or None every property found in the file is kept.

    Returns:
        A dict mapping each property name to a list of (lo, hi) tuples of
        integer code points, in file order. A single-code-point line yields
        a tuple with lo == hi. Lines matching neither pattern (comments,
        blank lines) are skipped.
    """
    fetch_unidata(f)
    props = {}
    # Note: these regexes are different from those in unicode-segmentation,
    # because we need to handle spaces here
    re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")

    # Decode explicitly as UTF-8 (as load_properties does) instead of relying
    # on the platform default encoding.
    lines = fileinput.input(os.path.basename(f),
                            openhook=fileinput.hook_encoded("utf-8"))
    try:
        for line in lines:
            m = re1.match(line)
            if m:
                # Single code point: the range starts and ends on it.
                d_lo = d_hi = m.group(1)
                prop = m.group(2).strip()
            else:
                m = re2.match(line)
                if not m:
                    continue
                d_lo = m.group(1)
                d_hi = m.group(2)
                prop = m.group(3).strip()
            if interestingprops and prop not in interestingprops:
                continue
            props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))
    finally:
        # Always release the input stream, even if parsing fails midway.
        lines.close()

    return props
137-
138105# Loads confusables data from confusables.txt
139106def load_confusables (f ):
140107 fetch (f )
@@ -189,7 +156,7 @@ def load_scripts(f):
189156 # changes are introduced, update accordingly.
190157
191158 (longforms , shortforms ) = aliases ()
192- scripts = load_script_properties ( f , [])
159+ scripts = load_properties ( fetch_unidata ( f ) , [])
193160
194161 script_table = []
195162 script_list = []
@@ -546,10 +513,10 @@ def emit_identifier_module(f):
546513""" )
547514
548515 f .write (" // Identifier status table:\n " )
549- identifier_status_table = load_properties ("IdentifierStatus.txt" )
516+ identifier_status_table = load_properties (fetch ( "IdentifierStatus.txt" ) )
550517 emit_table (f , "IDENTIFIER_STATUS" , identifier_status_table ['Allowed' ], "&'static [(char, char)]" , is_pub = False ,
551518 pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])))
552- identifier_type = load_properties ("IdentifierType.txt" )
519+ identifier_type = load_properties (fetch ( "IdentifierType.txt" ) )
553520 type_table = []
554521 for ty in identifier_type :
555522 type_table .extend ([(x , y , ty ) for (x , y ) in identifier_type [ty ]])
@@ -560,6 +527,26 @@ def emit_identifier_module(f):
560527 pfun = lambda x : "(%s,%s, IdentifierType::%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ]))
561528 f .write ("}\n \n " )
562529
# Writes the Rust module `default_ignorable_code_point` to the file object f:
# the module header, a lookup function that searches DEFAULT_IGNORABLE via
# super::util::bsearch_range_table, the DEFAULT_IGNORABLE table itself, and
# the closing brace of the module.
530+ def emit_default_ignorable_detection_module (f ):
531+ f .write ("pub mod default_ignorable_code_point {" )
532+ f .write ("""
533+
534+ #[inline]
535+ pub fn default_ignorable_code_point(c: char) -> bool {
536+ match c as usize {
537+ _ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)
538+ }
539+ }
540+
541+ """ )
542+
# The table holds only the Default_Ignorable_Code_Point ranges loaded from
# DerivedCoreProperties.txt; it is handed to emit_table with is_pub = False
# and each (lo, hi) range is formatted through escape_char.
543+ f .write (" // Default ignorable code point table:\n " )
544+ default_ignorable_table = load_properties (fetch_unidata ("DerivedCoreProperties.txt" ), ["Default_Ignorable_Code_Point" ])
545+ emit_table (f , "DEFAULT_IGNORABLE" , default_ignorable_table ["Default_Ignorable_Code_Point" ], "&'static [(char, char)]" , is_pub = False ,
546+ pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])))
547+
# Closes the `pub mod` block opened by the first write above.
548+ f .write ("}\n \n " )
549+
563550def emit_confusable_detection_module (f ):
564551 f .write ("pub mod confusable_detection {" )
565552 f .write ("""
@@ -601,7 +588,7 @@ def emit_potiential_mixed_script_confusable(f):
601588 }
602589 }
603590""" )
604- identifier_status_table = load_properties ("IdentifierStatus.txt" )
591+ identifier_status_table = load_properties (fetch ( "IdentifierStatus.txt" ) )
605592 _ , scripts = load_scripts ("Scripts.txt" )
606593 identifier_allowed = identifier_status_table ['Allowed' ]
607594 (mixedscript_confusable , mixedscript_confusable_unresolved ) = load_potential_mixedscript_confusables ("confusables.txt" , identifier_allowed , scripts )
@@ -688,6 +675,8 @@ def emit_util_mod(f):
688675 emit_util_mod (rf )
689676 ### identifier module
690677 emit_identifier_module (rf )
678+ ### default_ignorable_detection module
679+ emit_default_ignorable_detection_module (rf )
691680 ### confusable_detection module
692681 emit_confusable_detection_module (rf )
693682 ### mixed_script_confusable_detection module
0 commit comments