@@ -61,6 +61,7 @@ class OpBuilderInputType(Enum):
61
61
STREAM_BUILDER = 2
62
62
UNION = 3
63
63
SEARCH = 4
64
+ SEARCH_RE = 5
64
65
65
66
66
67
def _build_levsearch (fst , term , max_dist ):
@@ -72,19 +73,28 @@ def _build_levsearch(fst, term, max_dist):
72
73
return lib .fst_set_levsearch (fst ._ptr , lev_ptr )
73
74
74
75
76
def _build_research(fst, pattern):
    """ Build a regular expression search on the given set.

    The pattern is encoded as UTF-8 and copied into a C ``char[]`` buffer,
    which is handed to ``lib.fst_regex_new`` through ``checked_call``
    together with the context of ``fst``.  The resulting regex pointer is
    then passed to ``lib.fst_set_regexsearch`` along with the set pointer.

    :param fst:     Object exposing the ``_ctx`` and ``_ptr`` attributes
                    used by the native library
    :param pattern: A regular expression (text string)
    :returns:       Whatever ``lib.fst_set_regexsearch`` returns
                    (presumably a pointer to a regex stream builder;
                    verify against the native library)
    """
    ctx = fst._ctx
    # Keep the buffer referenced by a local name for the duration of the
    # native call.
    c_pattern = ffi.new("char[]", pattern.encode('utf8'))
    regex_ptr = checked_call(lib.fst_regex_new, ctx, c_pattern)
    return lib.fst_set_regexsearch(fst._ptr, regex_ptr)
81
+
82
+
75
83
class OpBuilder (object ):
76
84
77
85
_BUILDERS = {
78
86
OpBuilderInputType .SET : lib .fst_set_make_opbuilder ,
79
87
OpBuilderInputType .STREAM_BUILDER : lib .fst_set_make_opbuilder_streambuilder ,
80
88
OpBuilderInputType .UNION : lib .fst_set_make_opbuilder_union ,
81
89
OpBuilderInputType .SEARCH : lib .fst_set_make_opbuilder_levstream ,
90
+ OpBuilderInputType .SEARCH_RE : lib .fst_set_make_opbuilder_regexstream ,
82
91
}
83
92
_PUSHERS = {
84
93
OpBuilderInputType .SET : lib .fst_set_opbuilder_push ,
85
94
OpBuilderInputType .STREAM_BUILDER : lib .fst_set_opbuilder_push_streambuilder ,
86
95
OpBuilderInputType .UNION : lib .fst_set_opbuilder_push_union ,
87
96
OpBuilderInputType .SEARCH : lib .fst_set_opbuilder_push_levstream ,
97
+ OpBuilderInputType .SEARCH_RE : lib .fst_set_opbuilder_push_regexstream ,
88
98
}
89
99
90
100
@classmethod
@@ -94,6 +104,13 @@ def from_search(cls, fst, term, max_dist):
94
104
input_type = OpBuilderInputType .SEARCH )
95
105
return opbuilder
96
106
107
@classmethod
def from_search_re(cls, fst, pattern):
    """ Create an :py:class:`OpBuilder` seeded with a regular expression
    search on a single set.

    :param fst:     The set to search
    :param pattern: A regular expression
    :returns:       An ``OpBuilder`` whose input type is
                    ``OpBuilderInputType.SEARCH_RE``
    """
    return OpBuilder(
        _build_research(fst, pattern),
        input_type=OpBuilderInputType.SEARCH_RE)
113
+
97
114
@classmethod
98
115
def from_slice (cls , set_ptr , s ):
99
116
sb = StreamBuilder .from_slice (set_ptr , s )
@@ -535,6 +552,38 @@ def search(self, term, max_dist):
535
552
opbuilder .push (_build_levsearch (fst , term , max_dist ))
536
553
return opbuilder .union ()
537
554
555
def search_re(self, pattern):
    """ Search all sets with a regular expression and return the union
    of the matches.

    Note that the regular expression syntax is not Python's, but the one
    supported by the `regex` Rust crate, which is almost identical to
    the syntax of the RE2 engine.

    For documentation of the syntax, see:
    http://doc.rust-lang.org/regex/regex/index.html#syntax

    Due to limitations of the underlying FST, only a subset of this
    syntax is supported. Most notably absent are:

        * Lazy quantifiers (``r'*?'``, ``r'+?'``)
        * Word boundaries (``r'\\b'``)
        * Other zero-width assertions (``r'^'``, ``r'$'``)

    For background on these limitations, consult the documentation of
    the Rust crate: http://burntsushi.net/rustdoc/fst/struct.Regex.html

    :param pattern: A regular expression
    :returns:       An iterator over all matching keys in the sets
    :rtype:         :py:class:`KeyStreamIterator`
    :raises ValueError: If fewer than two sets are available
    """
    if not len(self.sets) > 1:
        raise ValueError(
            "Must have more than one set to operate on.")
    # The first set seeds the builder; every further set is pushed as an
    # additional regex search before the union is taken.
    builder = OpBuilder.from_search_re(self.sets[0], pattern)
    for other_set in self.sets[1:]:
        builder.push(_build_research(other_set, pattern))
    return builder.union()
586
+
538
587
def symmetric_difference (self , * others ):
539
588
""" Get an iterator over the keys in the symmetric difference of this
540
589
set and others.
0 commit comments