1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// regexst.h
5//
6// Copyright (C) 2004-2015, International Business Machines Corporation and others.
7// All Rights Reserved.
8//
9// This file contains class RegexStaticSets
10//
11// This class is internal to the regular expression implementation.
12// For the public Regular Expression API, see the file "unicode/regex.h"
13//
14// RegexStaticSets groups together the common UnicodeSets that are needed
15// for compiling or executing RegularExpressions. This grouping simplifies
16// the thread safe lazy creation and sharing of these sets across
17// all instances of regular expressions.
18//
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_REGULAR_EXPRESSIONS
22
23#include "unicode/unistr.h"
24#include "unicode/uniset.h"
25#include "unicode/uchar.h"
26#include "unicode/regex.h"
27#include "uprops.h"
28#include "cmemory.h"
29#include "cstring.h"
30#include "uassert.h"
31#include "ucln_in.h"
32#include "umutex.h"
33
34#include "regexcst.h" // Contains state table for the regex pattern parser.
35 // generated by a Perl script.
36#include "regexst.h"
37
38U_NAMESPACE_BEGIN
39
40// "Rule Char" Characters are those with special meaning, and therefore
41// need to be escaped to appear as literals in a regexp.
constexpr const char16_t *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";

// Letters that, when they follow a backslash, are resolved by ICU's
// unescape() function.
constexpr const char16_t *gUnescapeChars = u"acefnrtuUx";

// UnicodeSet pattern backing the regular expression \w (word character) class.
constexpr const char16_t *gIsWordPattern =
        u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";

// UnicodeSet pattern backing the regular expression \s (white space) class.
constexpr const char16_t *gIsSpacePattern = u"[\\p{WhiteSpace}]";

// UnicodeSet patterns used for grapheme cluster detection (\X):
// control and extend characters, followed by one pattern per
// Hangul_Syllable_Type value (L, V, T, LV, LVT).
constexpr const char16_t *gGC_ControlPattern =
        u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
constexpr const char16_t *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]";
constexpr const char16_t *gGC_LPattern      = u"[\\p{Hangul_Syllable_Type=L}]";
constexpr const char16_t *gGC_VPattern      = u"[\\p{Hangul_Syllable_Type=V}]";
constexpr const char16_t *gGC_TPattern      = u"[\\p{Hangul_Syllable_Type=T}]";
constexpr const char16_t *gGC_LVPattern     = u"[\\p{Hangul_Syllable_Type=LV}]";
constexpr const char16_t *gGC_LVTPattern    = u"[\\p{Hangul_Syllable_Type=LVT}]";
69
70
71RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
72UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;
73
74
75RegexStaticSets::RegexStaticSets(UErrorCode *status) {
76 // Initialize the shared static sets to their correct values.
77 fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
78 fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
79 fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
80 fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status).freeze();
81 fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(TRUE, gGC_ControlPattern, -1), *status).freeze();
82 fPropSets[URX_GC_L].applyPattern(UnicodeString(TRUE, gGC_LPattern, -1), *status).freeze();
83 fPropSets[URX_GC_V].applyPattern(UnicodeString(TRUE, gGC_VPattern, -1), *status).freeze();
84 fPropSets[URX_GC_T].applyPattern(UnicodeString(TRUE, gGC_TPattern, -1), *status).freeze();
85 fPropSets[URX_GC_LV].applyPattern(UnicodeString(TRUE, gGC_LVPattern, -1), *status).freeze();
86 fPropSets[URX_GC_LVT].applyPattern(UnicodeString(TRUE, gGC_LVTPattern, -1), *status).freeze();
87
88
89 //
90 // "Normal" is the set of characters that don't need special handling
91 // when finding grapheme cluster boundaries.
92 //
93 fPropSets[URX_GC_NORMAL].complement();
94 fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
95 fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
96 fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
97 fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
98 fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
99 fPropSets[URX_GC_NORMAL].freeze();
100
101 // Initialize the 8-bit fast bit sets from the parallel full
102 // UnicodeSets.
103 //
104 // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
105 // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
106 // This runs in exponential time, making it easy to adjust the time for
107 // convenient measuring.
108 //
109 // This 8 bit optimization dates from the early days of ICU,
110 // with a less optimized UnicodeSet. At the time, the difference
111 // was substantial.
112
113 for (int32_t i=0; i<URX_LAST_SET; i++) {
114 fPropSets8[i].init(&fPropSets[i]);
115 }
116
117 // Sets used while parsing rules, but not referenced from the parse state table
118 fRuleSets[kRuleSet_rule_char-128]
119 .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze();
120
121 fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
122 fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
123 fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
124
125 // Finally, initialize an empty UText string for utility purposes
126 fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);
127
128}
129
130
131RegexStaticSets::~RegexStaticSets() {
132 fRuleDigitsAlias = nullptr;
133 utext_close(fEmptyText);
134}
135
136
137//------------------------------------------------------------------------------
138//
139// regex_cleanup Memory cleanup function, free/delete all
140// cached memory. Called by ICU's u_cleanup() function.
141//
142//------------------------------------------------------------------------------
143
144U_CDECL_BEGIN
145static UBool U_CALLCONV
146regex_cleanup(void) {
147 delete RegexStaticSets::gStaticSets;
148 RegexStaticSets::gStaticSets = nullptr;
149 gStaticSetsInitOnce.reset();
150 return TRUE;
151}
152
153static void U_CALLCONV initStaticSets(UErrorCode &status) {
154 U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
155 ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
156 RegexStaticSets::gStaticSets = new RegexStaticSets(&status);
157 if (U_FAILURE(status)) {
158 delete RegexStaticSets::gStaticSets;
159 RegexStaticSets::gStaticSets = nullptr;
160 }
161 if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) {
162 status = U_MEMORY_ALLOCATION_ERROR;
163 }
164}
165U_CDECL_END
166
167void RegexStaticSets::initGlobals(UErrorCode *status) {
168 umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status);
169}
170
171U_NAMESPACE_END
172#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
173