1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2016-2021 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
/* This module contains an internal function that is used to match a Unicode
extended grapheme sequence. It is used by both pcre2_match() and
pcre2_dfa_match(). However, it is called only when Unicode support is
compiled in. Nevertheless, we provide a dummy function when there is no
Unicode support, because some compilers do not like functionless source
files. */
46 | |
47 | |
48 | #ifdef HAVE_CONFIG_H |
49 | #include "config.h" |
50 | #endif |
51 | |
52 | |
53 | #include "pcre2_internal.h" |
54 | |
55 | |
56 | /* Dummy function */ |
57 | |
58 | #ifndef SUPPORT_UNICODE |
59 | PCRE2_SPTR |
60 | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, |
61 | PCRE2_SPTR end_subject, BOOL utf, int *xcount) |
62 | { |
63 | (void)c; |
64 | (void)eptr; |
65 | (void)start_subject; |
66 | (void)end_subject; |
67 | (void)utf; |
68 | (void)xcount; |
69 | return NULL; |
70 | } |
71 | #else |
72 | |
73 | |
74 | /************************************************* |
75 | * Match an extended grapheme sequence * |
76 | *************************************************/ |
77 | |
78 | /* |
79 | Arguments: |
80 | c the first character |
81 | eptr pointer to next character |
82 | start_subject pointer to start of subject |
83 | end_subject pointer to end of subject |
84 | utf TRUE if in UTF mode |
85 | xcount pointer to count of additional characters, |
86 | or NULL if count not needed |
87 | |
88 | Returns: pointer after the end of the sequence |
89 | */ |
90 | |
91 | PCRE2_SPTR |
92 | PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, |
93 | PCRE2_SPTR end_subject, BOOL utf, int *xcount) |
94 | { |
95 | int lgb = UCD_GRAPHBREAK(c); |
96 | |
97 | while (eptr < end_subject) |
98 | { |
99 | int rgb; |
100 | int len = 1; |
101 | if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } |
102 | rgb = UCD_GRAPHBREAK(c); |
103 | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; |
104 | |
105 | /* Not breaking between Regional Indicators is allowed only if there |
106 | are an even number of preceding RIs. */ |
107 | |
108 | if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) |
109 | { |
110 | int ricount = 0; |
111 | PCRE2_SPTR bptr = eptr - 1; |
112 | if (utf) BACKCHAR(bptr); |
113 | |
114 | /* bptr is pointing to the left-hand character */ |
115 | |
116 | while (bptr > start_subject) |
117 | { |
118 | bptr--; |
119 | if (utf) |
120 | { |
121 | BACKCHAR(bptr); |
122 | GETCHAR(c, bptr); |
123 | } |
124 | else |
125 | c = *bptr; |
126 | if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; |
127 | ricount++; |
128 | } |
129 | if ((ricount & 1) != 0) break; /* Grapheme break required */ |
130 | } |
131 | |
132 | /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this |
133 | allows any number of them before a following Extended_Pictographic. */ |
134 | |
135 | if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) || |
136 | lgb != ucp_gbExtended_Pictographic) |
137 | lgb = rgb; |
138 | |
139 | eptr += len; |
140 | if (xcount != NULL) *xcount += 1; |
141 | } |
142 | |
143 | return eptr; |
144 | } |
145 | |
146 | #endif /* SUPPORT_UNICODE */ |
147 | |
148 | /* End of pcre2_extuni.c */ |
149 | |