1/*
2 * Copyright (c) 2015-2016, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "Utf8ComponentClass.h"
30
31#include <algorithm>
32
33using namespace std;
34
35namespace ue2 {
36
/** \brief Generates the definition of getUcp<cat>(), which builds a
 * CodePointSet from the ucp_<cat>_def array.
 *
 * The array is consumed two entries at a time; each pair is handed to
 * CodePointSet::setRange() as the two ends of one range. */
#define UCP_FN(cat) \
CodePointSet getUcp##cat(void) { \
    CodePointSet result; \
    for (u32 pos = 0; pos < ARRAY_LENGTH(ucp_##cat##_def); pos += 2) { \
        result.setRange(ucp_##cat##_def[pos], ucp_##cat##_def[pos + 1]); \
    } \
    return result; \
}
45
46struct unicase {
47 unichar base;
48 unichar caseless;
49};
50
51} // namespace ue2
52
53#define UCP_TABLE_DEFINE_FN
54#include "ucp_table.h"
55
56namespace ue2 {
57
58static
59bool operator<(const unicase &a, const unicase &b) {
60 if (a.base < b.base) {
61 return true;
62 }
63
64 if (a.base > b.base) {
65 return false;
66 }
67
68 return a.caseless < b.caseless;
69}
70
71void make_caseless(CodePointSet *cps) {
72 assert(cps);
73 DEBUG_PRINTF("hello\n");
74 // Cheap optimisation: if we are empty or a dot, we're already caseless.
75 if (cps->begin() == cps->end()) {
76 DEBUG_PRINTF("empty\n");
77 return;
78 }
79 if (lower(*cps->begin()) == 0 && upper(*cps->begin()) == MAX_UNICODE) {
80 DEBUG_PRINTF("dot\n");
81 return;
82 }
83
84 CodePointSet base = *cps;
85
86 auto uc_begin = begin(ucp_caseless_def);
87 auto uc_end = end(ucp_caseless_def);
88 DEBUG_PRINTF("uc len %zd\n", distance(uc_begin, uc_end));
89
90 for (const auto &elem : base) {
91 unichar b = lower(elem);
92 unichar e = upper(elem) + 1;
93
94 for (; b < e; b++) {
95 DEBUG_PRINTF("decasing %x\n", b);
96 unicase test = {b, 0}; /* NUL is not a caseless version of anything,
97 * so we are ok */
98 uc_begin = lower_bound(uc_begin, uc_end, test);
99 if (uc_begin == uc_end) {
100 DEBUG_PRINTF("EOL\n");
101 return;
102 }
103 while (uc_begin != uc_end && uc_begin->base == b) {
104 DEBUG_PRINTF("at {%x,%x}\n", uc_begin->base, uc_begin->caseless);
105 cps->set(uc_begin->caseless);
106 ++uc_begin;
107 }
108 }
109 }
110}
111
112/** \brief Flip the case of the codepoint in c, if possible.
113 *
114 * Note that this assumes a one-to-one case mapping, which (though not
115 * realistic) is what PCRE does. */
116bool flip_case(unichar *c) {
117 assert(c);
118
119 const unicase test = { *c, 0 };
120
121 const auto uc_begin = begin(ucp_caseless_def);
122 const auto uc_end = end(ucp_caseless_def);
123 const auto f = lower_bound(uc_begin, uc_end, test);
124 if (f != uc_end && f->base == *c) {
125 DEBUG_PRINTF("flipped c=%x to %x\n", *c, f->caseless);
126 *c = f->caseless;
127 return true;
128 }
129 return false;
130}
131
132} // namespace ue2
133