1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RE2_UNICODE_CASEFOLD_H_ |
6 | #define RE2_UNICODE_CASEFOLD_H_ |
7 | |
8 | // Unicode case folding tables. |
9 | |
10 | // The Unicode case folding tables encode the mapping from one Unicode point |
11 | // to the next largest Unicode point with equivalent folding. The largest |
12 | // point wraps back to the first. For example, the tables map: |
13 | // |
14 | // 'A' -> 'a' |
15 | // 'a' -> 'A' |
16 | // |
17 | // 'K' -> 'k' |
18 | // 'k' -> 'K' (Kelvin symbol) |
19 | // 'K' -> 'K' |
20 | // |
21 | // Like everything Unicode, these tables are big. If we represent the table |
22 | // as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB. |
23 | // Most table entries look like the ones around them: |
24 | // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. |
25 | // Instead of listing all the pairs explicitly, we make a list of ranges |
26 | // and deltas, so that the table entries for 'A' through 'Z' can be represented |
27 | // as a single entry { 'A', 'Z', +32 }. |
28 | // |
29 | // In addition to blocks that map to each other (A-Z mapping to a-z) |
30 | // there are blocks of pairs that individually map to each other |
31 | // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). |
32 | // For those, the special delta value EvenOdd marks even/odd pairs |
33 | // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. |
34 | // |
35 | // In this form, the table has 274 entries, about 3kB. If we were to split |
36 | // the table into one for 16-bit codes and an overflow table for larger ones, |
37 | // we could get it down to about 1.5kB, but that's not worth the complexity. |
38 | // |
39 | // The grouped form also allows for efficient fold range calculations |
40 | // rather than looping one character at a time. |
41 | |
42 | #include <stdint.h> |
43 | |
44 | #include "util/util.h" |
45 | #include "util/utf.h" |
46 | |
47 | namespace re2 { |
48 | |
49 | enum { |
50 | EvenOdd = 1, |
51 | OddEven = -1, |
52 | EvenOddSkip = 1<<30, |
53 | OddEvenSkip, |
54 | }; |
55 | |
56 | struct CaseFold { |
57 | Rune lo; |
58 | Rune hi; |
59 | int32_t delta; |
60 | }; |
61 | |
62 | extern const CaseFold unicode_casefold[]; |
63 | extern const int num_unicode_casefold; |
64 | |
65 | extern const CaseFold unicode_tolower[]; |
66 | extern const int num_unicode_tolower; |
67 | |
68 | // Returns the CaseFold* in the tables that contains rune. |
69 | // If rune is not in the tables, returns the first CaseFold* after rune. |
70 | // If rune is larger than any value in the tables, returns NULL. |
71 | extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune); |
72 | |
73 | // Returns the result of applying the fold f to the rune r. |
74 | extern Rune ApplyFold(const CaseFold *f, Rune r); |
75 | |
76 | } // namespace re2 |
77 | |
78 | #endif // RE2_UNICODE_CASEFOLD_H_ |
79 | |