1// Copyright 2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#ifndef RE2_UNICODE_CASEFOLD_H_
6#define RE2_UNICODE_CASEFOLD_H_
7
8// Unicode case folding tables.
9
10// The Unicode case folding tables encode the mapping from one Unicode point
11// to the next largest Unicode point with equivalent folding. The largest
12// point wraps back to the first. For example, the tables map:
13//
14// 'A' -> 'a'
15// 'a' -> 'A'
16//
17// 'K' -> 'k'
18// 'k' -> 'K' (Kelvin symbol)
19// 'K' -> 'K'
20//
21// Like everything Unicode, these tables are big. If we represent the table
22// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
23// Most table entries look like the ones around them:
24// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
25// Instead of listing all the pairs explicitly, we make a list of ranges
26// and deltas, so that the table entries for 'A' through 'Z' can be represented
27// as a single entry { 'A', 'Z', +32 }.
28//
29// In addition to blocks that map to each other (A-Z mapping to a-z)
30// there are blocks of pairs that individually map to each other
31// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
32// For those, the special delta value EvenOdd marks even/odd pairs
33// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
34//
35// In this form, the table has 274 entries, about 3kB. If we were to split
36// the table into one for 16-bit codes and an overflow table for larger ones,
37// we could get it down to about 1.5kB, but that's not worth the complexity.
38//
39// The grouped form also allows for efficient fold range calculations
40// rather than looping one character at a time.
41
42#include <stdint.h>
43
44#include "util/util.h"
45#include "util/utf.h"
46
47namespace re2 {
48
49enum {
50 EvenOdd = 1,
51 OddEven = -1,
52 EvenOddSkip = 1<<30,
53 OddEvenSkip,
54};
55
56struct CaseFold {
57 Rune lo;
58 Rune hi;
59 int32_t delta;
60};
61
62extern const CaseFold unicode_casefold[];
63extern const int num_unicode_casefold;
64
65extern const CaseFold unicode_tolower[];
66extern const int num_unicode_tolower;
67
68// Returns the CaseFold* in the tables that contains rune.
69// If rune is not in the tables, returns the first CaseFold* after rune.
70// If rune is larger than any value in the tables, returns NULL.
71extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
72
73// Returns the result of applying the fold f to the rune r.
74extern Rune ApplyFold(const CaseFold *f, Rune r);
75
76} // namespace re2
77
78#endif // RE2_UNICODE_CASEFOLD_H_
79