// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

5#ifndef RE2_UNICODE_GROUPS_H_
6#define RE2_UNICODE_GROUPS_H_
7
// Unicode character groups.

// The codes are split into ranges of 16-bit codes
// and ranges of 32-bit codes. It would be simpler
// to use only 32-bit ranges, but these tables are large
// enough to warrant extra care.
//
// Using just 32-bit ranges gives 27 kB of data.
// Adding 16-bit ranges reduces that to 18 kB of data.
// Adding an extra table of 16-bit singletons would reduce
// it further to 16.5 kB of data but make the data harder
// to use; we don't bother.

#include <stdint.h>

#include "util/util.h"
#include "util/utf.h"

26namespace re2 {
27
// One range of codes whose two endpoints are stored as 16-bit values.
// NOTE(review): lo and hi are presumably inclusive bounds with lo <= hi;
// confirm against the code that consumes these tables.
struct URange16 {
  uint16_t lo;  // low endpoint of the range
  uint16_t hi;  // high endpoint of the range
};

34struct URange32
35{
36 Rune lo;
37 Rune hi;
38};
39
40struct UGroup
41{
42 const char *name;
43 int sign; // +1 for [abc], -1 for [^abc]
44 const URange16 *r16;
45 int nr16;
46 const URange32 *r32;
47 int nr32;
48};
49
50// Named by property or script name (e.g., "Nd", "N", "Han").
51// Negated groups are not included.
52extern const UGroup unicode_groups[];
53extern const int num_unicode_groups;
54
55// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
56// Negated groups are included.
57extern const UGroup posix_groups[];
58extern const int num_posix_groups;
59
60// Named by Perl name (e.g., "\\d", "\\D").
61// Negated groups are included.
62extern const UGroup perl_groups[];
63extern const int num_perl_groups;
64
65} // namespace re2
66
67#endif // RE2_UNICODE_GROUPS_H_
68