1 | // © 2022 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | #ifndef MLBREAKENGINE_H |
5 | #define MLBREAKENGINE_H |
6 | |
7 | #include "hash.h" |
8 | #include "unicode/resbund.h" |
9 | #include "unicode/uniset.h" |
10 | #include "unicode/utext.h" |
11 | #include "uvectr32.h" |
12 | |
13 | U_NAMESPACE_BEGIN |
14 | |
15 | #if !UCONFIG_NO_BREAK_ITERATION |
16 | |
17 | /** |
18 | * A machine learning break engine for the phrase breaking in Japanese. |
19 | */ |
20 | class MlBreakEngine : public UMemory { |
21 | public: |
22 | /** |
23 | * Constructor. |
24 | * |
25 | * @param digitOrOpenPunctuationOrAlphabetSet An UnicodeSet with the digit, open punctuation and |
26 | * alphabet. |
27 | * @param closePunctuationSet An UnicodeSet with close punctuation. |
28 | * @param status Information on any errors encountered. |
29 | */ |
30 | MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet, |
31 | const UnicodeSet &closePunctuationSet, UErrorCode &status); |
32 | |
33 | /** |
34 | * Virtual destructor. |
35 | */ |
36 | virtual ~MlBreakEngine(); |
37 | |
38 | public: |
39 | /** |
40 | * Divide up a range of characters handled by this break engine. |
41 | * |
42 | * @param inText A UText representing the text |
43 | * @param rangeStart The start of the range of the characters |
44 | * @param rangeEnd The end of the range of the characters |
45 | * @param foundBreaks Output of C array of int32_t break positions, or 0 |
46 | * @param inString The normalized string of text ranging from rangeStart to rangeEnd |
47 | * @param inputMap The vector storing the native index of inText |
48 | * @param status Information on any errors encountered. |
49 | * @return The number of breaks found |
50 | */ |
51 | int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd, |
52 | UVector32 &foundBreaks, const UnicodeString &inString, |
53 | const LocalPointer<UVector32> &inputMap, UErrorCode &status) const; |
54 | |
55 | private: |
56 | /** |
57 | * Load the machine learning's model file. |
58 | * |
59 | * @param error Information on any errors encountered. |
60 | */ |
61 | void loadMLModel(UErrorCode &error); |
62 | |
63 | /** |
64 | * In the machine learning's model file, specify the name of the key and value to load the |
65 | * corresponding feature and its score. |
66 | * |
67 | * @param rb A ResouceBundle corresponding to the model file. |
68 | * @param keyName The kay name in the model file. |
69 | * @param valueName The value name in the model file. |
70 | * @param model A hashtable to store the pairs of the feature and its score. |
71 | * @param error Information on any errors encountered. |
72 | */ |
73 | void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName, |
74 | Hashtable &model, UErrorCode &error); |
75 | |
76 | /** |
77 | * Initialize the index list from the input string. |
78 | * |
79 | * @param inString A input string to be segmented. |
80 | * @param indexList A code unit index list of inString. |
81 | * @param status Information on any errors encountered. |
82 | * @return The number of code units of the first four characters in inString. |
83 | */ |
84 | int32_t initIndexList(const UnicodeString &inString, int32_t *indexList, |
85 | UErrorCode &status) const; |
86 | |
87 | /** |
88 | * Evaluate whether the index is a potential breakpoint. |
89 | * |
90 | * @param inString A input string to be segmented. |
91 | * @param indexList A code unit index list of the inString. |
92 | * @param startIdx The start index of the indexList. |
93 | * @param numCodeUnits The current code unit boundary of the indexList. |
94 | * @param numBreaks The accumulated number of breakpoints. |
95 | * @param boundary A vector including the index of the breakpoint. |
96 | * @param status Information on any errors encountered. |
97 | * @return The number of breakpoints |
98 | */ |
99 | int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx, |
100 | int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary, |
101 | UErrorCode &status) const; |
102 | |
103 | void printUnicodeString(const UnicodeString &s) const; |
104 | |
105 | UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; |
106 | UnicodeSet fClosePunctuationSet; |
107 | Hashtable fModel[13]; // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13 |
108 | int32_t fNegativeSum; |
109 | }; |
110 | |
111 | #endif |
112 | |
113 | U_NAMESPACE_END |
114 | |
115 | /* MLBREAKENGINE_H */ |
116 | #endif |
117 | |