inputext.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/inputext.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2005-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_CONVERSION
13
14	#include "inputext.h"
15
16	#include "cmemory.h"
17	#include "cstring.h"
18
19	#include <string.h>
20
21	U_NAMESPACE_BEGIN
22
// Capacity, in bytes, of the preprocessed input buffer (fInputBytes).
#define BUFFER_SIZE 8192

// Allocate an uninitialized array of `count` elements of `type` through
// uprv_malloc(). Evaluates to a `type *`, which is NULL if the allocation failed.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Release an array obtained from NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27
28	InputText::InputText(UErrorCode &status)
29	: fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
30	// removed if appropriate.
31	fByteStats(NEW_ARRAY(int16_t, `256`)), // byte frequency statistics for the input text.
32	// Value is percent, not absolute.
33	fDeclaredEncoding(`0`),
34	fRawInput(`0`),
35	fRawLength(`0`)
36	{
37	if (fInputBytes == NULL \|\| fByteStats == NULL) {
38	status = U_MEMORY_ALLOCATION_ERROR;
39	}
40	}
41
42	InputText::~InputText()
43	{
44	DELETE_ARRAY(fDeclaredEncoding);
45	DELETE_ARRAY(fByteStats);
46	DELETE_ARRAY(fInputBytes);
47	}
48
49	void InputText::setText(const char *in, int32_t len)
50	{
51	fInputLen = `0`;
52	fC1Bytes = FALSE;
53	fRawInput = (const uint8_t *) in;
54	fRawLength = len == -`1`? (int32_t)uprv_strlen(in) : len;
55	}
56
57	void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
58	{
59	if(encoding) {
60	if (len == -`1`) {
61	len = (int32_t)uprv_strlen(encoding);
62	}
63
64	len += `1`; // to make place for the \0 at the end.
65	uprv_free(fDeclaredEncoding);
66	fDeclaredEncoding = NEW_ARRAY(char, len);
67	uprv_strncpy(fDeclaredEncoding, encoding, len);
68	}
69	}
70
71	UBool InputText::isSet() const
72	{
73	return fRawInput != NULL;
74	}
75
76	/**
77	* MungeInput - after getting a set of raw input data to be analyzed, preprocess
78	* it by removing what appears to be html markup.
79	*
80	* @internal
81	*/
82	void InputText::MungeInput(UBool fStripTags) {
83	int srci = `0`;
84	int dsti = `0`;
85	uint8_t b;
86	bool inMarkup = FALSE;
87	int32_t openTags = `0`;
88	int32_t badTags = `0`;
89
90	//
91	// html / xml markup stripping.
92	// quick and dirty, not 100% accurate, but hopefully good enough, statistically.
93	// discard everything within < brackets >
94	// Count how many total '<' and illegal (nested) '<' occur, so we can make some
95	// guess as to whether the input was actually marked up at all.
96	// TODO: Think about how this interacts with EBCDIC charsets that are detected.
97	if (fStripTags) {
98	for (srci = `0`; srci < fRawLength && dsti < BUFFER_SIZE; srci += `1`) {
99	b = fRawInput[srci];
100
101	if (b == (uint8_t)`0x3C`) { / Check for the ASCII '<' /
102	if (inMarkup) {
103	badTags += `1`;
104	}
105
106	inMarkup = TRUE;
107	openTags += `1`;
108	}
109
110	if (! inMarkup) {
111	fInputBytes[dsti++] = b;
112	}
113
114	if (b == (uint8_t)`0x3E`) { / Check for the ASCII '>' /
115	inMarkup = FALSE;
116	}
117	}
118
119	fInputLen = dsti;
120	}
121
122	//
123	// If it looks like this input wasn't marked up, or if it looks like it's
124	// essentially nothing but markup abandon the markup stripping.
125	// Detection will have to work on the unstripped input.
126	//
127	if (openTags<`5` \|\| openTags/`5` < badTags \|\|
128	(fInputLen < `100` && fRawLength>`600`))
129	{
130	int32_t limit = fRawLength;
131
132	if (limit > BUFFER_SIZE) {
133	limit = BUFFER_SIZE;
134	}
135
136	for (srci=`0`; srci<limit; srci++) {
137	fInputBytes[srci] = fRawInput[srci];
138	}
139
140	fInputLen = srci;
141	}
142
143	//
144	// Tally up the byte occurence statistics.
145	// These are available for use by the various detectors.
146	//
147
148	uprv_memset(fByteStats, `0`, (sizeof fByteStats[`0`]) * `256`);
149
150	for (srci = `0`; srci < fInputLen; srci += `1`) {
151	fByteStats[fInputBytes[srci]] += `1`;
152	}
153
154	for (int32_t i = `0x80`; i <= `0x9F`; i += `1`) {
155	if (fByteStats[i] != `0`) {
156	fC1Bytes = TRUE;
157	break;
158	}
159	}
160	}
161
162	U_NAMESPACE_END
163	#endif
164
165

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/inputext.cpp