csrutf8.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/csrutf8.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2005-2014, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_CONVERSION
13
14	#include "csrutf8.h"
15	#include "csmatch.h"
16
17	U_NAMESPACE_BEGIN
18
19	CharsetRecog_UTF8::~CharsetRecog_UTF8()
20	{
21	// nothing to do
22	}
23
24	const char CharsetRecog_UTF8::getName() const*
25	{
26	return "UTF-8";
27	}
28
29	UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch results) const* {
30	bool hasBOM = FALSE;
31	int32_t numValid = `0`;
32	int32_t numInvalid = `0`;
33	const uint8_t *inputBytes = input->fRawInput;
34	int32_t i;
35	int32_t trailBytes = `0`;
36	int32_t confidence;
37
38	if (input->fRawLength >= `3` &&
39	inputBytes[`0`] == `0xEF` && inputBytes[`1`] == `0xBB` && inputBytes[`2`] == `0xBF`) {
40	hasBOM = TRUE;
41	}
42
43	// Scan for multi-byte sequences
44	for (i=`0`; i < input->fRawLength; i += `1`) {
45	int32_t b = inputBytes[i];
46
47	if ((b & `0x80`) == `0`) {
48	continue; // ASCII
49	}
50
51	// Hi bit on char found. Figure out how long the sequence should be
52	if ((b & `0x0E0`) == `0x0C0`) {
53	trailBytes = `1`;
54	} else if ((b & `0x0F0`) == `0x0E0`) {
55	trailBytes = `2`;
56	} else if ((b & `0x0F8`) == `0xF0`) {
57	trailBytes = `3`;
58	} else {
59	numInvalid += `1`;
60	continue;
61	}
62
63	// Verify that we've got the right number of trail bytes in the sequence
64	for (;;) {
65	i += `1`;
66
67	if (i >= input->fRawLength) {
68	break;
69	}
70
71	b = inputBytes[i];
72
73	if ((b & `0xC0`) != `0x080`) {
74	numInvalid += `1`;
75	break;
76	}
77
78	if (--trailBytes == `0`) {
79	numValid += `1`;
80	break;
81	}
82	}
83
84	}
85
86	// Cook up some sort of confidence score, based on presence of a BOM
87	// and the existence of valid and/or invalid multi-byte sequences.
88	confidence = `0`;
89	if (hasBOM && numInvalid == `0`) {
90	confidence = `100`;
91	} else if (hasBOM && numValid > numInvalid*`10`) {
92	confidence = `80`;
93	} else if (numValid > `3` && numInvalid == `0`) {
94	confidence = `100`;
95	} else if (numValid > `0` && numInvalid == `0`) {
96	confidence = `80`;
97	} else if (numValid == `0` && numInvalid == `0`) {
98	// Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
99	// accepts ASCII with confidence = 10.
100	confidence = `15`;
101	} else if (numValid > numInvalid*`10`) {
102	// Probably corruput utf-8 data. Valid sequences aren't likely by chance.
103	confidence = `25`;
104	}
105
106	results->set(input, this, confidence);
107	return (confidence > `0`);
108	}
109
110	U_NAMESPACE_END
111	#endif
112

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/csrutf8.cpp