| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | ********************************************************************** | 
|---|
| 5 | *   Copyright (C) 2005-2016, International Business Machines | 
|---|
| 6 | *   Corporation and others.  All Rights Reserved. | 
|---|
| 7 | ********************************************************************** | 
|---|
| 8 | */ | 
|---|
| 9 |  | 
|---|
| 10 | #include "unicode/utypes.h" | 
|---|
| 11 |  | 
|---|
| 12 | #if !UCONFIG_NO_CONVERSION | 
|---|
| 13 |  | 
|---|
| 14 | #include "inputext.h" | 
|---|
| 15 |  | 
|---|
| 16 | #include "cmemory.h" | 
|---|
| 17 | #include "cstring.h" | 
|---|
| 18 |  | 
|---|
| 19 | #include <string.h> | 
|---|
| 20 |  | 
|---|
| 21 | U_NAMESPACE_BEGIN | 
|---|
| 22 |  | 
|---|
// Capacity, in bytes, of the working buffer (fInputBytes) that receives the
// text handed to the detectors.
#define BUFFER_SIZE 8192

// Allocates raw storage for `count` objects of `type` with uprv_malloc() and
// casts the result to `type *`. The result is whatever uprv_malloc() returns,
// so callers must check it against NULL.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc(sizeof(type) * (count)))

// Hands a block obtained from NEW_ARRAY back to uprv_free().
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|---|
| 27 |  | 
|---|
| 28 | InputText::InputText(UErrorCode &status) | 
|---|
| 29 | : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been | 
|---|
| 30 | //   removed if appropriate. | 
|---|
| 31 | fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text. | 
|---|
| 32 | //   Value is percent, not absolute. | 
|---|
| 33 | fDeclaredEncoding(0), | 
|---|
| 34 | fRawInput(0), | 
|---|
| 35 | fRawLength(0) | 
|---|
| 36 | { | 
|---|
| 37 | if (fInputBytes == NULL || fByteStats == NULL) { | 
|---|
| 38 | status = U_MEMORY_ALLOCATION_ERROR; | 
|---|
| 39 | } | 
|---|
| 40 | } | 
|---|
| 41 |  | 
|---|
| 42 | InputText::~InputText() | 
|---|
| 43 | { | 
|---|
| 44 | DELETE_ARRAY(fDeclaredEncoding); | 
|---|
| 45 | DELETE_ARRAY(fByteStats); | 
|---|
| 46 | DELETE_ARRAY(fInputBytes); | 
|---|
| 47 | } | 
|---|
| 48 |  | 
|---|
| 49 | void InputText::setText(const char *in, int32_t len) | 
|---|
| 50 | { | 
|---|
| 51 | fInputLen  = 0; | 
|---|
| 52 | fC1Bytes   = FALSE; | 
|---|
| 53 | fRawInput  = (const uint8_t *) in; | 
|---|
| 54 | fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; | 
|---|
| 55 | } | 
|---|
| 56 |  | 
|---|
| 57 | void InputText::setDeclaredEncoding(const char* encoding, int32_t len) | 
|---|
| 58 | { | 
|---|
| 59 | if(encoding) { | 
|---|
| 60 | if (len == -1) { | 
|---|
| 61 | len = (int32_t)uprv_strlen(encoding); | 
|---|
| 62 | } | 
|---|
| 63 |  | 
|---|
| 64 | len += 1;     // to make place for the \0 at the end. | 
|---|
| 65 | uprv_free(fDeclaredEncoding); | 
|---|
| 66 | fDeclaredEncoding = NEW_ARRAY(char, len); | 
|---|
| 67 | uprv_strncpy(fDeclaredEncoding, encoding, len); | 
|---|
| 68 | } | 
|---|
| 69 | } | 
|---|
| 70 |  | 
|---|
| 71 | UBool InputText::isSet() const | 
|---|
| 72 | { | 
|---|
| 73 | return fRawInput != NULL; | 
|---|
| 74 | } | 
|---|
| 75 |  | 
|---|
| 76 | /** | 
|---|
| 77 | *  MungeInput - after getting a set of raw input data to be analyzed, preprocess | 
|---|
| 78 | *               it by removing what appears to be html markup. | 
|---|
| 79 | * | 
|---|
| 80 | * @internal | 
|---|
| 81 | */ | 
|---|
| 82 | void InputText::MungeInput(UBool fStripTags) { | 
|---|
| 83 | int     srci = 0; | 
|---|
| 84 | int     dsti = 0; | 
|---|
| 85 | uint8_t b; | 
|---|
| 86 | bool    inMarkup = FALSE; | 
|---|
| 87 | int32_t openTags = 0; | 
|---|
| 88 | int32_t badTags  = 0; | 
|---|
| 89 |  | 
|---|
| 90 | // | 
|---|
| 91 | //  html / xml markup stripping. | 
|---|
| 92 | //     quick and dirty, not 100% accurate, but hopefully good enough, statistically. | 
|---|
| 93 | //     discard everything within < brackets > | 
|---|
| 94 | //     Count how many total '<' and illegal (nested) '<' occur, so we can make some | 
|---|
| 95 | //     guess as to whether the input was actually marked up at all. | 
|---|
| 96 | // TODO: Think about how this interacts with EBCDIC charsets that are detected. | 
|---|
| 97 | if (fStripTags) { | 
|---|
| 98 | for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { | 
|---|
| 99 | b = fRawInput[srci]; | 
|---|
| 100 |  | 
|---|
| 101 | if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ | 
|---|
| 102 | if (inMarkup) { | 
|---|
| 103 | badTags += 1; | 
|---|
| 104 | } | 
|---|
| 105 |  | 
|---|
| 106 | inMarkup = TRUE; | 
|---|
| 107 | openTags += 1; | 
|---|
| 108 | } | 
|---|
| 109 |  | 
|---|
| 110 | if (! inMarkup) { | 
|---|
| 111 | fInputBytes[dsti++] = b; | 
|---|
| 112 | } | 
|---|
| 113 |  | 
|---|
| 114 | if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ | 
|---|
| 115 | inMarkup = FALSE; | 
|---|
| 116 | } | 
|---|
| 117 | } | 
|---|
| 118 |  | 
|---|
| 119 | fInputLen = dsti; | 
|---|
| 120 | } | 
|---|
| 121 |  | 
|---|
| 122 | // | 
|---|
| 123 | //  If it looks like this input wasn't marked up, or if it looks like it's | 
|---|
| 124 | //    essentially nothing but markup abandon the markup stripping. | 
|---|
| 125 | //    Detection will have to work on the unstripped input. | 
|---|
| 126 | // | 
|---|
| 127 | if (openTags<5 || openTags/5 < badTags || | 
|---|
| 128 | (fInputLen < 100 && fRawLength>600)) | 
|---|
| 129 | { | 
|---|
| 130 | int32_t limit = fRawLength; | 
|---|
| 131 |  | 
|---|
| 132 | if (limit > BUFFER_SIZE) { | 
|---|
| 133 | limit = BUFFER_SIZE; | 
|---|
| 134 | } | 
|---|
| 135 |  | 
|---|
| 136 | for (srci=0; srci<limit; srci++) { | 
|---|
| 137 | fInputBytes[srci] = fRawInput[srci]; | 
|---|
| 138 | } | 
|---|
| 139 |  | 
|---|
| 140 | fInputLen = srci; | 
|---|
| 141 | } | 
|---|
| 142 |  | 
|---|
| 143 | // | 
|---|
| 144 | // Tally up the byte occurence statistics. | 
|---|
| 145 | // These are available for use by the various detectors. | 
|---|
| 146 | // | 
|---|
| 147 |  | 
|---|
| 148 | uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); | 
|---|
| 149 |  | 
|---|
| 150 | for (srci = 0; srci < fInputLen; srci += 1) { | 
|---|
| 151 | fByteStats[fInputBytes[srci]] += 1; | 
|---|
| 152 | } | 
|---|
| 153 |  | 
|---|
| 154 | for (int32_t i = 0x80; i <= 0x9F; i += 1) { | 
|---|
| 155 | if (fByteStats[i] != 0) { | 
|---|
| 156 | fC1Bytes = TRUE; | 
|---|
| 157 | break; | 
|---|
| 158 | } | 
|---|
| 159 | } | 
|---|
| 160 | } | 
|---|
| 161 |  | 
|---|
| 162 | U_NAMESPACE_END | 
|---|
| 163 | #endif | 
|---|
| 164 |  | 
|---|
| 165 |  | 
|---|