1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <gtest/gtest.h>
19
20#include <cstdint>
21#include <iostream>
22#include <vector>
23
24#include "parquet/schema.h"
25#include "parquet/types.h"
26#include "parquet/util/comparison.h"
27
28namespace parquet {
29
30namespace test {
31
32using parquet::schema::NodePtr;
33using parquet::schema::PrimitiveNode;
34
35static ByteArray ByteArrayFromString(const std::string& s) {
36 auto ptr = reinterpret_cast<const uint8_t*>(s.data());
37 return ByteArray(static_cast<uint32_t>(s.size()), ptr);
38}
39
40static FLBA FLBAFromString(const std::string& s) {
41 auto ptr = reinterpret_cast<const uint8_t*>(s.data());
42 return FLBA(ptr);
43}
44
45TEST(Comparison, signedByteArray) {
46 NodePtr node =
47 PrimitiveNode::Make("SignedByteArray", Repetition::REQUIRED, Type::BYTE_ARRAY);
48 ColumnDescriptor descr(node, 0, 0);
49
50 CompareDefaultByteArray less;
51
52 std::string s1 = "12345";
53 std::string s2 = "12345678";
54 ByteArray s1ba = ByteArrayFromString(s1);
55 ByteArray s2ba = ByteArrayFromString(s2);
56 ASSERT_TRUE(less(s1ba, s2ba));
57
58 // This is case where signed comparision UTF-8 (PARQUET-686) is incorrect
59 // This example is to only check signed comparison and not UTF-8.
60 s1 = u8"bügeln";
61 s2 = u8"braten";
62 s1ba = ByteArrayFromString(s1);
63 s2ba = ByteArrayFromString(s2);
64 ASSERT_TRUE(less(s1ba, s2ba));
65}
66
67TEST(Comparison, UnsignedByteArray) {
68 NodePtr node = PrimitiveNode::Make("UnsignedByteArray", Repetition::REQUIRED,
69 Type::BYTE_ARRAY, LogicalType::UTF8);
70 ColumnDescriptor descr(node, 0, 0);
71
72 // Check if UTF-8 is compared using unsigned correctly
73 CompareUnsignedByteArray uless;
74
75 std::string s1 = "arrange";
76 std::string s2 = "arrangement";
77 ByteArray s1ba = ByteArrayFromString(s1);
78 ByteArray s2ba = ByteArrayFromString(s2);
79 ASSERT_TRUE(uless(s1ba, s2ba));
80
81 // Multi-byte UTF-8 characters
82 s1 = u8"braten";
83 s2 = u8"bügeln";
84 s1ba = ByteArrayFromString(s1);
85 s2ba = ByteArrayFromString(s2);
86 ASSERT_TRUE(uless(s1ba, s2ba));
87
88 s1 = u8"ünk123456"; // ü = 252
89 s2 = u8"ănk123456"; // ă = 259
90 s1ba = ByteArrayFromString(s1);
91 s2ba = ByteArrayFromString(s2);
92 ASSERT_TRUE(uless(s1ba, s2ba));
93}
94
95TEST(Comparison, SignedFLBA) {
96 int size = 10;
97 NodePtr node = PrimitiveNode::Make("SignedFLBA", Repetition::REQUIRED,
98 Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, size);
99 ColumnDescriptor descr(node, 0, 0);
100
101 CompareDefaultFLBA less(descr.type_length());
102
103 std::string s1 = "Anti123456";
104 std::string s2 = "Bunkd123456";
105 FLBA s1flba = FLBAFromString(s1);
106 FLBA s2flba = FLBAFromString(s2);
107 ASSERT_TRUE(less(s1flba, s2flba));
108
109 s1 = "Bünk123456";
110 s2 = "Bunk123456";
111 s1flba = FLBAFromString(s1);
112 s2flba = FLBAFromString(s2);
113 ASSERT_TRUE(less(s1flba, s2flba));
114}
115
116TEST(Comparison, UnsignedFLBA) {
117 int size = 10;
118 NodePtr node = PrimitiveNode::Make("UnsignedFLBA", Repetition::REQUIRED,
119 Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, size);
120 ColumnDescriptor descr(node, 0, 0);
121
122 CompareUnsignedFLBA uless(descr.type_length());
123
124 std::string s1 = "Anti123456";
125 std::string s2 = "Bunkd123456";
126 FLBA s1flba = FLBAFromString(s1);
127 FLBA s2flba = FLBAFromString(s2);
128 ASSERT_TRUE(uless(s1flba, s2flba));
129
130 s1 = "Bunk123456";
131 s2 = "Bünk123456";
132 s1flba = FLBAFromString(s1);
133 s2flba = FLBAFromString(s2);
134 ASSERT_TRUE(uless(s1flba, s2flba));
135}
136
137TEST(Comparison, SignedInt96) {
138 parquet::Int96 a{{1, 41, 14}}, b{{1, 41, 42}};
139 parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, 14}};
140 parquet::Int96 aaa{{1, 41, static_cast<uint32_t>(-14)}}, bbb{{1, 41, 42}};
141
142 NodePtr node = PrimitiveNode::Make("SignedInt96", Repetition::REQUIRED, Type::INT96);
143 ColumnDescriptor descr(node, 0, 0);
144
145 CompareDefaultInt96 less;
146
147 ASSERT_TRUE(less(a, b));
148 ASSERT_TRUE(!less(aa, bb) && !less(bb, aa));
149 ASSERT_TRUE(less(aaa, bbb));
150}
151
152TEST(Comparison, UnsignedInt96) {
153 parquet::Int96 a{{1, 41, 14}}, b{{1, static_cast<uint32_t>(-41), 42}};
154 parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, static_cast<uint32_t>(-14)}};
155 parquet::Int96 aaa, bbb;
156
157 NodePtr node = PrimitiveNode::Make("UnsignedInt96", Repetition::REQUIRED, Type::INT96);
158 ColumnDescriptor descr(node, 0, 0);
159
160 CompareUnsignedInt96 uless;
161
162 ASSERT_TRUE(uless(a, b));
163 ASSERT_TRUE(uless(aa, bb));
164
165 // INT96 Timestamp
166 aaa.value[2] = 2451545; // 2000-01-01
167 bbb.value[2] = 2451546; // 2000-01-02
168 // 12 hours + 34 minutes + 56 seconds.
169 Int96SetNanoSeconds(aaa, 45296000000000);
170 // 12 hours + 34 minutes + 50 seconds.
171 Int96SetNanoSeconds(bbb, 45290000000000);
172 ASSERT_TRUE(uless(aaa, bbb));
173
174 aaa.value[2] = 2451545; // 2000-01-01
175 bbb.value[2] = 2451545; // 2000-01-01
176 // 11 hours + 34 minutes + 56 seconds.
177 Int96SetNanoSeconds(aaa, 41696000000000);
178 // 12 hours + 34 minutes + 50 seconds.
179 Int96SetNanoSeconds(bbb, 45290000000000);
180 ASSERT_TRUE(uless(aaa, bbb));
181
182 aaa.value[2] = 2451545; // 2000-01-01
183 bbb.value[2] = 2451545; // 2000-01-01
184 // 12 hours + 34 minutes + 55 seconds.
185 Int96SetNanoSeconds(aaa, 45295000000000);
186 // 12 hours + 34 minutes + 56 seconds.
187 Int96SetNanoSeconds(bbb, 45296000000000);
188 ASSERT_TRUE(uless(aaa, bbb));
189}
190
191TEST(Comparison, SignedInt64) {
192 int64_t a = 1, b = 4;
193 int64_t aa = 1, bb = 1;
194 int64_t aaa = -1, bbb = 1;
195
196 NodePtr node = PrimitiveNode::Make("SignedInt64", Repetition::REQUIRED, Type::INT64);
197 ColumnDescriptor descr(node, 0, 0);
198
199 CompareDefaultInt64 less;
200
201 ASSERT_TRUE(less(a, b));
202 ASSERT_TRUE(!less(aa, bb) && !less(bb, aa));
203 ASSERT_TRUE(less(aaa, bbb));
204}
205
206TEST(Comparison, UnsignedInt64) {
207 uint64_t a = 1, b = 4;
208 uint64_t aa = 1, bb = 1;
209 uint64_t aaa = 1, bbb = -1;
210
211 NodePtr node = PrimitiveNode::Make("UnsignedInt64", Repetition::REQUIRED, Type::INT64);
212 ColumnDescriptor descr(node, 0, 0);
213
214 CompareUnsignedInt64 less;
215
216 ASSERT_TRUE(less(a, b));
217 ASSERT_TRUE(!less(aa, bb) && !less(bb, aa));
218 ASSERT_TRUE(less(aaa, bbb));
219}
220
221TEST(Comparison, UnsignedInt32) {
222 uint32_t a = 1, b = 4;
223 uint32_t aa = 1, bb = 1;
224 uint32_t aaa = 1, bbb = -1;
225
226 NodePtr node = PrimitiveNode::Make("UnsignedInt32", Repetition::REQUIRED, Type::INT32);
227 ColumnDescriptor descr(node, 0, 0);
228
229 CompareUnsignedInt32 less;
230
231 ASSERT_TRUE(less(a, b));
232 ASSERT_TRUE(!less(aa, bb) && !less(bb, aa));
233 ASSERT_TRUE(less(aaa, bbb));
234}
235
236TEST(Comparison, UnknownSortOrder) {
237 NodePtr node =
238 PrimitiveNode::Make("Unknown", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
239 LogicalType::INTERVAL, 12);
240 ColumnDescriptor descr(node, 0, 0);
241
242 ASSERT_THROW(Comparator::Make(&descr), ParquetException);
243}
244
245} // namespace test
246
247} // namespace parquet
248