1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/sparse_tensor.h"
19
#include <algorithm>
#include <functional>
#include <memory>
#include <numeric>
23
24#include "arrow/compare.h"
25#include "arrow/util/logging.h"
26
27namespace arrow {
28
29namespace {
30
31// ----------------------------------------------------------------------
32// SparseTensorConverter
33
34template <typename TYPE, typename SparseIndexType>
35class SparseTensorConverter {
36 public:
37 explicit SparseTensorConverter(const NumericTensor<TYPE>&) {}
38
39 Status Convert() { return Status::Invalid("Unsupported sparse index"); }
40};
41
42// ----------------------------------------------------------------------
43// SparseTensorConverter for SparseCOOIndex
44
// Common state and helpers shared by the concrete converters: holds a
// reference to the dense input tensor and provides nonzero counting.
template <typename TYPE>
struct SparseTensorConverterBase {
  using NumericTensorType = NumericTensor<TYPE>;
  using value_type = typename NumericTensorType::value_type;

  explicit SparseTensorConverterBase(const NumericTensorType& tensor) : tensor_(tensor) {}

  // True when the elements can be visited as a single flat run of memory:
  // scalars/vectors, or any contiguous layout.
  bool TensorIsTriviallyIterable() const {
    return tensor_.ndim() <= 1 || tensor_.is_contiguous();
  }

  // Count the elements that differ from zero.  Uses a flat scan when the
  // layout allows it, otherwise walks every n-d coordinate.
  size_t CountNonZero() const {
    if (tensor_.size() == 0) {
      return 0;
    }

    if (TensorIsTriviallyIterable()) {
      const value_type* data = reinterpret_cast<const value_type*>(tensor_.raw_data());
      return std::count_if(data, data + tensor_.size(),
                           [](value_type x) { return x != 0; });
    }

    const std::vector<int64_t>& shape = tensor_.shape();
    const int64_t ndim = tensor_.ndim();

    size_t count = 0;
    std::vector<int64_t> coord(ndim, 0);
    for (int64_t n = tensor_.size(); n > 0; n--) {
      if (tensor_.Value(coord) != 0) {
        ++count;
      }

      // increment index: bump the innermost coordinate, then carry into outer
      // dimensions while any coordinate has reached its extent.  The carry is
      // skipped on the final element (n == 1) so coord is never advanced past
      // the end of the tensor.
      ++coord[ndim - 1];
      if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) {
        int64_t d = ndim - 1;
        while (d > 0 && coord[d] == shape[d]) {
          coord[d] = 0;
          ++coord[d - 1];
          --d;
        }
      }
    }
    return count;
  }

  // Non-owning reference; the tensor must outlive the converter.
  const NumericTensorType& tensor_;
};
93
94template <typename TYPE>
95class SparseTensorConverter<TYPE, SparseCOOIndex>
96 : private SparseTensorConverterBase<TYPE> {
97 public:
98 using BaseClass = SparseTensorConverterBase<TYPE>;
99 using NumericTensorType = typename BaseClass::NumericTensorType;
100 using value_type = typename BaseClass::value_type;
101
102 explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {}
103
104 Status Convert() {
105 const int64_t ndim = tensor_.ndim();
106 const int64_t nonzero_count = static_cast<int64_t>(CountNonZero());
107
108 std::shared_ptr<Buffer> indices_buffer;
109 RETURN_NOT_OK(
110 AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer));
111 int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data());
112
113 std::shared_ptr<Buffer> values_buffer;
114 RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer));
115 value_type* values = reinterpret_cast<value_type*>(values_buffer->mutable_data());
116
117 if (ndim <= 1) {
118 const value_type* data = reinterpret_cast<const value_type*>(tensor_.raw_data());
119 const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0];
120 for (int64_t i = 0; i < count; ++i, ++data) {
121 if (*data != 0) {
122 *indices++ = i;
123 *values++ = *data;
124 }
125 }
126 } else {
127 const std::vector<int64_t>& shape = tensor_.shape();
128 std::vector<int64_t> coord(ndim, 0);
129
130 for (int64_t n = tensor_.size(); n > 0; n--) {
131 const value_type x = tensor_.Value(coord);
132 if (tensor_.Value(coord) != 0) {
133 *values++ = x;
134
135 int64_t* indp = indices;
136 for (int64_t i = 0; i < ndim; ++i) {
137 *indp = coord[i];
138 indp += nonzero_count;
139 }
140 indices++;
141 }
142
143 // increment index
144 ++coord[ndim - 1];
145 if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) {
146 int64_t d = ndim - 1;
147 while (d > 0 && coord[d] == shape[d]) {
148 coord[d] = 0;
149 ++coord[d - 1];
150 --d;
151 }
152 }
153 }
154 }
155
156 // make results
157 const std::vector<int64_t> indices_shape = {nonzero_count, ndim};
158 const int64_t indices_elsize = sizeof(int64_t);
159 const std::vector<int64_t> indices_strides = {indices_elsize,
160 indices_elsize * nonzero_count};
161 sparse_index =
162 std::make_shared<SparseCOOIndex>(std::make_shared<SparseCOOIndex::CoordsTensor>(
163 indices_buffer, indices_shape, indices_strides));
164 data = values_buffer;
165
166 return Status::OK();
167 }
168
169 std::shared_ptr<SparseCOOIndex> sparse_index;
170 std::shared_ptr<Buffer> data;
171
172 private:
173 using SparseTensorConverterBase<TYPE>::tensor_;
174 using SparseTensorConverterBase<TYPE>::CountNonZero;
175};
176
177template <typename TYPE, typename SparseIndexType>
178void MakeSparseTensorFromTensor(const Tensor& tensor,
179 std::shared_ptr<SparseIndex>* sparse_index,
180 std::shared_ptr<Buffer>* data) {
181 NumericTensor<TYPE> numeric_tensor(tensor.data(), tensor.shape(), tensor.strides());
182 SparseTensorConverter<TYPE, SparseIndexType> converter(numeric_tensor);
183 DCHECK_OK(converter.Convert());
184 *sparse_index = converter.sparse_index;
185 *data = converter.data;
186}
187
188// ----------------------------------------------------------------------
189// SparseTensorConverter for SparseCSRIndex
190
191template <typename TYPE>
192class SparseTensorConverter<TYPE, SparseCSRIndex>
193 : private SparseTensorConverterBase<TYPE> {
194 public:
195 using BaseClass = SparseTensorConverterBase<TYPE>;
196 using NumericTensorType = typename BaseClass::NumericTensorType;
197 using value_type = typename BaseClass::value_type;
198
199 explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {}
200
201 Status Convert() {
202 const int64_t ndim = tensor_.ndim();
203 if (ndim > 2) {
204 return Status::Invalid("Invalid tensor dimension");
205 }
206
207 const int64_t nr = tensor_.shape()[0];
208 const int64_t nc = tensor_.shape()[1];
209 const int64_t nonzero_count = static_cast<int64_t>(CountNonZero());
210
211 std::shared_ptr<Buffer> indptr_buffer;
212 std::shared_ptr<Buffer> indices_buffer;
213
214 std::shared_ptr<Buffer> values_buffer;
215 RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer));
216 value_type* values = reinterpret_cast<value_type*>(values_buffer->mutable_data());
217
218 if (ndim <= 1) {
219 return Status::NotImplemented("TODO for ndim <= 1");
220 } else {
221 RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * (nr + 1), &indptr_buffer));
222 int64_t* indptr = reinterpret_cast<int64_t*>(indptr_buffer->mutable_data());
223
224 RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * nonzero_count, &indices_buffer));
225 int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data());
226
227 int64_t k = 0;
228 *indptr++ = 0;
229 for (int64_t i = 0; i < nr; ++i) {
230 for (int64_t j = 0; j < nc; ++j) {
231 const value_type x = tensor_.Value({i, j});
232 if (x != 0) {
233 *values++ = x;
234 *indices++ = j;
235 k++;
236 }
237 }
238 *indptr++ = k;
239 }
240 }
241
242 std::vector<int64_t> indptr_shape({nr + 1});
243 std::shared_ptr<SparseCSRIndex::IndexTensor> indptr_tensor =
244 std::make_shared<SparseCSRIndex::IndexTensor>(indptr_buffer, indptr_shape);
245
246 std::vector<int64_t> indices_shape({nonzero_count});
247 std::shared_ptr<SparseCSRIndex::IndexTensor> indices_tensor =
248 std::make_shared<SparseCSRIndex::IndexTensor>(indices_buffer, indices_shape);
249
250 sparse_index = std::make_shared<SparseCSRIndex>(indptr_tensor, indices_tensor);
251 data = values_buffer;
252
253 return Status::OK();
254 }
255
256 std::shared_ptr<SparseCSRIndex> sparse_index;
257 std::shared_ptr<Buffer> data;
258
259 private:
260 using BaseClass::tensor_;
261 using SparseTensorConverterBase<TYPE>::CountNonZero;
262};
263
264// ----------------------------------------------------------------------
265// Instantiate templates
266
// Explicitly instantiate the converter for every supported numeric value type
// so definitions are emitted for both index kinds within this translation
// unit.
#define INSTANTIATE_SPARSE_TENSOR_CONVERTER(IndexType)          \
  template class SparseTensorConverter<UInt8Type, IndexType>;   \
  template class SparseTensorConverter<UInt16Type, IndexType>;  \
  template class SparseTensorConverter<UInt32Type, IndexType>;  \
  template class SparseTensorConverter<UInt64Type, IndexType>;  \
  template class SparseTensorConverter<Int8Type, IndexType>;    \
  template class SparseTensorConverter<Int16Type, IndexType>;   \
  template class SparseTensorConverter<Int32Type, IndexType>;   \
  template class SparseTensorConverter<Int64Type, IndexType>;   \
  template class SparseTensorConverter<HalfFloatType, IndexType>; \
  template class SparseTensorConverter<FloatType, IndexType>;   \
  template class SparseTensorConverter<DoubleType, IndexType>

INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex);
INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex);
282
283} // namespace
284
285// ----------------------------------------------------------------------
286// SparseCOOIndex
287
// Constructor with a column-major NumericTensor
// coords has shape (nonzero_count, ndim); row i holds the n-d coordinates of
// the i-th nonzero value.  Column-major layout is required (debug-checked).
SparseCOOIndex::SparseCOOIndex(const std::shared_ptr<CoordsTensor>& coords)
    : SparseIndexBase(coords->shape()[0]), coords_(coords) {
  DCHECK(coords_->is_column_major());
}
293
294std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); }
295
296// ----------------------------------------------------------------------
297// SparseCSRIndex
298
// Constructor with two index vectors
// indptr holds per-row offsets (length #rows + 1); indices holds the column
// index of every nonzero value.  Both must be one-dimensional tensors.
SparseCSRIndex::SparseCSRIndex(const std::shared_ptr<IndexTensor>& indptr,
                               const std::shared_ptr<IndexTensor>& indices)
    : SparseIndexBase(indices->shape()[0]), indptr_(indptr), indices_(indices) {
  DCHECK_EQ(1, indptr_->ndim());
  DCHECK_EQ(1, indices_->ndim());
}
306
307std::string SparseCSRIndex::ToString() const { return std::string("SparseCSRIndex"); }
308
309// ----------------------------------------------------------------------
310// SparseTensor
311
// Constructor with all attributes
// dim_names may be empty; see dim_name() for lookup behavior.  Only type ids
// supported by dense tensors are accepted (debug-checked).
SparseTensor::SparseTensor(const std::shared_ptr<DataType>& type,
                           const std::shared_ptr<Buffer>& data,
                           const std::vector<int64_t>& shape,
                           const std::shared_ptr<SparseIndex>& sparse_index,
                           const std::vector<std::string>& dim_names)
    : type_(type),
      data_(data),
      shape_(shape),
      sparse_index_(sparse_index),
      dim_names_(dim_names) {
  DCHECK(is_tensor_supported(type->id()));
}
325
326const std::string& SparseTensor::dim_name(int i) const {
327 static const std::string kEmpty = "";
328 if (dim_names_.size() == 0) {
329 return kEmpty;
330 } else {
331 DCHECK_LT(i, static_cast<int>(dim_names_.size()));
332 return dim_names_[i];
333 }
334}
335
336int64_t SparseTensor::size() const {
337 return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies<int64_t>());
338}
339
// Deep equality comparison, delegated to the comparison kernels in
// arrow/compare.h.
bool SparseTensor::Equals(const SparseTensor& other) const {
  return SparseTensorEquals(*this, other);
}
343
344// ----------------------------------------------------------------------
345// SparseTensorImpl
346
// Constructor without data
// Builds an empty sparse tensor (null index, null data buffer) of the given
// type and shape by delegating to the full-attribute constructor.  (The old
// comment here said "with a dense tensor", but this overload takes none.)
template <typename SparseIndexType>
SparseTensorImpl<SparseIndexType>::SparseTensorImpl(
    const std::shared_ptr<DataType>& type, const std::vector<int64_t>& shape,
    const std::vector<std::string>& dim_names)
    : SparseTensorImpl(nullptr, type, nullptr, shape, dim_names) {}
353
// Constructor with a dense tensor
// Converts a typed dense tensor into sparse form via the matching
// SparseTensorConverter specialization.
// NOTE(review): DCHECK_OK means a conversion failure (e.g. allocation error)
// is ignored in release builds, leaving sparse_index_/data_ null — confirm
// this is intended.
template <typename SparseIndexType>
template <typename TYPE>
SparseTensorImpl<SparseIndexType>::SparseTensorImpl(const NumericTensor<TYPE>& tensor)
    : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(),
                       tensor.dim_names_) {
  SparseTensorConverter<TYPE, SparseIndexType> converter(tensor);
  DCHECK_OK(converter.Convert());
  sparse_index_ = converter.sparse_index;
  data_ = converter.data;
}
365
366// Constructor with a dense tensor
367template <typename SparseIndexType>
368SparseTensorImpl<SparseIndexType>::SparseTensorImpl(const Tensor& tensor)
369 : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(),
370 tensor.dim_names_) {
371 switch (tensor.type()->id()) {
372 case Type::UINT8:
373 MakeSparseTensorFromTensor<UInt8Type, SparseIndexType>(tensor, &sparse_index_,
374 &data_);
375 return;
376 case Type::INT8:
377 MakeSparseTensorFromTensor<Int8Type, SparseIndexType>(tensor, &sparse_index_,
378 &data_);
379 return;
380 case Type::UINT16:
381 MakeSparseTensorFromTensor<UInt16Type, SparseIndexType>(tensor, &sparse_index_,
382 &data_);
383 return;
384 case Type::INT16:
385 MakeSparseTensorFromTensor<Int16Type, SparseIndexType>(tensor, &sparse_index_,
386 &data_);
387 return;
388 case Type::UINT32:
389 MakeSparseTensorFromTensor<UInt32Type, SparseIndexType>(tensor, &sparse_index_,
390 &data_);
391 return;
392 case Type::INT32:
393 MakeSparseTensorFromTensor<Int32Type, SparseIndexType>(tensor, &sparse_index_,
394 &data_);
395 return;
396 case Type::UINT64:
397 MakeSparseTensorFromTensor<UInt64Type, SparseIndexType>(tensor, &sparse_index_,
398 &data_);
399 return;
400 case Type::INT64:
401 MakeSparseTensorFromTensor<Int64Type, SparseIndexType>(tensor, &sparse_index_,
402 &data_);
403 return;
404 case Type::HALF_FLOAT:
405 MakeSparseTensorFromTensor<HalfFloatType, SparseIndexType>(tensor, &sparse_index_,
406 &data_);
407 return;
408 case Type::FLOAT:
409 MakeSparseTensorFromTensor<FloatType, SparseIndexType>(tensor, &sparse_index_,
410 &data_);
411 return;
412 case Type::DOUBLE:
413 MakeSparseTensorFromTensor<DoubleType, SparseIndexType>(tensor, &sparse_index_,
414 &data_);
415 return;
416 default:
417 break;
418 }
419}
420
421// ----------------------------------------------------------------------
422// Instantiate templates
423
// Explicitly instantiate (and export) SparseTensorImpl and its templated
// NumericTensor constructor for every supported numeric value type and both
// index kinds, so the out-of-line definitions above are available to users of
// the library.
#define INSTANTIATE_SPARSE_TENSOR(IndexType)                              \
  template class ARROW_TEMPLATE_EXPORT SparseTensorImpl<IndexType>;       \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<UInt8Type>&);                                   \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<UInt16Type>&);                                  \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<UInt32Type>&);                                  \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<UInt64Type>&);                                  \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<Int8Type>&);                                    \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<Int16Type>&);                                   \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<Int32Type>&);                                   \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<Int64Type>&);                                   \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<HalfFloatType>&);                               \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<FloatType>&);                                   \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(    \
      const NumericTensor<DoubleType>&)

INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex);
INSTANTIATE_SPARSE_TENSOR(SparseCSRIndex);
451
452} // namespace arrow
453