1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
#include "arrow/sparse_tensor.h"

#include <algorithm>
#include <cstdint>
#include <functional>
#include <memory>
#include <numeric>
#include <vector>

#include "arrow/compare.h"
#include "arrow/util/logging.h"
26 | |
27 | namespace arrow { |
28 | |
29 | namespace { |
30 | |
31 | // ---------------------------------------------------------------------- |
32 | // SparseTensorConverter |
33 | |
34 | template <typename TYPE, typename SparseIndexType> |
35 | class SparseTensorConverter { |
36 | public: |
37 | explicit SparseTensorConverter(const NumericTensor<TYPE>&) {} |
38 | |
39 | Status Convert() { return Status::Invalid("Unsupported sparse index" ); } |
40 | }; |
41 | |
42 | // ---------------------------------------------------------------------- |
43 | // SparseTensorConverter for SparseCOOIndex |
44 | |
// Shared machinery for the typed converters below: holds a reference to the
// dense input tensor and knows how to count its non-zero elements.
template <typename TYPE>
struct SparseTensorConverterBase {
  using NumericTensorType = NumericTensor<TYPE>;
  using value_type = typename NumericTensorType::value_type;

  explicit SparseTensorConverterBase(const NumericTensorType& tensor) : tensor_(tensor) {}

  // True when every logical element can be visited by scanning the
  // underlying buffer linearly (scalars/vectors, or a contiguous layout).
  bool TensorIsTriviallyIterable() const {
    return tensor_.ndim() <= 1 || tensor_.is_contiguous();
  }

  // Returns the number of non-zero elements; this determines the sizes of
  // the index and value buffers allocated by Convert().
  size_t CountNonZero() const {
    if (tensor_.size() == 0) {
      return 0;
    }

    if (TensorIsTriviallyIterable()) {
      // Fast path: count directly over the raw buffer.
      const value_type* data = reinterpret_cast<const value_type*>(tensor_.raw_data());
      return std::count_if(data, data + tensor_.size(),
                           [](value_type x) { return x != 0; });
    }

    const std::vector<int64_t>& shape = tensor_.shape();
    const int64_t ndim = tensor_.ndim();

    // Strided path: walk all elements in row-major logical order by
    // maintaining an explicit coordinate vector.
    size_t count = 0;
    std::vector<int64_t> coord(ndim, 0);
    for (int64_t n = tensor_.size(); n > 0; n--) {
      if (tensor_.Value(coord) != 0) {
        ++count;
      }

      // increment index: bump the innermost coordinate and ripple the carry
      // outward.  The carry is skipped on the last element (n == 1) because
      // coord is never read again.
      ++coord[ndim - 1];
      if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) {
        int64_t d = ndim - 1;
        while (d > 0 && coord[d] == shape[d]) {
          coord[d] = 0;
          ++coord[d - 1];
          --d;
        }
      }
    }
    return count;
  }

  // Non-owning reference; the dense tensor must outlive the converter.
  const NumericTensorType& tensor_;
};
93 | |
94 | template <typename TYPE> |
95 | class SparseTensorConverter<TYPE, SparseCOOIndex> |
96 | : private SparseTensorConverterBase<TYPE> { |
97 | public: |
98 | using BaseClass = SparseTensorConverterBase<TYPE>; |
99 | using NumericTensorType = typename BaseClass::NumericTensorType; |
100 | using value_type = typename BaseClass::value_type; |
101 | |
102 | explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {} |
103 | |
104 | Status Convert() { |
105 | const int64_t ndim = tensor_.ndim(); |
106 | const int64_t nonzero_count = static_cast<int64_t>(CountNonZero()); |
107 | |
108 | std::shared_ptr<Buffer> indices_buffer; |
109 | RETURN_NOT_OK( |
110 | AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer)); |
111 | int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data()); |
112 | |
113 | std::shared_ptr<Buffer> values_buffer; |
114 | RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer)); |
115 | value_type* values = reinterpret_cast<value_type*>(values_buffer->mutable_data()); |
116 | |
117 | if (ndim <= 1) { |
118 | const value_type* data = reinterpret_cast<const value_type*>(tensor_.raw_data()); |
119 | const int64_t count = ndim == 0 ? 1 : tensor_.shape()[0]; |
120 | for (int64_t i = 0; i < count; ++i, ++data) { |
121 | if (*data != 0) { |
122 | *indices++ = i; |
123 | *values++ = *data; |
124 | } |
125 | } |
126 | } else { |
127 | const std::vector<int64_t>& shape = tensor_.shape(); |
128 | std::vector<int64_t> coord(ndim, 0); |
129 | |
130 | for (int64_t n = tensor_.size(); n > 0; n--) { |
131 | const value_type x = tensor_.Value(coord); |
132 | if (tensor_.Value(coord) != 0) { |
133 | *values++ = x; |
134 | |
135 | int64_t* indp = indices; |
136 | for (int64_t i = 0; i < ndim; ++i) { |
137 | *indp = coord[i]; |
138 | indp += nonzero_count; |
139 | } |
140 | indices++; |
141 | } |
142 | |
143 | // increment index |
144 | ++coord[ndim - 1]; |
145 | if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { |
146 | int64_t d = ndim - 1; |
147 | while (d > 0 && coord[d] == shape[d]) { |
148 | coord[d] = 0; |
149 | ++coord[d - 1]; |
150 | --d; |
151 | } |
152 | } |
153 | } |
154 | } |
155 | |
156 | // make results |
157 | const std::vector<int64_t> indices_shape = {nonzero_count, ndim}; |
158 | const int64_t indices_elsize = sizeof(int64_t); |
159 | const std::vector<int64_t> indices_strides = {indices_elsize, |
160 | indices_elsize * nonzero_count}; |
161 | sparse_index = |
162 | std::make_shared<SparseCOOIndex>(std::make_shared<SparseCOOIndex::CoordsTensor>( |
163 | indices_buffer, indices_shape, indices_strides)); |
164 | data = values_buffer; |
165 | |
166 | return Status::OK(); |
167 | } |
168 | |
169 | std::shared_ptr<SparseCOOIndex> sparse_index; |
170 | std::shared_ptr<Buffer> data; |
171 | |
172 | private: |
173 | using SparseTensorConverterBase<TYPE>::tensor_; |
174 | using SparseTensorConverterBase<TYPE>::CountNonZero; |
175 | }; |
176 | |
177 | template <typename TYPE, typename SparseIndexType> |
178 | void MakeSparseTensorFromTensor(const Tensor& tensor, |
179 | std::shared_ptr<SparseIndex>* sparse_index, |
180 | std::shared_ptr<Buffer>* data) { |
181 | NumericTensor<TYPE> numeric_tensor(tensor.data(), tensor.shape(), tensor.strides()); |
182 | SparseTensorConverter<TYPE, SparseIndexType> converter(numeric_tensor); |
183 | DCHECK_OK(converter.Convert()); |
184 | *sparse_index = converter.sparse_index; |
185 | *data = converter.data; |
186 | } |
187 | |
188 | // ---------------------------------------------------------------------- |
189 | // SparseTensorConverter for SparseCSRIndex |
190 | |
191 | template <typename TYPE> |
192 | class SparseTensorConverter<TYPE, SparseCSRIndex> |
193 | : private SparseTensorConverterBase<TYPE> { |
194 | public: |
195 | using BaseClass = SparseTensorConverterBase<TYPE>; |
196 | using NumericTensorType = typename BaseClass::NumericTensorType; |
197 | using value_type = typename BaseClass::value_type; |
198 | |
199 | explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {} |
200 | |
201 | Status Convert() { |
202 | const int64_t ndim = tensor_.ndim(); |
203 | if (ndim > 2) { |
204 | return Status::Invalid("Invalid tensor dimension" ); |
205 | } |
206 | |
207 | const int64_t nr = tensor_.shape()[0]; |
208 | const int64_t nc = tensor_.shape()[1]; |
209 | const int64_t nonzero_count = static_cast<int64_t>(CountNonZero()); |
210 | |
211 | std::shared_ptr<Buffer> indptr_buffer; |
212 | std::shared_ptr<Buffer> indices_buffer; |
213 | |
214 | std::shared_ptr<Buffer> values_buffer; |
215 | RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer)); |
216 | value_type* values = reinterpret_cast<value_type*>(values_buffer->mutable_data()); |
217 | |
218 | if (ndim <= 1) { |
219 | return Status::NotImplemented("TODO for ndim <= 1" ); |
220 | } else { |
221 | RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * (nr + 1), &indptr_buffer)); |
222 | int64_t* indptr = reinterpret_cast<int64_t*>(indptr_buffer->mutable_data()); |
223 | |
224 | RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * nonzero_count, &indices_buffer)); |
225 | int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data()); |
226 | |
227 | int64_t k = 0; |
228 | *indptr++ = 0; |
229 | for (int64_t i = 0; i < nr; ++i) { |
230 | for (int64_t j = 0; j < nc; ++j) { |
231 | const value_type x = tensor_.Value({i, j}); |
232 | if (x != 0) { |
233 | *values++ = x; |
234 | *indices++ = j; |
235 | k++; |
236 | } |
237 | } |
238 | *indptr++ = k; |
239 | } |
240 | } |
241 | |
242 | std::vector<int64_t> indptr_shape({nr + 1}); |
243 | std::shared_ptr<SparseCSRIndex::IndexTensor> indptr_tensor = |
244 | std::make_shared<SparseCSRIndex::IndexTensor>(indptr_buffer, indptr_shape); |
245 | |
246 | std::vector<int64_t> indices_shape({nonzero_count}); |
247 | std::shared_ptr<SparseCSRIndex::IndexTensor> indices_tensor = |
248 | std::make_shared<SparseCSRIndex::IndexTensor>(indices_buffer, indices_shape); |
249 | |
250 | sparse_index = std::make_shared<SparseCSRIndex>(indptr_tensor, indices_tensor); |
251 | data = values_buffer; |
252 | |
253 | return Status::OK(); |
254 | } |
255 | |
256 | std::shared_ptr<SparseCSRIndex> sparse_index; |
257 | std::shared_ptr<Buffer> data; |
258 | |
259 | private: |
260 | using BaseClass::tensor_; |
261 | using SparseTensorConverterBase<TYPE>::CountNonZero; |
262 | }; |
263 | |
264 | // ---------------------------------------------------------------------- |
265 | // Instantiate templates |
266 | |
// Explicitly instantiate SparseTensorConverter for every supported numeric
// value type, paired with the given sparse index type.
#define INSTANTIATE_SPARSE_TENSOR_CONVERTER(IndexType)            \
  template class SparseTensorConverter<UInt8Type, IndexType>;     \
  template class SparseTensorConverter<UInt16Type, IndexType>;    \
  template class SparseTensorConverter<UInt32Type, IndexType>;    \
  template class SparseTensorConverter<UInt64Type, IndexType>;    \
  template class SparseTensorConverter<Int8Type, IndexType>;      \
  template class SparseTensorConverter<Int16Type, IndexType>;     \
  template class SparseTensorConverter<Int32Type, IndexType>;     \
  template class SparseTensorConverter<Int64Type, IndexType>;     \
  template class SparseTensorConverter<HalfFloatType, IndexType>; \
  template class SparseTensorConverter<FloatType, IndexType>;     \
  template class SparseTensorConverter<DoubleType, IndexType>

INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex);
INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex);
282 | |
283 | } // namespace |
284 | |
285 | // ---------------------------------------------------------------------- |
286 | // SparseCOOIndex |
287 | |
// Constructor with a column-major NumericTensor of coordinates.  The coords
// tensor has shape (non_zero_length, ndim); shape[0] therefore gives the
// number of non-zero values.
SparseCOOIndex::SparseCOOIndex(const std::shared_ptr<CoordsTensor>& coords)
    : SparseIndexBase(coords->shape()[0]), coords_(coords) {
  // Column-major layout is required so each dimension's coordinates are
  // laid out contiguously (debug-only check).
  DCHECK(coords_->is_column_major());
}
293 | |
294 | std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex" ); } |
295 | |
296 | // ---------------------------------------------------------------------- |
297 | // SparseCSRIndex |
298 | |
// Constructor with two index vectors: indptr holds the per-row offsets into
// indices/values, and indices holds the column number of each non-zero
// value, so indices->shape()[0] is the number of non-zero values.
SparseCSRIndex::SparseCSRIndex(const std::shared_ptr<IndexTensor>& indptr,
                               const std::shared_ptr<IndexTensor>& indices)
    : SparseIndexBase(indices->shape()[0]), indptr_(indptr), indices_(indices) {
  // Both inputs must be 1-D (debug-only checks).
  DCHECK_EQ(1, indptr_->ndim());
  DCHECK_EQ(1, indices_->ndim());
}
306 | |
307 | std::string SparseCSRIndex::ToString() const { return std::string("SparseCSRIndex" ); } |
308 | |
309 | // ---------------------------------------------------------------------- |
310 | // SparseTensor |
311 | |
// Constructor with all attributes.  The sparse index and data buffer may be
// null; the SparseTensorImpl constructors delegate here first and fill them
// in afterwards.
SparseTensor::SparseTensor(const std::shared_ptr<DataType>& type,
                           const std::shared_ptr<Buffer>& data,
                           const std::vector<int64_t>& shape,
                           const std::shared_ptr<SparseIndex>& sparse_index,
                           const std::vector<std::string>& dim_names)
    : type_(type),
      data_(data),
      shape_(shape),
      sparse_index_(sparse_index),
      dim_names_(dim_names) {
  // Debug-only check that the value type is one supported for tensors.
  DCHECK(is_tensor_supported(type->id()));
}
325 | |
326 | const std::string& SparseTensor::dim_name(int i) const { |
327 | static const std::string kEmpty = "" ; |
328 | if (dim_names_.size() == 0) { |
329 | return kEmpty; |
330 | } else { |
331 | DCHECK_LT(i, static_cast<int>(dim_names_.size())); |
332 | return dim_names_[i]; |
333 | } |
334 | } |
335 | |
336 | int64_t SparseTensor::size() const { |
337 | return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies<int64_t>()); |
338 | } |
339 | |
// Deep equality comparison, delegated to SparseTensorEquals (arrow/compare.h).
bool SparseTensor::Equals(const SparseTensor& other) const {
  return SparseTensorEquals(*this, other);
}
343 | |
344 | // ---------------------------------------------------------------------- |
345 | // SparseTensorImpl |
346 | |
// Constructor without data: delegates with a null sparse index and null
// data buffer; the converting constructors below delegate here and then
// fill in sparse_index_ and data_.
template <typename SparseIndexType>
SparseTensorImpl<SparseIndexType>::SparseTensorImpl(
    const std::shared_ptr<DataType>& type, const std::vector<int64_t>& shape,
    const std::vector<std::string>& dim_names)
    : SparseTensorImpl(nullptr, type, nullptr, shape, dim_names) {}
353 | |
354 | // Constructor with a dense tensor |
355 | template <typename SparseIndexType> |
356 | template <typename TYPE> |
357 | SparseTensorImpl<SparseIndexType>::SparseTensorImpl(const NumericTensor<TYPE>& tensor) |
358 | : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), |
359 | tensor.dim_names_) { |
360 | SparseTensorConverter<TYPE, SparseIndexType> converter(tensor); |
361 | DCHECK_OK(converter.Convert()); |
362 | sparse_index_ = converter.sparse_index; |
363 | data_ = converter.data; |
364 | } |
365 | |
366 | // Constructor with a dense tensor |
367 | template <typename SparseIndexType> |
368 | SparseTensorImpl<SparseIndexType>::SparseTensorImpl(const Tensor& tensor) |
369 | : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), |
370 | tensor.dim_names_) { |
371 | switch (tensor.type()->id()) { |
372 | case Type::UINT8: |
373 | MakeSparseTensorFromTensor<UInt8Type, SparseIndexType>(tensor, &sparse_index_, |
374 | &data_); |
375 | return; |
376 | case Type::INT8: |
377 | MakeSparseTensorFromTensor<Int8Type, SparseIndexType>(tensor, &sparse_index_, |
378 | &data_); |
379 | return; |
380 | case Type::UINT16: |
381 | MakeSparseTensorFromTensor<UInt16Type, SparseIndexType>(tensor, &sparse_index_, |
382 | &data_); |
383 | return; |
384 | case Type::INT16: |
385 | MakeSparseTensorFromTensor<Int16Type, SparseIndexType>(tensor, &sparse_index_, |
386 | &data_); |
387 | return; |
388 | case Type::UINT32: |
389 | MakeSparseTensorFromTensor<UInt32Type, SparseIndexType>(tensor, &sparse_index_, |
390 | &data_); |
391 | return; |
392 | case Type::INT32: |
393 | MakeSparseTensorFromTensor<Int32Type, SparseIndexType>(tensor, &sparse_index_, |
394 | &data_); |
395 | return; |
396 | case Type::UINT64: |
397 | MakeSparseTensorFromTensor<UInt64Type, SparseIndexType>(tensor, &sparse_index_, |
398 | &data_); |
399 | return; |
400 | case Type::INT64: |
401 | MakeSparseTensorFromTensor<Int64Type, SparseIndexType>(tensor, &sparse_index_, |
402 | &data_); |
403 | return; |
404 | case Type::HALF_FLOAT: |
405 | MakeSparseTensorFromTensor<HalfFloatType, SparseIndexType>(tensor, &sparse_index_, |
406 | &data_); |
407 | return; |
408 | case Type::FLOAT: |
409 | MakeSparseTensorFromTensor<FloatType, SparseIndexType>(tensor, &sparse_index_, |
410 | &data_); |
411 | return; |
412 | case Type::DOUBLE: |
413 | MakeSparseTensorFromTensor<DoubleType, SparseIndexType>(tensor, &sparse_index_, |
414 | &data_); |
415 | return; |
416 | default: |
417 | break; |
418 | } |
419 | } |
420 | |
421 | // ---------------------------------------------------------------------- |
422 | // Instantiate templates |
423 | |
// Explicitly instantiate SparseTensorImpl — and its NumericTensor-converting
// constructor for every supported value type — for the given sparse index
// type, so the definitions in this translation unit are available to (and
// exported for) library users.
#define INSTANTIATE_SPARSE_TENSOR(IndexType)                            \
  template class ARROW_TEMPLATE_EXPORT SparseTensorImpl<IndexType>;     \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<UInt8Type>&);                                 \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<UInt16Type>&);                                \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<UInt32Type>&);                                \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<UInt64Type>&);                                \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<Int8Type>&);                                  \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<Int16Type>&);                                 \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<Int32Type>&);                                 \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<Int64Type>&);                                 \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<HalfFloatType>&);                             \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<FloatType>&);                                 \
  template ARROW_EXPORT SparseTensorImpl<IndexType>::SparseTensorImpl(  \
      const NumericTensor<DoubleType>&)

INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex);
INSTANTIATE_SPARSE_TENSOR(SparseCSRIndex);
451 | |
452 | } // namespace arrow |
453 | |