1 | // basisu_enc.h |
2 | // Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 (the "License"); |
5 | // you may not use this file except in compliance with the License. |
6 | // You may obtain a copy of the License at |
7 | // |
8 | // http://www.apache.org/licenses/LICENSE-2.0 |
9 | // |
10 | // Unless required by applicable law or agreed to in writing, software |
11 | // distributed under the License is distributed on an "AS IS" BASIS, |
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | // See the License for the specific language governing permissions and |
14 | // limitations under the License. |
15 | #pragma once |
16 | #include "../transcoder/basisu.h" |
17 | #include "../transcoder/basisu_transcoder_internal.h" |
18 | |
19 | #include <mutex> |
20 | #include <atomic> |
21 | #include <condition_variable> |
22 | #include <functional> |
23 | #include <thread> |
24 | #include <unordered_map> |
25 | #include <ostream> |
26 | |
27 | #if !defined(_WIN32) || defined(__MINGW32__) |
28 | #include <libgen.h> |
29 | #endif |
30 | |
31 | // This module is really just a huge grab bag of classes and helper functions needed by the encoder. |
32 | |
33 | // If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time. |
34 | #define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0) |
35 | |
36 | #if BASISU_SUPPORT_SSE |
37 | // Declared in basisu_kernels_imp.h, but we can't include that here otherwise it would lead to circular type errors. |
38 | extern void update_covar_matrix_16x16_sse41(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16); |
39 | #endif |
40 | |
41 | namespace basisu |
42 | { |
43 | extern uint8_t g_hamming_dist[256]; |
44 | extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8]; |
45 | |
46 | // true if basisu_encoder_init() has been called and returned. |
47 | extern bool g_library_initialized; |
48 | |
49 | // Encoder library initialization. |
50 | // This function MUST be called before encoding anything! |
51 | void basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false); |
52 | void basisu_encoder_deinit(); |
53 | |
54 | // basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1 |
55 | extern void detect_sse41(); |
56 | |
57 | #if BASISU_SUPPORT_SSE |
58 | extern bool g_cpu_supports_sse41; |
59 | #else |
60 | const bool g_cpu_supports_sse41 = false; |
61 | #endif |
62 | |
63 | void error_vprintf(const char* pFmt, va_list args); |
64 | void error_printf(const char *pFmt, ...); |
65 | |
66 | // Helpers |
67 | |
68 | inline uint8_t clamp255(int32_t i) |
69 | { |
70 | return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i); |
71 | } |
72 | |
73 | inline int32_t clampi(int32_t value, int32_t low, int32_t high) |
74 | { |
75 | if (value < low) |
76 | value = low; |
77 | else if (value > high) |
78 | value = high; |
79 | return value; |
80 | } |
81 | |
82 | inline uint8_t mul_8(uint32_t v, uint32_t a) |
83 | { |
84 | v = v * a + 128; |
85 | return (uint8_t)((v + (v >> 8)) >> 8); |
86 | } |
87 | |
88 | inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) |
89 | { |
90 | assert(codesize <= 64); |
91 | uint64_t bits = 0; |
92 | uint32_t total_bits = 0; |
93 | |
94 | while (total_bits < codesize) |
95 | { |
96 | uint32_t byte_bit_offset = bit_offset & 7; |
97 | uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset); |
98 | |
99 | uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; |
100 | byte_bits &= ((1 << bits_to_read) - 1); |
101 | |
102 | bits |= ((uint64_t)(byte_bits) << total_bits); |
103 | |
104 | total_bits += bits_to_read; |
105 | bit_offset += bits_to_read; |
106 | } |
107 | |
108 | return bits; |
109 | } |
110 | |
111 | inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) |
112 | { |
113 | assert(codesize <= 32); |
114 | uint32_t bits = 0; |
115 | uint32_t total_bits = 0; |
116 | |
117 | while (total_bits < codesize) |
118 | { |
119 | uint32_t byte_bit_offset = bit_offset & 7; |
120 | uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset); |
121 | |
122 | uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; |
123 | byte_bits &= ((1 << bits_to_read) - 1); |
124 | |
125 | bits |= (byte_bits << total_bits); |
126 | |
127 | total_bits += bits_to_read; |
128 | bit_offset += bits_to_read; |
129 | } |
130 | |
131 | return bits; |
132 | } |
133 | |
134 | // Hashing |
135 | |
136 | inline uint32_t bitmix32c(uint32_t v) |
137 | { |
138 | v = (v + 0x7ed55d16) + (v << 12); |
139 | v = (v ^ 0xc761c23c) ^ (v >> 19); |
140 | v = (v + 0x165667b1) + (v << 5); |
141 | v = (v + 0xd3a2646c) ^ (v << 9); |
142 | v = (v + 0xfd7046c5) + (v << 3); |
143 | v = (v ^ 0xb55a4f09) ^ (v >> 16); |
144 | return v; |
145 | } |
146 | |
147 | inline uint32_t bitmix32(uint32_t v) |
148 | { |
149 | v -= (v << 6); |
150 | v ^= (v >> 17); |
151 | v -= (v << 9); |
152 | v ^= (v << 4); |
153 | v -= (v << 3); |
154 | v ^= (v << 10); |
155 | v ^= (v >> 15); |
156 | return v; |
157 | } |
158 | |
159 | inline uint32_t wang_hash(uint32_t seed) |
160 | { |
161 | seed = (seed ^ 61) ^ (seed >> 16); |
162 | seed *= 9; |
163 | seed = seed ^ (seed >> 4); |
164 | seed *= 0x27d4eb2d; |
165 | seed = seed ^ (seed >> 15); |
166 | return seed; |
167 | } |
168 | |
169 | uint32_t hash_hsieh(const uint8_t* pBuf, size_t len); |
170 | |
	// Hashes an arbitrary Key's raw object bytes via hash_hsieh(), for use as the
	// Hash template argument of std::unordered_map/std::unordered_set.
	// NOTE(review): hashes the in-memory representation directly, so Key should be
	// trivially copyable with no padding bytes that can differ between equal keys -
	// confirm for each Key type used with this.
	template <typename Key>
	struct bit_hasher
	{
		std::size_t operator()(const Key& k) const
		{
			return hash_hsieh(reinterpret_cast<const uint8_t *>(&k), sizeof(k));
		}
	};
179 | |
180 | class running_stat |
181 | { |
182 | public: |
183 | running_stat() { clear(); } |
184 | |
185 | void clear() |
186 | { |
187 | m_n = 0; |
188 | m_total = 0; |
189 | m_old_m = 0; |
190 | m_new_m = 0; |
191 | m_old_s = 0; |
192 | m_new_s = 0; |
193 | m_min = 0; |
194 | m_max = 0; |
195 | } |
196 | |
197 | void push(double x) |
198 | { |
199 | m_n++; |
200 | m_total += x; |
201 | if (m_n == 1) |
202 | { |
203 | m_old_m = m_new_m = x; |
204 | m_old_s = 0.0; |
205 | m_min = x; |
206 | m_max = x; |
207 | } |
208 | else |
209 | { |
210 | // See Knuth TAOCP vol 2, 3rd edition, page 232 |
211 | m_new_m = m_old_m + (x - m_old_m) / m_n; |
212 | m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m); |
213 | m_old_m = m_new_m; |
214 | m_old_s = m_new_s; |
215 | m_min = basisu::minimum(x, m_min); |
216 | m_max = basisu::maximum(x, m_max); |
217 | } |
218 | } |
219 | |
220 | uint32_t get_num() const |
221 | { |
222 | return m_n; |
223 | } |
224 | |
225 | double get_total() const |
226 | { |
227 | return m_total; |
228 | } |
229 | |
230 | double get_mean() const |
231 | { |
232 | return (m_n > 0) ? m_new_m : 0.0; |
233 | } |
234 | |
235 | // Returns sample variance |
236 | double get_variance() const |
237 | { |
238 | return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0); |
239 | } |
240 | |
241 | double get_std_dev() const |
242 | { |
243 | return sqrt(get_variance()); |
244 | } |
245 | |
246 | double get_min() const |
247 | { |
248 | return m_min; |
249 | } |
250 | |
251 | double get_max() const |
252 | { |
253 | return m_max; |
254 | } |
255 | |
256 | private: |
257 | uint32_t m_n; |
258 | double m_total, m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max; |
259 | }; |
260 | |
261 | // Linear algebra |
262 | |
263 | template <uint32_t N, typename T> |
264 | class vec |
265 | { |
266 | protected: |
267 | T m_v[N]; |
268 | |
269 | public: |
270 | enum { num_elements = N }; |
271 | |
272 | inline vec() { } |
273 | inline vec(eZero) { set_zero(); } |
274 | |
275 | explicit inline vec(T val) { set(val); } |
276 | inline vec(T v0, T v1) { set(v0, v1); } |
277 | inline vec(T v0, T v1, T v2) { set(v0, v1, v2); } |
278 | inline vec(T v0, T v1, T v2, T v3) { set(v0, v1, v2, v3); } |
279 | inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; } |
280 | template <uint32_t OtherN, typename OtherT> inline vec(const vec<OtherN, OtherT> &other) { set(other); } |
281 | |
282 | inline T operator[](uint32_t i) const { assert(i < N); return m_v[i]; } |
283 | inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; } |
284 | |
285 | inline T getX() const { return m_v[0]; } |
286 | inline T getY() const { static_assert(N >= 2, "N too small" ); return m_v[1]; } |
287 | inline T getZ() const { static_assert(N >= 3, "N too small" ); return m_v[2]; } |
288 | inline T getW() const { static_assert(N >= 4, "N too small" ); return m_v[3]; } |
289 | |
290 | inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false; return true; } |
291 | inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; } |
292 | |
293 | inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; } |
294 | |
295 | template <uint32_t OtherN, typename OtherT> |
296 | inline vec &set(const vec<OtherN, OtherT> &other) |
297 | { |
298 | uint32_t i; |
299 | if ((const void *)(&other) == (const void *)(this)) |
300 | return *this; |
301 | const uint32_t m = minimum(OtherN, N); |
302 | for (i = 0; i < m; i++) |
303 | m_v[i] = static_cast<T>(other[i]); |
304 | for (; i < N; i++) |
305 | m_v[i] = 0; |
306 | return *this; |
307 | } |
308 | |
309 | inline vec &set_component(uint32_t index, T val) { assert(index < N); m_v[index] = val; return *this; } |
310 | inline vec &set(T val) { for (uint32_t i = 0; i < N; i++) m_v[i] = val; return *this; } |
311 | inline void clear_elements(uint32_t s, uint32_t e) { assert(e <= N); for (uint32_t i = s; i < e; i++) m_v[i] = 0; } |
312 | |
313 | inline vec &set(T v0, T v1) |
314 | { |
315 | m_v[0] = v0; |
316 | if (N >= 2) |
317 | { |
318 | m_v[1] = v1; |
319 | clear_elements(2, N); |
320 | } |
321 | return *this; |
322 | } |
323 | |
324 | inline vec &set(T v0, T v1, T v2) |
325 | { |
326 | m_v[0] = v0; |
327 | if (N >= 2) |
328 | { |
329 | m_v[1] = v1; |
330 | if (N >= 3) |
331 | { |
332 | m_v[2] = v2; |
333 | clear_elements(3, N); |
334 | } |
335 | } |
336 | return *this; |
337 | } |
338 | |
339 | inline vec &set(T v0, T v1, T v2, T v3) |
340 | { |
341 | m_v[0] = v0; |
342 | if (N >= 2) |
343 | { |
344 | m_v[1] = v1; |
345 | if (N >= 3) |
346 | { |
347 | m_v[2] = v2; |
348 | |
349 | if (N >= 4) |
350 | { |
351 | m_v[3] = v3; |
352 | clear_elements(5, N); |
353 | } |
354 | } |
355 | } |
356 | return *this; |
357 | } |
358 | |
359 | inline vec &operator=(const vec &rhs) { if (this != &rhs) for (uint32_t i = 0; i < N; i++) m_v[i] = rhs.m_v[i]; return *this; } |
360 | template <uint32_t OtherN, typename OtherT> inline vec &operator=(const vec<OtherN, OtherT> &rhs) { set(rhs); return *this; } |
361 | |
362 | inline const T *get_ptr() const { return reinterpret_cast<const T *>(&m_v[0]); } |
363 | inline T *get_ptr() { return reinterpret_cast<T *>(&m_v[0]); } |
364 | |
365 | inline vec operator- () const { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = -m_v[i]; return res; } |
366 | inline vec operator+ () const { return *this; } |
367 | inline vec &operator+= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] += other.m_v[i]; return *this; } |
368 | inline vec &operator-= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] -= other.m_v[i]; return *this; } |
369 | inline vec &operator/= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] /= other.m_v[i]; return *this; } |
370 | inline vec &operator*=(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] *= other.m_v[i]; return *this; } |
371 | inline vec &operator/= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] /= s; return *this; } |
372 | inline vec &operator*= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] *= s; return *this; } |
373 | |
374 | friend inline vec operator+(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] + rhs.m_v[i]; return res; } |
375 | friend inline vec operator-(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] - rhs.m_v[i]; return res; } |
376 | friend inline vec operator*(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] * val; return res; } |
377 | friend inline vec operator*(T val, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = val * rhs.m_v[i]; return res; } |
378 | friend inline vec operator/(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / val; return res; } |
379 | friend inline vec operator/(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / rhs.m_v[i]; return res; } |
380 | |
381 | static inline T dot_product(const vec &lhs, const vec &rhs) { T res = lhs.m_v[0] * rhs.m_v[0]; for (uint32_t i = 1; i < N; i++) res += lhs.m_v[i] * rhs.m_v[i]; return res; } |
382 | |
383 | inline T dot(const vec &rhs) const { return dot_product(*this, rhs); } |
384 | |
385 | inline T norm() const { return dot_product(*this, *this); } |
386 | inline T length() const { return sqrt(norm()); } |
387 | |
388 | inline T squared_distance(const vec &other) const { T d2 = 0; for (uint32_t i = 0; i < N; i++) { T d = m_v[i] - other.m_v[i]; d2 += d * d; } return d2; } |
389 | inline double squared_distance_d(const vec& other) const { double d2 = 0; for (uint32_t i = 0; i < N; i++) { double d = (double)m_v[i] - (double)other.m_v[i]; d2 += d * d; } return d2; } |
390 | |
391 | inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); } |
392 | inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); } |
393 | |
394 | inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; } |
395 | |
396 | inline vec &clamp(T l, T h) |
397 | { |
398 | for (uint32_t i = 0; i < N; i++) |
399 | m_v[i] = basisu::clamp(m_v[i], l, h); |
400 | return *this; |
401 | } |
402 | |
403 | static vec component_min(const vec& a, const vec& b) |
404 | { |
405 | vec res; |
406 | for (uint32_t i = 0; i < N; i++) |
407 | res[i] = minimum(a[i], b[i]); |
408 | return res; |
409 | } |
410 | |
411 | static vec component_max(const vec& a, const vec& b) |
412 | { |
413 | vec res; |
414 | for (uint32_t i = 0; i < N; i++) |
415 | res[i] = maximum(a[i], b[i]); |
416 | return res; |
417 | } |
418 | }; |
419 | |
420 | typedef vec<4, double> vec4D; |
421 | typedef vec<3, double> vec3D; |
422 | typedef vec<2, double> vec2D; |
423 | typedef vec<1, double> vec1D; |
424 | |
425 | typedef vec<4, float> vec4F; |
426 | typedef vec<3, float> vec3F; |
427 | typedef vec<2, float> vec2F; |
428 | typedef vec<1, float> vec1F; |
429 | |
430 | typedef vec<16, float> vec16F; |
431 | |
432 | template <uint32_t Rows, uint32_t Cols, typename T> |
433 | class matrix |
434 | { |
435 | public: |
436 | typedef vec<Rows, T> col_vec; |
437 | typedef vec<Cols, T> row_vec; |
438 | |
439 | typedef T scalar_type; |
440 | |
441 | enum { rows = Rows, cols = Cols }; |
442 | |
443 | protected: |
444 | row_vec m_r[Rows]; |
445 | |
446 | public: |
447 | inline matrix() {} |
448 | inline matrix(eZero) { set_zero(); } |
449 | inline matrix(const matrix &other) { for (uint32_t i = 0; i < Rows; i++) m_r[i] = other.m_r[i]; } |
450 | inline matrix &operator=(const matrix &rhs) { if (this != &rhs) for (uint32_t i = 0; i < Rows; i++) m_r[i] = rhs.m_r[i]; return *this; } |
451 | |
452 | inline T operator()(uint32_t r, uint32_t c) const { assert((r < Rows) && (c < Cols)); return m_r[r][c]; } |
453 | inline T &operator()(uint32_t r, uint32_t c) { assert((r < Rows) && (c < Cols)); return m_r[r][c]; } |
454 | |
455 | inline const row_vec &operator[](uint32_t r) const { assert(r < Rows); return m_r[r]; } |
456 | inline row_vec &operator[](uint32_t r) { assert(r < Rows); return m_r[r]; } |
457 | |
458 | inline matrix &set_zero() |
459 | { |
460 | for (uint32_t i = 0; i < Rows; i++) |
461 | m_r[i].set_zero(); |
462 | return *this; |
463 | } |
464 | |
465 | inline matrix &set_identity() |
466 | { |
467 | for (uint32_t i = 0; i < Rows; i++) |
468 | { |
469 | m_r[i].set_zero(); |
470 | if (i < Cols) |
471 | m_r[i][i] = 1.0f; |
472 | } |
473 | return *this; |
474 | } |
475 | }; |
476 | |
	// Approximates the principal axis (dominant eigenvector) of the NxN covariance matrix
	// "cmatrix" using up to 8 rounds of power iteration, returning it normalized to unit length.
	// NOTE(review): cmatrix is taken by non-const reference but is not modified here.
	template<uint32_t N, typename VectorType>
	inline VectorType compute_pca_from_covar(matrix<N, N, float> &cmatrix)
	{
		VectorType axis;
		if (N == 1)
			axis.set(1.0f);
		else
		{
			// Deterministic, non-degenerate starting guess: components ramp from .75 to 1.25.
			for (uint32_t i = 0; i < N; i++)
				axis[i] = lerp(.75f, 1.25f, i * (1.0f / maximum<int>(N - 1, 1)));
		}

		VectorType prev_axis(axis);

		// Power iterations
		for (uint32_t power_iter = 0; power_iter < 8; power_iter++)
		{
			// trial_axis = cmatrix * axis, accumulated in doubles; track the largest |component|.
			VectorType trial_axis;
			double max_sum = 0;

			for (uint32_t i = 0; i < N; i++)
			{
				double sum = 0;
				for (uint32_t j = 0; j < N; j++)
					sum += cmatrix[i][j] * axis[j];

				trial_axis[i] = static_cast<float>(sum);

				max_sum = maximum(fabs(sum), max_sum);
			}

			// Rescale so the largest component has magnitude ~1 (keeps iterates well-conditioned).
			if (max_sum != 0.0f)
				trial_axis *= static_cast<float>(1.0f / max_sum);

			// Convergence test deliberately compares trial_axis against the axis from the
			// previous iteration (prev_axis), before prev_axis/axis are rotated below.
			VectorType delta_axis(prev_axis - trial_axis);

			prev_axis = axis;
			axis = trial_axis;

			if (delta_axis.norm() < .0024f)
				break;
		}

		return axis.normalize_in_place();
	}
522 | |
523 | template<typename T> inline void indirect_sort(uint32_t num_indices, uint32_t* pIndices, const T* pKeys) |
524 | { |
525 | for (uint32_t i = 0; i < num_indices; i++) |
526 | pIndices[i] = i; |
527 | |
528 | std::sort( |
529 | pIndices, |
530 | pIndices + num_indices, |
531 | [pKeys](uint32_t a, uint32_t b) { return pKeys[a] < pKeys[b]; } |
532 | ); |
533 | } |
534 | |
	// 1-4 byte direct Radix sort.
	// Sorts num_vals records in pBuf0 by an unsigned little-endian integer key embedded
	// key_ofs bytes into each record, key_size bytes wide (1-4). pBuf1 is scratch space of
	// the same size. Returns whichever buffer (pBuf0 or pBuf1) holds the sorted output
	// after the final pass. Histograms for all passes are gathered in a single up-front
	// scan; inner loops are manually unrolled 2x, hence the duplicated even/odd handling.
	template <typename T>
	T* radix_sort(uint32_t num_vals, T* pBuf0, T* pBuf1, uint32_t key_ofs, uint32_t key_size)
	{
		assert(key_ofs < sizeof(T));
		assert((key_size >= 1) && (key_size <= 4));

		// One 256-entry histogram per pass (one pass per key byte).
		uint32_t hist[256 * 4];

		memset(hist, 0, sizeof(hist[0]) * 256 * key_size);

// Reads the key as an unaligned 32-bit load; bytes above key_size are ignored by the
// per-pass shifts/masks below.
// NOTE(review): assumes the platform tolerates unaligned uint32_t loads and that at
// least 4 bytes are readable at key_ofs within each record - confirm for new uses.
#define BASISU_GET_KEY(p) (*(uint32_t *)((uint8_t *)(p) + key_ofs))

		// Histogram pass: count byte occurrences for every key byte in one sweep.
		if (key_size == 4)
		{
			T* p = pBuf0;
			T* q = pBuf0 + num_vals;
			for (; p != q; p++)
			{
				const uint32_t key = BASISU_GET_KEY(p);

				hist[key & 0xFF]++;
				hist[256 + ((key >> 8) & 0xFF)]++;
				hist[512 + ((key >> 16) & 0xFF)]++;
				hist[768 + ((key >> 24) & 0xFF)]++;
			}
		}
		else if (key_size == 3)
		{
			T* p = pBuf0;
			T* q = pBuf0 + num_vals;
			for (; p != q; p++)
			{
				const uint32_t key = BASISU_GET_KEY(p);

				hist[key & 0xFF]++;
				hist[256 + ((key >> 8) & 0xFF)]++;
				hist[512 + ((key >> 16) & 0xFF)]++;
			}
		}
		else if (key_size == 2)
		{
			// 2x unrolled: process pairs, then the odd trailing element (if any).
			T* p = pBuf0;
			T* q = pBuf0 + (num_vals >> 1) * 2;

			for (; p != q; p += 2)
			{
				const uint32_t key0 = BASISU_GET_KEY(p);
				const uint32_t key1 = BASISU_GET_KEY(p + 1);

				hist[key0 & 0xFF]++;
				hist[256 + ((key0 >> 8) & 0xFF)]++;

				hist[key1 & 0xFF]++;
				hist[256 + ((key1 >> 8) & 0xFF)]++;
			}

			if (num_vals & 1)
			{
				const uint32_t key = BASISU_GET_KEY(p);

				hist[key & 0xFF]++;
				hist[256 + ((key >> 8) & 0xFF)]++;
			}
		}
		else
		{
			assert(key_size == 1);
			if (key_size != 1)
				return NULL;

			T* p = pBuf0;
			T* q = pBuf0 + (num_vals >> 1) * 2;

			for (; p != q; p += 2)
			{
				const uint32_t key0 = BASISU_GET_KEY(p);
				const uint32_t key1 = BASISU_GET_KEY(p + 1);

				hist[key0 & 0xFF]++;
				hist[key1 & 0xFF]++;
			}

			if (num_vals & 1)
			{
				const uint32_t key = BASISU_GET_KEY(p);
				hist[key & 0xFF]++;
			}
		}

		// Scatter passes: ping-pong between the two buffers, one pass per key byte (LSB first).
		T* pCur = pBuf0;
		T* pNew = pBuf1;

		for (uint32_t pass = 0; pass < key_size; pass++)
		{
			const uint32_t* pHist = &hist[pass << 8];

			// Prefix-sum the histogram into starting offsets for each byte value.
			uint32_t offsets[256];

			uint32_t cur_ofs = 0;
			for (uint32_t i = 0; i < 256; i += 2)
			{
				offsets[i] = cur_ofs;
				cur_ofs += pHist[i];

				offsets[i + 1] = cur_ofs;
				cur_ofs += pHist[i + 1];
			}

			const uint32_t pass_shift = pass << 3;

			T* p = pCur;
			T* q = pCur + (num_vals >> 1) * 2;

			for (; p != q; p += 2)
			{
				uint32_t c0 = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;
				uint32_t c1 = (BASISU_GET_KEY(p + 1) >> pass_shift) & 0xFF;

				// Fast path when both elements land in the same bucket.
				if (c0 == c1)
				{
					uint32_t dst_offset0 = offsets[c0];

					offsets[c0] = dst_offset0 + 2;

					pNew[dst_offset0] = p[0];
					pNew[dst_offset0 + 1] = p[1];
				}
				else
				{
					uint32_t dst_offset0 = offsets[c0]++;
					uint32_t dst_offset1 = offsets[c1]++;

					pNew[dst_offset0] = p[0];
					pNew[dst_offset1] = p[1];
				}
			}

			// Handle the odd trailing element.
			if (num_vals & 1)
			{
				uint32_t c = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;

				uint32_t dst_offset = offsets[c];
				offsets[c] = dst_offset + 1;

				pNew[dst_offset] = *p;
			}

			// Swap source/destination buffers for the next pass.
			T* t = pCur;
			pCur = pNew;
			pNew = t;
		}

		return pCur;
	}
690 | |
691 | #undef BASISU_GET_KEY |
692 | |
	// Very simple job pool with no dependencies.
	// Declarations only - the method bodies live in the corresponding .cpp file.
	class job_pool
	{
		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool);

	public:
		// num_threads is the TOTAL number of job pool threads, including the calling thread! So 2=1 new thread, 3=2 new threads, etc.
		job_pool(uint32_t num_threads);
		~job_pool();

		// Queues a job (callable with no arguments/result) for execution by a pool thread.
		void add_job(const std::function<void()>& job);
		void add_job(std::function<void()>&& job);

		// Blocks until all queued and in-flight jobs have completed.
		void wait_for_all();

		// Total thread count: the calling thread plus the spawned worker threads.
		size_t get_total_threads() const { return 1 + m_threads.size(); }

	private:
		std::vector<std::thread> m_threads;				// worker threads (excludes the calling thread)
		std::vector<std::function<void()> > m_queue;	// pending jobs; presumably guarded by m_mutex - see .cpp

		std::mutex m_mutex;
		std::condition_variable m_has_work;				// presumably signaled when a job is queued - see .cpp
		std::condition_variable m_no_more_jobs;			// presumably signaled when the pool drains - see .cpp

		uint32_t m_num_active_jobs;						// jobs queued or currently executing

		std::atomic<bool> m_kill_flag;					// set to ask worker threads to exit

		void job_thread(uint32_t index);
	};
724 | |
725 | // Simple 32-bit color class |
726 | |
727 | class color_rgba_i16 |
728 | { |
729 | public: |
730 | union |
731 | { |
732 | int16_t m_comps[4]; |
733 | |
734 | struct |
735 | { |
736 | int16_t r; |
737 | int16_t g; |
738 | int16_t b; |
739 | int16_t a; |
740 | }; |
741 | }; |
742 | |
743 | inline color_rgba_i16() |
744 | { |
745 | static_assert(sizeof(*this) == sizeof(int16_t)*4, "sizeof(*this) == sizeof(int16_t)*4" ); |
746 | } |
747 | |
748 | inline color_rgba_i16(int sr, int sg, int sb, int sa) |
749 | { |
750 | set(sr, sg, sb, sa); |
751 | } |
752 | |
753 | inline color_rgba_i16 &set(int sr, int sg, int sb, int sa) |
754 | { |
755 | m_comps[0] = (int16_t)clamp<int>(sr, INT16_MIN, INT16_MAX); |
756 | m_comps[1] = (int16_t)clamp<int>(sg, INT16_MIN, INT16_MAX); |
757 | m_comps[2] = (int16_t)clamp<int>(sb, INT16_MIN, INT16_MAX); |
758 | m_comps[3] = (int16_t)clamp<int>(sa, INT16_MIN, INT16_MAX); |
759 | return *this; |
760 | } |
761 | }; |
762 | |
	// 32-bit RGBA color, 8 bits per component. Components are accessible either via
	// m_comps[0..3] (0=r, 1=g, 2=b, 3=a) or as the .r/.g/.b/.a members of the anonymous struct.
	class color_rgba
	{
	public:
		union
		{
			uint8_t m_comps[4];

			struct
			{
				uint8_t r;
				uint8_t g;
				uint8_t b;
				uint8_t a;
			};
		};

		// Intentionally leaves the components uninitialized; only validates layout assumptions.
		inline color_rgba()
		{
			static_assert(sizeof(*this) == 4, "sizeof(*this) != 4" );
			static_assert(sizeof(*this) == sizeof(basist::color32), "sizeof(*this) != sizeof(basist::color32)" );
		}

		// Not too hot about this idea.
		// Implicit conversion from the transcoder's color32 type.
		inline color_rgba(const basist::color32& other) :
			r(other.r),
			g(other.g),
			b(other.b),
			a(other.a)
		{
		}

		color_rgba& operator= (const basist::color32& rhs)
		{
			r = rhs.r;
			g = rhs.g;
			b = rhs.b;
			a = rhs.a;
			return *this;
		}

		// Grayscale: r=g=b=clamped y, a=255.
		inline color_rgba(int y)
		{
			set(y);
		}

		// Grayscale with explicit alpha (both clamped).
		inline color_rgba(int y, int na)
		{
			set(y, na);
		}

		inline color_rgba(int sr, int sg, int sb, int sa)
		{
			set(sr, sg, sb, sa);
		}

		// No clamping - caller guarantees the inputs are already valid bytes.
		inline color_rgba(eNoClamp, int sr, int sg, int sb, int sa)
		{
			set_noclamp_rgba((uint8_t)sr, (uint8_t)sg, (uint8_t)sb, (uint8_t)sa);
		}

		// Grayscale without clamping; alpha forced to 255.
		inline color_rgba& set_noclamp_y(int y)
		{
			m_comps[0] = (uint8_t)y;
			m_comps[1] = (uint8_t)y;
			m_comps[2] = (uint8_t)y;
			m_comps[3] = (uint8_t)255;
			return *this;
		}

		inline color_rgba &set_noclamp_rgba(int sr, int sg, int sb, int sa)
		{
			m_comps[0] = (uint8_t)sr;
			m_comps[1] = (uint8_t)sg;
			m_comps[2] = (uint8_t)sb;
			m_comps[3] = (uint8_t)sa;
			return *this;
		}

		// Grayscale set: r=g=b=clamp(y, 0, 255), alpha forced to 255.
		inline color_rgba &set(int y)
		{
			m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
			m_comps[1] = m_comps[0];
			m_comps[2] = m_comps[0];
			m_comps[3] = 255;
			return *this;
		}

		inline color_rgba &set(int y, int na)
		{
			m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
			m_comps[1] = m_comps[0];
			m_comps[2] = m_comps[0];
			m_comps[3] = static_cast<uint8_t>(clamp<int>(na, 0, 255));
			return *this;
		}

		inline color_rgba &set(int sr, int sg, int sb, int sa)
		{
			m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
			m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
			m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
			m_comps[3] = static_cast<uint8_t>(clamp<int>(sa, 0, 255));
			return *this;
		}

		// Sets RGB (clamped); alpha is left unchanged.
		inline color_rgba &set_rgb(int sr, int sg, int sb)
		{
			m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
			m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
			m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
			return *this;
		}

		// Copies RGB from another color; alpha is left unchanged.
		inline color_rgba &set_rgb(const color_rgba &other)
		{
			r = other.r;
			g = other.g;
			b = other.b;
			return *this;
		}

		inline const uint8_t &operator[] (uint32_t index) const { assert(index < 4); return m_comps[index]; }
		inline uint8_t &operator[] (uint32_t index) { assert(index < 4); return m_comps[index]; }

		// Sets all four components (including alpha) to 0.
		inline void clear()
		{
			m_comps[0] = 0;
			m_comps[1] = 0;
			m_comps[2] = 0;
			m_comps[3] = 0;
		}

		inline bool operator== (const color_rgba &rhs) const
		{
			if (m_comps[0] != rhs.m_comps[0]) return false;
			if (m_comps[1] != rhs.m_comps[1]) return false;
			if (m_comps[2] != rhs.m_comps[2]) return false;
			if (m_comps[3] != rhs.m_comps[3]) return false;
			return true;
		}

		inline bool operator!= (const color_rgba &rhs) const
		{
			return !(*this == rhs);
		}

		// Lexicographic ordering over (r, g, b, a), for use in ordered containers.
		inline bool operator<(const color_rgba &rhs) const
		{
			for (int i = 0; i < 4; i++)
			{
				if (m_comps[i] < rhs.m_comps[i])
					return true;
				else if (m_comps[i] != rhs.m_comps[i])
					return false;
			}
			return false;
		}

		// 16-bit fixed-point REC 601 luma (~.299r + .587g + .114b), rounded.
		inline int get_601_luma() const { return (19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2] + 32768U) >> 16U; }
		// 16-bit fixed-point REC 709 luma (~.2126r + .7152g + .0722b), rounded.
		inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; }
		inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); }

		// Converts to the transcoder's color32 type.
		inline basist::color32 get_color32() const
		{
			return basist::color32(r, g, b, a);
		}

		// Component-wise min/max of two colors.
		static color_rgba comp_min(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); }
		static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); }
	};
933 | |
934 | typedef basisu::vector<color_rgba> color_rgba_vec; |
935 | |
936 | const color_rgba g_black_color(0, 0, 0, 255); |
937 | const color_rgba g_black_trans_color(0, 0, 0, 0); |
938 | const color_rgba g_white_color(255, 255, 255, 255); |
939 | |
940 | inline int color_distance(int r0, int g0, int b0, int r1, int g1, int b1) |
941 | { |
942 | int dr = r0 - r1, dg = g0 - g1, db = b0 - b1; |
943 | return dr * dr + dg * dg + db * db; |
944 | } |
945 | |
946 | inline int color_distance(int r0, int g0, int b0, int a0, int r1, int g1, int b1, int a1) |
947 | { |
948 | int dr = r0 - r1, dg = g0 - g1, db = b0 - b1, da = a0 - a1; |
949 | return dr * dr + dg * dg + db * db + da * da; |
950 | } |
951 | |
952 | inline int color_distance(const color_rgba &c0, const color_rgba &c1, bool alpha) |
953 | { |
954 | if (alpha) |
955 | return color_distance(c0.r, c0.g, c0.b, c0.a, c1.r, c1.g, c1.b, c1.a); |
956 | else |
957 | return color_distance(c0.r, c0.g, c0.b, c1.r, c1.g, c1.b); |
958 | } |
959 | |
	// TODO: Allow user to control channel weightings.
	// Squared color error between e1 and e2. In perceptual mode the error is computed in a
	// luma/chroma-style space with luma weighted most heavily; otherwise it's plain squared
	// RGB(A) distance. The preprocessor branches select precision/speed variants.
	inline uint32_t color_distance(bool perceptual, const color_rgba &e1, const color_rgba &e2, bool alpha)
	{
		if (perceptual)
		{
#if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE
			// Float path: REC 709-style luma plus Cr/Cb-style chroma deltas with per-channel weights.
			const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f;
			const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f;

			const float cr1 = e1.r - l1;
			const float cr2 = e2.r - l2;

			const float cb1 = e1.b - l1;
			const float cb2 = e2.b - l2;

			const float dl = l1 - l2;
			const float dcr = cr1 - cr2;
			const float dcb = cb1 - cb2;

			uint32_t d = static_cast<uint32_t>(32.0f*4.0f*dl*dl + 32.0f*2.0f*(.5f / (1.0f - .2126f))*(.5f / (1.0f - .2126f))*dcr*dcr + 32.0f*.25f*(.5f / (1.0f - .0722f))*(.5f / (1.0f - .0722f))*dcb*dcb);

			if (alpha)
			{
				int da = static_cast<int>(e1.a) - static_cast<int>(e2.a);
				d += static_cast<uint32_t>(128.0f*da*da);
			}

			return d;
#elif 1
			// Default fixed-point path: 7-bit fixed-point luma/chroma deltas.
			int dr = e1.r - e2.r;
			int dg = e1.g - e2.g;
			int db = e1.b - e2.b;

#if 0
			// Disabled 32-bit-int variant, kept for reference.
			int delta_l = dr * 27 + dg * 92 + db * 9;
			int delta_cr = dr * 128 - delta_l;
			int delta_cb = db * 128 - delta_l;

			uint32_t id = ((uint32_t)(delta_l * delta_l) >> 7U) +
				((((uint32_t)(delta_cr * delta_cr) >> 7U) * 26U) >> 7U) +
				((((uint32_t)(delta_cb * delta_cb) >> 7U) * 3U) >> 7U);
#else
			// 64-bit intermediates so the squared terms cannot overflow before the shifts.
			int64_t delta_l = dr * 27 + dg * 92 + db * 9;
			int64_t delta_cr = dr * 128 - delta_l;
			int64_t delta_cb = db * 128 - delta_l;

			uint32_t id = ((uint32_t)((delta_l * delta_l) >> 7U)) +
				((((uint32_t)((delta_cr * delta_cr) >> 7U)) * 26U) >> 7U) +
				((((uint32_t)((delta_cb * delta_cb) >> 7U)) * 3U) >> 7U);
#endif

			if (alpha)
			{
				int da = (e1.a - e2.a) << 7;
				// This shouldn't overflow if da is 255 or -255: 29.99 bits after squaring.
				id += ((uint32_t)(da * da) >> 7U);
			}

			return id;
#else
			// Alternate disabled variant: full 64-bit accumulation with a final rounded rescale.
			int dr = e1.r - e2.r;
			int dg = e1.g - e2.g;
			int db = e1.b - e2.b;

			int64_t delta_l = dr * 27 + dg * 92 + db * 9;
			int64_t delta_cr = dr * 128 - delta_l;
			int64_t delta_cb = db * 128 - delta_l;

			int64_t id = ((delta_l * delta_l) * 128) +
				((delta_cr * delta_cr) * 26) +
				((delta_cb * delta_cb) * 3);

			if (alpha)
			{
				int64_t da = (e1.a - e2.a);
				id += (da * da) * 128;
			}

			int d = (id + 8192) >> 14;

			return d;
#endif
		}
		else
			return color_distance(e1, e2, alpha);
	}
1046 | |
1047 | static inline uint32_t color_distance_la(const color_rgba& a, const color_rgba& b) |
1048 | { |
1049 | const int dl = a.r - b.r; |
1050 | const int da = a.a - b.a; |
1051 | return dl * dl + da * da; |
1052 | } |
1053 | |
1054 | // String helpers |
1055 | |
1056 | inline int string_find_right(const std::string& filename, char c) |
1057 | { |
1058 | size_t result = filename.find_last_of(c); |
1059 | return (result == std::string::npos) ? -1 : (int)result; |
1060 | } |
1061 | |
1062 | inline std::string string_get_extension(const std::string &filename) |
1063 | { |
1064 | int sep = -1; |
1065 | #ifdef _WIN32 |
1066 | sep = string_find_right(filename, '\\'); |
1067 | #endif |
1068 | if (sep < 0) |
1069 | sep = string_find_right(filename, '/'); |
1070 | |
1071 | int dot = string_find_right(filename, '.'); |
1072 | if (dot <= sep) |
1073 | return "" ; |
1074 | |
1075 | std::string result(filename); |
1076 | result.erase(0, dot + 1); |
1077 | |
1078 | return result; |
1079 | } |
1080 | |
1081 | inline bool string_remove_extension(std::string &filename) |
1082 | { |
1083 | int sep = -1; |
1084 | #ifdef _WIN32 |
1085 | sep = string_find_right(filename, '\\'); |
1086 | #endif |
1087 | if (sep < 0) |
1088 | sep = string_find_right(filename, '/'); |
1089 | |
1090 | int dot = string_find_right(filename, '.'); |
1091 | if ((dot < sep) || (dot < 0)) |
1092 | return false; |
1093 | |
1094 | filename.resize(dot); |
1095 | |
1096 | return true; |
1097 | } |
1098 | |
1099 | inline std::string string_format(const char* pFmt, ...) |
1100 | { |
1101 | char buf[2048]; |
1102 | |
1103 | va_list args; |
1104 | va_start(args, pFmt); |
1105 | #ifdef _WIN32 |
1106 | vsprintf_s(buf, sizeof(buf), pFmt, args); |
1107 | #else |
1108 | vsnprintf(buf, sizeof(buf), pFmt, args); |
1109 | #endif |
1110 | va_end(args); |
1111 | |
1112 | return std::string(buf); |
1113 | } |
1114 | |
1115 | inline std::string string_tolower(const std::string& s) |
1116 | { |
1117 | std::string result(s); |
1118 | for (size_t i = 0; i < result.size(); i++) |
1119 | result[i] = (char)tolower((int)result[i]); |
1120 | return result; |
1121 | } |
1122 | |
1123 | inline char *strcpy_safe(char *pDst, size_t dst_len, const char *pSrc) |
1124 | { |
1125 | assert(pDst && pSrc && dst_len); |
1126 | if (!dst_len) |
1127 | return pDst; |
1128 | |
1129 | const size_t src_len = strlen(pSrc); |
1130 | const size_t src_len_plus_terminator = src_len + 1; |
1131 | |
1132 | if (src_len_plus_terminator <= dst_len) |
1133 | memcpy(pDst, pSrc, src_len_plus_terminator); |
1134 | else |
1135 | { |
1136 | if (dst_len > 1) |
1137 | memcpy(pDst, pSrc, dst_len - 1); |
1138 | pDst[dst_len - 1] = '\0'; |
1139 | } |
1140 | |
1141 | return pDst; |
1142 | } |
1143 | |
1144 | inline bool string_ends_with(const std::string& s, char c) |
1145 | { |
1146 | return (s.size() != 0) && (s.back() == c); |
1147 | } |
1148 | |
	// Splits path p into drive, directory (with trailing separator), filename
	// (without extension), and extension (with leading '.'). Any output pointer
	// may be NULL to skip that component. Returns false on failure.
	inline bool string_split_path(const char *p, std::string *pDrive, std::string *pDir, std::string *pFilename, std::string *pExt)
	{
#ifdef _MSC_VER
		char drive_buf[_MAX_DRIVE] = { 0 };
		char dir_buf[_MAX_DIR] = { 0 };
		char fname_buf[_MAX_FNAME] = { 0 };
		char ext_buf[_MAX_EXT] = { 0 };

		errno_t error = _splitpath_s(p,
			pDrive ? drive_buf : NULL, pDrive ? _MAX_DRIVE : 0,
			pDir ? dir_buf : NULL, pDir ? _MAX_DIR : 0,
			pFilename ? fname_buf : NULL, pFilename ? _MAX_FNAME : 0,
			pExt ? ext_buf : NULL, pExt ? _MAX_EXT : 0);
		if (error != 0)
			return false;

		if (pDrive) *pDrive = drive_buf;
		if (pDir) *pDir = dir_buf;
		if (pFilename) *pFilename = fname_buf;
		if (pExt) *pExt = ext_buf;
		return true;
#else
		// POSIX dirname()/basename() may modify their argument, so work on copies.
		char dirtmp[1024], nametmp[1024];
		strcpy_safe(dirtmp, sizeof(dirtmp), p);
		strcpy_safe(nametmp, sizeof(nametmp), p);

		// No drive concept outside Windows.
		if (pDrive)
			pDrive->resize(0);

		const char *pDirName = dirname(dirtmp);
		const char* pBaseName = basename(nametmp);
		if ((!pDirName) || (!pBaseName))
			return false;

		if (pDir)
		{
			*pDir = pDirName;
			// Guarantee a trailing slash so the result can be directly prepended to a filename.
			if ((pDir->size()) && (pDir->back() != '/'))
				*pDir += "/" ;
		}

		if (pFilename)
		{
			*pFilename = pBaseName;
			string_remove_extension(*pFilename);
		}

		if (pExt)
		{
			*pExt = pBaseName;
			*pExt = string_get_extension(*pExt);
			if (pExt->size())
				*pExt = "." + *pExt;
		}

		return true;
#endif
	}
1207 | |
1208 | inline bool is_path_separator(char c) |
1209 | { |
1210 | #ifdef _WIN32 |
1211 | return (c == '/') || (c == '\\'); |
1212 | #else |
1213 | return (c == '/'); |
1214 | #endif |
1215 | } |
1216 | |
1217 | inline bool is_drive_separator(char c) |
1218 | { |
1219 | #ifdef _WIN32 |
1220 | return (c == ':'); |
1221 | #else |
1222 | (void)c; |
1223 | return false; |
1224 | #endif |
1225 | } |
1226 | |
1227 | inline void string_combine_path(std::string &dst, const char *p, const char *q) |
1228 | { |
1229 | std::string temp(p); |
1230 | if (temp.size() && !is_path_separator(q[0])) |
1231 | { |
1232 | if (!is_path_separator(temp.back())) |
1233 | temp.append(1, BASISU_PATH_SEPERATOR_CHAR); |
1234 | } |
1235 | temp += q; |
1236 | dst.swap(temp); |
1237 | } |
1238 | |
1239 | inline void string_combine_path(std::string &dst, const char *p, const char *q, const char *r) |
1240 | { |
1241 | string_combine_path(dst, p, q); |
1242 | string_combine_path(dst, dst.c_str(), r); |
1243 | } |
1244 | |
1245 | inline void string_combine_path_and_extension(std::string &dst, const char *p, const char *q, const char *r, const char *pExt) |
1246 | { |
1247 | string_combine_path(dst, p, q, r); |
1248 | if ((!string_ends_with(dst, '.')) && (pExt[0]) && (pExt[0] != '.')) |
1249 | dst.append(1, '.'); |
1250 | dst.append(pExt); |
1251 | } |
1252 | |
1253 | inline bool string_get_pathname(const char *p, std::string &path) |
1254 | { |
1255 | std::string temp_drive, temp_path; |
1256 | if (!string_split_path(p, &temp_drive, &temp_path, NULL, NULL)) |
1257 | return false; |
1258 | string_combine_path(path, temp_drive.c_str(), temp_path.c_str()); |
1259 | return true; |
1260 | } |
1261 | |
1262 | inline bool string_get_filename(const char *p, std::string &filename) |
1263 | { |
1264 | std::string temp_ext; |
1265 | if (!string_split_path(p, nullptr, nullptr, &filename, &temp_ext)) |
1266 | return false; |
1267 | filename += temp_ext; |
1268 | return true; |
1269 | } |
1270 | |
1271 | class rand |
1272 | { |
1273 | std::mt19937 m_mt; |
1274 | |
1275 | public: |
1276 | rand() { } |
1277 | |
1278 | rand(uint32_t s) { seed(s); } |
1279 | void seed(uint32_t s) { m_mt.seed(s); } |
1280 | |
1281 | // between [l,h] |
1282 | int irand(int l, int h) { std::uniform_int_distribution<int> d(l, h); return d(m_mt); } |
1283 | |
1284 | uint32_t urand32() { return static_cast<uint32_t>(irand(INT32_MIN, INT32_MAX)); } |
1285 | |
1286 | bool bit() { return irand(0, 1) == 1; } |
1287 | |
1288 | uint8_t byte() { return static_cast<uint8_t>(urand32()); } |
1289 | |
1290 | // between [l,h) |
1291 | float frand(float l, float h) { std::uniform_real_distribution<float> d(l, h); return d(m_mt); } |
1292 | |
1293 | float gaussian(float mean, float stddev) { std::normal_distribution<float> d(mean, stddev); return d(m_mt); } |
1294 | }; |
1295 | |
	// Max-heap priority queue of (index, priority) pairs. The heap array is
	// 1-based: slot 0 is unused and the highest-priority entry lives at m_heap[1].
	class priority_queue
	{
	public:
		priority_queue() :
			m_size(0)
		{
		}

		// Empties the queue and releases the heap storage.
		void clear()
		{
			m_heap.clear();
			m_size = 0;
		}

		// Resets the queue to hold up to max_entries items, seeded with one entry.
		void init(uint32_t max_entries, uint32_t first_index, float first_priority)
		{
			m_heap.resize(max_entries + 1);
			m_heap[1].m_index = first_index;
			m_heap[1].m_priority = first_priority;
			m_size = 1;
		}

		inline uint32_t size() const { return m_size; }

		// The entry with the highest priority is always at slot 1.
		inline uint32_t get_top_index() const { return m_heap[1].m_index; }
		inline float get_top_priority() const { return m_heap[1].m_priority; }

		// Removes the top entry by moving the last entry to the root and sifting it down.
		inline void delete_top()
		{
			assert(m_size > 0);
			m_heap[1] = m_heap[m_size];
			m_size--;
			if (m_size)
				down_heap(1);
		}

		// Inserts a new entry, sifting it up until its parent has a higher priority.
		// Grows the heap array on demand.
		inline void add_heap(uint32_t index, float priority)
		{
			m_size++;

			uint32_t k = m_size;

			if (m_size >= m_heap.size())
				m_heap.resize(m_size + 1);

			for (;;)
			{
				uint32_t parent_index = k >> 1;
				if ((!parent_index) || (m_heap[parent_index].m_priority > priority))
					break;
				m_heap[k] = m_heap[parent_index];
				k = parent_index;
			}

			m_heap[k].m_index = index;
			m_heap[k].m_priority = priority;
		}

	private:
		struct entry
		{
			uint32_t m_index;
			float m_priority;
		};

		basisu::vector<entry> m_heap;
		uint32_t m_size;

		// Push down entry at index: repeatedly swap with the larger-priority child
		// until the heap property holds again.
		inline void down_heap(uint32_t heap_index)
		{
			uint32_t orig_index = m_heap[heap_index].m_index;
			const float orig_priority = m_heap[heap_index].m_priority;

			uint32_t child_index;
			while ((child_index = (heap_index << 1)) <= m_size)
			{
				// Pick the larger of the two children (if the right child exists).
				if ((child_index < m_size) && (m_heap[child_index].m_priority < m_heap[child_index + 1].m_priority)) ++child_index;
				if (orig_priority > m_heap[child_index].m_priority)
					break;
				m_heap[heap_index] = m_heap[child_index];
				heap_index = child_index;
			}

			m_heap[heap_index].m_index = orig_index;
			m_heap[heap_index].m_priority = orig_priority;
		}
	};
1384 | |
1385 | // Tree structured vector quantization (TSVQ) |
1386 | |
	// Builds a codebook by recursively splitting the weighted training set:
	// starting from one root cluster, the leaf with the highest variance is
	// repeatedly split in two (initial split along the PCA axis, refined with
	// k-means iterations) until the requested codebook size is reached.
	template <typename TrainingVectorType>
	class tree_vector_quant
	{
	public:
		typedef TrainingVectorType training_vec_type;
		typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight;
		typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs;

		tree_vector_quant() :
			m_next_codebook_index(0)
		{
		}

		void clear()
		{
			clear_vector(m_training_vecs);
			clear_vector(m_nodes);
			m_next_codebook_index = 0;
		}

		// Adds one weighted training vector; call before generate().
		void add_training_vec(const TrainingVectorType &v, uint64_t weight) { m_training_vecs.push_back(std::make_pair(v, weight)); }

		size_t get_total_training_vecs() const { return m_training_vecs.size(); }
		const array_of_weighted_training_vecs &get_training_vecs() const { return m_training_vecs; }
		array_of_weighted_training_vecs &get_training_vecs() { return m_training_vecs; }

		// Appends one entry per leaf node: the training-vector indices in that cluster.
		void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const
		{
			for (uint32_t i = 0; i < m_nodes.size(); i++)
			{
				const tsvq_node &n = m_nodes[i];
				if (!n.is_leaf())
					continue;

				codebook.resize(codebook.size() + 1);
				codebook.back() = n.m_training_vecs;
			}
		}

		// Appends one entry per leaf node: the cluster's centroid.
		void retrieve(basisu::vector<TrainingVectorType> &codebook) const
		{
			for (uint32_t i = 0; i < m_nodes.size(); i++)
			{
				const tsvq_node &n = m_nodes[i];
				if (!n.is_leaf())
					continue;

				codebook.resize(codebook.size() + 1);
				codebook.back() = n.m_origin;
			}
		}

		// Depth-first walk that emits at most max_clusters clusters: internal nodes
		// whose codebook index is past the budget are emitted whole (their full
		// index list) instead of being descended into.
		void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const
		{
			uint_vec node_stack;
			node_stack.reserve(512);

			codebook.resize(0);
			codebook.reserve(max_clusters);

			uint32_t node_index = 0;

			while (true)
			{
				const tsvq_node& cur = m_nodes[node_index];

				if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters))
				{
					codebook.resize(codebook.size() + 1);
					codebook.back() = cur.m_training_vecs;

					if (node_stack.empty())
						break;

					node_index = node_stack.back();
					node_stack.pop_back();
					continue;
				}

				node_stack.push_back(cur.m_right_index);
				node_index = cur.m_left_index;
			}
		}

		// Builds the tree by repeatedly splitting the highest-variance leaf until
		// max_size leaves exist or no splittable leaf remains. Returns false only
		// when there are no training vectors.
		bool generate(uint32_t max_size)
		{
			if (!m_training_vecs.size())
				return false;

			m_next_codebook_index = 0;

			clear_vector(m_nodes);
			m_nodes.reserve(max_size * 2 + 1);

			m_nodes.push_back(prepare_root());

			priority_queue var_heap;
			var_heap.init(max_size, 0, m_nodes[0].m_var);

			basisu::vector<uint32_t> l_children, r_children;

			// Now split the worst nodes
			l_children.reserve(m_training_vecs.size() + 1);
			r_children.reserve(m_training_vecs.size() + 1);

			uint32_t total_leaf_nodes = 1;

			//interval_timer tm;
			//tm.start();

			while ((var_heap.size()) && (total_leaf_nodes < max_size))
			{
				const uint32_t node_index = var_heap.get_top_index();
				// NOTE(review): this reference may be invalidated by split_node's
				// m_nodes.resize(); it is not used after the split_node call below.
				const tsvq_node &node = m_nodes[node_index];

				assert(node.m_var == var_heap.get_top_priority());
				assert(node.is_leaf());

				var_heap.delete_top();

				if (node.m_training_vecs.size() > 1)
				{
					if (split_node(node_index, var_heap, l_children, r_children))
					{
						// This removes one leaf node (making an internal node) and replaces it with two new leaves, so +1 total.
						total_leaf_nodes += 1;
					}
				}
			}

			//debug_printf("tree_vector_quant::generate %u: %3.3f secs\n", TrainingVectorType::num_elements, tm.get_elapsed_secs());

			return true;
		}

	private:
		// One node of the binary split tree. Leaves hold a cluster (centroid,
		// weight, variance, member indices); internal nodes hold child indices.
		class tsvq_node
		{
		public:
			// NOTE(review): m_var is left uninitialized here; prepare_root()/set()
			// assign it before it is read.
			inline tsvq_node() : m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { }

			// vecs is erased
			inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }

			inline bool is_leaf() const { return m_left_index < 0; }

			float m_var;
			uint64_t m_weight;
			TrainingVectorType m_origin;
			int32_t m_left_index, m_right_index;
			basisu::vector<uint32_t> m_training_vecs;
			int m_codebook_index;
		};

		typedef basisu::vector<tsvq_node> tsvq_node_vec;
		tsvq_node_vec m_nodes;

		array_of_weighted_training_vecs m_training_vecs;

		uint32_t m_next_codebook_index;

		// Builds the root node: all training vectors, weighted centroid, and the
		// total variance about that centroid.
		tsvq_node prepare_root() const
		{
			double ttsum = 0.0f;

			// Prepare root node containing all training vectors
			tsvq_node root;
			root.m_training_vecs.reserve(m_training_vecs.size());

			for (uint32_t i = 0; i < m_training_vecs.size(); i++)
			{
				const TrainingVectorType &v = m_training_vecs[i].first;
				const uint64_t weight = m_training_vecs[i].second;

				root.m_training_vecs.push_back(i);

				root.m_origin += (v * static_cast<float>(weight));
				root.m_weight += weight;

				ttsum += v.dot(v) * weight;
			}

			// var = sum(w*|v|^2) - |sum(w*v)|^2 / sum(w)
			root.m_var = static_cast<float>(ttsum - (root.m_origin.dot(root.m_origin) / root.m_weight));

			root.m_origin *= (1.0f / root.m_weight);

			return root;
		}

		// Splits the leaf at node_index into two children, pushing any child that
		// can be split further onto var_heap. Returns false if the node couldn't
		// be split. l_children/r_children are scratch buffers owned by the caller.
		bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children)
		{
			TrainingVectorType l_child_org, r_child_org;
			uint64_t l_weight = 0, r_weight = 0;
			float l_var = 0.0f, r_var = 0.0f;

			// Compute initial left/right child origins
			if (!prep_split(m_nodes[node_index], l_child_org, r_child_org))
				return false;

			// Use k-means iterations to refine these children vectors
			if (!refine_split(m_nodes[node_index], l_child_org, l_weight, l_var, l_children, r_child_org, r_weight, r_var, r_children))
				return false;

			// Create children
			const uint32_t l_child_index = (uint32_t)m_nodes.size(), r_child_index = (uint32_t)m_nodes.size() + 1;

			m_nodes[node_index].m_left_index = l_child_index;
			m_nodes[node_index].m_right_index = r_child_index;

			// Codebook indices record the order in which nodes were split (used by
			// the budgeted retrieve() overload).
			m_nodes[node_index].m_codebook_index = m_next_codebook_index;
			m_next_codebook_index++;

			m_nodes.resize(m_nodes.size() + 2);

			tsvq_node &l_child = m_nodes[l_child_index], &r_child = m_nodes[r_child_index];

			l_child.set(l_child_org, l_weight, l_var, l_children);
			r_child.set(r_child_org, r_weight, r_var, r_children);

			// If a child's variance underflowed to <= 0 but it still holds distinct
			// vectors, give it a tiny positive variance so it remains splittable.
			if ((l_child.m_var <= 0.0f) && (l_child.m_training_vecs.size() > 1))
			{
				TrainingVectorType v(m_training_vecs[l_child.m_training_vecs[0]].first);

				for (uint32_t i = 1; i < l_child.m_training_vecs.size(); i++)
				{
					if (!(v == m_training_vecs[l_child.m_training_vecs[i]].first))
					{
						l_child.m_var = 1e-4f;
						break;
					}
				}
			}

			if ((r_child.m_var <= 0.0f) && (r_child.m_training_vecs.size() > 1))
			{
				TrainingVectorType v(m_training_vecs[r_child.m_training_vecs[0]].first);

				for (uint32_t i = 1; i < r_child.m_training_vecs.size(); i++)
				{
					if (!(v == m_training_vecs[r_child.m_training_vecs[i]].first))
					{
						r_child.m_var = 1e-4f;
						break;
					}
				}
			}

			if ((l_child.m_var > 0.0f) && (l_child.m_training_vecs.size() > 1))
				var_heap.add_heap(l_child_index, l_child.m_var);

			if ((r_child.m_var > 0.0f) && (r_child.m_training_vecs.size() > 1))
				var_heap.add_heap(r_child_index, r_child.m_var);

			return true;
		}

		// Principal axis of the node's weighted covariance matrix, via PCA.
		TrainingVectorType compute_split_axis(const tsvq_node &node) const
		{
			const uint32_t N = TrainingVectorType::num_elements;

			matrix<N, N, float> cmatrix;

			if ((N != 16) || (!g_cpu_supports_sse41))
			{
				cmatrix.set_zero();

				// Compute covariance matrix from weighted input vectors
				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
				{
					const TrainingVectorType v(m_training_vecs[node.m_training_vecs[i]].first - node.m_origin);
					const TrainingVectorType w(static_cast<float>(m_training_vecs[node.m_training_vecs[i]].second) * v);

					// Only the upper triangle is accumulated; mirrored below.
					for (uint32_t x = 0; x < N; x++)
						for (uint32_t y = x; y < N; y++)
							cmatrix[x][y] = cmatrix[x][y] + v[x] * w[y];
				}
			}
			else
			{
#if BASISU_SUPPORT_SSE
				// Specialize the case with 16x16 matrices, which are quite expensive without SIMD.
				// This SSE function takes pointers to void types, so do some sanity checks.
				assert(sizeof(TrainingVectorType) == sizeof(float) * 16);
				assert(sizeof(training_vec_with_weight) == sizeof(std::pair<vec16F, uint64_t>));
				update_covar_matrix_16x16_sse41(node.m_training_vecs.size(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix);
#endif
			}

			const float renorm_scale = 1.0f / node.m_weight;

			for (uint32_t x = 0; x < N; x++)
				for (uint32_t y = x; y < N; y++)
					cmatrix[x][y] *= renorm_scale;

			// Diagonal flip
			for (uint32_t x = 0; x < (N - 1); x++)
				for (uint32_t y = x + 1; y < N; y++)
					cmatrix[y][x] = cmatrix[x][y];

			return compute_pca_from_covar<N, TrainingVectorType>(cmatrix);
		}

		// Computes initial left/right child centroids for a split: partition the
		// node's vectors by which side of the PCA axis they fall on, with a
		// fallback partition along the largest-extent component when one side
		// ends up empty. Returns false if no split is possible.
		bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const
		{
			//const uint32_t N = TrainingVectorType::num_elements;

			// Two vectors: trivially one per child.
			if (2 == node.m_training_vecs.size())
			{
				l_child_result = m_training_vecs[node.m_training_vecs[0]].first;
				r_child_result = m_training_vecs[node.m_training_vecs[1]].first;
				return true;
			}

			TrainingVectorType axis(compute_split_axis(node)), l_child(0.0f), r_child(0.0f);
			double l_weight = 0.0f, r_weight = 0.0f;

			// Compute initial left/right children
			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
			{
				const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;

				const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;

				// Sign of the projection onto the principal axis picks the side.
				double t = (v - node.m_origin).dot(axis);
				if (t >= 0.0f)
				{
					r_child += v * weight;
					r_weight += weight;
				}
				else
				{
					l_child += v * weight;
					l_weight += weight;
				}
			}

			if ((l_weight > 0.0f) && (r_weight > 0.0f))
			{
				l_child_result = l_child * static_cast<float>(1.0f / l_weight);
				r_child_result = r_child * static_cast<float>(1.0f / r_weight);
			}
			else
			{
				// PCA split failed (all vectors landed on one side): fall back to
				// splitting along the component with the largest extent.
				TrainingVectorType l(1e+20f);
				TrainingVectorType h(-1e+20f);
				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
				{
					const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;

					l = TrainingVectorType::component_min(l, v);
					h = TrainingVectorType::component_max(h, v);
				}

				TrainingVectorType r(h - l);

				float largest_axis_v = 0.0f;
				int largest_axis_index = -1;
				for (uint32_t i = 0; i < TrainingVectorType::num_elements; i++)
				{
					if (r[i] > largest_axis_v)
					{
						largest_axis_v = r[i];
						largest_axis_index = i;
					}
				}

				// All vectors identical on every axis - unsplittable.
				if (largest_axis_index < 0)
					return false;

				basisu::vector<float> keys(node.m_training_vecs.size());
				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
					keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index];

				// NOTE(review): the sorted 'indices' are never read below - the
				// split uses original order (i < half_index), not the sort result.
				// Confirm whether indices[i] was intended here.
				uint_vec indices(node.m_training_vecs.size());
				indirect_sort((uint32_t)node.m_training_vecs.size(), &indices[0], &keys[0]);

				l_child.set_zero();
				l_weight = 0;

				r_child.set_zero();
				r_weight = 0;

				const uint32_t half_index = (uint32_t)node.m_training_vecs.size() / 2;
				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
				{
					const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;

					const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;

					if (i < half_index)
					{
						l_child += v * weight;
						l_weight += weight;
					}
					else
					{
						r_child += v * weight;
						r_weight += weight;
					}
				}

				if ((l_weight > 0.0f) && (r_weight > 0.0f))
				{
					l_child_result = l_child * static_cast<float>(1.0f / l_weight);
					r_child_result = r_child * static_cast<float>(1.0f / r_weight);
				}
				else
				{
					// Last resort: use the component-wise min/max corners.
					l_child_result = l;
					r_child_result = h;
				}
			}

			return true;
		}

		// Refines the two child centroids with up to cMaxIters k-means iterations,
		// producing each child's centroid, total weight, variance, and member list.
		// Returns false only when the node cannot be split at all.
		bool refine_split(const tsvq_node &node,
			TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children,
			TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const
		{
			l_children.reserve(node.m_training_vecs.size());
			r_children.reserve(node.m_training_vecs.size());

			float prev_total_variance = 1e+10f;

			// Refine left/right children locations using k-means iterations
			const uint32_t cMaxIters = 6;
			for (uint32_t iter = 0; iter < cMaxIters; iter++)
			{
				l_children.resize(0);
				r_children.resize(0);

				TrainingVectorType new_l_child(cZero), new_r_child(cZero);

				double l_ttsum = 0.0f, r_ttsum = 0.0f;

				l_weight = 0;
				r_weight = 0;

				// Assign each vector to the nearer child centroid.
				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
				{
					const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;
					const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;

					double left_dist2 = l_child.squared_distance_d(v), right_dist2 = r_child.squared_distance_d(v);

					if (left_dist2 >= right_dist2)
					{
						new_r_child += (v * static_cast<float>(weight));
						r_weight += weight;

						r_ttsum += weight * v.dot(v);
						r_children.push_back(node.m_training_vecs[i]);
					}
					else
					{
						new_l_child += (v * static_cast<float>(weight));
						l_weight += weight;

						l_ttsum += weight * v.dot(v);
						l_children.push_back(node.m_training_vecs[i]);
					}
				}

				// Node is unsplittable using the above algorithm - try something else to split it up.
				if ((!l_weight) || (!r_weight))
				{
					l_children.resize(0);
					new_l_child.set(0.0f);
					l_ttsum = 0.0f;
					l_weight = 0;

					r_children.resize(0);
					new_r_child.set(0.0f);
					r_ttsum = 0.0f;
					r_weight = 0;

					// Fallback: vectors equal to the first vector go right, all others go left.
					TrainingVectorType firstVec;
					for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
					{
						const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
						const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;

						if ((!i) || (v == firstVec))
						{
							firstVec = v;

							new_r_child += (v * static_cast<float>(weight));
							r_weight += weight;

							r_ttsum += weight * v.dot(v);
							r_children.push_back(node.m_training_vecs[i]);
						}
						else
						{
							new_l_child += (v * static_cast<float>(weight));
							l_weight += weight;

							l_ttsum += weight * v.dot(v);
							l_children.push_back(node.m_training_vecs[i]);
						}
					}

					if ((!l_weight) || (!r_weight))
						return false;
				}

				// var = sum(w*|v|^2) - |sum(w*v)|^2 / sum(w), computed before normalizing.
				l_var = static_cast<float>(l_ttsum - (new_l_child.dot(new_l_child) / l_weight));
				r_var = static_cast<float>(r_ttsum - (new_r_child.dot(new_r_child) / r_weight));

				new_l_child *= (1.0f / l_weight);
				new_r_child *= (1.0f / r_weight);

				l_child = new_l_child;
				r_child = new_r_child;

				float total_var = l_var + r_var;
				const float cGiveupVariance = .00001f;
				if (total_var < cGiveupVariance)
					break;

				// Check to see if the variance has settled
				const float cVarianceDeltaThresh = .00125f;
				if (((prev_total_variance - total_var) / total_var) < cVarianceDeltaThresh)
					break;

				prev_total_variance = total_var;
			}

			return true;
		}
	};
1919 | |
	// A group of indices together with their combined weight.
	struct weighted_block_group
	{
		uint64_t m_total_weight; // sum of the weights of all members in m_indices
		uint_vec m_indices;      // indices of the group's members
	};
1925 | |
1926 | template<typename Quantizer> |
1927 | bool generate_hierarchical_codebook_threaded_internal(Quantizer& q, |
1928 | uint32_t max_codebook_size, uint32_t max_parent_codebook_size, |
1929 | basisu::vector<uint_vec>& codebook, |
1930 | basisu::vector<uint_vec>& parent_codebook, |
1931 | uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool) |
1932 | { |
1933 | codebook.resize(0); |
1934 | parent_codebook.resize(0); |
1935 | |
1936 | if ((max_threads <= 1) || (q.get_training_vecs().size() < 256) || (max_codebook_size < max_threads * 16)) |
1937 | { |
1938 | if (!q.generate(max_codebook_size)) |
1939 | return false; |
1940 | |
1941 | q.retrieve(codebook); |
1942 | |
1943 | if (max_parent_codebook_size) |
1944 | q.retrieve(max_parent_codebook_size, parent_codebook); |
1945 | |
1946 | return true; |
1947 | } |
1948 | |
1949 | const uint32_t cMaxThreads = 16; |
1950 | if (max_threads > cMaxThreads) |
1951 | max_threads = cMaxThreads; |
1952 | |
1953 | if (!q.generate(max_threads)) |
1954 | return false; |
1955 | |
1956 | basisu::vector<uint_vec> initial_codebook; |
1957 | |
1958 | q.retrieve(initial_codebook); |
1959 | |
1960 | if (initial_codebook.size() < max_threads) |
1961 | { |
1962 | codebook = initial_codebook; |
1963 | |
1964 | if (max_parent_codebook_size) |
1965 | q.retrieve(max_parent_codebook_size, parent_codebook); |
1966 | |
1967 | return true; |
1968 | } |
1969 | |
1970 | Quantizer quantizers[cMaxThreads]; |
1971 | |
1972 | bool success_flags[cMaxThreads]; |
1973 | clear_obj(success_flags); |
1974 | |
1975 | basisu::vector<uint_vec> local_clusters[cMaxThreads]; |
1976 | basisu::vector<uint_vec> local_parent_clusters[cMaxThreads]; |
1977 | |
1978 | for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++) |
1979 | { |
1980 | #ifndef __EMSCRIPTEN__ |
1981 | pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] { |
1982 | #endif |
1983 | |
1984 | Quantizer& lq = quantizers[thread_iter]; |
1985 | uint_vec& cluster_indices = initial_codebook[thread_iter]; |
1986 | |
1987 | uint_vec local_to_global(cluster_indices.size()); |
1988 | |
1989 | for (uint32_t i = 0; i < cluster_indices.size(); i++) |
1990 | { |
1991 | const uint32_t global_training_vec_index = cluster_indices[i]; |
1992 | local_to_global[i] = global_training_vec_index; |
1993 | |
1994 | lq.add_training_vec(q.get_training_vecs()[global_training_vec_index].first, q.get_training_vecs()[global_training_vec_index].second); |
1995 | } |
1996 | |
1997 | const uint32_t max_clusters = limit_clusterizers ? ((max_codebook_size + max_threads - 1) / max_threads) : (uint32_t)lq.get_total_training_vecs(); |
1998 | |
1999 | success_flags[thread_iter] = lq.generate(max_clusters); |
2000 | |
2001 | if (success_flags[thread_iter]) |
2002 | { |
2003 | lq.retrieve(local_clusters[thread_iter]); |
2004 | |
2005 | for (uint32_t i = 0; i < local_clusters[thread_iter].size(); i++) |
2006 | { |
2007 | for (uint32_t j = 0; j < local_clusters[thread_iter][i].size(); j++) |
2008 | local_clusters[thread_iter][i][j] = local_to_global[local_clusters[thread_iter][i][j]]; |
2009 | } |
2010 | |
2011 | if (max_parent_codebook_size) |
2012 | { |
2013 | lq.retrieve((max_parent_codebook_size + max_threads - 1) / max_threads, local_parent_clusters[thread_iter]); |
2014 | |
2015 | for (uint32_t i = 0; i < local_parent_clusters[thread_iter].size(); i++) |
2016 | { |
2017 | for (uint32_t j = 0; j < local_parent_clusters[thread_iter][i].size(); j++) |
2018 | local_parent_clusters[thread_iter][i][j] = local_to_global[local_parent_clusters[thread_iter][i][j]]; |
2019 | } |
2020 | } |
2021 | } |
2022 | |
2023 | #ifndef __EMSCRIPTEN__ |
2024 | } ); |
2025 | #endif |
2026 | |
2027 | } // thread_iter |
2028 | |
2029 | #ifndef __EMSCRIPTEN__ |
2030 | pJob_pool->wait_for_all(); |
2031 | #endif |
2032 | |
2033 | uint32_t total_clusters = 0, total_parent_clusters = 0; |
2034 | |
2035 | for (int thread_iter = 0; thread_iter < (int)max_threads; thread_iter++) |
2036 | { |
2037 | if (!success_flags[thread_iter]) |
2038 | return false; |
2039 | total_clusters += (uint32_t)local_clusters[thread_iter].size(); |
2040 | total_parent_clusters += (uint32_t)local_parent_clusters[thread_iter].size(); |
2041 | } |
2042 | |
2043 | codebook.reserve(total_clusters); |
2044 | parent_codebook.reserve(total_parent_clusters); |
2045 | |
2046 | for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++) |
2047 | { |
2048 | for (uint32_t j = 0; j < local_clusters[thread_iter].size(); j++) |
2049 | { |
2050 | codebook.resize(codebook.size() + 1); |
2051 | codebook.back().swap(local_clusters[thread_iter][j]); |
2052 | } |
2053 | |
2054 | for (uint32_t j = 0; j < local_parent_clusters[thread_iter].size(); j++) |
2055 | { |
2056 | parent_codebook.resize(parent_codebook.size() + 1); |
2057 | parent_codebook.back().swap(local_parent_clusters[thread_iter][j]); |
2058 | } |
2059 | } |
2060 | |
2061 | return true; |
2062 | } |
2063 | |
	// Deduplicates q's training vectors (summing the weights of identical vectors), quantizes
	// the unique set (threaded via pJob_pool when large enough), then expands the resulting
	// group indices back to the original training vector indices in codebook/parent_codebook.
	// If even_odd_input_pairs_equal is true, the caller guarantees the training vectors come in
	// identical even/odd pairs (vec[i] == vec[i+1]), so only every other vector is hashed.
	// Returns false if quantization fails.
	template<typename Quantizer>
	bool generate_hierarchical_codebook_threaded(Quantizer& q,
		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
		basisu::vector<uint_vec>& codebook,
		basisu::vector<uint_vec>& parent_codebook,
		uint32_t max_threads, job_pool *pJob_pool,
		bool even_odd_input_pairs_equal)
	{
		typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher;

		// Maps a unique training vector to the group of original training vector indices that
		// share it, plus the group's combined weight.
		typedef std::unordered_map < typename Quantizer::training_vec_type, weighted_block_group,
			training_vec_bit_hasher> group_hash;

		//interval_timer tm;
		//tm.start();

		group_hash unique_vecs;

		unique_vecs.reserve(20000);

		// Reused for every insertion attempt; on a hash hit its contents are merged into the
		// existing entry instead.
		weighted_block_group g;

		if (even_odd_input_pairs_equal)
		{
			g.m_indices.resize(2);

			assert(q.get_training_vecs().size() >= 2 && (q.get_training_vecs().size() & 1) == 0);

			for (uint32_t i = 0; i < q.get_training_vecs().size(); i += 2)
			{
				assert(q.get_training_vecs()[i].first == q.get_training_vecs()[i + 1].first);

				g.m_total_weight = q.get_training_vecs()[i].second + q.get_training_vecs()[i + 1].second;
				g.m_indices[0] = i;
				g.m_indices[1] = i + 1;

				auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));

				// Already present: accumulate the pair's weight and indices into the group.
				if (!ins_res.second)
				{
					(ins_res.first)->second.m_total_weight += g.m_total_weight;
					(ins_res.first)->second.m_indices.push_back(i);
					(ins_res.first)->second.m_indices.push_back(i + 1);
				}
			}
		}
		else
		{
			g.m_indices.resize(1);

			for (uint32_t i = 0; i < q.get_training_vecs().size(); i++)
			{
				g.m_total_weight = q.get_training_vecs()[i].second;
				g.m_indices[0] = i;

				auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));

				if (!ins_res.second)
				{
					(ins_res.first)->second.m_total_weight += g.m_total_weight;
					(ins_res.first)->second.m_indices.push_back(i);
				}
			}
		}

		//debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors, %3.3f secs\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size(), tm.get_elapsed_secs());
		debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors\n" , q.get_total_training_vecs(), (uint32_t)unique_vecs.size());

		// Build a second quantizer over only the unique vectors, remembering each hash entry's
		// iterator so group indices can be mapped back to original training vector indices.
		Quantizer group_quant;
		typedef typename group_hash::const_iterator group_hash_const_iter;
		basisu::vector<group_hash_const_iter> unique_vec_iters;
		unique_vec_iters.reserve(unique_vecs.size());

		for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter)
		{
			group_quant.add_training_vec(iter->first, iter->second.m_total_weight);
			unique_vec_iters.push_back(iter);
		}

		// Only cap per-thread cluster counts when there are more unique vectors than codebook slots.
		bool limit_clusterizers = true;
		if (unique_vecs.size() <= max_codebook_size)
			limit_clusterizers = false;

		debug_printf("Limit clusterizers: %u\n" , limit_clusterizers);

		// Small inputs are quantized on a single thread; threading only pays off on large sets.
		basisu::vector<uint_vec> group_codebook, group_parent_codebook;
		bool status = generate_hierarchical_codebook_threaded_internal(group_quant,
			max_codebook_size, max_parent_codebook_size,
			group_codebook,
			group_parent_codebook,
			(unique_vecs.size() < 65536*4) ? 1 : max_threads, limit_clusterizers, pJob_pool);

		if (!status)
			return false;

		// Expand each group index back into the full list of original training vector indices.
		codebook.resize(0);
		for (uint32_t i = 0; i < group_codebook.size(); i++)
		{
			codebook.resize(codebook.size() + 1);

			for (uint32_t j = 0; j < group_codebook[i].size(); j++)
			{
				const uint32_t group_index = group_codebook[i][j];

				typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];
				const uint_vec& training_vec_indices = group_iter->second.m_indices;

				append_vector(codebook.back(), training_vec_indices);
			}
		}

		// Same expansion for the (optional) parent codebook.
		parent_codebook.resize(0);
		for (uint32_t i = 0; i < group_parent_codebook.size(); i++)
		{
			parent_codebook.resize(parent_codebook.size() + 1);

			for (uint32_t j = 0; j < group_parent_codebook[i].size(); j++)
			{
				const uint32_t group_index = group_parent_codebook[i][j];

				typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];
				const uint_vec& training_vec_indices = group_iter->second.m_indices;

				append_vector(parent_codebook.back(), training_vec_indices);
			}
		}

		return true;
	}
2193 | |
2194 | // Canonical Huffman coding |
2195 | |
2196 | class histogram |
2197 | { |
2198 | basisu::vector<uint32_t> m_hist; |
2199 | |
2200 | public: |
2201 | histogram(uint32_t size = 0) { init(size); } |
2202 | |
2203 | void clear() |
2204 | { |
2205 | clear_vector(m_hist); |
2206 | } |
2207 | |
2208 | void init(uint32_t size) |
2209 | { |
2210 | m_hist.resize(0); |
2211 | m_hist.resize(size); |
2212 | } |
2213 | |
2214 | inline uint32_t size() const { return static_cast<uint32_t>(m_hist.size()); } |
2215 | |
2216 | inline const uint32_t &operator[] (uint32_t index) const |
2217 | { |
2218 | return m_hist[index]; |
2219 | } |
2220 | |
2221 | inline uint32_t &operator[] (uint32_t index) |
2222 | { |
2223 | return m_hist[index]; |
2224 | } |
2225 | |
2226 | inline void inc(uint32_t index) |
2227 | { |
2228 | m_hist[index]++; |
2229 | } |
2230 | |
2231 | uint64_t get_total() const |
2232 | { |
2233 | uint64_t total = 0; |
2234 | for (uint32_t i = 0; i < m_hist.size(); ++i) |
2235 | total += m_hist[i]; |
2236 | return total; |
2237 | } |
2238 | |
2239 | double get_entropy() const |
2240 | { |
2241 | double total = static_cast<double>(get_total()); |
2242 | if (total == 0.0f) |
2243 | return 0.0f; |
2244 | |
2245 | const double inv_total = 1.0f / total; |
2246 | const double neg_inv_log2 = -1.0f / log(2.0f); |
2247 | |
2248 | double e = 0.0f; |
2249 | for (uint32_t i = 0; i < m_hist.size(); i++) |
2250 | if (m_hist[i]) |
2251 | e += log(m_hist[i] * inv_total) * neg_inv_log2 * static_cast<double>(m_hist[i]); |
2252 | |
2253 | return e; |
2254 | } |
2255 | }; |
2256 | |
	// Symbol/frequency pair used while constructing canonical Huffman codes.
	struct sym_freq
	{
		uint32_t m_key;       // symbol frequency (the radix sort key)
		uint16_t m_sym_index; // original symbol index
	};

	// Radix sorts num_syms sym_freq records by m_key using pSyms0/pSyms1 as ping-pong buffers;
	// returns the buffer holding the sorted result.
	sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1);
	// In-place computes minimum-redundancy (Huffman) code lengths from frequency-sorted symbols.
	void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms);
	// Adjusts the code-length histogram pNum_codes so no code exceeds max_code_size bits.
	void canonical_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size);
2266 | |
2267 | class huffman_encoding_table |
2268 | { |
2269 | public: |
2270 | huffman_encoding_table() |
2271 | { |
2272 | } |
2273 | |
2274 | void clear() |
2275 | { |
2276 | clear_vector(m_codes); |
2277 | clear_vector(m_code_sizes); |
2278 | } |
2279 | |
2280 | bool init(const histogram &h, uint32_t max_code_size = cHuffmanMaxSupportedCodeSize) |
2281 | { |
2282 | return init(h.size(), &h[0], max_code_size); |
2283 | } |
2284 | |
2285 | bool init(uint32_t num_syms, const uint16_t *pFreq, uint32_t max_code_size); |
2286 | bool init(uint32_t num_syms, const uint32_t *pSym_freq, uint32_t max_code_size); |
2287 | |
2288 | inline const uint16_vec &get_codes() const { return m_codes; } |
2289 | inline const uint8_vec &get_code_sizes() const { return m_code_sizes; } |
2290 | |
2291 | uint32_t get_total_used_codes() const |
2292 | { |
2293 | for (int i = static_cast<int>(m_code_sizes.size()) - 1; i >= 0; i--) |
2294 | if (m_code_sizes[i]) |
2295 | return i + 1; |
2296 | return 0; |
2297 | } |
2298 | |
2299 | private: |
2300 | uint16_vec m_codes; |
2301 | uint8_vec m_code_sizes; |
2302 | }; |
2303 | |
	// LSB-first bitstream writer with helpers for Huffman, truncated binary, Rice and VLC codes.
	// Bits accumulate in m_bit_buffer and are appended to m_bytes a byte at a time.
	class bitwise_coder
	{
	public:
		bitwise_coder() :
			m_bit_buffer(0),
			m_bit_buffer_size(0),
			m_total_bits(0)
		{
		}

		inline void clear()
		{
			clear_vector(m_bytes);
			m_bit_buffer = 0;
			m_bit_buffer_size = 0;
			m_total_bits = 0;
		}

		inline const uint8_vec &get_bytes() const { return m_bytes; }

		inline uint64_t get_total_bits() const { return m_total_bits; }
		inline void clear_total_bits() { m_total_bits = 0; }

		inline void init(uint32_t reserve_size = 1024)
		{
			m_bytes.reserve(reserve_size);
			m_bytes.resize(0);

			m_bit_buffer = 0;
			m_bit_buffer_size = 0;
			m_total_bits = 0;
		}

		// Pads the stream to a byte boundary with zero bits and emits the final partial byte.
		// Returns the number of bits written to the stream (8 or 0).
		inline uint32_t flush()
		{
			if (m_bit_buffer_size)
			{
				// Account for the zero padding bits that round the stream up to a whole byte.
				m_total_bits += 8 - (m_bit_buffer_size & 7);
				append_byte(static_cast<uint8_t>(m_bit_buffer));

				m_bit_buffer = 0;
				m_bit_buffer_size = 0;

				return 8;
			}

			return 0;
		}

		// Writes the low num_bits of "bits" (LSB first); num_bits may be 0..32. Returns num_bits.
		inline uint32_t put_bits(uint32_t bits, uint32_t num_bits)
		{
			assert(num_bits <= 32);
			assert(bits < (1ULL << num_bits));

			if (!num_bits)
				return 0;

			m_total_bits += num_bits;

			// Merge the new bits above the pending buffered bits, then drain whole bytes.
			uint64_t v = (static_cast<uint64_t>(bits) << m_bit_buffer_size) | m_bit_buffer;
			m_bit_buffer_size += num_bits;

			while (m_bit_buffer_size >= 8)
			{
				append_byte(static_cast<uint8_t>(v));
				v >>= 8;
				m_bit_buffer_size -= 8;
			}

			m_bit_buffer = static_cast<uint8_t>(v);
			return num_bits;
		}

		// Writes symbol "sym" using the given Huffman table; returns the code length in bits.
		inline uint32_t put_code(uint32_t sym, const huffman_encoding_table &tab)
		{
			uint32_t code = tab.get_codes()[sym];
			uint32_t code_size = tab.get_code_sizes()[sym];
			assert(code_size >= 1);
			put_bits(code, code_size);
			return code_size;
		}

		// Truncated binary code for v in [0, n): the first u = 2^(k+1)-n values use k bits,
		// the rest use k+1 bits. Returns the number of bits written.
		inline uint32_t put_truncated_binary(uint32_t v, uint32_t n)
		{
			assert((n >= 2) && (v < n));

			uint32_t k = floor_log2i(n);
			uint32_t u = (1 << (k + 1)) - n;

			if (v < u)
				return put_bits(v, k);

			uint32_t x = v + u;
			assert((x >> 1) >= u);

			put_bits(x >> 1, k);
			put_bits(x & 1, 1);
			return k + 1;
		}

		// Rice code with parameter m: quotient q = v >> m in unary (q ones then a zero),
		// followed by the low m remainder bits. Returns the number of bits written.
		inline uint32_t put_rice(uint32_t v, uint32_t m)
		{
			assert(m);

			const uint64_t start_bits = m_total_bits;

			uint32_t q = v >> m, r = v & ((1 << m) - 1);

			// rice coding sanity check
			assert(q <= 64);

			// Emit the unary quotient in chunks of 16 one-bits.
			for (; q > 16; q -= 16)
				put_bits(0xFFFF, 16);

			put_bits((1 << q) - 1, q);
			// Low bit 0 terminates the unary run; the remainder r occupies the next m bits.
			put_bits(r << 1, m + 1);

			return (uint32_t)(m_total_bits - start_bits);
		}

		// Variable-length code: chunk_bits data bits plus a continuation bit per chunk,
		// least significant chunk first. Returns the number of bits written.
		inline uint32_t put_vlc(uint32_t v, uint32_t chunk_bits)
		{
			assert(chunk_bits);

			const uint32_t chunk_size = 1 << chunk_bits;
			const uint32_t chunk_mask = chunk_size - 1;

			uint32_t total_bits = 0;

			for ( ; ; )
			{
				uint32_t next_v = v >> chunk_bits;

				// The top bit of each chunk flags whether another chunk follows.
				total_bits += put_bits((v & chunk_mask) | (next_v ? chunk_size : 0), chunk_bits + 1);
				if (!next_v)
					break;

				v = next_v;
			}

			return total_bits;
		}

		// Serializes a Huffman table (code lengths) into the stream; returns bits written.
		uint32_t emit_huffman_table(const huffman_encoding_table &tab);

	private:
		uint8_vec m_bytes;                            // completed output bytes
		uint32_t m_bit_buffer, m_bit_buffer_size;     // pending bits (< 8) not yet byte-aligned
		uint64_t m_total_bits;                        // total bits written, including flush padding

		void append_byte(uint8_t c)
		{
			m_bytes.resize(m_bytes.size() + 1);
			m_bytes.back() = c;
		}

		static void end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len);
		static void end_zero_run(uint16_vec &syms, uint32_t &run_size);
	};
2463 | |
2464 | class huff2D |
2465 | { |
2466 | public: |
2467 | huff2D() { } |
2468 | huff2D(uint32_t bits_per_sym, uint32_t total_syms_per_group) { init(bits_per_sym, total_syms_per_group); } |
2469 | |
2470 | inline const histogram &get_histogram() const { return m_histogram; } |
2471 | inline const huffman_encoding_table &get_encoding_table() const { return m_encoding_table; } |
2472 | |
2473 | inline void init(uint32_t bits_per_sym, uint32_t total_syms_per_group) |
2474 | { |
2475 | assert((bits_per_sym * total_syms_per_group) <= 16 && total_syms_per_group >= 1 && bits_per_sym >= 1); |
2476 | |
2477 | m_bits_per_sym = bits_per_sym; |
2478 | m_total_syms_per_group = total_syms_per_group; |
2479 | m_cur_sym_bits = 0; |
2480 | m_cur_num_syms = 0; |
2481 | m_decode_syms_remaining = 0; |
2482 | m_next_decoder_group_index = 0; |
2483 | |
2484 | m_histogram.init(1 << (bits_per_sym * total_syms_per_group)); |
2485 | } |
2486 | |
2487 | inline void clear() |
2488 | { |
2489 | m_group_bits.clear(); |
2490 | |
2491 | m_cur_sym_bits = 0; |
2492 | m_cur_num_syms = 0; |
2493 | m_decode_syms_remaining = 0; |
2494 | m_next_decoder_group_index = 0; |
2495 | } |
2496 | |
2497 | inline void emit(uint32_t sym) |
2498 | { |
2499 | m_cur_sym_bits |= (sym << (m_cur_num_syms * m_bits_per_sym)); |
2500 | m_cur_num_syms++; |
2501 | |
2502 | if (m_cur_num_syms == m_total_syms_per_group) |
2503 | flush(); |
2504 | } |
2505 | |
2506 | inline void flush() |
2507 | { |
2508 | if (m_cur_num_syms) |
2509 | { |
2510 | m_group_bits.push_back(m_cur_sym_bits); |
2511 | m_histogram.inc(m_cur_sym_bits); |
2512 | |
2513 | m_cur_sym_bits = 0; |
2514 | m_cur_num_syms = 0; |
2515 | } |
2516 | } |
2517 | |
2518 | inline bool start_encoding(uint32_t code_size_limit = 16) |
2519 | { |
2520 | flush(); |
2521 | |
2522 | if (!m_encoding_table.init(m_histogram, code_size_limit)) |
2523 | return false; |
2524 | |
2525 | m_decode_syms_remaining = 0; |
2526 | m_next_decoder_group_index = 0; |
2527 | |
2528 | return true; |
2529 | } |
2530 | |
2531 | inline uint32_t emit_next_sym(bitwise_coder &c) |
2532 | { |
2533 | uint32_t bits = 0; |
2534 | |
2535 | if (!m_decode_syms_remaining) |
2536 | { |
2537 | bits = c.put_code(m_group_bits[m_next_decoder_group_index++], m_encoding_table); |
2538 | m_decode_syms_remaining = m_total_syms_per_group; |
2539 | } |
2540 | |
2541 | m_decode_syms_remaining--; |
2542 | return bits; |
2543 | } |
2544 | |
2545 | inline void emit_flush() |
2546 | { |
2547 | m_decode_syms_remaining = 0; |
2548 | } |
2549 | |
2550 | private: |
2551 | uint_vec m_group_bits; |
2552 | huffman_encoding_table m_encoding_table; |
2553 | histogram m_histogram; |
2554 | uint32_t m_bits_per_sym, m_total_syms_per_group, m_cur_sym_bits, m_cur_num_syms, m_next_decoder_group_index, m_decode_syms_remaining; |
2555 | }; |
2556 | |
2557 | bool huffman_test(int rand_seed); |
2558 | |
2559 | // VQ index reordering |
2560 | |
	// Reorders palette/codebook indices so that symbols which frequently appear near each
	// other (per the index co-occurrence histogram) receive nearby indices, improving
	// downstream compression. Produces an old-to-new index remap table.
	class palette_index_reorderer
	{
	public:
		palette_index_reorderer()
		{
		}

		void clear()
		{
			clear_vector(m_hist);
			clear_vector(m_total_count_to_picked);
			clear_vector(m_entries_picked);
			clear_vector(m_entries_to_do);
			clear_vector(m_remap_table);
		}

		// returns [0,1] distance of entry i to entry j
		typedef float(*pEntry_dist_func)(uint32_t i, uint32_t j, void *pCtx);

		// Builds the remap table from num_indices palette indices; pDist_func (optional,
		// weighted by dist_func_weight) biases ordering by entry similarity.
		void init(uint32_t num_indices, const uint32_t *pIndices, uint32_t num_syms, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);

		// Table remaps old to new symbol indices
		inline const uint_vec &get_remap_table() const { return m_remap_table; }

	private:
		uint_vec m_hist, m_total_count_to_picked, m_entries_picked, m_entries_to_do, m_remap_table;

		// m_hist stores the co-occurrence counts in the upper triangle of an n x n matrix;
		// get_hist swaps (i, j) so either argument order reads the same cell.
		inline uint32_t get_hist(int i, int j, int n) const { return (i > j) ? m_hist[j * n + i] : m_hist[i * n + j]; }
		// Only counts valid, distinct pairs with i < j, matching get_hist's upper-triangle layout.
		inline void inc_hist(int i, int j, int n) { if ((i != j) && (i < j) && (i != -1) && (j != -1)) { assert(((uint32_t)i < (uint32_t)n) && ((uint32_t)j < (uint32_t)n)); m_hist[i * n + j]++; } }

		void prepare_hist(uint32_t num_syms, uint32_t num_indices, const uint32_t *pIndices);
		void find_initial(uint32_t num_syms);
		void find_next_entry(uint32_t &best_entry, double &best_count, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
		float pick_side(uint32_t num_syms, uint32_t entry_to_move, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
	};
2596 | |
2597 | // Simple 32-bit 2D image class |
2598 | |
2599 | class image |
2600 | { |
2601 | public: |
2602 | image() : |
2603 | m_width(0), m_height(0), m_pitch(0) |
2604 | { |
2605 | } |
2606 | |
2607 | image(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) : |
2608 | m_width(0), m_height(0), m_pitch(0) |
2609 | { |
2610 | resize(w, h, p); |
2611 | } |
2612 | |
2613 | image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) : |
2614 | m_width(0), m_height(0), m_pitch(0) |
2615 | { |
2616 | init(pImage, width, height, comps); |
2617 | } |
2618 | |
2619 | image(const image &other) : |
2620 | m_width(0), m_height(0), m_pitch(0) |
2621 | { |
2622 | *this = other; |
2623 | } |
2624 | |
2625 | image &swap(image &other) |
2626 | { |
2627 | std::swap(m_width, other.m_width); |
2628 | std::swap(m_height, other.m_height); |
2629 | std::swap(m_pitch, other.m_pitch); |
2630 | m_pixels.swap(other.m_pixels); |
2631 | return *this; |
2632 | } |
2633 | |
2634 | image &operator= (const image &rhs) |
2635 | { |
2636 | if (this != &rhs) |
2637 | { |
2638 | m_width = rhs.m_width; |
2639 | m_height = rhs.m_height; |
2640 | m_pitch = rhs.m_pitch; |
2641 | m_pixels = rhs.m_pixels; |
2642 | } |
2643 | return *this; |
2644 | } |
2645 | |
2646 | image &clear() |
2647 | { |
2648 | m_width = 0; |
2649 | m_height = 0; |
2650 | m_pitch = 0; |
2651 | clear_vector(m_pixels); |
2652 | return *this; |
2653 | } |
2654 | |
2655 | image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color) |
2656 | { |
2657 | return crop(w, h, p, background); |
2658 | } |
2659 | |
2660 | image &set_all(const color_rgba &c) |
2661 | { |
2662 | for (uint32_t i = 0; i < m_pixels.size(); i++) |
2663 | m_pixels[i] = c; |
2664 | return *this; |
2665 | } |
2666 | |
2667 | void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) |
2668 | { |
2669 | assert(comps >= 1 && comps <= 4); |
2670 | |
2671 | resize(width, height); |
2672 | |
2673 | for (uint32_t y = 0; y < height; y++) |
2674 | { |
2675 | for (uint32_t x = 0; x < width; x++) |
2676 | { |
2677 | const uint8_t *pSrc = &pImage[(x + y * width) * comps]; |
2678 | color_rgba &dst = (*this)(x, y); |
2679 | |
2680 | if (comps == 1) |
2681 | { |
2682 | dst.r = pSrc[0]; |
2683 | dst.g = pSrc[0]; |
2684 | dst.b = pSrc[0]; |
2685 | dst.a = 255; |
2686 | } |
2687 | else if (comps == 2) |
2688 | { |
2689 | dst.r = pSrc[0]; |
2690 | dst.g = pSrc[0]; |
2691 | dst.b = pSrc[0]; |
2692 | dst.a = pSrc[1]; |
2693 | } |
2694 | else |
2695 | { |
2696 | dst.r = pSrc[0]; |
2697 | dst.g = pSrc[1]; |
2698 | dst.b = pSrc[2]; |
2699 | if (comps == 4) |
2700 | dst.a = pSrc[3]; |
2701 | else |
2702 | dst.a = 255; |
2703 | } |
2704 | } |
2705 | } |
2706 | } |
2707 | |
2708 | image &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba &c) |
2709 | { |
2710 | for (uint32_t iy = 0; iy < h; iy++) |
2711 | for (uint32_t ix = 0; ix < w; ix++) |
2712 | set_clipped(x + ix, y + iy, c); |
2713 | return *this; |
2714 | } |
2715 | |
2716 | image& fill_box_alpha(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba& c) |
2717 | { |
2718 | for (uint32_t iy = 0; iy < h; iy++) |
2719 | for (uint32_t ix = 0; ix < w; ix++) |
2720 | set_clipped_alpha(x + ix, y + iy, c); |
2721 | return *this; |
2722 | } |
2723 | |
2724 | image &crop_dup_borders(uint32_t w, uint32_t h) |
2725 | { |
2726 | const uint32_t orig_w = m_width, orig_h = m_height; |
2727 | |
2728 | crop(w, h); |
2729 | |
2730 | if (orig_w && orig_h) |
2731 | { |
2732 | if (m_width > orig_w) |
2733 | { |
2734 | for (uint32_t x = orig_w; x < m_width; x++) |
2735 | for (uint32_t y = 0; y < m_height; y++) |
2736 | set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U))); |
2737 | } |
2738 | |
2739 | if (m_height > orig_h) |
2740 | { |
2741 | for (uint32_t y = orig_h; y < m_height; y++) |
2742 | for (uint32_t x = 0; x < m_width; x++) |
2743 | set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U))); |
2744 | } |
2745 | } |
2746 | return *this; |
2747 | } |
2748 | |
2749 | // pPixels MUST have been allocated using malloc() (basisu::vector will eventually use free() on the pointer). |
2750 | image& grant_ownership(color_rgba* pPixels, uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) |
2751 | { |
2752 | if (p == UINT32_MAX) |
2753 | p = w; |
2754 | |
2755 | clear(); |
2756 | |
2757 | if ((!p) || (!w) || (!h)) |
2758 | return *this; |
2759 | |
2760 | m_pixels.grant_ownership(pPixels, p * h, p * h); |
2761 | |
2762 | m_width = w; |
2763 | m_height = h; |
2764 | m_pitch = p; |
2765 | |
2766 | return *this; |
2767 | } |
2768 | |
2769 | image &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba &background = g_black_color, bool init_image = true) |
2770 | { |
2771 | if (p == UINT32_MAX) |
2772 | p = w; |
2773 | |
2774 | if ((w == m_width) && (m_height == h) && (m_pitch == p)) |
2775 | return *this; |
2776 | |
2777 | if ((!w) || (!h) || (!p)) |
2778 | { |
2779 | clear(); |
2780 | return *this; |
2781 | } |
2782 | |
2783 | color_rgba_vec cur_state; |
2784 | cur_state.swap(m_pixels); |
2785 | |
2786 | m_pixels.resize(p * h); |
2787 | |
2788 | if (init_image) |
2789 | { |
2790 | if (m_width || m_height) |
2791 | { |
2792 | for (uint32_t y = 0; y < h; y++) |
2793 | { |
2794 | for (uint32_t x = 0; x < w; x++) |
2795 | { |
2796 | if ((x < m_width) && (y < m_height)) |
2797 | m_pixels[x + y * p] = cur_state[x + y * m_pitch]; |
2798 | else |
2799 | m_pixels[x + y * p] = background; |
2800 | } |
2801 | } |
2802 | } |
2803 | else |
2804 | { |
2805 | m_pixels.set_all(background); |
2806 | } |
2807 | } |
2808 | |
2809 | m_width = w; |
2810 | m_height = h; |
2811 | m_pitch = p; |
2812 | |
2813 | return *this; |
2814 | } |
2815 | |
2816 | inline const color_rgba &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; } |
2817 | inline color_rgba &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; } |
2818 | |
2819 | inline const color_rgba &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } |
2820 | inline color_rgba &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } |
2821 | |
2822 | inline const color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const |
2823 | { |
2824 | x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1); |
2825 | y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1); |
2826 | return m_pixels[x + y * m_pitch]; |
2827 | } |
2828 | |
2829 | inline color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) |
2830 | { |
2831 | x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1); |
2832 | y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1); |
2833 | return m_pixels[x + y * m_pitch]; |
2834 | } |
2835 | |
2836 | inline image &set_clipped(int x, int y, const color_rgba &c) |
2837 | { |
2838 | if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height)) |
2839 | (*this)(x, y) = c; |
2840 | return *this; |
2841 | } |
2842 | |
2843 | inline image& set_clipped_alpha(int x, int y, const color_rgba& c) |
2844 | { |
2845 | if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height)) |
2846 | (*this)(x, y).m_comps[3] = c.m_comps[3]; |
2847 | return *this; |
2848 | } |
2849 | |
2850 | // Very straightforward blit with full clipping. Not fast, but it works. |
2851 | image &blit(const image &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y) |
2852 | { |
2853 | for (int y = 0; y < src_h; y++) |
2854 | { |
2855 | const int sy = src_y + y; |
2856 | if (sy < 0) |
2857 | continue; |
2858 | else if (sy >= (int)src.get_height()) |
2859 | break; |
2860 | |
2861 | for (int x = 0; x < src_w; x++) |
2862 | { |
2863 | const int sx = src_x + x; |
2864 | if (sx < 0) |
2865 | continue; |
2866 | else if (sx >= (int)src.get_height()) |
2867 | break; |
2868 | |
2869 | set_clipped(dst_x + x, dst_y + y, src(sx, sy)); |
2870 | } |
2871 | } |
2872 | |
2873 | return *this; |
2874 | } |
2875 | |
2876 | const image &(color_rgba *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const |
2877 | { |
2878 | if (((src_x + w) > m_width) || ((src_y + h) > m_height)) |
2879 | { |
2880 | // Slower clamping case |
2881 | for (uint32_t y = 0; y < h; y++) |
2882 | for (uint32_t x = 0; x < w; x++) |
2883 | *pDst++ = get_clamped(src_x + x, src_y + y); |
2884 | } |
2885 | else |
2886 | { |
2887 | const color_rgba* pSrc = &m_pixels[src_x + src_y * m_pitch]; |
2888 | |
2889 | for (uint32_t y = 0; y < h; y++) |
2890 | { |
2891 | memcpy(pDst, pSrc, w * sizeof(color_rgba)); |
2892 | pSrc += m_pitch; |
2893 | pDst += w; |
2894 | } |
2895 | } |
2896 | |
2897 | return *this; |
2898 | } |
2899 | |
2900 | image &set_block_clipped(const color_rgba *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h) |
2901 | { |
2902 | for (uint32_t y = 0; y < h; y++) |
2903 | for (uint32_t x = 0; x < w; x++) |
2904 | set_clipped(dst_x + x, dst_y + y, *pSrc++); |
2905 | return *this; |
2906 | } |
2907 | |
2908 | inline uint32_t get_width() const { return m_width; } |
2909 | inline uint32_t get_height() const { return m_height; } |
2910 | inline uint32_t get_pitch() const { return m_pitch; } |
2911 | inline uint32_t get_total_pixels() const { return m_width * m_height; } |
2912 | |
2913 | inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; } |
2914 | inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; } |
2915 | inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); } |
2916 | |
2917 | inline const color_rgba_vec &get_pixels() const { return m_pixels; } |
2918 | inline color_rgba_vec &get_pixels() { return m_pixels; } |
2919 | |
2920 | inline const color_rgba *get_ptr() const { return &m_pixels[0]; } |
2921 | inline color_rgba *get_ptr() { return &m_pixels[0]; } |
2922 | |
2923 | bool has_alpha() const |
2924 | { |
2925 | for (uint32_t y = 0; y < m_height; ++y) |
2926 | for (uint32_t x = 0; x < m_width; ++x) |
2927 | if ((*this)(x, y).a < 255) |
2928 | return true; |
2929 | |
2930 | return false; |
2931 | } |
2932 | |
2933 | image &set_alpha(uint8_t a) |
2934 | { |
2935 | for (uint32_t y = 0; y < m_height; ++y) |
2936 | for (uint32_t x = 0; x < m_width; ++x) |
2937 | (*this)(x, y).a = a; |
2938 | return *this; |
2939 | } |
2940 | |
2941 | image &flip_y() |
2942 | { |
2943 | for (uint32_t y = 0; y < m_height / 2; ++y) |
2944 | for (uint32_t x = 0; x < m_width; ++x) |
2945 | std::swap((*this)(x, y), (*this)(x, m_height - 1 - y)); |
2946 | return *this; |
2947 | } |
2948 | |
		// TODO: There are many ways to do this, not sure this is the best way.
		// Treats the image as an RGB-encoded unit normal map ([0,255] mapped to [-1,1]) and
		// snaps or renormalizes texels whose decoded vector length has drifted too far from 1.
		// Alpha is never modified.
		image &renormalize_normal_map()
		{
			for (uint32_t y = 0; y < m_height; y++)
			{
				for (uint32_t x = 0; x < m_width; x++)
				{
					color_rgba &c = (*this)(x, y);
					// (128,128,128) decodes to (near) the zero vector; leave it as the canonical "no normal" texel.
					if ((c.r == 128) && (c.g == 128) && (c.b == 128))
						continue;

					// Decode [0,255] to [-1,1].
					vec3F v(c.r, c.g, c.b);
					v = (v * (2.0f / 255.0f)) - vec3F(1.0f);
					v.clamp(-1.0f, 1.0f);

					float length = v.length();
					const float cValidThresh = .077f;
					if (length < cValidThresh)
					{
						// Too short to carry a meaningful direction - snap to the zero-vector encoding.
						c.set(128, 128, 128, c.a);
					}
					else if (fabs(length - 1.0f) > cValidThresh)
					{
						// Length outside tolerance: renormalize and re-encode back to [0,255] with rounding.
						if (length)
							v /= length;

						for (uint32_t i = 0; i < 3; i++)
							c[i] = static_cast<uint8_t>(clamp<float>(floor((v[i] + 1.0f) * 255.0f * .5f + .5f), 0.0f, 255.0f));

						// If X and Y re-encoded to exactly 128 (zero), force Z to a pole so the
						// texel doesn't land on the reserved (128,128,128) zero encoding.
						if ((c.g == 128) && (c.r == 128))
						{
							if (c.b < 128)
								c.b = 0;
							else
								c.b = 255;
						}
					}
				}
			}
			return *this;
		}
2990 | |
2991 | void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...); |
2992 | |
2993 | private: |
2994 | uint32_t m_width, m_height, m_pitch; // all in pixels |
2995 | color_rgba_vec m_pixels; |
2996 | }; |
2997 | |
2998 | // Float images |
2999 | |
3000 | typedef basisu::vector<vec4F> vec4F_vec; |
3001 | |
3002 | class imagef |
3003 | { |
3004 | public: |
3005 | imagef() : |
3006 | m_width(0), m_height(0), m_pitch(0) |
3007 | { |
3008 | } |
3009 | |
3010 | imagef(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) : |
3011 | m_width(0), m_height(0), m_pitch(0) |
3012 | { |
3013 | resize(w, h, p); |
3014 | } |
3015 | |
3016 | imagef(const imagef &other) : |
3017 | m_width(0), m_height(0), m_pitch(0) |
3018 | { |
3019 | *this = other; |
3020 | } |
3021 | |
3022 | imagef &swap(imagef &other) |
3023 | { |
3024 | std::swap(m_width, other.m_width); |
3025 | std::swap(m_height, other.m_height); |
3026 | std::swap(m_pitch, other.m_pitch); |
3027 | m_pixels.swap(other.m_pixels); |
3028 | return *this; |
3029 | } |
3030 | |
3031 | imagef &operator= (const imagef &rhs) |
3032 | { |
3033 | if (this != &rhs) |
3034 | { |
3035 | m_width = rhs.m_width; |
3036 | m_height = rhs.m_height; |
3037 | m_pitch = rhs.m_pitch; |
3038 | m_pixels = rhs.m_pixels; |
3039 | } |
3040 | return *this; |
3041 | } |
3042 | |
3043 | imagef &clear() |
3044 | { |
3045 | m_width = 0; |
3046 | m_height = 0; |
3047 | m_pitch = 0; |
3048 | clear_vector(m_pixels); |
3049 | return *this; |
3050 | } |
3051 | |
3052 | imagef &set(const image &src, const vec4F &scale = vec4F(1), const vec4F &bias = vec4F(0)) |
3053 | { |
3054 | const uint32_t width = src.get_width(); |
3055 | const uint32_t height = src.get_height(); |
3056 | |
3057 | resize(width, height); |
3058 | |
3059 | for (int y = 0; y < (int)height; y++) |
3060 | { |
3061 | for (uint32_t x = 0; x < width; x++) |
3062 | { |
3063 | const color_rgba &src_pixel = src(x, y); |
3064 | (*this)(x, y).set((float)src_pixel.r * scale[0] + bias[0], (float)src_pixel.g * scale[1] + bias[1], (float)src_pixel.b * scale[2] + bias[2], (float)src_pixel.a * scale[3] + bias[3]); |
3065 | } |
3066 | } |
3067 | |
3068 | return *this; |
3069 | } |
3070 | |
3071 | imagef &resize(const imagef &other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1)) |
3072 | { |
3073 | return resize(other.get_width(), other.get_height(), p, background); |
3074 | } |
3075 | |
3076 | imagef &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1)) |
3077 | { |
3078 | return crop(w, h, p, background); |
3079 | } |
3080 | |
3081 | imagef &set_all(const vec4F &c) |
3082 | { |
3083 | for (uint32_t i = 0; i < m_pixels.size(); i++) |
3084 | m_pixels[i] = c; |
3085 | return *this; |
3086 | } |
3087 | |
3088 | imagef &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const vec4F &c) |
3089 | { |
3090 | for (uint32_t iy = 0; iy < h; iy++) |
3091 | for (uint32_t ix = 0; ix < w; ix++) |
3092 | set_clipped(x + ix, y + iy, c); |
3093 | return *this; |
3094 | } |
3095 | |
3096 | imagef &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F &background = vec4F(0,0,0,1)) |
3097 | { |
3098 | if (p == UINT32_MAX) |
3099 | p = w; |
3100 | |
3101 | if ((w == m_width) && (m_height == h) && (m_pitch == p)) |
3102 | return *this; |
3103 | |
3104 | if ((!w) || (!h) || (!p)) |
3105 | { |
3106 | clear(); |
3107 | return *this; |
3108 | } |
3109 | |
3110 | vec4F_vec cur_state; |
3111 | cur_state.swap(m_pixels); |
3112 | |
3113 | m_pixels.resize(p * h); |
3114 | |
3115 | for (uint32_t y = 0; y < h; y++) |
3116 | { |
3117 | for (uint32_t x = 0; x < w; x++) |
3118 | { |
3119 | if ((x < m_width) && (y < m_height)) |
3120 | m_pixels[x + y * p] = cur_state[x + y * m_pitch]; |
3121 | else |
3122 | m_pixels[x + y * p] = background; |
3123 | } |
3124 | } |
3125 | |
3126 | m_width = w; |
3127 | m_height = h; |
3128 | m_pitch = p; |
3129 | |
3130 | return *this; |
3131 | } |
3132 | |
3133 | inline const vec4F &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; } |
3134 | inline vec4F &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; } |
3135 | |
3136 | inline const vec4F &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } |
3137 | inline vec4F &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } |
3138 | |
3139 | inline const vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const |
3140 | { |
3141 | x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1); |
3142 | y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1); |
3143 | return m_pixels[x + y * m_pitch]; |
3144 | } |
3145 | |
3146 | inline vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) |
3147 | { |
3148 | x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1); |
3149 | y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1); |
3150 | return m_pixels[x + y * m_pitch]; |
3151 | } |
3152 | |
3153 | inline imagef &set_clipped(int x, int y, const vec4F &c) |
3154 | { |
3155 | if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height)) |
3156 | (*this)(x, y) = c; |
3157 | return *this; |
3158 | } |
3159 | |
3160 | // Very straightforward blit with full clipping. Not fast, but it works. |
3161 | imagef &blit(const imagef &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y) |
3162 | { |
3163 | for (int y = 0; y < src_h; y++) |
3164 | { |
3165 | const int sy = src_y + y; |
3166 | if (sy < 0) |
3167 | continue; |
3168 | else if (sy >= (int)src.get_height()) |
3169 | break; |
3170 | |
3171 | for (int x = 0; x < src_w; x++) |
3172 | { |
3173 | const int sx = src_x + x; |
3174 | if (sx < 0) |
3175 | continue; |
3176 | else if (sx >= (int)src.get_height()) |
3177 | break; |
3178 | |
3179 | set_clipped(dst_x + x, dst_y + y, src(sx, sy)); |
3180 | } |
3181 | } |
3182 | |
3183 | return *this; |
3184 | } |
3185 | |
3186 | const imagef &(vec4F *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const |
3187 | { |
3188 | for (uint32_t y = 0; y < h; y++) |
3189 | for (uint32_t x = 0; x < w; x++) |
3190 | *pDst++ = get_clamped(src_x + x, src_y + y); |
3191 | return *this; |
3192 | } |
3193 | |
3194 | imagef &set_block_clipped(const vec4F *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h) |
3195 | { |
3196 | for (uint32_t y = 0; y < h; y++) |
3197 | for (uint32_t x = 0; x < w; x++) |
3198 | set_clipped(dst_x + x, dst_y + y, *pSrc++); |
3199 | return *this; |
3200 | } |
3201 | |
3202 | inline uint32_t get_width() const { return m_width; } |
3203 | inline uint32_t get_height() const { return m_height; } |
3204 | inline uint32_t get_pitch() const { return m_pitch; } |
3205 | inline uint32_t get_total_pixels() const { return m_width * m_height; } |
3206 | |
3207 | inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; } |
3208 | inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; } |
3209 | inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); } |
3210 | |
3211 | inline const vec4F_vec &get_pixels() const { return m_pixels; } |
3212 | inline vec4F_vec &get_pixels() { return m_pixels; } |
3213 | |
3214 | inline const vec4F *get_ptr() const { return &m_pixels[0]; } |
3215 | inline vec4F *get_ptr() { return &m_pixels[0]; } |
3216 | |
3217 | private: |
3218 | uint32_t m_width, m_height, m_pitch; // all in pixels |
3219 | vec4F_vec m_pixels; |
3220 | }; |
3221 | |
3222 | // Image metrics |
3223 | |
3224 | class image_metrics |
3225 | { |
3226 | public: |
3227 | // TODO: Add ssim |
3228 | float m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim; |
3229 | |
3230 | image_metrics() |
3231 | { |
3232 | clear(); |
3233 | } |
3234 | |
3235 | void clear() |
3236 | { |
3237 | m_max = 0; |
3238 | m_mean = 0; |
3239 | m_mean_squared = 0; |
3240 | m_rms = 0; |
3241 | m_psnr = 0; |
3242 | m_ssim = 0; |
3243 | } |
3244 | |
3245 | void print(const char *pPrefix = nullptr) { printf("%sMax: %3.0f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n" , pPrefix ? pPrefix : "" , m_max, m_mean, m_rms, m_psnr); } |
3246 | |
3247 | void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false); |
3248 | }; |
3249 | |
3250 | // Image saving/loading/resampling |
3251 | |
	// Decodes a PNG from a memory buffer; pFilename is optional and only for diagnostics.
	bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr);
	bool load_png(const char* pFilename, image& img);
	inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); }

	bool load_tga(const char* pFilename, image& img);
	inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); }

	bool load_jpg(const char *pFilename, image& img);
	inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }

	// Currently loads .PNG, .TGA, or .JPG
	bool load_image(const char* pFilename, image& img);
	inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }

	// Decodes a TGA into a returned pixel buffer and its dimensions/channel count.
	// NOTE(review): presumably the returned buffer is heap-allocated and owned by the caller - confirm in the .cpp.
	uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans);
	uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans);

	// Flag bits for save_png()'s image_save_flags parameter.
	enum
	{
		cImageSaveGrayscale = 1,
		cImageSaveIgnoreAlpha = 2
	};

	bool save_png(const char* pFilename, const image& img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0);
	inline bool save_png(const std::string &filename, const image &img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0) { return save_png(filename.c_str(), img, image_save_flags, grayscale_comp); }

	// Whole-file read/write helpers; return false on I/O failure.
	bool read_file_to_vec(const char* pFilename, uint8_vec& data);

	bool write_data_to_file(const char* pFilename, const void* pData, size_t len);

	inline bool write_vec_to_file(const char* pFilename, const uint8_vec& v) { return v.size() ? write_data_to_file(pFilename, &v[0], v.size()) : write_data_to_file(pFilename, "" , 0); }

	// sRGB <-> linear transfer function helpers.
	float linear_to_srgb(float l);
	float srgb_to_linear(float s);

	// Resamples src into dst using the named filter kernel.
	bool image_resample(const image &src, image &dst, bool srgb = false,
		const char *pFilter = "lanczos4" , float filter_scale = 1.0f,
		bool wrapping = false,
		uint32_t first_comp = 0, uint32_t num_comps = 4);
3291 | |
3292 | // Timing |
3293 | |
3294 | typedef uint64_t timer_ticks; |
3295 | |
	// Simple start/stop wall-clock interval timer built on a platform tick counter.
	// (Member definitions live in the .cpp; behavior notes below are hedged accordingly.)
	class interval_timer
	{
	public:
		interval_timer();

		// Begins (or restarts) timing.
		void start();
		// Stops timing.
		void stop();

		// Elapsed seconds for the current interval.
		// NOTE(review): exact stopped-vs-running semantics are defined in the .cpp - confirm there.
		double get_elapsed_secs() const;
		inline double get_elapsed_ms() const { return 1000.0f* get_elapsed_secs(); }

		// One-time global initialization of the tick frequency.
		static void init();
		static inline timer_ticks get_ticks_per_sec() { return g_freq; }
		static timer_ticks get_ticks();
		static double ticks_to_secs(timer_ticks ticks);
		static inline double ticks_to_ms(timer_ticks ticks) { return ticks_to_secs(ticks) * 1000.0f; }

	private:
		// Global tick origin and frequency, set up by init().
		static timer_ticks g_init_ticks, g_freq;
		static double g_timer_freq;

		timer_ticks m_start_time, m_stop_time;

		bool m_started, m_stopped;
	};
3321 | |
3322 | // 2D array |
3323 | |
3324 | template<typename T> |
3325 | class vector2D |
3326 | { |
3327 | typedef basisu::vector<T> TVec; |
3328 | |
3329 | uint32_t m_width, m_height; |
3330 | TVec m_values; |
3331 | |
3332 | public: |
3333 | vector2D() : |
3334 | m_width(0), |
3335 | m_height(0) |
3336 | { |
3337 | } |
3338 | |
3339 | vector2D(uint32_t w, uint32_t h) : |
3340 | m_width(0), |
3341 | m_height(0) |
3342 | { |
3343 | resize(w, h); |
3344 | } |
3345 | |
3346 | vector2D(const vector2D &other) |
3347 | { |
3348 | *this = other; |
3349 | } |
3350 | |
3351 | vector2D &operator= (const vector2D &other) |
3352 | { |
3353 | if (this != &other) |
3354 | { |
3355 | m_width = other.m_width; |
3356 | m_height = other.m_height; |
3357 | m_values = other.m_values; |
3358 | } |
3359 | return *this; |
3360 | } |
3361 | |
3362 | inline bool operator== (const vector2D &rhs) const |
3363 | { |
3364 | return (m_width == rhs.m_width) && (m_height == rhs.m_height) && (m_values == rhs.m_values); |
3365 | } |
3366 | |
3367 | inline uint32_t size_in_bytes() const { return (uint32_t)m_values.size() * sizeof(m_values[0]); } |
3368 | |
3369 | inline const T &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; } |
3370 | inline T &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; } |
3371 | |
3372 | inline const T &operator[] (uint32_t i) const { return m_values[i]; } |
3373 | inline T &operator[] (uint32_t i) { return m_values[i]; } |
3374 | |
3375 | inline const T &at_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width), clamp<int>(y, 0, m_height)); } |
3376 | inline T &at_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width), clamp<int>(y, 0, m_height)); } |
3377 | |
3378 | void clear() |
3379 | { |
3380 | m_width = 0; |
3381 | m_height = 0; |
3382 | m_values.clear(); |
3383 | } |
3384 | |
3385 | void set_all(const T&val) |
3386 | { |
3387 | vector_set_all(m_values, val); |
3388 | } |
3389 | |
3390 | inline const T* get_ptr() const { return &m_values[0]; } |
3391 | inline T* get_ptr() { return &m_values[0]; } |
3392 | |
3393 | vector2D &resize(uint32_t new_width, uint32_t new_height) |
3394 | { |
3395 | if ((m_width == new_width) && (m_height == new_height)) |
3396 | return *this; |
3397 | |
3398 | TVec oldVals(new_width * new_height); |
3399 | oldVals.swap(m_values); |
3400 | |
3401 | const uint32_t w = minimum(m_width, new_width); |
3402 | const uint32_t h = minimum(m_height, new_height); |
3403 | |
3404 | if ((w) && (h)) |
3405 | { |
3406 | for (uint32_t y = 0; y < h; y++) |
3407 | for (uint32_t x = 0; x < w; x++) |
3408 | m_values[x + y * new_width] = oldVals[x + y * m_width]; |
3409 | } |
3410 | |
3411 | m_width = new_width; |
3412 | m_height = new_height; |
3413 | |
3414 | return *this; |
3415 | } |
3416 | }; |
3417 | |
3418 | inline FILE *fopen_safe(const char *pFilename, const char *pMode) |
3419 | { |
3420 | #ifdef _WIN32 |
3421 | FILE *pFile = nullptr; |
3422 | fopen_s(&pFile, pFilename, pMode); |
3423 | return pFile; |
3424 | #else |
3425 | return fopen(pFilename, pMode); |
3426 | #endif |
3427 | } |
3428 | |
3429 | void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1); |
3430 | |
3431 | const uint32_t cPixelBlockWidth = 4; |
3432 | const uint32_t cPixelBlockHeight = 4; |
3433 | const uint32_t cPixelBlockTotalPixels = cPixelBlockWidth * cPixelBlockHeight; |
3434 | |
3435 | struct pixel_block |
3436 | { |
3437 | color_rgba m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x] |
3438 | |
3439 | inline const color_rgba& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; } |
3440 | inline color_rgba& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; } |
3441 | |
3442 | inline const color_rgba* get_ptr() const { return &m_pixels[0][0]; } |
3443 | inline color_rgba* get_ptr() { return &m_pixels[0][0]; } |
3444 | |
3445 | inline void clear() { clear_obj(*this); } |
3446 | |
3447 | inline bool operator== (const pixel_block& rhs) const |
3448 | { |
3449 | return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0; |
3450 | } |
3451 | }; |
3452 | typedef basisu::vector<pixel_block> pixel_block_vec; |
3453 | |
3454 | } // namespace basisu |
3455 | |
3456 | |
3457 | |