1/*
2Convection Texture Tools
3Copyright (c) 2018-2019 Eric Lasota
4
5Permission is hereby granted, free of charge, to any person obtaining
6a copy of this software and associated documentation files (the
7"Software"), to deal in the Software without restriction, including
8without limitation the rights to use, copy, modify, merge, publish,
9distribute, sublicense, and/or sell copies of the Software, and to
10permit persons to whom the Software is furnished to do so, subject
11to the following conditions:
12
13The above copyright notice and this permission notice shall be included
14in all copies or substantial portions of the Software.
15
16THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24-------------------------------------------------------------------------------------
25
26Portions based on DirectX Texture Library (DirectXTex)
27
28Copyright (c) Microsoft Corporation. All rights reserved.
29Licensed under the MIT License.
30
31http://go.microsoft.com/fwlink/?LinkId=248926
32*/
33#include "ConvectionKernels_Config.h"
34
35#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
36
37#include "ConvectionKernels_BC67.h"
38
39#include "ConvectionKernels_AggregatedError.h"
40#include "ConvectionKernels_BCCommon.h"
41#include "ConvectionKernels_BC7_Prio.h"
42#include "ConvectionKernels_BC7_SingleColor.h"
43#include "ConvectionKernels_BC6H_IO.h"
44#include "ConvectionKernels_EndpointRefiner.h"
45#include "ConvectionKernels_EndpointSelector.h"
46#include "ConvectionKernels_IndexSelectorHDR.h"
47#include "ConvectionKernels_ParallelMath.h"
48#include "ConvectionKernels_UnfinishedEndpoints.h"
49
50namespace cvtt
51{
52 namespace Internal
53 {
54 namespace BC67
55 {
56 typedef ParallelMath::Float MFloat;
57 typedef ParallelMath::UInt15 MUInt15;
58
59 struct WorkInfo
60 {
61 MUInt15 m_mode;
62 MFloat m_error;
63 MUInt15 m_ep[3][2][4];
64 MUInt15 m_indexes[16];
65 MUInt15 m_indexes2[16];
66
67 union
68 {
69 MUInt15 m_partition;
70 struct IndexSelectorAndRotation
71 {
72 MUInt15 m_indexSelector;
73 MUInt15 m_rotation;
74 } m_isr;
75 } m_u;
76 };
77 }
78
79 namespace BC6HData
80 {
81 enum EField
82 {
83 NA, // N/A
84 M, // Mode
85 D, // Shape
86 RW,
87 RX,
88 RY,
89 RZ,
90 GW,
91 GX,
92 GY,
93 GZ,
94 BW,
95 BX,
96 BY,
97 BZ,
98 };
99
100 struct ModeDescriptor
101 {
102 EField m_eField;
103 uint8_t m_uBit;
104 };
105
106 const ModeDescriptor g_modeDescriptors[14][82] =
107 {
108 { // Mode 1 (0x00) - 10 5 5 5
109 { M, 0 },{ M, 1 },{ GY, 4 },{ BY, 4 },{ BZ, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
110 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
111 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
112 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
113 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
114 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
115 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
116 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
117 { D, 3 },{ D, 4 },
118 },
119
120 { // Mode 2 (0x01) - 7 6 6 6
121 { M, 0 },{ M, 1 },{ GY, 5 },{ GZ, 4 },{ GZ, 5 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
122 { RW, 5 },{ RW, 6 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
123 { GW, 5 },{ GW, 6 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
124 { BW, 5 },{ BW, 6 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
125 { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
126 { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
127 { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
128 { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
129 { D, 3 },{ D, 4 },
130 },
131
132 { // Mode 3 (0x02) - 11 5 4 4
133 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
134 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
135 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
136 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
137 { RW,10 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
138 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
139 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
140 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
141 { D, 3 },{ D, 4 },
142 },
143
144 { // Mode 4 (0x06) - 11 4 5 4
145 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
146 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
147 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
148 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
149 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
150 { GW,10 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
151 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 0 },
152 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ GY, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
153 { D, 3 },{ D, 4 },
154 },
155
156 { // Mode 5 (0x0a) - 11 4 4 5
157 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
158 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
159 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
160 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
161 { BY, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
162 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
163 { BW,10 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 1 },
164 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ BZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
165 { D, 3 },{ D, 4 },
166 },
167
168 { // Mode 6 (0x0e) - 9 5 5 5
169 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
170 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
171 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
172 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
173 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
174 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
175 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
176 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
177 { D, 3 },{ D, 4 },
178 },
179
180 { // Mode 7 (0x12) - 8 6 5 5
181 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
182 { RW, 5 },{ RW, 6 },{ RW, 7 },{ GZ, 4 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
183 { GW, 5 },{ GW, 6 },{ GW, 7 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
184 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 3 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
185 { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
186 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
187 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
188 { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
189 { D, 3 },{ D, 4 },
190 },
191
192 { // Mode 8 (0x16) - 8 5 6 5
193 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
194 { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 0 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
195 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
196 { BW, 5 },{ BW, 6 },{ BW, 7 },{ GZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
197 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
198 { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
199 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
200 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
201 { D, 3 },{ D, 4 },
202 },
203
204 { // Mode 9 (0x1a) - 8 5 5 6
205 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
206 { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
207 { GW, 5 },{ GW, 6 },{ GW, 7 },{ BY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
208 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
209 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
210 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
211 { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
212 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
213 { D, 3 },{ D, 4 },
214 },
215
216 { // Mode 10 (0x1e) - 6 6 6 6
217 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
218 { RW, 5 },{ GZ, 4 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
219 { GW, 5 },{ GY, 5 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
220 { BW, 5 },{ GZ, 5 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
221 { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
222 { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
223 { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
224 { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
225 { D, 3 },{ D, 4 },
226 },
227
228 { // Mode 11 (0x03) - 10 10
229 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
230 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
231 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
232 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
233 { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RX, 9 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
234 { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GX, 9 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
235 { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BX, 9 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
236 { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
237 { NA, 0 },{ NA, 0 },
238 },
239
240 { // Mode 12 (0x07) - 11 9
241 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
242 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
243 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
244 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
245 { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
246 { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
247 { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
248 { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
249 { NA, 0 },{ NA, 0 },
250 },
251
252 { // Mode 13 (0x0b) - 12 8
253 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
254 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
255 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
256 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
257 { RX, 5 },{ RX, 6 },{ RX, 7 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
258 { GX, 5 },{ GX, 6 },{ GX, 7 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
259 { BX, 5 },{ BX, 6 },{ BX, 7 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
260 { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
261 { NA, 0 },{ NA, 0 },
262 },
263
264 { // Mode 14 (0x0f) - 16 4
265 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
266 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
267 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
268 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,15 },
269 { RW,14 },{ RW,13 },{ RW,12 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,15 },
270 { GW,14 },{ GW,13 },{ GW,12 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,15 },
271 { BW,14 },{ BW,13 },{ BW,12 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
272 { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
273 { NA, 0 },{ NA, 0 },
274 },
275 };
276 }
277
278 namespace BC7Data
279 {
280 enum AlphaMode
281 {
282 AlphaMode_Combined,
283 AlphaMode_Separate,
284 AlphaMode_None,
285 };
286
287 enum PBitMode
288 {
289 PBitMode_PerEndpoint,
290 PBitMode_PerSubset,
291 PBitMode_None
292 };
293
294 struct BC7ModeInfo
295 {
296 PBitMode m_pBitMode;
297 AlphaMode m_alphaMode;
298 int m_rgbBits;
299 int m_alphaBits;
300 int m_partitionBits;
301 int m_numSubsets;
302 int m_indexBits;
303 int m_alphaIndexBits;
304 bool m_hasIndexSelector;
305 };
306
307 BC7ModeInfo g_modes[] =
308 {
309 { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false }, // 0
310 { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false }, // 1
311 { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false }, // 2
312 { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false }, // 3 (Mode reference has an error, P-bit is really per-endpoint)
313
314 { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true }, // 4
315 { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false }, // 5
316 { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
317 { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false } // 7
318 };
319
320 const int g_weight2[] = { 0, 21, 43, 64 };
321 const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
322 const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
323
324 const int *g_weightTables[] =
325 {
326 NULL,
327 NULL,
328 g_weight2,
329 g_weight3,
330 g_weight4
331 };
332
333 struct BC6HModeInfo
334 {
335 uint16_t m_modeID;
336 bool m_partitioned;
337 bool m_transformed;
338 int m_aPrec;
339 int m_bPrec[3];
340 };
341
342 // [partitioned][precision]
343 bool g_hdrModesExistForPrecision[2][17] =
344 {
345 //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
346 { false, false, false, false, false, false, false, false, false, false, true, true, true, false, false, false, true },
347 { false, false, false, false, false, false, true, true, true, true, true, true, false, false, false, false, false },
348 };
349
350 BC6HModeInfo g_hdrModes[] =
351 {
352 { 0x00, true, true, 10,{ 5, 5, 5 } },
353 { 0x01, true, true, 7,{ 6, 6, 6 } },
354 { 0x02, true, true, 11,{ 5, 4, 4 } },
355 { 0x06, true, true, 11,{ 4, 5, 4 } },
356 { 0x0a, true, true, 11,{ 4, 4, 5 } },
357 { 0x0e, true, true, 9,{ 5, 5, 5 } },
358 { 0x12, true, true, 8,{ 6, 5, 5 } },
359 { 0x16, true, true, 8,{ 5, 6, 5 } },
360 { 0x1a, true, true, 8,{ 5, 5, 6 } },
361 { 0x1e, true, false, 6,{ 6, 6, 6 } },
362 { 0x03, false, false, 10,{ 10, 10, 10 } },
363 { 0x07, false, true, 11,{ 9, 9, 9 } },
364 { 0x0b, false, true, 12,{ 8, 8, 8 } },
365 { 0x0f, false, true, 16,{ 4, 4, 4 } },
366 };
367
368 const int g_maxHDRPrecision = 16;
369
370 static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]);
371
372 static uint16_t g_partitionMap[64] =
373 {
374 0xCCCC, 0x8888, 0xEEEE, 0xECC8,
375 0xC880, 0xFEEC, 0xFEC8, 0xEC80,
376 0xC800, 0xFFEC, 0xFE80, 0xE800,
377 0xFFE8, 0xFF00, 0xFFF0, 0xF000,
378 0xF710, 0x008E, 0x7100, 0x08CE,
379 0x008C, 0x7310, 0x3100, 0x8CCE,
380 0x088C, 0x3110, 0x6666, 0x366C,
381 0x17E8, 0x0FF0, 0x718E, 0x399C,
382 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
383 0x3c3c, 0x55aa, 0x9696, 0xa55a,
384 0x73ce, 0x13c8, 0x324c, 0x3bdc,
385 0x6996, 0xc33c, 0x9966, 0x660,
386 0x272, 0x4e4, 0x4e40, 0x2720,
387 0xc936, 0x936c, 0x39c6, 0x639c,
388 0x9336, 0x9cc6, 0x817e, 0xe718,
389 0xccf0, 0xfcc, 0x7744, 0xee22,
390 };
391
392 static uint32_t g_partitionMap2[64] =
393 {
394 0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
395 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
396 0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
397 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
398 0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
399 0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
400 0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
401 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
402 0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
403 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
404 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
405 0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
406 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
407 0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
408 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
409 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
410 };
411
412 static int g_fixupIndexes2[64] =
413 {
414 15,15,15,15,
415 15,15,15,15,
416 15,15,15,15,
417 15,15,15,15,
418 15, 2, 8, 2,
419 2, 8, 8,15,
420 2, 8, 2, 2,
421 8, 8, 2, 2,
422
423 15,15, 6, 8,
424 2, 8,15,15,
425 2, 8, 2, 2,
426 2,15,15, 6,
427 6, 2, 6, 8,
428 15,15, 2, 2,
429 15,15,15,15,
430 15, 2, 2,15,
431 };
432
433 static int g_fixupIndexes3[64][2] =
434 {
435 { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
436 { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
437 { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
438 { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
439 { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
440 { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
441 { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
442 { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
443
444 { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
445 { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
446 { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
447 { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
448 { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
449 { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
450 { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
451 { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
452 };
453
454 static const unsigned char g_fragments[] =
455 {
456 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 16
457 0, 1, 2, 3, // 16, 4
458 0, 1, 4, // 20, 3
459 0, 1, 2, 4, // 23, 4
460 2, 3, 7, // 27, 3
461 1, 2, 3, 7, // 30, 4
462 0, 1, 2, 3, 4, 5, 6, 7, // 34, 8
463 0, 1, 4, 8, // 42, 4
464 0, 1, 2, 4, 5, 8, // 46, 6
465 0, 1, 2, 3, 4, 5, 6, 8, // 52, 8
466 1, 4, 5, 6, 9, // 60, 5
467 2, 5, 6, 7, 10, // 65, 5
468 5, 6, 9, 10, // 70, 4
469 2, 3, 7, 11, // 74, 4
470 1, 2, 3, 6, 7, 11, // 78, 6
471 0, 1, 2, 3, 5, 6, 7, 11, // 84, 8
472 0, 1, 2, 3, 8, 9, 10, 11, // 92, 8
473 2, 3, 6, 7, 8, 9, 10, 11, // 100, 8
474 4, 5, 6, 7, 8, 9, 10, 11, // 108, 8
475 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 116, 12
476 0, 4, 8, 12, // 128, 4
477 0, 2, 3, 4, 6, 7, 8, 12, // 132, 8
478 0, 1, 2, 4, 5, 8, 9, 12, // 140, 8
479 0, 1, 2, 3, 4, 5, 6, 8, 9, 12, // 148, 10
480 3, 6, 7, 8, 9, 12, // 158, 6
481 3, 5, 6, 7, 8, 9, 10, 12, // 164, 8
482 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, // 172, 12
483 0, 1, 2, 5, 6, 7, 11, 12, // 184, 8
484 5, 8, 9, 10, 13, // 192, 5
485 8, 12, 13, // 197, 3
486 4, 8, 12, 13, // 200, 4
487 2, 3, 6, 9, 12, 13, // 204, 6
488 0, 1, 2, 3, 8, 9, 12, 13, // 210, 8
489 0, 1, 4, 5, 8, 9, 12, 13, // 218, 8
490 2, 3, 6, 7, 8, 9, 12, 13, // 226, 8
491 2, 3, 5, 6, 9, 10, 12, 13, // 234, 8
492 0, 3, 6, 7, 9, 10, 12, 13, // 242, 8
493 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, // 250, 12
494 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, // 262, 13
495 2, 3, 4, 7, 8, 11, 12, 13, // 275, 8
496 1, 2, 6, 7, 8, 11, 12, 13, // 283, 8
497 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, // 291, 10
498 2, 3, 4, 5, 10, 11, 12, 13, // 301, 8
499 0, 1, 6, 7, 10, 11, 12, 13, // 309, 8
500 6, 9, 10, 11, 14, // 317, 5
501 0, 2, 4, 6, 8, 10, 12, 14, // 322, 8
502 1, 3, 5, 7, 8, 10, 12, 14, // 330, 8
503 1, 3, 4, 6, 9, 11, 12, 14, // 338, 8
504 0, 2, 5, 7, 9, 11, 12, 14, // 346, 8
505 0, 3, 4, 5, 8, 9, 13, 14, // 354, 8
506 2, 3, 4, 7, 8, 9, 13, 14, // 362, 8
507 1, 2, 5, 6, 9, 10, 13, 14, // 370, 8
508 0, 3, 4, 7, 9, 10, 13, 14, // 378, 8
509 0, 3, 5, 6, 8, 11, 13, 14, // 386, 8
510 1, 2, 4, 7, 8, 11, 13, 14, // 394, 8
511 0, 1, 4, 7, 10, 11, 13, 14, // 402, 8
512 0, 3, 6, 7, 10, 11, 13, 14, // 410, 8
513 8, 12, 13, 14, // 418, 4
514 1, 2, 3, 7, 8, 12, 13, 14, // 422, 8
515 4, 8, 9, 12, 13, 14, // 430, 6
516 0, 4, 5, 8, 9, 12, 13, 14, // 436, 8
517 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, // 444, 10
518 2, 6, 8, 9, 10, 12, 13, 14, // 454, 8
519 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, // 462, 12
520 0, 7, 9, 10, 11, 12, 13, 14, // 474, 8
521 1, 2, 3, 4, 5, 6, 8, 15, // 482, 8
522 3, 7, 11, 15, // 490, 4
523 0, 1, 3, 4, 5, 7, 11, 15, // 494, 8
524 0, 4, 5, 10, 11, 15, // 502, 6
525 1, 2, 3, 6, 7, 10, 11, 15, // 508, 8
526 0, 1, 2, 3, 5, 6, 7, 10, 11, 15, // 516, 10
527 0, 4, 5, 6, 9, 10, 11, 15, // 526, 8
528 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15, // 534, 12
529 1, 2, 4, 5, 8, 9, 12, 15, // 546, 8
530 2, 3, 5, 6, 8, 9, 12, 15, // 554, 8
531 0, 3, 5, 6, 9, 10, 12, 15, // 562, 8
532 1, 2, 4, 7, 9, 10, 12, 15, // 570, 8
533 1, 2, 5, 6, 8, 11, 12, 15, // 578, 8
534 0, 3, 4, 7, 8, 11, 12, 15, // 586, 8
535 0, 1, 5, 6, 10, 11, 12, 15, // 594, 8
536 1, 2, 6, 7, 10, 11, 12, 15, // 602, 8
537 1, 3, 4, 6, 8, 10, 13, 15, // 610, 8
538 0, 2, 5, 7, 8, 10, 13, 15, // 618, 8
539 0, 2, 4, 6, 9, 11, 13, 15, // 626, 8
540 1, 3, 5, 7, 9, 11, 13, 15, // 634, 8
541 0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15, // 642, 11
542 2, 3, 4, 5, 8, 9, 14, 15, // 653, 8
543 0, 1, 6, 7, 8, 9, 14, 15, // 661, 8
544 0, 1, 5, 10, 14, 15, // 669, 6
545 0, 3, 4, 5, 9, 10, 14, 15, // 675, 8
546 0, 1, 5, 6, 9, 10, 14, 15, // 683, 8
547 11, 14, 15, // 691, 3
548 7, 11, 14, 15, // 694, 4
549 1, 2, 4, 5, 8, 11, 14, 15, // 698, 8
550 0, 1, 4, 7, 8, 11, 14, 15, // 706, 8
551 0, 1, 4, 5, 10, 11, 14, 15, // 714, 8
552 2, 3, 6, 7, 10, 11, 14, 15, // 722, 8
553 4, 5, 6, 7, 10, 11, 14, 15, // 730, 8
554 0, 1, 4, 5, 7, 8, 10, 11, 14, 15, // 738, 10
555 0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15, // 748, 12
556 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15, // 760, 13
557 0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15, // 773, 11
558 3, 4, 8, 9, 10, 13, 14, 15, // 784, 8
559 11, 13, 14, 15, // 792, 4
560 0, 1, 2, 4, 11, 13, 14, 15, // 796, 8
561 0, 1, 2, 4, 5, 10, 11, 13, 14, 15, // 804, 10
562 7, 10, 11, 13, 14, 15, // 814, 6
563 3, 6, 7, 10, 11, 13, 14, 15, // 820, 8
564 1, 5, 9, 10, 11, 13, 14, 15, // 828, 8
565 1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, // 836, 12
566 12, 13, 14, 15, // 848, 4
567 0, 1, 2, 3, 12, 13, 14, 15, // 852, 8
568 0, 1, 4, 5, 12, 13, 14, 15, // 860, 8
569 4, 5, 6, 7, 12, 13, 14, 15, // 868, 8
570 4, 8, 9, 10, 12, 13, 14, 15, // 876, 8
571 0, 4, 5, 8, 9, 10, 12, 13, 14, 15, // 884, 10
572 0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, // 894, 12
573 0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15, // 906, 12
574 0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15, // 918, 11
575 0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15, // 929, 11
576 7, 9, 10, 11, 12, 13, 14, 15, // 940, 8
577 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, // 948, 10
578 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, // 958, 12
579 8, 9, 10, 11, 12, 13, 14, 15, // 970, 8
580 0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, // 978, 12
581 0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, // 990, 13
582 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1003, 12
583 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1015, 13
584 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1028, 12
585 0, 2, // 1040, 2
586 1, 3, // 1042, 2
587 0, 1, 4, 5, // 1044, 4
588 0, 1, 2, 4, 5, // 1048, 5
589 2, 3, 6, // 1053, 3
590 0, 2, 4, 6, // 1056, 4
591 1, 2, 5, 6, // 1060, 4
592 0, 1, 2, 3, 5, 6, // 1064, 6
593 0, 1, 2, 4, 5, 6, // 1070, 6
594 0, 1, 2, 3, 4, 5, 6, // 1076, 7
595 0, 3, 4, 7, // 1083, 4
596 0, 1, 2, 3, 4, 7, // 1087, 6
597 1, 3, 5, 7, // 1093, 4
598 2, 3, 6, 7, // 1097, 4
599 1, 2, 3, 6, 7, // 1101, 5
600 1, 2, 3, 5, 6, 7, // 1106, 6
601 0, 1, 2, 3, 5, 6, 7, // 1112, 7
602 4, 5, 6, 7, // 1119, 4
603 0, 8, // 1123, 2
604 0, 1, 4, 5, 8, // 1125, 5
605 0, 1, 8, 9, // 1130, 4
606 4, 5, 8, 9, // 1134, 4
607 0, 1, 4, 5, 8, 9, // 1138, 6
608 2, 6, 8, 9, // 1144, 4
609 6, 7, 8, 9, // 1148, 4
610 0, 2, 4, 6, 8, 10, // 1152, 6
611 1, 2, 5, 6, 9, 10, // 1158, 6
612 0, 3, 4, 7, 9, 10, // 1164, 6
613 0, 1, 2, 8, 9, 10, // 1170, 6
614 4, 5, 6, 8, 9, 10, // 1176, 6
615 3, 11, // 1182, 2
616 2, 3, 6, 7, 11, // 1184, 5
617 0, 3, 8, 11, // 1189, 4
618 0, 3, 4, 7, 8, 11, // 1193, 6
619 1, 3, 5, 7, 9, 11, // 1199, 6
620 2, 3, 10, 11, // 1205, 4
621 1, 5, 10, 11, // 1209, 4
622 4, 5, 10, 11, // 1213, 4
623 6, 7, 10, 11, // 1217, 4
624 2, 3, 6, 7, 10, 11, // 1221, 6
625 1, 2, 3, 9, 10, 11, // 1227, 6
626 5, 6, 7, 9, 10, 11, // 1233, 6
627 8, 9, 10, 11, // 1239, 4
628 4, 12, // 1243, 2
629 0, 1, 2, 3, 4, 5, 8, 12, // 1245, 8
630 8, 9, 12, // 1253, 3
631 0, 4, 5, 8, 9, 12, // 1256, 6
632 0, 1, 4, 5, 8, 9, 12, // 1262, 7
633 2, 3, 5, 6, 8, 9, 12, // 1269, 7
634 1, 5, 9, 13, // 1276, 4
635 6, 7, 9, 13, // 1280, 4
636 1, 4, 7, 10, 13, // 1284, 5
637 1, 6, 8, 11, 13, // 1289, 5
638 0, 1, 12, 13, // 1294, 4
639 4, 5, 12, 13, // 1298, 4
640 0, 1, 6, 7, 12, 13, // 1302, 6
641 0, 1, 4, 8, 12, 13, // 1308, 6
642 8, 9, 12, 13, // 1314, 4
643 4, 8, 9, 12, 13, // 1318, 5
644 4, 5, 8, 9, 12, 13, // 1323, 6
645 0, 4, 5, 8, 9, 12, 13, // 1329, 7
646 0, 1, 6, 10, 12, 13, // 1336, 6
647 3, 6, 7, 9, 10, 12, 13, // 1342, 7
648 0, 1, 10, 11, 12, 13, // 1349, 6
649 2, 4, 7, 9, 14, // 1355, 5
650 4, 5, 10, 14, // 1360, 4
651 2, 6, 10, 14, // 1364, 4
652 2, 5, 8, 11, 14, // 1368, 5
653 0, 2, 12, 14, // 1373, 4
654 8, 10, 12, 14, // 1377, 4
655 4, 6, 8, 10, 12, 14, // 1381, 6
656 13, 14, // 1387, 2
657 9, 10, 13, 14, // 1389, 4
658 5, 6, 9, 10, 13, 14, // 1393, 6
659 0, 1, 2, 12, 13, 14, // 1399, 6
660 4, 5, 6, 12, 13, 14, // 1405, 6
661 8, 9, 12, 13, 14, // 1411, 5
662 8, 9, 10, 12, 13, 14, // 1416, 6
663 7, 15, // 1422, 2
664 0, 5, 10, 15, // 1424, 4
665 0, 1, 2, 3, 6, 7, 11, 15, // 1428, 8
666 10, 11, 15, // 1436, 3
667 0, 1, 5, 6, 10, 11, 15, // 1439, 7
668 3, 6, 7, 10, 11, 15, // 1446, 6
669 12, 15, // 1452, 2
670 0, 3, 12, 15, // 1454, 4
671 4, 7, 12, 15, // 1458, 4
672 0, 3, 6, 9, 12, 15, // 1462, 6
673 0, 3, 5, 10, 12, 15, // 1468, 6
674 8, 11, 12, 15, // 1474, 4
675 5, 6, 8, 11, 12, 15, // 1478, 6
676 4, 7, 8, 11, 12, 15, // 1484, 6
677 1, 3, 13, 15, // 1490, 4
678 9, 11, 13, 15, // 1494, 4
679 5, 7, 9, 11, 13, 15, // 1498, 6
680 2, 3, 14, 15, // 1504, 4
681 2, 3, 4, 5, 14, 15, // 1508, 6
682 6, 7, 14, 15, // 1514, 4
683 2, 3, 5, 9, 14, 15, // 1518, 6
684 2, 3, 8, 9, 14, 15, // 1524, 6
685 10, 14, 15, // 1530, 3
686 0, 4, 5, 9, 10, 14, 15, // 1533, 7
687 2, 3, 7, 11, 14, 15, // 1540, 6
688 10, 11, 14, 15, // 1546, 4
689 7, 10, 11, 14, 15, // 1550, 5
690 6, 7, 10, 11, 14, 15, // 1555, 6
691 1, 2, 3, 13, 14, 15, // 1561, 6
692 5, 6, 7, 13, 14, 15, // 1567, 6
693 10, 11, 13, 14, 15, // 1573, 5
694 9, 10, 11, 13, 14, 15, // 1578, 6
695 0, 4, 8, 9, 12, 13, 14, 15, // 1584, 8
696 9, 10, 12, 13, 14, 15, // 1592, 6
697 8, 11, 12, 13, 14, 15, // 1598, 6
698 3, 7, 10, 11, 12, 13, 14, 15, // 1604, 8
699 };
700 static const int g_shapeRanges[][2] =
701 {
702 { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
703 { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
704 { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
705 { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
706 { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
707 { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
708 { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
709 { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
710 { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
711 { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
712 { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
713 { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
714 { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
715 { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
716 { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
717 { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
718 { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
719 { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
720 { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
721 { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
722 { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
723 { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
724 { 1604, 8 },
725 };
// Single-row companion to g_shapes2 / g_shapes3: one entry holding one pair of ints.
// NOTE(review): how the pair is interpreted is not visible in this part of the file - confirm against the readers of g_shapes1.
static const int g_shapes1[][2] = { { 0, 16 } };
// 64 rows of two shape numbers each. Every value lies in 1..128 (the range listed by
// g_shapeList2) and the two values of each row always add up to 129.
// NOTE(review): presumably row N gives the per-subset shapes of two-subset partition N - confirm against the users of this table.
static const int g_shapes2[64][2] =
{
    { 33, 96 },   { 63, 66 },   { 20, 109 },  { 22, 107 },
    { 37, 92 },   { 7, 122 },   { 8, 121 },   { 23, 106 },
    { 38, 91 },   { 2, 127 },   { 9, 120 },   { 26, 103 },
    { 3, 126 },   { 6, 123 },   { 1, 128 },   { 19, 110 },
    { 15, 114 },  { 124, 5 },   { 72, 57 },   { 115, 14 },
    { 125, 4 },   { 70, 59 },   { 100, 29 },  { 60, 69 },
    { 116, 13 },  { 99, 30 },   { 78, 51 },   { 94, 35 },
    { 104, 25 },  { 111, 18 },  { 71, 58 },   { 90, 39 },
    { 45, 84 },   { 16, 113 },  { 82, 47 },   { 95, 34 },
    { 87, 42 },   { 83, 46 },   { 53, 76 },   { 48, 81 },
    { 68, 61 },   { 105, 24 },  { 98, 31 },   { 88, 41 },
    { 75, 54 },   { 43, 86 },   { 52, 77 },   { 117, 12 },
    { 119, 10 },  { 118, 11 },  { 85, 44 },   { 101, 28 },
    { 36, 93 },   { 55, 74 },   { 89, 40 },   { 79, 50 },
    { 56, 73 },   { 49, 80 },   { 64, 65 },   { 27, 102 },
    { 32, 97 },   { 112, 17 },  { 67, 62 },   { 21, 108 },
};
// 64 rows of three shape numbers each; every value lies in 1..242.
// NOTE(review): presumably row N gives the per-subset shapes of three-subset partition N - confirm against the users of this table.
static const int g_shapes3[64][3] =
{
    { 148, 160, 240 }, { 132, 212, 205 }, { 136, 233, 187 }, { 175, 237, 143 },
    { 6, 186, 232 },   { 33, 142, 232 },  { 131, 123, 142 }, { 131, 96, 186 },
    { 6, 171, 110 },   { 1, 18, 110 },    { 1, 146, 123 },   { 33, 195, 66 },
    { 20, 51, 66 },    { 20, 178, 96 },   { 2, 177, 106 },   { 211, 4, 59 },
    { 8, 191, 91 },    { 230, 14, 29 },   { 1, 188, 234 },   { 151, 110, 168 },
    { 20, 144, 238 },  { 137, 66, 206 },  { 173, 179, 232 }, { 209, 194, 186 },
    { 239, 165, 142 }, { 131, 152, 242 }, { 214, 54, 12 },   { 140, 219, 201 },
    { 190, 150, 231 }, { 156, 135, 241 }, { 185, 227, 167 }, { 145, 210, 59 },
    { 138, 174, 106 }, { 189, 229, 14 },  { 176, 133, 106 }, { 78, 178, 195 },
    { 111, 146, 171 }, { 216, 180, 196 }, { 217, 181, 193 }, { 184, 228, 166 },
    { 192, 225, 153 }, { 134, 141, 123 }, { 6, 222, 198 },   { 149, 183, 96 },
    { 33, 226, 164 },  { 161, 215, 51 },  { 197, 221, 18 },  { 1, 223, 199 },
    { 154, 163, 110 }, { 20, 236, 169 },  { 157, 204, 66 },  { 1, 202, 220 },
    { 20, 170, 235 },  { 203, 158, 66 },  { 162, 155, 110 }, { 6, 201, 218 },
    { 139, 135, 123 }, { 33, 167, 224 },  { 182, 150, 96 },  { 19, 200, 213 },
    { 63, 207, 159 },  { 147, 172, 109 }, { 129, 130, 128 }, { 208, 14, 59 },
};
752
// Shape numbers evaluated for single-subset modes: only shape 0.
static const int g_shapeList1[] = { 0 };
757
// Shape numbers evaluated for two-subset modes: the contiguous run 1..128.
static const int g_shapeList2[] =
{
    1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,
    17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,
    33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
    49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
    65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,
    81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,
    97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
    113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
};
773
// Union of the one-subset and two-subset shape numbers: the contiguous run 0..128.
static const int g_shapeList12[] =
{
    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,
    16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
    32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
    48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
    64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
    80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
    96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
    128,
};
789
// Shape numbers evaluated for three-subset modes, in ascending order:
// 26 scattered values up to 128, followed by the contiguous run 129..242.
static const int g_shapeList3[] =
{
    // Scattered entries
    1,   2,   4,   6,   8,   12,  14,  18,  19,  20,  29,  33,  51,
    54,  59,  63,  66,  78,  91,  96,  106, 109, 110, 111, 123, 128,

    // Contiguous run 129..242
    129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
    145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
    161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176,
    177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
    193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
    209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
    225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240,
    241, 242,
};
806
// Reduced three-subset shape list (36 ascending entries); TrySinglePlane selects it
// instead of g_shapeList3 when a three-subset mode has only 16 partitions.
static const int g_shapeList3Short[] =
{
    1,   2,   4,   6,   18,  20,  33,  51,  59,
    66,  96,  106, 110, 123, 131, 132, 136, 142,
    143, 146, 148, 160, 171, 175, 177, 178, 186,
    187, 195, 205, 211, 212, 232, 233, 237, 240,
};
814
// Every shape number: the contiguous run 0..242.
static const int g_shapeListAll[] =
{
    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,
    16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
    32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
    48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
    64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
    80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
    96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
    240, 241, 242,
};
841
842 static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]);
843 static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
844 static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
845 static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
846 static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
847 static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]);
848 static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]);
849 }
850
851 struct PackingVector
852 {
853 uint32_t m_vector[4];
854 int m_offset;
855
856 void Init()
857 {
858 for (int i = 0; i < 4; i++)
859 m_vector[i] = 0;
860
861 m_offset = 0;
862 }
863
864 void InitPacked(const uint32_t *v, int bits)
865 {
866 for (int b = 0; b < bits; b += 32)
867 m_vector[b / 32] = v[b / 32];
868
869 m_offset = bits;
870 }
871
872 inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
873 {
874 int vOffset = m_offset >> 5;
875 int bitOffset = m_offset & 0x1f;
876
877 m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff);
878
879 int overflowBits = bitOffset + bits - 32;
880 if (overflowBits > 0)
881 m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));
882
883 m_offset += bits;
884 }
885
886 inline void Flush(uint8_t* output)
887 {
888 assert(m_offset == 128);
889
890 for (int v = 0; v < 4; v++)
891 {
892 uint32_t chunk = m_vector[v];
893 for (int b = 0; b < 4; b++)
894 output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
895 }
896 }
897 };
898
899
900 struct UnpackingVector
901 {
902 uint32_t m_vector[4];
903
904 void Init(const uint8_t *bytes)
905 {
906 for (int i = 0; i < 4; i++)
907 m_vector[i] = 0;
908
909 for (int b = 0; b < 16; b++)
910 m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8));
911 }
912
913 inline void UnpackStart(uint32_t *v, int bits)
914 {
915 for (int b = 0; b < bits; b += 32)
916 v[b / 32] = m_vector[b / 32];
917
918 int entriesShifted = bits / 32;
919 int carry = bits % 32;
920
921 for (int i = entriesShifted; i < 4; i++)
922 m_vector[i - entriesShifted] = m_vector[i];
923
924 int entriesRemaining = 4 - entriesShifted;
925 if (carry)
926 {
927 uint32_t bitMask = (1 << carry) - 1;
928 for (int i = 0; i < entriesRemaining; i++)
929 {
930 m_vector[i] >>= carry;
931 if (i != entriesRemaining - 1)
932 m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - carry);
933 }
934 }
935 }
936
937 inline ParallelMath::ScalarUInt16 Unpack(int bits)
938 {
939 uint32_t bitMask = (1 << bits) - 1;
940
941 ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
942
943 for (int i = 0; i < 4; i++)
944 {
945 m_vector[i] >>= bits;
946 if (i != 3)
947 m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
948 }
949
950 return result;
951 }
952 };
953
954 ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
955 {
956 if (isSigned)
957 {
958 ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
959 return (v * 32.0f + offset) / 31.0f;
960 }
961 else
962 return (v * 64.0f + 30.0f) / 31.0f;
963 }
964
965 ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
966 {
967#ifdef CVTT_ENABLE_ASSERTS
968 for (int i = 0; i < ParallelMath::ParallelSize; i++)
969 assert(ParallelMath::Extract(v, i) != -32768)
970#endif
971
972 ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
973 ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
974
975 ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
976 ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
977 ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
978 ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
979
980 return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
981 }
982
983 ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
984 {
985 return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
986 }
987
988 void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
989 {
990 for (int epi = 0; epi < 2; epi++)
991 {
992 for (int ch = 0; ch < 3; ch++)
993 {
994 if (isSigned)
995 outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch])));
996 else
997 outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch])));
998 }
999 }
1000 }
1001
// Scratch storage used by BC7Computer::TrySinglePlane while it searches the
// single-plane modes. All per-shape arrays are indexed by shape number.
struct SinglePlaneTemporaries
{
    // Initial endpoint estimates per shape, produced by the endpoint selector
    // (3-channel and 4-channel variants).
    UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
    UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];

    // Best index found so far for each fragment slot; a shape's entries start at
    // its offset in the fragment table (shapeStart + pixel-in-shape).
    ParallelMath::UInt15 fragmentBestIndexes[BC7Data::g_numFragments];
    // Best endpoints found so far per shape: [shape][endpoint][channel].
    ParallelMath::UInt15 shapeBestEP[BC7Data::g_numShapesAll][2][4];
    // Lowest error found so far per shape; reset to FLT_MAX at the start of each mode.
    ParallelMath::Float shapeBestError[BC7Data::g_numShapesAll];
};
1011 }
1012}
1013
1014void cvtt::Internal::BC7Computer::TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
1015{
1016 ParallelMath::RoundTowardNearestForScope roundingMode;
1017
1018 float tf[2];
1019 Util::ComputeTweakFactors(tweak, range, tf);
1020
1021 MFloat base = ParallelMath::ToFloat(original[0]);
1022 MFloat offs = ParallelMath::ToFloat(original[1]) - base;
1023
1024 result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode);
1025 result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode);
1026}
1027
1028void cvtt::Internal::BC7Computer::Quantize(MUInt15* color, int bits, int channels)
1029{
1030 for (int ch = 0; ch < channels; ch++)
1031 color[ch] = ParallelMath::RightShift(((color[ch] << bits) - color[ch]) + ParallelMath::MakeUInt15(127 + (1 << (7 - bits))), 8);
1032}
1033
1034void cvtt::Internal::BC7Computer::QuantizeP(MUInt15* color, int bits, uint16_t p, int channels)
1035{
1036 int16_t addend;
1037 if (p)
1038 addend = ((1 << (8 - bits)) - 1);
1039 else
1040 addend = 255;
1041
1042 for (int ch = 0; ch < channels; ch++)
1043 {
1044 MUInt16 ch16 = ParallelMath::LosslessCast<MUInt16>::Cast(color[ch]);
1045 ch16 = ParallelMath::RightShift((ch16 << (bits + 1)) - ch16 + addend, 9);
1046 ch16 = (ch16 << 1) | ParallelMath::MakeUInt16(p);
1047 color[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ch16);
1048 }
1049}
1050
1051void cvtt::Internal::BC7Computer::Unquantize(MUInt15* color, int bits, int channels)
1052{
1053 for (int ch = 0; ch < channels; ch++)
1054 {
1055 MUInt15 clr = color[ch];
1056 clr = clr << (8 - bits);
1057 color[ch] = clr | ParallelMath::RightShift(clr, bits);
1058 }
1059}
1060
1061void cvtt::Internal::BC7Computer::CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2])
1062{
1063 for (int j = 0; j < 2; j++)
1064 {
1065 QuantizeP(ep[j], 4, p[j], 3);
1066 Unquantize(ep[j], 5, 3);
1067 ep[j][3] = ParallelMath::MakeUInt15(255);
1068 }
1069}
1070
1071void cvtt::Internal::BC7Computer::CompressEndpoints1(MUInt15 ep[2][4], uint16_t p)
1072{
1073 for (int j = 0; j < 2; j++)
1074 {
1075 QuantizeP(ep[j], 6, p, 3);
1076 Unquantize(ep[j], 7, 3);
1077 ep[j][3] = ParallelMath::MakeUInt15(255);
1078 }
1079}
1080
1081void cvtt::Internal::BC7Computer::CompressEndpoints2(MUInt15 ep[2][4])
1082{
1083 for (int j = 0; j < 2; j++)
1084 {
1085 Quantize(ep[j], 5, 3);
1086 Unquantize(ep[j], 5, 3);
1087 ep[j][3] = ParallelMath::MakeUInt15(255);
1088 }
1089}
1090
1091void cvtt::Internal::BC7Computer::CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2])
1092{
1093 for (int j = 0; j < 2; j++)
1094 {
1095 QuantizeP(ep[j], 7, p[j], 3);
1096 ep[j][3] = ParallelMath::MakeUInt15(255);
1097 }
1098}
1099
1100void cvtt::Internal::BC7Computer::CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2])
1101{
1102 for (int j = 0; j < 2; j++)
1103 {
1104 Quantize(epRGB[j], 5, 3);
1105 Unquantize(epRGB[j], 5, 3);
1106
1107 Quantize(epA + j, 6, 1);
1108 Unquantize(epA + j, 6, 1);
1109 }
1110}
1111
1112void cvtt::Internal::BC7Computer::CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2])
1113{
1114 for (int j = 0; j < 2; j++)
1115 {
1116 Quantize(epRGB[j], 7, 3);
1117 Unquantize(epRGB[j], 7, 3);
1118 }
1119
1120 // Alpha is full precision
1121 (void)epA;
1122}
1123
1124void cvtt::Internal::BC7Computer::CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2])
1125{
1126 for (int j = 0; j < 2; j++)
1127 QuantizeP(ep[j], 7, p[j], 4);
1128}
1129
1130void cvtt::Internal::BC7Computer::CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2])
1131{
1132 for (int j = 0; j < 2; j++)
1133 {
1134 QuantizeP(ep[j], 5, p[j], 4);
1135 Unquantize(ep[j], 6, 4);
1136 }
1137}
1138
1139void cvtt::Internal::BC7Computer::TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
1140{
1141 MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
1142
1143 MUInt15 intAverage[4];
1144 for (int ch = 0; ch < 4; ch++)
1145 intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
1146
1147 MUInt15 eps[2][4];
1148 MUInt15 reconstructed[4];
1149 MUInt15 index = ParallelMath::MakeUInt15(0);
1150
1151 for (int epi = 0; epi < 2; epi++)
1152 {
1153 for (int ch = 0; ch < 3; ch++)
1154 eps[epi][ch] = ParallelMath::MakeUInt15(0);
1155 eps[epi][3] = ParallelMath::MakeUInt15(255);
1156 }
1157
1158 for (int ch = 0; ch < 3; ch++)
1159 reconstructed[ch] = ParallelMath::MakeUInt15(0);
1160 reconstructed[3] = ParallelMath::MakeUInt15(255);
1161
1162 // Depending on the target index and parity bits, there are multiple valid solid colors.
1163 // We want to find the one closest to the actual average.
1164 MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX);
1165 for (int t = 0; t < numTables; t++)
1166 {
1167 const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
1168
1169 ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
1170
1171 MUInt15 candidateReconstructed[4];
1172 MUInt15 candidateEPs[2][4];
1173
1174 for (int i = 0; i < ParallelMath::ParallelSize; i++)
1175 {
1176 for (int ch = 0; ch < numRealChannels; ch++)
1177 {
1178 ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
1179 assert(avgValue >= 0 && avgValue <= 255);
1180
1181 const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
1182
1183 ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
1184 ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
1185 ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
1186 }
1187 }
1188
1189 MFloat avgError = ParallelMath::MakeFloatZero();
1190 for (int ch = 0; ch < numRealChannels; ch++)
1191 {
1192 MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
1193 avgError = avgError + delta * delta * channelWeightsSq[ch];
1194 }
1195
1196 ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
1197 better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
1198
1199 if (ParallelMath::AnySet(better))
1200 {
1201 ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
1202
1203 MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
1204
1205 ParallelMath::ConditionalSet(index, better, candidateIndex);
1206
1207 for (int ch = 0; ch < numRealChannels; ch++)
1208 ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
1209
1210 for (int epi = 0; epi < 2; epi++)
1211 for (int ch = 0; ch < numRealChannels; ch++)
1212 ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
1213 }
1214 }
1215
1216 AggregatedError<4> aggError;
1217 for (int pxi = 0; pxi < shapeLength; pxi++)
1218 {
1219 int px = fragmentStart[pxi];
1220
1221 BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
1222 }
1223
1224 MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
1225
1226 ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
1227 if (ParallelMath::AnySet(better))
1228 {
1229 shapeBestError = ParallelMath::Min(shapeBestError, error);
1230 for (int epi = 0; epi < 2; epi++)
1231 {
1232 for (int ch = 0; ch < numRealChannels; ch++)
1233 ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
1234 }
1235
1236 for (int pxi = 0; pxi < shapeLength; pxi++)
1237 ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
1238 }
1239}
1240
1241void cvtt::Internal::BC7Computer::TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
1242{
1243 if (numRefineRounds < 1)
1244 numRefineRounds = 1;
1245
1246 float channelWeightsSq[4];
1247
1248 for (int ch = 0; ch < 4; ch++)
1249 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
1250
1251 SinglePlaneTemporaries temps;
1252
1253 MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
1254 MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
1255 ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
1256 for (int px = 0; px < 16; px++)
1257 {
1258 MUInt15 a = pixels[px][3];
1259 maxAlpha = ParallelMath::Max(maxAlpha, a);
1260 minAlpha = ParallelMath::Min(minAlpha, a);
1261
1262 isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
1263 }
1264
1265 ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
1266 ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
1267
1268 bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
1269
1270 // Try RGB modes if any block has a min alpha 251 or higher
1271 bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
1272
1273 // Try mode 7 if any block has alpha.
1274 // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
1275 // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
1276 // situations, and only by at most 1 unit of error per pixel.
1277 bool allowMode7 = anyBlockHasAlpha || (encodingPlan.mode7RGBPartitionEnabled != 0);
1278
1279 MFloat preWeightedPixels[16][4];
1280
1281 BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
1282
1283 // Get initial RGB endpoints
1284 if (allowRGBModes)
1285 {
1286 const uint8_t *shapeList = encodingPlan.rgbShapeList;
1287 int numShapesToEvaluate = encodingPlan.rgbNumShapesToEvaluate;
1288
1289 for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
1290 {
1291 int shape = shapeList[shapeIter];
1292
1293 int shapeStart = BC7Data::g_shapeRanges[shape][0];
1294 int shapeSize = BC7Data::g_shapeRanges[shape][1];
1295
1296 EndpointSelector<3, 8> epSelector;
1297
1298 for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
1299 {
1300 for (int spx = 0; spx < shapeSize; spx++)
1301 {
1302 int px = BC7Data::g_fragments[shapeStart + spx];
1303 epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
1304 }
1305 epSelector.FinishPass(epPass);
1306 }
1307 temps.unfinishedRGB[shape] = epSelector.GetEndpoints(channelWeights);
1308 }
1309 }
1310
1311 // Get initial RGBA endpoints
1312 {
1313 const uint8_t *shapeList = encodingPlan.rgbaShapeList;
1314 int numShapesToEvaluate = encodingPlan.rgbaNumShapesToEvaluate;
1315
1316 for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
1317 {
1318 int shape = shapeList[shapeIter];
1319
1320 if (anyBlockHasAlpha || !allowRGBModes)
1321 {
1322 int shapeStart = BC7Data::g_shapeRanges[shape][0];
1323 int shapeSize = BC7Data::g_shapeRanges[shape][1];
1324
1325 EndpointSelector<4, 8> epSelector;
1326
1327 for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
1328 {
1329 for (int spx = 0; spx < shapeSize; spx++)
1330 {
1331 int px = BC7Data::g_fragments[shapeStart + spx];
1332 epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
1333 }
1334 epSelector.FinishPass(epPass);
1335 }
1336 temps.unfinishedRGBA[shape] = epSelector.GetEndpoints(channelWeights);
1337 }
1338 else
1339 {
1340 temps.unfinishedRGBA[shape] = temps.unfinishedRGB[shape].ExpandTo<4>(255);
1341 }
1342 }
1343 }
1344
1345 for (uint16_t mode = 0; mode <= 7; mode++)
1346 {
1347 if (mode == 4 || mode == 5)
1348 continue;
1349
1350 if (mode < 4 && !allowRGBModes)
1351 continue;
1352
1353 if (mode == 7 && !allowMode7)
1354 continue;
1355
1356 uint64_t partitionEnabledBits = 0;
1357 switch (mode)
1358 {
1359 case 0:
1360 partitionEnabledBits = encodingPlan.mode0PartitionEnabled;
1361 break;
1362 case 1:
1363 partitionEnabledBits = encodingPlan.mode1PartitionEnabled;
1364 break;
1365 case 2:
1366 partitionEnabledBits = encodingPlan.mode2PartitionEnabled;
1367 break;
1368 case 3:
1369 partitionEnabledBits = encodingPlan.mode3PartitionEnabled;
1370 break;
1371 case 6:
1372 partitionEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
1373 break;
1374 case 7:
1375 if (anyBlockHasAlpha)
1376 partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
1377 else
1378 partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
1379 break;
1380 default:
1381 break;
1382 }
1383
1384 bool isRGB = (mode < 4);
1385
1386 unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
1387 int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
1388 int indexPrec = BC7Data::g_modes[mode].m_indexBits;
1389
1390 int parityBitMax = 1;
1391 if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
1392 parityBitMax = 4;
1393 else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
1394 parityBitMax = 2;
1395
1396 int numRealChannels = isRGB ? 3 : 4;
1397
1398 int numShapes;
1399 const int *shapeList;
1400
1401 if (numSubsets == 1)
1402 {
1403 numShapes = BC7Data::g_numShapes1;
1404 shapeList = BC7Data::g_shapeList1;
1405 }
1406 else if (numSubsets == 2)
1407 {
1408 numShapes = BC7Data::g_numShapes2;
1409 shapeList = BC7Data::g_shapeList2;
1410 }
1411 else
1412 {
1413 assert(numSubsets == 3);
1414 if (numPartitions == 16)
1415 {
1416 numShapes = BC7Data::g_numShapes3Short;
1417 shapeList = BC7Data::g_shapeList3Short;
1418 }
1419 else
1420 {
1421 assert(numPartitions == 64);
1422 numShapes = BC7Data::g_numShapes3;
1423 shapeList = BC7Data::g_shapeList3;
1424 }
1425 }
1426
1427 for (int slot = 0; slot < BC7Data::g_numShapesAll; slot++)
1428 temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
1429
1430 for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
1431 {
1432 int shape = shapeList[shapeIter];
1433
1434 int numTweakRounds = 0;
1435 if (isRGB)
1436 numTweakRounds = encodingPlan.seedPointsForShapeRGB[shape];
1437 else
1438 numTweakRounds = encodingPlan.seedPointsForShapeRGBA[shape];
1439
1440 if (numTweakRounds == 0)
1441 continue;
1442
1443 if (numTweakRounds > MaxTweakRounds)
1444 numTweakRounds = MaxTweakRounds;
1445
1446 int shapeStart = BC7Data::g_shapeRanges[shape][0];
1447 int shapeLength = BC7Data::g_shapeRanges[shape][1];
1448
1449 AggregatedError<1> alphaAggError;
1450 if (isRGB && anyBlockHasAlpha)
1451 {
1452 MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
1453
1454 for (int pxi = 0; pxi < shapeLength; pxi++)
1455 {
1456 int px = BC7Data::g_fragments[shapeStart + pxi];
1457 MUInt15 original[1] = { pixels[px][3] };
1458 BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
1459 }
1460 }
1461
1462 float alphaWeightsSq[1] = { channelWeightsSq[3] };
1463 MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
1464
1465 MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
1466
1467 for (int tweak = 0; tweak < numTweakRounds; tweak++)
1468 {
1469 if (isRGB)
1470 {
1471 temps.unfinishedRGB[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
1472 tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
1473 }
1474 else
1475 {
1476 temps.unfinishedRGBA[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
1477 }
1478 }
1479
1480 ParallelMath::Int16CompFlag punchThroughInvalid[4];
1481 for (int pIter = 0; pIter < parityBitMax; pIter++)
1482 {
1483 punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
1484
1485 if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
1486 {
1487 // Modes 6 and 7 have parity bits that affect alpha
1488 if (pIter == 0)
1489 punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
1490 else if (pIter == parityBitMax - 1)
1491 punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
1492 else
1493 punchThroughInvalid[pIter] = isPunchThrough;
1494 }
1495 }
1496
1497 for (int pIter = 0; pIter < parityBitMax; pIter++)
1498 {
1499 if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
1500 continue;
1501
1502 bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
1503
1504 for (int tweak = 0; tweak < numTweakRounds; tweak++)
1505 {
1506 uint16_t p[2];
1507 p[0] = (pIter & 1);
1508 p[1] = ((pIter >> 1) & 1);
1509
1510 MUInt15 ep[2][4];
1511
1512 for (int epi = 0; epi < 2; epi++)
1513 for (int ch = 0; ch < 4; ch++)
1514 ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
1515
1516 for (int refine = 0; refine < numRefineRounds; refine++)
1517 {
1518 switch (mode)
1519 {
1520 case 0:
1521 CompressEndpoints0(ep, p);
1522 break;
1523 case 1:
1524 CompressEndpoints1(ep, p[0]);
1525 break;
1526 case 2:
1527 CompressEndpoints2(ep);
1528 break;
1529 case 3:
1530 CompressEndpoints3(ep, p);
1531 break;
1532 case 6:
1533 CompressEndpoints6(ep, p);
1534 break;
1535 case 7:
1536 CompressEndpoints7(ep, p);
1537 break;
1538 default:
1539 assert(false);
1540 break;
1541 };
1542
1543 MFloat shapeError = ParallelMath::MakeFloatZero();
1544
1545 IndexSelector<4> indexSelector;
1546 indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
1547
1548 EndpointRefiner<4> epRefiner;
1549 epRefiner.Init(1 << indexPrec, channelWeights);
1550
1551 MUInt15 indexes[16];
1552
1553 AggregatedError<4> aggError;
1554 for (int pxi = 0; pxi < shapeLength; pxi++)
1555 {
1556 int px = BC7Data::g_fragments[shapeStart + pxi];
1557
1558 MUInt15 index;
1559 MUInt15 reconstructed[4];
1560
1561 index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
1562 indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
1563
1564 if (flags & cvtt::Flags::BC7_FastIndexing)
1565 BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
1566 else
1567 {
1568 MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
1569
1570 MUInt15 altIndexes[2];
1571 altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
1572 altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
1573
1574 for (int ii = 0; ii < 2; ii++)
1575 {
1576 indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
1577
1578 MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
1579 ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
1580 error = ParallelMath::Min(error, altError);
1581 ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
1582 }
1583
1584 shapeError = shapeError + error;
1585 }
1586
1587 if (refine != numRefineRounds - 1)
1588 epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
1589
1590 indexes[pxi] = index;
1591 }
1592
1593 if (flags & cvtt::Flags::BC7_FastIndexing)
1594 shapeError = aggError.Finalize(flags, channelWeightsSq);
1595
1596 if (isRGB)
1597 shapeError = shapeError + staticAlphaError;
1598
1599 ParallelMath::FloatCompFlag shapeErrorBetter;
1600 ParallelMath::Int16CompFlag shapeErrorBetter16;
1601
1602 shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shape]);
1603 shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
1604
1605 if (ParallelMath::AnySet(shapeErrorBetter16))
1606 {
1607 bool punchThroughOK = true;
1608 if (needPunchThroughCheck)
1609 {
1610 shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
1611 shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
1612
1613 if (!ParallelMath::AnySet(shapeErrorBetter16))
1614 punchThroughOK = false;
1615 }
1616
1617 if (punchThroughOK)
1618 {
1619 ParallelMath::ConditionalSet(temps.shapeBestError[shape], shapeErrorBetter, shapeError);
1620 for (int epi = 0; epi < 2; epi++)
1621 for (int ch = 0; ch < numRealChannels; ch++)
1622 ParallelMath::ConditionalSet(temps.shapeBestEP[shape][epi][ch], shapeErrorBetter16, ep[epi][ch]);
1623
1624 for (int pxi = 0; pxi < shapeLength; pxi++)
1625 ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
1626 }
1627 }
1628
1629 if (refine != numRefineRounds - 1)
1630 epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
1631 } // refine
1632 } // tweak
1633 } // p
1634
1635 if (flags & cvtt::Flags::BC7_TrySingleColor)
1636 {
1637 MUInt15 total[4];
1638 for (int ch = 0; ch < 4; ch++)
1639 total[ch] = ParallelMath::MakeUInt15(0);
1640
1641 for (int pxi = 0; pxi < shapeLength; pxi++)
1642 {
1643 int px = BC7Data::g_fragments[shapeStart + pxi];
1644 for (int ch = 0; ch < 4; ch++)
1645 total[ch] = total[ch] + pixels[pxi][ch];
1646 }
1647
1648 MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
1649 MFloat average[4];
1650 for (int ch = 0; ch < 4; ch++)
1651 average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
1652
1653 const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
1654 MFloat &shapeBestError = temps.shapeBestError[shape];
1655 MUInt15 (&shapeBestEP)[2][4] = temps.shapeBestEP[shape];
1656 MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
1657
1658 const cvtt::Tables::BC7SC::Table **scTables = NULL;
1659 int numSCTables = 0;
1660
1661 const cvtt::Tables::BC7SC::Table *tables0[] =
1662 {
1663 &cvtt::Tables::BC7SC::g_mode0_p00_i1,
1664 &cvtt::Tables::BC7SC::g_mode0_p00_i2,
1665 &cvtt::Tables::BC7SC::g_mode0_p00_i3,
1666 &cvtt::Tables::BC7SC::g_mode0_p01_i1,
1667 &cvtt::Tables::BC7SC::g_mode0_p01_i2,
1668 &cvtt::Tables::BC7SC::g_mode0_p01_i3,
1669 &cvtt::Tables::BC7SC::g_mode0_p10_i1,
1670 &cvtt::Tables::BC7SC::g_mode0_p10_i2,
1671 &cvtt::Tables::BC7SC::g_mode0_p10_i3,
1672 &cvtt::Tables::BC7SC::g_mode0_p11_i1,
1673 &cvtt::Tables::BC7SC::g_mode0_p11_i2,
1674 &cvtt::Tables::BC7SC::g_mode0_p11_i3,
1675 };
1676
1677 const cvtt::Tables::BC7SC::Table *tables1[] =
1678 {
1679 &cvtt::Tables::BC7SC::g_mode1_p0_i1,
1680 &cvtt::Tables::BC7SC::g_mode1_p0_i2,
1681 &cvtt::Tables::BC7SC::g_mode1_p0_i3,
1682 &cvtt::Tables::BC7SC::g_mode1_p1_i1,
1683 &cvtt::Tables::BC7SC::g_mode1_p1_i2,
1684 &cvtt::Tables::BC7SC::g_mode1_p1_i3,
1685 };
1686
1687 const cvtt::Tables::BC7SC::Table *tables2[] =
1688 {
1689 &cvtt::Tables::BC7SC::g_mode2,
1690 };
1691
1692 const cvtt::Tables::BC7SC::Table *tables3[] =
1693 {
1694 &cvtt::Tables::BC7SC::g_mode3_p0,
1695 &cvtt::Tables::BC7SC::g_mode3_p1,
1696 };
1697
1698 const cvtt::Tables::BC7SC::Table *tables6[] =
1699 {
1700 &cvtt::Tables::BC7SC::g_mode6_p0_i1,
1701 &cvtt::Tables::BC7SC::g_mode6_p0_i2,
1702 &cvtt::Tables::BC7SC::g_mode6_p0_i3,
1703 &cvtt::Tables::BC7SC::g_mode6_p0_i4,
1704 &cvtt::Tables::BC7SC::g_mode6_p0_i5,
1705 &cvtt::Tables::BC7SC::g_mode6_p0_i6,
1706 &cvtt::Tables::BC7SC::g_mode6_p0_i7,
1707 &cvtt::Tables::BC7SC::g_mode6_p1_i1,
1708 &cvtt::Tables::BC7SC::g_mode6_p1_i2,
1709 &cvtt::Tables::BC7SC::g_mode6_p1_i3,
1710 &cvtt::Tables::BC7SC::g_mode6_p1_i4,
1711 &cvtt::Tables::BC7SC::g_mode6_p1_i5,
1712 &cvtt::Tables::BC7SC::g_mode6_p1_i6,
1713 &cvtt::Tables::BC7SC::g_mode6_p1_i7,
1714 };
1715
1716 const cvtt::Tables::BC7SC::Table *tables7[] =
1717 {
1718 &cvtt::Tables::BC7SC::g_mode7_p00,
1719 &cvtt::Tables::BC7SC::g_mode7_p01,
1720 &cvtt::Tables::BC7SC::g_mode7_p10,
1721 &cvtt::Tables::BC7SC::g_mode7_p11,
1722 };
1723
1724 switch (mode)
1725 {
1726 case 0:
1727 {
1728 scTables = tables0;
1729 numSCTables = sizeof(tables0) / sizeof(tables0[0]);
1730 }
1731 break;
1732 case 1:
1733 {
1734 scTables = tables1;
1735 numSCTables = sizeof(tables1) / sizeof(tables1[0]);
1736 }
1737 break;
1738 case 2:
1739 {
1740
1741 scTables = tables2;
1742 numSCTables = sizeof(tables2) / sizeof(tables2[0]);
1743 }
1744 break;
1745 case 3:
1746 {
1747 scTables = tables3;
1748 numSCTables = sizeof(tables3) / sizeof(tables3[0]);
1749 }
1750 break;
1751 case 6:
1752 {
1753 scTables = tables6;
1754 numSCTables = sizeof(tables6) / sizeof(tables6[0]);
1755 }
1756 break;
1757 case 7:
1758 {
1759 scTables = tables7;
1760 numSCTables = sizeof(tables7) / sizeof(tables7[0]);
1761 }
1762 break;
1763 default:
1764 assert(false);
1765 break;
1766 }
1767
1768 TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
1769 }
1770 } // shapeIter
1771
1772 uint64_t partitionsEnabledBits = 0xffffffffffffffffULL;
1773
1774 switch (mode)
1775 {
1776 case 0:
1777 partitionsEnabledBits = encodingPlan.mode0PartitionEnabled;
1778 break;
1779 case 1:
1780 partitionsEnabledBits = encodingPlan.mode1PartitionEnabled;
1781 break;
1782 case 2:
1783 partitionsEnabledBits = encodingPlan.mode2PartitionEnabled;
1784 break;
1785 case 3:
1786 partitionsEnabledBits = encodingPlan.mode3PartitionEnabled;
1787 break;
1788 case 6:
1789 partitionsEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
1790 break;
1791 case 7:
1792 if (anyBlockHasAlpha)
1793 partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
1794 else
1795 partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
1796 break;
1797 default:
1798 break;
1799 };
1800
1801 for (uint16_t partition = 0; partition < numPartitions; partition++)
1802 {
1803 if (((partitionsEnabledBits >> partition) & 1) == 0)
1804 continue;
1805
1806 const int *partitionShapes;
1807 if (numSubsets == 1)
1808 partitionShapes = BC7Data::g_shapes1[partition];
1809 else if (numSubsets == 2)
1810 partitionShapes = BC7Data::g_shapes2[partition];
1811 else
1812 {
1813 assert(numSubsets == 3);
1814 partitionShapes = BC7Data::g_shapes3[partition];
1815 }
1816
1817 MFloat totalError = ParallelMath::MakeFloatZero();
1818 for (int subset = 0; subset < numSubsets; subset++)
1819 totalError = totalError + temps.shapeBestError[partitionShapes[subset]];
1820
1821 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
1822 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
1823
1824 if (mode == 7 && anyBlockHasAlpha)
1825 {
1826 // Some lanes could be better, but we filter them out to ensure consistency with scalar
1827 bool isRGBAllowedForThisPartition = (((encodingPlan.mode7RGBPartitionEnabled >> partition) & 1) != 0);
1828
1829 if (!isRGBAllowedForThisPartition)
1830 {
1831 errorBetter16 = (errorBetter16 & blockHasNonMaxAlpha);
1832 errorBetter = ParallelMath::Int16FlagToFloat(errorBetter16);
1833 }
1834 }
1835
1836 if (ParallelMath::AnySet(errorBetter16))
1837 {
1838 for (int subset = 0; subset < numSubsets; subset++)
1839 {
1840 int shape = partitionShapes[subset];
1841 int shapeStart = BC7Data::g_shapeRanges[shape][0];
1842 int shapeLength = BC7Data::g_shapeRanges[shape][1];
1843
1844 for (int epi = 0; epi < 2; epi++)
1845 for (int ch = 0; ch < 4; ch++)
1846 ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shape][epi][ch]);
1847
1848 for (int pxi = 0; pxi < shapeLength; pxi++)
1849 {
1850 int px = BC7Data::g_fragments[shapeStart + pxi];
1851 ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
1852 }
1853 }
1854
1855 ParallelMath::ConditionalSet(work.m_error, errorBetter, totalError);
1856 ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
1857 ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
1858 }
1859 }
1860 }
1861}
1862
1863void cvtt::Internal::BC7Computer::TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
1864{
1865 // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
1866 // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
1867 // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
1868 // - Separate alpha channel, then weighted RGB
1869 // - Alpha+2 other channels, then the independent channel
1870 if (numRefineRounds < 1)
1871 numRefineRounds = 1;
1872
1873 float channelWeightsSq[4];
1874 for (int ch = 0; ch < 4; ch++)
1875 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
1876
1877 for (uint16_t mode = 4; mode <= 5; mode++)
1878 {
1879 int numSP[2] = { 0, 0 };
1880
1881 for (uint16_t rotation = 0; rotation < 4; rotation++)
1882 {
1883 if (mode == 4)
1884 {
1885 numSP[0] = encodingPlan.mode4SP[rotation][0];
1886 numSP[1] = encodingPlan.mode4SP[rotation][1];
1887 }
1888 else
1889 numSP[0] = numSP[1] = encodingPlan.mode5SP[rotation];
1890
1891 if (numSP[0] == 0 && numSP[1] == 0)
1892 continue;
1893
1894 int alphaChannel = (rotation + 3) & 3;
1895 int redChannel = (rotation == 1) ? 3 : 0;
1896 int greenChannel = (rotation == 2) ? 3 : 1;
1897 int blueChannel = (rotation == 3) ? 3 : 2;
1898
1899 MUInt15 rotatedRGB[16][3];
1900 MFloat floatRotatedRGB[16][3];
1901
1902 for (int px = 0; px < 16; px++)
1903 {
1904 rotatedRGB[px][0] = pixels[px][redChannel];
1905 rotatedRGB[px][1] = pixels[px][greenChannel];
1906 rotatedRGB[px][2] = pixels[px][blueChannel];
1907
1908 for (int ch = 0; ch < 3; ch++)
1909 floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
1910 }
1911
1912 uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
1913
1914 float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
1915 float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
1916 float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
1917 float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
1918
1919 float uniformWeight[1] = { 1.0f }; // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
1920
1921 MFloat preWeightedRotatedRGB[16][3];
1922 BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
1923
1924 for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
1925 {
1926 int numTweakRounds = numSP[indexSelector];
1927
1928 if (numTweakRounds <= 0)
1929 continue;
1930
1931 if (numTweakRounds > MaxTweakRounds)
1932 numTweakRounds = MaxTweakRounds;
1933
1934 EndpointSelector<3, 8> rgbSelector;
1935
1936 for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
1937 {
1938 for (int px = 0; px < 16; px++)
1939 rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
1940
1941 rgbSelector.FinishPass(epPass);
1942 }
1943
1944 MUInt15 alphaRange[2];
1945
1946 alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
1947 for (int px = 1; px < 16; px++)
1948 {
1949 alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
1950 alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
1951 }
1952
1953 int rgbPrec = 0;
1954 int alphaPrec = 0;
1955
1956 if (mode == 4)
1957 {
1958 rgbPrec = indexSelector ? 3 : 2;
1959 alphaPrec = indexSelector ? 2 : 3;
1960 }
1961 else
1962 rgbPrec = alphaPrec = 2;
1963
1964 UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
1965
1966 MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
1967 MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
1968
1969 MUInt15 bestRGBIndexes[16];
1970 MUInt15 bestAlphaIndexes[16];
1971 MUInt15 bestEP[2][4];
1972
1973 for (int px = 0; px < 16; px++)
1974 bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
1975
1976 for (int tweak = 0; tweak < numTweakRounds; tweak++)
1977 {
1978 MUInt15 rgbEP[2][3];
1979 MUInt15 alphaEP[2];
1980
1981 unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
1982
1983 TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
1984
1985 for (int refine = 0; refine < numRefineRounds; refine++)
1986 {
1987 if (mode == 4)
1988 CompressEndpoints4(rgbEP, alphaEP);
1989 else
1990 CompressEndpoints5(rgbEP, alphaEP);
1991
1992
1993 IndexSelector<1> alphaIndexSelector;
1994 IndexSelector<3> rgbIndexSelector;
1995
1996 {
1997 MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
1998 alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
1999 }
2000 rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
2001
2002 EndpointRefiner<3> rgbRefiner;
2003 EndpointRefiner<1> alphaRefiner;
2004
2005 rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
2006 alphaRefiner.Init(1 << alphaPrec, uniformWeight);
2007
2008 MFloat errorRGB = ParallelMath::MakeFloatZero();
2009 MFloat errorA = ParallelMath::MakeFloatZero();
2010
2011 MUInt15 rgbIndexes[16];
2012 MUInt15 alphaIndexes[16];
2013
2014 AggregatedError<3> rgbAggError;
2015 AggregatedError<1> alphaAggError;
2016
2017 for (int px = 0; px < 16; px++)
2018 {
2019 MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
2020 MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
2021
2022 MUInt15 reconstructedRGB[3];
2023 MUInt15 reconstructedAlpha[1];
2024
2025 rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
2026 alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
2027
2028 if (flags & cvtt::Flags::BC7_FastIndexing)
2029 {
2030 BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
2031 BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
2032 }
2033 else
2034 {
2035 AggregatedError<3> baseRGBAggError;
2036 AggregatedError<1> baseAlphaAggError;
2037
2038 BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
2039 BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
2040
2041 MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
2042 MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
2043
2044 MUInt15 altRGBIndexes[2];
2045 MUInt15 altAlphaIndexes[2];
2046
2047 altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
2048 altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
2049
2050 altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
2051 altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
2052
2053 for (int ii = 0; ii < 2; ii++)
2054 {
2055 rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
2056 alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
2057
2058 AggregatedError<3> altRGBAggError;
2059 AggregatedError<1> altAlphaAggError;
2060
2061 BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
2062 BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
2063
2064 MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
2065 MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
2066
2067 ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
2068 ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
2069
2070 rgbError = ParallelMath::Min(altRGBError, rgbError);
2071 alphaError = ParallelMath::Min(altAlphaError, alphaError);
2072
2073 ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
2074 ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
2075 }
2076
2077 errorRGB = errorRGB + rgbError;
2078 errorA = errorA + alphaError;
2079 }
2080
2081 if (refine != numRefineRounds - 1)
2082 {
2083 rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
2084 alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
2085 }
2086
2087 if (flags & Flags::BC7_FastIndexing)
2088 {
2089 errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
2090 errorA = alphaAggError.Finalize(flags, rotatedAlphaWeightSq);
2091 }
2092
2093 rgbIndexes[px] = rgbIndex;
2094 alphaIndexes[px] = alphaIndex;
2095 }
2096
2097 ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
2098 ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
2099
2100 ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
2101 ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
2102
2103 if (ParallelMath::AnySet(rgbBetterInt16))
2104 {
2105 bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
2106
2107 for (int px = 0; px < 16; px++)
2108 ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
2109
2110 for (int ep = 0; ep < 2; ep++)
2111 {
2112 for (int ch = 0; ch < 3; ch++)
2113 ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
2114 }
2115 }
2116
2117 if (ParallelMath::AnySet(alphaBetterInt16))
2118 {
2119 bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
2120
2121 for (int px = 0; px < 16; px++)
2122 ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
2123
2124 for (int ep = 0; ep < 2; ep++)
2125 ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
2126 }
2127
2128 if (refine != numRefineRounds - 1)
2129 {
2130 rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
2131
2132 MUInt15 alphaEPTemp[2][1];
2133 alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
2134
2135 for (int i = 0; i < 2; i++)
2136 alphaEP[i] = alphaEPTemp[i][0];
2137 }
2138 } // refine
2139 } // tweak
2140
2141 MFloat combinedError = bestRGBError + bestAlphaError;
2142
2143 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
2144 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
2145
2146 work.m_error = ParallelMath::Min(combinedError, work.m_error);
2147
2148 ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
2149 ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
2150 ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
2151
2152 for (int px = 0; px < 16; px++)
2153 {
2154 ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
2155 ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
2156 }
2157
2158 for (int ep = 0; ep < 2; ep++)
2159 for (int ch = 0; ch < 4; ch++)
2160 ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
2161 }
2162 }
2163 }
2164}
2165
2166template<class T>
2167void cvtt::Internal::BC7Computer::Swap(T& a, T& b)
2168{
2169 T temp = a;
2170 a = b;
2171 b = temp;
2172}
2173
2174void cvtt::Internal::BC7Computer::Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds)
2175{
2176 MUInt15 pixels[16][4];
2177 MFloat floatPixels[16][4];
2178
2179 for (int px = 0; px < 16; px++)
2180 {
2181 for (int ch = 0; ch < 4; ch++)
2182 ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
2183 }
2184
2185 for (int px = 0; px < 16; px++)
2186 {
2187 for (int ch = 0; ch < 4; ch++)
2188 floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
2189 }
2190
2191 BC67::WorkInfo work;
2192 memset(&work, 0, sizeof(work));
2193
2194 work.m_error = ParallelMath::MakeFloat(FLT_MAX);
2195
2196 {
2197 ParallelMath::RoundTowardNearestForScope rtn;
2198 TrySinglePlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
2199 TryDualPlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
2200 }
2201
2202 for (int block = 0; block < ParallelMath::ParallelSize; block++)
2203 {
2204 PackingVector pv;
2205 pv.Init();
2206
2207 ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
2208 ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
2209 ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
2210
2211 const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
2212
2213 ParallelMath::ScalarUInt16 indexes[16];
2214 ParallelMath::ScalarUInt16 indexes2[16];
2215 ParallelMath::ScalarUInt16 endPoints[3][2][4];
2216
2217 for (int i = 0; i < 16; i++)
2218 {
2219 indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
2220 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
2221 indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
2222 }
2223
2224 for (int subset = 0; subset < 3; subset++)
2225 {
2226 for (int ep = 0; ep < 2; ep++)
2227 {
2228 for (int ch = 0; ch < 4; ch++)
2229 endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
2230 }
2231 }
2232
2233 int fixups[3] = { 0, 0, 0 };
2234
2235 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
2236 {
2237 bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
2238 bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
2239
2240 if (flipRGB)
2241 {
2242 uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
2243 for (int px = 0; px < 16; px++)
2244 indexes[px] = highIndex - indexes[px];
2245 }
2246
2247 if (flipAlpha)
2248 {
2249 uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
2250 for (int px = 0; px < 16; px++)
2251 indexes2[px] = highIndex - indexes2[px];
2252 }
2253
2254 if (indexSelector)
2255 Swap(flipRGB, flipAlpha);
2256
2257 if (flipRGB)
2258 {
2259 for (int ch = 0; ch < 3; ch++)
2260 Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
2261 }
2262 if (flipAlpha)
2263 Swap(endPoints[0][0][3], endPoints[0][1][3]);
2264
2265 }
2266 else
2267 {
2268 if (modeInfo.m_numSubsets == 2)
2269 fixups[1] = BC7Data::g_fixupIndexes2[partition];
2270 else if (modeInfo.m_numSubsets == 3)
2271 {
2272 fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
2273 fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
2274 }
2275
2276 bool flip[3] = { false, false, false };
2277 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2278 flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
2279
2280 if (flip[0] || flip[1] || flip[2])
2281 {
2282 uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
2283 for (int px = 0; px < 16; px++)
2284 {
2285 int subset = 0;
2286 if (modeInfo.m_numSubsets == 2)
2287 subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
2288 else if (modeInfo.m_numSubsets == 3)
2289 subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
2290
2291 if (flip[subset])
2292 indexes[px] = highIndex - indexes[px];
2293 }
2294
2295 int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
2296 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2297 {
2298 if (flip[subset])
2299 for (int ch = 0; ch < maxCH; ch++)
2300 Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
2301 }
2302 }
2303 }
2304
2305 pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
2306
2307 if (modeInfo.m_partitionBits)
2308 pv.Pack(partition, modeInfo.m_partitionBits);
2309
2310 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
2311 {
2312 ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
2313 pv.Pack(rotation, 2);
2314 }
2315
2316 if (modeInfo.m_hasIndexSelector)
2317 pv.Pack(indexSelector, 1);
2318
2319 // Encode RGB
2320 for (int ch = 0; ch < 3; ch++)
2321 {
2322 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2323 {
2324 for (int ep = 0; ep < 2; ep++)
2325 {
2326 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
2327 epPart >>= (8 - modeInfo.m_rgbBits);
2328
2329 pv.Pack(epPart, modeInfo.m_rgbBits);
2330 }
2331 }
2332 }
2333
2334 // Encode alpha
2335 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
2336 {
2337 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2338 {
2339 for (int ep = 0; ep < 2; ep++)
2340 {
2341 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
2342 epPart >>= (8 - modeInfo.m_alphaBits);
2343
2344 pv.Pack(epPart, modeInfo.m_alphaBits);
2345 }
2346 }
2347 }
2348
2349 // Encode parity bits
2350 if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
2351 {
2352 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2353 {
2354 ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
2355 epPart >>= (7 - modeInfo.m_rgbBits);
2356 epPart &= 1;
2357
2358 pv.Pack(epPart, 1);
2359 }
2360 }
2361 else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
2362 {
2363 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2364 {
2365 for (int ep = 0; ep < 2; ep++)
2366 {
2367 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
2368 epPart >>= (7 - modeInfo.m_rgbBits);
2369 epPart &= 1;
2370
2371 pv.Pack(epPart, 1);
2372 }
2373 }
2374 }
2375
2376 // Encode indexes
2377 for (int px = 0; px < 16; px++)
2378 {
2379 int bits = modeInfo.m_indexBits;
2380 if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
2381 bits--;
2382
2383 pv.Pack(indexes[px], bits);
2384 }
2385
2386 // Encode secondary indexes
2387 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
2388 {
2389 for (int px = 0; px < 16; px++)
2390 {
2391 int bits = modeInfo.m_alphaIndexBits;
2392 if (px == 0)
2393 bits--;
2394
2395 pv.Pack(indexes2[px], bits);
2396 }
2397 }
2398
2399 pv.Flush(packedBlocks);
2400
2401 packedBlocks += 16;
2402 }
2403}
2404
2405void cvtt::Internal::BC7Computer::UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
2406{
2407 UnpackingVector pv;
2408 pv.Init(packedBlock);
2409
2410 int mode = 8;
2411 for (int i = 0; i < 8; i++)
2412 {
2413 if (pv.Unpack(1) == 1)
2414 {
2415 mode = i;
2416 break;
2417 }
2418 }
2419
2420 if (mode > 7)
2421 {
2422 for (int px = 0; px < 16; px++)
2423 for (int ch = 0; ch < 4; ch++)
2424 output.m_pixels[px][ch] = 0;
2425
2426 return;
2427 }
2428
2429 const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
2430
2431 int partition = 0;
2432 if (modeInfo.m_partitionBits)
2433 partition = pv.Unpack(modeInfo.m_partitionBits);
2434
2435 int rotation = 0;
2436 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
2437 rotation = pv.Unpack(2);
2438
2439 int indexSelector = 0;
2440 if (modeInfo.m_hasIndexSelector)
2441 indexSelector = pv.Unpack(1);
2442
2443 // Resolve fixups
2444 int fixups[3] = { 0, 0, 0 };
2445
2446 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
2447 {
2448 if (modeInfo.m_numSubsets == 2)
2449 fixups[1] = BC7Data::g_fixupIndexes2[partition];
2450 else if (modeInfo.m_numSubsets == 3)
2451 {
2452 fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
2453 fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
2454 }
2455 }
2456
2457 int endPoints[3][2][4];
2458
2459 // Decode RGB
2460 for (int ch = 0; ch < 3; ch++)
2461 {
2462 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2463 {
2464 for (int ep = 0; ep < 2; ep++)
2465 endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
2466 }
2467 }
2468
2469 // Decode alpha
2470 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
2471 {
2472 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2473 {
2474 for (int ep = 0; ep < 2; ep++)
2475 endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
2476 }
2477 }
2478 else
2479 {
2480 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2481 {
2482 for (int ep = 0; ep < 2; ep++)
2483 endPoints[subset][ep][3] = 255;
2484 }
2485 }
2486
2487 int parityBits = 0;
2488
2489 // Decode parity bits
2490 if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
2491 {
2492 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2493 {
2494 int p = pv.Unpack(1);
2495
2496 for (int ep = 0; ep < 2; ep++)
2497 {
2498 for (int ch = 0; ch < 3; ch++)
2499 endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
2500
2501 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
2502 endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
2503 }
2504 }
2505
2506 parityBits = 1;
2507 }
2508 else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
2509 {
2510 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2511 {
2512 for (int ep = 0; ep < 2; ep++)
2513 {
2514 int p = pv.Unpack(1);
2515
2516 for (int ch = 0; ch < 3; ch++)
2517 endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
2518
2519 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
2520 endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
2521 }
2522 }
2523
2524 parityBits = 1;
2525 }
2526
2527 // Fill endpoint bits
2528 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
2529 {
2530 for (int ep = 0; ep < 2; ep++)
2531 {
2532 for (int ch = 0; ch < 3; ch++)
2533 endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
2534
2535 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
2536 endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
2537 }
2538 }
2539
2540 int indexes[16];
2541 int indexes2[16];
2542
2543 // Decode indexes
2544 for (int px = 0; px < 16; px++)
2545 {
2546 int bits = modeInfo.m_indexBits;
2547 if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
2548 bits--;
2549
2550 indexes[px] = pv.Unpack(bits);
2551 }
2552
2553 // Decode secondary indexes
2554 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
2555 {
2556 for (int px = 0; px < 16; px++)
2557 {
2558 int bits = modeInfo.m_alphaIndexBits;
2559 if (px == 0)
2560 bits--;
2561
2562 indexes2[px] = pv.Unpack(bits);
2563 }
2564 }
2565 else
2566 {
2567 for (int px = 0; px < 16; px++)
2568 indexes2[px] = 0;
2569 }
2570
2571 const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
2572 const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
2573
2574 // Decode each pixel
2575 for (int px = 0; px < 16; px++)
2576 {
2577 int rgbWeight = 0;
2578 int alphaWeight = 0;
2579
2580 int rgbIndex = indexes[px];
2581
2582 rgbWeight = rgbWeights[indexes[px]];
2583
2584 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
2585 alphaWeight = rgbWeight;
2586 else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
2587 alphaWeight = alphaWeights[indexes2[px]];
2588
2589 if (indexSelector == 1)
2590 {
2591 int temp = rgbWeight;
2592 rgbWeight = alphaWeight;
2593 alphaWeight = temp;
2594 }
2595
2596 int pixel[4] = { 0, 0, 0, 255 };
2597
2598 int subset = 0;
2599
2600 if (modeInfo.m_numSubsets == 2)
2601 subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
2602 else if (modeInfo.m_numSubsets == 3)
2603 subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
2604
2605 for (int ch = 0; ch < 3; ch++)
2606 pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
2607
2608 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
2609 pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
2610
2611 if (rotation != 0)
2612 {
2613 int ch = rotation - 1;
2614 int temp = pixel[ch];
2615 pixel[ch] = pixel[3];
2616 pixel[3] = temp;
2617 }
2618
2619 for (int ch = 0; ch < 4; ch++)
2620 output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
2621 }
2622}
2623
2624cvtt::ParallelMath::SInt16 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
2625{
2626 assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
2627 assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
2628
2629 // Expand to full range
2630 ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
2631 MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));
2632
2633 absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision);
2634
2635 MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem);
2636
2637 return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16);
2638}
2639
2640cvtt::ParallelMath::UInt15 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
2641{
2642 MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru);
2643 return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
2644}
2645
2646void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
2647{
2648 MSInt16 zero = ParallelMath::MakeSInt16(0);
2649
2650 ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
2651 MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
2652
2653 MSInt16 unq;
2654 MUInt15 absUnq;
2655
2656 if (precision >= 16)
2657 {
2658 unq = comp;
2659 absUnq = absComp;
2660 }
2661 else
2662 {
2663 MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
2664 ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
2665 ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
2666
2667 absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
2668 ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
2669 ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
2670
2671 unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
2672 }
2673
2674 outUnquantized = unq;
2675
2676 MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
2677
2678 outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
2679}
2680
2681void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
2682{
2683 MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
2684 if (precision < 15)
2685 {
2686 MUInt15 zero = ParallelMath::MakeUInt15(0);
2687 MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
2688
2689 ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
2690 ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
2691
2692 unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
2693
2694 ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0));
2695 ParallelMath::ConditionalSet(unq, isMax, ParallelMath::MakeUInt16(0xffff));
2696 }
2697
2698 outUnquantized = unq;
2699 outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6));
2700}
2701
2702void cvtt::Internal::BC6HComputer::QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
2703{
2704 MSInt16 unquantizedEP[2][3];
2705 MSInt16 finishedUnquantizedEP[2][3];
2706
2707 {
2708 ParallelMath::RoundUpForScope ru;
2709
2710 for (int epi = 0; epi < 2; epi++)
2711 {
2712 for (int ch = 0; ch < 3; ch++)
2713 {
2714 MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
2715 UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
2716 quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
2717 }
2718 }
2719 }
2720
2721 indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
2722 indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
2723
2724 MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
2725
2726 MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
2727
2728 ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
2729
2730 if (ParallelMath::AnySet(invert))
2731 {
2732 ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
2733
2734 indexSelector.ConditionalInvert(invert);
2735
2736 for (int ch = 0; ch < 3; ch++)
2737 {
2738 MAInt16 firstEP = quantizedEndPoints[0][ch];
2739 MAInt16 secondEP = quantizedEndPoints[1][ch];
2740
2741 quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
2742 quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
2743 }
2744 }
2745
2746 indexes[fixupIndex] = index;
2747}
2748
2749void cvtt::Internal::BC6HComputer::QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
2750{
2751 MUInt16 unquantizedEP[2][3];
2752 MUInt16 finishedUnquantizedEP[2][3];
2753
2754 {
2755 ParallelMath::RoundUpForScope ru;
2756
2757 for (int epi = 0; epi < 2; epi++)
2758 {
2759 for (int ch = 0; ch < 3; ch++)
2760 {
2761 MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
2762 UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
2763 quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
2764 }
2765 }
2766 }
2767
2768 indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
2769 indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
2770
2771 MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
2772
2773 MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
2774
2775 ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
2776
2777 if (ParallelMath::AnySet(invert))
2778 {
2779 ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
2780
2781 indexSelector.ConditionalInvert(invert);
2782
2783 for (int ch = 0; ch < 3; ch++)
2784 {
2785 MAInt16 firstEP = quantizedEndPoints[0][ch];
2786 MAInt16 secondEP = quantizedEndPoints[1][ch];
2787
2788 quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
2789 quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
2790 }
2791 }
2792
2793 indexes[fixupIndex] = index;
2794}
2795
2796void cvtt::Internal::BC6HComputer::EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
2797{
2798 ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
2799
2800 MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
2801
2802 for (int ch = 0; ch < 3; ch++)
2803 {
2804 outEncodedEPs[0][0][ch] = ep0[0][ch];
2805 outEncodedEPs[0][1][ch] = ep0[1][ch];
2806 outEncodedEPs[1][0][ch] = ep1[0][ch];
2807 outEncodedEPs[1][1][ch] = ep1[1][ch];
2808
2809 if (isTransformed)
2810 {
2811 for (int subset = 0; subset < 2; subset++)
2812 {
2813 for (int epi = 0; epi < 2; epi++)
2814 {
2815 if (epi == 0 && subset == 0)
2816 continue;
2817
2818 MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask);
2819
2820 MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]);
2821
2822 outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
2823
2824 MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask);
2825 allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
2826 }
2827 }
2828 }
2829
2830 if (!ParallelMath::AnySet(allLegal))
2831 break;
2832 }
2833
2834 outIsLegal = allLegal;
2835}
2836
2837void cvtt::Internal::BC6HComputer::EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
2838{
2839 ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
2840
2841 MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
2842
2843 for (int ch = 0; ch < 3; ch++)
2844 {
2845 outEncodedEPs[0][ch] = ep[0][ch];
2846 outEncodedEPs[1][ch] = ep[1][ch];
2847
2848 if (isTransformed)
2849 {
2850 MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask);
2851
2852 MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]);
2853
2854 outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
2855
2856 MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask);
2857 allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
2858 }
2859 }
2860
2861 outIsLegal = allLegal;
2862}
2863
2864void cvtt::Internal::BC6HComputer::Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
2865{
2866 if (numTweakRounds < 1)
2867 numTweakRounds = 1;
2868 else if (numTweakRounds > MaxTweakRounds)
2869 numTweakRounds = MaxTweakRounds;
2870
2871 if (numRefineRounds < 1)
2872 numRefineRounds = 1;
2873 else if (numRefineRounds > MaxRefineRounds)
2874 numRefineRounds = MaxRefineRounds;
2875
2876 bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
2877 float channelWeightsSq[3];
2878
2879 ParallelMath::RoundTowardNearestForScope rtn;
2880
2881 MSInt16 pixels[16][3];
2882 MFloat floatPixels2CL[16][3];
2883 MFloat floatPixelsLinearWeighted[16][3];
2884
2885 MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
2886
2887 for (int ch = 0; ch < 3; ch++)
2888 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
2889
2890 for (int px = 0; px < 16; px++)
2891 {
2892 for (int ch = 0; ch < 3; ch++)
2893 {
2894 MSInt16 pixelValue;
2895 ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
2896
2897 // Convert from sign+magnitude to 2CL
2898 if (isSigned)
2899 {
2900 ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
2901 MSInt16 magnitude = (pixelValue & low15Bits);
2902 ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
2903 pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
2904 }
2905 else
2906 pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
2907
2908 pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
2909
2910 pixels[px][ch] = pixelValue;
2911 floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
2912 floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
2913 }
2914 }
2915
2916 MFloat preWeightedPixels[16][3];
2917
2918 BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
2919
2920 MAInt16 bestEndPoints[2][2][3];
2921 MUInt15 bestIndexes[16];
2922 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
2923 MUInt15 bestMode = ParallelMath::MakeUInt15(0);
2924 MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
2925
2926 for (int px = 0; px < 16; px++)
2927 bestIndexes[px] = ParallelMath::MakeUInt15(0);
2928
2929 for (int subset = 0; subset < 2; subset++)
2930 for (int epi = 0; epi < 2; epi++)
2931 for (int ch = 0; ch < 3; ch++)
2932 bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
2933
2934 UnfinishedEndpoints<3> partitionedUFEP[32][2];
2935 UnfinishedEndpoints<3> singleUFEP;
2936
2937 // Generate UFEP for partitions
2938 for (int p = 0; p < 32; p++)
2939 {
2940 int partitionMask = BC7Data::g_partitionMap[p];
2941
2942 EndpointSelector<3, 8> epSelectors[2];
2943
2944 for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
2945 {
2946 for (int px = 0; px < 16; px++)
2947 {
2948 int subset = (partitionMask >> px) & 1;
2949 epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
2950 }
2951
2952 for (int subset = 0; subset < 2; subset++)
2953 epSelectors[subset].FinishPass(pass);
2954 }
2955
2956 for (int subset = 0; subset < 2; subset++)
2957 partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
2958 }
2959
2960 // Generate UFEP for single
2961 {
2962 EndpointSelector<3, 8> epSelector;
2963
2964 for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
2965 {
2966 for (int px = 0; px < 16; px++)
2967 epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
2968
2969 epSelector.FinishPass(pass);
2970 }
2971
2972 singleUFEP = epSelector.GetEndpoints(channelWeights);
2973 }
2974
2975 for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
2976 {
2977 bool partitioned = (partitionedInt == 1);
2978
2979 for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
2980 {
2981 if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
2982 continue;
2983
2984 int numPartitions = partitioned ? 32 : 1;
2985 int numSubsets = partitioned ? 2 : 1;
2986 int indexBits = partitioned ? 3 : 4;
2987 int indexRange = (1 << indexBits);
2988
2989 for (int p = 0; p < numPartitions; p++)
2990 {
2991 int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
2992
2993 const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
2994
2995 MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
2996 MUInt15 metaIndexes[MaxMetaRounds][16];
2997 MFloat metaError[MaxMetaRounds][2];
2998
2999 bool roundValid[MaxMetaRounds][2];
3000
3001 for (int r = 0; r < MaxMetaRounds; r++)
3002 for (int subset = 0; subset < 2; subset++)
3003 roundValid[r][subset] = true;
3004
3005 for (int subset = 0; subset < numSubsets; subset++)
3006 {
3007 for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
3008 {
3009 EndpointRefiner<3> refiners[2];
3010
3011 bool abortRemainingRefines = false;
3012 for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
3013 {
3014 int metaRound = tweak * MaxRefineRounds + refinePass;
3015
3016 if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
3017 abortRemainingRefines = true;
3018
3019 if (abortRemainingRefines)
3020 {
3021 roundValid[metaRound][subset] = false;
3022 continue;
3023 }
3024
3025 MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
3026 MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
3027
3028 MSInt16 endPointsColorSpace[2][3];
3029
3030 if (refinePass == 0)
3031 {
3032 UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
3033
3034 if (isSigned)
3035 ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
3036 else
3037 ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
3038 }
3039 else
3040 refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
3041
3042 refiners[subset].Init(indexRange, channelWeights);
3043
3044 int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
3045
3046 IndexSelectorHDR<3> indexSelector;
3047 if (isSigned)
3048 QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
3049 else
3050 QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
3051
3052 if (metaRound > 0)
3053 {
3054 ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
3055
3056 for (int prevRound = 0; prevRound < metaRound; prevRound++)
3057 {
3058 MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
3059
3060 ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
3061
3062 for (int epi = 0; epi < 2; epi++)
3063 for (int ch = 0; ch < 3; ch++)
3064 same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
3065
3066 anySame = (anySame | same);
3067 if (ParallelMath::AllSet(anySame))
3068 break;
3069 }
3070
3071 if (ParallelMath::AllSet(anySame))
3072 {
3073 roundValid[metaRound][subset] = false;
3074 continue;
3075 }
3076 }
3077
3078 MFloat subsetError = ParallelMath::MakeFloatZero();
3079
3080 {
3081 for (int px = 0; px < 16; px++)
3082 {
3083 if (subset != ((partitionMask >> px) & 1))
3084 continue;
3085
3086 MUInt15 index;
3087 if (px == fixupIndex)
3088 index = mrIndexes[px];
3089 else
3090 {
3091 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
3092 mrIndexes[px] = index;
3093 }
3094
3095 MSInt16 reconstructed[3];
3096 if (isSigned)
3097 indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
3098 else
3099 indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
3100
3101 subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
3102
3103 if (refinePass != numRefineRounds - 1)
3104 refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
3105 }
3106 }
3107
3108 metaError[metaRound][subset] = subsetError;
3109 }
3110 }
3111 }
3112
3113 // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
3114 int numMeta1 = partitioned ? MaxMetaRounds : 1;
3115 for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
3116 {
3117 if (!roundValid[meta0][0])
3118 continue;
3119
3120 for (int meta1 = 0; meta1 < numMeta1; meta1++)
3121 {
3122 MFloat combinedError = metaError[meta0][0];
3123 if (partitioned)
3124 {
3125 if (!roundValid[meta1][1])
3126 continue;
3127
3128 combinedError = combinedError + metaError[meta1][1];
3129 }
3130
3131 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
3132 if (!ParallelMath::AnySet(errorBetter))
3133 continue;
3134
3135 ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
3136
3137 // Figure out if this is encodable
3138 for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
3139 {
3140 const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
3141
3142 if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
3143 continue;
3144
3145 MAInt16 encodedEPs[2][2][3];
3146 ParallelMath::Int16CompFlag isLegal;
3147 if (partitioned)
3148 EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
3149 else
3150 EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
3151
3152 ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
3153 if (!ParallelMath::AnySet(isLegalAndBetter))
3154 continue;
3155
3156 ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
3157
3158 ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
3159 ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
3160 ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
3161
3162 for (int subset = 0; subset < numSubsets; subset++)
3163 {
3164 for (int epi = 0; epi < 2; epi++)
3165 {
3166 for (int ch = 0; ch < 3; ch++)
3167 ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
3168 }
3169 }
3170
3171 for (int px = 0; px < 16; px++)
3172 {
3173 int subset = ((partitionMask >> px) & 1);
3174 if (subset == 0)
3175 ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
3176 else
3177 ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
3178 }
3179
3180 needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
3181 if (!ParallelMath::AnySet(needsCommit))
3182 break;
3183 }
3184 }
3185 }
3186 }
3187 }
3188 }
3189
3190 // At this point, everything should be set
3191 for (int block = 0; block < ParallelMath::ParallelSize; block++)
3192 {
3193 ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
3194 ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
3195 int32_t eps[2][2][3];
3196 ParallelMath::ScalarUInt16 indexes[16];
3197
3198 const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
3199
3200 const BC6HData::ModeDescriptor *desc = BC6HData::g_modeDescriptors[mode];
3201
3202 const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
3203
3204 for (int subset = 0; subset < 2; subset++)
3205 {
3206 for (int epi = 0; epi < 2; epi++)
3207 {
3208 for (int ch = 0; ch < 3; ch++)
3209 eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
3210 }
3211 }
3212
3213 for (int px = 0; px < 16; px++)
3214 indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
3215
3216 uint16_t modeID = modeInfo.m_modeID;
3217
3218 PackingVector pv;
3219 pv.Init();
3220
3221 for (size_t i = 0; i < headerBits; i++) {
3222 int32_t codedValue = 0;
3223 switch (desc[i].m_eField) {
3224 case BC6HData::M:
3225 codedValue = modeID;
3226 break;
3227 case BC6HData::D:
3228 codedValue = partition;
3229 break;
3230 case BC6HData::RW:
3231 codedValue = eps[0][0][0];
3232 break;
3233 case BC6HData::RX:
3234 codedValue = eps[0][1][0];
3235 break;
3236 case BC6HData::RY:
3237 codedValue = eps[1][0][0];
3238 break;
3239 case BC6HData::RZ:
3240 codedValue = eps[1][1][0];
3241 break;
3242 case BC6HData::GW:
3243 codedValue = eps[0][0][1];
3244 break;
3245 case BC6HData::GX:
3246 codedValue = eps[0][1][1];
3247 break;
3248 case BC6HData::GY:
3249 codedValue = eps[1][0][1];
3250 break;
3251 case BC6HData::GZ:
3252 codedValue = eps[1][1][1];
3253 break;
3254 case BC6HData::BW:
3255 codedValue = eps[0][0][2];
3256 break;
3257 case BC6HData::BX:
3258 codedValue = eps[0][1][2];
3259 break;
3260 case BC6HData::BY:
3261 codedValue = eps[1][0][2];
3262 break;
3263 case BC6HData::BZ:
3264 codedValue = eps[1][1][2];
3265 break;
3266 default:
3267 assert(false);
3268 break;
3269 }
3270 pv.Pack(static_cast<uint16_t>((codedValue >> desc[i].m_uBit) & 1), 1);
3271 }
3272
3273 int fixupIndex1 = 0;
3274 int indexBits = 4;
3275 if (modeInfo.m_partitioned)
3276 {
3277 fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
3278 indexBits = 3;
3279 }
3280
3281 for (int px = 0; px < 16; px++)
3282 {
3283 ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
3284 if (px == 0 || px == fixupIndex1)
3285 pv.Pack(index, indexBits - 1);
3286 else
3287 pv.Pack(index, indexBits);
3288 }
3289
3290 pv.Flush(packedBlocks + 16 * block);
3291 }
3292}
3293
3294void cvtt::Internal::BC6HComputer::SignExtendSingle(int &v, int bits)
3295{
3296 if (v & (1 << (bits - 1)))
3297 v |= -(1 << bits);
3298}
3299
3300void cvtt::Internal::BC6HComputer::UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
3301{
3302 UnpackingVector pv;
3303 pv.Init(pBC);
3304
3305 int numModeBits = 2;
3306 int modeBits = pv.Unpack(2);
3307 if (modeBits != 0 && modeBits != 1)
3308 {
3309 modeBits |= pv.Unpack(3) << 2;
3310 numModeBits += 3;
3311 }
3312
3313 int mode = -1;
3314 for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
3315 {
3316 if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
3317 {
3318 mode = possibleMode;
3319 break;
3320 }
3321 }
3322
3323 if (mode < 0)
3324 {
3325 for (int px = 0; px < 16; px++)
3326 {
3327 for (int ch = 0; ch < 3; ch++)
3328 output.m_pixels[px][ch] = 0;
3329 output.m_pixels[px][3] = 0x3c00; // 1.0
3330 }
3331 return;
3332 }
3333
3334 const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
3335 const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
3336 const BC6HData::ModeDescriptor *desc = BC6HData::g_modeDescriptors[mode];
3337
3338 int32_t partition = 0;
3339 int32_t eps[2][2][3];
3340
3341 for (int subset = 0; subset < 2; subset++)
3342 for (int epi = 0; epi < 2; epi++)
3343 for (int ch = 0; ch < 3; ch++)
3344 eps[subset][epi][ch] = 0;
3345
3346 for (size_t i = numModeBits; i < headerBits; i++) {
3347 int32_t *pCodedValue = NULL;
3348
3349 switch (desc[i].m_eField) {
3350 case BC6HData::D:
3351 pCodedValue = &partition;
3352 break;
3353 case BC6HData::RW:
3354 pCodedValue = &eps[0][0][0];
3355 break;
3356 case BC6HData::RX:
3357 pCodedValue = &eps[0][1][0];
3358 break;
3359 case BC6HData::RY:
3360 pCodedValue = &eps[1][0][0];
3361 break;
3362 case BC6HData::RZ:
3363 pCodedValue = &eps[1][1][0];
3364 break;
3365 case BC6HData::GW:
3366 pCodedValue = &eps[0][0][1];
3367 break;
3368 case BC6HData::GX:
3369 pCodedValue = &eps[0][1][1];
3370 break;
3371 case BC6HData::GY:
3372 pCodedValue = &eps[1][0][1];
3373 break;
3374 case BC6HData::GZ:
3375 pCodedValue = &eps[1][1][1];
3376 break;
3377 case BC6HData::BW:
3378 pCodedValue = &eps[0][0][2];
3379 break;
3380 case BC6HData::BX:
3381 pCodedValue = &eps[0][1][2];
3382 break;
3383 case BC6HData::BY:
3384 pCodedValue = &eps[1][0][2];
3385 break;
3386 case BC6HData::BZ:
3387 pCodedValue = &eps[1][1][2];
3388 break;
3389 default:
3390 assert(false);
3391 break;
3392 }
3393
3394 (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit;
3395 }
3396
3397 uint16_t modeID = modeInfo.m_modeID;
3398
3399 int fixupIndex1 = 0;
3400 int indexBits = 4;
3401 int numSubsets = 1;
3402 if (modeInfo.m_partitioned)
3403 {
3404 fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
3405 indexBits = 3;
3406 numSubsets = 2;
3407 }
3408
3409 int indexes[16];
3410 for (int px = 0; px < 16; px++)
3411 {
3412 if (px == 0 || px == fixupIndex1)
3413 indexes[px] = pv.Unpack(indexBits - 1);
3414 else
3415 indexes[px] = pv.Unpack(indexBits);
3416 }
3417
3418 if (modeInfo.m_partitioned)
3419 {
3420 for (int ch = 0; ch < 3; ch++)
3421 {
3422 if (isSigned)
3423 SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
3424 if (modeInfo.m_transformed || isSigned)
3425 {
3426 SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
3427 SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
3428 SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
3429 }
3430 }
3431 }
3432 else
3433 {
3434 for (int ch = 0; ch < 3; ch++)
3435 {
3436 if (isSigned)
3437 SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
3438 if (modeInfo.m_transformed || isSigned)
3439 SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
3440 }
3441 }
3442
3443 int aPrec = modeInfo.m_aPrec;
3444
3445 if (modeInfo.m_transformed)
3446 {
3447 for (int ch = 0; ch < 3; ch++)
3448 {
3449 int wrapMask = (1 << aPrec) - 1;
3450
3451 eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
3452 if (isSigned)
3453 SignExtendSingle(eps[0][1][ch], aPrec);
3454
3455 if (modeInfo.m_partitioned)
3456 {
3457 eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
3458 eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
3459
3460 if (isSigned)
3461 {
3462 SignExtendSingle(eps[1][0][ch], aPrec);
3463 SignExtendSingle(eps[1][1][ch], aPrec);
3464 }
3465 }
3466 }
3467 }
3468
3469 // Unquantize endpoints
3470 for (int subset = 0; subset < numSubsets; subset++)
3471 {
3472 for (int epi = 0; epi < 2; epi++)
3473 {
3474 for (int ch = 0; ch < 3; ch++)
3475 {
3476 int &v = eps[subset][epi][ch];
3477
3478 if (isSigned)
3479 {
3480 if (aPrec >= 16)
3481 {
3482 // Nothing
3483 }
3484 else
3485 {
3486 bool s = false;
3487 int comp = v;
3488 if (v < 0)
3489 {
3490 s = true;
3491 comp = -comp;
3492 }
3493
3494 int unq = 0;
3495 if (comp == 0)
3496 unq = 0;
3497 else if (comp >= ((1 << (aPrec - 1)) - 1))
3498 unq = 0x7fff;
3499 else
3500 unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
3501
3502 if (s)
3503 unq = -unq;
3504
3505 v = unq;
3506 }
3507 }
3508 else
3509 {
3510 if (aPrec >= 15)
3511 {
3512 // Nothing
3513 }
3514 else if (v == 0)
3515 {
3516 // Nothing
3517 }
3518 else if (v == ((1 << aPrec) - 1))
3519 v = 0xffff;
3520 else
3521 v = ((v << 16) + 0x8000) >> aPrec;
3522 }
3523 }
3524 }
3525 }
3526
3527 const int *weights = BC7Data::g_weightTables[indexBits];
3528
3529 for (int px = 0; px < 16; px++)
3530 {
3531 int subset = 0;
3532 if (modeInfo.m_partitioned)
3533 subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
3534
3535 int w = weights[indexes[px]];
3536 for (int ch = 0; ch < 3; ch++)
3537 {
3538 int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
3539
3540 if (isSigned)
3541 {
3542 if (comp < 0)
3543 comp = -(((-comp) * 31) >> 5);
3544 else
3545 comp = (comp * 31) >> 5;
3546
3547 int s = 0;
3548 if (comp < 0)
3549 {
3550 s = 0x8000;
3551 comp = -comp;
3552 }
3553
3554 output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
3555 }
3556 else
3557 {
3558 comp = (comp * 31) >> 6;
3559 output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
3560 }
3561 }
3562 output.m_pixels[px][3] = 0x3c00; // 1.0
3563 }
3564}
3565
3566void cvtt::Kernels::ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality)
3567{
3568 static const int kMaxQuality = 100;
3569
3570 if (quality < 1)
3571 quality = 1;
3572 else if (quality > kMaxQuality)
3573 quality = kMaxQuality;
3574
3575 const int numRGBModes = cvtt::Tables::BC7Prio::g_bc7NumPrioCodesRGB * quality / kMaxQuality;
3576 const int numRGBAModes = cvtt::Tables::BC7Prio::g_bc7NumPrioCodesRGBA * quality / kMaxQuality;
3577
3578 const uint16_t *prioLists[] = { cvtt::Tables::BC7Prio::g_bc7PrioCodesRGB, cvtt::Tables::BC7Prio::g_bc7PrioCodesRGBA };
3579 const int prioListSizes[] = { numRGBModes, numRGBAModes };
3580
3581 BC7FineTuningParams ftParams;
3582 memset(&ftParams, 0, sizeof(ftParams));
3583
3584 for (int listIndex = 0; listIndex < 2; listIndex++)
3585 {
3586 int prioListSize = prioListSizes[listIndex];
3587 const uint16_t *prioList = prioLists[listIndex];
3588
3589 for (int prioIndex = 0; prioIndex < prioListSize; prioIndex++)
3590 {
3591 const uint16_t packedMode = prioList[prioIndex];
3592
3593 uint8_t seedPoints = static_cast<uint8_t>(cvtt::Tables::BC7Prio::UnpackSeedPointCount(packedMode));
3594 int mode = cvtt::Tables::BC7Prio::UnpackMode(packedMode);
3595
3596 switch (mode)
3597 {
3598 case 0:
3599 ftParams.mode0SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
3600 break;
3601 case 1:
3602 ftParams.mode1SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
3603 break;
3604 case 2:
3605 ftParams.mode2SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
3606 break;
3607 case 3:
3608 ftParams.mode3SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
3609 break;
3610 case 4:
3611 ftParams.mode4SP[cvtt::Tables::BC7Prio::UnpackRotation(packedMode)][cvtt::Tables::BC7Prio::UnpackIndexSelector(packedMode)] = seedPoints;
3612 break;
3613 case 5:
3614 ftParams.mode5SP[cvtt::Tables::BC7Prio::UnpackRotation(packedMode)] = seedPoints;
3615 break;
3616 case 6:
3617 ftParams.mode6SP = seedPoints;
3618 break;
3619 case 7:
3620 ftParams.mode7SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
3621 break;
3622 }
3623 }
3624 }
3625
3626 ConfigureBC7EncodingPlanFromFineTuningParams(encodingPlan, ftParams);
3627}
3628
3629// Generates a BC7 encoding plan from fine-tuning parameters.
3630bool cvtt::Kernels::ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params)
3631{
3632 memset(&encodingPlan, 0, sizeof(encodingPlan));
3633
3634 // Mode 0
3635 for (int partition = 0; partition < 16; partition++)
3636 {
3637 uint8_t sp = params.mode0SP[partition];
3638 if (sp == 0)
3639 continue;
3640
3641 encodingPlan.mode0PartitionEnabled |= static_cast<uint16_t>(1) << partition;
3642
3643 for (int subset = 0; subset < 3; subset++)
3644 {
3645 int shape = cvtt::Internal::BC7Data::g_shapes3[partition][subset];
3646 encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
3647 }
3648 }
3649
3650 // Mode 1
3651 for (int partition = 0; partition < 64; partition++)
3652 {
3653 uint8_t sp = params.mode1SP[partition];
3654 if (sp == 0)
3655 continue;
3656
3657 encodingPlan.mode1PartitionEnabled |= static_cast<uint64_t>(1) << partition;
3658
3659 for (int subset = 0; subset < 2; subset++)
3660 {
3661 int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
3662 encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
3663 }
3664 }
3665
3666 // Mode 2
3667 for (int partition = 0; partition < 64; partition++)
3668 {
3669 uint8_t sp = params.mode2SP[partition];
3670 if (sp == 0)
3671 continue;
3672
3673 encodingPlan.mode2PartitionEnabled |= static_cast<uint64_t>(1) << partition;
3674
3675 for (int subset = 0; subset < 3; subset++)
3676 {
3677 int shape = cvtt::Internal::BC7Data::g_shapes3[partition][subset];
3678 encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
3679 }
3680 }
3681
3682 // Mode 3
3683 for (int partition = 0; partition < 64; partition++)
3684 {
3685 uint8_t sp = params.mode3SP[partition];
3686 if (sp == 0)
3687 continue;
3688
3689 encodingPlan.mode3PartitionEnabled |= static_cast<uint64_t>(1) << partition;
3690
3691 for (int subset = 0; subset < 2; subset++)
3692 {
3693 int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
3694 encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
3695 }
3696 }
3697
3698 // Mode 4
3699 for (int rotation = 0; rotation < 4; rotation++)
3700 {
3701 for (int indexMode = 0; indexMode < 2; indexMode++)
3702 encodingPlan.mode4SP[rotation][indexMode] = params.mode4SP[rotation][indexMode];
3703 }
3704
3705 // Mode 5
3706 for (int rotation = 0; rotation < 4; rotation++)
3707 encodingPlan.mode5SP[rotation] = params.mode5SP[rotation];
3708
3709 // Mode 6
3710 {
3711 uint8_t sp = params.mode6SP;
3712 if (sp != 0)
3713 {
3714 encodingPlan.mode6Enabled = true;
3715
3716 int shape = cvtt::Internal::BC7Data::g_shapes1[0][0];
3717 encodingPlan.seedPointsForShapeRGBA[shape] = std::max(encodingPlan.seedPointsForShapeRGBA[shape], sp);
3718 }
3719 }
3720
3721 // Mode 7
3722 for (int partition = 0; partition < 64; partition++)
3723 {
3724 uint8_t sp = params.mode7SP[partition];
3725 if (sp == 0)
3726 continue;
3727
3728 encodingPlan.mode7RGBAPartitionEnabled |= static_cast<uint64_t>(1) << partition;
3729
3730 for (int subset = 0; subset < 2; subset++)
3731 {
3732 int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
3733 encodingPlan.seedPointsForShapeRGBA[shape] = std::max(encodingPlan.seedPointsForShapeRGBA[shape], sp);
3734 }
3735 }
3736
3737 for (int i = 0; i < BC7EncodingPlan::kNumRGBShapes; i++)
3738 {
3739 if (encodingPlan.seedPointsForShapeRGB[i] > 0)
3740 {
3741 encodingPlan.rgbShapeList[encodingPlan.rgbNumShapesToEvaluate] = i;
3742 encodingPlan.rgbNumShapesToEvaluate++;
3743 }
3744 }
3745
3746 for (int i = 0; i < BC7EncodingPlan::kNumRGBAShapes; i++)
3747 {
3748 if (encodingPlan.seedPointsForShapeRGBA[i] > 0)
3749 {
3750 encodingPlan.rgbaShapeList[encodingPlan.rgbaNumShapesToEvaluate] = i;
3751 encodingPlan.rgbaNumShapesToEvaluate++;
3752 }
3753 }
3754
3755 encodingPlan.mode7RGBPartitionEnabled = (encodingPlan.mode7RGBAPartitionEnabled & ~encodingPlan.mode3PartitionEnabled);
3756
3757 return true;
3758}
3759
3760#endif
3761