| 1 | // Copyright 2009-2021 Intel Corporation |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| 4 | #pragma once |
| 5 | |
| 6 | #include "../builders/primrefgen.h" |
| 7 | #include "../builders/heuristic_spatial.h" |
| 8 | #include "../builders/splitter.h" |
| 9 | |
| 10 | #include "../../common/algorithms/parallel_for_for.h" |
| 11 | #include "../../common/algorithms/parallel_for_for_prefix_sum.h" |
| 12 | |
| 13 | #define DBG_PRESPLIT(x) |
| 14 | #define CHECK_PRESPLIT(x) |
| 15 | |
| 16 | #define GRID_SIZE 1024 |
| 17 | #define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5 |
| 18 | #define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG) |
| 19 | #define PRIORITY_CUTOFF_THRESHOLD 1.0f |
| 20 | #define PRIORITY_SPLIT_POS_WEIGHT 1.5f |
| 21 | |
| 22 | namespace embree |
| 23 | { |
| 24 | namespace isa |
| 25 | { |
| 26 | |
| 27 | struct PresplitItem |
| 28 | { |
| 29 | union { |
| 30 | float priority; |
| 31 | unsigned int data; |
| 32 | }; |
| 33 | unsigned int index; |
| 34 | |
| 35 | __forceinline operator unsigned() const |
| 36 | { |
| 37 | return reinterpret_cast<const unsigned&>(priority); |
| 38 | } |
| 39 | __forceinline bool operator < (const PresplitItem& item) const |
| 40 | { |
| 41 | return (priority < item.priority); |
| 42 | } |
| 43 | |
| 44 | template<typename Mesh> |
| 45 | __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc) |
| 46 | { |
| 47 | const unsigned int geomID = ref.geomID(); |
| 48 | const unsigned int primID = ref.primID(); |
| 49 | const float area_aabb = area(ref.bounds()); |
| 50 | const float area_prim = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID); |
| 51 | const unsigned int diff = 31 - lzcnt(mc.x^mc.y); |
| 52 | assert(area_prim <= area_aabb); |
| 53 | //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f); |
| 54 | const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) )); |
| 55 | assert(priority >= 0.0f && priority < FLT_LARGE); |
| 56 | return priority; |
| 57 | } |
| 58 | |
| 59 | |
| 60 | }; |
| 61 | |
| 62 | inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { |
| 63 | return cout << "index " << item.index << " priority " << item.priority; |
| 64 | }; |
| 65 | |
| 66 | template<typename SplitterFactory> |
| 67 | void splitPrimitive(SplitterFactory &Splitter, |
| 68 | const PrimRef &prim, |
| 69 | const unsigned int geomID, |
| 70 | const unsigned int primID, |
| 71 | const unsigned int split_level, |
| 72 | const Vec3fa &grid_base, |
| 73 | const float grid_scale, |
| 74 | const float grid_extend, |
| 75 | PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE], |
| 76 | unsigned int& numSubPrims) |
| 77 | { |
| 78 | assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); |
| 79 | if (split_level == 0) |
| 80 | { |
| 81 | assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); |
| 82 | subPrims[numSubPrims++] = prim; |
| 83 | } |
| 84 | else |
| 85 | { |
| 86 | const Vec3fa lower = prim.lower; |
| 87 | const Vec3fa upper = prim.upper; |
| 88 | const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); |
| 89 | const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); |
| 90 | Vec3ia ilower(floor(glower)); |
| 91 | Vec3ia iupper(floor(gupper)); |
| 92 | |
| 93 | /* this ignores dimensions that are empty */ |
| 94 | iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))); |
| 95 | |
| 96 | /* compute a morton code for the lower and upper grid coordinates. */ |
| 97 | const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); |
| 98 | const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); |
| 99 | |
| 100 | /* if all bits are equal then we cannot split */ |
| 101 | if(unlikely(lower_code == upper_code)) |
| 102 | { |
| 103 | assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); |
| 104 | subPrims[numSubPrims++] = prim; |
| 105 | return; |
| 106 | } |
| 107 | |
| 108 | /* compute octree level and dimension to perform the split in */ |
| 109 | const unsigned int diff = 31 - lzcnt(lower_code^upper_code); |
| 110 | const unsigned int level = diff / 3; |
| 111 | const unsigned int dim = diff % 3; |
| 112 | |
| 113 | /* now we compute the grid position of the split */ |
| 114 | const unsigned int isplit = iupper[dim] & ~((1<<level)-1); |
| 115 | |
| 116 | /* compute world space position of split */ |
| 117 | const float inv_grid_size = 1.0f / GRID_SIZE; |
| 118 | const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend; |
| 119 | |
| 120 | assert(prim.lower[dim] <= fsplit && |
| 121 | prim.upper[dim] >= fsplit); |
| 122 | |
| 123 | /* split primitive */ |
| 124 | const auto splitter = Splitter(prim); |
| 125 | BBox3fa left,right; |
| 126 | splitter(prim.bounds(),dim,fsplit,left,right); |
| 127 | assert(!left.empty()); |
| 128 | assert(!right.empty()); |
| 129 | |
| 130 | |
| 131 | splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); |
| 132 | splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); |
| 133 | } |
| 134 | } |
| 135 | |
| 136 | |
| 137 | template<typename Mesh, typename SplitterFactory> |
| 138 | PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) |
| 139 | { |
| 140 | ParallelPrefixSumState<PrimInfo> pstate; |
| 141 | |
| 142 | /* first try */ |
| 143 | progressMonitor(0); |
| 144 | PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { |
| 145 | return geometry->createPrimRefArray(prims,r,r.begin(),geomID); |
| 146 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
| 147 | |
| 148 | /* if we need to filter out geometry, run again */ |
| 149 | if (pinfo.size() != numPrimRefs) |
| 150 | { |
| 151 | progressMonitor(0); |
| 152 | pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { |
| 153 | return geometry->createPrimRefArray(prims,r,base.size(),geomID); |
| 154 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
| 155 | } |
| 156 | return pinfo; |
| 157 | } |
| 158 | |
| 159 | __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref) |
| 160 | { |
| 161 | const Vec3fa lower = ref.lower; |
| 162 | const Vec3fa upper = ref.upper; |
| 163 | const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); |
| 164 | const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); |
| 165 | Vec3ia ilower(floor(glower)); |
| 166 | Vec3ia iupper(floor(gupper)); |
| 167 | |
| 168 | /* this ignores dimensions that are empty */ |
| 169 | iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); |
| 170 | |
| 171 | /* compute a morton code for the lower and upper grid coordinates. */ |
| 172 | const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); |
| 173 | const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); |
| 174 | return Vec2i(lower_code,upper_code); |
| 175 | } |
| 176 | |
| 177 | template<typename Mesh, typename SplitterFactory> |
| 178 | PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) |
| 179 | { |
| 180 | static const size_t MIN_STEP_SIZE = 128; |
| 181 | |
| 182 | ParallelForForPrefixSumState<PrimInfo> pstate; |
| 183 | Scene::Iterator2 iter(scene,types,mblur); |
| 184 | |
| 185 | /* first try */ |
| 186 | progressMonitor(0); |
| 187 | pstate.init(iter,size_t(1024)); |
| 188 | PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { |
| 189 | return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); |
| 190 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
| 191 | |
| 192 | /* if we need to filter out geometry, run again */ |
| 193 | if (pinfo.size() != numPrimRefs) |
| 194 | { |
| 195 | progressMonitor(0); |
| 196 | pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { |
| 197 | return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); |
| 198 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
| 199 | } |
| 200 | |
| 201 | /* use correct number of primitives */ |
| 202 | size_t numPrimitives = pinfo.size(); |
| 203 | const size_t alloc_numPrimitives = prims.size(); |
| 204 | const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; |
| 205 | |
| 206 | /* set up primitive splitter */ |
| 207 | SplitterFactory Splitter(scene); |
| 208 | |
| 209 | |
| 210 | DBG_PRESPLIT( |
| 211 | const size_t org_numPrimitives = pinfo.size(); |
| 212 | PRINT(numPrimitives); |
| 213 | PRINT(alloc_numPrimitives); |
| 214 | PRINT(numSplitPrimitivesBudget); |
| 215 | ); |
| 216 | |
| 217 | /* allocate double buffer presplit items */ |
| 218 | const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives; |
| 219 | PresplitItem *presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); |
| 220 | PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); |
| 221 | |
| 222 | /* compute grid */ |
| 223 | const Vec3fa grid_base = pinfo.geomBounds.lower; |
| 224 | const Vec3fa grid_diag = pinfo.geomBounds.size(); |
| 225 | const float grid_extend = max(grid_diag.x,max(grid_diag.y,grid_diag.z)); |
| 226 | const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend; |
| 227 | |
| 228 | /* init presplit items and get total sum */ |
| 229 | const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float { |
| 230 | float sum = 0.0f; |
| 231 | for (size_t i=r.begin(); i<r.end(); i++) |
| 232 | { |
| 233 | presplitItem[i].index = (unsigned int)i; |
| 234 | const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]); |
| 235 | /* if all bits are equal then we cannot split */ |
| 236 | presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f; |
| 237 | /* FIXME: sum undeterministic */ |
| 238 | sum += presplitItem[i].priority; |
| 239 | } |
| 240 | return sum; |
| 241 | },[](const float& a, const float& b) -> float { return a+b; }); |
| 242 | |
| 243 | /* compute number of splits per primitive */ |
| 244 | const float inv_psum = 1.0f / psum; |
| 245 | parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { |
| 246 | for (size_t i=r.begin(); i<r.end(); i++) |
| 247 | { |
| 248 | if (presplitItem[i].priority > 0.0f) |
| 249 | { |
| 250 | const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum; |
| 251 | if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims |
| 252 | { |
| 253 | presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f); |
| 254 | //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); |
| 255 | assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); |
| 256 | } |
| 257 | else |
| 258 | presplitItem[i].priority = 0.0f; |
| 259 | } |
| 260 | } |
| 261 | }); |
| 262 | |
| 263 | auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; }; |
| 264 | size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024); |
| 265 | |
| 266 | /* anything to split ? */ |
| 267 | if (center < numPrimitives) |
| 268 | { |
| 269 | size_t numPrimitivesToSplit = numPrimitives - center; |
| 270 | assert(presplitItem[center].priority >= 1.0f); |
| 271 | |
| 272 | /* sort presplit items in ascending order */ |
| 273 | radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024); |
| 274 | |
| 275 | CHECK_PRESPLIT( |
| 276 | parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { |
| 277 | for (size_t i=r.begin(); i<r.end(); i++) |
| 278 | assert(presplitItem[i-1].priority <= presplitItem[i].priority); |
| 279 | }); |
| 280 | ); |
| 281 | |
| 282 | unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem; |
| 283 | unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; |
| 284 | |
| 285 | /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */ |
| 286 | const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t { |
| 287 | size_t sum = 0; |
| 288 | for (size_t i=t.begin(); i<t.end(); i++) |
| 289 | { |
| 290 | PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; |
| 291 | assert(presplitItem[i].priority >= 1.0f); |
| 292 | const unsigned int primrefID = presplitItem[i].index; |
| 293 | const float prio = presplitItem[i].priority; |
| 294 | const unsigned int geomID = prims[primrefID].geomID(); |
| 295 | const unsigned int primID = prims[primrefID].primID(); |
| 296 | const unsigned int split_levels = (unsigned int)prio; |
| 297 | unsigned int numSubPrims = 0; |
| 298 | splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); |
| 299 | assert(numSubPrims); |
| 300 | numSubPrims--; // can reuse slot |
| 301 | sum+=numSubPrims; |
| 302 | presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; |
| 303 | primOffset0[i-center] = numSubPrims; |
| 304 | } |
| 305 | return sum; |
| 306 | },[](const size_t& a, const size_t& b) -> size_t { return a+b; }); |
| 307 | |
| 308 | /* if we are over budget, need to shrink the range */ |
| 309 | if (totalNumSubPrims > numSplitPrimitivesBudget) |
| 310 | { |
| 311 | size_t new_center = numPrimitives-1; |
| 312 | size_t sum = 0; |
| 313 | for (;new_center>=center;new_center--) |
| 314 | { |
| 315 | const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG; |
| 316 | if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break; |
| 317 | sum += numSubPrims; |
| 318 | } |
| 319 | new_center++; |
| 320 | |
| 321 | primOffset0 += new_center - center; |
| 322 | numPrimitivesToSplit -= new_center - center; |
| 323 | center = new_center; |
| 324 | assert(numPrimitivesToSplit == (numPrimitives - center)); |
| 325 | } |
| 326 | |
| 327 | /* parallel prefix sum to compute offsets for storing sub-primitives */ |
| 328 | const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>()); |
| 329 | assert(numPrimitives+offset <= alloc_numPrimitives); |
| 330 | |
| 331 | /* iterate over range, and split primitives into sub primitives and append them to prims array */ |
| 332 | parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void { |
| 333 | for (size_t j=rn.begin(); j<rn.end(); j++) |
| 334 | { |
| 335 | PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; |
| 336 | const unsigned int primrefID = presplitItem[j].index; |
| 337 | const unsigned int geomID = prims[primrefID].geomID(); |
| 338 | const unsigned int primID = prims[primrefID].primID(); |
| 339 | const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1); |
| 340 | |
| 341 | assert(split_levels); |
| 342 | assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); |
| 343 | unsigned int numSubPrims = 0; |
| 344 | splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); |
| 345 | const size_t newID = numPrimitives + primOffset1[j-center]; |
| 346 | assert(newID+numSubPrims-1 <= alloc_numPrimitives); |
| 347 | prims[primrefID] = subPrims[0]; |
| 348 | for (size_t i=1;i<numSubPrims;i++) |
| 349 | prims[newID+i-1] = subPrims[i]; |
| 350 | } |
| 351 | }); |
| 352 | |
| 353 | numPrimitives += offset; |
| 354 | DBG_PRESPLIT( |
| 355 | PRINT(pinfo.size()); |
| 356 | PRINT(numPrimitives); |
| 357 | PRINT((float)numPrimitives/org_numPrimitives)); |
| 358 | } |
| 359 | |
| 360 | /* recompute centroid bounding boxes */ |
| 361 | pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo { |
| 362 | PrimInfo p(empty); |
| 363 | for (size_t j=r.begin(); j<r.end(); j++) |
| 364 | p.add_center2(prims[j]); |
| 365 | return p; |
| 366 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
| 367 | |
| 368 | assert(pinfo.size() == numPrimitives); |
| 369 | |
| 370 | /* free double buffer presplit items */ |
| 371 | alignedFree(tmp_presplitItem); |
| 372 | alignedFree(presplitItem); |
| 373 | return pinfo; |
| 374 | } |
| 375 | } |
| 376 | } |
| 377 | |