aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/ode-0.9/OPCODE/Ice/IceRevisitedRadix.cpp
diff options
context:
space:
mode:
authordan miller2007-10-19 05:15:33 +0000
committerdan miller2007-10-19 05:15:33 +0000
commit79eca25c945a535a7a0325999034bae17da92412 (patch)
tree40ff433d94859d629aac933d5ec73b382f62ba1a /libraries/ode-0.9/OPCODE/Ice/IceRevisitedRadix.cpp
parentadding ode source to /libraries (diff)
downloadopensim-SC_OLD-79eca25c945a535a7a0325999034bae17da92412.zip
opensim-SC_OLD-79eca25c945a535a7a0325999034bae17da92412.tar.gz
opensim-SC_OLD-79eca25c945a535a7a0325999034bae17da92412.tar.bz2
opensim-SC_OLD-79eca25c945a535a7a0325999034bae17da92412.tar.xz
resubmitting ode
Diffstat (limited to '')
-rw-r--r--libraries/ode-0.9/OPCODE/Ice/IceRevisitedRadix.cpp520
1 files changed, 520 insertions, 0 deletions
diff --git a/libraries/ode-0.9/OPCODE/Ice/IceRevisitedRadix.cpp b/libraries/ode-0.9/OPCODE/Ice/IceRevisitedRadix.cpp
new file mode 100644
index 0000000..3e351da
--- /dev/null
+++ b/libraries/ode-0.9/OPCODE/Ice/IceRevisitedRadix.cpp
@@ -0,0 +1,520 @@
1///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2/**
3 * Contains source code from the article "Radix Sort Revisited".
4 * \file IceRevisitedRadix.cpp
5 * \author Pierre Terdiman
6 * \date April, 4, 2000
7 */
8///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
9
10///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
11/**
12 * Revisited Radix Sort.
13 * This is my new radix routine:
14 * - it uses indices and doesn't recopy the values anymore, hence wasting less ram
15 * - it creates all the histograms in one run instead of four
16 * - it sorts words faster than dwords and bytes faster than words
17 * - it correctly sorts negative floating-point values by patching the offsets
18 * - it automatically takes advantage of temporal coherence
19 * - multiple keys support is a side effect of temporal coherence
20 * - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
21 *
22 * History:
23 * - 08.15.98: very first version
24 * - 04.04.00: recoded for the radix article
25 * - 12.xx.00: code lifting
26 * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
27 * - 10.11.01: added local ram support
28 * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
29 * - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all.
30 * - ranks are not "reset" anymore, but implicit on first calls
31 * - 07.05.02: - offsets rewritten with one less indirection.
32 * - 11.03.02: - "bool" replaced with RadixHint enum
33 *
34 * \class RadixSort
35 * \author Pierre Terdiman
36 * \version 1.4
37 * \date August, 15, 1998
38 */
39///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
40
41/*
42To do:
43 - add an offset parameter between two input values (avoid some data recopy sometimes)
44 - unroll ? asm ?
45 - 11 bits trick & 3 passes as Michael did
46 - prefetch stuff the day I have a P3
47 - make a version with 16-bits indices ?
48*/
49
50///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
51// Precompiled Header
52#include "Stdafx.h"
53
54using namespace IceCore;
55
56#define INVALIDATE_RANKS mCurrentSize|=0x80000000
57#define VALIDATE_RANKS mCurrentSize&=0x7fffffff
58#define CURRENT_SIZE (mCurrentSize&0x7fffffff)
59#define INVALID_RANKS (mCurrentSize&0x80000000)
60
61#define CHECK_RESIZE(n) \
62 if(n!=mPreviousSize) \
63 { \
64 if(n>mCurrentSize) Resize(n); \
65 else ResetRanks(); \
66 mPreviousSize = n; \
67 }
68
69#define CREATE_HISTOGRAMS(type, buffer) \
70 /* Clear counters/histograms */ \
71 ZeroMemory(mHistogram, 256*4*sizeof(udword)); \
72 \
73 /* Prepare to count */ \
74 ubyte* p = (ubyte*)input; \
75 ubyte* pe = &p[nb*4]; \
76 udword* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
77 udword* h1= &mHistogram[256]; /* Histogram for second pass */ \
78 udword* h2= &mHistogram[512]; /* Histogram for third pass */ \
79 udword* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
80 \
81 bool AlreadySorted = true; /* Optimism... */ \
82 \
83 if(INVALID_RANKS) \
84 { \
85 /* Prepare for temporal coherence */ \
86 type* Running = (type*)buffer; \
87 type PrevVal = *Running; \
88 \
89 while(p!=pe) \
90 { \
91 /* Read input buffer in previous sorted order */ \
92 type Val = *Running++; \
93 /* Check whether already sorted or not */ \
94 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
95 /* Update for next iteration */ \
96 PrevVal = Val; \
97 \
98 /* Create histograms */ \
99 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
100 } \
101 \
102 /* If all input values are already sorted, we just have to return and leave the */ \
103 /* previous list unchanged. That way the routine may take advantage of temporal */ \
104 /* coherence, for example when used to sort transparent faces. */ \
105 if(AlreadySorted) \
106 { \
107 mNbHits++; \
108 for(udword i=0;i<nb;i++) mRanks[i] = i; \
109 return *this; \
110 } \
111 } \
112 else \
113 { \
114 /* Prepare for temporal coherence */ \
115 udword* Indices = mRanks; \
116 type PrevVal = (type)buffer[*Indices]; \
117 \
118 while(p!=pe) \
119 { \
120 /* Read input buffer in previous sorted order */ \
121 type Val = (type)buffer[*Indices++]; \
122 /* Check whether already sorted or not */ \
123 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
124 /* Update for next iteration */ \
125 PrevVal = Val; \
126 \
127 /* Create histograms */ \
128 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
129 } \
130 \
131 /* If all input values are already sorted, we just have to return and leave the */ \
132 /* previous list unchanged. That way the routine may take advantage of temporal */ \
133 /* coherence, for example when used to sort transparent faces. */ \
134 if(AlreadySorted) { mNbHits++; return *this; } \
135 } \
136 \
137 /* Else there has been an early out and we must finish computing the histograms */ \
138 while(p!=pe) \
139 { \
140 /* Create histograms without the previous overhead */ \
141 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
142 }
143
144#define CHECK_PASS_VALIDITY(pass) \
145 /* Shortcut to current counters */ \
146 udword* CurCount = &mHistogram[pass<<8]; \
147 \
148 /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
149 bool PerformPass = true; \
150 \
151 /* Check pass validity */ \
152 \
153 /* If all values have the same byte, sorting is useless. */ \
154 /* It may happen when sorting bytes or words instead of dwords. */ \
155 /* This routine actually sorts words faster than dwords, and bytes */ \
156 /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
157 /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
158 \
159 /* Get first byte */ \
160 ubyte UniqueVal = *(((ubyte*)input)+pass); \
161 \
162 /* Check that byte's counter */ \
163 if(CurCount[UniqueVal]==nb) PerformPass=false;
164
165///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
166/**
167 * Constructor.
168 */
169///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
170RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0)
171{
172#ifndef RADIX_LOCAL_RAM
173 // Allocate input-independent ram
174 mHistogram = new udword[256*4];
175 mOffset = new udword[256];
176#endif
177 // Initialize indices
178 INVALIDATE_RANKS;
179}
180
181///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
182/**
183 * Destructor.
184 */
185///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
186RadixSort::~RadixSort()
187{
188 // Release everything
189#ifndef RADIX_LOCAL_RAM
190 DELETEARRAY(mOffset);
191 DELETEARRAY(mHistogram);
192#endif
193 DELETEARRAY(mRanks2);
194 DELETEARRAY(mRanks);
195}
196
197///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
198/**
199 * Resizes the inner lists.
200 * \param nb [in] new size (number of dwords)
201 * \return true if success
202 */
203///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
204bool RadixSort::Resize(udword nb)
205{
206 // Free previously used ram
207 DELETEARRAY(mRanks2);
208 DELETEARRAY(mRanks);
209
210 // Get some fresh one
211 mRanks = new udword[nb]; CHECKALLOC(mRanks);
212 mRanks2 = new udword[nb]; CHECKALLOC(mRanks2);
213
214 return true;
215}
216
217inline_ void RadixSort::CheckResize(udword nb)
218{
219 udword CurSize = CURRENT_SIZE;
220 if(nb!=CurSize)
221 {
222 if(nb>CurSize) Resize(nb);
223 mCurrentSize = nb;
224 INVALIDATE_RANKS;
225 }
226}
227
228///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
229/**
230 * Main sort routine.
231 * This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
232 * \param input [in] a list of integer values to sort
233 * \param nb [in] number of values to sort, must be < 2^31
234 * \param hint [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
235 * \return Self-Reference
236 */
237///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
238RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
239{
240 // Checkings
241 if(!input || !nb || nb&0x80000000) return *this;
242
243 // Stats
244 mTotalCalls++;
245
246 // Resize lists if needed
247 CheckResize(nb);
248
249#ifdef RADIX_LOCAL_RAM
250 // Allocate histograms & offsets on the stack
251 udword mHistogram[256*4];
252// udword mOffset[256];
253 udword* mLink[256];
254#endif
255
256 // Create histograms (counters). Counters for all passes are created in one run.
257 // Pros: read input buffer once instead of four times
258 // Cons: mHistogram is 4Kb instead of 1Kb
259 // We must take care of signed/unsigned values for temporal coherence.... I just
260 // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
261 if(hint==RADIX_UNSIGNED) { CREATE_HISTOGRAMS(udword, input); }
262 else { CREATE_HISTOGRAMS(sdword, input); }
263
264 // Compute #negative values involved if needed
265 udword NbNegativeValues = 0;
266 if(hint==RADIX_SIGNED)
267 {
268 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
269 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
270 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
271 udword* h3= &mHistogram[768];
272 for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
273 }
274
275 // Radix sort, j is the pass number (0=LSB, 3=MSB)
276 for(udword j=0;j<4;j++)
277 {
278 CHECK_PASS_VALIDITY(j);
279
280 // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
281 // not a problem, numbers are correctly sorted anyway.
282 if(PerformPass)
283 {
284 // Should we care about negative values?
285 if(j!=3 || hint==RADIX_UNSIGNED)
286 {
287 // Here we deal with positive values only
288
289 // Create offsets
290// mOffset[0] = 0;
291// for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
292 mLink[0] = mRanks2;
293 for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
294 }
295 else
296 {
297 // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
298
299 // Create biased offsets, in order for negative numbers to be sorted as well
300// mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
301 mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
302// for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
303 for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
304
305 // Fixing the wrong place for negative values
306// mOffset[128] = 0;
307 mLink[128] = mRanks2;
308// for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
309 for(udword i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
310 }
311
312 // Perform Radix Sort
313 ubyte* InputBytes = (ubyte*)input;
314 InputBytes += j;
315 if(INVALID_RANKS)
316 {
317// for(udword i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
318 for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
319 VALIDATE_RANKS;
320 }
321 else
322 {
323 udword* Indices = mRanks;
324 udword* IndicesEnd = &mRanks[nb];
325 while(Indices!=IndicesEnd)
326 {
327 udword id = *Indices++;
328// mRanks2[mOffset[InputBytes[id<<2]]++] = id;
329 *mLink[InputBytes[id<<2]]++ = id;
330 }
331 }
332
333 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
334 udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
335 }
336 }
337 return *this;
338}
339
340///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
341/**
342 * Main sort routine.
343 * This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
344 * \param input [in] a list of floating-point values to sort
345 * \param nb [in] number of values to sort, must be < 2^31
346 * \return Self-Reference
347 * \warning only sorts IEEE floating-point values
348 */
349///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
350RadixSort& RadixSort::Sort(const float* input2, udword nb)
351{
352 // Checkings
353 if(!input2 || !nb || nb&0x80000000) return *this;
354
355 // Stats
356 mTotalCalls++;
357
358 udword* input = (udword*)input2;
359
360 // Resize lists if needed
361 CheckResize(nb);
362
363#ifdef RADIX_LOCAL_RAM
364 // Allocate histograms & offsets on the stack
365 udword mHistogram[256*4];
366// udword mOffset[256];
367 udword* mLink[256];
368#endif
369
370 // Create histograms (counters). Counters for all passes are created in one run.
371 // Pros: read input buffer once instead of four times
372 // Cons: mHistogram is 4Kb instead of 1Kb
373 // Floating-point values are always supposed to be signed values, so there's only one code path there.
374 // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
375 // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
376 // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
377 // wouldn't work with mixed positive/negative values....
378 { CREATE_HISTOGRAMS(float, input2); }
379
380 // Compute #negative values involved if needed
381 udword NbNegativeValues = 0;
382 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
383 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
384 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
385 udword* h3= &mHistogram[768];
386 for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
387
388 // Radix sort, j is the pass number (0=LSB, 3=MSB)
389 for(udword j=0;j<4;j++)
390 {
391 // Should we care about negative values?
392 if(j!=3)
393 {
394 // Here we deal with positive values only
395 CHECK_PASS_VALIDITY(j);
396
397 if(PerformPass)
398 {
399 // Create offsets
400// mOffset[0] = 0;
401 mLink[0] = mRanks2;
402// for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
403 for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
404
405 // Perform Radix Sort
406 ubyte* InputBytes = (ubyte*)input;
407 InputBytes += j;
408 if(INVALID_RANKS)
409 {
410// for(i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
411 for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
412 VALIDATE_RANKS;
413 }
414 else
415 {
416 udword* Indices = mRanks;
417 udword* IndicesEnd = &mRanks[nb];
418 while(Indices!=IndicesEnd)
419 {
420 udword id = *Indices++;
421// mRanks2[mOffset[InputBytes[id<<2]]++] = id;
422 *mLink[InputBytes[id<<2]]++ = id;
423 }
424 }
425
426 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
427 udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
428 }
429 }
430 else
431 {
432 // This is a special case to correctly handle negative values
433 CHECK_PASS_VALIDITY(j);
434
435 if(PerformPass)
436 {
437 // Create biased offsets, in order for negative numbers to be sorted as well
438// mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
439 mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
440// for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
441 for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
442
443 // We must reverse the sorting order for negative numbers!
444// mOffset[255] = 0;
445 mLink[255] = mRanks2;
446// for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
447 for(udword i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
448// for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
449 for(udword i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
450
451 // Perform Radix Sort
452 if(INVALID_RANKS)
453 {
454 for(udword i=0;i<nb;i++)
455 {
456 udword Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
457 // ### cmp to be killed. Not good. Later.
458// if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above
459// else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order
460 if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
461 else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
462 }
463 VALIDATE_RANKS;
464 }
465 else
466 {
467 for(udword i=0;i<nb;i++)
468 {
469 udword Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
470 // ### cmp to be killed. Not good. Later.
471// if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above
472// else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order
473 if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
474 else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
475 }
476 }
477 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
478 udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
479 }
480 else
481 {
482 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
483 if(UniqueVal>=128)
484 {
485 if(INVALID_RANKS)
486 {
487 // ###Possible?
488 for(udword i=0;i<nb;i++) mRanks2[i] = nb-i-1;
489 VALIDATE_RANKS;
490 }
491 else
492 {
493 for(udword i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
494 }
495
496 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
497 udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
498 }
499 }
500 }
501 }
502 return *this;
503}
504
505///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
506/**
507 * Gets the ram used.
508 * \return memory used in bytes
509 */
510///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
511udword RadixSort::GetUsedRam() const
512{
513 udword UsedRam = sizeof(RadixSort);
514#ifndef RADIX_LOCAL_RAM
515 UsedRam += 256*4*sizeof(udword); // Histograms
516 UsedRam += 256*sizeof(udword); // Offsets
517#endif
518 UsedRam += 2*CURRENT_SIZE*sizeof(udword); // 2 lists of indices
519 return UsedRam;
520}