In this post here, I presented a fast sorting function based on radix algorithm.
Today, lets make the code a bit more useful by rewriting it to sort an array of objects, with a “key” string to access our sorting key.
The desired API:
static public function RadixSorOn(iarray:Vector.<Object> , sorted:Vector.<Object> , key:String):void
As seen when implementing the flashSort.SortOn (here), accessing iarray[i][key] is very slow,
so the first thing we can do is copy in our domain memory all the keys , so we access them slowly only once, we can do that while performing our histogram pass:
From there, we will access the keys directly in our domain memory ByteArray.
// 1. parallel histogramming pass // + copy flipped sign value to domain memory for (i = -1; (++i - elements)>>>31;) { var x:int = iarray[i][key] ^ 0x80000000; //sign flip fastmem.fastSetI32(x,sort1+(i<<2)); //save iarray[i][key] fastmem.fastSetI32(fastmem.fastGetI32(adr = b0 + ( (x & 0x7FF) <<2))+1,adr); fastmem.fastSetI32(fastmem.fastGetI32(adr = b1 + (((x >>> 11) & 0x7FF) <<2))+1,adr); fastmem.fastSetI32(fastmem.fastGetI32(adr = b2 + ( (x >>> 22) <<2))+1,adr); }
Then in the first byte pass (which is actually 11 bits pass) , sort for our first radix, and also generate an array giving the sorting order by index:
Save that info starting at sort1 address.
// byte 0: flip sign, read/write histogram, write out to sort2 var j:int = -1; for (i =sort1,max=sort1+elements4; (i-max)>>>31;i+=4) { fastmem.fastSetI32(tmp = fastmem.fastGetI32(adr = b0 +(((x= fastmem.fastGetI32(i)) & 0x7FF)<<2))+1,adr); tmp <<= 2; fastmem.fastSetI32(x,sort2+tmp); fastmem.fastSetI32(++j,sorti1+tmp); //save sorting index to sorti1 }
In the “byte1” pass, while we sort for the second radix, apply in parallel the same transform reading our “sort1” index array, and write to “sort2” index array:
// byte 1: read/write histogram, copy // sorted2 -> sorted1 var sortiDelta:int = sorti1 - sort2; for (i =sort2,max=sort2+elements4; (i-max)>>>31;i+=4) { fastmem.fastSetI32(tmp = fastmem.fastGetI32(adr = b1 +((( (x = fastmem.fastGetI32(i)) >>> 11) & 0x7FF)<<2))+1,adr); tmp <<=2; fastmem.fastSetI32(x,sort1+tmp); //apply in // the sorting index from sorti1 -> sorti2 fastmem.fastSetI32(fastmem.fastGetI32(i+sortiDelta),sorti2+tmp); }
Finally, in the last Radix pass, read our sort2 array in parrallel, and apply the transform from our original Vector.<Object> array (iarray[]) to our destination sorted array (sorted[]).
Our destination array is now sorted !
// byte 2: read/write histogram, copy & flip out // sorted1 -> array sortiDelta = sorti2 - sort1; for (i =sort1,max=sort1+elements4; (i-max)>>>31;i+=4) { fastmem.fastSetI32(tmp = fastmem.fastGetI32(adr = b2 +(((x=fastmem.fastGetI32(i)) >>> 22)<<2))+1,adr); //final index: sorted[tmp] = iarray[fastmem.fastGetI32(i+sortiDelta)]; }
How fast is it?
Faster than flashSort !
Can we be faster
In my implementation, I decided to do 2 things in parallel : read/write our keys for each radix, and read/write the sorted index.
We could change the code to not write the keys, just write the indexes to their new sorted order, so we would save 1 write/iteration, but we would have an additional indirection to read the keys for the radix 1 and 2.. something to explore.
What do you get?
Radix Sort On code
// ================================================================================================ // radix sort Object // ================================================================================================ static public function RadixSorOn(iarray:Vector.<Object> , sorted:Vector.<Object> , key:String):void { var elements:int = iarray.length; var elements4:int = elements<<2; // 3 histograms on the stack: const kHist:int = 2048<<2; var b0:int = 8; var b1:int = b0 + kHist; var b2:int = b1 + kHist; var max:int = b2 + kHist; //put temporary data info here: var sort1:int = max; var sort2:int = sort1 + elements4; var sorti1:int = sort2 + elements4; var sorti2:int = sorti1 + elements4; var sorti2End:int = sorti2 + elements4; var i:int; var adr:int; var tmp:int; //clear byteArray , 32 bytes / iterations var mem:Number = 0; for(adr=b0;adr<max;adr+=32) { fastmem.fastSetDouble(mem,adr); fastmem.fastSetDouble(mem,adr+8); fastmem.fastSetDouble(mem,adr+16); fastmem.fastSetDouble(mem,adr+24); } // 1. parallel histogramming pass // + copy flipped sign value to domain memory for (i = -1; (++i - elements)>>>31;) { var x:int = iarray[i][key] ^ 0x80000000; //sign flip fastmem.fastSetI32(x,sort1+(i<<2)); fastmem.fastSetI32(fastmem.fastGetI32(adr = b0 + ( (x & 0x7FF) <<2))+1,adr); fastmem.fastSetI32(fastmem.fastGetI32(adr = b1 + (((x >>> 11) & 0x7FF) <<2))+1,adr); fastmem.fastSetI32(fastmem.fastGetI32(adr = b2 + ( (x >>> 22) <<2))+1,adr); } // 2. Sum the histograms -- each histogram entry records the number of values preceding itself. { var sum0:int = 0, sum1:int = 0, sum2:int = 0,tsum:int; for (i = 0; (i - kHist)>>>31;i+=8) { tsum = fastmem.fastGetI32(adr = b0+i) + sum0; fastmem.fastSetI32(sum0-1,adr); sum0 = fastmem.fastGetI32(adr = adr+4) + tsum; fastmem.fastSetI32(tsum-1,adr); tsum = fastmem.fastGetI32(adr = b1+i) + sum1; fastmem.fastSetI32(sum1-1,adr); sum1 = fastmem.fastGetI32(adr = adr+4) + tsum; fastmem.fastSetI32(tsum-1,adr); tsum = fastmem.fastGetI32(adr = b2+i) + sum2; fastmem.fastSetI32(sum2-1,adr); sum2 = fastmem.fastGetI32(adr = adr+4) + tsum; fastmem.fastSetI32(tsum-1,adr); } } // byte 0: flip sign, read/write histogram, write out to sort2 var j:int = -1; for (i =sort1,max=sort1+elements4; (i-max)>>>31;i+=4) { fastmem.fastSetI32(tmp = fastmem.fastGetI32(adr = b0 +(((x= fastmem.fastGetI32(i)) & 0x7FF)<<2))+1,adr); tmp <<= 2; fastmem.fastSetI32(x,sort2+tmp); fastmem.fastSetI32(++j,sorti1+tmp); //save sorting index to sorti1 } // byte 1: read/write histogram, copy // sorted2 -> sorted1 var sortiDelta:int = sorti1 - sort2; for (i =sort2,max=sort2+elements4; (i-max)>>>31;i+=4) { fastmem.fastSetI32(tmp = fastmem.fastGetI32(adr = b1 +((( (x = fastmem.fastGetI32(i)) >>> 11) & 0x7FF)<<2))+1,adr); tmp <<=2; fastmem.fastSetI32(x,sort1+tmp); //apply in // the sorting index from sorti1 -> sorti2 fastmem.fastSetI32(fastmem.fastGetI32(i+sortiDelta),sorti2+tmp); } // byte 2: read/write histogram, copy & flip out // sorted1 -> array sortiDelta = sorti2 - sort1; for (i =sort1,max=sort1+elements4; (i-max)>>>31;i+=4) { fastmem.fastSetI32(tmp = fastmem.fastGetI32(adr = b2 +(((x=fastmem.fastGetI32(i)) >>> 22)<<2))+1,adr); //final index: sorted[tmp] = iarray[fastmem.fastGetI32(i+sortiDelta)]; } }
Leave a Reply