#include #include typedef struct timeval timing_basic; #define timing_basic_now(x) gettimeofday((x),(struct timezone *) 0) #define timing_basic_diff(x,y) (1000.0 * ((x)->tv_usec - (double) (y)->tv_usec) + 1000000000.0 * ((x)->tv_sec - (double) (y)->tv_sec)) typedef struct { unsigned long t[2]; } timing; #define timing_now(x) asm volatile(".byte 15;.byte 49" : "=a"((x)->t[0]),"=d"((x)->t[1])) #define timing_diff(x,y) (((x)->t[0] - (double) (y)->t[0]) + 4294967296.0 * ((x)->t[1] - (double) (y)->t[1])) typedef unsigned int uint32; /* J. W. J. Williams in 1964 published the original heapsort algorithm, with top-down insertion. Robert W. Floyd in 1964 published two improvements used here, bottom-up insertion and fast heap construction. */ void heapsort(uint32 *a,unsigned int n) { unsigned int i; unsigned int j; unsigned int p; unsigned int q; uint32 ap; uint32 x; if (n < 2) return; --a; i = (n >> 1) + 1; j = n; while (j > 1) { /* a[i], a[i + 1], ..., a[j] are heapified */ /* a[j + 1], a[j + 2], ..., a[n] are in correct positions */ if (i > 1) { --i; x = a[i]; } else { x = a[j]; a[j] = a[1]; --j; } q = i; while ((p = q + q) <= j) { ap = a[p]; if (p < j) if (a[p + 1] >= ap) { ap = a[p + 1]; ++p; } a[q] = ap; q = p; } while (q > i) { p = q >> 1; ap = a[p]; if (x < ap) break; a[q] = ap; q = p; } a[q] = x; } } /* Harold H. Seward in 1954 published LSD radix sort, using a stable counting sort. Edward H. Friend in 1956 suggested counting the digits in the next column while rearranging numbers in the current column. Such techniques are important in reducing the number of memory accesses. This code can be sped up further. */ void lsdradixsort(uint32 *a,uint32 *b,unsigned int n) { unsigned int x[4][256]; unsigned int i; uint32 u; for (i = 0;i < 256;++i) { x[0][i] = 0; x[1][i] = 0; x[2][i] = 0; x[3][i] = 0; } i = n; while (i) { u = a[--i]; ++x[0][u & 255]; u >>= 8; ++x[1][u & 255]; u >>= 8; ++x[2][u & 255]; u >>= 8; ++x[3][u]; } for (i = 1;i < 256;++i) { x[0][i] += x[0][i - 1]; x[1][i] += x[1][i - 1]; x[2][i] += x[2][i - 1]; x[3][i] += x[3][i - 1]; } i = n; while (i) { u = a[--i]; b[--x[0][u & 255]] = u; } i = n; while (i) { u = b[--i]; a[--x[1][(u >> 8) & 255]] = u; } i = n; while (i) { u = a[--i]; b[--x[2][(u >> 16) & 255]] = u; } i = n; while (i) { u = b[--i]; a[--x[3][u >> 24]] = u; } } /* Peter McIlroy, Keith Bostic, and Doug McIlroy in 1993 published a fast in-place MSD radix sort, using an in-place (but unstable) counting sort. This code can be sped up further. */ void msdradixsort1(uint32 *a,unsigned int n) { int x[256]; int y[256]; int i; int j; uint32 t; uint32 u; uint32 v; if (n < 64) { heapsort(a,n); return; } for (i = 0;i < 256;++i) x[i] = 0; for (i = 0;i < n;++i) ++x[a[i] & 255]; for (i = 1;i < 256;++i) x[i] += x[i - 1]; for (i = 0;i < 256;++i) y[i] = x[i]; for (i = 0;i < n;i = y[v]) { u = a[i]; while ((j = --x[v = (u & 255)]) > i) { t = a[j]; a[j] = u; u = t; } a[i] = u; } } void msdradixsort2(uint32 *a,unsigned int n) { int x[256]; int y[256]; int i; int j; uint32 t; uint32 u; uint32 v; if (n < 160) { heapsort(a,n); return; } for (i = 0;i < 256;++i) x[i] = 0; for (i = 0;i < n;++i) ++x[(a[i] >> 8) & 255]; for (i = 1;i < 256;++i) x[i] += x[i - 1]; for (i = 0;i < 256;++i) y[i] = x[i]; for (i = 0;i < n;i = y[v]) { u = a[i]; while ((j = --x[v = ((u >> 8) & 255)]) > i) { t = a[j]; a[j] = u; u = t; } a[i] = u; } msdradixsort1(a,y[0]); for (i = 1;i < 256;++i) msdradixsort1(a + y[i - 1],y[i] - y[i - 1]); } void msdradixsort3(uint32 *a,unsigned int n) { int x[256]; int y[256]; int i; int j; uint32 t; uint32 u; uint32 v; if (n < 160) { heapsort(a,n); return; } for (i = 0;i < 256;++i) x[i] = 0; for (i = 0;i < n;++i) ++x[(a[i] >> 16) & 255]; for (i = 1;i < 256;++i) x[i] += x[i - 1]; for (i = 0;i < 256;++i) y[i] = x[i]; for (i = 0;i < n;i = y[v]) { u = a[i]; while ((j = --x[v = ((u >> 16) & 255)]) > i) { t = a[j]; a[j] = u; u = t; } a[i] = u; } msdradixsort2(a,y[0]); for (i = 1;i < 256;++i) msdradixsort2(a + y[i - 1],y[i] - y[i - 1]); } void msdradixsort(uint32 *a,unsigned int n) { int x[256]; int y[256]; int i; int j; uint32 t; uint32 u; uint32 v; if (n < 160) { heapsort(a,n); return; } for (i = 0;i < 256;++i) x[i] = 0; for (i = 0;i < n;++i) ++x[a[i] >> 24]; for (i = 1;i < 256;++i) x[i] += x[i - 1]; for (i = 0;i < 256;++i) y[i] = x[i]; for (i = 0;i < n;i = y[v]) { u = a[i]; while ((j = --x[v = (u >> 24)]) > i) { t = a[j]; a[j] = u; u = t; } a[i] = u; } msdradixsort3(a,y[0]); for (i = 1;i < 256;++i) msdradixsort3(a + y[i - 1],y[i] - y[i - 1]); } #define SIZE 1000000 uint32 a[SIZE]; uint32 b[SIZE]; uint32 c[SIZE]; uint32 tmp[SIZE]; timing start; timing_basic startb; timing finish; timing_basic finishb; timing t[10]; int main() { int i; timing_basic_now(&startb); timing_now(&start); timing_now(&t[0]); for (i = 0;i < SIZE;++i) a[i] = b[i] = c[i] = random() + random() + random() + random(); timing_now(&t[1]); heapsort(a,SIZE); timing_now(&t[2]); msdradixsort(b,SIZE); timing_now(&t[3]); lsdradixsort(c,tmp,SIZE); timing_now(&t[4]); timing_basic_now(&finishb); timing_now(&finish); printf("%.0f cycles for initialization\n",timing_diff(&t[1],&t[0])); printf("%.0f cycles for heapsort\n",timing_diff(&t[2],&t[1])); printf("%.0f cycles for msdradixsort\n",timing_diff(&t[3],&t[2])); printf("%.0f cycles for lsdradixsort\n",timing_diff(&t[4],&t[3])); printf("Approximately %f nanoseconds per cycle\n",timing_basic_diff(&finishb,&startb) / timing_diff(&finish,&start)); for (i = 0;i < SIZE;++i) if (a[i] != b[i]) { printf("Bug!\n"); exit(111); } for (i = 0;i < SIZE;++i) if (a[i] != c[i]) { printf("Bug!\n"); exit(111); } exit(0); }