Skip to content

Commit 432b3e8

Browse files
committed
fixed a bug in real ffts for N a multiple of 25; worked around a compiler bug with clang 3.2 for arm on linux
1 parent d3b146e commit 432b3e8

File tree

5 files changed

+166
-156
lines changed

5 files changed

+166
-156
lines changed

README.txt

+91-44
Original file line numberDiff line numberDiff line change
@@ -91,33 +91,42 @@ operation.
9191
Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9)
9292
--
9393

94+
The benchmark shows the performance of various fft implementations measured in
95+
MFlops, with the number of floating point operations being defined as 5*N*log2(N)
96+
for a length-N complex fft, and 2.5*N*log2(N) for a real fft.
97+
See http://www.fftw.org/speed/method.html for an explanation of these formulas.
98+
9499
MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
95100

96101
Built with:
97102

98103
gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f
99104

100-
| N (input length) | real FFTPack | real vDSP | real FFTW | real PFFFT | | cplx FFTPack | cplx vDSP | cplx FFTW | cplx PFFFT |
101-
|------------------+--------------+--------------+--------------+--------------| |--------------+--------------+--------------+--------------|
102-
| 64 | 2887 | 9124 | 7082 | 8140 | | 3416 | 15722 | 15705 | 11222 |
103-
| 96 | 3356 | n/a | 8520 | 8158 | | 4038 | n/a | 16560 | 11141 |
104-
| 128 | 3895 | 12135 | 9613 | 10321 | | 4292 | 17981 | 17054 | 12538 |
105-
| 192 | 3941 | n/a | 10220 | 11167 | | 4388 | n/a | 16226 | 12964 |
106-
| 256 | 4532 | 13669 | 11031 | 12779 | | 4628 | 19905 | 17259 | 14305 |
107-
| 384 | 3512 | n/a | 11013 | 12278 | | 3645 | n/a | 16559 | 13370 |
108-
| 512 | 3716 | 15236 | 11515 | 14376 | | 3737 | 20423 | 17050 | 14746 |
109-
| 768 | 3756 | n/a | 11524 | 13659 | | 3748 | n/a | 16201 | 14891 |
110-
| 1024 | 4060 | 15841 | 10393 | 15732 | | 3828 | 21555 | 15898 | 15883 |
111-
| 2048 | 4646 | 16806 | 11888 | 15752 | | 4323 | 20802 | 15360 | 15219 |
112-
| 4096 | 4794 | 17008 | 11866 | 15785 | | 4167 | 19842 | 14532 | 14723 |
113-
| 8192 | 3887 | 16519 | 11290 | 12854 | | 3738 | 18923 | 12528 | 14164 |
114-
| 9216 | 3924 | n/a | 10980 | 13211 | | 3684 | n/a | 12349 | 14474 |
115-
| 16384 | 3907 | 16045 | 11146 | 13111 | | 3687 | 17628 | 12364 | 14176 |
116-
| 32768 | 4279 | 15169 | 10946 | 11538 | | 3919 | 15179 | 11558 | 11911 |
117-
| 262144 | 3423 | 11792 | 6753 | 9827 | | 2913 | 11989 | 8406 | 10960 |
118-
| 1048576 | 3313 | 10613 | 5478 | 7142 | | 2683 | 8487 | 2826 | 5961 |
119-
|------------------+--------------+--------------+--------------+--------------| |--------------+--------------+--------------+--------------|
120-
105+
| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT |
106+
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
107+
| 64 | 2816 | 8596 | 7329 | 8187 | | 2887 | 14898 | 14668 | 11108 |
108+
| 96 | 3298 | n/a | 8378 | 7727 | | 3953 | n/a | 15680 | 10878 |
109+
| 128 | 3507 | 11575 | 9266 | 10108 | | 4233 | 17598 | 16427 | 12000 |
110+
| 160 | 3391 | n/a | 9838 | 10711 | | 4220 | n/a | 16653 | 11187 |
111+
| 192 | 3919 | n/a | 9868 | 10956 | | 4297 | n/a | 15770 | 12540 |
112+
| 256 | 4283 | 13179 | 10694 | 13128 | | 4545 | 19550 | 16350 | 13822 |
113+
| 384 | 3136 | n/a | 10810 | 12061 | | 3600 | n/a | 16103 | 13240 |
114+
| 480 | 3477 | n/a | 10632 | 12074 | | 3536 | n/a | 11630 | 12522 |
115+
| 512 | 3783 | 15141 | 11267 | 13838 | | 3649 | 20002 | 16560 | 13580 |
116+
| 640 | 3639 | n/a | 11164 | 13946 | | 3695 | n/a | 15416 | 13890 |
117+
| 768 | 3800 | n/a | 11245 | 13495 | | 3590 | n/a | 15802 | 14552 |
118+
| 800 | 3440 | n/a | 10499 | 13301 | | 3659 | n/a | 12056 | 13268 |
119+
| 1024 | 3924 | 15605 | 11450 | 15339 | | 3769 | 20963 | 13941 | 15467 |
120+
| 2048 | 4518 | 16195 | 11551 | 15532 | | 4258 | 20413 | 13723 | 15042 |
121+
| 2400 | 4294 | n/a | 10685 | 13078 | | 4093 | n/a | 12777 | 13119 |
122+
| 4096 | 4750 | 16596 | 11672 | 15817 | | 4157 | 19662 | 14316 | 14336 |
123+
| 8192 | 3820 | 16227 | 11084 | 12555 | | 3691 | 18132 | 12102 | 13813 |
124+
| 9216 | 3864 | n/a | 10254 | 12870 | | 3586 | n/a | 12119 | 13994 |
125+
| 16384 | 3822 | 15123 | 10454 | 12822 | | 3613 | 16874 | 12370 | 13881 |
126+
| 32768 | 4175 | 14512 | 10662 | 11095 | | 3881 | 14702 | 11619 | 11524 |
127+
| 262144 | 3317 | 11429 | 6269 | 9517 | | 2810 | 11729 | 7757 | 10179 |
128+
| 1048576 | 2913 | 10551 | 4730 | 5867 | | 2661 | 7881 | 3520 | 5350 |
129+
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
121130

122131

123132
Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
@@ -263,30 +272,68 @@ cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
263272

264273

265274

266-
Ubuntu 11.04 on Pandaboard, gcc-4.5.2, 32-bit, with fftw 3.3.1 beta (neon enabled), on a 1GHz ARM Cortex A9 (TI OMAP4430)
275+
Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
267276

268277
Built with:
269-
gcc-4.5 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=softfp -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
278+
gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
270279

271-
I must admit that the performance was a bit disappointing here...
280+
| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
281+
|-----------+------------+------------+------------| |------------+------------+------------|
282+
| 64 | 549 | 452 | 731 | | 512 | 602 | 640 |
283+
| 96 | 421 | 272 | 702 | | 496 | 571 | 602 |
284+
| 128 | 498 | 512 | 815 | | 597 | 618 | 652 |
285+
| 160 | 521 | 536 | 815 | | 586 | 669 | 625 |
286+
| 192 | 539 | 571 | 883 | | 485 | 597 | 626 |
287+
| 256 | 640 | 539 | 975 | | 569 | 611 | 671 |
288+
| 384 | 499 | 610 | 879 | | 499 | 602 | 637 |
289+
| 480 | 518 | 507 | 877 | | 496 | 661 | 616 |
290+
| 512 | 524 | 591 | 1002 | | 549 | 678 | 668 |
291+
| 640 | 542 | 612 | 955 | | 568 | 663 | 645 |
292+
| 768 | 557 | 613 | 981 | | 491 | 663 | 598 |
293+
| 800 | 514 | 353 | 882 | | 514 | 360 | 574 |
294+
| 1024 | 640 | 640 | 1067 | | 492 | 683 | 602 |
295+
| 2048 | 587 | 640 | 908 | | 486 | 640 | 552 |
296+
| 2400 | 479 | 368 | 777 | | 422 | 376 | 518 |
297+
| 4096 | 511 | 614 | 853 | | 426 | 640 | 534 |
298+
| 8192 | 415 | 584 | 708 | | 386 | 622 | 516 |
299+
| 9216 | 419 | 571 | 687 | | 364 | 586 | 506 |
300+
| 16384 | 426 | 577 | 716 | | 398 | 606 | 530 |
301+
| 32768 | 417 | 572 | 673 | | 399 | 572 | 468 |
302+
| 262144 | 219 | 380 | 293 | | 255 | 431 | 343 |
303+
| 1048576 | 202 | 274 | 237 | | 265 | 282 | 355 |
304+
|-----------+------------+------------+------------| |------------+------------+------------|
272305

273-
| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT |
274-
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
275-
| 64 | 384 | 614 | 591 | | 404 | 1024 | 549 |
276-
| 96 | 324 | 702 | 562 | | 337 | 864 | 503 |
277-
| 128 | 407 | 717 | 640 | | 407 | 1086 | 543 |
278-
| 192 | 404 | 809 | 693 | | 388 | 903 | 547 |
279-
| 256 | 465 | 788 | 788 | | 427 | 871 | 594 |
280-
| 384 | 392 | 814 | 687 | | 343 | 862 | 543 |
281-
| 512 | 411 | 768 | 794 | | 372 | 940 | 583 |
282-
| 768 | 438 | 818 | 767 | | 383 | 846 | 584 |
283-
| 1024 | 427 | 800 | 883 | | 400 | 883 | 602 |
284-
| 2048 | 414 | 853 | 805 | | 343 | 828 | 477 |
285-
| 4096 | 426 | 768 | 698 | | 341 | 808 | 469 |
286-
| 8192 | 332 | 666 | 594 | | 297 | 765 | 438 |
287-
| 9216 | 335 | 660 | 571 | | 294 | 687 | 432 |
288-
| 16384 | 344 | 675 | 606 | | 314 | 709 | 456 |
289-
| 32768 | 342 | 685 | 564 | | 295 | 634 | 399 |
290-
| 262144 | 143 | 301 | 197 | | 160 | 321 | 251 |
291-
| 1048576 | 138 | 238 | 174 | | 173 | 212 | 253 |
292-
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
306+
Same platform as above, but this time pffft and fftpack are built with clang 3.2:
307+
308+
clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
309+
310+
| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
311+
|-----------+------------+------------+------------| |------------+------------+------------|
312+
| 64 | 427 | 452 | 853 | | 427 | 602 | 1024 |
313+
| 96 | 351 | 276 | 843 | | 337 | 571 | 963 |
314+
| 128 | 373 | 512 | 996 | | 390 | 618 | 1054 |
315+
| 160 | 426 | 536 | 987 | | 375 | 669 | 914 |
316+
| 192 | 404 | 571 | 1079 | | 388 | 588 | 1079 |
317+
| 256 | 465 | 539 | 1205 | | 445 | 602 | 1170 |
318+
| 384 | 366 | 610 | 1099 | | 343 | 594 | 1099 |
319+
| 480 | 356 | 507 | 1140 | | 335 | 651 | 931 |
320+
| 512 | 411 | 591 | 1213 | | 384 | 649 | 1124 |
321+
| 640 | 398 | 612 | 1193 | | 373 | 654 | 901 |
322+
| 768 | 409 | 613 | 1227 | | 383 | 663 | 1044 |
323+
| 800 | 411 | 348 | 1073 | | 353 | 358 | 809 |
324+
| 1024 | 427 | 640 | 1280 | | 413 | 692 | 1004 |
325+
| 2048 | 414 | 626 | 1126 | | 371 | 640 | 853 |
326+
| 2400 | 399 | 373 | 898 | | 319 | 368 | 653 |
327+
| 4096 | 404 | 602 | 1059 | | 357 | 633 | 778 |
328+
| 8192 | 332 | 584 | 792 | | 308 | 616 | 716 |
329+
| 9216 | 322 | 561 | 783 | | 299 | 586 | 687 |
330+
| 16384 | 344 | 568 | 778 | | 314 | 617 | 745 |
331+
| 32768 | 342 | 564 | 737 | | 314 | 552 | 629 |
332+
| 262144 | 201 | 383 | 313 | | 227 | 435 | 413 |
333+
| 1048576 | 187 | 262 | 251 | | 228 | 281 | 409 |
334+
|-----------+------------+------------+------------| |------------+------------+------------|
335+
336+
So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
337+
(the fftpack performance numbers are better with gcc), while clang is
338+
the best with neon intrinsics (see how pffft perf has improved with
339+
clang 3.2).

fftpack.c

+9-16
Original file line numberDiff line numberDiff line change
@@ -1493,22 +1493,14 @@ static void radf5(integer ido, integer l1, const real *cc, real *ch,
14931493
for (k = 1; k <= l1; ++k) {
14941494
for (i = 3; i <= ido; i += 2) {
14951495
ic = idp2 - i;
1496-
dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] *
1497-
cc_ref(i, k, 2);
1498-
di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(
1499-
i - 1, k, 2);
1500-
dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] *
1501-
cc_ref(i, k, 3);
1502-
di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(
1503-
i - 1, k, 3);
1504-
dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] *
1505-
cc_ref(i, k, 4);
1506-
di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(
1507-
i - 1, k, 4);
1508-
dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] *
1509-
cc_ref(i, k, 5);
1510-
di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref(
1511-
i - 1, k, 5);
1496+
dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * cc_ref(i, k, 2);
1497+
di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(i - 1, k, 2);
1498+
dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * cc_ref(i, k, 3);
1499+
di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(i - 1, k, 3);
1500+
dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * cc_ref(i, k, 4);
1501+
di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(i - 1, k, 4);
1502+
dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] * cc_ref(i, k, 5);
1503+
di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref(i - 1, k, 5);
15121504
cr2 = dr2 + dr5;
15131505
ci5 = dr5 - dr2;
15141506
cr5 = di2 - di5;
@@ -2125,6 +2117,7 @@ static void rfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac)
21252117
ix3 = ix2 + ido;
21262118
ix4 = ix3 + ido;
21272119
radf5(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
2120+
break;
21282121
default:
21292122
if (ido == 1) {
21302123
na = 1 - na;

0 commit comments

Comments
 (0)