/** 
 * @file llfft.cpp
 * @brief FFT function implementations
 *
 * Copyright (c) 2003-2007, Linden Research, Inc.
 * 
 * Second Life Viewer Source Code
 * The source code in this file ("Source Code") is provided by Linden Lab
 * to you under the terms of the GNU General Public License, version 2.0
 * ("GPL"), unless you have obtained a separate licensing agreement
 * ("Other License"), formally executed by you and Linden Lab.  Terms of
 * the GPL can be found in doc/GPL-license.txt in this distribution, or
 * online at http://secondlife.com/developers/opensource/gplv2
 * 
 * There are special exceptions to the terms and conditions of the GPL as
 * it is applied to this Source Code. View the full text of the exception
 * in the file doc/FLOSS-exception.txt in this software distribution, or
 * online at http://secondlife.com/developers/opensource/flossexception
 * 
 * By copying, modifying or distributing this software, you acknowledge
 * that you have read and understood your obligations described above,
 * and agree to abide by those obligations.
 * 
 * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
 * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
 * COMPLETENESS OR PERFORMANCE.
 */

/*
 *  Fast Fourier Transform
 *
 */
#include "llviewerprecompiledheaders.h"

#include "llfft.h"
#include "llerror.h"

/*
**
*********************************************************************
** Forward and inverse discrete Fourier transforms on complex data **
*********************************************************************
**
**
** forward_fft(array, rows, cols)
**               COMPLEX *array;
**               S32 rows, cols;
**
** inverse_fft(array, rows, cols)
**               COMPLEX *array;
**               S32 rows, cols;
**
** These entry points compute the forward and inverse DFT's, respectively, 
** of a single-precision COMPLEX array.
**
** The result is a COMPLEX array of the same size, returned in 
** the same space as the input array.  That is, the original array is
** overwritten and destroyed.
**
** Rows and columns must each be an integral power of 2.
**
** These routines return integer value -1 if an error was detected,
** 0 otherwise
**
** This implementation of the DFT uses the transform pair defined as follows.
**
** Let there be two COMPLEX arrays each with n rows and m columns
** Index them as 
** f(x,y):    0 <= x <= m - 1,  0 <= y <= n - 1
** F(u,v):    -m/2 <= u <= m/2 - 1,  -n/2 <= v <= n/2 - 1
**
** Then the forward and inverse transforms are related as
**
** Forward:
**
** F(u,v) = \sum_{x=0}^{m-1} \sum_{y=0}^{n-1} 
**                      f(x,y) \exp{-2\pi i (ux/m + vy/n)}
**
**
** Inverse:
**
** f(x,y) = 1/(mn) \sum_{u=-m/2}^{m/2-1} \sum_{v=-n/2}^{n/2-1} 
**                      F(u,v) \exp{2\pi i (ux/m + vy/n)}
**  
** Therefore, the transforms have these properties:
** 1.  \sum_x \sum_y  f(x,y) = F(0,0)
** 2.  m n \sum_x \sum_y |f(x,y)|^2 = \sum_u \sum_v |F(u,v)|^2
**
*/


//DPCOMPLEX 	*stageBuff;  /* buffer to hold a row or column at a time */
//COMPLEX 	*bigBuff;    /* a pointer to the input array */


/*
 *	These macros move complex data between bigBuff and
 *	stageBuff
 */

inline void LoadRow(DPCOMPLEX* stageBuff, COMPLEX* bigBuff, U32 row, U32 cols)
{
    for (U32 j = row*cols, k = 0 ; k < cols ; j++, k++)
	{
		stageBuff[k].re = bigBuff[j].re;
		stageBuff[k].im = bigBuff[j].im;
	}
}

inline void StoreRow(DPCOMPLEX* stageBuff, COMPLEX* bigBuff, U32 row, U32 cols)
{
    for (U32 j = row*cols, k = 0 ; k < cols ; j++, k++)
	{
		bigBuff[j].re = (F32)stageBuff[k].re;
		bigBuff[j].im = (F32)stageBuff[k].im;
	}
}

inline void LoadCol(DPCOMPLEX* stageBuff, COMPLEX* bigBuff, U32 col, U32 rows, U32 cols)
{
    for (U32 j = col,k = 0 ; k < rows ; j+=cols, k++)
	{
		stageBuff[k].re = bigBuff[j].re;
		stageBuff[k].im = bigBuff[j].im;
	}
}

inline void StoreCol(DPCOMPLEX* stageBuff, COMPLEX* bigBuff, U32 col, U32 rows, U32 cols)
{
    for (U32 j = col,k = 0 ; k < rows ; j+=cols, k++)
	{
		bigBuff[j].re = (F32)stageBuff[k].re;
		bigBuff[j].im = (F32)stageBuff[k].im;
	}
}


/* do something with an error message */
inline void handle_error(S8* msg)
{
	llerrs << msg << llendl;
}


/* 
** compute DFT: forward if direction==0, inverse if direction==1
** array must be COMPLEX 
*/ 
BOOL fft(const LLFFTPlan& plan, COMPLEX *array, S32 rows, S32 cols, S32 direction)
{
	S32 i;
	
	if (!plan.valid() || plan.rows() != rows || plan.cols() != cols)
		return FALSE;
	
	/* compute transform row by row */
	
	if(cols>1)
	{
		for(i=0;i<rows;i++) 
		{
			LoadRow(plan.buffer(), array, i, cols);
			FFT842(direction, cols, plan.buffer());
			StoreRow(plan.buffer(), array, i, cols);
		}
	}
	
	if(rows<2) /* done */
	{
		//freeBuffer();
		return TRUE;
	}
	
	
	/* compute transform column by column */
	
	for(i=0;i<cols;i++) 
	{
		LoadCol(plan.buffer(), array, i, rows, cols);
		FFT842(direction, rows, plan.buffer());
		StoreCol(plan.buffer(), array, i, rows, cols);
	}
	
	//freeBuffer();
	
	return TRUE;
}



/*
** FFT842
** This routine replaces the input DPCOMPLEX vector by its
** finite discrete complex fourier transform if in==0.
** It replaces the input DPCOMPLEX vector by its
** finite discrete complex inverse fourier transform if in==1.
**
**     in - FORWARD or INVERSE
**     n - length of vector
**     b - input vector
**
** It performs as many base 8 iterations as possible and
** then finishes with a base 4 iteration or a base 2 
** iteration if needed.
**
** Ported from the FORTRAN code in Programming for Digital Signal Processing,
** IEEE Press 1979, Section 1, by G. D. Bergland and M. T. Dolan
**
*/
void FFT842(S32 in, S32 n, DPCOMPLEX *b)
{
	F64 fn, r, fi;
	
	S32 L[16],L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15;
	S32 j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14;
	S32 i, j, ij, ji, ij1, ji1;
	S32 n2pow, n8pow, nthpo, ipass, nxtlt, lengt;
	
	n2pow = fastlog2(n);
	nthpo = n;
	fn = 1.0 / (F64)nthpo;
	
	
	if(in==FORWARD)
	{
		/* take conjugate  */
		for(i=0;i<n;i++)
		{
			b[i].im *= -1.0;
		}
	}
	
	if(in==INVERSE)
	{
		/* scramble inputs */
		for(i=0,j=n/2;j<n;i++,j++) 
		{
			r  = b[j].re;
			fi = b[j].im;
			b[j].re = b[i].re;
			b[j].im = b[i].im;
			b[i].re = r;
			b[i].im = fi;
		}
	}
	
	n8pow = n2pow/3;
	
	if(n8pow) 
    {
		/* radix 8 iterations */
		for(ipass=1;ipass<=n8pow;ipass++) 
		{
			nxtlt = 0x1 << (n2pow - 3*ipass);
			lengt = 8*nxtlt;
			R8TX(nxtlt,nthpo,lengt,
				b,b+nxtlt,b+2*nxtlt,
				b+3*nxtlt,b+4*nxtlt,b+5*nxtlt,
				b+6*nxtlt,b+7*nxtlt);
		}
    }
	
	if(n2pow%3 == 1) 
    {
		/* radix 2 iteration needed */
		R2TX(nthpo,b,b+1); 
    }
	
	
	if(n2pow%3 == 2)  
    {
		/* radix 4 iteration needed */
		R4TX(nthpo,b,b+1,b+2,b+3); 
    }
	
	
	
	for(j=1;j<=15;j++) 
    {
		L[j] = 1;
		if(j-n2pow <= 0) L[j] = 0x1 << (n2pow + 1 - j);
    }
	L15=L[1];L14=L[2];L13=L[3];L12=L[4];L11=L[5];L10=L[6];L9=L[7];
	L8=L[8];L7=L[9];L6=L[10];L5=L[11];L4=L[12];L3=L[13];L2=L[14];L1=L[15];
	
	ij = 1;
    
	for(j1=1;j1<=L1;j1++)
		for(j2=j1;j2<=L2;j2+=L1)
			for(j3=j2;j3<=L3;j3+=L2)
				for(j4=j3;j4<=L4;j4+=L3)
					for(j5=j4;j5<=L5;j5+=L4)
						for(j6=j5;j6<=L6;j6+=L5)
							for(j7=j6;j7<=L7;j7+=L6)
								for(j8=j7;j8<=L8;j8+=L7)
									for(j9=j8;j9<=L9;j9+=L8)
										for(j10=j9;j10<=L10;j10+=L9)
											for(j11=j10;j11<=L11;j11+=L10)
												for(j12=j11;j12<=L12;j12+=L11)
													for(j13=j12;j13<=L13;j13+=L12)
														for(j14=j13;j14<=L14;j14+=L13)
															for(ji=j14;ji<=L15;ji+=L14) 
															{
																ij1 = ij-1;
																ji1 = ji-1;
																
																if(ij-ji<0) 
																{
																	r = b[ij1].re;
																	b[ij1].re = b[ji1].re;
																	b[ji1].re = r;
																	fi = b[ij1].im;
																	b[ij1].im = b[ji1].im;
																	b[ji1].im = fi;
																}
																ij++;
															}
															
															if(in==FORWARD)  // take conjugates & unscramble outputs
															{
																for(i=0,j=n/2;j<n;i++,j++)
																{
																	r = b[j].re; 
																	fi = b[j].im;
																	b[j].re = b[i].re;
																	b[j].im = -b[i].im;
																	b[i].re = r;
																	b[i].im = -fi;
																}
															}

															if(in==INVERSE) // scale outputs
															{
																for(i=0;i<nthpo;i++) 
																{
																	b[i].re *= fn;
																	b[i].im *= fn;
																}
															}
}


/*
** radix 2 iteration subroutine
*/
void R2TX(S32 nthpo, DPCOMPLEX *c0, DPCOMPLEX *c1)
{
	S32 k,kk;
	F64 *cr0, *ci0, *cr1, *ci1, r1, fi1;
	
	cr0 = &(c0[0].re);
	ci0 = &(c0[0].im);
	cr1 = &(c1[0].re);
	ci1 = &(c1[0].im);
	
	for(k = 0; k < nthpo; k += 2) 
    {
		kk = k*2;
		
		r1 = cr0[kk] + cr1[kk];
		cr1[kk] = cr0[kk] - cr1[kk];
		cr0[kk] = r1;
		fi1 = ci0[kk] + ci1[kk];
		ci1[kk] = ci0[kk] - ci1[kk];
		ci0[kk] = fi1;
    }
}


/*
** radix 4 iteration subroutine
*/
void R4TX(S32 nthpo, DPCOMPLEX *c0, DPCOMPLEX *c1, DPCOMPLEX *c2, DPCOMPLEX *c3)
{
	S32 k,kk;
	F64 *cr0, *ci0, *cr1, *ci1, *cr2, *ci2, *cr3, *ci3;
	F64 r1,r2,r3,r4,i1,i2,i3,i4;
	
	cr0 = &(c0[0].re);
	cr1 = &(c1[0].re);
	cr2 = &(c2[0].re);
	cr3 = &(c3[0].re);
	ci0 = &(c0[0].im);
	ci1 = &(c1[0].im);
	ci2 = &(c2[0].im);
	ci3 = &(c3[0].im);
	
	for(k = 1; k <= nthpo; k += 4) 
    {
		kk = (k-1)*2;  /* real and imag parts alternate */
		
		r1 = cr0[kk] + cr2[kk];
		r2 = cr0[kk] - cr2[kk];
		r3 = cr1[kk] + cr3[kk];
		r4 = cr1[kk] - cr3[kk];
		i1 = ci0[kk] + ci2[kk];
		i2 = ci0[kk] - ci2[kk];
		i3 = ci1[kk] + ci3[kk];
		i4 = ci1[kk] - ci3[kk];
		cr0[kk] = r1 + r3;
		ci0[kk] = i1 + i3;
		cr1[kk] = r1 - r3;
		ci1[kk] = i1 - i3;
		cr2[kk] = r2 - i4;
		ci2[kk] = i2 + r4;
		cr3[kk] = r2 + i4;
		ci3[kk] = i2 - r4;
    }
}


	
/*
** radix 8 iteration subroutine
*/
void R8TX(S32 nxtlt, S32 nthpo, S32 lengt, DPCOMPLEX *cc0, DPCOMPLEX *cc1, DPCOMPLEX *cc2,
		  DPCOMPLEX *cc3, DPCOMPLEX *cc4, DPCOMPLEX *cc5, DPCOMPLEX *cc6, DPCOMPLEX *cc7)
{
	S32 j,k,kk;
	F64 scale, arg, tr, ti;
	F64 c1,c2,c3,c4,c5,c6,c7;
	F64 s1,s2,s3,s4,s5,s6,s7;
	F64 ar0,ar1,ar2,ar3,ar4,ar5,ar6,ar7;
	F64 ai0,ai1,ai2,ai3,ai4,ai5,ai6,ai7;
	F64 br0,br1,br2,br3,br4,br5,br6,br7;
	F64 bi0,bi1,bi2,bi3,bi4,bi5,bi6,bi7;
	
	F64 *cr0,*cr1,*cr2,*cr3,*cr4,*cr5,*cr6,*cr7;
	F64 *ci0,*ci1,*ci2,*ci3,*ci4,*ci5,*ci6,*ci7;
	
	cr0 = &(cc0[0].re);
	cr1 = &(cc1[0].re);
	cr2 = &(cc2[0].re);
	cr3 = &(cc3[0].re);
	cr4 = &(cc4[0].re);
	cr5 = &(cc5[0].re);
	cr6 = &(cc6[0].re);
	cr7 = &(cc7[0].re);
	
	ci0 = &(cc0[0].im);
	ci1 = &(cc1[0].im);
	ci2 = &(cc2[0].im);
	ci3 = &(cc3[0].im);
	ci4 = &(cc4[0].im);
	ci5 = &(cc5[0].im);
	ci6 = &(cc6[0].im);
	ci7 = &(cc7[0].im);
	
	
	scale = F_TWO_PI/lengt;
	
	for(j = 1; j <= nxtlt; j++) 
    {
		arg = (j-1)*scale;
		c1 = cos(arg);
		s1 = sin(arg);
		c2 = c1*c1 - s1*s1;
		s2 = c1*s1 + c1*s1;
		c3 = c1*c2 - s1*s2;
		s3 = c2*s1 + s2*c1;
		c4 = c2*c2 - s2*s2;
		s4 = c2*s2 + c2*s2;
		c5 = c2*c3 - s2*s3;
		s5 = c3*s2 + s3*c2;
		c6 = c3*c3 - s3*s3;
		s6 = c3*s3 + c3*s3;
		c7 = c3*c4 - s3*s4;
		s7 = c4*s3 + s4*c3;
		
		for(k = j; k <= nthpo; k += lengt) 
		{
			kk = (k-1)*2; /* index by twos; re & im alternate */
			
			ar0 = cr0[kk] + cr4[kk];
			ar1 = cr1[kk] + cr5[kk];
			ar2 = cr2[kk] + cr6[kk];
			ar3 = cr3[kk] + cr7[kk];
			ar4 = cr0[kk] - cr4[kk];
			ar5 = cr1[kk] - cr5[kk];
			ar6 = cr2[kk] - cr6[kk];
			ar7 = cr3[kk] - cr7[kk];
			ai0 = ci0[kk] + ci4[kk];
			ai1 = ci1[kk] + ci5[kk];
			ai2 = ci2[kk] + ci6[kk];
			ai3 = ci3[kk] + ci7[kk];
			ai4 = ci0[kk] - ci4[kk];
			ai5 = ci1[kk] - ci5[kk];
			ai6 = ci2[kk] - ci6[kk];
			ai7 = ci3[kk] - ci7[kk];
			br0 = ar0 + ar2;
			br1 = ar1 + ar3;
			br2 = ar0 - ar2;
			br3 = ar1 - ar3;
			br4 = ar4 - ai6;
			br5 = ar5 - ai7;
			br6 = ar4 + ai6;
			br7 = ar5 + ai7;
			bi0 = ai0 + ai2;
			bi1 = ai1 + ai3;
			bi2 = ai0 - ai2;
			bi3 = ai1 - ai3;
			bi4 = ai4 + ar6;
			bi5 = ai5 + ar7;
			bi6 = ai4 - ar6;
			bi7 = ai5 - ar7;
			cr0[kk] = br0 + br1;
			ci0[kk] = bi0 + bi1;
			if(j > 1) 
			{
				cr1[kk] = c4*(br0-br1) - s4*(bi0-bi1);
				cr2[kk] = c2*(br2-bi3) - s2*(bi2+br3);
				cr3[kk] = c6*(br2+bi3) - s6*(bi2-br3);
				ci1[kk] = c4*(bi0-bi1) + s4*(br0-br1);
				ci2[kk] = c2*(bi2+br3) + s2*(br2-bi3);
				ci3[kk] = c6*(bi2-br3) + s6*(br2+bi3);
				tr = OO_SQRT2*(br5-bi5);
				ti = OO_SQRT2*(br5+bi5);
				cr4[kk] = c1*(br4+tr) - s1*(bi4+ti);
				ci4[kk] = c1*(bi4+ti) + s1*(br4+tr);
				cr5[kk] = c5*(br4-tr) - s5*(bi4-ti);
				ci5[kk] = c5*(bi4-ti) + s5*(br4-tr);
				tr = -OO_SQRT2*(br7+bi7);
				ti = OO_SQRT2*(br7-bi7);
				cr6[kk] = c3*(br6+tr) - s3*(bi6+ti);
				ci6[kk] = c3*(bi6+ti) + s3*(br6+tr);
				cr7[kk] = c7*(br6-tr) - s7*(bi6-ti);
				ci7[kk] = c7*(bi6-ti) + s7*(br6-tr);
			}
			else 
			{
				cr1[kk] = br0 - br1;
				cr2[kk] = br2 - bi3;
				cr3[kk] = br2 + bi3;
				ci1[kk] = bi0 - bi1;
				ci2[kk] = bi2 + br3;
				ci3[kk] = bi2 - br3;
				tr = OO_SQRT2*(br5-bi5);
				ti = OO_SQRT2*(br5+bi5);
				cr4[kk] = br4 + tr;
				ci4[kk] = bi4 + ti;
				cr5[kk] = br4 - tr;
				ci5[kk] = bi4 - ti;
				tr = -OO_SQRT2*(br7+bi7);
				ti = OO_SQRT2*(br7-bi7);
				cr6[kk] = br6 + tr;
				ci6[kk] = bi6 + ti;
				cr7[kk] = br6 - tr;
				ci7[kk] = bi6 - ti;
			}
		}
    }
}
	

/* see if exactly one bit is set in integer argument */
S32 power_of_2(S32 n)
{
	S32 bits=0;
	while(n)
	{
		bits += n & 1;
		n >>= 1;
	}
	return(bits==1);
}

/* get binary log of integer argument; exact if n a power of 2 */
S32 fastlog2(S32 n)
{
	S32 log = -1;
	while(n)
	{
		log++;
		n >>= 1;
	}
	return(log);
}