/* time_mp8.c  time parallelism of four processor multiprocessor */
/*             gcc -o time_mp8 -O3 time_mp8.c -lm -lpthread      */
/*             time_mp8 > time_mp8.out                           */

#define NTHREADS 8

/* Linux with glibc:
 *   _REENTRANT to grab thread-safe libraries
 *   _POSIX_SOURCE to get POSIX semantics
 */
#ifdef __linux__
#  define _REENTRANT
#  define _POSIX_SOURCE
#  define _P __P
#endif

#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

     /* FTYPE may be  double  or   float */
#define FTYPE double
void fft(FTYPE *b, FTYPE *a2, FTYPE *a1, int n, int sgn);
#define PI 3.141592653589793238462643

/* this is about 20 seconds computation per thread */
void *compute(void *arg)
{
  int myid=*(int *)arg;

  printf("compute %d: running  \n",myid);
  fflush(stdout); /* a good idea inside a thread */

  /* time_mp8  canned data, timing test */
  int i, j, k, m, n;
  FTYPE *a1, *a2, *b;
  int sgn = 1;	   /* default sign is 1	for FFT */
  FTYPE w;

  n = 65536; /* number of complex points */
  m = 32;    /* number of FFT and IFFT   */
  w = 2.0*PI/(FTYPE)n;

  printf("compute %d: allocating space for 4*%d elements\n", myid, n);
  b =  (FTYPE *) malloc( 2*n*sizeof(FTYPE) );  /* complex */
  a1 = (FTYPE *) malloc( n*sizeof(FTYPE) );    /* real */
  a2 = (FTYPE *) malloc( n*sizeof(FTYPE) );    /* real */
  printf("           running %d FFT and IFFT of length %d\n", m, n);
  fflush(stdout);

  for (i=0; i<m; i++)/* loop on data sets */
  {
    /* generate  b  test data, real, imag,  n items */
    for(j=0; j<n; j++)
    {
      *(b+2*j)   = cos(w*j)+cos(w*(i+1));
      *(b+2*j+1) = sin(w*j)+sin(w*(i+1+myid));
    }
    if(i==0 || i==m-1)
    {
      printf("bd(%d,%d)=%g,%g,%g,%g \n", i, j, *(b+0), *(b+1), *(b+2), *(b+3));
      fflush(stdout);
    }
    for(k=0; k<4; k++)
    {
      fft(b, a1, a2, n, sgn); /* eat up cpu time */
    }
    if(i==0 || i==m-1)
    {
      printf("bf(%d,%d)=%g,%g,%g,%g \n", i, j, *(b+0), *(b+1), *(b+2), *(b+3));
      fflush(stdout);
    }
    fft(b, a1, a2, n, -sgn);
    if(i==0 || i==m-1)
    {
      printf("bi(%d,%d)=%g,%g,%g,%g \n", i, j, *(b+0), *(b+1), *(b+2), *(b+3));
      fflush(stdout);
    }
  }
  return arg;
} /* end compute */

/* bitrev(a, k) -- reverse bits 0 thru k-1 in the integer "a" */
int bitrev(int a, int k)
{
  unsigned int i, b, p, q;
  for(i=b=0, p=1, q=1<<(k-1); i<k; i++, p<<=1, q>>=1) if(a & q) b|=p;
  return b;
} /* end bitrev */

/* ilog2(n) -- return an integer log, base 2 */
int ilog2(int n)
{
  int i;
  for (i=8*sizeof(int)-1; i>=0 && ((1<<i) & n)==0; i--);
  return i;
} /* end ilog2 */

/* fft(b, a2, a1, n, sgn) -- do an n-point fft of complex vector b and 
 * return the result in b.  b consists of 2*n FTYPE elements, organized 
 * as n complex pairs real_1, imag_1, real_2, imag_2, ..., real_n, imag_n.
 * the arrays a1 and a2 are used for working storage and each has n FTYPE 
 * elements.  sgn is 1 for an FFT, and -1 for an IFFT.  The procedure is
 * taken from the Cormen, Leiserson, and Rivest Algorithms text, in the 
 * section on efficient FFT implementations.  This implementation wastes 
 * some space; the b array should probably be dropped and the initial and  
 * final staging done in-place in the "a" arrays.
 */
void fft (FTYPE *b, FTYPE *a2, FTYPE *a1, int n, int sgn)
{
  int i, j, k, k2, s, m, log2n;
  double wm1, wm2, w1, w2, t1, t2, u1, u2;

  log2n = ilog2(n);

  /* reorder input and split input into real and complex parts */
  for (i=0; i<n; i++)
  {
    j = bitrev(i,log2n);
    a1[j] = b[2*i];
    a2[j] = b[2*i+1];
  }

  /* loop on FFT stages */
  for (s=1; s<=log2n; s++)
  {
    m = 1<<s;			/* m = 2^s */
    wm1 = cos(sgn*2*PI/m);	/* wm = exp(q*2*pi*i/m); */			
    wm2 = sin(sgn*2*PI/m);

    w1 = 1.0;
    w2 = 0.0;

    for (j=0; j<m/2; j++)
    {
      for (k=j; k<n; k+=m)
      {
	/* t = w*a[k+m/2]; */
	k2 = k+m/2;				
	t1 = w1 * a1[k2]  -  w2 * a2[k2] ;
	t2 = w1 * a2[k2]  +  w2 * a1[k2] ;
	u1 = a1[k];
	u2 = a2[k];
	a1[k] = u1 + t1;
	a2[k] = u2 + t2;
	a1[k2] = u1 - t1;
	a2[k2] = u2 - t2;
      }
      /* w = w * wm; */
      t1 = w1 * wm1  -  w2 * wm2 ;
      w2 = w1 * wm2  +  w2 * wm1 ;
      w1 = t1;
    }
  }
  
  /* flip the final stage */
  for (i=1; i<n/2; i++)
  {
    t1 = a1[i]; 
    a1[i] = a1[n-i]; 
    a1[n-i] = t1;
    t2 = a2[i]; 
    a2[i] = a2[n-i]; 
    a2[n-i] = t2;
  }

  /* copy out results */
  if (sgn == -1)
    /* scale by n for ifft */
    for (i=0; i<n; i++)
    {
      b[2*i] = a1[i] /= (FTYPE) n;
      b[2*i+1] = a2[i] /= (FTYPE) n;
    }
  else
    /* just copy the data out */
    for (i=0; i<n; i++)
    {
      b[2*i] = a1[i];
      b[2*i+1] = a2[i];
    }
} /* end fft */


/* this is the main thread's control code, no significant computation */
int main(int argc, char *argv[])
{
  int worker;
  int ids[NTHREADS];                          /* worker id number */
  pthread_t threads[NTHREADS];                /* holds thread info */
  int errcode;                                /* holds pthread error code */
  int *status;                                /* holds return code */
  double now, wnow;                           /* time CPU and wall */

  printf("time_mp8.c running \n");

  now = (double)clock()/(double)CLOCKS_PER_SEC;
  wnow = (double)time(NULL);

  /* create the threads */
  for (worker=0; worker<NTHREADS; worker++)
  {
    ids[worker]=worker; /* we are passing an address, 'worker' may change */
    if (errcode=pthread_create(&threads[worker],/* thread struct             */
		       NULL,                    /* default thread attributes */
		       compute,                 /* start thread              */
		       &ids[worker]))           /* arg to thread             */
    {
      printf("worker &d failed create: %s\n", worker, strerror(errcode));
      exit(1);
    }
  }
  /* wait for all threads to exit */
  for (worker=0; worker<NTHREADS; worker++)
  {
    if (errcode=pthread_join(threads[worker],(void *) &status))
    { 
      printf("worker &d failed join: %s\n", worker, strerror(errcode));
      exit(1);
    }
    /* check thread's exit status and release its resources */
    if (*status != worker)
    {
      printf("worker %d terminated abnormally \n", worker);
      exit(1);
    }
  }
  now = (double)clock()/(double)CLOCKS_PER_SEC - now;
  printf("total CPU time is %f seconds \n", now);
  wnow = (double)time(NULL) - wnow;
  printf("wall time is %f seconds \n", wnow);
  printf("average number of processors used = %f \n", now/wnow);
  printf("time_mp8.c exiting \n");
  return 0;
} /* end time_mp4.c */

