
#include <iostream.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>

#ifdef SET_PRIORITY
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

#ifndef NUM_VECTORS
#define NUM_VECTORS 100000
#endif

// Externally defined routine (C linkage, __stdcall) that this driver benchmarks.
//   matrix  - read-only float data; the caller in this file passes 16 floats
//   vectors - read-only float data; the caller passes 4 floats per vector,
//             starting on a 16-byte boundary
//   num_vec - number of vectors to process
//   output  - destination buffer; the caller reserves 2 ints per vector,
//             starting on an 8-byte boundary
// The return value is reported by the caller as an elapsed cycle count.
// NOTE(review): the exact alignment requirements and output layout are defined
// by the implementation, which is not part of this file - confirm there.
extern "C" int __stdcall matbyvec (
   const float* matrix,
   const float* vectors,
   int num_vec,
   int* output
);

// Externally defined routine (C linkage, __stdcall). The caller treats a zero
// return value as "Streaming SIMD Extensions are not available" and aborts.
extern "C" int __stdcall check_sse();

// Benchmark driver: builds NUM_VECTORS random 4-component vectors, runs them
// through matbyvec() together with a fixed 4x4 matrix, and prints the cycle
// count that matbyvec() reports.
//
// Returns EXIT_FAILURE when check_sse() reports that SSE is unavailable,
// EXIT_SUCCESS otherwise.
int main (void)
{
   // SSE test
   if (!check_sse()) {
      cout << "Your processor does not support Streaming SIMD Extensions" << endl;
      return EXIT_FAILURE;
   }

#ifdef SET_PRIORITY
   // This should improve performance on Win9x (no improvement on WinNT)
   SetPriorityClass (GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
   SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
#endif

   // Prepare data
   float matrix[] = { 1.0f, 2.0f, 0.94f, 2.4f,
                      4.4f, 1.0f, 0.1f,  11.2f,
                      0.1f, 0.9f, 3.1f,  0.01f,
                      0.0f, 0.0f, 0.0f,  1.0f };

   // Both buffers are over-allocated so that the aligned start computed below
   // still leaves room for all NUM_VECTORS entries.
   float* vectors = new float[4*NUM_VECTORS+32];
   int* output = new int[2*NUM_VECTORS+8];

   // Round the start addresses up to the next boundary: 16 bytes for the
   // input vectors, 8 bytes for the output. The arithmetic is done in size_t
   // rather than int so the address is not truncated or sign-mangled when
   // pointers are wider than int or live in the upper half of the address space.
   const size_t vec_mask = 16 - 1;
   const size_t out_mask = 8 - 1;
   float* aligned_vectors = (float*) ( ((size_t)vectors + vec_mask) & ~vec_mask );
   int*   aligned_output  = (int*  ) ( ((size_t)output  + out_mask) & ~out_mask );

   // Fill vectors: x, y, z are pseudo-random values in [-100, 100]
   // (100 * sin of an angle in [0, 6.282]); w is always 1.
   srand ((unsigned int)time(NULL));
   for (int i=0; i < 4*NUM_VECTORS;) {
      aligned_vectors[i++] = 100.0f*(float)sin((float)(rand() % 6283)/1000);
      aligned_vectors[i++] = 100.0f*(float)sin((float)(rand() % 6283)/1000);
      aligned_vectors[i++] = 100.0f*(float)sin((float)(rand() % 6283)/1000);
      aligned_vectors[i++] = 1.0f;
   }

   // Execute
   int num_cycles = matbyvec (matrix, aligned_vectors, NUM_VECTORS, aligned_output);

   // Output result (integer division: cycles per point is truncated)
   cout << "Done in " << num_cycles << " cycles (" << (num_cycles / NUM_VECTORS)
      << " cycles per point)" << endl;

   // Cleanup: release the original (unaligned) pointers returned by new[]
   delete [] output;
   delete [] vectors;

   return EXIT_SUCCESS;
}
