#pragma OPENCL EXTENSION cl_amd_printf : enable

#define uint32_t uint
#define uint64_t ulong

// Fixed-size word arrays used as counters and keys by the generators below.
// The name encodes the layout: r123array<N>x<W> holds N words of W bits in v.
struct r123array1x32 {
    uint32_t v[1];
};
struct r123array2x32 {
    uint32_t v[2];
};
struct r123array4x32 {
    uint32_t v[4];
};
struct r123array8x32 {
    uint32_t v[8];
};
struct r123array1x64 {
    uint64_t v[1];
};
struct r123array2x64 {
    uint64_t v[2];
};
struct r123array4x64 {
    uint64_t v[4];
};

// Forms the full 64-bit product of two 32-bit operands.
//   a, b : factors
//   hip  : receives the upper 32 bits of the product
// Returns the lower 32 bits of the product.
inline uint32_t mulhilo32(uint32_t a, uint32_t b, uint32_t* hip)
{
    const uint64_t wide = (uint64_t)a * (uint64_t)b;
    *hip = (uint32_t)(wide >> 32);
    return (uint32_t)wide;
}

// 64-bit counterpart of mulhilo32.
//   a, b : factors
//   hip  : receives mul_hi(a, b)
// Returns a*b, i.e. the product truncated to 64 bits.
inline uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t* hip)
{
    const uint64_t low = a * b;
    *hip = mul_hi(a, b);
    return low;
}

// Returns the key with the constant 0x9E3779B9 added to its single word
// (unsigned 32-bit addition, so the sum wraps).
inline struct r123array1x32 _philox2x32bumpkey(struct r123array1x32 key)
{
    const uint32_t step = (uint32_t)0x9E3779B9;
    key.v[0] = key.v[0] + step;
    return key;
}

// Returns the key with a fixed constant added to each of its two words:
// 0x9E3779B9 to word 0 and 0xBB67AE85 to word 1 (wrapping 32-bit addition).
inline struct r123array2x32 _philox4x32bumpkey(struct r123array2x32 key)
{
    const uint32_t step0 = (uint32_t)0x9E3779B9;
    const uint32_t step1 = (uint32_t)0xBB67AE85;
    key.v[0] = key.v[0] + step0;
    key.v[1] = key.v[1] + step1;
    return key;
}

// One round of the 2x32 generator.
// Multiplies counter word 0 by 0xd256d193; the new word 0 is the high half of
// that product XORed with the key word and the old counter word 1, and the new
// word 1 is the low half of the product.
inline struct r123array2x32 _philox2x32round(struct r123array2x32 ctr, struct r123array1x32 key) __attribute__((always_inline));
inline struct r123array2x32 _philox2x32round(struct r123array2x32 ctr, struct r123array1x32 key)
{
    uint32_t high;
    const uint32_t low = mulhilo32((uint32_t)0xd256d193, ctr.v[0], &high);

    struct r123array2x32 next;
    next.v[0] = high ^ key.v[0] ^ ctr.v[1];
    next.v[1] = low;
    return next;
}

// One round of the 4x32 generator.
// Counter words 0 and 2 are each multiplied by their own constant; the output
// interleaves the high halves (XORed with a key word and an odd counter word)
// with the low halves, in the order written below.
inline struct r123array4x32 _philox4x32round(struct r123array4x32 ctr, struct r123array2x32 key) __attribute__((always_inline));
inline struct r123array4x32 _philox4x32round(struct r123array4x32 ctr, struct r123array2x32 key)
{
    uint32_t highA;
    uint32_t highB;
    const uint32_t lowA = mulhilo32((uint32_t)0xD2511F53, ctr.v[0], &highA);
    const uint32_t lowB = mulhilo32((uint32_t)0xCD9E8D57, ctr.v[2], &highB);

    struct r123array4x32 next;
    next.v[0] = highB ^ ctr.v[1] ^ key.v[0];
    next.v[1] = lowB;
    next.v[2] = highA ^ ctr.v[3] ^ key.v[1];
    next.v[3] = lowA;
    return next;
}

// Named round count for the 2x32 generator.
enum r123_enum_philox2x32 {
    philox2x32_rounds = 10
};

// 2x32 generator types: a two-word counter and a one-word key / user key.
typedef struct r123array2x32 philox2x32_ctr_t;
typedef struct r123array1x32 philox2x32_key_t;
typedef struct r123array1x32 philox2x32_ukey_t;

// Turns a user key into a generator key; the value is returned unchanged.
inline philox2x32_key_t philox2x32keyinit(philox2x32_ukey_t uk)
{
    return uk;
}

// Runs the 2x32 generator for R rounds and returns the resulting counter.
//   R   : requested number of rounds; at most 16 are performed
//   ctr : input counter
//   key : input key, used as-is for the first round and bumped before each
//         subsequent round
inline philox2x32_ctr_t philox2x32_R(unsigned int R, philox2x32_ctr_t ctr, philox2x32_key_t key) __attribute__((always_inline));
inline philox2x32_ctr_t philox2x32_R(unsigned int R, philox2x32_ctr_t ctr, philox2x32_key_t key)
{
    const unsigned int rounds = (R < 16u) ? R : 16u;

    for (unsigned int i = 0u; i < rounds; ++i) {
        // The first round uses the caller's key unmodified.
        if (i != 0u) {
            key = _philox2x32bumpkey(key);
        }
        ctr = _philox2x32round(ctr, key);
    }
    return ctr;
}

// Named round count for the 4x32 generator.
enum r123_enum_philox4x32 {
    philox4x32_rounds = 10
};

// 4x32 generator types: a four-word counter and a two-word key / user key.
typedef struct r123array4x32 philox4x32_ctr_t;
typedef struct r123array2x32 philox4x32_key_t;
typedef struct r123array2x32 philox4x32_ukey_t;

// Turns a user key into a generator key; the value is returned unchanged.
inline philox4x32_key_t philox4x32keyinit(philox4x32_ukey_t uk)
{
    return uk;
}

// Runs the 4x32 generator for R rounds and returns the resulting counter.
//   R   : requested number of rounds; at most 16 are performed
//   ctr : input counter
//   key : input key, used as-is for the first round and bumped before each
//         subsequent round
inline philox4x32_ctr_t philox4x32_R(unsigned int R, philox4x32_ctr_t ctr, philox4x32_key_t key) __attribute__((always_inline));
inline philox4x32_ctr_t philox4x32_R(unsigned int R, philox4x32_ctr_t ctr, philox4x32_key_t key)
{
    const unsigned int rounds = (R < 16u) ? R : 16u;

    for (unsigned int i = 0u; i < rounds; ++i) {
        // The first round uses the caller's key unmodified.
        if (i != 0u) {
            key = _philox4x32bumpkey(key);
        }
        ctr = _philox4x32round(ctr, key);
    }
    return ctr;
}

// Returns the key with the constant 0x9E3779B97F4A7C15 added to its single
// word (unsigned 64-bit addition, so the sum wraps).
inline struct r123array1x64 _philox2x64bumpkey(struct r123array1x64 key)
{
    const ulong step = (ulong)(0x9E3779B97F4A7C15UL);
    key.v[0] = key.v[0] + step;
    return key;
}

// Returns the key with a fixed constant added to each of its two words:
// 0x9E3779B97F4A7C15 to word 0 and 0xBB67AE8584CAA73B to word 1
// (wrapping 64-bit addition).
inline struct r123array2x64 _philox4x64bumpkey(struct r123array2x64 key)
{
    const ulong step0 = (ulong)(0x9E3779B97F4A7C15UL);
    const ulong step1 = (ulong)(0xBB67AE8584CAA73BUL);
    key.v[0] = key.v[0] + step0;
    key.v[1] = key.v[1] + step1;
    return key;
}

// One round of the 2x64 generator.
// Multiplies counter word 0 by 0xD2B74407B1CE6E93; the new word 0 is the high
// half of that product XORed with the key word and the old counter word 1, and
// the new word 1 is the low half of the product.
inline struct r123array2x64 _philox2x64round(struct r123array2x64 ctr, struct r123array1x64 key) __attribute__((always_inline));
inline struct r123array2x64 _philox2x64round(struct r123array2x64 ctr, struct r123array1x64 key)
{
    uint64_t high;
    const uint64_t low = mulhilo64((ulong)(0xD2B74407B1CE6E93UL), ctr.v[0], &high);

    struct r123array2x64 next;
    next.v[0] = high ^ key.v[0] ^ ctr.v[1];
    next.v[1] = low;
    return next;
}

// One round of the 4x64 generator.
// Counter words 0 and 2 are each multiplied by their own constant; the output
// interleaves the high halves (XORed with a key word and an odd counter word)
// with the low halves, in the order written below.
inline struct r123array4x64 _philox4x64round(struct r123array4x64 ctr, struct r123array2x64 key) __attribute__((always_inline));
inline struct r123array4x64 _philox4x64round(struct r123array4x64 ctr, struct r123array2x64 key)
{
    uint64_t highA;
    uint64_t highB;
    const uint64_t lowA = mulhilo64((ulong)(0xD2E7470EE14C6C93UL), ctr.v[0], &highA);
    const uint64_t lowB = mulhilo64((ulong)(0xCA5A826395121157UL), ctr.v[2], &highB);

    struct r123array4x64 next;
    next.v[0] = highB ^ ctr.v[1] ^ key.v[0];
    next.v[1] = lowB;
    next.v[2] = highA ^ ctr.v[3] ^ key.v[1];
    next.v[3] = lowA;
    return next;
}

// Named round count for the 2x64 generator.
enum r123_enum_philox2x64 {
    philox2x64_rounds = 10
};

// 2x64 generator types: a two-word counter and a one-word key / user key.
typedef struct r123array2x64 philox2x64_ctr_t;
typedef struct r123array1x64 philox2x64_key_t;
typedef struct r123array1x64 philox2x64_ukey_t;

// Turns a user key into a generator key; the value is returned unchanged.
inline philox2x64_key_t philox2x64keyinit(philox2x64_ukey_t uk)
{
    return uk;
}

// Runs the 2x64 generator for R rounds and returns the resulting counter.
//   R   : requested number of rounds; at most 16 are performed
//   ctr : input counter
//   key : input key, used as-is for the first round and bumped before each
//         subsequent round
inline philox2x64_ctr_t philox2x64_R(unsigned int R, philox2x64_ctr_t ctr, philox2x64_key_t key) __attribute__((always_inline));
inline philox2x64_ctr_t philox2x64_R(unsigned int R, philox2x64_ctr_t ctr, philox2x64_key_t key)
{
    const unsigned int rounds = (R < 16u) ? R : 16u;

    for (unsigned int i = 0u; i < rounds; ++i) {
        // The first round uses the caller's key unmodified.
        if (i != 0u) {
            key = _philox2x64bumpkey(key);
        }
        ctr = _philox2x64round(ctr, key);
    }
    return ctr;
}

// Named round count for the 4x64 generator.
enum r123_enum_philox4x64 {
    philox4x64_rounds = 10
};

// 4x64 generator types: a four-word counter and a two-word key / user key.
typedef struct r123array4x64 philox4x64_ctr_t;
typedef struct r123array2x64 philox4x64_key_t;
typedef struct r123array2x64 philox4x64_ukey_t;

// Turns a user key into a generator key; the value is returned unchanged.
inline philox4x64_key_t philox4x64keyinit(philox4x64_ukey_t uk)
{
    return uk;
}

// Runs the 4x64 generator for R rounds and returns the resulting counter.
//   R   : requested number of rounds; at most 16 are performed
//   ctr : input counter
//   key : input key, used as-is for the first round and bumped before each
//         subsequent round
inline philox4x64_ctr_t philox4x64_R(unsigned int R, philox4x64_ctr_t ctr, philox4x64_key_t key) __attribute__((always_inline));
inline philox4x64_ctr_t philox4x64_R(unsigned int R, philox4x64_ctr_t ctr, philox4x64_key_t key)
{
    const unsigned int rounds = (R < 16u) ? R : 16u;

    for (unsigned int i = 0u; i < rounds; ++i) {
        // The first round uses the caller's key unmodified.
        if (i != 0u) {
            key = _philox4x64bumpkey(key);
        }
        ctr = _philox4x64round(ctr, key);
    }
    return ctr;
}

// Produces a float from a two-component seed using 10 rounds of philox2x32.
//   rvec.x : copied into both counter words
//   rvec.y : used as the key word
// Returns the first output word converted to float and divided by
// (float)(0xFFFFFFFF).
inline float randFloat(uint2 rvec)
{
    philox2x32_ctr_t counter;
    philox2x32_key_t seedKey;

    counter.v[0] = rvec.x;
    counter.v[1] = rvec.x;
    seedKey.v[0] = rvec.y;

    const struct r123array2x32 bits = philox2x32_R(10, counter, seedKey);

    const float numerator = (float)(bits.v[0]);
    const float denominator = (float)(0xFFFFFFFF);
    return numerator / denominator;
}
// Produces a float offset from lo, using 10 rounds of philox2x32.
//   lo, hi : bounds used to scale the generated word
//   rvec.x : copied into both counter words
//   rvec.y : used as the key word
// Returns lo + word / ((float)(0xFFFFFFFF) / (hi - lo)), where word is the
// first output word converted to float.
inline float randFloatRange(float lo, float hi, uint2 rvec)
{
    philox2x32_ctr_t counter;
    philox2x32_key_t seedKey;

    counter.v[0] = rvec.x;
    counter.v[1] = rvec.x;
    seedKey.v[0] = rvec.y;

    const struct r123array2x32 bits = philox2x32_R(10, counter, seedKey);

    const float span = hi - lo;
    const float divisor = (float)(0xFFFFFFFF) / span;
    return lo + (float)(bits.v[0]) / divisor;
}

// Multiplies a vector by the three given matrix rows.
// x, y and z of the result are the 4-component dot products of row0, row1 and
// row2 with the vector; w is copied from the input vector unchanged.
float4 matrixVectorMul(float4 row0, float4 row1, float4 row2, float4 vector )
{
	return (float4)(dot(row0, vector),
	                dot(row1, vector),
	                dot(row2, vector),
	                vector.w);
}

//sphericalCoord.x - r
//sphericalCoord.y - theta
//sphericalCoord.z - phi
/*
float4 cartesianToSpherical(float4 cartesianCoord)
{
	float r = sqrt( cartesianCoord.x * cartesianCoord.x + cartesianCoord.y * cartesianCoord.y + cartesianCoord.z * cartesianCoord.z );
	return (float4)(r, acos( cartesianCoord.z / r ), atan2( cartesianCoord.y, cartesianCoord.x ), 1.0);
}
float4 sphericalToCartesian(float4 sphericalCoord)
{
	return (float4)(sphericalCoord.x*sin(sphericalCoord.y)*cos(sphericalCoord.z), sphericalCoord.x*sin(sphericalCoord.y)*sin(sphericalCoord.z), sphericalCoord.x*cos(sphericalCoord.y), 1.0);
}
*/

/*
float4 getRandomPosOnConeTwo(float maxAngle, float scale, uint2* rvec, float4 row0, float4 row1, float4 row2)
{
	float4 v;
	v.x = randFloatRange(-maxAngle, maxAngle, rvec);
	v.y = randFloatRange(-maxAngle, maxAngle, rvec);
	v.z = 1.0f;//sqrt(1.0f - (v.x * v.x + v.y * v.y));
	v.w = 0.0f;
	v = normalize(v);
	if(v.x < (0.0 + randFloatRange(-0.05, 0.05, rvec))) v.w = 0.50196075;
	else v.w = 0.99999994;
	return matrixVectorMul(row0, row1, row2, v);

}
*/

#define PI 3.14159
/*
float4 getRandomPosOnConeThree(float4 dir, float maxAngle, float scale, uint2* rvec)
{
	float4 s = cartesianToSpherical(dir);

	
	s.x = scale;
	
	float thetaVariation = maxAngle; //randFloatRange(0.0, 3.0, rvec);// * randFloatRange(-1.0, 1.0, rvec);
	float phiVariation = randFloatRange(0.0, 0.1, rvec);
	s.y += thetaVariation;
	s.z += phiVariation;

	
	//(0 <= theta <= pi)
	if(s.y > PI) s.y -= PI;
	else if(s.y < 0) s.y += PI;

	//(0 <= phi < 2*pi)
	if(s.z >= 2*PI ) s.z -= 2*PI;
	else if (s.z < 0) s.z += 2*PI;
	/*

	//printf("T: %f ", s.z);
	volatile float color = s.z/6.0;

	//float p = s.z;
//if(p < 0) p += 6.28;
//if(p > 3.14) p = 3.14-(p-3.14);


	float4 c = sphericalToCartesian(s);
	//c.w = color;
	




//vM.w += 0.50;
//c.w = fmin((float)0.9, c.w);
//c.w = fmax((float)0.30, c.w);


return c;

}
*/

// Builds a vector on a cone and a derived value stored in its w component.
//   maxAngle       : used directly as the polar angle theta
//   scale          : multiplies all four components of the unit vector before
//                    the matrix transform
//   rvec           : seed passed to randFloatRange for the azimuth jitter
//   row0/row1/row2 : matrix rows applied through matrixVectorMul
//   sRot           : angle subtracted from phi before computing w
// Returns the transformed vector; w is overwritten with a value clamped to
// [0.30, 0.9] that depends on the folded difference between phi and sRot.
float4 getRandomPosOnCone(float maxAngle, float scale, uint2 rvec, float4 row0, float4 row1, float4 row2, float sRot)
{
	const float twoPi = 2*PI;
	const float theta = maxAngle;

	// Azimuth: a per-work-item base angle derived from the local id plus a
	// jitter drawn from randFloatRange(-0.30, 0.30, rvec).
	float phi = (get_local_id(0)*0.608318)+randFloatRange(-0.30, 0.30, rvec);

	// Reduce phi to (0 <= phi < 2*pi). The local-id term can span many full
	// turns, so a single add/subtract of 2*pi is not enough; fmod handles any
	// finite magnitude, and the follow-up add fixes the sign fmod keeps.
	phi = fmod(phi, twoPi);
	if (phi < 0) phi += twoPi;

	// Spherical-to-Cartesian with radius 1, then scaled. The scale is applied
	// to w as well, so the dot products in matrixVectorMul see w == scale.
	float4 v = (float4)(cos(phi)*sin(theta), sin(phi)*sin(theta), cos(theta), 1.0);
	v *= scale;
	float4 vM = matrixVectorMul(row0, row1, row2, v);

	// Fold the angular offset from sRot into [0, 3.14].
	// NOTE(review): this is a single-step fold, so it assumes phi - sRot lies
	// within [-6.28, 6.28] - confirm the range of sRot against the caller.
	float p = phi-sRot;
	if(p < 0) p += 6.28;
	if(p > 3.14) p = 3.14-(p-3.14);

	// Map the folded angle to w and clamp it to [0.30, 0.9].
	vM.w = p/2.50;
	vM.w = fmin((float)0.9, vM.w);
	vM.w = fmax((float)0.30, vM.w);

	return vM;
}

// Re-initialises particle slots from a cone emitter.
// Each work-item with a global id below workLimit handles one slot: the slot
// index is the global id, reduced by segmentLength when it is at or beyond
// segmentEnd. A slot is only touched when attributes[index].x is below 0.01f
// (presumably an expired particle - confirm against the update kernel).
// For such a slot the kernel:
//   - writes `position` to positions[index]
//   - writes the vector from getRandomPosOnCone to velocities[index], seeded
//     with (rvec[index], local id)
//   - writes (20.0, velocity.w, 0.0, 1.0) to attributes[index]
//   - increments rvec[index]
__kernel void EmitFromCone(__global float4* positions, __global float4* velocities, __global float4* attributes, float4 position, float4 row0, float4 row1, float4 row2, float coneAngle, float coneScale, int workLimit, int segmentEnd, int segmentLength, __global uint* rvec, float sRot)
{
	// Work-items past the limit have nothing to do.
	if(get_global_id(0) >= workLimit) return;

	uint index = get_global_id(0);
	if(index >= segmentEnd) index -= segmentLength;

	// Written as a negated '<' so a NaN attribute also skips the slot.
	if(!(attributes[index].x < 0.01f)) return;

	const uint2 randomSeed = (uint2)(rvec[index], get_local_id(0));
	const float4 velocity = getRandomPosOnCone(coneAngle, coneScale, randomSeed, row0, row1, row2, sRot);

	positions[index] = position;
	velocities[index] = velocity;
	attributes[index] = (float4)(20.0, velocity.w, 0.0, 1.0);
	rvec[index] += 1;
}
/*
__kernel void SpawnParticles(__global float4 *positions, __global float4 *velocities, __global float4 *attributes, __global float4* spawnPositions, __global float4* spawnVelocities, __global float4* spawnAttributes, int workLimit, int segmentEnd, int segmentLength)
{
	if(get_global_id(0) < workLimit)
	{
		uint index = get_global_id(0);
		if(index >= segmentEnd) index -= segmentLength;

		if(attributes[index].x < 0.0001f)
		{
			positions[index] = spawnPositions[index];
			velocities[index] = spawnVelocities[index];
			//TODO: mark segment?
			attributes[index] = spawnAttributes[index];
		}
	}
	else return;
}
*/