
// Compile-time switch: when set to 1, interpolate_normal_from_bc_yz() blends the
// per-vertex normals; with any other value it returns the reference normal unchanged.
#ifndef INTERPOLATE_NORMALS
#define INTERPOLATE_NORMALS 1
#endif

// Default bounce limit. Not referenced in this part of the file; the traversal
// below receives its limit through a max_bounces argument (presumably fed from
// this value by the caller - confirm).
#ifndef MAX_BOUNCES
#define MAX_BOUNCES 4
#endif

// Iteration budget for the DDA traversal loops: the number of grid steps a ray
// may take, counted across all of its bounces.
#ifndef MAX_TRACE_LENGTH
#define MAX_TRACE_LENGTH 512
#endif

// Grid marker texture; handed to rt_read_grid_marker_*() by the traversal helpers
// when RT_USE_TEXTURE_GRID_MARKERS_FOR_TRAVERSAL is defined.
uniform usampler3D  s_grid_marker;

// Handle handed to _rt_read_grid_marker_*() when the texture path is not used.
// NOTE(review): initialised to 0 here; presumably assigned a real buffer address
// before tracing starts - confirm against the including shader.
uint64_t            g_buff_grid_markers_ptr = 0;		// this is dangerous, but in DEBUG we do have some validation here

//--------------- DDA Intersection support ----------------------------------

// Distance limits for a traced ray.
struct ray_traversal_params
{
	float    trace_range_primary;      // not read in this part of the file; presumably the limit for the first segment - confirm
	float    trace_range_secondary;    // becomes the t limit of the DDA traversal once a hit restarts the stepping
};

// Bit flags used in ray_state.flags and in the traversal-local flag word.
const uint TF_NONE               = 0x00u;
const uint TF_IS_INSIDE          = 0x01u;   // current cell lies inside the grid
const uint TF_PREV_INSIDE        = 0x02u;   // an earlier cell of this traversal was inside the grid
const uint TF_HIT                = 0x04u;
const uint TF_EARLY_EXIT         = 0x08u;   // set together with TF_LEFT when the ray walks out of the grid
const uint TF_INSIDE_TRANSPARENT = 0x10u;
const uint TF_RUNNING            = 0x20u;   // traversal keeps stepping while this is set
const uint TF_LEFT               = 0x40u;   // ray left the grid
const uint TF_DEBUG              = 0x80u;   // enables debugPrintfEXT tracing

// True when at least one bit of `mask` is set in `f`.
bool rt_is_mask_set(uint f, uint mask)
{
	uint overlap = f & mask;
	return overlap != 0u;
}

// Returns `f` with all bits of `m` switched on.
uint rt_set_mask(uint f, uint m)
{
	return f | m;
}

// Returns `f` with the bits of `m` switched on when `v` holds, otherwise `f` unchanged.
uint rt_set_mask_if(uint f, uint m, bool v)
{
	return v ? (f | m) : f;
}

// Returns `f` with all bits of `m` switched off.
uint rt_clear_mask(uint f, uint m)
{
	return f & ~m;
}

// Returns `f` with the bits of `m` switched off when `v` holds, otherwise `f` unchanged.
uint rt_clear_mask_if(uint f, uint m, bool v)
{
	return v ? (f & ~m) : f;
}

// Returns `f` with every bit of `m` toggled.
uint rt_neg_mask(uint f, uint m)
{
	return f ^ m;
}

// Forces the bits of `m` in `f` to the state given by `v`: on when true, off when false.
uint rt_set_clear_mask_if(uint f, uint m, bool v)
{
	return v ? (f | m) : (f & ~m);
}

// Placeholder payload embedded in ray_state. A shader that defines
// RT_TRAVERSAL_HAS_USER_RAY_STATE must supply its own ray_state_user_data
// before this point, since this default is then compiled out.
#ifndef RT_TRAVERSAL_HAS_USER_RAY_STATE
struct ray_state_user_data
{
	uint dummy;
};
#endif

// Per-ray state carried through the traversal and handed to evaluate_material().
struct ray_state
{
	f16vec3             color;                   // not written in this part of the file; presumably produced by evaluate_material() - confirm
	f16vec3             normal;                  // hit normal, stored by the DDA traversal before evaluate_material() runs
	vec3                dir;                     // ray direction used for grid stepping and triangle tests
	vec3                origin;                  // ray origin; moved onto the hit point after every hit
	float16_t           transparency;
	//bool                running;
	int16_t             bounces;                 // incremented once per hit found by the traversal
	int16_t             material;
	int                 tests;                   // incremented for every grid step taken
	int                 face_tests;              // running sum of face-list entry counts of the buckets that were searched
	uint                flags;                   // TF_* bits; replaces the bools that are commented out in this struct
	//bool                hit;
	//bool                left;
	//bool                inside_transparent;
	//bool                early_exit;
	uint                active_threads_factor;   // running sum of ballot_count(true) taken at each occupied cell
	uint                active_threads_samples;  // number of samples added to active_threads_factor
	float               final_color_factor;

	ray_state_user_data user_data;               // passed through to findClosestBucket2()
};

// user callbacks

// Shading callback that the including shader must define. The DDA traversal calls
// it once per hit, after state.origin has been moved onto the hit point:
//   prev_state_origin    - ray origin before that move
//   hit_face             - index of the face that was hit
//   hit_material_flags   - material flags of that face
//   bc                   - barycentrics of the hit as returned by barycentric_for_face_yz()
//   flip_normal_on_glass - true when the face is flagged transparent and
//                          state.normal points along state.dir
// NOTE(review): the traversal recomputes its stepping from state.dir after this
// call, so the callback presumably updates state.dir for the next bounce - confirm.
void evaluate_material(in out ray_state state, in vec3 prev_state_origin, int hit_face, uint hit_material_flags, f16vec2 bc, bool flip_normal_on_glass);

//
// Shorthands for grid data stored in in_bbox_data (declared outside this part of
// the file). The traversal multiplies a bbox-relative position by GRID_SIZE_RECIP
// to obtain an integer cell coordinate, and uses GRID_SIZE as the per-axis cell extent.
#define GRID_SIZE in_bbox_data.grid_size_raytrace.xyz
#define GRID_SIZE_RECIP in_bbox_data.grid_size_raytrace_recip.xyz

//

// Fetches the three corner positions of face `idx` into p0, p1 and p2.
void build_triangle(uint idx, out vec3 p0, out vec3 p1, out vec3 p2)
{
	RTFace face = rt_get_face(idx);

	p0 = rt_get_vertex(face.v0);
	p1 = rt_get_vertex(face.v1);
	p2 = rt_get_vertex(face.v2);
}

// Returns rt_barycentric_xyz() of point `p` with respect to the corners of face `idx`.
vec3 barycentric_for_face(int idx, vec3 p)
{
	RTFace face = rt_get_face(idx);

	vec3 corner_a = rt_get_vertex(face.v0);
	vec3 corner_b = rt_get_vertex(face.v1);
	vec3 corner_c = rt_get_vertex(face.v2);

	return rt_barycentric_xyz(p, corner_a, corner_b, corner_c);
}

// Returns rt_barycentric_yz() of point `p` with respect to the corners of face `idx`.
// The interpolate_*_from_bc_yz() helpers use .x as the weight of v1 and .y as the weight of v2.
vec2 barycentric_for_face_yz(int idx, vec3 p)
{
	RTFace face = rt_get_face(idx);

	vec3 corner_a = rt_get_vertex(face.v0);
	vec3 corner_b = rt_get_vertex(face.v1);
	vec3 corner_c = rt_get_vertex(face.v2);

	return rt_barycentric_yz(p, corner_a, corner_b, corner_c);
}

// Unit geometric normal of face `fi`, oriented by its winding: (p1 - p0) x (p2 - p0).
vec3 build_normal(int fi)
{
	vec3 a, b, c;
	build_triangle(fi, a, b, c);

	vec3 edge_ab = b - a;
	vec3 edge_ac = c - a;
	vec3 face_normal = cross(edge_ab, edge_ac);
	return normalize(face_normal);
}

//---------------------------------------------------------------------------
// Shading normal of face `fi` at the barycentric position `bc_yz`
// (.x weights vertex 1, .y weights vertex 2, vertex 0 gets the remainder).
//
// With INTERPOLATE_NORMALS == 1 the per-vertex normals are blended and the
// result is normalised. The blended normal follows the stored vertex normals;
// it is NOT flipped towards `ref_normal` (an earlier dot-product alignment
// against ref_normal is disabled), so `ref_normal` is unused on this path.
// With any other INTERPOLATE_NORMALS value the function returns `ref_normal`
// untouched, which is how interpolation gets switched off.
vec3 interpolate_normal_from_bc_yz(int fi, vec2 bc_yz, vec3 ref_normal)
{
#if INTERPOLATE_NORMALS == 1
	RTFace face = rt_get_face(fi);
	uint idx_a = face.v0;
	uint idx_b = face.v1;
	uint idx_c = face.v2;

	// the f16vec3 round-trip is kept exactly as before
	vec3 normal_a = f16vec3(rt_get_vertex_normal(idx_a));
	vec3 normal_b = f16vec3(rt_get_vertex_normal(idx_b));
	vec3 normal_c = f16vec3(rt_get_vertex_normal(idx_c));

	float weight_a = 1.0 - bc_yz.x - bc_yz.y;
	vec3 blended = normal_a * weight_a + normal_b * bc_yz.x + normal_c * bc_yz.y;
	return normalize(blended);
#else
	return ref_normal;
#endif
}

// Blends the uv0 coordinates of face `fi` at the barycentric position `bc_yz`
// (.x weights vertex 1, .y weights vertex 2, vertex 0 gets the remainder).
vec2 interpolate_uv_from_bc_yz(int fi, vec2 bc_yz)
{
	RTFace face = rt_get_face(fi);
	uint idx_a = face.v0;
	uint idx_b = face.v1;
	uint idx_c = face.v2;

	vec2 uv_a = rt_get_vertex_uv0(idx_a);
	vec2 uv_b = rt_get_vertex_uv0(idx_b);
	vec2 uv_c = rt_get_vertex_uv0(idx_c);

	float weight_a = 1.0 - bc_yz.x - bc_yz.y;
	return uv_a * weight_a + uv_b * bc_yz.x + uv_c * bc_yz.y;
}

// Blends the vertex colours of face `fi` at the barycentric position `bc_yz`
// (.x weights vertex 1, .y weights vertex 2, vertex 0 gets the remainder).
vec4 interpolate_color_from_bc_yz(int fi, vec2 bc_yz)
{
	RTFace face = rt_get_face(fi);
	uint idx_a = face.v0;
	uint idx_b = face.v1;
	uint idx_c = face.v2;

	vec4 color_a = rt_get_vertex_color(idx_a);
	vec4 color_b = rt_get_vertex_color(idx_b);
	vec4 color_c = rt_get_vertex_color(idx_c);

	float weight_a = 1.0 - bc_yz.x - bc_yz.y;
	return color_a * weight_a + color_b * bc_yz.x + color_c * bc_yz.y;
}

// Ray/triangle test (determinant / barycentric formulation), both facings accepted.
//
//   orig, dir      - ray origin and direction
//   v0, v1, v2     - triangle corners
//   intersection_t - out: ray parameter of the hit. 0.0 when the ray is rejected
//                    before t is computed; when only the final t > 1e-8 check
//                    fails it holds the rejected (non-positive or tiny) t.
//   out_normal     - out: normalize(cross(v1 - v0, v2 - v0)) on a hit. It is now
//                    written on every path: vec3(0.0) on a miss, except the
//                    v / u+v rejection which keeps its historic (1, 0, 0) marker.
//
// Returns true only for hits with t > 1e-8.
bool intersectTriangle(vec3 orig, vec3 dir, vec3 v0, vec3 v1, vec3 v2, out float intersection_t, out vec3 out_normal)
{
	intersection_t = 0.0;
	// An `out` parameter that is never assigned is undefined in the caller, and the
	// old code returned without touching out_normal on three of its miss paths.
	out_normal = vec3(0.0);

	vec3 e1 = v1 - v0;
	vec3 e2 = v2 - v0;
	vec3 pvec = cross(dir, e2);
	float det = dot(e1, pvec);

	// Ray is parallel to the triangle plane
	if (det < 1e-8 && det > -1e-8)
	{
		return false;
	}

	float inv_det = 1.0 / det;
	vec3 tvec = orig - v0;
	float u = dot(tvec, pvec) * inv_det;
	if (u < 0.0 || u > 1.0)
	{
		return false;
	}

	vec3 qvec = cross(tvec, e1);
	float v = dot(dir, qvec) * inv_det;
	if (v < 0.0 || u + v > 1.0)
	{
		// marker value kept so existing callers see the same output on this path
		out_normal = vec3(1.0, 0.0, 0.0);
		return false;
	}

	intersection_t = dot(e2, qvec) * inv_det;
	if (intersection_t > 1e-8)
	{
		out_normal = normalize(cross(e1, e2));	// TODO: remove normalization and reuse above calcs
		return true;
	}
	return false;
}

// NOTE: trying to workaround compiler issues here....

// Result record of intersectTriangle2() / intersectTriangle3().
struct intersection
{
	float t;			// ray parameter of the hit; -1.0 means no hit
	vec3  normal;		// unit normal, normalize(cross(e2, e1)) of the corners as they were passed in
	float denom;		// dot(normal, ray direction); written by intersectTriangle3()
	vec2  bc;			// barycentrics. only two included
};

// Single-facing ray/triangle test returning an `intersection` record.
//
// A hit (it.t > 0) is reported only when the barycentric test passes AND
// dot(normal, dir) > 1e-7, where normal = normalize(cross(v2 - v0, v1 - v0)).
// In that case t is recomputed from the plane equation, not taken from the
// barycentric solve (the direct assignment was suspected of miscompiling by the
// original author and was left disabled).
//
// Every field of the result is now initialised. Before this change denom and bc
// were never written, and normal was unwritten on the early-return misses, so
// copying the returned struct read undefined values. On a miss it.t == -1.0;
// it.normal may still be set when only the facing test failed. This variant
// does not compute denom or bc, so both stay 0.
intersection intersectTriangle2(vec3 orig, vec3 dir, vec3 v0, vec3 v1, vec3 v2)
{
	intersection it;
	it.t      = -1.0;
	it.normal = vec3(0.0);
	it.denom  = 0.0;
	it.bc     = vec2(0.0);

	vec3 e1 = v1 - v0;
	vec3 e2 = v2 - v0;
	vec3 pvec = cross(dir, e2);
	float det = dot(e1, pvec);

	// Ray is parallel to the triangle plane
	if (abs(det) < 1e-7)
	{
		return it;
	}

	float inv_det = 1.0 / det;
	vec3 tvec = orig - v0;
	float u = dot(tvec, pvec) * inv_det;
	if (u < 0.0 || u > 1.0)
	{
		return it;
	}

	vec3 qvec = cross(tvec, e1);
	float v = dot(dir, qvec) * inv_det;
	if (v < 0.0 || u + v > 1.0)
	{
		return it;
	}

	float t = dot(e2, qvec) * inv_det;
	if (t > 1e-7)
	{
		it.normal = f16vec3(normalize(cross(e2, e1)));	// TODO: remove normalization and reuse above calcs

		// Facing test plus plane-equation t
		float denom = dot(vec3(it.normal), dir);
		if (denom > 1e-7)
		{
			vec3 p0l0 = v0 - orig;
			t = dot(p0l0, vec3(it.normal)) / denom;
			it.t = t;
		}
	}
	return it;
}

// Double-facing ray/triangle test returning an `intersection` record.
//
// On a hit (t > 1e-7):
//   it.t      - ray parameter from the barycentric solve
//   it.normal - normalize(cross(v2 - v0, v1 - v0))
//   it.denom  - dot(it.normal, dir); its sign tells which side was hit
//   it.bc     - (u, v) / denom
// On a miss it.t == -1.0 and all other fields are zero. They used to be left
// unwritten, so the struct copy on return read undefined values.
//
// NOTE(review): u and v are already the barycentric weights of v1 and v2;
// dividing them by denom rescales them. The DDA traversal in this file recomputes
// barycentrics itself, so confirm whether any other consumer relies on this scaling.
intersection intersectTriangle3(vec3 orig, vec3 dir, vec3 v0, vec3 v1, vec3 v2)
{
	intersection it;
	it.t      = -1.0;
	it.normal = vec3(0.0);
	it.denom  = 0.0;
	it.bc     = vec2(0.0);

	vec3 e1 = v1 - v0;
	vec3 e2 = v2 - v0;
	vec3 pvec = cross(dir, e2);
	float det = dot(e1, pvec);

	// Ray is parallel to the triangle plane
	if (abs(det) < 1e-7)
	{
		return it;
	}

	float inv_det = 1.0 / det;
	vec3 tvec = orig - v0;
	float u = dot(tvec, pvec) * inv_det;
	if (u < 0.0 || u > 1.0)
	{
		return it;
	}

	vec3 qvec = cross(tvec, e1);
	float v = dot(dir, qvec) * inv_det;
	if (v < 0.0 || u + v > 1.0)
	{
		return it;
	}

	float t = dot(e2, qvec) * inv_det;
	if (t > 1e-7)
	{
		it.normal = normalize(cross(e2, e1));	// TODO: remove normalization and reuse above calcs

		// det != 0 was established above, so the ray is not parallel to the plane
		// and denom is non-zero here.
		float denom = dot(vec3(it.normal), dir);

		it.t = t;
		it.denom = denom;

		it.bc.x = u / denom;
		it.bc.y = v / denom;
	}
	return it;
}

// Disabled (compiled out by `#if 0`) brute-force reference: tests every face
// from 0 to numFaces-1 except skip_fi and keeps the nearest hit closer than
// 10000.0. Outputs the winning face index (also returned), its t and its normal;
// closest_fi stays -1 when nothing is hit.
#if 0
int findClosestNaive(vec3 origin, vec3 dir, int skip_fi, out int closest_fi, out float closest_it, out f16vec3 closest_norm)
{
	closest_fi = -1;
	closest_it = 10000.0;

	for(int fi = 0; fi < numFaces; fi++)
	{
		if (fi == skip_fi)
			continue;

		intersection it1, it2;

		vec3 p0, p1, p2;
		build_triangle(fi, p0, p1, p2);
		
		#if 0

		// two single-facing tests, one per winding
		#if 1 // backfaces
		it1 = intersectTriangle2(origin, dir, p0, p1, p2);
		if (it1.t > 1e-8)
		{
			if (it1.t < closest_it)
			{
				closest_fi = fi;
				closest_it = it1.t;
				closest_norm = -it1.normal;
			}
		}
		#endif
		#if 1
		it2 = intersectTriangle2(origin, dir, p2, p1, p0);
		if (it2.t > 1e-8)
		{
			if (it2.t < closest_it)
			{
				closest_fi = fi;
				closest_it = it2.t;
				closest_norm = it2.normal;
			}
		}
		#endif

		#else // optimized

		// one double-facing test with the corners passed in reverse order
		
		it1 = intersectTriangle3(origin, dir, p2, p1, p0);
		if (it1.t > 1e-7)
		{
			if (it1.t < closest_it)
			{
				closest_fi = fi;
				closest_it = it1.t;
				closest_norm = it1.normal;

				#ifdef INNER_REFLECTION
				closest_norm = it1.denom > 0.0 ? it1.normal : -it1.normal;
				#endif
			}
		}

		#endif
	}

	return closest_fi;
}
#endif

// Finds the nearest face hit by the ray among the faces listed for one grid cell.
//
// When RT_TRAVERSAL_HAS_CUSTOM_INTERSECTION is defined only the prototype is
// emitted and the including shader must provide the body. Otherwise the default
// implementation below is used; it ignores user_data and bucket_full.
//
//   list_index     - index into in_faces_list_tails_data; the value stored there is
//                    the position in node_buffer of the entry count, and the face
//                    indices follow directly after it
//   max_tests      - when not -1 and the list holds at least this many entries,
//                    the whole bucket is skipped and no hit is reported
//   origin, dir    - ray
//   skip_fi        - face index that is never tested
//   max_t          - hits further than this are ignored
//   closest_fi     - out: hit face index, or -1 for no hit
//   closest_it     - out: t of the hit; left at max_t when nothing was hit
//   closest_norm, closest_bc, material_flags
//                  - out: written ONLY when closest_fi != -1
//   face_tests     - out: number of list entries (0 when the bucket was skipped)
#ifdef RT_TRAVERSAL_HAS_CUSTOM_INTERSECTION
// NOTE: This is a hack... provide proper interface...
void findClosestBucket2(in out ray_state_user_data user_data, uint list_index, bool bucket_full, uint max_tests, vec3 origin, vec3 dir, int skip_fi, float max_t, out int closest_fi, out uint material_flags, out float closest_it, out vec3 closest_norm, out vec2 closest_bc, out int face_tests);
#else
void findClosestBucket2(in out ray_state_user_data user_data, uint list_index, bool bucket_full, uint max_tests, vec3 origin, vec3 dir, int skip_fi, float max_t, out int closest_fi, out uint material_flags, out float closest_it, out vec3 closest_norm, out vec2 closest_bc, out int face_tests)
{
	face_tests = 0;

	closest_fi = -1;
	closest_it = max_t;

	uint head = in_faces_list_tails_data[list_index];
	uint cnt  = in_faces_list_data.node_buffer[head];
	
	// -1 is compared against an unsigned value here, i.e. it acts as "no limit"
	if (max_tests != -1 && cnt >= max_tests)
	{
		// otherwise we would flicker
		closest_fi = -1;
		return;
	}

	// The #if below only selects the loop header; both variants share the body.
	// The disabled variant starts at a blue-noise-selected entry (0 or 1) and tests every other one.
#if 0
	face_tests = int(cnt);
	ivec2 screen_pos = ivec2(gl_FragCoord.xy) & ivec2(127);
	float hash = fract(texelFetch(s_BlueNoise, ivec3(screen_pos, 0), 0).r);

	int fi_idx_start = int(hash * 2.0);
	for(int fi_idx = fi_idx_start; fi_idx < cnt; fi_idx += 2)
#else
	face_tests = int(cnt);
	for(int fi_idx = 0; fi_idx < cnt; fi_idx += 1)
#endif
	{
		int fi = int(in_faces_list_data.node_buffer[head + fi_idx + 1]);
		if (fi == skip_fi)
			continue;

		RTFace rt_face = rt_get_face(fi);
		vec3 p0 = rt_get_vertex(rt_face.v0);
		vec3 p1 = rt_get_vertex(rt_face.v1);
		vec3 p2 = rt_get_vertex(rt_face.v2);

		// corners are passed in reverse order (p2, p1, p0)
		intersection it1 = intersectTriangle3(origin, dir, p2, p1, p0);
		{
			// `<=` lets a later face at exactly the same t replace an earlier one
			if (it1.t >= 0.0 && it1.t <= closest_it)
			//if (it1.t >= 0.0)
			{
				// faces without the double-sided flag are rejected when the returned
				// normal does not oppose the ray direction
				if ((rt_face.material_flags & MaterialFlag_Doublesided) == 0)
				{
					if (dot(dir, it1.normal) >= 0.0)
						continue;
				}

				closest_fi = fi;
				closest_it = it1.t;
				#if 0

				#ifdef INNER_REFLECTION
				closest_norm = it1.denom > 0.0 ? it1.normal : -it1.normal;
				#else
				closest_norm = it1.normal;
				#endif

				closest_bc     = it1.bc;
				material_flags = rt_face.material_flags;

				#endif
			}
			//break;
		}
	}

	if (closest_fi != -1)
	{
		// actually compute intersection params
		// (the winning face is intersected a second time; the loop above only tracks index and t)
		RTFace rt_face = rt_get_face(closest_fi);
		vec3 p0 = rt_get_vertex(rt_face.v0);
		vec3 p1 = rt_get_vertex(rt_face.v1);
		vec3 p2 = rt_get_vertex(rt_face.v2);

		intersection it1 = intersectTriangle3(origin, dir, p2, p1, p0);
		#ifdef INNER_REFLECTION
		closest_norm = it1.denom > 0.0 ? it1.normal : -it1.normal;
		#else
		closest_norm = it1.normal;
		#endif

		closest_bc     = it1.bc;
		material_flags = rt_face.material_flags;
	}
}
#endif

// Strided variant of the bucket search: tests list entries fi_first,
// fi_first + fi_step, fi_first + 2 * fi_step, ... of one cell's face list and
// reports the nearest accepted hit.
//
//   list_index  - index into in_faces_list_tails_data (same list layout as
//                 findClosestBucket2: entry count followed by face indices)
//   fi_first    - first list entry to test
//   fi_step     - distance between tested entries; 0 is treated as 1 (a zero
//                 step previously never terminated when fi_first < entry count)
//   origin, dir - ray
//   skip_fi     - face index that is never tested
//   max_t       - hits further than this are ignored
//   closest_fi  - out: hit face index, or -1 for no hit
//   closest_it  - out: t of the hit; max_t when nothing was hit
//   material_flags, closest_norm, closest_bc
//               - out: NOT computed by this function. They are set to 0 so
//                 callers never read undefined values (they used to be left
//                 unwritten). A caller that needs them has to re-intersect the
//                 winning face, the way findClosestBucket2 does after its loop.
void findClosestBucket2_Multi(
	uint      list_index,
	uint      fi_first,
	uint      fi_step,
	vec3      origin,
	vec3      dir,
	int       skip_fi,
	float     max_t,
	out int   closest_fi,
	out uint  material_flags,
	out float closest_it,
	out vec3  closest_norm,
	out vec2  closest_bc
)
{
	closest_fi     = -1;
	closest_it     = max_t;
	material_flags = 0u;
	closest_norm   = vec3(0.0);
	closest_bc     = vec2(0.0);

	uint head = in_faces_list_tails_data[list_index];
	uint cnt  = in_faces_list_data.node_buffer[head];

	// guard against a zero stride, which would spin forever
	uint stride = max(fi_step, 1u);

	for(uint fi_idx = fi_first; fi_idx < cnt; fi_idx += stride)
	{
		int fi = int(in_faces_list_data.node_buffer[head + fi_idx + 1]);
		if (fi == skip_fi)
			continue;

		RTFace rt_face = rt_get_face(fi);
		vec3 p0 = rt_get_vertex(rt_face.v0);
		vec3 p1 = rt_get_vertex(rt_face.v1);
		vec3 p2 = rt_get_vertex(rt_face.v2);

		// corners are passed in reverse order (p2, p1, p0)
		intersection it1 = intersectTriangle3(origin, dir, p2, p1, p0);
		if (it1.t >= 0.0 && it1.t <= closest_it)
		{
			// faces without the double-sided flag are rejected when the returned
			// normal does not oppose the ray direction
			if ((rt_face.material_flags & MaterialFlag_Doublesided) == 0)
			{
				if (dot(dir, it1.normal) >= 0.0)
					continue;
			}

			closest_fi = fi;
			closest_it = it1.t;
		}
	}
}

// Stepping state of the struct-based grid walker (build_dda / step_dda / ...).
struct dda
{
	bool     is_high;        // cleared by build_dda(); not read by the active code in this part of the file
	vec3     res;            // cell size currently used for stepping; GRID_SIZE initially, rescaled by dda_scale_grid_res()
	vec3     ro;             // ray origin relative to in_bbox_data.bbox_raytrace_min
	vec3     ird;            // 1 / ray direction, per component
	vec3     delta;          // t distance between two successive cell boundaries, per axis
	vec3     t_max;          // t of the next cell-boundary crossing, per axis
	float    prev_next_t;    // next_t as it was before the last step_dda()
	vec3     prev_t_max;     // t_max as it was before the last step_dda()
	float    next_t;         // smallest component of t_max, i.e. t at which the current cell is left

	vec3     pf;             // sample position used to derive the current cell (midpoint of the current t interval)
	float    tmin;           // stored by build_dda(); not read in this part of the file
	float    tmax;           // dda_is_abort() reports true once next_t reaches this
};

// True when every component of `icell` lies in [0, GRID_RES - 1].
// Works by OR-ing the components and checking that no bit outside the
// GRID_RES - 1 mask is set (negative values have high bits set), so it
// relies on GRID_RES being a power of two.
bool is_pos_inside_grid(ivec3 icell)
{
	int combined   = icell.x | icell.y | icell.z;
	int out_of_box = combined & (~(GRID_RES-1));
	return out_of_box == 0;
}

// (Re)initialises the walker for a ray starting at `ro` with direction `rd`.
// With restart == false the cell size is reset to GRID_SIZE, is_high is cleared
// and the sample position is placed on the origin; with restart == true the
// current res, is_high and pf are kept and only the stepping terms are rebuilt.
// tmin / tmax are stored as given.
void build_dda(inout dda dda, vec3 ro, vec3 rd, float tmin, float tmax, bool restart)
{
	bool fresh_start = !restart;

	if (fresh_start)
	{
		dda.is_high = false;
		dda.res     = vec3(GRID_SIZE);
	}

	dda.tmin = tmin;
	dda.tmax = tmax;

	// work in bbox-relative coordinates
	dda.ro  = ro - in_bbox_data.bbox_raytrace_min.xyz;
	dda.ird = vec3(1.0) / rd;

	// 1.0 on axes where the ray moves in +direction (or not at all), else 0.0
	vec3 forward = step(vec3(0.0), rd);

	dda.delta = (forward * 2.0 - 1.0) * dda.res * dda.ird;
	dda.t_max = ((floor(dda.ro / dda.res) + forward) * dda.res - dda.ro) * dda.ird;

	dda.prev_t_max  = dda.t_max;
	dda.prev_next_t = 0.0f;

	if (fresh_start)
		dda.pf = dda.ro;
}

// Integer cell coordinate of the walker's current sample position.
ivec3 grd_icell_dda(inout dda dda)
{
	return ivec3(floor(dda.pf * GRID_SIZE_RECIP));
}

// True when the walker's current cell lies inside the grid.
bool is_inside_grid_dda(inout dda dda)
{
	ivec3 cell = grd_icell_dda(dda);
	bool inside = is_pos_inside_grid(cell);
	return inside;
}

// True when the low-resolution grid marker covering the walker's current cell
// is not set. The marker is looked up at the cell coordinate shifted right by 2.
bool is_high_level_empty_dda(inout dda dda)
{
	#if 1
	ivec3 cell = grd_icell_dda(dda);
		#ifdef RT_USE_TEXTURE_GRID_MARKERS_FOR_TRAVERSAL
		bool occupied = (rt_read_grid_marker_low_res(s_grid_marker, cell >> 2));
		#else
		bool occupied = (_rt_read_grid_marker_low_res(g_buff_grid_markers_ptr, cell >> 2));
		#endif
	#else
	ivec3 cell = ivec3(floor(dda.pf * ((vec3(1.0) / dda.res) * (dda.is_high ? 1.0 : 0.25))));
		#ifdef RT_USE_TEXTURE_GRID_MARKERS
		bool occupied = (rt_read_grid_marker_low_res(s_grid_marker, cell));
		#else
		bool occupied = (_rt_read_grid_marker_low_res(g_buff_grid_markers_ptr, cell));
		#endif
	#endif

	return !occupied;
}


// Updates next_t to the nearest pending boundary crossing and moves the sample
// position pf to the middle of the interval [prev_next_t, next_t] along the ray.
// Returns the new next_t.
float get_current_intersection_dda(inout dda dda, in ray_state state)
{
	float nearest_crossing = min(dda.t_max.x, min(dda.t_max.y, dda.t_max.z));
	dda.next_t = nearest_crossing;

	dda.pf = dda.ro + state.dir * (dda.prev_next_t + nearest_crossing) * 0.5;

	return nearest_crossing;
}

// True once the walker's next crossing has reached its tmax limit.
bool dda_is_abort(in dda dda)
{
	bool limit_reached = dda.next_t >= dda.tmax;
	return limit_reached;
}

// Advances the walker across the nearest cell boundary: remembers the previous
// next_t / t_max and adds delta on every axis whose t_max is the minimum
// (ties advance several axes at once).
void step_dda(inout dda dda)
{
	vec3 crossing = dda.t_max;

	dda.prev_next_t = dda.next_t;
	dda.prev_t_max  = crossing;

	// 1.0 on the axis (or axes) holding the smallest crossing, 0.0 elsewhere
	vec3 nearest_axis = step(crossing.xyz, crossing.yxy) * step(crossing.xyz, crossing.zzx);

	dda.t_max = crossing + nearest_axis * dda.delta;
}

// Switches the walker to a cell size scaled by `scale` and re-derives the pending
// boundary crossings from the point where the previous step crossed the grid.
// For the axis (or axes) that produced that crossing the next boundary is one
// full (rescaled) delta away; for the other axes it is recomputed from the
// crossing position with the new cell size. When scale < 1.0 the sample position
// pf is refreshed as well. Returns the new next_t.
float dda_scale_grid_res(inout dda dda, in ray_state state, float scale)
{
	dda.res *= scale;
	dda.delta *= scale;

	// NOTE: This can still fail occasionally. When we step exactly onto two cell edges at once,
	//       floor() might move us back one cell and break the traversal. Hopefully that is rare enough...
	// we calculate intersection with next block using new scales
					
	// NOTE: Try to merge these two conditions
	// NOTE: floor(float) in processing returns int!!!!

	float prev_t_min = min(dda.prev_t_max.x, min(dda.prev_t_max.y, dda.prev_t_max.z));  // this is checking where the grid intersection happened. can we just keep track of it?

	// 1.0 on the axis whose crossing equals the minimum (exact float compare), 0.0 elsewhere
	float blend_x = prev_t_min == dda.prev_t_max.x ? 1.0 : 0.0;
	float blend_y = prev_t_min == dda.prev_t_max.y ? 1.0 : 0.0;
	float blend_z = prev_t_min == dda.prev_t_max.z ? 1.0 : 0.0;

	// position (bbox-relative) at which the previous step crossed the grid
	vec3 next_ro = dda.ro + state.dir * prev_t_min;

	// candidate 1: a full rescaled cell along the axis
	float dt_x_1 = dda.delta.x;
	float dt_y_1 = dda.delta.y;
	float dt_z_1 = dda.delta.z;
							
	// candidate 2: distance from next_ro to the next boundary of the rescaled grid
	vec3 s = step(vec3(0.0), state.dir);
	float dt_x_2 = ((floor(next_ro.x / dda.res.x) + s.x) * dda.res.x - next_ro.x) * dda.ird.x;
	float dt_y_2 = ((floor(next_ro.y / dda.res.y) + s.y) * dda.res.y - next_ro.y) * dda.ird.y;
	float dt_z_2 = ((floor(next_ro.z / dda.res.z) + s.z) * dda.res.z - next_ro.z) * dda.ird.z;
							
	dda.t_max.x = mix(dt_x_2, dt_x_1, blend_x);
	dda.t_max.y = mix(dt_y_2, dt_y_1, blend_y);
	dda.t_max.z = mix(dt_z_2, dt_z_1, blend_z);

	// the candidates are relative to the crossing; make them absolute again
	dda.t_max += prev_t_min;
	dda.next_t = min(dda.t_max.x, min(dda.t_max.y, dda.t_max.z));

	if (scale < 1.0) // no need to calculate sampling cube more precisely when going to high res
		dda.pf = dda.ro + state.dir * (dda.prev_next_t + dda.next_t) * 0.5;

	return dda.next_t;
}

// Reads the grid marker for cell `icell`: the high-resolution marker when
// mip == 0, otherwise the low-resolution marker at icell >> 2. The source is
// the marker texture when RT_USE_TEXTURE_GRID_MARKERS_FOR_TRAVERSAL is defined
// and the marker buffer handle otherwise.
bool fetch_grid_marker_for_cell(ivec3 icell, int mip)
{
	bool want_high_res = (mip == 0);

	#ifdef RT_USE_TEXTURE_GRID_MARKERS_FOR_TRAVERSAL
	if (want_high_res)
		return rt_read_grid_marker_high_res(s_grid_marker, icell);

	return rt_read_grid_marker_low_res(s_grid_marker, icell >> 2);
	#else
	if (want_high_res)
		return _rt_read_grid_marker_high_res(g_buff_grid_markers_ptr, icell);

	return _rt_read_grid_marker_low_res(g_buff_grid_markers_ptr, icell >> 2);
	#endif
}


// Number of active invocations in the subgroup (warp) for which `v` is true.
//
// Non-Vulkan builds use the NV ballot (one 32-bit mask). Vulkan builds use
// subgroupBallotBitCount(), which counts every ballot bit up to the subgroup
// size. The previous code summed only ballot words x and y and therefore
// undercounted on subgroups wider than 64 invocations; results for subgroups
// of up to 64 invocations are unchanged.
uint ballot_count(bool v)
{
	#ifndef SPIRV_VULKAN
	return bitCount(ballotThreadNV(v));
	#else
	return subgroupBallotBitCount(subgroupBallot(v));
	#endif
}

// Per-axis step direction for `dir`: +1 where the component is >= 0.0, -1 where it is negative.
ivec3 dda_step_sign_from_dir(vec3 dir)
{
	vec3 non_negative = step(vec3(0.0), dir);   // 1.0 or 0.0
	return ivec3(non_negative * 2.0 - 1.0);     // +1 or -1
}

// Returns the integer grid step for one DDA move. `cell_step` selects the axis
// (or axes) to advance, normally a 0/1 mask with a single 1; each selected axis
// moves by +1 when the matching `dir` component is > 0.0 and by -1 otherwise.
// Note that a component of exactly 0.0 maps to -1 here, while the disabled
// step()-based variant below maps it to +1.
#if 0
ivec3 dda_step_from_dir(vec3 dir, vec3 cell_step)
{
	vec3 s = step(vec3(0.0), dir);
	vec3 sgn = s * cell_step * 2.0 - cell_step;
	return ivec3(sgn);
}
#else
ivec3 dda_step_from_dir(vec3 dir, vec3 cell_step)
{
	vec3 axis_sign = vec3(
		dir.x > 0.0 ? 1.0 : -1.0,
		dir.y > 0.0 ? 1.0 : -1.0,
		dir.z > 0.0 ? 1.0 : -1.0);

	return ivec3(axis_sign * cell_step);
}
#endif



// Walks `state` through the uniform grid with a 3D DDA, tests the faces of every
// occupied cell and calls evaluate_material() for each hit. After a hit the ray
// origin is moved onto the hit point and stepping is rebuilt from state.dir, so
// several bounces are traced inside this one call.
//
//   traversal_params - trace_range_secondary replaces t_max after a hit (only when max_bounces > 1)
//   state            - in/out ray state. TF_RUNNING must be set in state.flags on
//                      entry, otherwise the loop exits immediately. Updated here:
//                      origin, normal, bounces, tests, face_tests, flags and the
//                      active_threads_* counters; evaluate_material() may change more.
//   skip_fi          - face index excluded from testing; replaced by the hit face after every hit
//   closest_fi       - out: last value reported by the bucket search (-1 when that search found nothing)
//   closest_it       - out: last t / search limit reported by the bucket search
//   max_bounces      - a hit that brings state.bounces to this value ends the trace
//   t_max            - distance limit for the current segment
//
// The trace ends (TF_RUNNING is cleared) when the next cell crossing lies beyond
// t_max, the MAX_TRACE_LENGTH step budget is exhausted, a hit terminates the ray
// (bounce limit or MaterialFlag_RaytraceTerminate), or the ray walks out of the
// grid (TF_LEFT | TF_EARLY_EXIT are set as well). Always returns 0.
//
// NOTE(review): the non-USE_LINKED_LISTS branch calls findClosestBucket2 with a
// signature that does not exist in this file and never declares `rt`, which the
// bounce restart below assigns; it looks stale - confirm before building without
// USE_LINKED_LISTS.
int findClosestDDAMultibounce(ray_traversal_params traversal_params, inout ray_state state, int skip_fi, out int closest_fi, out float closest_it, int max_bounces, float t_max)
{
	//state.dir = normalize(state.dir);

	closest_fi = -1;
	closest_it = 1000000.0;

	const vec3 cellDimension = GRID_SIZE;
	const vec3 cellDimensionRecip = GRID_SIZE_RECIP;

	float tmin = 0.0;

	vec3 deltaT, nextCrossingT;

	// bbox-relative origin and the cell it falls into
	vec3 ro_cell = state.origin - in_bbox_data.bbox_raytrace_min.xyz;
	ivec3 icell = ivec3(floor(ro_cell * cellDimensionRecip));

	{
		vec3 s = step(vec3(0.0), state.dir);       // 0.0 or 1.0
		vec3 sgn = s * 2.0 - 1.0;                  // -1.0 or 1.0

		deltaT = sgn * cellDimension / state.dir;    // t distance between two cell boundaries, per axis
		nextCrossingT = tmin + ((floor(ro_cell * cellDimensionRecip) + s) * cellDimension - ro_cell) / state.dir;
	}

	// walk through each cell of the grid and test for an intersection if
	// current cell contains geometry
	uint traversal_flags = TF_NONE;

	int max_iter = MAX_TRACE_LENGTH;                        // this includes all bounces we are now tracking
	int threads_running = int(ballot_count(true));

	for(;;)
	{
		bool hit = false;
		max_iter--;
		state.tests++;

		// stop once the exit point of the current cell lies beyond the range limit
		{
			float rt = tmin + min(nextCrossingT.x, min(nextCrossingT.y, nextCrossingT.z));
			state.flags = rt_clear_mask_if(state.flags, TF_RUNNING, rt >= t_max);
		}

		// stop once the step budget is used up
		{
			state.flags = rt_clear_mask_if(state.flags, TF_RUNNING, max_iter < 0);
		}

		if (rt_is_mask_set(state.flags, TF_RUNNING) == false)
			break;

		//if (int(ballot_count(state.hit)) >= threads_running / 2)
		//	break;

		// prev_inside = prev_inside || inside; inside = false;
		traversal_flags = rt_set_mask_if(traversal_flags, TF_PREV_INSIDE, rt_is_mask_set(traversal_flags, TF_IS_INSIDE));
		traversal_flags = rt_clear_mask(traversal_flags, TF_IS_INSIDE);

		bool search_bucket = false;
		bool bucket_full    = false;

#if 0
		if (fetch_grid_marker_for_cell(icell, 0) > 0)
#else
		if (is_pos_inside_grid(icell))
#endif
		{
			traversal_flags = rt_set_mask(traversal_flags, TF_IS_INSIDE);
			bucket_full = fetch_grid_marker_for_cell(icell, 0);
		}

		// try to advance also other threads to increase coherency. this is mostly to skip empty space and sync threads
		// ------
		if (true)
		{
			// An invocation has nothing left to skip when it stands on an occupied
			// cell or has just walked out of the grid (prev_inside && !inside).
			// FIX: the two flags were swapped here (inside && !prev_inside), which
			// disagreed with the bool form of this test, with the in-loop update
			// below and with the exit test after the bucket search. The effect was
			// that an invocation that had just left the grid kept stepping outside
			// it, and an invocation on its first in-grid cell refused to skip it.
			bool done_criteria = (bucket_full) || (((traversal_flags & TF_PREV_INSIDE) != 0) && ((traversal_flags & TF_IS_INSIDE) == 0));
			uint hit_threads = ballot_count(done_criteria);

			// hit_threads is deliberately not refreshed inside the loop, so the loop
			// condition stays uniform across the subgroup
			int ii = 0;
			while(hit_threads < 16 && ii < 4)
			{
				ii++;
				if (done_criteria == false)
				{
					state.tests++;
					max_iter--;

					// step dda
					vec3 mm = step(nextCrossingT.xyz, nextCrossingT.yxy) * step(nextCrossingT.xyz, nextCrossingT.zzx);
					icell += dda_step_from_dir(state.dir, mm);
					nextCrossingT += mm * deltaT;

					// inside = is_pos_inside_grid(icell); prev_inside = prev_inside || inside;
					traversal_flags = rt_clear_mask(traversal_flags, TF_IS_INSIDE);
					if (is_pos_inside_grid(icell))
						traversal_flags = rt_set_mask(traversal_flags, TF_IS_INSIDE | TF_PREV_INSIDE);

					if (rt_is_mask_set(traversal_flags, TF_IS_INSIDE))
					{
						bucket_full = fetch_grid_marker_for_cell(icell, 0);
					}

					done_criteria = (bucket_full) || (((traversal_flags & TF_PREV_INSIDE) != 0) && ((traversal_flags & TF_IS_INSIDE) == 0));
				}

				//hit_threads = ballot_count(done_criteria);
			}
		}

		if (bucket_full)
		{
			f16vec2 closest_bc;
			f16vec3 closest_normal;
			uint    closest_material_flags;
			uint    icell_idx = icell.z * GRID_RES * GRID_RES + icell.y * GRID_RES + icell.x;

			// occupancy statistics: how many invocations reach an occupied cell together
			state.active_threads_factor  += ballot_count(true);
			state.active_threads_samples += 1;

			//state.tests += bucket_full;

			#ifndef USE_LINKED_LISTS
			int bucket_offset = int(in_buckets.offsets[icell_idx]);
			findClosestBucket2(bucket_offset, bucket_size, state.origin, state.dir, skip_fi, closest_fi, closest_it, state.normal);
			//findClosestBucket(icell_idx, state.origin, state.dir, skip_fi, closest_fi, closest_it, state.normal);
			#else
			int face_tests = 0;
			uint max_tests = -1;
			// once more than 512 face tests were spent, buckets with 256 or more
			// entries are skipped as long as the previous search found nothing
			if (closest_fi == -1 && state.face_tests > 512)
				max_tests = 256;

			// only accept hits up to the exit point of this cell (and within range)
			float rt = tmin + min(nextCrossingT.x, min(nextCrossingT.y, nextCrossingT.z));
			findClosestBucket2(state.user_data, icell_idx, bucket_full, max_tests, state.origin, state.dir, skip_fi, min(t_max, rt), closest_fi, closest_material_flags, closest_it, closest_normal, closest_bc, face_tests);
			state.face_tests += face_tests;
			#endif

			if (closest_fi != -1)
			{
				if (rt_is_mask_set(state.flags, TF_DEBUG))
				{
					debugPrintfEXT("  Hit at %f, material_flags:%d, closest_fi:%d\\n", closest_it, closest_material_flags, closest_fi);
				}

				state.bounces += int16_t(1);
				state.normal   = closest_normal;

				hit     = true;
				skip_fi = closest_fi;

				// bounce limit reached or terminating material: stop after shading this hit
				if (state.bounces >= max_bounces || (closest_material_flags & MaterialFlag_RaytraceTerminate) != 0)
				{
					state.flags = rt_clear_mask(state.flags, TF_RUNNING);

					if (rt_is_mask_set(state.flags, TF_DEBUG))
					{
						debugPrintfEXT("   Terminate. Normal:%f %f %f, closest_fi:%d\\n", closest_normal.x, closest_normal.y, closest_normal.z, closest_fi);
					}
				}

				// barycentrics are recomputed from the hit position; the ones returned
				// by the bucket search are not used here
				f16vec2 bc = barycentric_for_face_yz(closest_fi, state.origin + state.dir * closest_it);
				if ((closest_material_flags & MaterialFlag_Flat) == 0)
					state.normal = interpolate_normal_from_bc_yz(closest_fi, bc, state.normal);

				bool flip_normal_on_glass = false;
				if ((closest_material_flags & MaterialFlag_Transparent) != 0)
				{
					// align normal along the current view dir
					flip_normal_on_glass = dot(state.normal, state.dir) > 0.0 ? true : false;
				}

				// calculate new origin and recalculate tracing parameters for the bounce
				vec3 prev_state_origin = state.origin; // for lighting calculation
				state.origin = state.origin + state.dir * closest_it;

				if (rt_is_mask_set(state.flags, TF_DEBUG))
					debugPrintfEXT("  %f, %f %f\\n", state.origin.x, state.origin.y, state.origin.z);

				// we hit solid object which is not perfectly rough, reflect
				evaluate_material(state, prev_state_origin, closest_fi, closest_material_flags, bc, flip_normal_on_glass);
				//state.color = TurboColormap(fract(closest_fi * 13.11301));
				//state.color.rg = bc.xy;
				//state.color.b = 1.0 - bc.x - bc.y;

				// rebuild stepping parameters TODO: factor this out
				// icell is kept: the new origin was found inside the current cell's t range
				if (max_bounces > 1) // && state.running)
				{
					vec3 ird = 1.0 / state.dir;
					ro_cell = state.origin - in_bbox_data.bbox_raytrace_min.xyz;
					//cell = floor(ro_cell / cellDimension);

					vec3 s = step(vec3(0.0), state.dir);            // 0.0 or 1.0
					vec3 sgn = s * 2.0 - 1.0;                       // -1.0 or 1.0

					deltaT = sgn * cellDimension * ird;
					nextCrossingT = tmin + ((floor(ro_cell * cellDimensionRecip) + s) * cellDimension - ro_cell) * ird;

					// secondary segments use their own range limit
					t_max = traversal_params.trace_range_secondary;
					rt = tmin;
				}
			}
		}

		// inside == false && prev_inside == true: the ray has walked out of the grid
		if (rt_is_mask_set(traversal_flags, TF_IS_INSIDE) == false && rt_is_mask_set(traversal_flags, TF_PREV_INSIDE) == true)
		{
			state.flags = rt_clear_mask(state.flags, TF_RUNNING);
			state.flags = rt_set_mask(state.flags, TF_LEFT);
			state.flags = rt_set_mask(state.flags, TF_EARLY_EXIT);
		}

		// after a hit the current cell is searched again with the new ray
		if (!hit)
		{
			// all components of minimum mask (i.e. x <= y && x <= z, y <= x && y <= z, z <= y && z <= x)
			// are false except for the corresponding smallest component of dt (if no mask), which
			// is the axis along which the ray should be incremented
			// stolen from https://github.com/guozhou/voxelizer/blob/master/raycasting_fs.glsl
			// NOTE: nextCrossingT == dt

			vec3 mm = step(nextCrossingT.xyz, nextCrossingT.yxy) * step(nextCrossingT.xyz, nextCrossingT.zzx);
			icell += dda_step_from_dir(state.dir, mm);
			nextCrossingT += mm * deltaT;
		}
	}

	return 0;
}

// Workgroup-shared scratch values.
// NOTE(review): their use is not visible in this part of the file; presumably
// findClosestDDAMultibounce_New exchanges the winning lane's hit through them - confirm.
shared int   shared_hit_lane_idx;
shared int   shared_closest_fi;
shared float shared_closest_it;
//shared vec2  shared_closest_bc;
//shared vec3  shared_closest_normal;
//shared uint  shared_closest_material_flags;

// Multi-bounce grid traversal (DDA) with subgroup-cooperative bucket testing.
//
// Each lane walks the grid cell by cell starting at state.origin along state.dir.
// Whenever at least one lane stands in a cell whose marker is set, the lanes with
// a marked cell are served one after another: the selected lane's ray is
// broadcast with subgroupShuffle() and ALL active lanes call
// findClosestBucket2_Multi() with their own iterator index/count. The lane that
// reports the smallest hit distance publishes its face index through the
// shared_* variables (declared outside this function) and the selected lane
// recomputes the intersection for that face on its own ray.
//
// Parameters:
//   traversal_params - not referenced by this function.
//   state            - ray state; origin, normal, bounces, tests, face_tests and
//                      flags are updated here, and it is passed on to
//                      evaluate_material() on every hit.
//   skip_fi          - face index forwarded to the bucket test; overwritten with
//                      the face index of the latest hit.
//   closest_fi       - out: face index of the latest hit, -1 if none was found.
//   closest_it       - out: initialised to 1000000.0 and not written anywhere
//                      else on the enabled code path (see NOTE(review) below).
//   max_bounces      - TF_RUNNING is cleared once state.bounces reaches this value.
//   tmax             - TF_RUNNING is cleared once the next cell crossing is at
//                      or beyond this distance; also caps the bucket test range.
//
// Returns: always 0.
//
// The loop only ends when no active lane has TF_RUNNING set any more, so lanes
// that finished early keep iterating (and keep taking part in the subgroup
// operations) until the last lane is done.
int findClosestDDAMultibounce_New(ray_traversal_params traversal_params, inout ray_state state, int skip_fi, out int closest_fi, out float closest_it, int max_bounces, float tmax)
{
    //state.dir = normalize(state.dir);

	closest_fi = -1;
	closest_it = 1000000.0;

	const vec3 cellDimension = GRID_SIZE;
	const vec3 cellDimensionRecip = GRID_SIZE_RECIP;

	float tmin = 0.0;

	// deltaT:        distance along the ray between two cell boundaries, per axis
	// nextCrossingT: distance along the ray to the next cell boundary, per axis
	vec3 deltaT, nextCrossingT;

	// ray origin relative to the grid minimum corner and the cell containing it
	vec3 ro_cell = state.origin - in_bbox_data.bbox_raytrace_min.xyz;
	ivec3 icell = ivec3(floor(ro_cell * cellDimensionRecip));

	{
		vec3 s = step(vec3(0.0), state.dir);       // 0.0 or 1.0
		vec3 sgn = s * 2.0 - 1.0;                  // -1.0 or 1.0

		deltaT = sgn * cellDimension / state.dir;    // same as 'dir'
		nextCrossingT = tmin + ((floor(ro_cell * cellDimensionRecip) + s) * cellDimension - ro_cell) / state.dir;
	}

	// walk through each cell of the grid and test for an intersection if
	// current cell contains geometry
	//float rt = tmin;
	uint traversal_flags = TF_NONE;
	//bool inside      = false;
	//bool prev_inside = inside;

	int max_iter = MAX_TRACE_LENGTH;                        // this includes all bounces we are now tracking
	int threads_running = int(ballot_count(true));          // NOTE(review): not used below

	for(;;)
	{
		bool hit = false;
		max_iter--;
		state.tests++;

		// stop once the next cell boundary lies at or beyond tmax
		{
			float rt = tmin + min(nextCrossingT.x, min(nextCrossingT.y, nextCrossingT.z));
			state.flags = rt_clear_mask_if(state.flags, TF_RUNNING, rt >= tmax);
		}

		// stop once the iteration budget is used up
		{
			state.flags = rt_clear_mask_if(state.flags, TF_RUNNING, max_iter < 0);
		}

		// t for next next crossing intersection
		//rt          = tmin + min(nextCrossingT.x, min(nextCrossingT.y, nextCrossingT.z));
		// TF_PREV_INSIDE is sticky: once the ray has been inside the grid it stays set
		traversal_flags = rt_set_mask_if(traversal_flags, TF_PREV_INSIDE, rt_is_mask_set(traversal_flags, TF_IS_INSIDE));
		traversal_flags = rt_clear_mask(traversal_flags, TF_IS_INSIDE);
		//prev_inside = prev_inside || inside;
		//inside      = false;

		bool search_bucket = false;     // NOTE(review): not used below
		bool bucket_full    = false;    // true when this lane's current cell has its marker set

		if (rt_is_mask_set(state.flags, TF_RUNNING))
		{
#if 0
			if (fetch_grid_marker_for_cell(icell, 0) > 0)
#else
			if (is_pos_inside_grid(icell))
#endif
			{
				traversal_flags = rt_set_mask(traversal_flags, TF_IS_INSIDE);
				//inside = true;
				bucket_full = fetch_grid_marker_for_cell(icell, 0);
			}
		}

		// lane statistics, only consumed by the debug output below
		bool is_running     = rt_is_mask_set(state.flags, TF_RUNNING);
		uint is_running_num = ballot_count(is_running);
		uint is_active_num  = ballot_count(true);

		int prev_closest_fi = closest_fi;
		int set_from_lane = -1;

		{
			// hit attributes for this iteration; only assigned when this lane is
			// served in the loop below and a face was found
			vec2 closest_bc;
			vec3 closest_normal;
			uint closest_material_flags;

			uint  icell_idx = icell.z * GRID_RES * GRID_RES + icell.y * GRID_RES + icell.x;
			float rt        = tmin + min(nextCrossingT.x, min(nextCrossingT.y, nextCrossingT.z));

			// iterate through all threads which have bucket_full == true. assume for now not larger than 64
			// (only the .x and .y ballot words are used, i.e. lanes 0..63)

			uvec4    active_ballot      = subgroupBallot(true);
			uvec4    bucket_full_ballot = subgroupBallot(bucket_full);
			uint64_t bucket_full_mask   = (uint64_t(bucket_full_ballot.y) << 32) | uint64_t(bucket_full_ballot.x);

			bool participating = false;     // NOTE(review): only referenced by the commented-out printf
			if (rt_is_mask_set(state.flags, TF_DEBUG) && bucket_full_mask != 0)
			{
				// the mask is 64 bits wide, so the probe bit has to be 64 bits wide
				// as well; a 32-bit '1U' shifted by 32 or more is undefined
				if ((bucket_full_mask & (uint64_t(1) << gl_SubgroupInvocationID)) != 0)
					participating = true;
				//debugPrintfEXT("Ballot:0x%x 0x%x bucket_full:0x%x 0x%x mask:0x%lx SubgroupID:%d, participating:%d\\n", active_ballot.y, active_ballot.x, bucket_full_ballot.y, bucket_full_ballot.x, bucket_full_mask, gl_SubgroupInvocationID, participating);
			}

			// statistics: add the first entry of this cell's face list to the
			// face test counter (the code treats that entry as a count)
			if (bucket_full)
			{
				uint head = in_faces_list_tails_data[icell_idx];
				uint cnt  = in_faces_list_data.node_buffer[head];
				state.face_tests += int(cnt);
			}

			// iterate through all active in the mask
			while(bucket_full_mask != 0)
			{
				// we have a guarantee to find something, so no safety here
				// lowest set bit: try the low 32 bits first, then the high 32 bits
				int bucket_full_first_idx = findLSB(uint(bucket_full_mask));
				if (bucket_full_first_idx == -1)
				{
					bucket_full_first_idx = 32 + findLSB(uint(bucket_full_mask >> 32));
				}

				//debugPrintfEXT("  mask:0x%lx and thread idx:%d\\n", bucket_full_mask, bucket_full_first_idx); 

				// per-lane partial result of the cooperative bucket test
				int   _closest_fi = -1;
				float _closest_it;
				vec2  _closest_bc;
				vec3  _closest_normal;
				uint  _closest_material_flags;

				uint iterator_idx = subgroupBallotExclusiveBitCount(active_ballot);	// all active threads participate
				uint iterator_cnt = subgroupBallotBitCount(active_ballot);

				// every lane works on the ray of lane 'bucket_full_first_idx'
				findClosestBucket2_Multi(
					subgroupShuffle(icell_idx, bucket_full_first_idx),
					iterator_idx,
					iterator_cnt,
					subgroupShuffle(state.origin, bucket_full_first_idx),
					subgroupShuffle(state.dir, bucket_full_first_idx),
					subgroupShuffle(skip_fi, bucket_full_first_idx),
					min(tmax, subgroupShuffle(rt, bucket_full_first_idx)),
					_closest_fi,
					_closest_material_flags,
					_closest_it,
					_closest_normal,
					_closest_bc);

				// from all the threads check which one returned closest hit
				[[branch]]
				if (subgroupMax(_closest_fi) > -1)
				{
					// check which thread has this value and broadcast to the current
					shared_hit_lane_idx = -1;	//

					// NOTE(review): this relies on findClosestBucket2_Multi() leaving a
					// well-defined (large) _closest_it in lanes that found nothing - confirm
					[[branch]]
					if (_closest_it == subgroupMin(_closest_it))	// this one;)
					{
						// need to transfer via shared memory as later on we can not use 
						// subgroupShuffle() from inactive thread:(
						shared_hit_lane_idx = int(gl_SubgroupInvocationID);

						//shared_closest_it             = _closest_it;
						shared_closest_fi             = _closest_fi;
						//shared_closest_bc             = _closest_bc;
						//shared_closest_normal         = _closest_normal;
						//shared_closest_material_flags = _closest_material_flags;
					}
					memoryBarrierShared();	// NOTE: this seems to be required on rdna4 even though everything happens within single wave?
					//hit_lane_idx = subgroupMax(shared_hit_lane_idx);

					// fetch all the values from this threads_running
					// (only the lane that owns the ray picks up the result)
					[[branch]]
					if (bucket_full_first_idx == gl_SubgroupInvocationID)
					{
						closest_fi             = shared_closest_fi;
						#if 0
						closest_it             = shared_closest_it;
						closest_bc             = shared_closest_bc;
						closest_normal         = shared_closest_normal;
						closest_material_flags = shared_closest_material_flags;
						#else
						if (closest_fi != -1)
						{
							{
								// actually compute intersection params
								// NOTE(review): closest_it is NOT updated on this path, it keeps
								// its initial 1000000.0 and is used below to compute the hit
								// position - presumably it should be taken from 'it1'; confirm
								RTFace rt_face = rt_get_face(closest_fi);
								vec3 p0 = rt_get_vertex(rt_face.v0);
								vec3 p1 = rt_get_vertex(rt_face.v1);
								vec3 p2 = rt_get_vertex(rt_face.v2);

								intersection it1 = intersectTriangle3(state.origin, state.dir, p2, p1, p0);
								#ifdef INNER_REFLECTION
								closest_normal = it1.denom > 0.0 ? it1.normal : -it1.normal;
								#else
								closest_normal = it1.normal;
								#endif

								closest_bc     = it1.bc;
								closest_material_flags = rt_face.material_flags;
							}
						}
						#endif

						set_from_lane = shared_hit_lane_idx;

						//if (rt_is_mask_set(state.flags, TF_DEBUG) && rt_is_mask_set(state.flags, TF_RUNNING))
						if (rt_is_mask_set(state.flags, TF_DEBUG))
						{
							// NOTE(review): the subgroupShuffle() calls below run with a single
							// active lane, so values read from other lanes are not reliable
							debugPrintfEXT("  [%04d] Running:%d closest_it:%f for thread:%d, hit_lane:%d = %f material_flags:%d, subgroupID:%d closest_fi:%d\\n", max_iter, is_running_num, closest_it, bucket_full_first_idx, shared_hit_lane_idx, subgroupShuffle(_closest_it, shared_hit_lane_idx), closest_material_flags, gl_SubgroupInvocationID, closest_fi);
							for(int ii = 0; ii < 64; ii++)
							{
								uint head = in_faces_list_tails_data[subgroupShuffle(icell_idx, ii)];
								debugPrintfEXT("  ++ %f faces_in_bucket:%d, iterator_idx:%d, iterator_stride:%d\\n", subgroupShuffle(_closest_it, ii), in_faces_list_data.node_buffer[head], subgroupShuffle(iterator_idx, ii), subgroupShuffle(iterator_cnt, ii));
							}
						}
					}
				}

				// clear the lane
				// (unsigned 64-bit bit clear; the bit is always set here because the
				// index was derived from the mask itself)
				bucket_full_mask &= ~(uint64_t(1) << bucket_full_first_idx);
			}
		
			if (closest_fi != prev_closest_fi && rt_is_mask_set(state.flags, TF_RUNNING))
			{
				if (rt_is_mask_set(state.flags, TF_DEBUG))
					debugPrintfEXT("  FI:%d OLD:%d LANE:%d, gl_SubgroupInvocationID:%d\\n", closest_fi, prev_closest_fi, set_from_lane, gl_SubgroupInvocationID);
			}

			// only check the hit if we are still running
			// NOTE(review): closest_fi is never reset to -1 after a hit, while
			// closest_normal / closest_material_flags are re-declared every iteration.
			// A hit from an earlier iteration therefore re-enters this branch with
			// unassigned attributes - confirm whether that is intended
			if (closest_fi != -1 && rt_is_mask_set(state.flags, TF_RUNNING))
			{
				state.bounces += int16_t(1);
				state.normal   = closest_normal;

				hit     = true;             // suppresses the DDA step for this iteration
				skip_fi = closest_fi;

				//
				if (state.bounces >= max_bounces || (closest_material_flags & MaterialFlag_RaytraceTerminate) != 0)
				{
					state.flags = rt_clear_mask(state.flags, TF_RUNNING);

					if (rt_is_mask_set(state.flags, TF_DEBUG))
						debugPrintfEXT("  Terminate:%02d subgroup size:%d, %f, %f, %f, FI:%d\\n", gl_SubgroupInvocationID, gl_SubgroupSize, closest_normal.x, closest_normal.y, closest_normal.z, closest_fi);
					//state.hit = true;
					//break;
				}

				vec2 bc = barycentric_for_face_yz(closest_fi, state.origin + state.dir * closest_it);
				if ((closest_material_flags & MaterialFlag_Flat) == 0)
					state.normal = interpolate_normal_from_bc_yz(closest_fi, bc, state.normal);

				bool flip_normal_on_glass = false;
				if ((closest_material_flags & MaterialFlag_Transparent) != 0)
				{
					// align normal along the current view dir
					flip_normal_on_glass = dot(state.normal, state.dir) > 0.0 ? true : false;
				}

				// calculate new origin and recalculate tracing parameters for the bounce
				// NOTE(review): icell, deltaT and nextCrossingT are not recomputed in this
				// function after the origin moves - confirm where that is meant to happen
				vec3 prev_state_origin = state.origin; // for lighting calculation
				state.origin = state.origin + state.dir * closest_it;

				if (rt_is_mask_set(state.flags, TF_DEBUG))
					debugPrintfEXT("  %f, %f %f, %f\\n", state.origin.x, state.origin.y, state.origin.z, closest_it);

				// we hit solid object which is not perfectly rough, reflect
				evaluate_material(state, prev_state_origin, closest_fi, closest_material_flags, bc, flip_normal_on_glass);
				//state.color = TurboColormap(fract(closest_fi * 13.11301));
				//state.color.rg = bc.xy;
				//state.color.b = 1.0 - bc.x - bc.y;
			}
		}

		// disabled: variant in which every lane tests its own bucket by itself
		if (false)
		if (bucket_full)
		{
			vec2 closest_bc;
			vec3 closest_normal;
			uint closest_material_flags;
			uint icell_idx = icell.z * GRID_RES * GRID_RES + icell.y * GRID_RES + icell.x;

			state.active_threads_factor  += ballot_count(true);
			state.active_threads_samples += 1;

			//state.tests += bucket_full;

			int face_tests = 0;
			uint max_tests = -1;
			//if (closest_fi == -1 && state.face_tests > 512)
			//	max_tests = 256;

			float rt = tmin + min(nextCrossingT.x, min(nextCrossingT.y, nextCrossingT.z));
			
			#if 1
			findClosestBucket2(state.user_data, icell_idx, bucket_full, max_tests, state.origin, state.dir, skip_fi, min(tmax, rt), closest_fi, closest_material_flags, closest_it, closest_normal, closest_bc, face_tests);
			#else
			findClosestBucket2_Multi(state.user_data, icell_idx, 0, 1, state.origin, state.dir, skip_fi, min(tmax, rt), closest_fi, closest_material_flags, closest_it, closest_normal, closest_bc, face_tests);
			#endif

			state.face_tests += face_tests;

			{
				uint hit_threads = ballot_count(true);
				if (rt_is_mask_set(state.flags, TF_DEBUG))
				{
					debugPrintfEXT("  Tests:%d (%d) at %d hit_threads:%d active_threads:%d (running:%d)\\n", face_tests, state.face_tests, state.tests, hit_threads, is_active_num, is_running_num);
				}
			}
			
			if (closest_fi != -1)
			{
				state.bounces += int16_t(1);
				state.normal   = closest_normal;

				hit     = true;
				skip_fi = closest_fi;

				//
				if (state.bounces >= max_bounces || (closest_material_flags & MaterialFlag_RaytraceTerminate) != 0)
				{
					state.flags = rt_clear_mask(state.flags, TF_RUNNING);
					//state.hit = true;
					//break;
				}

				vec2 bc = barycentric_for_face_yz(closest_fi, state.origin + state.dir * closest_it);
				if ((closest_material_flags & MaterialFlag_Flat) == 0)
					state.normal = interpolate_normal_from_bc_yz(closest_fi, bc, state.normal);

				bool flip_normal_on_glass = false;
				if ((closest_material_flags & MaterialFlag_Transparent) != 0)
				{
					// align normal along the current view dir
					flip_normal_on_glass = dot(state.normal, state.dir) > 0.0 ? true : false;
				}

				// calculate new origin and recalculate tracing parameters for the bounce
				vec3 prev_state_origin = state.origin; // for lighting calculation
				state.origin = state.origin + state.dir * closest_it;

				// we hit solid object which is not perfectly rough, reflect
				evaluate_material(state, prev_state_origin, closest_fi, closest_material_flags, bc, flip_normal_on_glass);
				//state.color = TurboColormap(fract(closest_fi * 13.11301));
				//state.color.rg = bc.xy;
				//state.color.b = 1.0 - bc.x - bc.y;
			}
		}

		if (rt_is_mask_set(state.flags, TF_RUNNING))
		{
			// the ray was inside the grid before and is outside now: it has left the grid
			if (rt_is_mask_set(traversal_flags, TF_IS_INSIDE) == false && rt_is_mask_set(traversal_flags, TF_PREV_INSIDE) == true)
			//if (inside == false && prev_inside == true)
			{
				state.flags = rt_clear_mask(state.flags, TF_RUNNING);
				state.flags = rt_set_mask(state.flags, TF_LEFT);
				state.flags = rt_set_mask(state.flags, TF_EARLY_EXIT);
				//state.running    = false;
				//state.left       = true;
				//state.early_exit = true;
			}

			if (!hit)
			{
				// all components of minimum mask (i.e. x <= y && x <= z, y <= x && y <= z, z <= y && z <= x) 
				// are false except for the corresponding smallest component of dt (if no mask), which 
				// is the axis along which the ray should be incremented
				// stolen from https://github.com/guozhou/voxelizer/blob/master/raycasting_fs.glsl
				// NOTE: nextCrossingT == dt

				vec3 mm = step(nextCrossingT.xyz, nextCrossingT.yxy) * step(nextCrossingT.xyz, nextCrossingT.zzx);
				icell += dda_step_from_dir(state.dir, mm);
				nextCrossingT += mm * deltaT;
			}
		}

		// disabled: hard cap on the number of face tests per ray
		if (false && state.face_tests >= 128)
		{
			state.flags = rt_clear_mask(state.flags, TF_RUNNING);
			state.flags = rt_set_mask(state.flags, TF_LEFT);
			state.flags = rt_set_mask(state.flags, TF_EARLY_EXIT);
		}

		// disabled: give up on the whole subgroup when only a few lanes are left
		if (false)
		if (true && state.face_tests >= 8192)
		{
			bool is_running = rt_is_mask_set(state.flags, TF_RUNNING);
			if (ballot_count(is_running) <= 4)
				break;
		}

		// leave only when no lane is running any more, so that every lane keeps
		// taking part in the subgroup operations above until the last one is done
		{
			bool is_running = rt_is_mask_set(state.flags, TF_RUNNING);
			if (ballot_count(is_running) == 0)
				break;

			//if (rt_is_mask_set(state.flags, TF_RUNNING) == false)
			//	return 0;
		}
	}

	return 0;
}


