;Entry for Hugi Size Coding Competition 27 - The Resurrection!
;By Rune Lehard Hansen Stubbe aka Mentor 2008 (stubbe@daimi.au.dk)
;
;This is the decoder matching the encoder of main.cpp. It has taken
;me quite a bit of experimentation to arrive at this exact solution,
;but it seems there is still a lot of stuff to try. Most notably,
;it would be interesting to investigate the potential of a sort of
;local search image transformer, instead of the heuristic one I am
;currently using.
;
;Characters are represented as 16-bit words with the most significant byte
;containing the color and the least significant byte the character code.
;
;Each word is decoded one bit at a time using a binary arithmetic decoder.
;The modelling part of the code is responsible for providing the probabilities
;for the next bit being a 0-bit or 1-bit to the arithmetic decoder.
;
;Models make predictions based on experiences made in the same context.
;A context consists of the decoded bits of the current word together
;with possibly the previous word and the word directly above (80 words back).
;As both of these words are optional, a total of four different models
;are possible. Predictions from all four models are computed and their
;counters are blended with equal weight. I have experimented quite a bit
;with higher order models and more advanced weighting schemes, but this
;seems to be a good compromise between compression and decompressor complexity.
;To avoid zero-frequencies, counters are offset with a constant amount,
;corresponding to a model with a fixed prediction.
;
;Context counters are computed by scanning through the entire decoded part of the
;image. As this is done for every model for every bit, the decoding
;time is quadratic in the size of the image. Doing everything in video memory
;doesn't speed things up either, but we are not optimizing for speed, are we? :)
;
;Counters are updated in a PAQ6-style semi-static fashion, where the correct
;counter is incremented and the opposite counter is roughly halved to favor recent
;experience.
;
;This is my first attempt at 16-bit coding, so let me know, if any of you
;more experienced guys can spot any obvious optimization opportunities :)


[ORG 100h]
%define IMAGE_WIDTH 	80
%define IMAGE_HEIGHT 	25
%define IMAGE_SIZE		(IMAGE_WIDTH*IMAGE_HEIGHT)
%define BOOST_FACTOR	15
%define NUM_MODELS		4

	;assume:	ax=0000h, bx=0000h, cx=00FF, dx=????
	;			si=0100h, di=FFFEh, bp=09??, sp=FFFE
start:	
	xchg si, ax				;si := 0
	call .setMode			;set mode
	
	push	0xB800
	pop 	ds				;work directly in video memory
	
	mov 	[si], si		;clear [0x0000]
	lodsw					;si := 2, ax := 0
	
	;; bx = 0
	;; cx = 0xFF
	inc ax					;ax := 1
	scasw					;di := 0	;)

.prediction:
	and cx, byte 0xF		;bitidx &= 0xF
	mov bp, NUM_MODELS-1	;model counter and base probability
	push bp					;push BASEPROB
	push bp					;push BASEPROB
.model_loop:
	pusha

	xor bx, bx
	xor di, di
	imul di					;ax := 0, dx := 0
.context_loop:
	pusha

	inc cx					;bitpos: 8..1
.matchloop:
	lea ax, [di+bx]
	test ax, ax
	jl short .no_match		;di+bx < 0?
	
	mov ax, word [di+bx]
	xor ax, word [si+bx]
	shr ax, cl
	jnz .no_match
	xchg cx, ax				;cx := 0
	
.skip:
	;update bx: 0, -2, -160
	dec bx
	dec bx
	jnp .not_up
	mov bl, -160
.not_up:

	shr bp, byte 1
	jc .matchloop
	jnz .skip
	;match
	
.no_match:
	;zf: match?
	popa
	
	jnz short .end_context
	inc dx
	inc ax
	bt [di], cx
	jc short .c
		shr dx, byte 1
		jmp short .end_context
	.c:	shr ax, byte 1
.end_context:

	scasw	;di += 2
	cmp di, si
	jl short .context_loop
	
	mov cl, BOOST_FACTOR
	.add_loop:
		add word [ss:0xFFFC], ax	;write to stack. there must be a nicer way to do this..
		add word [ss:0xFFFA], dx
		test ax, ax
		jz .z
		test dx, dx
.z:		loope .add_loop		;loop BOOST_FACTOR times
	popa
	dec bp
	jge short .model_loop

	;pop predictions 
	pop bp
	pop dx

	cmp si, IMAGE_SIZE*2
	js short .arit_decode	;end?
		;ch=0
		xchg ax, cx
		int	16h				;wait for keystroke
	.setMode:
		mov	ax, 3			;clean up
		int	10h
		ret

	;16-bit version of the crinkler binary arithmetic decoder
.aritloop:
	bt	[cs:_data], bx		;test bit
	adc	di, di				;shift bit in
	inc	bx					;next bit
	add	ax, ax				;shift interval	
.arit_decode:
	test ax, ax				;msb of interval != 0
	jns	short .aritloop
	btr [si], cx			;clear bit
	
	add	bp, dx				;bp = p0 + p1
	push ax					;push interval_size
	mul	dx					;dx:ax = p0 * interval_size
	div	bp					;ax = (p0 * interval_size) / (p0 + p1)
	pop	dx					;dx = interval_size
	cmp	di, ax				;data < threshold?
	jb short .zero
	xchg ax, dx				;ax = interval_size, dx = threshold
	sub	di, dx				;data -= threshold
	sub	ax, dx				;interval_size -= threshold
	bts [si], cx			;write bit
.zero:

	dec cx					;update bit
	jns short .dontInc		;update word
		inc si
		inc si
	.dontInc:
	jmp .prediction
	
_data:
incbin "compressed"