; Bonz's entry for Hugi size coding compo 15
;
; assemble with nasm -fbin -oentry.com entry.asm
;
; This is the version that I prefer.  The source is clear
; (apart from some mess in the calculation of the VRAM
; segment), the program is fast (it only sets the palette
; once) and stable (it does not depend on file handles), 
; yet it is only 142 bytes.  This is just before the final
; destructuration and spaghettization :-) that brought
; me below the 140 mark.

; Number of bytes transferred by the `io` routine: the whole bitmap file is
; read at startup and written back on exit.  17462 = 4436h.  The pixel data
; accounts for 128x128 = 16384 bytes; the remaining 1078 bytes are
; presumably the 1024-byte palette plus 54 bytes of BMP headers
; (TODO confirm against the compo's bitmap).
BMP_FILE_SIZE   equ     17462           ; = 128x128 + 54 + 1024

; constants for the I/O subroutine
;
; Each word packs two unrelated bytes so one 16-bit value sets up both:
; - READ_GFX / WRITE_TEXT are loaded into BX before `io`: the high byte is
;   the DOS function used for the transfer, the low byte is the video mode
;   that `io` hands to INT 10h.
; - OPEN_SPACE / CREATE_CR describe AX before `io`: the high byte is the DOS
;   function used to open/create the file, the low byte is the character
;   that terminates the file name on the command line.  Only the high byte
;   of OPEN_SPACE is used directly (`mov ah, OPEN_SPACE>>8`); AL already
;   holds 20h at that point.
READ_GFX	equ	3f13h		; read, set gfx mode
OPEN_SPACE	equ	3d20h		; open, deny write, space-delimited
WRITE_TEXT	equ	4003h		; write, set text mode
CREATE_CR	equ	3c0dh		; create, CR-delimited

; Here is the reasoning behind all of these:
; - IO_AREA --> same as DAC port
;   (the file buffer offset doubles as the port number left in DX by `io`)
; - GFX_SEG --> origin in (32, 96)
;   NOTE(review): 2D6h paragraphs past A000h is byte 11616 = 36*320 + 96,
;   which would put the origin at x=96, y=36 -- confirm the coordinates.
; - TOPLEFT_OFS --> so that BP overflows after exactly 128 lines
;   (BP grows by 320 + 128 per line and the pixel loop exits on signed
;   overflow of that addition)
; - WSPACE_END --> this is also the segment that we write to.
;   (the same number is used as an offset in SI and as a segment in FS)
; - INITIAL_BP --> WSPACE_END - 128 + INITIAL_BP = TOPLEFT_OFS

IO_AREA		equ	03c8h
GFX_SEG		equ	0a2d6h
TOPLEFT_OFS	equ	4000h
; = 0a2d6h - 400h = 9ed6h (the division binds tighter than the subtraction)
WSPACE_END	equ	GFX_SEG - TOPLEFT_OFS / 16
; = 4000h - 9ed6h + 80h, i.e. 0a1aah once truncated to 16 bits
INITIAL_BP		equ	TOPLEFT_OFS - WSPACE_END + 128

; BMP file characteristics
; BMP_SIZE is the number of pixel bytes (one byte per pixel, 128 rows of
; 128 columns).  BMP_DATA is the offset of the first pixel byte once the
; file has been loaded at IO_AREA: the pixel data is taken to be the last
; BMP_SIZE bytes of the file, so BMP_DATA = 3c8h + 4436h - 4000h = 7feh.
BMP_SIZE	equ	128 * 128
BMP_DATA	equ	IO_AREA + BMP_FILE_SIZE - BMP_SIZE

; Bit masks that make up a transformation opcode (layout VDUXVDUX).
; The `xform` loop consumes the byte as two 4-bit groups, high group first;
; within a group the steps are applied in the order V, then D/U, then X:
;   V --> invert BH (XOR with FFh)
;   D --> add 1 to BH         U --> subtract 1 from BH
;   X --> exchange BH and BL
; The digit in each name is the position of the step in that sequence.
V1		equ 128
D2		equ 64			; scroll down
U3		equ 32			; scroll left
X4		equ 16			; exchange x/y
V5		equ 8			; vertical flip
D6		equ 4			; same step as D2, second group
U7		equ 2			; same step as U3, second group
X8		equ 1			; same step as X4, second group

	org	256			; first byte of the image is at offset 100h

	; A shell script that helped me in obtaining these ways to do
	; the transformations (together with lots of patience) is
	; included in the archive.
	;
	; The ten bytes below serve two purposes at once:
	; - as data: byte N is the transformation opcode (VDUXVDUX mask)
	;   that the LODSB in the keyboard loop fetches for digit key N;
	; - as code: they are the first bytes of the program, so they are
	;   executed once at startup as the instructions named in the
	;   right-hand column of the comments.
	; Changing any mask therefore also changes the startup code.
	;
	; NOTE(review): `table` is not referenced anywhere in the visible
	; source; the keyboard loop builds the address 100h + digit itself.

table	equ	$-30h
	db	U3+V5+U7		; 0	SUB BH, BH
	db	V1+D2+U3+X4+V5+D6+U7+X8	; 1	(clears CF)
	db	D2+D6+U7		; 2	INC SI
	db	V1+X4+D6+U7		; 3	XCHG SI, AX
	db	X4+D6+X8   		; 4	ADC AX,131Fh
	db	X4+V5+D6+U7+X8		; 5
	db	X4+U7+X8   		; 6
	db	V1+X4+V5+X8		; 7	CWD
	db	U3+X4+X8  		; 8	XOR  [BX+SI], BX
	db	X4+V5			; 9	(but BX=0, so has no effect)

	; AX = 1420h, DX=SI=0, others untouched
	; (this is the state left behind by executing the table bytes as
	; code; it presumably relies on the register values DOS hands to a
	; .COM program at entry -- TODO confirm on the target DOS)

	; Load the bitmap named by the first command-line argument and
	; switch to graphics mode, all through the shared `io` routine.
	mov	di, 82h			; point to command line
	mov	bx, READ_GFX		; read, set gfx mode
	mov	ah, OPEN_SPACE>>8	; open, deny write, space-delimited
					; (AL is already 20h, see above)
	call	io			; read file at offset 3c8h

	; SI is still 0 here, so the exchange zeroes AX; `io` returned with
	; DX = 3c8h, which is used below as a port number.
	xchg	si, ax			; prepare a counter
	out	dx, al			; and DX = data area = DAC address port
	inc	dx			; DX = 3c9h for the palette loop's OUTs

; It turns out that the most stupid palette-setting code is also the smallest!
;
; Writes AL to port DX twelve times for every value of AX from 0 to 7FFFh.
; Twelve writes are presumably four consecutive palette entries with three
; equal components each, which would leave a grey ramp (TODO confirm
; against the VGA DAC documentation).
; On exit BX = 0 and AX = 8000h, so AL = 0 for the first pass of kbdloop.

palette:
	mov	bl, 12			; BH is expected to be 0 here, so BX = 12
setpal:
	out	dx, al			; set a component
	dec	bx			; emit 12 equal values
	jnz	setpal
	inc	ax			; then increment AX
	jns	palette		; emit 32768*12 = 393216 values (!)

; Top of the main loop: AL holds the transformation opcode to apply to the
; whole image.  The registers saved here are restored by the POPA that
; follows the copy-back after the pixel loop.
kbdloop:				; first time AL = 0 --> null transform
	pusha				; save ptr to filename, BX, CX
	mov	di, BMP_DATA		; DI = first byte of the image
	mov	si, WSPACE_END	; SI = end of workspace
	mov	fs, si			; the same number is reused as a segment
	mov	bp, INITIAL_BP	; Fix from workspace to VRAM

	; Only CL is replaced; CH keeps its old value, so CX also serves as
	; the byte count of the REP MOVSB after the pixel loop.
	mov	cl, 2			; CL = shift count

; Per-pixel loop.  BX walks backwards through the 128x128 output positions
; (BH = row, BL = column) while SI walks backwards through the workspace.
; For each position a scratch copy of BX is run through `xform` to obtain
; the coordinates to fetch from the bitmap at DI; the fetched byte is stored
; both in the workspace and on screen.
pixel:
	dec	bx
	and	bl, 7fh			; Mask away bit 7 of the column

	dec	si		   	; Next byte in the workspace
					; (BX goes backwards, and so does SI)
	pusha		   		; Save loop counter and opcode (AL)

; Applies the opcode in AL to the coordinates in BH/BL, four bits per pass.
; A second pass runs only if bits remain in AL after the first one.
xform:
	cbw	                   	; AL has opcode (VDUXVDUX)
	xor	bh, ah             	; so if V, invert BH
	shl	al, cl             	; AL = UXVDUX00  CF = D
	cbw                        	; If U = 1 and D = 0, sum AH = FFh
	adc	bh, ah             	; If D = 1 and U = 0, sum CF = 1
	shl	al, cl             	; AL = VDUX0000  CF = X
	jnc	no_xchg
	xchg	bh, bl             	; If X, exchange row/column
					; (XCHG leaves the flags from SHL alone)
no_xchg:
	jnz	xform			; If anything else to do, loop
	
	; This computes the address and masks away bits 15 and 7
	; We can leverage the shift count already in CL
	; Result: BX = (BH & 7Fh) * 128 + (BL & 7Fh)

	shl	bl, 1              	; bits 14-8 = y, bits 7-1 = x
	shl	bx, 1              	; bits 15-9 = y, bits 8-2 = x
	shr	bx, cl             	; bits 13-7 = y, bits 6-0 = x

	mov	dl, [bx+di]        	; Load from bitmap
	mov	[si], dl	   	; Store to workspace
	mov	[fs:bp+si], dl		; Store to the screen through FS
	
	popa				; Restore loop counter and opcode
	or	bl, bl			; Plotted last pixel on the line?
	jnz	pixel
	; SI has moved back 128 bytes during the line, so adding 320 + 128
	; moves BP+SI forward by exactly one 320-byte screen line.
	add	bp, 320 + 128	; Yes, advance to the next one
	jno	pixel			; until BP exceeds 8000h

	; Make the transformed image the new source image, so that
	; successive key presses accumulate.  SI points at the start of the
	; workspace copy and DI at BMP_DATA at this point.
	rep	movsb			; Copy 4402h bytes from workspace to
					; bitmap (more than it is necessary).

	popa				; pop CX, BX and address of file name
	mov	ah, 8              	; Read a key
	int	21h                	; go through DOS

	; AAM 30h leaves AH = AL / 30h and AL = AL mod 30h.  For '0'..'9'
	; that is AH = 1 and AL = digit, so after SAHF the carry is set and
	; after the XCHG SI = 0100h + digit, the address of the digit's
	; byte in the table at the start of the program.
	; NOTE(review): every key code from 30h to 5Fh gives AH = 1, so keys
	; other than digits also loop, reading a byte past the ten entries.
	aam	30h			; If spacebar, set AH = 0
	sahf				; and then CF = 0
	xchg	si, ax
	lodsb				; Get the opcode (VDUXVDUX) in AL
	jc	kbdloop           	; If a number, loop

	; Exit path: set up the arguments for `io` and fall through into it
	; (there is no CALL).  DI points at the second command-line argument.
	; The RET at the end of `io` then pops whatever word is on top of
	; the stack -- presumably the zero word DOS places there for a .COM
	; program, which ends the program (TODO confirm).
	mov	bx, WRITE_TEXT		; write, set text mode
	mov	ax, CREATE_CR		; create, CR-delimited

	; This subroutine does in practice everything that
	; must be done twice in the program :-) (that is, at
	; startup and on termination): it scans the command
	; line up to an arbitrary terminator, opens a file with an
	; arbitrary DOS function, does I/O with another arbitrary
	; DOS function, and sets a video mode (!).
	;
	; It is reached by CALL at startup and by fall-through on exit.
	; None of the INT 21h calls is checked for errors.
	;
	; AX = function to open the file
	; AL = filename delimiter
	; BX = function to read/write the file + video mode to set
	; CX = must be at least as long as filename
	; DI = pointer to filename (into command line --> high byte = 0)
	;
	; on output
	; AX = whatever the final INT 21h returns (the original note says
	;      4437h; NOTE(review): DOS read/write normally return the
	;      transferred byte count in AX -- confirm)
	; BX = file handle
	; CX = BMP_FILE_SIZE (4436h), the requested transfer length
	; DX = 3c8h
	; DI = pointer to next filename

io:
	mov	dx, di			; ptr to file name on command line
	repne	scasb			; skip to the first whitespace
	mov	[di-1],dh		; store a zero
					; (DH = 0 because the pointer is < 100h)
	repe	scasb			; skip to the next filename
	dec	di			; off by one because of SCASB

	mov	cl, 20h			; attribute for create
	int	21h			; open or create, as selected by AH
	xchg	bx, ax			; handle in BX, I/O function + mode in AX

	pusha				; save DOS function and handle
	cbw				; AH = 0 (both mode numbers are < 80h)
	int	10h			; enter video mode specified by AL
	popa
	
	; NOTE(review): 3c8h is the same value as IO_AREA; the named
	; constant would keep this in step with BMP_DATA.
	mov	dx, 3c8h
	mov	cx, BMP_FILE_SIZE
	int	21h			; read or write CX bytes at DS:DX
	ret
