;"volumi revisited" intro by Kuemmel for Lovebyte 2024 countdown
;modifications to the original: red-ish colours and different texture
;needs to be run on FreeDOS and fast CPU (tested on i5/2.9 GHz), DOSBox is way to slow
;additional credits:
;- idea derived from Mentor's shader: https://www.shadertoy.com/view/MsjBWR 

iter% =     83 ;iterations of volumina routine, e.g. 83 => affects speed !
resf% = 0x3c71 ;0x3c71 final float of resize as integer word

org 100h
use16

;---setup and data
;data in front = addressing by si=0x100
sub al,-(0x13+0x80) ;this provides screen mode (thanx Hellmood !) and data does no harm
sphere_size_init    dw 0x3e00 ;si+0(+ 2) 0.125 (0x3e00) defines size of sphere, also used to avoid div error
f_step              dw 0x3ba4 ;  +2(+ 4) defines step for w, e.g. approx 0.005
w_init              dw 0x3e80 ;  +4(+ 6) 0.25 (0x3e80) init for iteration, affects visuals
aspect              dw 0x3f66 ;  +6(+ 8) aspect ratio correction from (320x200=16:10) to 16:9 = approx 0.9
resize_add          dw 0x38d2 ;  +8(+10) resize add, approx 0.0001
db 0            ;needed to make data harmless...still saves bytes
int 10h
mov fs,[si]     ;texture address (results in 0x6d2c which is okay as an address)
push 0a000h     ;screen address
pop es          
fninit          ;needed for a fresh FreeDOS start, crashed otherwise
fldz            ;init rotation to zero at start

;---sine red-ish->white-ish palette generation
;needs: bx=0,cx=0x00ff
palette_loop:
    mov dx,3c8h
    mov al,cl
    out dx,al       ;could be skipped, but 3c8h seems to needs 0 init with FreeDOS
    inc dx
    mov al,63
    cmp al,cl
    cmova ax,cx
    out dx,al       ;R
    mov al,bl       ;cheap sin(c) = approx (1-(1-c)*(1-c))
    shr al,1        ;0...127
    mul al          ;c*c
    mov al,63       
    sub al,ah       ;63-highbyte(1-c*c)
    out dx,al       ;G
    out dx,al       ;B
    inc bx
loop palette_loop

;---main frame loop
pixel_loop:
    xor dx,dx                   ;init for div, crashes otherwise
    mov ax,di
    mov cx,320                  ;no rrrrola this time, want proper x,y
    div cx              
    sub dx,159                  ;center x
    sub ax,99                   ;center y
    
    mov word[bp+si],dx          ;st0                |st1    |st2    |st3    |st4    |st5    |st6    |st7
    fild word[bp+si]            ;x                  |t
    fmul dword[si]              ;x=x*sphere_size    |t
    mov word[bp+si],ax
    fild word[bp+si]            ;y                  |t
    fmul dword[si]              ;y=y*sphere_size    |x      |t
        
    mov  word[bp+si],si         ;reordered, si = 256 = tex_size
    fld  st0                    ;y                  |y      |x      |t
    fmul st0,st0                ;y*y                |y      |x      |t
    fld  st2                    ;x                  |y*y    |y      |x      |t
    fmul st0,st0                ;x*x                |y*y    |y      |x      |t
    faddp st1,st0               ;d=x*x+y*y          |y      |x      |t              
    fadd dword[si]              ;d=d+sphere_size    |y      |x      |t      ;to capture div error or use f_step
    fild word[bp+si]            ;tex_size           |d      |y      |x      |t
    fdiv st0,st1                ;rd=tex_size/d      |d      |y      |x      |t
    fld  dword[si+4]            ;init w             |rd     |d      |y      |x      |t
    
    mov ax,iter%                ;init iteration counter
    cwd                         ;clear colour = dx
    iteration:                  ;uses a cheap approximation for ASIN(w*d) => 1-SQRT(1-w*d)
            fld1                ;1                  |w      |rd     |d      |y      |x      |t
            fld st1             ;w                  |1      |w      |rd     |d      |y      |x      |t
            fmul st0,st4        ;w*d                |1      |w      |rd     |d      |y      |x      |t
            fcomi st0,st1       ;w*d                |1      |w      |rd     |d      |y      |x      |t
            jnc add_nothing     ;is (w*d<1) ? if yes then don't set carry and skip calculation => speed matters !
            fsubr st0,st1       ;1-w*d              |1      |w      |rd     |d      |y      |x      |t
            fsqrt               ;fsqrt(1-w*d)       |1      |w      |rd     |d      |y      |x      |t
            fsubp st1,st0       ;sw=1-fsqrt(1-w*d)  |w      |rd     |d      |y      |x      |t
            fmul st0,st2        ;sw=sw*rd           |w      |rd     |d      |y      |x      |t  
            fld st0             ;sw                 |sw     |w      |rd     |d      |y      |x      |t
            fmul st0,st6        ;sw*x               |sw     |w      |rd     |d      |y      |x      |t
            fadd st0,st7        ;sw*x-t             |sw     |w      |rd     |d      |y      |x      |t
            fadd st0,st7        ;add twice for better rotation visual
            fistp word[bp+si]   ;sw                 |w      |rd     |d      |y      |x      |t
            fmul st0,st4        ;sw*y               |w      |rd     |d      |y      |x      |t
            mov bx,word[bp+si]  ;reordered
            fadd st0,st6        ;sw*y-t             |w      |rd     |d      |y      |x      |t
            fistp word[bp+si]   ;w                  |rd     |d      |y      |x      |t
            mov bh,byte[bp+si]
            movzx bx,byte[fs:bx];seems faster for newer cpus than xor+mov or xlatb
            add dx,bx           ;colour = colour + texture[y:x]
            jmp skip_clr_st0_st1;or use fld st0 + fld st0 to skip the jump, but takes 2 bytes more and speed seems same 
        add_nothing:
        fcompp                  ;w                  |rd     |d      |y      |x      |t
        skip_clr_st0_st1:   
        fadd dword[si+2]        ;w=w+f              |rd     |d      |y      |x      |t
        dec ax
    jnz iteration               ;agner's optimization manual: 'loop' is slow on all cpu's so I give one byte more here
    fcompp                      ;d                  |y      |x      |t
    fcompp                      ;x                  |t
    fstp st0                    ;t
    shld ax,dx,(3+8)            ;one byte shorter instead of shr/mov combo
    stosb
    test di,di
shortcut:
jnz pixel_loop

;---size and rotation animation
cmp word[si+2],resf%    ;check float by integer
jne skip_rotation
   fadd dword[si+6]     ;add rotation offset or not
skip_rotation:
je skip_size_change
   fld  dword[si]       ;change size or not
   fsub dword[si+8]
   fstp dword[si]
skip_size_change:

;---seamless texture generation (each frame)
;needs: di=0
fist word[bp+si]
texture:
    mov ax,di
    and ax,0011111100111111b ; or 0011111000111110b
    mov cl,ah
    sub al,31  ;x
    imul al    ;x*x
    xchg ax,cx ;save x*x and get y
    sub al,31  ;y
    imul al    ;y*y
    add ax,cx  ;x*x+y*y
    shr ax,2
    and al,11100000b
    neg al
    add al,128
    jns skip_capy
       salc
       test byte[bp+si],1100000b
       jne skip_capy
         mov al,31 
    skip_capy:
    mov byte[fs:di],al
    inc di
jnz texture

;---vsync for timing...
mov dx,3dah
vsync:
  in al,dx
  test al,8
jz vsync

;---check keyboard
in al,0x60      ;check for ESC
dec al          ;ah not zero here
jnz shortcut    ;shortcut saves two bytes :-)
ret