;--------------------------------
;"Plattenbau 3000 AVX" by Kuemmel
;
; requires an AVX2-capable CPU
; tested on FreeDOS
;--------------------------------
org 100h                ;addresses are assembled relative to offset 0x100 (DOS .COM layout)

;---parameters (assemble-time constants, none of them occupies memory)
effect_speed_shift=9    ;default: 9; shift count of "shld ax,bp,..." => selects which frame counter bits pick the mask variant
effect_01=1000000000b   ;default: 1000000000b should kind of fit to effect_speed_shift; bit mask tested against frame counter bp
depth_initial=8         ;default: 8; start depth per word = -1 shl depth_initial (-256 for 8)
depth_steps=512         ;raycasting steps 0...65535 (loaded into 16 bit counter bx for every 16 pixel group)
scr_width=1024          ;screen width in pixels, x runs from x_offset-scr_width to x_offset-1
scr_height=768          ;screen height in pixels, y runs from y_offset-scr_height to y_offset-1
x_offset=512+130        ;center x: end value (exclusive) of the x coordinate loop
y_offset=384+50         ;center y: end value (exclusive) of the y coordinate loop

;---create words with 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,... for x-offset mask
;   Fills a word table starting at offset 0x200 with ascending values.
;   Only the first 16 words are read later (vpaddw ymm0,ymm0,[0x200]).
;   Nothing here initialises cx or ax: the loop count and the start value
;   rely on the register contents at program start noted below.
;   NOTE(review): stosw also presumes es = program segment and direction flag clear - confirm for the target DOS
   mov di,0x200  ;table start; expected at entry: cx=0x00ff, ax = 0x0000
   loop0123:
      stosw      ;store ax at [es:di], di advances by 2
      inc ax     ;next table value
   loop loop0123 ;after loop di is 0x3fe, ax=0x00ff (255 words written with cx=0x00ff)

;---screen mode stuff by JIN-X, set screen mode and get LFB address
;   di is not reloaded: the mode info buffer starts where the table loop
;   above left di (0x3fe), so the dword read later at 0x426 is buffer offset 0x28.
;   The status returned in ax is not checked after either call.
   mov ax,0x4f01 ;get Mode Info  INT 0x10, ax=0x4f01, cx=mode, es:di=256 byte buffer
   mov cx,0x4105 ;mode 0x105 with bit 14 set (presumably the VBE linear frame buffer flag - confirm); if successful ax = 0x004f
   int 0x10
   mov bx,cx     ;reuse the same mode number for the set mode call
   mov ax,0x4f02 ;set Video Mode INT 0x10, ax=0x4f02, bX=mode, es:di=CRTCInfoBlock
   int 0x10      ;if successful ax = 0x004f

;---pmode stuff by JIN-X
;   Sets the PE bit in CR0 and enables the AVX register state in CR4/XCR0.
;   No descriptor tables are loaded and no segment register is reloaded afterwards.
;   NOTE(review): presumably required because VEX encoded instructions are rejected in real mode - confirm
   cli              ;interrupts stay disabled from here on (there is no sti in this file)
   pop fs           ;fs = word on top of the stack; presumably the 0x0000 DOS puts there for a .COM program - confirm
                    ;fs is the segment used for the frame buffer stores in the main loop
   mov eax,0x40603  ;one constant for both lmsw (low 16 bits) and cr4 (all 32 bits)
   lmsw ax          ;ax=0x0603, bit 0 (PE) and bit 1 (MP) set
   mov cr4,eax      ;bits 0,1,9,10,18 set; 9/10/18 should be OSFXSR/OSXMMEXCPT/OSXSAVE - verify against the CPU manual
   xor ecx,ecx      ;ecx=0 selects XCR0 for xgetbv/xsetbv
   xgetbv           ;edx:eax = XCR0
   or al,0x7        ;set bits 0..2 (x87, SSE and AVX state - verify against the CPU manual)
   xsetbv           ;write XCR0 back

;---set first palette entry (background) to orange
;   Writes index 0 to port 0x3c8, then three colour bytes to port 0x3c9
;   in the order 0xff (63), 47, 0.
   xchg ax,cx   ;clear ax with cx (cx is 0 from "xor ecx,ecx" above), one byte shorter than xor ax,ax
   mov dx,0x3c8 ;palette index port
   out dx,al    ;palette entry 0
   inc dx       ;dx = 0x3c9, palette data port
   dec ax       ;ax = 0xffff, al = 0xff => 63 (assumes only the low 6 bits count - TODO confirm)
   out dx,al    ;first component (red)
   xor al,208   ;al = 47 (for 31 use 224) and carry flag clear with xor
   out dx,al    ;second component (green)
   salc         ;al =  0 (undocumented opcode: al = 0xff if carry set, else 0; carry is clear here)
   out dx,al    ;third component (blue)

;---init stuff
;   State that lives across all frames: bp = frame counter, ymm6 = timer, ymm2 = constant -1 per word
   xor bp,bp                ;init bp for clear effect start (frame counter = 0)
   vzeroall                 ;for xmm6 counter...may be not needed.... (zeroes all ymm registers, so timer ymm6 starts at 0)
   vpcmpeqw ymm2,ymm2,ymm2  ;all bits to 1 => = ymm2 = -1|-1|... (used as depth decrement and as source for the initial depth)
   
   ;---main loop: renders one frame per iteration, 16 pixels per inner x step
   ;   bp   = frame counter            ymm2 = -1 in every word (constant)
   ;   cx   = current y coordinate     ymm3 = y broadcast to 16 words (plus mask on alternate phases)
   ;   ax   = current x coordinate     ymm4 = current depth per word (negative, counts down)
   ;   bx   = remaining ray steps      ymm6 = timer per word
   ;   edi  = frame buffer address     ymm7 = hit mask byte in all 32 bytes
   ;   xmm1 = hit colours of the 16 pixels being traced
   ;   [si] = scratch memory (1 to 32 bytes); si is never set in this file,
   ;          NOTE(review): presumably still 0x100 from program start, so the scratch
   ;          overlaps the already executed init code - confirm
   mainloop:

   shld ax,bp,effect_speed_shift ;low bits of ax = top bits of bp => al bits 1..0 = bp bits 8..7 for shift 9
   and al,00000011b         ;keep the two variant bits
   or  al,00010000b         ;mask calculation/variation => 000100??b
   mov byte [si],al         ;via memory, vpbroadcastb is used with a memory source here
   vpbroadcastb ymm7,[si]   ;needed on all 32 bytes

   mov cx,-scr_height+y_offset ;first y, rows run up to y_offset-1
   mov edi,[0x426]          ;get screen address (di + 0x28 => 0x426), reloaded every frame as edi is advanced below
   y_loop:
      mov [si],cx
      mov ax,-scr_width+x_offset ;first x of the row
      vpbroadcastw ymm3,[si]   ;ymm3 = y in all 16 words
      ;change geometry after every 4 shapes
      test bp,effect_01        ;with the default the tested bit is bp bit 9
      jz skip_effect_01
         vpaddw ymm3,ymm3,ymm7 ;add the mask bytes (read as words) to y
      skip_effect_01:
      x_loop:
         mov [si],ax
         mov bx,depth_steps                  ;ray step counter => if >255 steps use "bx" + 1 Byte
         vpbroadcastw ymm0,[si]
         vpsllw       ymm4,ymm2,depth_initial ;initial depth = -1 shl depth_initial per word
         vpaddw       ymm0,ymm0,[0x200]      ;x = x+0|...|x+15
         vmovdqu      [si],ymm0              ;store x as there's no more regs available (32 bytes of scratch at [si])
         vpxor        ymm1,ymm1,ymm1         ;hit_colours = 0
         ray_loop:
            vpaddw       ymm4,ymm4,ymm2      ;depth = depth - 1
            vpmullw      ymm0,ymm4,[si]      ;x = (x-center)*depth
            vpmullw      ymm5,ymm4,ymm3      ;y = (y-center)*depth
            vpaddw       ymm0,ymm0,ymm6      ;x + timer
            vpaddw       ymm5,ymm5,ymm6      ;y + timer
            vpand        ymm5,ymm5,ymm0      ;combine: x AND y
            vpsraw       ymm5,ymm5,10        ;arithmetic shift right by 10 per word
            vpand        ymm5,ymm5,ymm4      ;initial plattenbau geometry
            vextracti128 xmm0,ymm5,1         ;high 128 to low 128
            vpacksswb    xmm5,xmm5,xmm0      ;current color from 16 words to 16 bytes (signed saturation)
            vpandn       xmm0,xmm1,xmm7      ;mask only if hit_colour NOT set already: xmm0 = (NOT hit_colours) AND mask
            ;vptest       xmm0,xmm0           
            ;jz early_exit                    ;early exit, major speed up but no constant frame rate any more
            dec bx                           ;reordered; sets the zero flag for the jnz below
            vpand        xmm0,xmm0,xmm5      ;check if hit occurred => if current color contains the mask
            vpcmpeqb     xmm0,xmm0,xmm7      ;if hit occurred set byte to 11111111 
            vpblendvb    xmm1,xmm1,xmm5,xmm0 ;update only the 11111111 byte's of hit_colours
         jnz ray_loop                        ;using LOOP is much slower; flags are still those of "dec bx"
         ;early_exit:
         vmovdqa [fs:edi],xmm1               ;plot all 16 byte pixels (aligned store, needs a 16 byte aligned address)
         add ax,16                           ;next group of 16 pixels
         add edi,16                          ;rows are written back to back, NOTE(review): assumes bytes per scan line = scr_width - confirm
         cmp ax,x_offset
      jl x_loop                              ;signed compare, x starts negative
      inc cx
      cmp cx,y_offset
   jne y_loop

   ;---vsync for timing
   ;mov dl,0xda ;=> 0x03da as dh=x03 from palette change and not changed anywhere else 
   ;vsync:
   ;   in al,dx
   ;   test al,8
   ;jz vsync

   vpsubw ymm6,ymm6,ymm4 ;timer = timer - final depth; depth is negative so the timer grows (by 768 per frame with the defaults)
   inc bp                ;next frame
   
   in al,0x60   ;read a byte from port 0x60 (presumably the keyboard data port - confirm)
   cbw          ;ax = sign extended al
   dec ax       ;ax = 0 only if the byte read was 0x01 (presumably the ESC make code - confirm)
   jnz mainloop
   ;---exit, ax = 0 here
   mov cr0,eax  ;write eax to cr0 which clears the PE bit set by lmsw, NOTE(review): relies on the upper half of eax being 0 - confirm
   mov al,0x3   ;ax = 0x0003 => set video mode 3; these two lines can be dropped to save bytes, the program then exits leaving 1024x768x8Bit mode active
   int 0x10     ;these two lines can be dropped to save bytes, the program then exits leaving 1024x768x8Bit mode active
   int 0x20     ;terminate; needed due to pop fs from init code, which took a word off the stack (presumably the one a plain ret would use - confirm)

