//----------------------------------------------
//Inversionism 2020 by Kuemmel
//----------------------------------------------
.syntax unified

//--- OS routine (SWI) numbers, sorted by value -------------------
// The ones that fit into 8 bits are passed directly as the immediate
// of a swi instruction in the code below.
.equ OS_NewLine,               0x03   // issued just before exiting
.equ OS_Byte,                  0x06   // used to hide the mouse pointer
.equ OS_Exit,                  0x11   // leaves the program
.equ OS_ReadEscapeState,       0x2c   // polled once per frame
.equ OS_ReadVduVariables,      0x31   // used to fetch the screen data
.equ OS_RemoveCursors,         0x36   // not referenced in the visible code
.equ OS_ReadMonotonicTime,     0x42   // frame timer source
.equ OS_ScreenMode,            0x65   // not referenced in the visible code
.equ OS_CallASWI,              0x6f   // issues the SWI whose number is in r10

// Larger than 0xff, so it is issued through OS_CallASWI.
.equ VFPSupport_CreateContext, 0x58ec1

.thumb
// Program entry: one-time setup, then execution falls through into the
// main loop. State handed over to the main loop:
//   r1 = address of the result words written by OS_ReadVduVariables
//   r5 = -1
start:
//---hide mouse---------------------------------------------------
  movs    r0,#106
  movs    r1,#0
  mvns    r5,r1                 // r5 = NOT 0 = -1; doubles as the list
                                // terminator below and is kept for the
                                // main routine
  swi     OS_Byte

//---get vfp support context--------------------------------------
  movs    r1,#32
  lsls    r0,r1,#26             // 32 << 26 = 0x80000000 (reuses r1)
  adds    r0,r0,#3              // r0 = 0x80000003
  movs    r2,#0
  // r10 = VFPSupport_CreateContext, assembled from the declared constant
  // (low half 0x8ec1, high half 0x5).
  movw    r10,#(VFPSupport_CreateContext & 0xffff)
  movt    r10,#(VFPSupport_CreateContext >> 16)
  swi     OS_CallASWI           // needed because the SWI number is > 0xff

//---get screen data----------------------------------------------
  adr.n   r0,start + 1024       // r0 = work area 1024 bytes past start
  movs    r1,r0
  movs    r2,#12
  movs    r3,#11
  movs    r4,#148
  stmia   r1!,{r2-r5}           // request list at r0: 12, 11, 148, -1
                                // write-back leaves r1 = r0 + 16
  swi     OS_ReadVduVariables   // r0 = request list, r1 = where the three
                                // results are read back from by the main loop

//--- main intro loop --------------------------------------------
// Draws one complete frame per pass and repeats until Escape is flagged.
//
// Must hold on entry to every pass (do not change inside the loop):
//   r1 = address of three words: yres%-1, xres%-1, screen_base%
//   r5 = -1
//
// Register use inside a pass:
//   r2, r3  = (yres%-1)>>1, (xres%-1)>>1 : inclusive upper loop bounds
//   r4      = pixel write pointer, starts at screen_base%, +4 per pixel
//   r6, r7  = y and x counters, each running from -1-bound to +bound,
//             i.e. 2*bound+2 steps
//   d31     = frame time as float, same value in both lanes
//   q7      = 0.5 | 0.5 | 0.5 | 2.0  per-lane colour offsets; lane 3 (s31)
//             is also the constant 2.0 used for the coordinate scale
//   d13[0]  = 2.0 / (yres%-1), scale from pixel to plane coordinates
//   d2      = dot(q,q) of the current pixel, kept for the final scaling
mainloop:

  ldm r1,{r2,r3,r4}           // yres%-1, xres%-1, screen_base%

  swi OS_ReadMonotonicTime    // r0 = timer value
  vdup.i32 d31,r0
  vcvt.f32.u32 d31,d31,#11    // read as fixed point with 11 fraction bits,
                              // so d31 = timer/2048 (author note: check
                              // which fractional part is okay)

  // Colour offsets. All four lanes are constants; a sin(time)-driven
  // offset for lanes 0/1 (*0.8 and *0.2) is not implemented.
  vmov.f32 q7,#0.5            // 0.5 | 0.5 | 0.5 | 0.5
  vmov.f32 s31,#2.0           // 0.5 | 0.5 | 0.5 | 2.0

  vmov s26,r2
  vcvt.f32.u32 s26,s26        // d13[0] = float(yres%-1)
  vdiv.f32     s26,s31,s26    // d13[0] = 2.0 / (yres%-1)

  lsrs r2,r2,#1               // halve: loops run symmetrically around 0
  lsrs r3,r3,#1

  subs r6,r5,r2               // y = -1 - r2 (r5 = -1 compensates for yres%-1)
  y_loop:
    subs r7,r5,r3             // x = -1 - r3 (compensates for xres%-1)
    x_loop:
       //--- q = pixel position scaled by 2/(yres%-1) ---------------
       vmov             d0,r7,r6        // x                 | y
       vcvt.f32.s32     d0,d0           // float(x)          | float(y)
       vmul.f32         d0,d0,d13[0]    // q.x               | q.y
                                        // (author note: with a fixed resolution
                                        // this could be a fixed-point vcvt)
       //--- p = q / dot(q,q) ----------------------------------------
       vmul.f32         d1,d0,d0        // q.x*q.x           | q.y*q.y
       vpadd.f32        d2,d1,d1        // dot(q,q)          | dot(q,q)
       vrecpe.f32       d3,d2           // estimate of 1.0/dot(q,q) in both lanes
                                        // (author note: check if the accuracy
                                        // is good enough)
       vmul.f32         d3,d0,d3        // p.x = q.x/dot     | p.y = q.y/dot
       vadd.f32         d3,d3,d31       // p.x += timer      | p.y += timer
       vadd.f32         d3,d3,d3        // p *= 2
       vadd.f32         d3,d3,d3        // p *= 2 again, p is now scaled by 4

       //--- per-channel pattern: fract(p + offset)^2 ----------------
       vdup.32          q2,d3[0]        // p.x in all four lanes
       vdup.32          q3,d3[1]        // p.y in all four lanes
       // The x chain (q2/q4) and the y chain (q3/q5) below are identical.
       // Author's idea, left disabled: run one chain in a two-pass loop.
       //movs r0,#1
       //xy_loop:
         vadd.f32         q2,q2,q7        // cx = p.x + offsets
         vadd.f32         q3,q3,q7        // cy = p.y + offsets
         vcvt.s32.f32     q4,q2           // int(cx)
         vcvt.s32.f32     q5,q3           // int(cy)
         vcvt.f32.s32     q4,q4           // float(int(cx))
         vcvt.f32.s32     q5,q5           // float(int(cy))
         vsub.f32         q2,q2,q4        // fract(cx)
         vsub.f32         q3,q3,q5        // fract(cy)
         vmul.f32         q2,q2,q2        // fract(cx)^2
         vmul.f32         q3,q3,q3        // fract(cy)^2
         //vswp             q2,q3
       //subs r0,r0,#1
       //beq xy_loop

       //--- combine, scale by dot(q,q), pack to one 32-bit pixel ----
       vmax.f32         q2,q2,q3        // max(fract(cx)^2, fract(cy)^2) per lane
       vmul.f32         q2,q2,q2        // squared once more
       vmul.f32         q2,q2,d2[0]     // * dot(q,q)
       vcvt.u32.f32     q2,q2,#8        // to unsigned integer, scaled by 256
       vmovn.u32        d0,q2           // 32 -> 16 bit per lane
       vqmovn.u16       d0,q0           // 16 -> 8 bit, saturating. q0 is d0:d1,
                                        // so the stale d1 only lands in the
                                        // upper word of d0, which is not stored
       vst1.32          {d0[0]},[r4]!   // write the pixel, r4 += 4
       adds r7,r7,#1
       cmp r7,r3
    ble x_loop                          // signed: x runs up to and including r3
    adds r6,r6,#1
  cmp r6,r2
  ble y_loop                            // signed: y runs up to and including r2

//--- escape test and exit ---------------------------------------
  swi OS_ReadEscapeState // check for ESC
  bcc mainloop           // carry clear: render the next frame
  swi OS_NewLine         // to trigger desktop redraw
  swi OS_Exit            // exit to OS
