//----------------------------------------------
//Inversionism 2020 by Kuemmel
//----------------------------------------------
.syntax unified

//--- SWI numbers of the OS routines, in ascending numeric order ---
// Everything up to 0xff can be used directly as a Thumb swi immediate.
.equ OS_Write0,                0x02
.equ OS_NewLine,               0x03
.equ OS_Byte,                  0x06
.equ OS_Exit,                  0x11
.equ OS_ReadEscapeState,       0x2c
.equ OS_ReadVduVariables,      0x31
.equ OS_RemoveCursors,         0x36
.equ OS_ReadMonotonicTime,     0x42
.equ OS_ScreenMode,            0x65
.equ OS_CallASWI,              0x6f
.equ OS_ConvertInteger4,       0xdc
// Larger than 0xff: has to be passed in a register to OS_CallASWI instead.
.equ VFPSupport_CreateContext, 0x58ec1

.thumb
start:
//---get vfp support context--------------------------------------
// VFPSupport_CreateContext does not fit in the 8-bit immediate of a Thumb
// swi, so it is issued through OS_CallASWI with the SWI number in r10.
// NOTE(review): r0 looks like a flags word and r1 like the number of
// D registers requested - confirm against the VFPSupport documentation.
  movs r1,#32
  lsls r0,r1,#26    // r0 = 32<<26 = 0x80000000 (reuses r1 to save bytes)
  adds r0,r0,#3     // r0 = 0x80000003
  movs r2,#0        // r2 = 0; the mvns further down relies on it still being 0
  movw r10,#(VFPSupport_CreateContext & 0xffff)  // low  half of the SWI number
  movt r10,#(VFPSupport_CreateContext >> 16)     // high half of the SWI number
  swi OS_CallASWI   // needed because the swi number is > 0xff

//---init screen and get screen start address ---------------------
// The 8 bytes of the mode string are recycled as the input block for
// OS_ReadVduVariables once the mode has been selected.
  movs r0,#15       // OS_ScreenMode reason code 15, r1 -> mode string
  adr.n r1,mode
  swi OS_ScreenMode
  mvns r3,r2        // r3 = -1 (list terminator); assumes r2 is still 0,
                    // i.e. both SWIs above left r2 untouched
  movs r2,#148      // VDU variable number 148
  movs r0,r1        // r0 -> input block (overwrites the mode string)
  stmia r1!,{r2,r3} // store {148,-1} at mode; r1 = mode+8 -> output block
  swi OS_ReadVduVariables // result word is written to [r1]; the main loop
                          // loads the screen address from there
  swi OS_RemoveCursors    // remove cursor

//---init constants for colouring
// s28..s30 are lanes 0..2 of q7, the per-channel multipliers of the main
// loop. Lane 3 (s31) is not initialised here; it only affects the fourth
// byte of every pixel word.
  vmov.f32 s28,#4.0
  vmov.f32 s29,#8.0
  vmov.f32 s30,#2.0

//--- main intro loop --------------------------------------------
// Renders one 800x600 frame of 32-bit pixels per iteration and repeats
// until the OS reports an escape condition.
//
// Register roles inside the loop:
//   r0  = write pointer into screen memory (post-incremented per pixel)
//   r1  = pointer to the word holding the screen start address (set up
//         by the init code, not modified here)
//   r2  = row counter, 600 down to 1      r6 = y = r2-300 (300 .. -299)
//   r3  = column counter, 800 down to 1   r7 = x = r3-400 (400 .. -399)
//   d31 = timer value as float in both lanes
//   q7  = per-channel multipliers (s28..s31, loaded by the init code)
//
// Per pixel, with q = (x,y)/256:
//   p   = q/dot(q,q) + t
//   f   = fract(p * k)^2 per channel, for p.x and for p.y
//   col = max(f_x, f_y) * dot(q,q) * 256, saturated to 8 bits per channel
  mainloop:

  swi OS_ReadMonotonicTime    //r0 = timer value
  vmov d31,r0,r0              //both lanes of d31 = timer
  vcvt.f32.u32 d31,d31,#10    //t = timer/1024 as float; the fraction-bit count sets the animation speed
  ldr r0,[r1]                 //r0 = screen start address

  movs r2,#600                //rows left
  y_loop:
    movs r3,#800              //pixels left in this row
    subs r6,r2,#300           //y, centred around 0
    x_loop:
       subs r7,r3,#400            //x, centred around 0
       vmov          d0,r7,r6	  //x                |y
       vcvt.f32.s32  d0,d0,#8  	  //q.x=float(x)/256 |q.y=float(y)/256  (fixed point, 8 fraction bits)
       vmul.f32      d1,d0,d0     //q.x*q.x          |q.y*q.y
       vpadd.f32     d2,d1,d1     //q.x*q.x+q.y*q.y  |q.x*q.x+q.y*q.y      //dot(q,q), also needed for the final scaling
       vrecpe.f32    d3,d2        //~1.0/dot(q,q)    |~1.0/dot(q,q)        //initial reciprocal estimate
       vrecps.f32    d4,d2,d3     //2.0 - dot*estimate (Newton-Raphson correction factor)
       vmul.f32      d3,d3,d4     //refined 1.0/dot(q,q)
       vmul.f32      d3,d0,d3     //p.x=q.x/dot(q,q) |p.y=q.y/dot(q,q)     //a scalar multiply would also work here
       vadd.f32      d3,d3,d31    //p.x+=timer       |p.y+=timer
       vdup.32       q2,d3[0]     //p.x              |p.x |p.x |p.x
       vdup.32       q3,d3[1]     //p.y              |p.y |p.y |p.y
       vmul.f32      q2,q2,q7     //cx = p.x * rgb.colours //the following x/y pairs could be looped to save some bytes but that costs 40% speed
       vmul.f32      q3,q3,q7     //cy = p.y * rgb.colours
       vcvt.s32.f32  q4,q2        //int(cx), rounds toward zero
       vcvt.s32.f32  q5,q3        //int(cy), rounds toward zero
       vcvt.f32.s32  q4,q4        //float(int(cx))
       vcvt.f32.s32  q5,q5        //float(int(cy))
       vsub.f32      q2,q2,q4     //fract(cx), negative for negative cx
       vsub.f32      q3,q3,q5     //fract(cy), negative for negative cy
       vmul.f32      q2,q2,q2     //fract(cx)*fract(cx), never negative
       vmul.f32      q3,q3,q3     //fract(cy)*fract(cy), never negative
       //disabled experiment: replace dot(q,q) in d2 before the final scaling
       //vrsqrte.f32   d2,d2
       //vrecpe.f32    d2,d2
       vmax.f32      q2,q2,q3     //per channel max(fx,fy)...or min...
       vmul.f32      q2,q2,d2[0]  //*dot(q,q)
       vcvt.u32.f32  q2,q2,#8     //int(col*256), unsigned fixed point with 8 fraction bits
       vmovn.u32     d0,q2        //32Bit->16Bit per channel, keeps the low halves (no saturation)
       vqmovn.u16    d0,q0        //16Bit->8Bit per channel with saturation; q0=d0:d1, only the low 4 bytes of the result are used
       vst1.32       {d0[0]},[r0]! //store one 4-byte pixel, r0 += 4
      subs r3,r3,#1
    bne x_loop                //next pixel of this row
    subs r2,r2,#1
  bne y_loop                  //next row

//--- escape test and exit ---------------------------------------
  swi OS_ReadEscapeState // check for ESC, result is returned in the carry flag
  bcc mainloop           // carry clear: no escape condition, draw the next frame
  swi OS_Exit            // exit to OS

// Mode string handed to OS_ScreenMode. It is word-aligned because the
// start-up code reaches it with a narrow adr and later overwrites it with
// two words ({148,-1}) as the input block for OS_ReadVduVariables.
// The string must stay exactly 8 bytes long (7 characters + NUL): the
// word read back from the OS is stored right behind it, at mode+8.
.p2align 2
mode:
.asciz "32 C16M"         // 800x600 True colour
string_buffer:           // = mode+8, first byte after the mode string

