//-------------------------------------------------------------------------
//Codecraft#4 invitro 2021 by Kuemmel for Revision 2021
//-------------------------------------------------------------------------
//Additional credits:
//- music tracked by Saga Musix
//- texture generation based on an 256 colour algo by Baze/3SC for DOS
//- texture generation size optimization & rotozoom origin by Exoticorn
//-------------------------------------------------------------------------
//further options for saving bytes:
// - plot text and screen buffer:  - 6 Bytes for ldm/stm/uquadd8 solution
// - if resolution string is kept: - 4 Bytes for loading screen address ldm
// - skip app memory init:         - 2...16 Bytes
// - sound handler movs r2,#0      - 2 Bytes if moved before texture gen
//-------------------------------------------------------------------------
.syntax unified
.thumb

//---variables-------------------------------------------------------------
//Assembly-time constants. x_res*y_res is also emitted as the screen_size
//word in the data section, and x_res as the screen_x word.
x_res=800 //screen width in pixels; 720 for video capture (adjust also sphere_size and mode string)
y_res=600 //screen height in pixels; 576 for video capture (adjust also sphere_size and mode string)
font_size=48           //font size (used for Font_SetFontMax and, *16, for Font_FindFont)
action_change_time=594 //timer ticks (OS_ReadMonotonicTime) between effect changes,
                       //based on the music: 128 notes at 44100 Hz

//---OS routines-----------------------------------------------------------
//SWI numbers used by the intro, sorted by value. Numbers that do not fit
//into the 8-bit swi immediate are issued through OS_CallASWI with the
//SWI number held in r10.
.equ OS_Byte,                       0x06
.equ OS_Exit,                       0x11
.equ OS_ChangeDynamicArea,          0x2a
.equ OS_ReadEscapeState,            0x2c
.equ OS_ReadVduVariables,           0x31
.equ OS_RemoveCursors,              0x36
.equ OS_ReadMonotonicTime,          0x42
.equ OS_ScreenMode,                 0x65
.equ OS_CallASWI,                   0x6f
.equ Font_FindFont,              0x40081
.equ Font_Paint,                 0x40086
.equ Font_SetFontMax,            0x4009b
.equ Font_ReadFontMax,           0x4009c
.equ Wimp_SlotSize,              0x400ec
.equ VFPSupport_CreateContext,   0x58ec1
.equ XSharedSound_InstallHandler,0x6b440
.equ XSharedSound_RemoveHandler, 0x6b441
.equ XSharedSound_SampleRate,    0x6b446

//---get vfp support context-----------------------------------------------
//Program entry. Builds all constants from r1=32 by shifting to keep every
//instruction 16 bit where possible. r7 is prepared here already for the
//Wimp_SlotSize call in the next section.
  movs r1,#32        //seed value for the shifts below; also handed to the SWI in r1
                     //(presumably the number of VFP registers - confirm against VFPSupport docs)
  lsls r7,r1,#13     //r7=0x00040000 for reserve application memory
  lsls r0,r1,#26     //reuse r1: r0=0x80000000
  adds r0,r0,#3      //r0=0x80000003
  movs r2,#0
  movw r10,#0x8ec1
  movt r10,#0x0005   //r10=0x58ec1 "VFPSupport_CreateContext"
  swi OS_CallASWI    //needed due to swi number >0xff

//---reserve application memory--------------------------------------------
//r7 (0x40000) is turned into the size in r0 and then into the SWI number.
  lsls r0,r7,#5      //r0=0x800000 => get 8 MByte of application memory
  //movs r1,r0         //...clears next slot...but I don't care :-)
  adds r7,r7,#0xec   //r7==0x400ec
  mov  r10,r7        //"Wimp_SlotSize"; r10 is decremented step by step for the font SWIs later
  swi OS_CallASWI

//---reserve screen memory (...not needed usually...)----------------------
//  movs r0,#2                //screen area
//  movs r1,#(8<<20)          //8 MByte...safe for triple buffering
//  swi OS_ChangeDynamicArea

//---init screen and get screen start address------------------------------
//Selects the screen mode from the string at "mode", then reuses the same
//memory as parameter block for OS_ReadVduVariables. Relies on r1 and r2
//being unchanged after OS_ScreenMode.
  movs r0,#15              //OS_ScreenMode reason code used together with the mode string in r1
  adr.n r1,mode
  movs r2,#0
  swi OS_ScreenMode
  mvns r4,r2               //r4 = NOT 0 = -1, list terminator
  movs r2,#148             //VDU variable number 148 (the screen start address, see below)
  movs r0,r1               //input block = start of mode string (read = write area)
  stmia r1!,{r2,r4}        //overwrite mode string with {148,-1}; r1 now = mode+8
  swi OS_ReadVduVariables  //screen address is later read from screen_address+8
  swi OS_RemoveCursors     //remove cursor

//---set maximum antialiasing font for font size >36 (OS default)----------
//r10 still holds Wimp_SlotSize (0x400ec) from the memory section; each font SWI
//number is reached by adding/subtracting the difference to the previous one.
//The current settings are read and written back with only r2/r3 changed.
  sub r10,r10,#(Wimp_SlotSize-Font_ReadFontMax)   //"Font_ReadFontMax"
  swi OS_CallASWI
  movs r2,#font_size       //Change FontMax2 to get antialiasing
  movs r3,#font_size       //Change FontMax3 to get antialiasing
  sub r10,r10,#(Font_ReadFontMax-Font_SetFontMax) //"Font_SetFontMax"
  swi OS_CallASWI

//---get font in chosen size-----------------------------------------------
//Relies on r2/r3 still holding font_size after Font_SetFontMax.
  adr.n r1,font_name       //name is terminated by the zero word at timer_save
  lsls r2,r2,#4            //x_size = font_size*16
  lsls r3,r3,#4            //y_size = font_size*16
  movs r4,#0               //default dpi
  movs r5,#0               //default dpi
  sub r10,r10,#(Font_SetFontMax-Font_FindFont) //"Font_FindFont"
  swi OS_CallASWI          //=> font handler will be in r0

  mov r11,r4     //r11 init (expects r4 still 0) for rotozoom, r11 exclusive for rotozoom

//---plot 4 strings on screen----------------------------------------------
//Paints the four 19 byte strings stored at text_address, last one first
//(offsets 57,38,19,0), each (y_res>>1) units lower than the one before.
//Expects r0 = font handle from Font_FindFont and r10 = Font_FindFont.
//The string at offset 57 starts with the colour control sequence.
  add  r10,r10,#(Font_Paint-Font_FindFont) //"Font_Paint"
  movs r2,#0b10000          //flags  ...attention...affects inversion algo
  movs r3,#64               //text_x
  movs r6,#19*3             //text address offset init (= last string)
  lsls r4,r2,#6             //text_y = flags<<6 = 1024
  plot_strings_loop:
     adr.n r1,text_address  //text address
     adds r1,r1,r6          //+ offset of current string
     swi OS_CallASWI        //Font_Paint
     sub  r4,r4,#(y_res>>1) //y_offset due to 600*2/4
     subs r6,r6,#19         //previous string; negative after offset 0 was plotted
  bpl plot_strings_loop

//---copy screen with text to text_buffer and clear texture buffer --------
//Copies the freshly painted screen word by word into the text buffer and
//zeroes the texture buffer bytewise in the same loop. The loop runs
//screen_size+1 times (counter screen_size...0) and leaves r4=-1, r3=0 and
//r7=texture_buffer_address for the texture generation that follows.
  adr.n r4,screen_size      //get r4=screen_size,r5=text_buffer_address,
  ldmia r4,{r4,r5,r6,r7}    //r6=screen_buffer_address, r7=texture_buffer_address
  mov r13,r5                //used also as a stack at 0x9000 downwards
  ldr.n r0,screen_address+8 //+8 due to saving bytes at screen init
  movs  r3,#0               //init also for rgb/texture_loop
  copy_and_clear_loop:
     ldm  r0!,{r1}        //load from screen
     stm  r5!,{r1}        //save in text buffer
     strb r3,[r7,r4]      //clear texture_buffer, size enough (800*600>256*256*4)
     subs r4,r4,#1
  bpl copy_and_clear_loop //=> r4 will be -1 = 0xffffffff

//--- texture generation 256x256x32Bit seamless ---------------------------
  //needs r4 = -1 = 0xffffffff, r3 = 0, r7 = texture_buffer_address
  //Outer loop: one pass per colour byte (byte offsets 2,1,0 within each texel).
  //Inner loop: texel index r2 runs 0xffff...0; every value is the average of the
  //running value r3 and an already written texel, plus a small random term.
  //The same byte is stored to index r2 and to r2 EOR 0xff00.
  //The random state r1 and the running value r3 carry over between passes.
  movs r0,#2           //rgb counter for rgb_loop
  uxth r1,r4           //defined seed...0xffff
  mov r12,r1           //r12 init for rotozoom, r12 exclusive for rotozoom
  rgb_loop:
     adds  r5,r7,r0    //r5 = texture base + colour byte offset
     uxth  r2,r4       //loop counter 0xffff
     texture_loop:
        rsb  r1,r1,r1,ror#23 //random generator: r1 = (r1 ror 23) - r1
        add  r6,r2,#255      //try 255,254,253,128,127
        ldrb r6,[r5,r6,lsl#2] //neighbour texel; may index past 0xffff into the cleared area
        adds r3,r3,r6
        sub  r3,r3,r1,asr#27 //subtract signed random value (top 5 bits of r1)
        lsrs r3,r3,#1        //average
        strb r3,[r5,r2,lsl#2]
        eor  r6,r2,#0xff00   //same x, y byte inverted
        strb r3,[r5,r6,lsl#2]
        subs r2,r2,#1
     bpl texture_loop
     subs r0,r0,#1
  bpl rgb_loop

//----set up shared sound interrupt handler to start the sound-------------
//Installs soundcode as SharedSound handler, pushes the returned handler
//number for the exit code and sets the sample rate. Leaves r10 =
//XSharedSound_RemoveHandler for the rest of the program.
//The zero word written to soundhandler_title+4 is at the same time the low
//word of the 64 bit sound timer (soundtimer+4).
//NOTE(review): the high timer word at soundtimer+8 is not written anywhere
//in the visible code before soundcode reads it - confirm it can be relied
//on to be zero.
  adr.w r0,soundcode+1   //+1 as interrupt code address in thumb state also
  movs  r2,#0            //immediate handler
  adr.n r3,soundhandler_title+4
  str   r2,[r3]          //dummy title string
  movw  r10,#0xb440
  movt  r10,#0x6         //install XSharedSound_InstallHandler
  swi   OS_CallASWI
  push  {r0}             //backup handler number for remove handle at exit
  ldr.n r1,sample_freq   //get sample frequency Hz * 1024, could be hardcoded
  add   r10,r10,#6       //XSharedSound_SampleRate
  swi   OS_CallASWI
  sub   r10,r10,#5       //r10 for XSharedSound_RemoveHandler on exit
                         //don't use r10 anywhere else without backup

//-------------------------------------------------------------------------
//---main intro loop-------------------------------------------------------
//-------------------------------------------------------------------------
//One pass = one frame: pick the effect by timer, render it into the screen
//buffer, then plot_text copies the buffer (plus text) to the screen.
mainloop:

//---part control by timer, setup and effect choice------------------------
//Loads the six consecutive words starting at timer_save, advances
//action_status (0...3) whenever more than action_change_time ticks have
//passed and branches to the matching effect.
//Every effect is entered with r0=timer, r4=y_res, r5=screen_buffer_address,
//r6=texture_buffer_address.
  adr.n r7,timer_save          //need adr as there's no str rx,label ...
  ldm r7,{r1,r2,r3,r4,r5,r6}   //r1=timer_save r2=action_status...
                               //r3=screen_size, r4=text_buffer
                               //r5=screen_buffer_address, r6 texture_buffer_address
  movw r4,#action_change_time  //r4 (text_buffer) is not needed here, reuse it
  swi OS_ReadMonotonicTime
  subs r1,r0,r1                //r0 is timer
  cmp  r1,r4                   //=> time since last action change...
  ittt hi
     addhi r2,r2,#1
     andhi r2,r2,#0b11         //0...3 => 4 different actions
     stmhi r7!,{r0,r2}         //update action_status and timer_save if time passed

  movw r4,#y_res               //...as used only once for all routines
  cmp r2,#2                    //check action_status and choose effect
  bhi planes_effect            // = 3
  beq rotozoom_effect          // = 2
  cbz r2,sphere_effect         // = 0 range of label +4...+130 Bytes... free cmp+branch :-)
                               //if = 1 ...next effect following here will be...

//---***texture layer scroll*** effect-------------------------------------
//Reached by fall through when action_status = 1. For every pixel three
//texture samples (layer shift 2,1,0) are summed per colour byte, a value
//derived from x*y is subtracted with saturation and the result is halved.
//x (r1) and y (r4) count downwards while r5 walks upwards through the buffer.
layer_effect: // 76 Bytes
  //requires r0=timer, r4=screen_y,
  //         r5=screen_buffer_address, r6=texture_buffer_address
  //uses neon d0...d3

  lsrs r0,r0,#2        //adjust timer
  layer_y_loop:
     ldr.n r1,screen_x
     layer_x_loop:
       movs r7,#2     //shift/amount of layers
       veor q1,q1     //init sum
       layer_loop:
         lsls r3,r0,r7         //timer << layer shift
         adds r2,r3,r4         //ty = y + timer << layer shift
         adds r3,r3,r1         //tx = x + timer << layer shift
         lsrs r2,r2,r7         //stretch layer
         lsrs r3,r3,r7         //stretch layer
         bfi  r3,r2,#8,#8      //[byte]ty[byte]tx
         uxth r3,r3            //[word]tytx
         add  r3,r6,r3,lsl#2   //calc texture address
         vld1.32 {d0[0]},[r3]  //load RGB word
         vaddw.u8 q1,q1,d0     //add wide to sum (only the d2 half is used afterwards)
         subs r7,r7,#1
       bpl layer_loop
       mul r3,r4,r1             //distance to upper left edge
       lsrs r3,r3,#10           //adjust
       vdup.16  d0,r3
       vqsub.u16 d0,d2,d0       //saturated sub to sum of layer, or try add...
       vqshrn.u16 d0,q0,#1      //adjust: halve and narrow back to bytes
       vst1.32 {d0[0]},[r5]!    //plot one pixel
       subs r1,r1,#1
     bne layer_x_loop
     subs r4,r4,#1
  bne layer_y_loop
  b plot_text                   //no vsync wait for this effect

//---***sphere with texture*** effect-------------------------------------
//Selected when action_status = 0. Per pixel: centre the coordinates, derive
//t=ABS(1-(x*x+y*y)*sphere_size), use reciprocal square root estimates of t
//to scale the texture coordinates and to shade the fetched texel.
sphere_effect: // 130 Bytes
  //requires r0=timer, r4=screen_y,
  //         r5=screen_buffer_address, r6=texture_buffer_address
  //uses neon d0...d8

  vdup.i32 d6,r0                // timer| timer
  vldr s16,sphere_size          //     f (s16 is the low half of d8)
  sphere_y_loop:
     ldr.n r1,screen_x
     sphere_x_loop:
      subs r2,r1,#(x_res>>1)    //center x
      subs r3,r4,#(y_res>>1)    //center y, could be hoisted out of the x loop...
      vmov         d0,r2,r3     //            x|            y
      vmov.f32     d7,#1.0      //          1.0|          1.0 no speed penalty
      vcvt.f32.s32 d0,d0        //     FLOAT(x)|     FLOAT(y)
      vmul.f32     d1,d0,d0     //        ux*ux|        uy*uy
      vpadd.f32    d2,d1,d1     //  ux*ux+uy*uy|  ux*ux+uy*uy
      vmls.f32     d7,d2,d8[0]  //      1-sum*f|      1-sum*f
      vabs.f32     d7,d7        //   t=ABS(1-t)|   t=ABS(1-t)
      vrsqrte.f32  d3,d7        //estimate of 1/SQR(t)
      vmul.f32     d4,d7,d3     //t*estimate ~ SQR(t), backup later for colouring
      vmov.f32     d5,#2.0      //          2.0|          2.0 no speed penalty
      vrsqrte.f32  d3,d4        //estimate of 1/SQR(d4)
      vfms.f32     d5,d4,d3     //z = 2 - d4*d3
      vmul.f32     d0,d0,d5[0]  //          x*z|          y*z
      vcvt.s32.f32 d0,d0        //       INT(x)|       INT(y)
      vshr.u32     d0,d0,#1     //texture zoom adjust...looks better
      vadd.u32     d0,d0,d6     //add movement based on timer
      vmov r2,r3,d0
      bfi  r3,r2,#8,#8          //[byte]ty[byte]tx
      uxth r3,r3                //[word]tytx
      add  r3,r6,r3,lsl#2       //calc texture address
      vld1.32      {d0[0]},[r3] //load RGB word
      vcvt.u32.f32 d4,d4,#14    //colouring based on one SQR(t) only, looks kind of better
      vshll.u8     q0,d0,#1     //widen colour bytes to 16 bit, times 2
      vqdmulh.s16  d0,d0,d4[0]  //d0,d0,d4 looks also interesting
      vqmovn.u16   d0,q0        //saturate back to bytes
      vst1.32      {d0[0]},[r5]! //plot one pixel
      subs r1,r1,#1
    bne sphere_x_loop
    subs r4,r4,#1
  bne sphere_y_loop
  b plot_text                   //no vsync wait for this effect

//---***planes*** effect---------------------------------------------------
//Selected when action_status = 3. Texture coordinates are a constant and the
//scaled y, both divided by d=x-y (centred coordinates); ABS(d)>>2 is used as
//brightness factor for the fetched texel.
//NOTE(review): d is 0 where x equals y; the result of sdiv then depends on
//the CPU's divide-by-zero behaviour - confirm for the target.
planes_effect: // 76 Bytes
  //requires r0=timer, r4=screen_y,
  //         r5=screen_buffer_address, r6=texture_buffer_address
  //uses neon d0...d2

  //adjust next number by different values like 400 (default),50,
  mov   r8,#50<<9                //adjust texture size and speed

  planes_y_loop:
    ldr.n r1,screen_x
    planes_x_loop:
      sub  r2,r1,#(x_res>>1)   //x% =x%-400
      sub  r3,r4,#(y_res>>1)   //y% =y%-300
      subs r7,r2,r3            //d% =x%-y%
      //eor  r7,r7,r7,asr#31   //cheap branchless ABS(d) ...if put here: no counterflow
      lsls r3,r3,#9            //yc%=y%<<9    //adjust scale here for texture size
      sdiv r2,r8,r7            //xc%/d%
      sdiv r3,r3,r7            //yc%/d%
      adds r2,r2,r0            //tx%+timer
      adds r3,r3,r0            //ty%+timer
      bfi  r3,r2,#8,#8         //[byte]ty[byte]tx
      uxth r3,r3               //[word]tytx
      eor  r7,r7,r7,asr#31     //cheap branchless ABS(d) ...if put here: counterflow
      add  r3,r6,r3,lsl#2      //calc texture address
      lsrs r7,r7,#2            //limit ABS(d) to 8 Bit...otherwise 16Bit are needed later
      vld1.32    {d0[0]},[r3]  //load RGB word
      vdup.u8    d2,r7         //load on all lanes
      vmull.u8   q0,d0,d2      //long multiply
      vqshrn.u16 d0,q0,#6      //right shift saturated narrow ...change shift for brightness +/-
      vst1.32    {d0[0]},[r5]! //plot one pixel
      subs r1,r1,#1
    bne planes_x_loop
    subs r4,r4,#1
  bne planes_y_loop
  b plot_text                  //no vsync wait for this effect

//---***rotozoom*** effect-------------------------------------------------
//Selected when action_status = 2. r11/r12 are a persistent pair that is
//advanced a little every frame (initialised to 0 and 0xffff during setup);
//they are saved on the stack because both are reused inside the loops.
//r3/r4 are the per pixel/per line texture steps, r7/r8 the running texture
//coordinates. Unlike the other effects this one falls through into the
//vsync wait before plot_text.
rotozoom_effect: // 106 Bytes
  //requires r4=screen_y,
  //         r5=screen_buffer_address, r6=texture_buffer_address
  mov   r9,r6             //texture buffer address ...saves bytes overall
  add  r12,r12,r11,asr#7  //default is 10, lower is faster
  sub  r11,r11,r12,asr#7  //default is 10, lower is faster
  push {r11,r12}          //backup r11,r12
  asrs r0,r12,#5
  mul  r3,r11,r0
  mov  r11,r4             //y
  mul  r4,r12,r0
  lsls r1,r3,#9           //default is 9
  mvn  r2,r4,lsl#9        //default is 9
  rotozoom_y_loop:
    mov   r7,r1           //line start coordinates
    mov   r8,r2
    adds  r1,r1,r3        //or subs plus subs below
    adds  r2,r2,r4
    ldr.n r6,screen_x     //x counter
    rotozoom_x_loop:
      mov    r0,r8,lsr#24         //default is 24
      lsls   r0,r0,#8
      orr    r0,r0,r7,lsr#24      //default is 24
      ldr    r12,[r9,r0,lsl#2]    //load texel
      uxth   r12,r12              //some colour adjust to vary texture to other effects
      lsrs   r0,r6,#2             //also interesting mov r14,r4,lsr#20
      lsls   r0,r0,#16
      uqadd8 r12,r12,r0           //add some blue dependant on x
      lsrs   r0,r4,#21            //add rotozoom factor => kind of depth effect
      uqadd8 r12,r12,r0
      lsls   r0,r0,#16
      uqadd8 r0,r0,r12
      stmia  r5!,{r0}             //plot one pixel
      subs   r7,r7,r4
      add    r8,r8,r3             //or subs, but takes 2 Bytes more
      subs   r6,r6,#1
    bne rotozoom_x_loop
    subs r11,r11,#1
  bne rotozoom_y_loop
  pop {r11,r12}           //restore r11,r12

//---wait for vsync--------------------------------------------------------
//Only reached from the rotozoom effect; the other effects branch to
//plot_text directly.
  movs r0,#19
  swi OS_Byte

//---add text on screen buffer and plot screen dependant on action status--
//...could be done with 6 Bytes less with ldmia/ldmia/uqadd8/stmia but probably slower...
//Copies the screen buffer to the screen, 16 bytes (4 pixels) per iteration.
//While the pixel counter r5 is above 0x58000 (roughly the first quarter of the
//frame) the text buffer is added with saturation. The text buffer read pointer
//starts at text_buffer_address + action_status*screen_size bytes, i.e. one
//quarter of the saved screen per action, so each effect shows another string.
plot_text:
  adr.n  r4,action_status   //0...3
  ldmia  r4,{r4,r5,r6,r7}   //get action_status,screen_size,text_buffer_address,screen_buffer_address
  mla    r6,r4,r5,r6        //adjust start address dependant on action_status
  ldr.n r3,screen_address+8 //+8 due to saving bytes at screen init
  plot_text_loop:
     vld1.32 {d0,d1},[r7]!       //4 pixels from screen buffer
     cmp r5,#0x58000             //almost 1/4 of screen, fits to 800*600
     itt hi
        vldmiahi.32 r6!,{d2,d3}  //conditional load of text
        vqaddhi.u8  q0,q0,q1     //saturated bytewise add of 16 bytes
     vst1.32 {d0,d1},[r3]!       //4 pixels to screen
     subs r5,r5,#4               //4 pixels done
  bne plot_text_loop

//---escape test, remove sound handler and exit----------------------------
//Loops back to mainloop while the carry flag returned by OS_ReadEscapeState
//is clear. Otherwise the handler number pushed during sound setup is popped
//and the handler removed; r10 still holds XSharedSound_RemoveHandler.
  swi OS_ReadEscapeState //check for ESC
  bcc mainloop
  pop {r0}               //restore handler number
  swi OS_CallASWI        //Remove XSharedSound handler
  swi OS_Exit            //exit to OS

//---interrupt routine/sound generation -----------------------------------
// r1 -> base of buffer, r2 -> end of buffer, r6 = 8.24 fractional step
//Fills the buffer from r1 up to r2 with one 32 bit word per sample (same
//16 bit value in both halves). The song position t is taken from a 64 bit
//timer kept at soundtimer+4/+8, which advances by r6>>8 per sample; t is
//bits 16...47 of that timer. All used registers are saved and restored.
soundcode:
  push {r0-r9,LR}
  adr.n r0,soundtimer+4
  ldmia r0,{r0,r3}            //get 64 bit timer: r0 = low word, r3 = high word
  mov r9,r6                   //to use r6 in soundloop...saves like 2 Bytes overall

  soundloop:                  //r0,r3 are timers,r4 address, free to use r5,r7
    lsrs r5,r0,#16            //adjust timer t for bytebeat
    orr  r5,r5,r3,lsl#16      //insert upper timer

    //---lead melody voices
    adr.n r4,lead_notes       //address for song data
    lsls r7,r5,#13            //prepare notes choice (128 notes, 64 bytes) with next lsrs, saves 2 bytes over AND
    lsrs r7,r7,#26            //one additional shift to trigger nibble choice: r7 = t bits 13..18, carry = t bit 12
    ldrb r7,[r4,r7]           //get one byte = two notes
    ite cc
      andcc r7,r7,#15         //get low nibble
      lsrcs r7,r7,#4          //or high nibble dependant on shift before

    adr.n r4,lead_frequencies //get address for lead frequencies
    ldrb r7,[r4,r7]           //get frequency multiplier byte
    muls r7,r5,r7             //t*frequency; Z set if the product is 0 (e.g. pause, multiplier 0)
    movw r6,#8191             //sawtooth 0b1111111111111
    itttt ne
       addne  r4,r7,r5,lsl#1  //second lead voice, phaser effect (t*frequency + t*2)
       andne  r7,r7,r6        //sawtooth lead voice 1
       andne  r4,r4,r6        //sawtooth lead voice 2
       addne  r7,r7,r4        //combine, orr also sounds interesting

    //---bass melody voice
    adr.n r4,bass_frequencies //get address for bass frequencies
    lsls  r6,r5,#14           //prepare timer position for note bass change with next lsrs
    lsrs  r6,r6,#30           //saves 2 bytes compared to using AND to get the 4 notes (t bits 16..17)
    ldrb  r6,[r4,r6]          //get frequency multiplier byte
    lsrs  r4,r5,#14           //note position trigger: carry = t bit 13
    it cs
      lsrcs r6,r6,#1          //switch low and high note (halve the multiplier)
    muls  r6,r5,r6            //t*frequency
    ands  r6,r6,#(256<<6)     //rectangle bass voice (single bit on/off)
    lsrs  r6,r6,#2            //lower volume...

    //---combine
    adds  r7,r7,r6            //lead and bass voices

    //---fade out to minimize some buzz and shorten all notes
    and r4,r5,#0b111111110000 //get timer position
    cmp r4,   #0b110000000000
    ittt hi
      lslhi r4,r4,#(32-6-4)
      lsrhi r4,r4,#28         //saves over AND: r4 = t bits 6..9
      lsrhi r7,r7,r4          //shift sound byte

    bfi  r7,r7,#16,#16        //left = right channel stereo
    stm  r1!,{r7}             //store sound word

    adds r0,r0,r9,lsr#8       //inc low  word 64Bit timer by adjusted fractional step
    adc  r3,r3,#0             //inc high word 64Bit timer if overflow
    cmp  r1,r2                //check if buffer filled
  bne soundloop
  adr.n r7,soundtimer+4
  stmia r7!,{r0,r3}           //save 64 bit timer
  pop {r0-r9,PC}

//---data stuff------------------------------------------------------------
//Layout matters: several ldm/ldmia instructions load runs of consecutive
//words (timer_save...texture_buffer_address), the texts are addressed as
//text_address + n*19, and the memory from "mode" onwards is reused at
//run time. Do not reorder or resize without checking the code.
.align 2
screen_x:
.word x_res             //as used may be 4 times in code...
text_address:                 //4*19 (text backwards) + 8 (colour code) = 84 Bytes
.string "Join us in 2021 ! "  //needs to be equal size
.string "...we are back !!!"  //needs to be equal size
.string "After 20 years... "  //needs to be equal size
.byte 19,0,0,0,128,128,128,14 //Colour Trans code,bg.r,bg.g,bg.b,fg.r,fg.g,fg.b,antialias
                              //...it's interesting to play with the colours...
.string "-= CODECRAFT #4 =-"  //only at last (=first) string to save bytes
font_name:
.ascii "Exo.Bold"       //change position later may be as 3 Bytes are lost...
timer_save:
.word 0                 //timer save init, also as string termination to font_name
action_status:
.word 3                 //byte could be used but doesn't make sense due to ldmia's
screen_size:
.word x_res*y_res       //screen size...could be read by VDU above may be
text_buffer_address:
.word 0x00009000        //hardcoded above code, also used as decrementing stack base address
screen_buffer_address:
.word 0x00390000        //hardcoded above code 0x9000 + max 1280*720*4     ...single byte in word
texture_buffer_address: //size 256*256*4
.word 0x00720000        //hardcoded above code 0x9000 + max 1280*720*4 * 2 ...single byte in word
sphere_size:
.single 0.00002         //sphere effect size factor, use 0.0000192
sample_freq:
.word 44100*1024
lead_notes:             //128 notes and pauses ...4Bit encoding
.byte  1+( 0<<4), 3+( 0<<4), 5+( 0<<4), 0+( 1<<4), 0+( 0<<4), 3+( 0<<4), 5+( 0<<4), 0+( 0<<4)
.byte  2+( 0<<4), 3+( 0<<4), 5+( 0<<4), 0+( 2<<4), 0+( 0<<4), 3+( 0<<4), 5+( 0<<4), 0+( 0<<4)
.byte  5+( 0<<4), 7+( 0<<4), 2+( 0<<4), 0+( 5<<4), 0+( 0<<4), 7+( 0<<4), 2+( 0<<4), 0+( 0<<4)
.byte  4+( 0<<4), 6+( 0<<4), 1+( 0<<4), 0+( 4<<4), 0+( 0<<4), 6+( 0<<4), 5+( 0<<4), 2+( 0<<4)
.byte  8+( 0<<4), 7+( 0<<4), 5+( 0<<4), 0+( 4<<4), 0+( 0<<4), 5+( 0<<4), 5+( 0<<4), 0+( 0<<4)
.byte  5+( 0<<4), 7+( 0<<4), 5+( 0<<4), 0+( 4<<4), 0+( 0<<4), 5+( 0<<4), 5+( 0<<4), 0+( 0<<4)
.byte 10+( 0<<4), 9+( 0<<4), 8+( 0<<4), 0+( 8<<4), 0+( 0<<4), 7+( 0<<4), 5+( 0<<4), 0+( 0<<4)
.byte  8+( 0<<4), 7+( 0<<4), 6+( 0<<4), 0+( 7<<4), 0+( 0<<4), 6+( 0<<4), 4+( 0<<4), 5+( 0<<4)
lead_frequencies:       //12 multipliers for frequencies based on 44100 Hz/128 (tone)
.byte 0,77,87,92,103,116,130,138,154,173,184,194 //index 0 = pause (multiplier 0)
bass_frequencies:
.byte 77,92,116,103     //4 multipliers for frequencies
mode:
screen_address:         // space here will be overwritten by screen init (12 Bytes):
                        // +0/+4 = VDU variable request list, +8 = returned screen address
.string "31 C16M"       // 800x600 true colour or "X720 Y576 C16M"
.align 2
soundhandler_title:     // add +4 to that to be safe (+0 is screen_address+8, the screen address)
soundtimer:             // 64 bit sound timer lives at +4 (low word) and +8 (high word), beyond the end of the file
