;
; Hugi size coding Compo 28 Entry (final submission)
; --------------------------------------------------------------
;
; by Tapani Utriainen (once known as nadir / RAGE ... back in the demo days)
;
; Final version. There could still be some 20 bytes to trim down.
;
; Some tricks used here to get the size down:
; - PCI scan routine using self-modifying code, base port is read by same code
; - Use of outsb/outsw for communicating with ports
; - Clever memory layout: each TD entry is 32 bytes, but with 16 last bytes
;   unused (available for application). My TD table is "overlapping", i.e.
;   the last 16 bytes of one table entry contains the first 16 of the next.
; - All memory data is placed in the same region so it can all be set up by
;   one pass of stos/movs instructions.
; - Use of hlt to wait for timer interrupts
; - Storing UHCI port in bp allows one to "lea dx, [bp+4]" to set dx to
;   base port + 4
; 
; Compile by tasm: 
;   tasm /m9 tapani233.asm
; and link
;   tlink /x /t /3 tapani233.obj
;
; Run, for instance, in Bochs v2.4
;
; //Tapani
;

.486                    ; an elegant weapon, for a more civilized time

cseg     segment use16
assume   cs:cseg, ds:cseg, ss:cseg, es:cseg
         org 100h

; ------------------------------------------------------------------------
; | Scan PCI for an UHCI
; ------------------------------------------------------------------------

; Probe loop. Each pass calls pci_scan, which reads one PCI configuration
; dword through ports 0cf8h/0cfch using the address held in the immediate
; operand of its first instruction, and compares the result with 0c030000h.
; pci_scan clears only the two lowest bits of al before returning, so the
; compare succeeds only when the remaining bits of the low byte are zero too.
; If the counter runs out, execution falls through into pci_scan, whose ret
; then leaves the program instead of returning here.
start:
   call  pci_scan
   cmp   eax, 0c030000h
   je    uhci_found
   dec   word ptr [ds:pci_scan + 3]   ; self-modifying: step the word at
                                      ; pci_scan + 3, which lies inside the
                                      ; immediate of "mov eax, 8001ff08h"
                                      ; (assumes the operand-size prefix and
                                      ; opcode occupy the first two bytes)
   jnz   start          ; loop until PCI address space is probed
; pci_scan: read one PCI configuration dword.
;   In:    nothing in registers; the address written to port 0cf8h is the
;          immediate below, which other code patches in place (start
;          decrements the word at pci_scan + 3, uhci_found rewrites the byte
;          at pci_scan + 2).
;   Out:   eax = dword read from port 0cfch with the two lowest bits of al
;          cleared; dx = 0cfch.
;   Note:  also reached by fall-through from start when the scan counter hits
;          zero. In that case the ret has no matching call and pops whatever
;          word is on top of the stack (presumably the zero word DOS pushes
;          for a .COM program, which ends the program -- confirm).
pci_scan:               ; if not found, do a redundant read and exit
   mov   eax, 8001ff08h ; this constant contains loop counter
   mov   dx, 0cf8h
   out   dx, eax        ; select the configuration address
   mov   dl, 0fch       ; dx = 0cfch, only the low byte needs to change
   in    eax, dx
   and   al, 0fch       ; filter away PCI revision / low bits of port
   ret                  ; return value in eax (or exit if no UHCI found)

; ------------------------------------------------------------------------
; | UHCI found
; ------------------------------------------------------------------------

; Entered when start has matched a controller. The immediate in pci_scan still
; addresses that function; only its lowest byte is rewritten (08h -> 20h) so
; that the same routine now returns the register at offset 20h, which this
; program uses as the controller's I/O base port. bp holds that base port for
; the rest of the program.
uhci_found:
   mov   byte ptr [ds:pci_scan + 2], 20h
   call  pci_scan    ; uses the same PCI scanning code to get base port
   hlt               ; wait for interrupt (timer)
   xchg  bp, ax      ; base port in bp

   mov   dl, 04h     ; NOTE(review): dx was 0cfch after pci_scan, so dx is
                     ; now 0c04h, not a PCI configuration port -- confirm
                     ; that this is the intended target of the write below
   mov   ax, 05h     ; full 16-bit load: also clears ah, which held part of
                     ; the old bp value after the xchg
   out   dx, al      ; allow access to PCI

   mov   dx, bp
   out   dx, ax      ; ax = 0005h : reset (GRESET) and start the controller
   hlt               ; wait

   xor   eax, eax    ; eax = 0; the zero upper half is relied on when the
                     ; buffer address is built from cs afterwards
   out   dx, ax      ; write 0000h to the base port, ending the reset

; ------------------------------------------------------------------------
; | Set up memory
; ------------------------------------------------------------------------
;
; * convert Frame TD address to 32-bit and write to the controller
; * convert Setup_Packet address to 32-bit
; initialize data: Terminate each element in frame list by setting bit 0
;                  5 Frame TDs are initialized using 4 dwords each
;                  (the TDs overlap, so the next one starts 10h bytes later):
; 
; [ ptr to next (current + 10h) ] [ from TD_Data block ] [ from TD_Data block ] [ ofs Setup_Packet ] [ 16 bytes of whatever ]
; [ ptr to next (current + 10h) ] [ from TD_Data block ] [ from TD_Data block ] [ ofs Setup_Packet + 8] [ 16 bytes of whatever ]
; ...
; [ something with bit 1 set ]
;
; Memory aligned 4k at OFFSET: 
; +0                 400h dwords ( = offset queue horiz or 3 )
; +1000h             queue horiz ( = terminate)
; +1004h             queue vert ( = offset Frame TDs )
; +1010h             5 x 32 bytes of Frame TDs ( see above )
; +1060h             Setup_Packet
; +1068h             Device Descriptor


; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
; Memory setup: Calculate memory addresses to use, and send to USB controller 
; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

; Pick a work buffer ("OFFSET") above the program, hand its linear address to
; the controller and fill the first 1010h bytes of it.
;   In:  eax = 0 (upper half must be zero), bp = controller base port.
;   Out: es  = segment of OFFSET, di = 1010h (start of the TD area),
;        eax = linear address of OFFSET + 1010h,
;        bx  = low word of the linear address of Setup_Packet,
;        cx  = 0.

NOF_TDS          equ 5                      ; number of TDs built by @tdloop
SETUP_PACKET equ (1010h + NOF_TDS*10h)      ; offset of Setup_Packet in the buffer (1060h)

   mov   cx, 0404h         ; dwords to init before Frame TDs (frame list + queue + some)
                           ; cl=4 is also used for shifting
   push  cs
   pop   ax                ; ax = cs; eax = cs because the upper half is zero

   add   ax, cx            ; add ~16k, then binary and it to 4k alignment
   and   ax, 0fc00h        ; al = 0 gives a 4k alignment after shift
                           ; Clearing the low 10 bits of the segment makes the
                           ; linear address a multiple of 4000h, so its low
                           ; word is at most 0c000h and the 16-bit
                           ; "add ax, SETUP_PACKET" below cannot carry.
                           ; NOTE(review): the and rounds down, so the result
                           ; lies between cs+5 and cs+0404h paragraphs; for an
                           ; unlucky cs the buffer would overlap this program
                           ; -- confirm this is acceptable.
      
   push  ax
   pop   es                ; es = buffer segment
   shl   eax, cl           ; eax <-- ptr to OFFSET

   lea   dx, [bp+8]
   out   dx, eax           ; send TD adress to the UHCI

; - - - - - - - - - - - - - - - - - -
; Memory setup: Init the frame list
; - - - - - - - - - - - - - - - - - -

   add   ax, SETUP_PACKET  ; eax <-- ptr to Setup_Packet
   mov   bx, ax            ; bx <-- ptr to Setup_Packet (high 16 bits are from eax)
   mov   al, 11h           ; eax <-- ptr to TD block + 16 | TERMINATE
                           ; (low byte 60h replaced by 11h: OFFSET + 1010h with bit 0 set)

   xor   di, di            ; es:di <-- ptr to OFFSET
   rep   stosd             ; 0404h dwords = 1010h bytes, all set to eax;
                           ; leaves di = 1010h and cx = 0
   dec   ax                ; clear bit 0 again: eax = address of OFFSET + 1010h
   mov   [es:di-12], ax    ; queue vertical
                           ; (di-12 = 1004h; only the low word is rewritten,
                           ; the high word stored by stosd is already right)


   ; eax is 32-bit address to TD block
   ; es:di is 16-bit ptr to TD block
   ; ds:si is (will be) 16-bit to TD_Data
   ; ebx is offset to Setup_Packet / Device_Descriptor

; - - - - - - - - - - - - - - - - - - - - - - - -
; Memory setup: Loop to set up the TDs in memory 
; - - - - - - - - - - - - - - - - - - - - - - - -

   ; Build NOF_TDS transfer descriptors of four dwords each, starting at
   ; es:1010h, then the setup packet right behind them, then patch two bytes.
   ;   In:  eax = linear address of OFFSET + 1010h, es:di = OFFSET:1010h,
   ;        bx = low word of the linear address of Setup_Packet, cx = 0.
   ;   Out: si = TD_Data + 16 (first port data word), cx = 0.
   mov   cl, NOF_TDS       ; ch is already 0, so cx = NOF_TDS
@tdloop:
   add   al, 16   ; add al suffices since eax (initially) points 4k aligned!
   stosd          ; store ptr to next TD entry

   mov   si, offset TD_Data   ; reloaded every pass: each TD gets the same two dwords
   movsd                      ; second TD dword, from TD_Data
   movsd                      ; third TD dword, from TD_Data

   xchg  ax, bx   ; swap in the low word of the buffer pointer; the upper
                  ; half of eax is shared by both addresses
   stosd          ; fourth TD dword: data buffer pointer
   add   al, 8    ; next TD uses the following 8 bytes
   xchg  ax, bx   ; restore the low word of the TD pointer

   loop  @tdloop

; - - - - - - - - - - - - - - - - - - - - - - - - - - -
; Memory setup: Setup packet is directly after the TDs 
; - - - - - - - - - - - - - - - - - - - - - - - - - - -

   ; di = SETUP_PACKET here; si = TD_Data + 8 after the last pair of movsd.
   movsd          ; first 4 setup packet bytes, copied from the data table
   stosw          ; next 2 bytes are whatever ax holds (low word of the
                  ; address following the last TD), not a table value --
                  ; NOTE(review): confirm the device ignores this field
   movsw          ; last 2 bytes (dw 12h), copied from the data table

; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
; Memory setup: patch TD data that is not set up correctly by the loop
; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

   mov   di, 1057h   ; highest byte of the second dword of the last TD (at 1050h)
   movsb             ; <- 1dh from the data table
   mov   di, 1018h   ; lowest byte of the third dword of the first TD (at 1010h)
   movsb             ; <- 2dh from the data table

; Done setting up memory

; ------------------------------------------------------------------------
; | Configure UHCI, wait for the interrupt
; ------------------------------------------------------------------------

   ; Start the controller and the port, activate the schedule and poll for
   ; completion. All values written with outsw/movsb come, in order, from the
   ; data table following the two patch bytes; si already points there and
   ; advances with every outsw/movsb.
   ;   In:  bp = controller base port, ds:si = port data, es = buffer segment.
   ;   Out: ds = es = buffer segment, dx = bp, al = 1.
   lea   dx, [bp+4]
   outsw             ; 000fh, enable interrupts

   mov   dx, bp
   outsw             ; 0001h, set status (RUN)

   ; first reset the port and then wait
   lea   dx, [bp+10h]
   outsw             ; 0200h
   hlt
   
   ; clear the reset and wait
   outsw             ; 0005h
   hlt

   ; set the first frame in TD frame list to point to the queue
   xor   di, di
   movsb             ; low byte of frame list entry 0 becomes 02h; the other
                     ; three bytes keep the address bits stored by rep stosd

   mov   dx, bp
   outsw             ; 0001h

   push  es
   pop   ds          ; ds = buffer segment; the data table in cs is no longer read

   inc   dx
   inc   dx          ; dx = base port + 2
@3:               
   in    ax, dx
   and   al, 1       ; "and" instead of "test" so al keeps only bit 0;
                     ; the loop exits with al = 1
   jz    short @3    ; wait for the interrupt

   mov   dx, bp
   out   dx, al      ; NOTE(review): al is 1 here (not 0 as an earlier
                     ; comment stated), so 01h is written to the base port
                     ; -- confirm that this is intended

; ------------------------------------------------------------------------
; | Print out result
; ------------------------------------------------------------------------
   
   ; Print the bytes that follow the setup packet in the buffer as two lines:
   ; 16 bytes, then 2 more. prt_seq is first called, then entered again by
   ; fall-through; the second ret has no matching call and ends the program
   ; (presumably via the zero word DOS leaves on a .COM program's stack --
   ; confirm).
   ;
   ; prt_seq:
   ;   In:    ds = es = buffer segment, ds:si = bytes to print,
   ;          cx = number of bytes (only cl is loaded; ch is still 0 from the
   ;          earlier loop instructions), dx = offset where the text is built.
   ;   Out:   si advanced by the byte count, cx = 0, dx unchanged.
   ;   Clobb: ax, di, flags.
   mov   si, SETUP_PACKET + 8
   mov   cl, 10h
   call  prt_seq
   mov   cl, 02h
prt_seq:
   mov   di, dx         ; result string(s) will be stored in ds:dx
                        ; it is possible to use base port as offset, since no
                        ; more memory ops are used, and base port >> 100h
   mov   ax, 0d0ah
   stosw                ; line starts with bytes 0ah, 0dh
@prt_sibyte:
   lodsb
   db    0d4h, 10h      ; hand-encoded "aam 16": ah = high nibble, al = low nibble
   aaa                  ; NOTE(review): aaa adjusts al and ah when the nibble
   adc   al, 30h        ; is above 9, so this pair seems to give the right
                        ; character only for nibbles 0-9 -- confirm that the
                        ; expected data contains no A-F digits
   xchg  al, ah         ; same conversion for the high nibble
   aaa
   adc   al, 30h
   stosw                ; remember, intel byte order

   mov   al, 20h
   stosb                ; separating space
   loop  @prt_sibyte
   dec   di             ; undo space
   mov   ax, 0924h      ; 09h for int 21h (print string), 24h = '$' (terminate) 
   stosb                ; '$' replaces the last space; ah = 09h stays for int 21h
   mov   byte ptr [di-25], '-'      ; put that damn hyphen there...
                        ; On the 16-byte line di-25 is the space after the 8th
                        ; byte. On the 2-byte line it lies 17 bytes before the
                        ; start of the text, outside the printed string.
   int   21h
   ret                  ; return to call, or exit program on 2nd run

; ------------------------------------------------------------------------
; | DATA
; ------------------------------------------------------------------------

; All initialised data, laid out in exactly the order the code consumes it
; through si (movsd/movsw/movsb/outsw). Reordering or resizing anything here
; breaks the program.
TD_Data:
   dd    04800000h   ; second dword of every TD (copied by @tdloop)
   dd    00e00069h   ; third dword of every TD (copied by @tdloop)

;Setup_Packet:
   ; Only these 6 bytes come from the table; the code stores two bytes from
   ; ax (stosw) between the first four and the final word.
   db 80h    ; dev->host, type=standard, recipient=device
   db 06h    ; get descriptor
   db 00h    ; index = 0
   db 01h    ; type = device
   dw 12h    ; last word of the setup packet (copied by movsw)

   db      1dh    ; manual patch for TD status entry
                  ; (written to offset 1057h, inside the last TD)
   db      2dh    ; written to offset 1018h, replacing the 69h byte of the
                  ; first TD's third dword

   ; data to send to ports
   dw    000fh    ; -> base port + 4
   dw    0001h    ; -> base port
   dw    0200h    ; -> base port + 10h

   dw    0005h    ; -> base port + 10h
   db    02h      ; offset queue->horiz | IS_QUEUE
                  ; (stored at offset 0 of the buffer by movsb)
   dw    0001h    ; -> base port

cseg ends
end start

