I tried using macros to simulate loops in assembler and finally ended up with a higher language compiler into Z80 assembler. I chose Forth as a language because it is simpler than C and in a sense it is somewhere halfway between C and Asm.
When I started I knew almost nothing about Forth, now I know a little more, but not much, because I still haven't used one and I'm looking at it more from the point of view of how to translate an interpreter into asm. Things like "DOES>" keep surprising me.
But the result is quite good. It's the clearest asm code I've ever seen. When each word or group of words is clearly recognizable.
Forth code:
Code: Select all
: fib2 ( n1 -- n2 )
0 1 rot 0 ?do
over + swap
loop
drop
;
: fib2_bench ( -- )
1000 0 do
20 0 do
I fib2 drop
loop
loop
;
fib2_bench
macro M4 FORTH code (../fth2m4.sh fib2.fth > fib2.m4):
Code: Select all
include(`../M4/FIRST.M4')dnl
ifdef __ORG
org __ORG
else
org 32768
endif
INIT(60000)
CALL(_fib2_bench)
STOP
COLON(_fib2,({{{{ n1 -- n2 }}}}))
PUSH(0) PUSH(1) ROT PUSH(0) QUESTIONDO
OVER ADD SWAP
LOOP
DROP
SEMICOLON
COLON(_fib2_bench,({{{{ -- }}}}))
PUSH(1000) PUSH(0) DO
PUSH(20) PUSH(0) DO
I CALL(_fib2) DROP
LOOP
LOOP
SEMICOLON
Code: Select all
ifdef __ORG
org __ORG
else
org 32768
endif
; === b e g i n ===
ld [Stop+1], SP ; 4:20 init storing the original SP value when the "bye" word is used
ld L, 0x1A ; 2:7 init Upper screen
call 0x1605 ; 3:17 init Open channel
ld HL, 0xEA60 ; 3:10 init Return address stack = 60000
exx ; 1:4 init
call _fib2_bench ; 3:17 call ( -- )
Stop: ; stop
ld SP, 0x0000 ; 3:10 stop restoring the original SP value when the "bye" word is used
ld HL, 0x2758 ; 3:10 stop
exx ; 1:4 stop
ret ; 1:10 stop
; ===== e n d =====
; --- the beginning of a non-recursive function ---
_fib2: ; ( n1 -- n2 )
pop BC ; 1:10 : ret
ld [_fib2_end+1],BC; 4:20 : ( ret -- )
;[6:36] 0 1 rot ( x -- 0 1 x )
push DE ; 1:11 0 1 rot
ld DE, 0x0000 ; 3:10 0 1 rot
push DE ; 1:11 0 1 rot
inc E ; 1:4 0 1 rot
ld A, L ; 1:4 0 ?do_101(m) ( stop 0 -- )
ld [stp_lo101], A ; 3:13 0 ?do_101(m) lo stop
ld A, H ; 1:4 0 ?do_101(m)
ld [stp_hi101], A ; 3:13 0 ?do_101(m) hi stop
or L ; 1:4 0 ?do_101(m)
ex DE, HL ; 1:4 0 ?do_101(m)
pop DE ; 1:10 0 ?do_101(m)
jp z, exit101 ; 3:10 0 ?do_101(m)
ld BC, 0x0000 ; 3:10 0 ?do_101(m)
do101save: ; 0 ?do_101(m)
ld [idx101], BC ; 4:20 0 ?do_101(m) save index
do101: ; 0 ?do_101(m)
add HL, DE ; 1:11 over +
ex DE, HL ; 1:4 swap ( b a -- a b )
idx101 EQU $+1 ;[16:57/78] loop_101(m) idx always points to a 16-bit index
ld BC, 0x0000 ; 3:10 loop_101(m)
inc BC ; 1:6 loop_101(m) index++
ld A, C ; 1:4 loop_101(m) lo new index
stp_lo101 EQU $+1 ; loop_101(m)
xor 0x00 ; 2:7 loop_101(m) lo stop
jp nz, do101save ; 3:10 loop_101(m)
ld A, B ; 1:4 loop_101(m) hi new index
stp_hi101 EQU $+1 ; loop_101(m)
xor 0x00 ; 2:7 loop_101(m) hi stop
jp nz, do101save ; 3:10 loop_101(m)
leave101: ; loop_101(m)
exit101: ; loop_101(m)
ex DE, HL ; 1:4 drop
pop DE ; 1:10 drop ( a -- )
_fib2_end:
jp 0x0000 ; 3:10 ;
; --------- end of non-recursive function ---------
; --- the beginning of a non-recursive function ---
_fib2_bench: ; ( -- )
pop BC ; 1:10 : ret
ld [_fib2_bench_end+1],BC; 4:20 : ( ret -- )
ld BC, 0x0000 ; 3:10 1000 0 do_102(xm)
do102save: ; 1000 0 do_102(xm)
ld [idx102],BC ; 4:20 1000 0 do_102(xm)
xor A ; 1:4 20 0 do_103 i_103(m) 8-bit loop ( 20 0 -- i )
do103saveA: ; 20 0 do_103 i_103(m)
push DE ; 1:11 20 0 do_103 i_103(m)
ex DE, HL ; 1:4 20 0 do_103 i_103(m)
ld [idx103], A ; 3:13 20 0 do_103 i_103(m) save lo(index)
ld L, A ; 1:4 20 0 do_103 i_103(m)
ld H, 0x00 ; 2:7 20 0 do_103 i_103(m)
call _fib2 ; 3:17 call ( -- )
ex DE, HL ; 1:4 drop
pop DE ; 1:10 drop ( a -- )
;[9:32/32] loop_103(m) variant +1.ignore: 8-bit loop, run 20x
idx103 EQU $+1 ; loop_103(m) idx always points to a 16-bit index
ld A, 0 ; 2:7 loop_103(m) 0.. +1 ..(20), real_stop:0x0014
db 0x00 ; 1:4 loop_103(m) ignore opcode = hi(index) -> idx always points to a 16-bit index.
inc A ; 1:4 loop_103(m) index++
cp 0x14 ; 2:7 loop_103(m) lo(real_stop)
jp nz, do103saveA ; 3:10 loop_103(m) index<>real_stop?
;[16:57/58] loop_102(xm) variant +1.default: step one, run 1000x
idx102 EQU $+1 ; loop_102(xm) idx always points to a 16-bit index
ld BC, 0x0000 ; 3:10 loop_102(xm) 0.. +1 ..(1000), real_stop:0x03E8
inc BC ; 1:6 loop_102(xm) index++
ld A, C ; 1:4 loop_102(xm)
xor 0xE8 ; 2:7 loop_102(xm) lo(real_stop) first (232>3)
jp nz, do102save ; 3:10 loop_102(xm) 3x false positive
ld A, B ; 1:4 loop_102(xm)
xor 0x03 ; 2:7 loop_102(xm) hi(real_stop)
jp nz, do102save ; 3:10 loop_102(xm) 232x false positive if he was first
leave102: ; loop_102(xm)
exit102: ; loop_102(xm)
_fib2_bench_end:
jp 0x0000 ; 3:10 ;
; --------- end of non-recursive function ---------
Code: Select all
| Forth / C | Benchmark | Time (sec/round) | Bytes
| :---------------------: | :---------: | :--------------- | :------
| M4_FORTH | Fib2 | 0m5.65s | 133
| M4_FORTH use data stack | Fib2s | 0m5.03s | 112
| M4_FORTH use assembler | Fib2a | 0m2.55s | 96
| Boriel Basic zxbc 1.16.4| Fib2 a = a+c| 0m14.38s |
| zcc z88dk v16209 | Fib2 a = a+c| 0m49.19s |
| zcc z88dk v16209 | Fib2 a+= c | 0m43.97s |
| zcc z88dk v19766 -O2 | Fib2 a+= c | 0m36.09s | 2222
But it still has the advantage of holding TOS (top of stack) in the paired register HL and NOS in DE. This in turn forces the programmer to choose an algorithm that is more efficient because it is executed over registers and not memory space.
Thanks to the fact that words or combinations of words are just macros, it is easy to create a new word if necessary and thereby make the resulting program more efficient. If you can improve the translation, you are not dependent on what the C compiler can do.