I had a look at some of these options. My Z80 assembly language isn't much above beginner's level so I stuck with Z88DK/C, which works for me because that's what my game is written in.
I implemented the unsigned int to ASCII conversion several times and put each one in a loop that runs 9999 times so I could time it using a stopwatch.
First up, here's the sprintf() version:
Code: Select all
/*
* zcc +zx -vn -startup=0 -clib=sdcc_iy -SO3 --max-allocs-per-node200000 utoa_char_sprintf.c -o utoa_char_sprintf -create-app --list
*
* gcc -o utoa_char_sprintf utoa_char_sprintf.c
*/
#include <stdio.h>
/*
 * Receives each formatted value: four digits plus the terminating NUL.
 * Declared as plain char because that is the buffer type sprintf() takes;
 * passing an unsigned char array draws a pointer-signedness diagnostic.
 */
char output_str[5];

/*
 * Benchmark: format every value from 9999 down to 1 as a zero-padded,
 * four-digit decimal string using sprintf().
 *
 * When __GNUC__ is defined each string is also printed so the output can
 * be checked; otherwise only the conversion is performed.
 *
 * Returns 0.
 */
int main(void)
{
  unsigned int i;

  /* "%04u" produces exactly four characters for every i in 1..9999, so the
   * result and its NUL always fit in output_str. */
  for( i=9999; i>0; i-- ) {
    sprintf( output_str, "%04u", i );
#ifdef __GNUC__
    printf("%s\n", output_str);
#endif
  }
  return 0;
}
On a 48K Spectrum in Fuse this runs in 24 seconds, which is why I asked the original question! Slow... The Z88DK sprintf %u convertor uses the library's utoa() function under the covers, so replacing sprintf with utoa() results in pretty much the same code and timings.
However, as AA advised, there's a library build time switch which pulls in utoa() code optimised for the base 10 case. Using that brings the testcase run time down to 16 seconds, so that's a pretty useful optimisation. Still slow though. :p
Next up is the ASCII digit rolling technique suggested by @R-Tape and @spectron. Like this:
Code: Select all
/*
* zcc +zx -vn -startup=0 -clib=sdcc_iy -SO3 --max-allocs-per-node200000 utoa_char_digits.c -o utoa_char_digits -create-app --list
*
* gcc -o utoa_char_digits utoa_char_digits.c
*/
#include <stdio.h>
/* The counter is stored directly as four ASCII digits (plus NUL), starting at "9999". */
unsigned char output_str[5] = "9999";

/*
 * Benchmark: count the ASCII counter in output_str down by one per pass,
 * stopping once it reads "0001".
 *
 * When __GNUC__ is defined the buffer is printed after each decrement.
 *
 * Returns 0.
 */
int main()
{
  /* Keep looping until the buffer holds exactly "0001". */
  while( !( output_str[0] == '0' && output_str[1] == '0' &&
            output_str[2] == '0' && output_str[3] == '1' ) ) {
    /*
     * Decrement the rightmost digit. 0x2F is the character code one below
     * '0', so a digit that reaches it has underflowed: reset it to '9' and
     * carry the decrement into the next digit to the left. The first digit
     * that does not underflow ends the chain.
     */
    do {
      if( --output_str[3] != 0x2F ) {
        break;
      }
      output_str[3] = '9';

      if( --output_str[2] != 0x2F ) {
        break;
      }
      output_str[2] = '9';

      if( --output_str[1] != 0x2F ) {
        break;
      }
      output_str[1] = '9';

      if( --output_str[0] != 0x2F ) {
        break;
      }
      output_str[0] = '9';
    } while( 0 );

#ifdef __GNUC__
    printf("%s\n", output_str);
#endif
  }
  return 0;
}
This is faster still. Even on the 48K Spectrum this is almost too fast to time. It takes a bit over a second. The code compiles to this:
Code: Select all
; Compiler output for the digit-rolling main(): loops until output_str reads
; "0001", decrementing the ASCII digits in place with borrow.
_main:
l_main_00112:
; Loop condition. d = output_str[0], e = output_str[3]; bc keeps pointing at
; output_str[0] for the rest of the pass.
ld bc,_output_str+0
ld a, (bc)
ld d, a
ld hl,+(_output_str + 0x0003)
ld e, (hl)
; output_str[0] != '0' (0x30)?  If so, run the loop body.
ld a, d
sub a,0x30
jr NZ,l_main_00113
; output_str[1] != '0'?
ld a,(_output_str + 0x0001)
sub a,0x30
jr NZ,l_main_00113
; output_str[2] != '0'?
ld a,(_output_str + 0x0002)
sub a,0x30
jr NZ,l_main_00113
; output_str[3] == '1' (0x31)?  Then the buffer is "0001": leave the loop.
ld a, e
sub a,0x31
jr Z,l_main_00114
l_main_00113:
; Loop body: --output_str[3]; if it did not drop to 0x2F, start the next pass.
dec e
ld hl, +(_output_str + 0x0003)
ld (hl), e
ld a, e
sub a,0x2f
jr NZ,l_main_00112
; output_str[3] = '9' (0x39), then --output_str[2].
; add a,0xff adds 0xFF, which in 8 bits is the same as subtracting 1.
ld (hl),0x39
ld a,(_output_str + 0x0002)
add a,0xff
ld hl, +(_output_str + 0x0002)
ld (hl), a
sub a,0x2f
jr NZ,l_main_00112
; output_str[2] = '9', then --output_str[1].
ld (hl),0x39
ld a,(_output_str + 0x0001)
add a,0xff
ld hl, +(_output_str + 0x0001)
ld (hl), a
sub a,0x2f
jr NZ,l_main_00112
; output_str[1] = '9', then --output_str[0] through bc.
ld (hl),0x39
ld a, (bc)
add a,0xff
ld (bc), a
sub a,0x2f
jr NZ,l_main_00112
; output_str[0] = '9', then back to the loop condition.
ld a,0x39
ld (bc), a
jr l_main_00112
l_main_00114:
; return 0 (hl = 0).
ld hl,0x0000
ret
I can follow that, just about. I don't quite get why it uses an 'add a,0xff' to do the decrement (surely 'dec a' would be the obvious choice?) but the rest of it makes sense.
The problem with this approach is that it's not very flexible. If I want to decrement the counter by, say, 10, I'd have to run it in a loop that many times. If I want to do even moderately complex maths on the value, like adding a bonus, then it gets a bit tricky. But it's fast though.
Here's the other option I played with, taking the divide-by-powers-of-10 approach advocated by @Metalbrain:
Code: Select all
/*
* zcc +zx -vn -startup=0 -clib=sdcc_iy -SO3 --max-allocs-per-node200000 utoa_pow10_divs.c -o utoa_pow10_divs -create-app --list
*
* gcc -o utoa_pow10_divs utoa_pow10_divs.c
*/
#include <stdio.h>
#include <string.h>
/*
 * Receives the four ASCII digits. Only bytes 0..3 are ever written, so the
 * fifth byte keeps its zero initialisation and terminates the string.
 */
unsigned char output_str[5];

/*
 * Benchmark: convert every value from 9999 down to 1 to four ASCII digits
 * by repeatedly subtracting 1000, 100 and 10 and counting the subtractions.
 *
 * When __GNUC__ is defined each result is also printed.
 *
 * Returns 0.
 */
int main()
{
  unsigned int i;

  for( i = 9999; i != 0; --i ) {
    unsigned int rest = i;

    /* Start every conversion from "0000"; the counts are added to the '0's. */
    memcpy( output_str, "0000", 4 );

    /* Thousands digit: one increment per 1000 removed. */
    for( ; rest >= 1000; rest -= 1000 ) {
      ++output_str[0];
    }

    /* Hundreds digit. */
    for( ; rest >= 100; rest -= 100 ) {
      ++output_str[1];
    }

    /* Tens digit. */
    for( ; rest >= 10; rest -= 10 ) {
      ++output_str[2];
    }

    /* Whatever is left (0..9) is the units digit. */
    output_str[3] += rest;

#ifdef __GNUC__
    printf("%s\n", output_str);
#endif
  }
  return 0;
}
This loop counts down in about 5 seconds, so it's still way faster than the sprintf and keeps the value in a scalar so it's easy to manipulate.
The code compiles to this:
Code: Select all
; Compiler output for the powers-of-10 main(): i lives in bc, tmp in de.
_main:
; i = 9999 (0x270f).
ld bc,0x270f
l_main_00110:
; while( i > 0 ): b OR c is zero only when bc is zero.
ld a, b
or a,c
jr Z,l_main_00112
; memcpy( output_str, "0000", 4 ): ldir copies bc = 4 bytes from hl to de.
; ___str_0 is presumably the "0000" literal. bc (i) is saved around the copy
; because ldir uses it as the byte count.
push bc
ld de,_output_str
ld bc,0x0004
ld hl,___str_0
ldir
pop bc
; tmp = i  (de = bc).
ld e, c
ld d, b
l_main_00101:
; while( tmp >= 1000 ): 16-bit compare de - 0x03e8 done a byte at a time;
; carry set means tmp < 1000, so fall through to the hundreds loop.
ld a, e
sub a,0xe8
ld a, d
sbc a,0x03
jr C,l_main_00119
; output_str[0]++
ld a,(_output_str)
inc a
ld (_output_str),a
; tmp -= 1000, done as de + 0xfc18 (which is -1000 in 16 bits).
ld hl,0xfc18
add hl,de
ex de,hl
jr l_main_00101
l_main_00119:
l_main_00104:
; while( tmp >= 100 ): compare de with 0x0064.
ld a, e
sub a,0x64
ld a, d
sbc a,0x00
jr C,l_main_00121
; output_str[1]++
ld hl,_output_str + 1
inc (hl)
; tmp -= 100, done as de + 0xff9c (-100).
ld hl,0xff9c
add hl,de
ex de,hl
jr l_main_00104
l_main_00121:
l_main_00107:
; while( tmp >= 10 ): compare de with 0x000a.
ld a, e
sub a,0x0a
ld a, d
sbc a,0x00
jr C,l_main_00109
; output_str[2]++
ld hl,_output_str + 2
inc (hl)
; tmp -= 10, done as de + 0xfff6 (-10).
ld hl,0xfff6
add hl,de
ex de,hl
jr l_main_00107
l_main_00109:
; output_str[3] += tmp (only the low byte, e, is needed since tmp < 10 here).
ld a,(_output_str + 0x0003)
add a, e
ld ((_output_str + 0x0003)),a
; i--, then back to the loop condition.
dec bc
jr l_main_00110
l_main_00112:
; return 0 (hl = 0).
ld hl,0x0000
ret
I'm a bit out of my depth with this, but I get the idea.
Finally I tried switching to the small_utoa assembly language function described by AA, which is used internally in Z88DK. This does something similar to the tight powers-of-10 dividing code posted by @Bizzley:
Code: Select all
/*
* zcc +zx -vn -startup=0 -clib=sdcc_iy -SO3 --max-allocs-per-node200000 small_utoa_main.c small_utoa.asm -o small_utoa -create-app --list
*
*/
#include <stdio.h>
#include <string.h>
/*
 * Receives each converted value: up to four digits plus the terminating NUL.
 * Declared as plain char to match small_utoa()'s buffer parameter.
 */
char output_str[5];

/*
 * Assembly-language entry point supplied by small_utoa.asm. It passes num
 * and buf to z88dk's internal l_small_utoa routine and then stores a 0 byte
 * after the characters that routine wrote.
 */
char *small_utoa( unsigned int num, char *buf );

/*
 * Benchmark: convert every value from 9999 down to 1 with small_utoa().
 * The result is left in output_str and is not printed.
 *
 * Returns 0.
 */
int main(void)
{
  unsigned int i;

  for( i=9999; i>0; i-- ) {
    small_utoa( i, output_str );
  }
  return 0;
}
It needs this (not very efficient because I'm still new to this) interface code to externalise what is normally an internal function:
Code: Select all
; char *small_utoa(unsigned int num, char *buf)
SECTION code_stdlib
PUBLIC _small_utoa
EXTERN l_small_utoa
; Stack-parameter entry point so C code can call z88dk's l_small_utoa.
;
; On entry the stack holds, from the top: return address, num, buf
; (the C prototype has exactly two 16-bit arguments). They are popped into
; hl (num) and de (buf) and pushed straight back, so the stack is unchanged
; when this function returns to its caller.
;
; After the call, de is taken as the address just past the last character
; written; a 0 byte is stored there and that address is left in hl.
_small_utoa:
pop af                  ; return address (this also overwrites the flags)
pop hl                  ; num
pop de                  ; buf
push de
push hl
push af
; pop af above loaded F from the return address, so carry is arbitrary here.
; l_small_utoa reads carry as its "write leading zeroes" switch (see the
; z88dk l_small_utoa source); clear it so the digit count is predictable and
; a value up to 9999 plus the terminator fits in a 5-byte buffer.
and a
call l_small_utoa
ex de,hl                ; hl = address following the digits
ld (hl),0               ; terminate the string
ret
That takes about 3.5 seconds to do what is basically the same as my C code, only with hand coded ASM.
I'd like to try the BCD option but I don't understand it well enough to try.
So, bottom line: I knew the sprintf() would be slow which is why I went through this exercise. The divide-by-powers-of-10 approach is much faster, and the hand coded assembly language version internal to Z88DK is significantly faster than my C implementation, so that's the code to use for maximum flexibility. However, the way my game currently works I think I can use the digit rolling code.