Hello all,
While browsing the NASM source code in the \redist folder, I found a way to make the assembler support Unicode strings.
In the file preproc.c I added this prototype and changed new_Token as shown below:
int __stdcall MultiByteToWideChar(int cp,int flag,char *ansi,int len,char *uni,int len2);
/*
 *  new_Token: take a Token from the free list (refilling the list with a
 *  fresh block of TOKEN_BLOCKSIZE tokens when it is empty), initialise it
 *  and hand a pointer to it back to the caller.
 *
 *  next   - stored in the new token's next field
 *  type   - stored in the new token's type field
 *  text   - source text for the token; may be NULL
 *  txtlen - number of characters of text to use; 0 means strlen(text)
 *
 *  The mac field is always set to NULL.  The text field is NULL for
 *  TOK_WHITESPACE tokens or when text is NULL; otherwise it is a buffer
 *  obtained from nasm_malloc.
 *
 *  Unicode extension (by sapero): a TOK_STRING whose text starts with
 *  "\L is not copied verbatim.  Every character up to the closing quote
 *  is converted with MultiByteToWideChar and written as two decimal byte
 *  values, followed by a 0,0 terminator.  For example "\Lab" becomes
 *  97,0,98,0,0,0
 */
static Token *
new_Token(Token * next, int type, char *text, int txtlen)
{
    Token *t;
    int i;

    if (freeTokens == NULL)
    {
        freeTokens = (Token *)new_Block(TOKEN_BLOCKSIZE * sizeof(Token));
        for (i = 0; i < TOKEN_BLOCKSIZE - 1; i++)
            freeTokens[i].next = &freeTokens[i + 1];
        freeTokens[i].next = NULL;
    }
    t = freeTokens;
    freeTokens = t->next;
    t->next = next;
    t->mac = NULL;
    t->type = type;

    if (type == TOK_WHITESPACE || text == NULL)
    {
        t->text = NULL;
        return t;
    }

    if (txtlen == 0)
        txtlen = strlen(text);

    if (type == TOK_STRING && txtlen >= 3 && strncmp(text, "\"\\L", 3) == 0)
    {
        char quote = text[0];       /* delimiter that ends the string */
        char *src = text + 3;       /* first character after "\L */
        char *end = text + txtlen;  /* never read beyond the token */
        char *out;
        unsigned char uni[4];       /* one converted wide character */
        int nchars = 0;

        /*
         * Count the characters first so the buffer can be sized exactly.
         * Stopping at end/NUL as well as at the closing quote keeps an
         * unterminated string from running off the end of the input.
         */
        while (src + nchars < end && src[nchars] != '\0'
               && src[nchars] != quote)
            nchars++;

        /* worst case per character is "255,255," (8 bytes); "0,0" plus
         * the terminating NUL needs 4 more */
        t->text = nasm_malloc(nchars * 8 + 4);
        out = t->text;

        for (i = 0; i < nchars; i++)
        {
            /* code page 0, one input byte, room for one wide character */
            if (MultiByteToWideChar(0, 0, src + i, 1, (char *)uni, 1) != 1)
            {
                /* conversion failed: fall back to the raw byte so that
                 * uni is never used uninitialised */
                uni[0] = (unsigned char)src[i];
                uni[1] = 0;
            }
            /*
             * Both bytes are written as numbers.  Emitting the low byte
             * with "%c" would truncate the text when that byte is 0 and
             * would break the quoting when it is a double quote.
             */
            out += sprintf(out, "%d,%d,", uni[0], uni[1]);
        }
        sprintf(out, "0,0");
    }
    else
    {
        /* the original handler: plain copy of txtlen characters */
        t->text = nasm_malloc(1 + txtlen);
        strncpy(t->text, text, txtlen);
        t->text[txtlen] = '\0';
    }
    return t;
}
Usage is simple: "\Lx", where x is your string. The Unicode string is 'assembled' into the object file, so the .asm output is not changed.
Works also in pure asm coding :)
invoke MessageBoxW, 0, "\Lunicode test", "\L", 0x40
OEM characters are supported through the MultiByteToWideChar API:
// Example: show two Unicode message boxes using "\L" string literals.
extern int MessageBoxW(int hwnd,string s1,opt string *s2,opt int icon=0x40);
global sub main()
{
// "\L" on its own is an empty Unicode string, used here as the second string argument.
MessageBoxW(0,"\Lunicode message box","\L");
// Non-ASCII characters are converted through the code page as well.
return MessageBoxW(0,"\Ląćżńłóęäöü","\L");
}
And to see how the string is modified:
#asm %error '"\Lunicode"'
#endasm
small change: t->text = nasm_malloc(((strlen(text)-2) * 8)+4); // "aaa,bbb," * size + "0,0" + null
cc = t->text;
start = *text;
text += 3; // skip "\L
while (*text != start) {
MultiByteToWideChar(0,0,text,1,uni,1); // convert character
sprintf(cc, "%d,%d,",uni[0],uni[1]); // use codepage
cc += strlen(cc); text++;
}
sprintf(cc, "0,0");
}
With this change both bytes of each Unicode character are written as decimal numbers: xxx,yyy,
Cool, now we have to get Paul to incorporate that.