May 29, 2024, 08:52:28 AM

News:

Own IWBasic 2.x ? -----> Get your free upgrade to 3.x now.........

Searching for string

Started by sapero, September 05, 2006, 07:51:53 AM

0 Members and 1 Guest are viewing this topic.

sapero

September 05, 2006, 07:51:53 AMLast Edit: September 05, 2006, 07:59:16 AM by sapero
Here's a snippet for ultra fast finding a string.
If you have 65536 strings, you need to compare only (max) 16 times to find the string.
I've posted this tipp on IBStandard forum
All what you need is sorted array of string pointers, number of strings and string to find:
// sort approximate time is calculated for given constants and my cpu speed.
`//console app// fast find string exaple// requires sorted table with string pointers (use same compare function for sorting and searching)// how it works:// - set search range to maximum (0 to count-1)// - set compare position 'in between' of the range ( min + 0.5(max-min)Ãƒâ€šÃ,Â  orÃƒâ€šÃ,Â  0.5(min+max)Ãƒâ€šÃ,Â  )// - compare. if equal - found//Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â if greater - move range min to position+1//Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â if bellowÃƒâ€šÃ,Â  - move range max to position-1//Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â if min <= max go to 'set compare position'//// maximum iterations: log2(number_of_strings) - also max 16 for 65536 strings#define NUM_STRINGS 65536*16Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  // number of strings (max 2^32 :P)#define STRING_LEN_MIN 2Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â // minimum characters in string (max 32767)#define STRING_LEN_MAX 32Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  // maximum characters in string (max 32767)string* g_table[NUM_STRINGS];Ãƒâ€šÃ,Â  // array of string pointers to searchintÃƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â g_nStrings;Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  // number of successfully allocted stringsimport void cdecl qsort(void* array, int elements, int elementsize, void* cdecl_compare_fn);import byte getch alias _getch();import intÃƒâ€šÃ,Â  GetTickCount();import intÃƒâ€šÃ,Â  lstrcmp alias lstrcmpA(string *a,string *b); // 0(a=b)Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â +(a>b)Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â -(a<b)global sub main(){ // fill the string pointer g_table print("creating ",NUM_STRINGS," random strings, with ",STRING_LEN_MIN,"-",STRING_LEN_MAX," characters... ",); int time = GetTickCount(); CreateStrings(STRING_LEN_MIN, STRING_LEN_MAX); print("done ", (GetTickCount() - time)/1000.0, "s"); // any string failed? if (g_nStrings != NUM_STRINGS) print("\nOOPS! Out of memory! Created only ", g_nStrings, " strings\n"); // any string allocated? if (g_nStrings == 0) goto __exit001; // sort it print("sorting (approx. ",g_nStrings/71920,"s)... ",); time = GetTickCount(); // we sort pointers, so element size = sizeof(pointer) qsort( &g_table, g_nStrings, sizeof(pointer), &SortModesCallback ); print("done ", (GetTickCount() - time)/1000.0, "s"); // pick a random string from g_table to search for int randindex = (rand(32768) << 16) | rand(32768); while (randindex > (g_nStrings-1)) {randindex /= 2;} // search int Iterations; string *pStrToFind = g_table[randindex]; print("\n* string to search for: '", *pStrToFind, "'"); print("* position: ",randindex); print("searching (case sensitible)... ",); time = GetTickCount(); int pos = Find(pStrToFind, &Iterations); print("done ", (GetTickCount() - time)/1000.0, "s"); // display stats if (pos != -1) { string *lpstr = g_table[pos]; print("\nfound stringÃƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  '", *lpstr, "'"); print("at position ", pos); print("iterations: ",Iterations);} else { print("\nnot found!");} // cleanup print("\ndeleting strings... ",); time = GetTickCount(); FreeStrings(); print("done ", (GetTickCount() - time)/1000.0, "s");__exit001: print("press any key to quit"); getch();}sub CreateStrings(int minlen, int maxlen){ // loop for all strings g_nStrings = 0; for (int pos = 0; pos<NUM_STRINGS; pos++) { // random string lenght int strsize = rand(minlen, maxlen); // allocate new string byte *pstr = new(byte, strsize+1); if (!pstr) break; g_nStrings++; // save it in g_table g_table[pos] = pstr; // and fill it with random characters for (int chr=0; chr<strsize; chr++) { // random character type: big/small letter or a digit int type = rand(0,2); int char; ifÃƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  (type == 0) char = rand('0','9') else if (type == 1) char = rand('a','z') elseÃƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â  char = rand('A','Z'); // write random character *(byte)pstr[chr] = char; } // terminate the string with null *(byte)pstr[strsize] = 0; }}sub Find(string *pStrToFind,int *ppIterations),int{ int min = 0; int max = g_nStrings-1; int cmp = -1; *ppIterations = 0; while (min <= max) { int pos = min + ((max - min) / 2); //print(" - range: ",min,"-",max, " pos:",pos); cmp = lstrcmp(pStrToFind, g_table[pos]); if (cmp > 0) {min = pos+1;} else if (cmp < 0) {max = pos-1;} else break; *ppIterations++; } if (cmp != 0) pos = -1; return pos;}sub FreeStrings(){ for (int pos = 0; pos<g_nStrings; pos++) { delete g_table[pos]; }}// SortModesCallback function should be cdecldeclare cdecl SortModesCallback(void* arg1, void* arg2 ),int;// arg1 and arg1 - addresses of copared elementssub SortModesCallback(void* arg1, void* arg2 ),int{ return lstrcmp(*(pointer)arg1,*(pointer)arg2);}`

kryton9

September 05, 2006, 11:50:52 AM #1
Thanks Sapero, another fine example!! Will definitly be in my study list!!

sapero

September 05, 2006, 01:38:57 PM #2
After some tests: the lstrcmp api is very slow, it sorts much better with crt function strcmp, but the ultra speed you get with StrCmpC from shlwapi.dll

Compare sort time for 65536*4 strings with fixed size 256 characters:
• lstrcmpÃƒâ€šÃ,Â  15Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â s
• strcmpÃƒâ€šÃ,Â  Ãƒâ€šÃ,Â  4Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â s
• StrCmpCÃƒâ€šÃ,Â  Ãƒâ€šÃ,Â 0.4 s

defines:
`// string compare apiimport intÃƒâ€šÃ,Â  lstrcmp alias lstrcmpA(string *a,string *b); // 0(a=b)Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â +(a>b)Ãƒâ€šÃ,Â  Ãƒâ€šÃ,Â -(a<b)import int cdecl strcmp(string *s1,string *s2);// this one is the fastest#use "shlwapi.lib"import int StrCmpC alias StrCmpCA(string *s1,string *s2);`

To apply new compare function you need to replace the original lstrcmp in Find() and in SortModesCallback().
The SortModesCallback could be QsortCallback...
BTW sort time is not important here, only the search time counts here, and it is ~null

Ionic Wind Support Team

September 05, 2006, 01:57:12 PM #3
lstrcmpA is slow on NT systems if your using ANSI strings as it has to convert each string to unicode first and then calls the wide character version of the api.  If you use unicode strings you will find it quicker.

strcmp from C only works with ANSI
Ionic Wind Support Team