Ridiculous UTF-8 character counting

by George Pollard

So, Colin Percival has posted a UTF-8 strlen which improves on my previous post. While his code runs slightly slower than mine on my PC, I assume that’s because his code is aimed at a 64-bit architecture. With 32-bits (reading 4 bytes at a time, instead of 8 ) it doesn’t quite get the same speed up.

That said, the vectorization code is clearly an improvement on mine, so let’s take that ball and run with it!

The Code

Now we use SIMD instructions to vectorize the counting of characters. I modified this from Colin’s routine, and I’m sure he has some bit-fiddling up his sleeves that would make this run even faster

As it is, I used a straightforward algorithm to extract the information.

#define GetMask(x) __builtin_ia32_pmovmskb128(x)
#define LoadBytes(x) __builtin_ia32_loaddqu(x)
#define CompareEquality(x,y) __builtin_ia32_pcmpeqb128((x),(y))
#define Or(x,y) __builtin_ia32_por128((x),(y))
#define NotExpected(x) __builtin_expect((x),0)
#define And(x,y) __builtin_ia32_pand128((x),(y))
 
typedef unsigned char v16qi __attribute__ ((vector_size(16)));
 
const char mask[16] = {
    0xc0, 0xc0, 0xc0, 0xc0,
    0xc0, 0xc0, 0xc0, 0xc0,
    0xc0, 0xc0, 0xc0, 0xc0,
    0xc0, 0xc0, 0xc0, 0xc0
};
const char match[16] = {
    0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80
};
const char zero[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
unsigned char HammingWeight[65536]; //initialized elsewhere
 
static size_t cp_strlen_utf8_sse2(const char *_s)
{
    const char *s;
    const v16qi allZero = LoadBytes(zero);
    const v16qi masking = LoadBytes(mask);
    const v16qi matching = LoadBytes(match);
 
    v16qi row;
    size_t count = 0;
    unsigned char b;
 
    // unaligned bytes
    for (s = _s; (uintptr_t) (s) & (sizeof(v16qi) - 1); s++) {
        b = *s;
        if (b == '\0')
            goto done;
        count += (b >> 7) & ((~b) >> 6);
    }
 
    /* Handle complete blocks. */
    for (;; s += sizeof(v16qi)) {
        /* Prefetch */
        __builtin_prefetch(&s[256], 0, 0);
 
        /* Load Bytes */
        row = LoadBytes(s);
 
        /* Expect this to be false :) */
        if (NotExpected(GetMask(
                                   /* Check for zero bytes */
                                      CompareEquality(allZero, row))))
            break;
 
        /* Count number of non-starter bytes */
 
        row = And(row, masking);
        row = CompareEquality(row, matching);
        count += HammingWeight[GetMask(row)];
    }
 
    //leftover bytes
    for (;; s++) {
        b = *s;
        if (b == '\0')
            break;
        count += (b >> 7) & ((~b) >> 6);
    }
 
  done:
    return ((s - _s) - count);
}

Results

This counts about twice as fast as GCC/libc’s standard, non-UTF-8 strlen. Note the discrepancies between my timings of Colin’s code and his own tests. Damn thou, 32-bits!

"": 0 0 0 0 0 0 0
"hello, world": 12 12 12 12 12 12 12
"naïve": 6 6 6 5 5 5 5
"こんにちは": 15 15 15 5 5 5 5
"abcdefghijklmnopqrstuvwxyzβ": 28 28 28 27 27 27 27
testing 33554424 bytes of repeated "hello, world":
                      gcc_strlen =   33554424: 0.019331 +/- 0.001076
                      kjs_strlen =   33554424: 0.035095 +/- 0.000530
                       cp_strlen =   33554424: 0.021472 +/- 0.000310
                 kjs_strlen_utf8 =   33554424: 0.070260 +/- 0.000240
                  gp_strlen_utf8 =   33554424: 0.035144 +/- 0.000471
                  cp_strlen_utf8 =   33554424: 0.050539 +/- 0.000342
             cp_strlen_utf8_sse2 =   33554424: 0.010297 +/- 0.001551
testing 33554430 bytes of repeated "naïve":
                      gcc_strlen =   33554430: 0.019176 +/- 0.000824
                      kjs_strlen =   33554430: 0.035090 +/- 0.000478
                       cp_strlen =   33554430: 0.021472 +/- 0.000323
                 kjs_strlen_utf8 =   27962025: 0.070347 +/- 0.000354
                  gp_strlen_utf8 =   27962025: 0.054802 +/- 0.000299
                  cp_strlen_utf8 =   27962025: 0.050595 +/- 0.000602
             cp_strlen_utf8_sse2 =   27962025: 0.010011 +/- 0.001453
testing 33554430 bytes of repeated "こんにちは":
                      gcc_strlen =   33554430: 0.019331 +/- 0.000836
                      kjs_strlen =   33554430: 0.035225 +/- 0.000411
                       cp_strlen =   33554430: 0.021429 +/- 0.000309
                 kjs_strlen_utf8 =   11184810: 0.070249 +/- 0.000312
                  gp_strlen_utf8 =   11184810: 0.026545 +/- 0.000621
                  cp_strlen_utf8 =   11184810: 0.050512 +/- 0.000273
             cp_strlen_utf8_sse2 =   11184810: 0.010246 +/- 0.001466
testing 33554416 bytes of repeated "abcdefghijklmnopqrstuvwxyzβ":
                      gcc_strlen =   33554416: 0.019308 +/- 0.001091
                      kjs_strlen =   33554416: 0.035070 +/- 0.000486
                       cp_strlen =   33554416: 0.021441 +/- 0.000289
                 kjs_strlen_utf8 =   32356044: 0.070287 +/- 0.000297
                  gp_strlen_utf8 =   32356044: 0.043681 +/- 0.000429
                  cp_strlen_utf8 =   32356044: 0.050402 +/- 0.000204
             cp_strlen_utf8_sse2 =   32356044: 0.010407 +/- 0.001371