static void bitstring_xor_worker(int ds, char *bit_string, long bit_string_length, uint64_t *resultbuffer) { long remaininglength = bit_string_length * 8; // convert bytes to bits char *current_bit_string_pos; current_bit_string_pos = bit_string; long long offset = 0; int block_size = xordatastoretable[ds].sizeofablock; char *datastorebase; datastorebase = (char *) xordatastoretable[ds].datastore; int dwords_per_block = block_size / sizeof(uint64_t); int bit = 128; while (remaininglength >0) { if ((*current_bit_string_pos) & bit) { XOR_fullblocks(resultbuffer, (uint64_t *) (datastorebase+offset), dwords_per_block); } offset += block_size; bit /= 2; remaininglength -=1; if (bit == 0) { bit = 128; current_bit_string_pos++; } } }
// This function needs to be fast. It is a good candidate for releasing Python's GIL static void multi_bitstring_xor_worker(int ds, char *bit_string, long bit_string_length, unsigned int numstrings, __m128i *resultbuffer) { long one_bit_string_length = bit_string_length / numstrings; // length of one bit string long remaininglength = one_bit_string_length * 8; // convert bytes to bits char *current_bit_string_pos; current_bit_string_pos = bit_string; long long offset = 0; int block_size = xordatastoretable[ds].sizeofablock; char *datastorebase; datastorebase = (char *) xordatastoretable[ds].datastore; int dwords_per_block = block_size / sizeof(__m128i); unsigned char bit = 128; unsigned int i; while (remaininglength > 0) { for(i = 0; i < numstrings; i++){ if ( *(current_bit_string_pos + one_bit_string_length * i) & bit) { XOR_fullblocks(resultbuffer + dwords_per_block * i, (__m128i *) (datastorebase + offset), dwords_per_block); } } offset += block_size; bit /= 2; remaininglength -=1; if (bit == 0) { bit = 128; current_bit_string_pos++; } } }
// This XORs data with the starting data in dest
/*
 * XOR stringlength bytes of data into dest, using 8-byte words for the bulk.
 *
 * Inputs of at most sizeof(uint64_t) bytes are delegated to slow_XOR. Longer
 * inputs are processed in three parts: the leading bytes up to the first
 * 8-byte boundary (XOR_byteblocks), the aligned middle (XOR_fullblocks), and
 * whatever bytes remain at the end (XOR_byteblocks).
 *
 * The word-sized middle only works if dest and data have the same offset
 * within an 8-byte word, so that both become aligned after skipping the same
 * number of leading bytes.
 *
 *   dest          buffer that is XORed in place
 *   data          bytes to XOR into dest
 *   stringlength  number of bytes to process
 *
 * Returns dest on success, the result of slow_XOR for short inputs, or NULL
 * (after printing an error) when dest and data are not identically aligned.
 */
static char *fast_XOR(char *dest, const char *data, long stringlength)
{
    int leadingmisalignedbytes;
    long fulllengthblocks;
    int remainingbytes;

    // If it's shorter than a block, use char-based XOR.
    // The comparison is done in signed arithmetic so that a negative length
    // is not converted to a huge unsigned value and sent down the fast path.
    if (stringlength <= (long) sizeof(uint64_t)) {
        return slow_XOR(dest, data, stringlength);
    }

    // dest and data must be identically DWORD aligned; bail out if they are
    // NOT (the previous test used ==, which rejected every valid input).
    if (((uintptr_t) dest) % sizeof(uint64_t) != ((uintptr_t) data) % sizeof(uint64_t)) {
        printf("Error, assumed that dest and data are identically DWORD aligned!\n");
        return NULL;
    }

    // Let's XOR any stray bytes at the front...
    // This is the number of bytes that are before we get DWORD aligned.
    // To compute this we do (8 - (pos % 8)) % 8
    leadingmisalignedbytes = (int) ((sizeof(uint64_t) - (((uintptr_t) data) % sizeof(uint64_t))) % sizeof(uint64_t));
    XOR_byteblocks(dest, data, leadingmisalignedbytes);

    // The middle will be done with full sized blocks...
    fulllengthblocks = (stringlength - leadingmisalignedbytes) / (long) sizeof(uint64_t);
    XOR_fullblocks((uint64_t *) (dest + leadingmisalignedbytes),
                   (uint64_t *) (data + leadingmisalignedbytes),
                   fulllengthblocks);

    // XOR anything left over at the end...
    remainingbytes = (int) (stringlength - leadingmisalignedbytes - fulllengthblocks * (long) sizeof(uint64_t));
    XOR_byteblocks(dest + stringlength - remainingbytes,
                   data + stringlength - remainingbytes,
                   remainingbytes);

    return dest;
}