static int __fnmatch( char *pattern, char *string )
/**************************************************
 * Recursive wildcard matcher: returns 1 when string matches pattern,
 * 0 otherwise.
 *
 * The pattern is processed section by section; a section is a run of
 * wildcard characters followed by a run of ordinary characters.
 * Ordinary characters are always compared with the OS specific
 * FNameCmpChr, as required for file names.
 */
{
    char    *literal;
    int     literal_len;
    int     seen_star = 0;
    int     k;

    /*
     * leading wildcard run: each '?' swallows exactly one character of
     * string, every other wildcard character switches to star mode
     */
    for( ; IS_WILDCARD_CHAR( pattern ); pattern++ ) {
        if( *pattern != '?' ) {
            seen_star = 1;
            continue;
        }
        if( *string == 0 ) {
            return( 0 );
        }
        string++;
    }

    /*
     * pattern exhausted: a match needs either an exhausted string or a
     * star that absorbs whatever is left
     */
    if( *pattern == 0 ) {
        return( ( *string == 0 || seen_star ) ? 1 : 0 );
    }

    if( !seen_star ) {
        /*
         * no star ahead of the ordinary characters: they must match
         * right here, one by one
         */
        do {
            if( FNameCmpChr( *pattern, *string ) != 0 ) {
                return( 0 );
            }
            string++;
            pattern++;
        } while( *pattern && !IS_WILDCARD_CHAR( pattern ) );
        /*
         * section matched, continue with the next one
         */
        return( __fnmatch( pattern, string ) );
    }

    /*
     * star mode: measure the ordinary run and reject early when string
     * has fewer characters left than the run needs
     */
    literal = pattern;
    literal_len = 0;
    do {
        if( string[literal_len] == 0 ) {
            return( 0 );
        }
        literal_len++;
        pattern++;
    } while( *pattern && !IS_WILDCARD_CHAR( pattern ) );

    /*
     * try every position of string as the start of the ordinary run;
     * when the run matches but the remainder does not, keep searching
     * for the next occurrence
     */
    for( ; *string; string++ ) {
        if( FNameCmpChr( *literal, *string ) != 0 ) {
            continue;
        }
        k = 1;
        while( k < literal_len && FNameCmpChr( literal[k], string[k] ) == 0 ) {
            k++;
        }
        if( k == literal_len && __fnmatch( pattern, string + literal_len ) ) {
            return( 1 );
        }
    }
    return( 0 );
}
/**
 * Collects the posting lists of all index terms matching the wildcard query
 * "pattern" and returns them combined into one ExtentList.
 *
 * The part of "pattern" in front of its first wildcard character is used as a
 * prefix. Starting at the file position reported by getBlockStart (or at the
 * first group descriptor if that position is negative), terms are decoded one
 * after another with decodeFrontCoding for as long as their first prefixLen
 * characters do not compare greater than the prefix.
 *
 * A term is accepted when
 *   - its first prefixLen characters equal the prefix,
 *   - fnmatch(pattern, term, 0) returns 0, and
 *   - if "stem" is non-NULL: the term, after Stemmer::stemEnglish has been
 *     applied (from offset 3 on for terms whose prefix starts with "<!>"),
 *     is equal to "stem" according to strcmp.
 *
 * @param pattern  wildcard pattern; its literal prefix must be at least 2
 *                 characters long (5 if it starts with "<!>")
 * @param stem     optional stemmed form that accepted terms must reduce to;
 *                 NULL disables the stemming check
 * @return  a new ExtentList_Empty if the index header reports no descriptors
 *          or no terms, if the prefix is too short, or if no term matched;
 *          the single posting list if exactly one term matched; otherwise the
 *          result of ExtentList::mergeDocumentLevelLists (for "<!>" prefixes)
 *          or a new ExtentList_OR_Postings over all matching lists.
 */
ExtentList * CompactIndex2::getPostingsForWildcardQuery(const char *pattern, const char *stem) {
	if ((header.descriptorCount <= 0) || (header.termCount <= 0))
		return new ExtentList_Empty();

	// extract the literal prefix: cut the copied pattern at its first wildcard
	char *prefix = duplicateString(pattern);
	for (int i = 0; prefix[i] != 0; i++)
		if (IS_WILDCARD_CHAR(prefix[i])) {
			prefix[i] = 0;
			break;
		}
	bool isDocumentLevel = startsWith(prefix, "<!>");
	int prefixLen = strlen(prefix);
	// refuse prefixes shorter than 2 characters (5 when the prefix starts
	// with "<!>"); such queries yield an empty result
	if (prefixLen < (isDocumentLevel ? 5 : 2)) {
		free(prefix);
		return new ExtentList_Empty();
	}

	char t[MAX_TOKEN_LENGTH * 2], prevTerm[MAX_TOKEN_LENGTH * 2];
	int64_t filePosition = getBlockStart(prefix, prevTerm);
	// negative position: fall back to the position of the first group descriptor
	if (filePosition < 0)
		filePosition = groupDescriptors[0].filePosition;

	// we have identified the index block that potentially contains the
	// term that we are looking for; load first BYTES_PER_INDEX_BLOCK bytes
	// into memory and conduct another sequential scan on those data
	// NOTE(review): this outer plsh is shadowed by the declaration inside the
	// while loop below and is not referenced anywhere else in this function
	PostingListSegmentHeader plsh;
	byte buffer[BYTES_PER_INDEX_BLOCK + 256];
	// "status" is later compared against pos to decide when to refill the
	// buffer -- presumably the number of bytes actually read; TODO confirm
	int status = readRawData(filePosition, buffer, sizeof(buffer));
	int pos = 0;

	LocalLock lock(this);
	int termsFound = 0, termsAllocated = 256;
	ExtentList **lists = typed_malloc(ExtentList*, termsAllocated);

	// the segment representation depends on the build configuration: either
	// the compressed postings are copied into memory, or every segment gets a
	// FileFile object created from "file" and the segment's file position
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
	SPL_InMemorySegment *splSegments = NULL;
	FileFile *file = NULL;
#else
	SPL_OnDiskSegment *splSegments = NULL;
	FileFile *file = getFile();
#endif

	// decode the first term of the block into t, then make it the current term
	pos += decodeFrontCoding(&buffer[pos], prevTerm, t);
	strcpy(prevTerm, t);
	while (strncmp(prevTerm, prefix, prefixLen) <= 0) {
		int64_t postingsPosition = filePosition + pos;
		int comparison = strncmp(prevTerm, prefix, prefixLen);
		PostingListSegmentHeader plsh;

		// make sure the current term matches the prefix query and also satisfies the
		// stemming criterion
		// (comparison is forced to -1 for rejected terms, so that their postings
		// are skipped but not collected)
		if (comparison == 0)
			if (fnmatch(pattern, prevTerm, 0) != 0)
				comparison = -1;
		if ((comparison == 0) && (stem != NULL)) {
			char tempForStemming[MAX_TOKEN_LENGTH * 2];
			strcpy(tempForStemming, prevTerm);
			// for "<!>" terms, stemming starts behind the 3-character tag; the
			// strcmp below still compares the whole buffer, tag included
			if (isDocumentLevel)
				Stemmer::stemEnglish(&tempForStemming[3]);
			else
				Stemmer::stemEnglish(tempForStemming);
			if (strcmp(tempForStemming, stem) != 0)
				comparison = -1;
		} // end if ((comparison == 0) && (stem != NULL))

		// walk over the posting list segments of the current term; the first
		// segment is handled by the code below the "== 2" branch, all further
		// segments are handled in one go inside that branch
		int segmentsSeen = 0;
		do {
			if (++segmentsSeen == 2) {
				// an 8-byte value is read from the buffer and used as the distance
				// to skip forward in the file -- presumably it points to the record
				// holding the remaining segments; TODO confirm against the writer
				int64_t markerValue;
				memcpy(&markerValue, &buffer[pos], sizeof(markerValue));
				postingsPosition += sizeof(markerValue);
				filePosition += pos + sizeof(markerValue) + markerValue;
				status = readRawData(filePosition, buffer, sizeof(buffer));
				pos = 0;
				// two vbyte-encoded values; segmentSize is used below as the byte
				// length of the compressed segment headers
				int32_t segmentCount, segmentSize;
				pos += decodeVByte32(&segmentCount, &buffer[pos]);
				pos += decodeVByte32(&segmentSize, &buffer[pos]);
				if (comparison == 0) {
					byte *compressedHeaders = (byte*)malloc(segmentSize);
					readRawData(filePosition + pos, compressedHeaders, segmentSize);
					// the first header is decoded only to advance inPos; segment 0
					// has already been filled in during the first loop iteration
					int inPos = decompressPLSH(&compressedHeaders[0], 0, &plsh);
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
					splSegments = typed_realloc(SPL_InMemorySegment, splSegments, segmentCount);
#else
					splSegments = typed_realloc(SPL_OnDiskSegment, splSegments, segmentCount);
#endif
					for (int i = 1; i < segmentCount; i++) {
						// each header is decoded with the previous segment's
						// lastElement as reference value
						int headerSize = decompressPLSH(&compressedHeaders[inPos], plsh.lastElement, &plsh);
						inPos += headerSize;
						postingsPosition += headerSize;
						splSegments[i].count = plsh.postingCount;
						splSegments[i].byteLength = plsh.byteLength;
						splSegments[i].firstPosting = plsh.firstElement;
						splSegments[i].lastPosting = plsh.lastElement;
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
						splSegments[i].postings = (byte*)malloc(plsh.byteLength);
						readRawData(postingsPosition, splSegments[i].postings, plsh.byteLength);
#else
						splSegments[i].file = new FileFile(file, postingsPosition);
#endif
						// NOTE(review): the extra byte presumably is the one-byte flag
						// stored after each segment (see the loop condition); confirm
						postingsPosition += plsh.byteLength + 1;
					}
					free(compressedHeaders);
					segmentsSeen = segmentCount;
				} // end if (comparison == 0)
				// skip the compressed headers in the buffer; all segments of this
				// term have been dealt with, so leave the segment loop
				pos += segmentSize;
				break;
			} // end if (++segmentsSeen == 2)

			// first segment of the term: its header is stored in the buffer itself
			int headerSize = decompressPLSH(&buffer[pos], 0, &plsh);
			pos += headerSize;
			postingsPosition += headerSize;

			// if the current term matches the query, collect postings data
			if (comparison == 0) {
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
				splSegments = typed_malloc(SPL_InMemorySegment, 1);
				splSegments[0].postings = (byte*)malloc(plsh.byteLength);
				readRawData(postingsPosition, splSegments[0].postings, plsh.byteLength);
#else
				splSegments = typed_malloc(SPL_OnDiskSegment, 1);
				splSegments[0].file = new FileFile(file, postingsPosition);
#endif
				splSegments[0].count = plsh.postingCount;
				splSegments[0].byteLength = plsh.byteLength;
				splSegments[0].firstPosting = plsh.firstElement;
				splSegments[0].lastPosting = plsh.lastElement;
			} // end if (comparison == 0)

			// skip the postings of this segment, for matching and non-matching terms alike
			pos += plsh.byteLength;
			postingsPosition += plsh.byteLength + 1;
			// fewer than 256 bytes left in the buffer: reload, starting at the current position
			if (pos + 256 > status) {
				filePosition += pos;
				status = readRawData(filePosition, buffer, sizeof(buffer));
				pos = 0;
			}
		// the byte following a segment is consumed here; the loop continues only
		// if it is 255 (presumably a "more segments follow" flag -- TODO confirm)
		} while (buffer[pos++] == 255);

		// add current list to set of lists to return to caller
		if ((comparison == 0) && (segmentsSeen > 0)) {
			// grow the result array by doubling when it is full
			if (termsFound >= termsAllocated)
				lists = typed_realloc(ExtentList*, lists, termsAllocated = (termsAllocated * 2));
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
			lists[termsFound++] = new SegmentedPostingList(splSegments, segmentsSeen, true);
#else
			lists[termsFound++] = new SegmentedPostingList(splSegments, segmentsSeen);
#endif
		}

		if (pos + 256 > status) { // refill buffer if necessary
			filePosition += pos;
			status = readRawData(filePosition, buffer, sizeof(buffer));
			pos = 0;
		}

		// fetch next term from buffer
		pos += decodeFrontCoding(&buffer[pos], prevTerm, t);
		strcpy(prevTerm, t);
	} // end while (strncmp(prevTerm, prefix, prefixLen) <= 0)

	free(prefix);

	// build the return value from the number of matching terms
	if (termsFound == 0) {
		// "file" is deleted only on this path, where no segment was created from it
		if (file != NULL)
			delete file;
		free(lists);
		return new ExtentList_Empty();
	}
	else if (termsFound == 1) {
		// a single list is returned directly; only the array holding it is freed
		ExtentList *result = lists[0];
		free(lists);
		return result;
	}
	else if (isDocumentLevel)
		return ExtentList::mergeDocumentLevelLists(lists, termsFound);
	else
		return new ExtentList_OR_Postings(lists, termsFound);
} // end of getPostingsForWildcardQuery(char*, char*)