Example #1
0
static int __fnmatch( char *pattern, char *string )
/**************************************************
 * Match a file name against a wildcard pattern.
 *
 * Returns 1 when 'string' matches 'pattern' and 0 otherwise.  A '?'
 * consumes exactly one character of 'string'; every other character
 * accepted by IS_WILDCARD_CHAR is handled as a "star" that may absorb
 * any number of characters (including none).  Literal characters are
 * always compared with the OS specific FNameCmpChr, which yields 0
 * when the two characters are considered equal.
 */
{
    char    *literal;
    int     literal_len;
    int     have_star;
    int     k;

    /*
     * step 1: consume the leading run of wildcard characters
     */
    have_star = 0;
    for( ; IS_WILDCARD_CHAR( pattern ); pattern++ ) {
        if( *pattern != '?' ) {
            have_star = 1;
            continue;
        }
        /*
         * '?' needs one real character to swallow
         */
        if( *string == 0 ) {
            return( 0 );
        }
        string++;
    }
    if( *pattern == 0 ) {
        /*
         * pattern is exhausted: a star absorbs whatever is left,
         * otherwise the string has to be exhausted as well
         */
        return( ( have_star || *string == 0 ) ? 1 : 0 );
    }

    /*
     * step 2: the pattern now starts with a run of literal characters
     * ( at least one, terminated by a wildcard or the end of pattern )
     */
    if( !have_star ) {
        /*
         * no star in front: the literal run must match right here
         */
        do {
            if( FNameCmpChr( *pattern, *string ) != 0 ) {
                return( 0 );
            }
            pattern++;
            string++;
        } while( *pattern && !IS_WILDCARD_CHAR( pattern ) );
        /*
         * run matched, continue with the next pattern section
         */
        return( __fnmatch( pattern, string ) );
    }

    /*
     * star in front: measure the literal run and give up early if the
     * string is shorter than the run
     */
    literal = pattern;
    literal_len = 0;
    do {
        if( string[literal_len] == 0 ) {
            return( 0 );
        }
        literal_len++;
        pattern++;
    } while( *pattern && !IS_WILDCARD_CHAR( pattern ) );

    /*
     * try every position of the string as start of the literal run;
     * a position only counts if the rest of the pattern matches the
     * text behind the run, otherwise look for the next occurrence
     */
    for( ; *string; string++ ) {
        for( k = 0; k < literal_len; k++ ) {
            if( FNameCmpChr( literal[k], string[k] ) != 0 ) {
                break;
            }
        }
        if( k == literal_len && __fnmatch( pattern, string + literal_len ) ) {
            return( 1 );
        }
    }
    return( 0 );
}
Example #2
0
/**
 * Collects the postings of all index terms that match a wildcard pattern.
 *
 * The part of `pattern` in front of its first wildcard character is used as a
 * literal prefix: it selects the position in the index file at which a
 * sequential scan starts, and the scan continues for as long as the current
 * term compares <= the prefix on its first strlen(prefix) characters. A term
 * that shares the prefix is accepted only if fnmatch(pattern, term, 0)
 * succeeds and, when `stem` is non-NULL, if the stemmed term equals `stem`.
 *
 * @param pattern  Wildcard pattern. A prefix starting with "<!>" selects the
 *                 document-level merge for the result (see return value).
 * @param stem     If non-NULL, the stemmed form (Stemmer::stemEnglish) that
 *                 every accepted term has to produce; NULL disables the check.
 * @return A newly created ExtentList:
 *         - ExtentList_Empty if the index has no descriptors or terms, if the
 *           literal prefix is shorter than 2 characters (5 for "<!>" prefixes,
 *           i.e. 2 characters after the tag), or if no term matched;
 *         - the list of the single matching term if exactly one matched;
 *         - otherwise the result of ExtentList::mergeDocumentLevelLists (for
 *           "<!>" prefixes) or an ExtentList_OR_Postings over all matches.
 */
ExtentList * CompactIndex2::getPostingsForWildcardQuery(const char *pattern, const char *stem) {
	// an index without descriptors or terms cannot contain any match
	if ((header.descriptorCount <= 0) || (header.termCount <= 0))
		return new ExtentList_Empty();

	// extract the literal prefix: cut a private copy of the pattern at the
	// first wildcard character
	char *prefix = duplicateString(pattern);
	for (int i = 0; prefix[i] != 0; i++)
		if (IS_WILDCARD_CHAR(prefix[i])) {
			prefix[i] = 0;
			break;
		}
	// reject prefixes that are too short; for "<!>" terms the 3-character tag
	// does not count towards the 2-character minimum
	bool isDocumentLevel = startsWith(prefix, "<!>");
	int prefixLen = strlen(prefix);
	if (prefixLen < (isDocumentLevel ? 5 : 2)) {
		free(prefix);
		return new ExtentList_Empty();
	}
	
	// t receives each decoded term; prevTerm holds the most recently decoded
	// term and is handed to decodeFrontCoding as its reference term
	char t[MAX_TOKEN_LENGTH * 2], prevTerm[MAX_TOKEN_LENGTH * 2];
	int64_t filePosition = getBlockStart(prefix, prevTerm);
	// a negative result from getBlockStart makes the scan start at the file
	// position of the very first group descriptor
	if (filePosition < 0)
		filePosition = groupDescriptors[0].filePosition;

	// we have identified the index block that potentially contains the
	// term that we are looking for; load first BYTES_PER_INDEX_BLOCK bytes
	// into memory and conduct another sequential scan on those data
	// NOTE(review): this plsh is shadowed by the declaration inside the scan
	// loop below and is not referenced anywhere else in this function
	PostingListSegmentHeader plsh;
	byte buffer[BYTES_PER_INDEX_BLOCK + 256];
	// status keeps the return value of the latest buffer fill; the refill
	// checks below compare it against pos + 256
	int status = readRawData(filePosition, buffer, sizeof(buffer));
	// pos is the read offset inside buffer; filePosition + pos is the
	// corresponding absolute position in the index file
	int pos = 0;

	// NOTE(review): presumably a scoped lock on this index that is held until
	// the function returns -- confirm against LocalLock
	LocalLock lock(this);

	// growable array with one posting list per matching term
	int termsFound = 0, termsAllocated = 256;
	ExtentList **lists = typed_malloc(ExtentList*, termsAllocated);

	// segment descriptors of the term currently being collected; depending on
	// the build, postings are either copied into memory or referenced through
	// FileFile objects created on top of `file`
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
	SPL_InMemorySegment *splSegments = NULL;
	FileFile *file = NULL;
#else
	SPL_OnDiskSegment *splSegments = NULL;
	FileFile *file = getFile();
#endif

	// decode the first term of the block
	pos += decodeFrontCoding(&buffer[pos], prevTerm, t);
	strcpy(prevTerm, t);
	// scan terms until one compares greater than the prefix on its first
	// prefixLen characters
	while (strncmp(prevTerm, prefix, prefixLen) <= 0) {
		// absolute file position of the data that follows the current term
		int64_t postingsPosition = filePosition + pos;
		// comparison == 0 means "collect this term"; it is set to -1 below when
		// one of the additional checks rejects the term, in which case the
		// term's data is only skipped
		int comparison = strncmp(prevTerm, prefix, prefixLen);
		PostingListSegmentHeader plsh;

		// make sure the current term matches the prefix query and also satisfies the
		// stemming criterion
		if (comparison == 0)
			if (fnmatch(pattern, prevTerm, 0) != 0)
				comparison = -1;
		if ((comparison == 0) && (stem != NULL)) {
			// stem a scratch copy so that prevTerm itself stays untouched
			char tempForStemming[MAX_TOKEN_LENGTH * 2];
			strcpy(tempForStemming, prevTerm);
			// for document-level terms the stemmer is applied behind the
			// 3-character "<!>" tag
			if (isDocumentLevel)
				Stemmer::stemEnglish(&tempForStemming[3]);
			else
				Stemmer::stemEnglish(tempForStemming);
			if (strcmp(tempForStemming, stem) != 0)
				comparison = -1;
		} // end if ((comparison == 0) && (stem != NULL))

		// walk over the posting list segments of the current term; the loop
		// repeats while the byte following a segment has the value 255
		int segmentsSeen = 0;
		do {
			if (++segmentsSeen == 2) {
				// The term has more than one segment. All segments after the
				// first are described somewhere else: read a 64-bit marker
				// value, move the file position forward by that amount (plus
				// what has been consumed so far) and reload the buffer there.
				// NOTE(review): markerValue looks like the byte length of the
				// remaining postings data -- confirm against the index writer
				int64_t markerValue;
				memcpy(&markerValue, &buffer[pos], sizeof(markerValue));
				postingsPosition += sizeof(markerValue);
				filePosition += pos + sizeof(markerValue) + markerValue;
				status = readRawData(filePosition, buffer, sizeof(buffer));
				pos = 0;

				// at the new position: total number of segments and the byte
				// size of the compressed segment headers that follow
				int32_t segmentCount, segmentSize;
				pos += decodeVByte32(&segmentCount, &buffer[pos]);
				pos += decodeVByte32(&segmentSize, &buffer[pos]);

				if (comparison == 0) {
					// the compressed headers are read into their own buffer
					// because segmentSize is independent of sizeof(buffer)
					byte *compressedHeaders = (byte*)malloc(segmentSize);
					readRawData(filePosition + pos, compressedHeaders, segmentSize);
					// the first compressed header is decoded only to obtain its
					// size and to seed plsh; splSegments[0] has already been
					// filled in by the first pass through this loop
					int inPos = decompressPLSH(&compressedHeaders[0], 0, &plsh);
					// grow the one-element array allocated for the first segment
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
					splSegments = typed_realloc(SPL_InMemorySegment, splSegments, segmentCount);
#else
					splSegments = typed_realloc(SPL_OnDiskSegment, splSegments, segmentCount);
#endif
					for (int i = 1; i < segmentCount; i++) {
						// each header is decoded with the previous segment's
						// last element as second argument
						// NOTE(review): presumably headers are delta-coded
						// against that value -- confirm in decompressPLSH
						int headerSize = decompressPLSH(&compressedHeaders[inPos], plsh.lastElement, &plsh);
						inPos += headerSize;
						postingsPosition += headerSize;
						splSegments[i].count = plsh.postingCount;
						splSegments[i].byteLength = plsh.byteLength;
						splSegments[i].firstPosting = plsh.firstElement;
						splSegments[i].lastPosting = plsh.lastElement;
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
						splSegments[i].postings = (byte*)malloc(plsh.byteLength);
						readRawData(postingsPosition, splSegments[i].postings, plsh.byteLength);
#else
						splSegments[i].file = new FileFile(file, postingsPosition);
#endif
						// NOTE(review): the extra byte presumably skips the
						// one-byte marker stored after each segment -- confirm
						postingsPosition += plsh.byteLength + 1;
					}

					free(compressedHeaders);
					// report the full segment count to the list constructor
					segmentsSeen = segmentCount;
				} // end if (comparison == 0)

				// skip the compressed headers in the main buffer and leave the
				// segment loop; the term is complete
				pos += segmentSize;
				break;
			} // end if (++segmentsSeen == 2)

			// first segment of the term: its header is stored inline in the
			// buffer, directly followed by plsh.byteLength bytes of postings
			int headerSize = decompressPLSH(&buffer[pos], 0, &plsh);
			pos += headerSize;
			postingsPosition += headerSize;

			// if the current term matches the query, collect postings data
			if (comparison == 0) {
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
				splSegments = typed_malloc(SPL_InMemorySegment, 1);
				splSegments[0].postings = (byte*)malloc(plsh.byteLength);
				readRawData(postingsPosition, splSegments[0].postings, plsh.byteLength);
#else
				splSegments = typed_malloc(SPL_OnDiskSegment, 1);
				splSegments[0].file = new FileFile(file, postingsPosition);
#endif
				splSegments[0].count = plsh.postingCount;
				splSegments[0].byteLength = plsh.byteLength;
				splSegments[0].firstPosting = plsh.firstElement;
				splSegments[0].lastPosting = plsh.lastElement;
			} // end if (comparison == 0)

			// step over the postings of this segment (and, for the absolute
			// position, over the marker byte tested in the loop condition)
			pos += plsh.byteLength;
			postingsPosition += plsh.byteLength + 1;

			// reload the buffer from the current position when fewer than 256
			// bytes of the last read remain ahead of pos
			if (pos + 256 > status) {
				filePosition += pos;
				status = readRawData(filePosition, buffer, sizeof(buffer));
				pos = 0;
			}
		} while (buffer[pos++] == 255);

		// add current list to set of lists to return to caller
		if ((comparison == 0) && (segmentsSeen > 0)) {
			// double the capacity of the result array when it is full
			if (termsFound >= termsAllocated)
				lists = typed_realloc(ExtentList*, lists, termsAllocated = (termsAllocated * 2));
			// NOTE(review): splSegments is handed to the new list and replaced
			// by a fresh allocation for the next matching term, so the list
			// presumably takes ownership of the array -- confirm
#if ALWAYS_LOAD_POSTINGS_INTO_MEMORY
			lists[termsFound++] = new SegmentedPostingList(splSegments, segmentsSeen, true);
#else
			lists[termsFound++] = new SegmentedPostingList(splSegments, segmentsSeen);
#endif
		}

		if (pos + 256 > status) {
			// refill buffer if necessary
			filePosition += pos;
			status = readRawData(filePosition, buffer, sizeof(buffer));
			pos = 0;
		}

		// fetch next term from buffer
		pos += decodeFrontCoding(&buffer[pos], prevTerm, t);
		strcpy(prevTerm, t);
	} // end while (strncmp(prevTerm, prefix, prefixLen) <= 0)

	free(prefix);

	// build the result from the number of matching terms
	if (termsFound == 0) {
		// nothing references `file` in this case, so it is released here
		if (file != NULL)
			delete file;
		free(lists);
		return new ExtentList_Empty();
	}
	else if (termsFound == 1) {
		// a single list is returned as is; only the array around it is freed
		ExtentList *result = lists[0];
		free(lists);
		return result;
	}
	// several matches: the lists array is passed on to the combining object
	else if (isDocumentLevel)
		return ExtentList::mergeDocumentLevelLists(lists, termsFound);
	else
		return new ExtentList_OR_Postings(lists, termsFound);
} // end of getPostingsForWildcardQuery(char*, char*)