// ===== Example 1 =====
/*
 * Decodes the multibyte text in read_buffer into wide characters, counts
 * each bigram (pair of consecutive non-stopword, non-ASCII characters)
 * in articleHashTable, and builds a per-character word list describing
 * term/break structure.
 *
 * read_buffer       : NUL-terminated multibyte input text.
 * article_wordcount : out - number of entries in the returned list.
 * term_totalCount   : in/out - incremented once per bigram occurrence.
 * articleHashTable  : hash table (2000 buckets) mapping bigram -> count.
 * stopwordList      : stopword list consulted per character.
 *
 * Returns a malloc'd wordnode array (caller frees), or NULL on empty
 * input or allocation failure (with *article_wordcount set to 0).
 */
wordnode* article_loadIntoHt(char* read_buffer, int* article_wordcount, long* term_totalCount,node ** articleHashTable, stopwordNode * stopwordList){

//load into article
	wchar_t wchar, pre_wchar;
	wordnode* article_wordlist;
	node * articleTempNode;
	int i;
	wchar_t term[3]={'\0'};
	int bufferflag=0, wordcount=0;

	wchar_t article_buffer[2000];
	wchar_t dest;
	size_t length, max;
	mbstate_t mbs;

	printf("@\n@B:%s", read_buffer);

	/* Reset the conversion state, then decode one multibyte character
	 * at a time. */
	mbrlen( NULL, 0, &mbs);
	max= strlen(read_buffer);

	bufferflag=0;
	/* Stop at 1999 so one slot remains for the terminating L'\0'. */
	while(max>0 && bufferflag<1999){
		length = mbrtowc(&dest, read_buffer, max, &mbs);

		/* length==0 means an embedded NUL; length>max also covers the
		 * (size_t)-1 / (size_t)-2 error returns of mbrtowc. */
		if((length==0)||(length>max))
			break;

		article_buffer[bufferflag] = dest;
		read_buffer+=length;
		max-=length;
		bufferflag++;
	}
	article_buffer[bufferflag] = L'\0'; /* was missing: wprintf needs a terminated string */

	/* %ls is the correct wide-string conversion; %s would expect char*. */
	wprintf(L"%ls",article_buffer);

	article_wordlist = (wordnode*)malloc(bufferflag*sizeof(wordnode));
	if(article_wordlist==NULL || bufferflag==0){
		/* Empty input or allocation failure: nothing to return. */
		free(article_wordlist);
		*article_wordcount = 0;
		return NULL;
	}

	pre_wchar=0;

	wordcount=0;
	for(i=0;i<bufferflag;i++){

		article_wordlist[wordcount].word = article_buffer[i];
		wchar = article_buffer[i];
		/* Stopwords and plain ASCII characters break the term run. */
		if(isStopword(stopwordList, wchar)==1 || wchar<=127){
			article_wordlist[wordcount].isTerm = 0;
			article_wordlist[wordcount].breakmark =1;

			if(wordcount!=0){
				article_wordlist[wordcount-1].breakmark=1;
				article_wordlist[wordcount-1].nextRatio=-1;
			}

			pre_wchar = 0;
		}
		else
		{
			if(pre_wchar==0)
				pre_wchar = wchar;
			else{
				/* Form the bigram (pre_wchar, wchar) and bump its count. */
				term[0]=pre_wchar;
				term[1]= wchar;

				if((articleTempNode=find_node(articleHashTable, 2000, term))!=NULL)
					articleTempNode->count++;
				else
					insert_node(articleHashTable, 2000, term,  1, 0);

				(*term_totalCount)++;
				pre_wchar=wchar;
			}
			article_wordlist[wordcount].isTerm=1;
			article_wordlist[wordcount].breakmark=1;
		}

		article_wordlist[wordcount].nextCount = -1;
		article_wordlist[wordcount].nextRatio = -1;

		wordcount++;
	}
	if(wordcount>0) /* guard: empty input must not write wordlist[-1] */
		article_wordlist[wordcount-1].breakmark=1;

	*article_wordcount =wordcount;

	return article_wordlist;
}
// ===== Example 2 =====
/**
 * Reads TREC-format topics from stdin. Each <top>...</top> block is
 * buffered, whitespace-collapsed, and scanned for the fields listed in
 * topicFields[] (weighted by topicFieldWeights[]). Field text is
 * normalized through translationTable, tokenized, optionally stopword-
 * filtered, and collected into a WeightedTerm query that is submitted to
 * processQuery() tagged with the topic's "Number:" as its ID.
 */
void processTREC() {
	char topic[32768];
	char line[32768];
	char thisField[32768];
	int topicLen = 0, queryLen;   // topicLen initialized: input may start mid-topic
	WeightedTerm query[1024];
	const int maxQueryTerms = (int)(sizeof(query) / sizeof(query[0]));
	while (fgets(line, sizeof(line), stdin) != NULL) {
		if ((line[0] == 0) || (line[0] == '\n'))
			continue;
		// Turn the trailing newline into a space so buffered lines join.
		line[strlen(line) - 1] = ' ';
		if (strncasecmp(line, "<top>", 5) == 0) {
			topicLen = 0;
		}
		else if (strncasecmp(line, "</top>", 6) == 0) {
			if (topicLen + (int)strlen(line) < (int)sizeof(topic))   // guard: no overflow of topic[]
				topicLen += sprintf(&topic[topicLen], "%s", line);
			// Collapse runs of spaces in the buffered topic text.
			int outPos = 1;
			for (int i = 1; topic[i] != 0; i++)
				if ((topic[i] != ' ') || (topic[outPos - 1] != ' '))
					topic[outPos++] = topic[i];
			topic[outPos] = 0;
			queryLen = 0;
			// Blank out a literal "Topic:" label so it is not tokenized.
			char *topicColon = strstr(topic, "Topic:");
			if (topicColon != NULL)
				for (int i = 0; i < 6; i++)
					topicColon[i] = ' ';
			for (int i = 0; topicFields[i] != NULL; i++) if (topicFieldWeights[i] > 0.0) {
				char *field = strstr(topic, topicFields[i]);
				if (field == NULL) {
					fprintf(stderr, "Error: Field \"%s\" not found.\n", topicFields[i]);
					continue;
				}
				field = &field[strlen(topicFields[i])];
				char *endOfField = strchr(field, '<');
				if (endOfField == NULL) {
					fprintf(stderr, "Error: No delimiter found for field \"%s\".\n", topicFields[i]);
					continue;
				}
				// Temporarily terminate the field so it can be copied out.
				*endOfField = 0;
				strcpy(thisField, field);
				*endOfField = '<';
				if (REPLACE_US_AND_UK)
					replaceUSandUK(thisField);
				// High-bit bytes (negative as signed char) become spaces;
				// the rest are mapped through translationTable.
				for (int k = 0; thisField[k] != 0; k++) {
					if (thisField[k] < 0)
						thisField[k] = ' ';
					else
						thisField[k] = translationTable[thisField[k]];
				}

				char *token = strtok(thisField, " ");
				while ((token != NULL) && (queryLen < maxQueryTerms)) {   // guard: no overflow of query[]
					if (strlen(token) < MAX_TOKEN_LEN) {
						if ((!REMOVE_STOPWORDS) || (!isStopword(token, LANGUAGE_ENGLISH))) {
							strcpy(query[queryLen].term, token);
							query[queryLen].weight = topicFieldWeights[i];
							queryLen++;
						}
					}
					token = strtok(NULL, " ");
				} // end  while (token != NULL)
			} // end for (int i = 0; topicFields[i] != NULL; i++)

			char *num = strstr(topic, "Number:");
			if (num == NULL) {   // guard: was dereferenced unconditionally before
				fprintf(stderr, "Error: Field \"Number:\" not found.\n");
				continue;
			}
			sscanf(&num[strlen("Number:")], "%s", topicID);
			char queryString[32768];
			int queryStringLen = 0;
			// "%s" so a '%' inside QUERY_COMMAND is not treated as a format.
			queryStringLen += sprintf(&queryString[queryStringLen], "%s", QUERY_COMMAND);
			queryStringLen += sprintf(&queryString[queryStringLen],
					"[count=%d][id=%s]%s", documentCount, topicID, FEEDBACK_MODE);
			processQuery(query, queryLen, queryString);
		}
		else if (topicLen + (int)strlen(line) < (int)sizeof(topic))   // guard: no overflow of topic[]
			topicLen += sprintf(&topic[topicLen], "%s", line);
	}
} // end of processTREC()
// ===== Example 3 =====
void processPlain() {
	char line[32768];
	char wumpusQuery[32768];
	int lineLen, topicLen, queryLen;
	WeightedTerm query[1024];
	while (fgets(line, sizeof(line), stdin) != NULL) {
		if ((line[0] == 0) || (line[0] == '\n'))
			continue;
		line[strlen(line) - 1] = 0;

		if (line[0] == '@') {
			processQuery(line);
			continue;
		}

		if (REPLACE_US_AND_UK)
			replaceUSandUK(line);
		for (int k = 0; line[k] != 0; k++) {
			if (line[k] < 0)
				line[k] = ' ';
			else
				line[k] = translationTable[line[k]];
		}

		// normalize input string
		for (int i = 0; line[i] != 0; i++) {
			if ((line[i] >= 'a') && (line[i] <= 'z')) {
				line[i] = line[i];
			}
			else if ((line[i] >= 'A') && (line[i] <= 'Z')) {
				line[i] = (line[i] | 32);
			}
			else if ((line[i] >= '0') && (line[i] <= '9')) {
				line[i] = line[i];
			}
			else
				line[i] = ' ';
		} // end for (int i = 0; line[i] != 0; i++)

		char *token = strtok(line, " ");
		int queryLen = 0;
		while (token != NULL) {
			if (strlen(token) < MAX_TOKEN_LEN) {
				if (!(REMOVE_STOPWORDS) || (!isStopword(token, LANGUAGE_ENGLISH))) {
					if ((queryLen == 0) && (strcmp(token, "us") == 0))
						strcpy(query[queryLen].term, "usa");
					else
						strcpy(query[queryLen].term, token);
					query[queryLen].weight = 1.0;
					queryLen++;
				}
			}
			else {
				token[MAX_TOKEN_LEN - 3] = 0;
				strcpy(query[queryLen].term, token);
				query[queryLen].weight = 1.0;
				queryLen++;
			}
			token = strtok(NULL, " ");
		} // end  while (token != NULL)

		char queryString[32768];
		int queryID, queryStringLen = 0;
		queryStringLen += sprintf(&queryString[queryStringLen], "%s", QUERY_COMMAND);
		queryStringLen += sprintf(&queryString[queryStringLen], "[count=%d]", documentCount);
		if (sscanf(query[0].term, "%d", &queryID) == 1) {
			sprintf(topicID, "%d", queryID);
			queryStringLen += sprintf(&queryString[queryStringLen], "[id=%d]", queryID);
			processQuery(&query[1], queryLen - 1, queryString);
		}
		else
			processQuery(query, queryLen, queryString);
	} // end while (fgets(line, sizeof(line), stdin) != NULL)

} // end of processPlain()
/// Tokenizes input.text_ via parse()/nextToken() and emits each accepted
/// token through the supplied hook `func`, skipping empty tokens and
/// (when bRemoveStopwords_ is set) stopwords. For alphabetic tokens it may
/// additionally emit lowercase and stemmed variants as OR-alternatives.
/// Returns the offset of the last emitted token plus one.
/// NOTE(review): the hook's parameter meanings (token, length, offset,
/// POS tag, and/or bit, level, flag) are inferred from call sites here —
/// confirm against HookType's declaration.
int CommonLanguageAnalyzer::analyze_impl(const Term& input, void* data, HookType func)
{
    parse(input.text_);

    unsigned char topAndOrBit = Term::AND;
    int tempOffset = 0;
    int lastWordOffset = -1;

    while (nextToken())
    {
        // Skip zero-length tokens produced by the tokenizer.
        if (len_ == 0)
            continue;

        // Optional stopword filtering (member-state based isStopword()).
        if (bRemoveStopwords_ && isStopword())
            continue;

/*            {
            UString foo(token_, len_); string bar; foo.convertString(bar, UString::UTF_8);
            cout << "(" << bar << ") --<> " << isIndex_ << "," << offset_ << "," << isRaw_ << "," << level_ << endl;
            }*/

        // For Chinese text: several tokens sharing one offset are treated
        // as alternatives (OR); a new offset starts a new AND term.
        if (bChinese_)
        {
            int curWordOffset = offset_;
            if (curWordOffset == lastWordOffset)
                topAndOrBit = Term::OR;
            else
                topAndOrBit = Term::AND;
            lastWordOffset = curWordOffset;
        }

        if (isIndex_)
        {
            // Special characters are emitted as-is with their own POS tag.
            if (isSpecialChar())
            {
                func(data, token_, len_, offset_, Term::SpecialCharPOS, Term::AND, level_, true);
                tempOffset = offset_;
                continue;
            }
            // Raw tokens bypass the normalization below.
            if (isRaw_)
            {
                func(data, token_, len_, offset_, pos_, Term::OR, level_, false);
                tempOffset = offset_;
                continue;
            }

            // foreign language, e.g. English
            if (isAlpha())
            {
                // Lowercase the token into the preallocated member buffer;
                // lowercaseIsDifferent records whether casing changed.
                UString::CharT* lowercaseTermUstr = lowercase_ustring_buffer_;
                bool lowercaseIsDifferent = UString::toLowerString(token_, len_,
                                            lowercase_ustring_buffer_, term_ustring_buffer_limit_);

                // UTF-8 copy of the lowercased token (needed by the stemmer).
                char* lowercaseTerm = lowercase_string_buffer_;
                UString::convertString(UString::UTF_8, lowercaseTermUstr, len_, lowercase_string_buffer_, term_string_buffer_limit_);

                UString::CharT* stemmingTermUstr = NULL;
                size_t stemmingTermUstrSize = 0;

                UString::CharT * synonymResultUstr = NULL;
                size_t synonymResultUstrLen = 0;

                if (bExtractEngStem_)
                {
                    /// TODO: write a UCS2 based stemmer
                    string stem_term;
                    pStemmer_->stem(lowercaseTerm, stem_term);
                    // Only emit a stem variant when stemming changed the word.
                    if (strcmp(stem_term.c_str(), lowercaseTerm))
                    {
                        stemmingTermUstr = stemming_ustring_buffer_;
                        stemmingTermUstrSize = UString::toUcs2(UString::UTF_8,
                                stem_term.c_str(), stem_term.size(), stemming_ustring_buffer_, term_ustring_buffer_limit_);
                    }
                }

//              if (false /*bExtractSynonym_, preprocessed*/)
//              {
//                  pSynonymContainer_ = uscSPtr_->getSynonymContainer();
//                  pSynonymContainer_->searchNgetSynonym(lowercaseTerm, pSynonymResult_);
//                  char * synonymResult = pSynonymResult_->getHeadWord(0);
//                  if (synonymResult)
//                  {
//                      size_t synonymResultLen = strlen(synonymResult);
//                      if (synonymResultLen <= term_ustring_buffer_limit_)
//                      {
//                          synonymResultUstr = synonym_ustring_buffer_;
//                          synonymResultUstrLen = UString::toUcs2(synonymEncode_,
//                                  synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_);
//                      }
//                  }
//              }

                // Multiple variants exist: emit them all as OR-alternatives
                // one level deeper. (synonymResultUstr is always NULL here
                // since the synonym path above is commented out.)
                if (stemmingTermUstr || synonymResultUstr || (bCaseSensitive_ && bContainLower_ && lowercaseIsDifferent))
                {
                    /// have more than one output
                    if (bCaseSensitive_)
                    {
                        func(data,  token_, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    else
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (stemmingTermUstr)
                    {
                        func(data, stemmingTermUstr, stemmingTermUstrSize, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (synonymResultUstr)
                    {
                        // NOTE(review): NULL passed where other calls pass a
                        // POS tag — confirm the hook accepts a null POS.
                        func(data, synonymResultUstr, synonymResultUstrLen, offset_, NULL, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (bCaseSensitive_ && bContainLower_ && lowercaseIsDifferent)
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                }
                else
                {
                    /// have only one output
                    if (bCaseSensitive_)
                    {
                        func(data,  token_, len_, offset_, Term::EnglishPOS, Term::AND, level_, false);
                        tempOffset = offset_;
                    }
                    else
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::AND, level_, false);
                        tempOffset = offset_;
                    }
                }
            }
            else
            {
//              if (false /*bExtractSynonym_, preprocessed*/)
//              {
//                  UString::CharT * synonymResultUstr = NULL;
//                  size_t synonymResultUstrLen = 0;

//                  pSynonymContainer_ = uscSPtr_->getSynonymContainer();
//                  pSynonymContainer_->searchNgetSynonym(nativeToken_, pSynonymResult_);

//                  bool hasSynonym = false;
//                  for (int i =0; i<pSynonymResult_->getSynonymCount(0); i++)
//                  {
//                      char * synonymResult = pSynonymResult_->getWord(0, i);
//                      if (synonymResult)
//                      {
//                          if (strcmp(nativeToken_, synonymResult) == 0)
//                          {
//                              //cout << "synonym self: "<<string(synonymResult) <<endl;
//                              continue;
//                          }
//                          //cout << "synonym : "<<string(synonymResult) <<endl;

//                          size_t synonymResultLen = strlen(synonymResult);
//                          if (synonymResultLen <= term_ustring_buffer_limit_)
//                          {
//                              synonymResultUstr = synonym_ustring_buffer_;
//                              synonymResultUstrLen = UString::toUcs2(synonymEncode_,
//                                      synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_);
//                          }

//                          hasSynonym = true;
//                          func(data, synonymResultUstr, synonymResultUstrLen, offset_, NULL, Term::OR, level_+1, false);
//                      }
//                  }

//                  if (hasSynonym)
//                  {
//                      func(data, token_, len_, offset_, pos_, Term::OR, level_+1, false);
//                  }
//                  else
//                  {
//                      func(data, token_, len_, offset_, pos_, topAndOrBit, level_, false);
//                  }
//              }
//              else
                {
                    // Non-alphabetic token: emit with the and/or bit chosen
                    // by the Chinese same-offset logic above.
                    func(data, token_, len_, offset_, pos_, topAndOrBit, level_, false);
                    tempOffset = offset_;
                }
            }
        }
    }
    // Offset of the last emitted token, plus one.
    return tempOffset + 1;
}