/*
 * Decode a multibyte article buffer into wide characters, index every
 * adjacent non-ASCII character pair ("term") into articleHashTable, and
 * build a per-character word list recording term/stopword boundaries.
 *
 * read_buffer        NUL-terminated multibyte article text (consumed, not freed)
 * article_wordcount  out: number of entries written to the returned list
 * term_totalCount    in/out: incremented once per indexed character pair
 * articleHashTable   hash table (2000 buckets) keyed by 2-character terms
 * stopwordList       list consulted via isStopword() to break term chains
 *
 * Returns a malloc'd array of wordnode (caller frees), or NULL on empty
 * input / allocation failure (with *article_wordcount set to 0).
 */
wordnode* article_loadIntoHt(char* read_buffer, int* article_wordcount, long* term_totalCount, node** articleHashTable, stopwordNode* stopwordList)
{
    wchar_t wchar, pre_wchar;
    wordnode* article_wordlist;
    node* articleTempNode;
    int i;
    wchar_t term[3] = {'\0'};          /* 2-char term + NUL terminator */
    int bufferflag = 0, wordcount = 0;
    wchar_t article_buffer[2000];
    wchar_t dest;
    size_t length, max;
    mbstate_t mbs;

    printf("@\n@B:%s", read_buffer);

    /* Reset the conversion state, then decode the multibyte buffer one
     * character at a time.  Stop at NUL (length == 0), on a decode error
     * (mbrtowc returns (size_t)-1 or -2, both of which are > max), or
     * when the buffer is full. */
    mbrlen(NULL, 0, &mbs);
    max = strlen(read_buffer);
    bufferflag = 0;
    while (max > 0) {
        length = mbrtowc(&dest, read_buffer, max, &mbs);
        /* BUG FIX: cap at 1999 (was 2000) to leave room for the NUL below */
        if ((length == 0) || (length > max) || bufferflag >= 1999)
            break;
        article_buffer[bufferflag] = dest;
        read_buffer += length;
        max -= length;
        bufferflag++;
    }
    /* BUG FIX: the buffer was printed and measured without a terminator,
     * reading uninitialized memory. */
    article_buffer[bufferflag] = L'\0';

    /* BUG FIX: %ls (not %s) is the wide-string conversion for wprintf. */
    wprintf(L"%ls", article_buffer);

    if (bufferflag == 0) {
        /* BUG FIX: empty input previously fell through to
         * article_wordlist[-1].breakmark = 1 (undefined behavior). */
        *article_wordcount = 0;
        return NULL;
    }

    article_wordlist = (wordnode*)malloc(bufferflag * sizeof(wordnode));
    if (article_wordlist == NULL) {    /* BUG FIX: malloc was unchecked */
        *article_wordcount = 0;
        return NULL;
    }

    pre_wchar = 0;
    wordcount = 0;
    for (i = 0; i < bufferflag; i++) {
        article_wordlist[wordcount].word = article_buffer[i];
        wchar = article_buffer[i];
        if (isStopword(stopwordList, wchar) == 1 || wchar <= 127) {
            /* Stopwords and plain ASCII break the running term chain. */
            article_wordlist[wordcount].isTerm = 0;
            article_wordlist[wordcount].breakmark = 1;
            if (wordcount != 0) {
                article_wordlist[wordcount - 1].breakmark = 1;
                article_wordlist[wordcount - 1].nextRatio = -1;
            }
            pre_wchar = 0;
        } else {
            if (pre_wchar == 0) {
                pre_wchar = wchar;     /* first character of a new chain */
            } else {
                /* Index the bigram (pre_wchar, wchar): bump its count if
                 * present, otherwise insert it with count 1. */
                term[0] = pre_wchar;
                term[1] = wchar;
                if ((articleTempNode = find_node(articleHashTable, 2000, term)) != NULL)
                    articleTempNode->count++;
                else
                    insert_node(articleHashTable, 2000, term, 1, 0);
                (*term_totalCount)++;
                pre_wchar = wchar;
            }
            article_wordlist[wordcount].isTerm = 1;
            article_wordlist[wordcount].breakmark = 1;
        }
        article_wordlist[wordcount].nextCount = -1;
        article_wordlist[wordcount].nextRatio = -1;
        wordcount++;
    }
    article_wordlist[wordcount - 1].breakmark = 1;
    *article_wordcount = wordcount;
    return article_wordlist;
}
void processTREC() { char topic[32768]; char line[32768]; char thisField[32768]; int lineLen, topicLen, queryLen; WeightedTerm query[1024]; while (fgets(line, sizeof(line), stdin) != NULL) { if ((line[0] == 0) || (line[0] == '\n')) continue; line[strlen(line) - 1] = ' '; if (strncasecmp(line, "<top>", 5) == 0) { topicLen = 0; } else if (strncasecmp(line, "</top>", 6) == 0) { topicLen += sprintf(&topic[topicLen], "%s", line); int outPos = 1; for (int i = 1; topic[i] != 0; i++) if ((topic[i] != ' ') || (topic[outPos - 1] != ' ')) topic[outPos++] = topic[i]; topic[outPos] = 0; queryLen = 0; char *topicColon = strstr(topic, "Topic:"); if (topicColon != NULL) for (int i = 0; i < 6; i++) topicColon[i] = ' '; for (int i = 0; topicFields[i] != NULL; i++) if (topicFieldWeights[i] > 0.0) { char *field = strstr(topic, topicFields[i]); if (field == NULL) { fprintf(stderr, "Error: Field \"%s\" not found.\n", topicFields[i]); continue; } field = &field[strlen(topicFields[i])]; char *endOfField = strchr(field, '<'); if (endOfField == NULL) { fprintf(stderr, "Error: No delimiter found for field \"%s\".\n", topicFields[i]); continue; } *endOfField = 0; strcpy(thisField, field); *endOfField = '<'; if (REPLACE_US_AND_UK) replaceUSandUK(thisField); for (int k = 0; thisField[k] != 0; k++) { if (thisField[k] < 0) thisField[k] = ' '; else thisField[k] = translationTable[thisField[k]]; } char *token = strtok(thisField, " "); while (token != NULL) { if (strlen(token) < MAX_TOKEN_LEN) { if ((!REMOVE_STOPWORDS) || (!isStopword(token, LANGUAGE_ENGLISH))) { strcpy(query[queryLen].term, token); query[queryLen].weight = topicFieldWeights[i]; queryLen++; } } token = strtok(NULL, " "); } // end while (token != NULL) } // end for (int i = 0; topicFields[i] != NULL; i++) char *num = strstr(topic, "Number:"); sscanf(&num[strlen("Number:")], "%s", topicID); char queryString[32768]; int queryStringLen = 0; queryStringLen += sprintf(&queryString[queryStringLen], QUERY_COMMAND); queryStringLen += 
sprintf(&queryString[queryStringLen], "[count=%d][id=%s]%s", documentCount, topicID, FEEDBACK_MODE); processQuery(query, queryLen, queryString); } else topicLen += sprintf(&topic[topicLen], "%s", line); } } // end of processTREC()
void processPlain() { char line[32768]; char wumpusQuery[32768]; int lineLen, topicLen, queryLen; WeightedTerm query[1024]; while (fgets(line, sizeof(line), stdin) != NULL) { if ((line[0] == 0) || (line[0] == '\n')) continue; line[strlen(line) - 1] = 0; if (line[0] == '@') { processQuery(line); continue; } if (REPLACE_US_AND_UK) replaceUSandUK(line); for (int k = 0; line[k] != 0; k++) { if (line[k] < 0) line[k] = ' '; else line[k] = translationTable[line[k]]; } // normalize input string for (int i = 0; line[i] != 0; i++) { if ((line[i] >= 'a') && (line[i] <= 'z')) { line[i] = line[i]; } else if ((line[i] >= 'A') && (line[i] <= 'Z')) { line[i] = (line[i] | 32); } else if ((line[i] >= '0') && (line[i] <= '9')) { line[i] = line[i]; } else line[i] = ' '; } // end for (int i = 0; line[i] != 0; i++) char *token = strtok(line, " "); int queryLen = 0; while (token != NULL) { if (strlen(token) < MAX_TOKEN_LEN) { if (!(REMOVE_STOPWORDS) || (!isStopword(token, LANGUAGE_ENGLISH))) { if ((queryLen == 0) && (strcmp(token, "us") == 0)) strcpy(query[queryLen].term, "usa"); else strcpy(query[queryLen].term, token); query[queryLen].weight = 1.0; queryLen++; } } else { token[MAX_TOKEN_LEN - 3] = 0; strcpy(query[queryLen].term, token); query[queryLen].weight = 1.0; queryLen++; } token = strtok(NULL, " "); } // end while (token != NULL) char queryString[32768]; int queryID, queryStringLen = 0; queryStringLen += sprintf(&queryString[queryStringLen], "%s", QUERY_COMMAND); queryStringLen += sprintf(&queryString[queryStringLen], "[count=%d]", documentCount); if (sscanf(query[0].term, "%d", &queryID) == 1) { sprintf(topicID, "%d", queryID); queryStringLen += sprintf(&queryString[queryStringLen], "[id=%d]", queryID); processQuery(&query[1], queryLen - 1, queryString); } else processQuery(query, queryLen, queryString); } // end while (fgets(line, sizeof(line), stdin) != NULL) } // end of processPlain()
/// Tokenize `input` and emit every index term through the `func` hook.
///
/// @param input  term whose `text_` member is parsed by the tokenizer
/// @param data   opaque context forwarded unchanged to every `func` call
/// @param func   callback receiving (data, term, len, offset, pos, and/or
///               flag, level, isSpecialChar) for each emitted term
/// @return       offset of the last emitted term plus one
int CommonLanguageAnalyzer::analyze_impl(const Term& input, void* data, HookType func)
{
    parse(input.text_);
    // AND/OR flag attached to the next emitted term.  For Chinese text,
    // consecutive tokens sharing the same word offset are alternatives
    // (OR); a new offset starts a fresh AND term.
    unsigned char topAndOrBit = Term::AND;
    int tempOffset = 0;       // offset of the most recently emitted term
    int lastWordOffset = -1;  // previous token's offset (Chinese OR detection)
    while (nextToken())
    {
        if (len_ == 0)
            continue;  // skip empty tokens
        if (bRemoveStopwords_ && isStopword())
            continue;  // skip stopwords when filtering is enabled
        /* debug dump of the current token — left for troubleshooting:
        {
            UString foo(token_, len_);
            string bar;
            foo.convertString(bar, UString::UTF_8);
            cout << "(" << bar << ") --<> " << isIndex_ << "," << offset_ << "," << isRaw_ << "," << level_ << endl;
        }*/
        if (bChinese_)
        {
            // Same offset as the previous token => alternative segmentation
            // of the same word, so link with OR instead of AND.
            int curWordOffset = offset_;
            if (curWordOffset == lastWordOffset)
                topAndOrBit = Term::OR;
            else
                topAndOrBit = Term::AND;
            lastWordOffset = curWordOffset;
        }
        if (isIndex_)
        {
            // Special characters are emitted as-is with a dedicated POS.
            if (isSpecialChar())
            {
                func(data, token_, len_, offset_, Term::SpecialCharPOS, Term::AND, level_, true);
                tempOffset = offset_;
                continue;
            }
            // Raw tokens bypass all normalization below.
            if (isRaw_)
            {
                func(data, token_, len_, offset_, pos_, Term::OR, level_, false);
                tempOffset = offset_;
                continue;
            }
            // foreign language, e.g. English
            if (isAlpha())
            {
                // Lowercase the token into the reusable member buffers, both
                // as UCS-2 (for emission) and UTF-8 (for the stemmer).
                UString::CharT* lowercaseTermUstr = lowercase_ustring_buffer_;
                bool lowercaseIsDifferent = UString::toLowerString(token_, len_,
                    lowercase_ustring_buffer_, term_ustring_buffer_limit_);
                char* lowercaseTerm = lowercase_string_buffer_;
                UString::convertString(UString::UTF_8, lowercaseTermUstr, len_,
                    lowercase_string_buffer_, term_string_buffer_limit_);

                UString::CharT* stemmingTermUstr = NULL;
                size_t stemmingTermUstrSize = 0;
                UString::CharT* synonymResultUstr = NULL;  // only set by the
                size_t synonymResultUstrLen = 0;           // disabled code below

                if (bExtractEngStem_)
                {
                    /// TODO: write a UCS2 based stemmer
                    // Emit a stem only when it differs from the lowercase form.
                    string stem_term;
                    pStemmer_->stem(lowercaseTerm, stem_term);
                    if (strcmp(stem_term.c_str(), lowercaseTerm))
                    {
                        stemmingTermUstr = stemming_ustring_buffer_;
                        stemmingTermUstrSize = UString::toUcs2(UString::UTF_8,
                            stem_term.c_str(), stem_term.size(),
                            stemming_ustring_buffer_, term_ustring_buffer_limit_);
                    }
                }

                // Synonym expansion is disabled (handled in preprocessing):
                // if (false /*bExtractSynonym_, preprocessed*/)
                // {
                //     pSynonymContainer_ = uscSPtr_->getSynonymContainer();
                //     pSynonymContainer_->searchNgetSynonym(lowercaseTerm, pSynonymResult_);
                //     char * synonymResult = pSynonymResult_->getHeadWord(0);
                //     if (synonymResult)
                //     {
                //         size_t synonymResultLen = strlen(synonymResult);
                //         if (synonymResultLen <= term_ustring_buffer_limit_)
                //         {
                //             synonymResultUstr = synonym_ustring_buffer_;
                //             synonymResultUstrLen = UString::toUcs2(synonymEncode_,
                //                 synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_);
                //         }
                //     }
                // }

                if (stemmingTermUstr || synonymResultUstr
                    || (bCaseSensitive_ && bContainLower_ && lowercaseIsDifferent))
                {
                    /// have more than one output
                    // All variants of the same token are emitted as OR
                    // alternatives one level deeper.
                    if (bCaseSensitive_)
                    {
                        func(data, token_, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    else
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (stemmingTermUstr)
                    {
                        func(data, stemmingTermUstr, stemmingTermUstrSize, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (synonymResultUstr)
                    {
                        func(data, synonymResultUstr, synonymResultUstrLen, offset_, NULL, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (bCaseSensitive_ && bContainLower_ && lowercaseIsDifferent)
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                }
                else
                {
                    /// have only one output
                    if (bCaseSensitive_)
                    {
                        func(data, token_, len_, offset_, Term::EnglishPOS, Term::AND, level_, false);
                        tempOffset = offset_;
                    }
                    else
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::AND, level_, false);
                        tempOffset = offset_;
                    }
                }
            }
            else
            {
                // Non-alpha (e.g. CJK) token.  Synonym expansion here is also
                // disabled (handled in preprocessing):
                // if (false /*bExtractSynonym_, preprocessed*/)
                // {
                //     UString::CharT * synonymResultUstr = NULL;
                //     size_t synonymResultUstrLen = 0;
                //     pSynonymContainer_ = uscSPtr_->getSynonymContainer();
                //     pSynonymContainer_->searchNgetSynonym(nativeToken_, pSynonymResult_);
                //     bool hasSynonym = false;
                //     for (int i =0; i<pSynonymResult_->getSynonymCount(0); i++)
                //     {
                //         char * synonymResult = pSynonymResult_->getWord(0, i);
                //         if (synonymResult)
                //         {
                //             if (strcmp(nativeToken_, synonymResult) == 0)
                //             {
                //                 //cout << "synonym self: "<<string(synonymResult) <<endl;
                //                 continue;
                //             }
                //             //cout << "synonym : "<<string(synonymResult) <<endl;
                //             size_t synonymResultLen = strlen(synonymResult);
                //             if (synonymResultLen <= term_ustring_buffer_limit_)
                //             {
                //                 synonymResultUstr = synonym_ustring_buffer_;
                //                 synonymResultUstrLen = UString::toUcs2(synonymEncode_,
                //                     synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_);
                //             }
                //             hasSynonym = true;
                //             func(data, synonymResultUstr, synonymResultUstrLen, offset_, NULL, Term::OR, level_+1, false);
                //         }
                //     }
                //     if (hasSynonym)
                //     {
                //         func(data, token_, len_, offset_, pos_, Term::OR, level_+1, false);
                //     }
                //     else
                //     {
                //         func(data, token_, len_, offset_, pos_, topAndOrBit, level_, false);
                //     }
                // }
                // else
                {
                    // Emit the token with the Chinese AND/OR linkage computed above.
                    func(data, token_, len_, offset_, pos_, topAndOrBit, level_, false);
                    tempOffset = offset_;
                }
            }
        }
    }
    return tempOffset + 1;
}