inline void distinguish::segment(int width, int height, BYTE*& data) { int i = 0; int j = 0; int ct = 0; int s_horizon[CHARACTER_NUM]; int e_horizon[CHARACTER_NUM]; vector<int> rows; #ifdef DEBUG_ERROR printf("width=%d,height=%d\n", width, height); printf("c_BLACK=%d\n", c_BLACK); #endif for(i = 0; i < width; i++) { for(j = 0; j < height && data[j * width + i] < p_value; j++); if(j == height) { rows.push_back(i); } #ifdef DEBUG_ERROR printf("%d,rows.size(%d)=%d\n",data[j * width + i], i, j); #endif } if(rows[rows.size() - 1] != width - 1) { rows.push_back(width - 1); } #ifdef DEBUG_ERROR printf("rows.size()=%d\n", rows.size()); #endif for(i = 1; i < int(rows.size()); i++) { if(rows[i] - rows[i - 1] >= MIN_WIDTH) { s_horizon[ct] = rows[i - 1]; e_horizon[ct] = rows[i]; subsegment(width, height, rows[i - 1], rows[i], data); ct++; } #ifdef DEBUG_ERROR printf("%d,%d\n", rows[i - 1], rows[i]); #endif } #ifdef DEBUG_ERROR printf("%d\n", ct); #endif }
bool CommonLanguageAnalyzer::analyze_synonym_impl(const izenelib::util::UString& inputString, SynonymOutputType& synonymOutput) { // cout << "[CommonLanguageAnalyzer::analyze_synonym_impl] "; // inputString.displayStringValue(izenelib::util::UString::UTF_8); // cout << endl; bool retFoundSynonym = false; const UString::CharT* pInput = inputString.c_str(); size_t length = inputString.length(); // ensure length size_t ustring_buffer_size_ = term_ustring_buffer_limit_; if (ustring_buffer_size_ < length+1) { ustring_buffer_size_ = length+1; delete lowercase_ustring_buffer_; lowercase_ustring_buffer_ = new UString::CharT[ustring_buffer_size_]; } size_t string_buffer_size = term_string_buffer_limit_; if (string_buffer_size < length*4) { string_buffer_size = length*4; delete lowercase_string_buffer_; lowercase_string_buffer_ = new char[string_buffer_size]; } // to low case UString::CharT* lowercaseTermUstr = lowercase_ustring_buffer_; bool lowercaseIsDifferent = UString::toLowerString(pInput, length, lowercase_ustring_buffer_, term_ustring_buffer_limit_); if (lowercaseIsDifferent) pInput = lowercaseTermUstr; // convert input string to utf8 characters char* chars = lowercase_string_buffer_; vector<size_t> charOffs; charOffs.reserve(length); size_t preCharLen = -1; size_t curOff = 0; for (size_t i = 0; i < length; i++) { curOff += (preCharLen+1); charOffs[i] = curOff; preCharLen = UString::convertString(UString::UTF_8, pInput+i, 1, chars+curOff, 4); //cout << chars+curOff <<" "<<curOff<<" " << preCharLen <<endl; } // search synonym dict for input izenelib::am::StrBasedVTrie strTrie(pSynonymContainer_->getData()); UString::CharT * synonymResultUstr = NULL; size_t synonymResultUstrLen = 0; size_t curIdx = 0; size_t startIdx = 0; size_t wordEndIdx; VTrieNode endNode; while (curIdx < length) { wordEndIdx = size_t(-1); char* pch = chars+charOffs[curIdx]; strTrie.firstSearch(pch); if (strTrie.completeSearch == false || strTrie.node->moreLong == false) { curIdx ++; continue; } if (strTrie.exists()) { wordEndIdx = curIdx; strTrie.getCurrentNode(endNode); } //else // uncomment if minimum match { for (size_t j = curIdx + 1; j < length; j ++) { char* pnch = chars+charOffs[j]; strTrie.search(pnch); if (strTrie.completeSearch == false) break; // matched a word if (strTrie.exists()) { wordEndIdx = j; strTrie.getCurrentNode(endNode); //break; // uncomment if minimum match } } } // if matched a synonym if (wordEndIdx != size_t(-1)) { retFoundSynonym = true; // segment with out synonym if (startIdx < curIdx) { std::vector<UString> segment; UString subsegment(pInput+startIdx, curIdx-startIdx); //subsegment.displayStringValue(izenelib::util::UString::UTF_8); cout << endl;/// segment.push_back(subsegment); synonymOutput.push_back(segment); } // segment with synonyms std::vector<UString> segment; //UString(pInput+curIdx,wordEndIdx+1-curIdx).displayStringValue(izenelib::util::UString::UTF_8); cout <<" [has synonym] "; pSynonymContainer_->setSynonym(pSynonymResult_, &endNode); size_t cnt, idx = 0; set<UString> synonymSet; // avoid duplication do { cnt = pSynonymResult_->getSynonymCount(idx); for (size_t off = 0; off < cnt; off++) { char * synonymResult = pSynonymResult_->getWord(idx, off); if (synonymResult) { size_t synonymResultLen = strlen(synonymResult); if (synonymResultLen <= term_ustring_buffer_limit_) { synonymResultUstr = synonym_ustring_buffer_; synonymResultUstrLen = UString::toUcs2(synonymEncode_, synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_); } UString synonym(synonymResultUstr, synonymResultUstrLen); if (synonymSet.find(synonym) != synonymSet.end()) continue; synonymSet.insert(synonym); segment.push_back(synonym); } } idx++; } while (cnt > 0); synonymOutput.push_back(segment); curIdx = wordEndIdx+1; startIdx = curIdx; } else { curIdx++; } } if (!retFoundSynonym) return false; if (startIdx < curIdx) { std::vector<UString> segment; UString subsegment(pInput+startIdx, curIdx-startIdx); //subsegment.displayStringValue(izenelib::util::UString::UTF_8); cout << endl; /// segment.push_back(subsegment); synonymOutput.push_back(segment); } return true; }