/**
 * Enumerates every (stem, category, raw form) triple held in `stemNodes`,
 * looks each one up in `map`, and passes every matching map entry to store().
 */
void StemDisambiguation::operator()()
{
    for (int nodeIdx = 0; nodeIdx < stemNodes->size(); nodeIdx++) {
        StemNode &stemNode = (*stemNodes)[nodeIdx];
        long stem_id = stemNode.stem_id;
        StemNode::CategoryVector &categories = stemNode.category_ids;

        for (int catIdx = 0; catIdx < categories.size(); catIdx++) {
            long cat_id = categories[catIdx];
            // raw_datas is addressed with the same index as category_ids
            StemNode::RawDatasEntry &rawForms = stemNode.raw_datas[catIdx];

            for (int rawIdx = 0; rawIdx < rawForms.size(); rawIdx++) {
                QString raw = rawForms[rawIdx];
                // the node's own key is currently left empty, so the stem
                // text is derived from the raw form instead
                QString stem = removeDiacritics(raw);
                ItemEntryKey entry(stem_id, cat_id, raw);

                // visit the run of consecutive entries whose key equals `entry`
                for (ItemCatRaw2AbsDescPosMapItr itr = map->find(entry);
                        itr != map->end() && itr.key() == entry;
                        ++itr) {
                    long desc_id = itr.value().second;
                    QString pos = itr.value().third;

                    // a negative description id leaves the description empty
                    QString desc;
                    if (desc_id >= 0) {
                        desc = (*database_info.descriptions)[desc_id];
                    }
                    store(stem_id, stem, raw, desc, pos);
                }
            }
        }
    }
}
/**
 * Edit distance between two words after their diacritics have been removed.
 *
 * Insertion, deletion and substitution each cost 1; two characters are treated
 * as matching (cost 0) when CharComparer() returns true for them.
 *
 * @param a first word (diacritics are stripped before comparing)
 * @param b second word (diacritics are stripped before comparing)
 * @return the value of the bottom-right cell of the distance table
 *
 * The third (unnamed) parameter is not used by this implementation.
 */
int EditDistance::Compute(QString a, QString b, bool)
{
    a = removeDiacritics(a);
    b = removeDiacritics(b);

    const int lenA = a.size();
    const int lenB = b.size();

    // Build a zero-filled (lenA + 1) x (lenB + 1) table
    QList<int> zeroRow;
    for (int col = 0; col <= lenB; col++) {
        zeroRow.append(0);
    }
    QList<QList<int> > cost;
    for (int row = 0; row <= lenA; row++) {
        cost.append(zeroRow);
    }

    // Borders: turning a prefix into the empty string (and back) costs its length
    for (int row = 0; row <= lenA; row++) {
        cost[row][0] = row;
    }
    for (int col = 0; col <= lenB; col++) {
        cost[0][col] = col;
    }

    // Fill the interior row by row
    for (int row = 1; row <= lenA; row++) {
        for (int col = 1; col <= lenB; col++) {
            if (CharComparer(a.at(row - 1), b.at(col - 1))) {
                // Matching characters: carry the diagonal over unchanged
                cost[row][col] = cost[row - 1][col - 1];
                continue;
            }
            const int deletion = cost[row - 1][col] + 1;
            const int insertion = cost[row][col - 1] + 1;
            const int substitution = cost[row - 1][col - 1] + 1;
            cost[row][col] = min(deletion, min(insertion, substitution));
        }
    }

    return cost[lenA][lenB];
}
bool DiacriticRules::on_match() { double ambiguity_reduction = 0.0; int least_ambiguity_position = -1; /** Number of Morphemes **/ int number_of_morphemes = 0; /** letter count of unvocalized word **/ int length = 0; QString vocalizedWord; QString unvocalizedWord; QVector<QString> prefixPOSs; QVector<QString> prefixes; QString stemPOS; QVector<QString> suffixPOSs; QVector<QString> suffixes; int prefix_length = 0; /** letter count of stem **/ int stem_length = 0; int suffix_length = 0; /** Get vocalized and unvocalized words **/ int prefix_infos_size = prefix_infos->size(); for (int i= 0; i<prefix_infos_size;i++) { minimal_item_info & pre = (*prefix_infos)[i]; if(!(pre.raw_data.isEmpty())) { number_of_morphemes++; vocalizedWord.append(pre.raw_data); } } prefix_length = removeDiacritics(vocalizedWord).count(); number_of_morphemes++; vocalizedWord.append(stem_info->raw_data); stem_length = removeDiacritics(stem_info->raw_data).count(); int suffix_infos_size = suffix_infos->size(); for (int i=0;i<suffix_infos_size;i++) { minimal_item_info & suff = (*suffix_infos)[i]; if(!(suff.raw_data.isEmpty())) { number_of_morphemes++; vocalizedWord.append(suff.raw_data); } } unvocalizedWord = removeDiacritics(vocalizedWord); /** Unvocalized word Character Count **/ length = unvocalizedWord.count(); suffix_length = length - (prefix_length + stem_length); /** Ambiguity of the unvocalized word **/ int unvocalizedAmbiguity = 0; WordAmbiguity wa(&unvocalizedWord, &unvocalizedAmbiguity); wa(); /** Discard this morphological solution if the unvocalized word is not ambiguous (has 1 morpho. 
solution) **/ if(unvocalizedAmbiguity < 2) { return true; } /// Select required morphological features /** Prefix Features **/ int j = 0; for (int i = (prefix_infos_size-1); (i>=0) && (j<4);i--) { minimal_item_info & pre = (*prefix_infos)[i]; if(pre.POS.isEmpty() && pre.raw_data.isEmpty()) { continue; } QStringList pre_poss = pre.POS.split('/'); if(pre_poss.count() != 2) { continue; } QString unvoc_pre_data = removeDiacritics(pre.raw_data); if(!(unvoc_pre_data.isEmpty())) { prefixes.prepend(unvoc_pre_data); } if(!(pre_poss[1].isEmpty())) { prefixPOSs.prepend(pre_poss[1]); } j++; } while(prefixes.count() < 4) { prefixes.prepend("EPRE"); } while(prefixPOSs.count() < 4) { prefixPOSs.prepend("EPREPOS"); } /** Stem Features **/ minimal_item_info & stem = *stem_info; //stem_length = removeDiacritics(stem.raw_data).count(); QStringList stem_poss = stem.POS.split('/'); if(stem_poss.count() != 2) { return true; } stemPOS = stem_poss[1]; /** Suffix Features **/ j = 0; for (int i=0;(i<suffix_infos_size) && (j<4);i++) { minimal_item_info & suff = (*suffix_infos)[i]; if(suff.POS.isEmpty() && suff.raw_data.isEmpty()) { continue; } QStringList suff_poss = suff.POS.split('/'); if(suff_poss.count() != 2) { continue; } QString unvoc_suf_data = removeDiacritics(suff.raw_data); if(!(unvoc_suf_data.isEmpty())) { suffixes.append(unvoc_suf_data); } if(!(suff_poss[1].isEmpty())) { suffixPOSs.append(suff_poss[1]); } j++; } while(suffixes.count() < 4) { suffixes.append("ESUF"); } while(suffixPOSs.count() < 4) { suffixPOSs.append("ESUFPOS"); } /// Detach diacritics from raw_data and store in separate structure int diacritic_Counter = 0; QVector<QVector<QChar> > wordDiacritics(length); int letterIndex = 0; for(int i=1; i<vocalizedWord.count(); i++) { QChar currentLetter= vocalizedWord[i]; if(isDiacritic(currentLetter)) { wordDiacritics[letterIndex].append(currentLetter); diacritic_Counter++; } else { letterIndex++; } } if(diacritic_Counter == 0) { return true; } /// Get the number of 
solutions for each solution with one diacritic /// Select diacritic position leastambiguous = least number of morphological solutions QVector<QVector<int> > diacriticAmbiguity(length); int least_ambiguity = unvocalizedAmbiguity + 1; int diacritic_Index = -1; for(int i=0; i< wordDiacritics.count(); i++) { for(j=0; j< wordDiacritics.at(i).count(); j++) { QString one_diacritic_word = unvocalizedWord; one_diacritic_word.insert(i+1,wordDiacritics[i][j]); int one_diacritic_Ambiguity = 0; WordAmbiguity wa(&one_diacritic_word, &one_diacritic_Ambiguity); wa(); if(one_diacritic_Ambiguity == 0) { diacriticAmbiguity[i].append(unvocalizedAmbiguity); } else { diacriticAmbiguity[i].append(one_diacritic_Ambiguity); } if(diacriticAmbiguity[i][j] < least_ambiguity) { least_ambiguity = diacriticAmbiguity[i][j]; least_ambiguity_position = i; diacritic_Index = j; } } } /** This weirdly happens when a word partial diacritics has ambiguity more than the unvocalized word (ex. dAn) **/ if((least_ambiguity_position == -1) || (diacritic_Index == -1)) { if(number_of_solutions == -1) { return true; } else if(solution_counter != number_of_solutions) { solution_counter++; return true; } else { return false; } } ambiguity_reduction = ((unvocalizedAmbiguity- diacriticAmbiguity[least_ambiguity_position][diacritic_Index]) * 1.0) / unvocalizedAmbiguity; /** Filter data to extract high ambiguity reduction instances **/ if(ambiguity_reduction < 0.667) { if(number_of_solutions == -1) { return true; } else if(solution_counter != number_of_solutions) { solution_counter++; return true; } else { return false; } } filtered_items++; /** Print/Use data **/ theSarf->out << number_of_morphemes << " " << length << " " << stem_length << " "; // prefixPOSs for(int i=0; i<prefixPOSs.count(); i++) { theSarf->out << prefixPOSs[i] << " "; } // prefixes for(int i=0; i< prefixes.count(); i++) { theSarf->out << prefixes[i] << " "; } // stemPOS theSarf->out << stemPOS << " "; // suffixPOSs for(int i=0; 
i<suffixPOSs.count(); i++) { theSarf->out << suffixPOSs[i] << " "; } // suffixes for(int i=0; i<suffixes.count(); i++) { theSarf->out << suffixes[i] << " "; } // least_ambiguity_position // prefixs , prefixm, prefixe , stems , stemm, steme , suffixs , suffixm, suffixe //theSarf->out << least_ambiguity_position << " "; QString diacritic_position; if((prefix_length != 0) && (least_ambiguity_position == 0)) { diacritic_position = "prefixs"; } else if((prefix_length != 0) && (least_ambiguity_position > 0) && (least_ambiguity_position < (prefix_length-1))) { diacritic_position = "prefixm"; } else if((prefix_length != 0) && (least_ambiguity_position == (prefix_length-1))) { diacritic_position = "prefixe"; } else if(least_ambiguity_position == prefix_length) { diacritic_position = "stems"; } else if((least_ambiguity_position > (prefix_length)) && (least_ambiguity_position < (prefix_length + stem_length - 1))) { diacritic_position = "stemm"; } else if(least_ambiguity_position == (prefix_length + stem_length - 1)) { diacritic_position = "steme"; } else if((suffix_length != 0) && (least_ambiguity_position == (prefix_length + stem_length))) { diacritic_position = "suffixs"; } else if((suffix_length != 0) && (least_ambiguity_position > (prefix_length + stem_length)) && (least_ambiguity_position < (length - 1))) { diacritic_position = "suffixm"; } else if((suffix_length != 0) && (least_ambiguity_position == (length -1))) { diacritic_position = "suffixe"; } else { cout << "Couldn't set diacritic position!" << endl; return false; } theSarf->out << diacritic_position << '\n'; // ambiguity_reduction //theSarf->out << ambiguity_reduction << '\n'; /** Check for number of solutions requested **/ if(number_of_solutions == -1) { return true; } else if(solution_counter != number_of_solutions) { solution_counter++; return true; } else { return false; } };
int tree::build_helper(item_types type, long cat_id1, int size, node * current) { if (size<=0) return 0; long cat_id2,cat_r_id; Search_Compatibility s2((type==PREFIX?AA:CC),cat_id1); while (s2.retrieve(cat_id2,cat_r_id)) { QString inflections=s2.getInflectionRules(); bool isAccept=isAcceptState(type,cat_r_id); if (isAccept || hasCompatibleAffixes(type,cat_r_id)) { //dont add to tree branches that have no rules and connect to nothing else that may have rules Search_by_category s3(cat_id2); #if defined(MEMORY_EXHAUSTIVE) || defined(REDUCE_THRU_DIACRITICS) all_item_info inf; while(s3.retrieve(inf)) { QString name= getColumn(interpret_type(type),"name",inf.item_id); name=removeDiacritics(name); QString inflectedRawData=inf.raw_data; applyRawDataInflections(inflections,name,inflectedRawData); //name and inflectedRawData are changed #ifdef MEMORY_EXHAUSTIVE node * next=addElement(name,inf.item_id,cat_id2,cat_r_id,isAccept,inf.raw_data,inf.description,current); #elif defined(REDUCE_THRU_DIACRITICS) node * next; #if 0 if (name.isEmpty() && inf.raw_data.isEmpty() && inf.POS.isEmpty() && inf.description().isEmpty()) { result_node * n=dynamic_cast<result_node*>(current); inf.item_id=n->get_affix_id(); inf.category_id=n->get_previous_category_id(); for (int i=0;i<n->raw_datas.size();i++) { inf.raw_data=n->raw_datas[i]; next=addElement(name,inf.item_id,cat_id2,cat_r_id,isAccept,inf.raw_data,current->parent); //check that } } else #endif next=addElement(name,inf.item_id,cat_id2,cat_r_id,isAccept,inf.raw_data,inflectedRawData,inflections,current); #endif #else long long affix_id; while(s3.retrieve(affix_id)) { QString name= getColumn(interpret_type(type),"name",affix_id); node * next=addElement(name,affix_id,cat_id2,cat_r_id,isAccept,current); #endif build_helper(type,cat_r_id,size-name.length(),next); } } } return 0; } #ifdef MEMORY_EXHAUSTIVE node* tree::addElement(QString letters, long affix_id,long category_id, long resulting_category_id,bool isAccept,QString raw_data,QString 
description,node * current) #elif defined (REDUCE_THRU_DIACRITICS) node* tree::addElement(QString letters, long affix_id,long category_id, long resulting_category_id,bool isAccept,QString raw_data, QString inflected_raw_data, QString descriptionInflectionRule,node * current) #else node* tree::addElement(QString letters, long affix_id,long category_id, long resulting_category_id,bool isAccept,node * current) #endif { #if 0 if (!equal(letters,raw_data)) raw_data.remove(" "); #endif assert (current->isLetterNode() || equal(letters,inflected_raw_data)); #ifdef LOAD_FROM_FILE if (file!=NULL) { (*file)<<letters<<affix_id<<category_id<<resulting_category_id<<isAccept #if defined (REDUCE_THRU_DIACRITICS) <<raw_data<<inflected_raw_data<<descriptionInflectionRule #elif defined (MEMORY_EXHAUSTIVE) <<raw_data<<description #endif <<generateNodeID(current); } #endif //pre-condition: assumes category_id is added to the right place and results in the appropraite resulting_category QChar current_letter; //QList<letter_node *>* current_letter_children; letter_node* matching_letter_node=NULL; if (current->isLetterNode() && current!=base) { #if 1 error << "Unexpected Error: provided node was a letter node and not a result one\n"; #ifdef LOAD_FROM_FILE if (file!=NULL) (*file)<<generateNodeID(NULL); #endif return NULL; #else current_letter='\0'; goto result; #endif } int i; if (letters.count()==0) { current_letter='\0'; if (current==base) goto result; } else current_letter=letters[0]; i=0; do { matching_letter_node=current->getLetterChild(current_letter); if (matching_letter_node!=NULL) { current=matching_letter_node; i++; current_letter=letters[i]; } else break; }while(i<letters.count()); if (letters.count()==0 && i==0) { //add null letter letter_node* new_node=new letter_node('\0'); current->addChild(new_node); current=new_node; letter_nodes++; } for (;i<letters.count();i++) { //add necessary letters letter_node* new_node=new letter_node(letters[i]); current->addChild(new_node); 
current=new_node; letter_nodes++; } result: int size=current->getResultChildren()->size(); for (int i=0;i<size;i++) //check if this result node is already present { result_node * old_result=current->getResultChildren()->at(i); if (old_result->get_previous_category_id()==category_id && old_result->get_resulting_category_id()==resulting_category_id && old_result->get_affix_id()==affix_id) { #ifdef MEMORY_EXHAUSTIVE ((result_node*)old_result)->addPair(raw_data,description); #elif defined(REDUCE_THRU_DIACRITICS) old_result->add_raw_data(raw_data, inflected_raw_data); old_result->setInflectionRule(descriptionInflectionRule); #endif #ifdef LOAD_FROM_FILE if (file!=NULL) (*file)<<generateNodeID(old_result); #endif return old_result; } } #ifdef MEMORY_EXHAUSTIVE result_node * result=new result_node(affix_id,category_id,resulting_category_id,isAccept,raw_data,description); #elif defined(REDUCE_THRU_DIACRITICS) result_node * result=new result_node(affix_id,category_id,resulting_category_id,isAccept,raw_data,inflected_raw_data); #else result_node * result=new result_node(affix_id,category_id,resulting_category_id,isAccept); #endif result->setInflectionRule(descriptionInflectionRule); current->addChild(result); current=result; result_nodes++; #ifdef LOAD_FROM_FILE if (file!=NULL) (*file)<<generateNodeID(current); #endif return current; //post-condition: returns node of resulting category reached after addition } tree::tree() { base= new letter_node('\0'); letter_nodes=1; result_nodes=0; isAffix=false; #ifdef LOAD_FROM_FILE file=NULL; #endif } tree::tree(item_types type) { base= new letter_node('\0'); letter_nodes=1; result_nodes=0; isAffix=true; #ifdef LOAD_FROM_FILE file=NULL; #endif build_affix_tree(type); }