예제 #1
0
/**
 * Visits every (stem node, category, raw form) combination held in stemNodes,
 * looks the combination up in the map and calls store() once for each map
 * entry whose key equals it.
 *
 * The undiacritized stem text is derived from the raw form with
 * removeDiacritics(); the description is looked up in
 * database_info.descriptions only when the stored description id is
 * non-negative, otherwise an empty string is passed on.
 */
void StemDisambiguation::operator()() {
    for (int nodeIdx = 0; nodeIdx < stemNodes->size(); nodeIdx++) {
        StemNode &node = (*stemNodes)[nodeIdx];
        StemNode::CategoryVector &categories = node.category_ids;
        long stemId = node.stem_id;

        for (int catIdx = 0; catIdx < categories.size(); catIdx++) {
            // raw_datas is indexed with the same position as category_ids
            StemNode::RawDatasEntry &rawForms = node.raw_datas[catIdx];
            long categoryId = categories[catIdx];

            for (int rawIdx = 0; rawIdx < rawForms.size(); rawIdx++) {
                QString rawForm = rawForms[rawIdx];
                // the stem text is rebuilt from the raw form (the original
                // author notes that n.key is currently left empty)
                QString bareStem = removeDiacritics(rawForm);
                ItemEntryKey wanted(stemId, categoryId, rawForm);

                // walk the run of entries that share the wanted key
                for (ItemCatRaw2AbsDescPosMapItr hit = map->find(wanted);
                        hit != map->end() && hit.key() == wanted;
                        hit++) {
                    QString pos = hit.value().third;
                    long descriptionId = hit.value().second;

                    QString description;
                    if (descriptionId >= 0) {
                        description = (*database_info.descriptions)[descriptionId];
                    }

                    store(stemId, bareStem, rawForm, description, pos);
                }
            }
        }
    }
}
예제 #2
0
/**
 * Computes the edit distance between a and b after both have been passed
 * through removeDiacritics().
 *
 * Characters are considered equal when CharComparer() says so; every
 * deletion, insertion or substitution costs 1. The unnamed bool parameter is
 * ignored.
 *
 * @return the value in the last cell of the distance table.
 */
int EditDistance::Compute(QString a, QString b, bool) {
    a = removeDiacritics(a);
    b = removeDiacritics(b);

    const int rowCount = a.size() + 1;
    const int colCount = b.size() + 1;

    // Build a rowCount x colCount table filled with zeros
    QList<int> zeroRow;
    for (int col = 0; col < colCount; col++) {
        zeroRow.append(0);
    }

    QList<QList<int> > table;
    for (int row = 0; row < rowCount; row++) {
        table.append(zeroRow);
    }

    // Borders: turning a prefix into the empty string costs its length
    for (int row = 0; row < rowCount; row++) {
        table[row][0] = row;
    }

    for (int col = 0; col < colCount; col++) {
        table[0][col] = col;
    }

    // Fill the interior row by row
    for (int row = 1; row < rowCount; row++) {
        for (int col = 1; col < colCount; col++) {
            if (CharComparer(a.at(row - 1), b.at(col - 1))) {
                // Matching characters: carry the diagonal value over
                table[row][col] = table[row - 1][col - 1];
                continue;
            }

            int cheapest = table[row - 1][col - 1] + 1;   // Substitution

            const int insertion = table[row][col - 1] + 1;
            if (insertion < cheapest) {
                cheapest = insertion;
            }

            const int deletion = table[row - 1][col] + 1;
            if (deletion < cheapest) {
                cheapest = deletion;
            }

            table[row][col] = cheapest;
        }
    }

    return table[rowCount - 1][colCount - 1];
}
예제 #3
0
/**
 * Shared tail of on_match(): applies the "number of solutions" bookkeeping.
 *
 * Returns true when limit is -1 (which appears to mean "no limit" - confirm
 * against the class declaration). Otherwise returns true and increments
 * counter while counter differs from limit, and returns false once they are
 * equal.
 */
template <typename Limit, typename Counter>
static bool diacriticRulesWantsMoreSolutions(const Limit &limit, Counter &counter)
{
    if(limit == -1) {
        return true;
    }
    if(counter != limit) {
        counter++;
        return true;
    }
    return false;
}

/**
 * Handles one morphological solution (prefix_infos, stem_info, suffix_infos).
 *
 * Steps performed:
 *  - rebuilds the vocalized word from the non-empty raw_data of the morphemes
 *    and derives the unvocalized word with removeDiacritics();
 *  - returns early (true) when WordAmbiguity reports fewer than 2 solutions for
 *    the unvocalized word, when the stem POS does not split into exactly two
 *    '/'-separated parts, or when the word carries no diacritic;
 *  - for every single diacritic of the word, measures the ambiguity of the
 *    unvocalized word with only that diacritic inserted, and remembers the
 *    position with the smallest ambiguity;
 *  - when the relative ambiguity reduction is at least 0.667, increments
 *    filtered_items and writes one feature row to theSarf->out.
 *
 * @return false when the requested number of solutions has been reached or when
 *         the diacritic position cannot be classified; true otherwise. The
 *         caller presumably stops enumerating solutions on false - confirm.
 */
bool DiacriticRules::on_match()
{
    double ambiguity_reduction = 0.0;
    int least_ambiguity_position = -1;

    /** Number of Morphemes **/
    int number_of_morphemes = 0;

    /** letter count of unvocalized word **/
    int length = 0;

    QString vocalizedWord;
    QString unvocalizedWord;
    QVector<QString> prefixPOSs;
    QVector<QString> prefixes;
    QString stemPOS;
    QVector<QString> suffixPOSs;
    QVector<QString> suffixes;

    int prefix_length = 0;
    /** letter count of stem **/
    int stem_length = 0;
    int suffix_length = 0;

    /** Get vocalized and unvocalized words **/
    int prefix_infos_size = prefix_infos->size();
    for (int i= 0; i<prefix_infos_size;i++) {
        minimal_item_info & pre = (*prefix_infos)[i];
        if(!(pre.raw_data.isEmpty())) {
            number_of_morphemes++;
            vocalizedWord.append(pre.raw_data);
        }
    }
    /** At this point vocalizedWord holds the prefixes only **/
    prefix_length = removeDiacritics(vocalizedWord).count();

    number_of_morphemes++;
    vocalizedWord.append(stem_info->raw_data);
    stem_length = removeDiacritics(stem_info->raw_data).count();

    int suffix_infos_size = suffix_infos->size();
    for (int i=0;i<suffix_infos_size;i++) {
        minimal_item_info & suff = (*suffix_infos)[i];
        if(!(suff.raw_data.isEmpty())) {
            number_of_morphemes++;
            vocalizedWord.append(suff.raw_data);
        }
    }

    unvocalizedWord = removeDiacritics(vocalizedWord);

    /** Unvocalized word Character Count **/
    length = unvocalizedWord.count();
    suffix_length = length - (prefix_length + stem_length);

    /** Ambiguity of the unvocalized word **/
    int unvocalizedAmbiguity = 0;
    WordAmbiguity wa(&unvocalizedWord, &unvocalizedAmbiguity);
    wa();

    /** Discard this morphological solution if the unvocalized word is not ambiguous (has 1 morpho. solution) **/
    if(unvocalizedAmbiguity < 2) {
        return true;
    }

    /// Select required morphological features

    /** Prefix Features: walk backwards and keep at most 4 usable prefixes **/
    int j = 0;
    for (int i = (prefix_infos_size-1); (i>=0) && (j<4);i--) {
        minimal_item_info & pre = (*prefix_infos)[i];
        if(pre.POS.isEmpty() && pre.raw_data.isEmpty()) {
            continue;
        }

        /** Only POS values of the form "x/y" are used; y is the feature **/
        QStringList pre_poss = pre.POS.split('/');
        if(pre_poss.count() != 2) {
            continue;
        }
        QString unvoc_pre_data = removeDiacritics(pre.raw_data);
        if(!(unvoc_pre_data.isEmpty())) {
            prefixes.prepend(unvoc_pre_data);
        }
        if(!(pre_poss[1].isEmpty())) {
            prefixPOSs.prepend(pre_poss[1]);
        }
        j++;
    }

    /** Pad on the left so that both prefix feature lists have 4 entries **/
    while(prefixes.count() < 4) {
        prefixes.prepend("EPRE");
    }

    while(prefixPOSs.count() < 4) {
        prefixPOSs.prepend("EPREPOS");
    }

    /** Stem Features **/
    minimal_item_info & stem = *stem_info;
    QStringList stem_poss = stem.POS.split('/');
    if(stem_poss.count() != 2) {
        return true;
    }
    stemPOS = stem_poss[1];

    /** Suffix Features: walk forwards and keep at most 4 usable suffixes **/
    j = 0;
    for (int i=0;(i<suffix_infos_size) && (j<4);i++) {
        minimal_item_info & suff = (*suffix_infos)[i];
        if(suff.POS.isEmpty() && suff.raw_data.isEmpty()) {
            continue;
        }

        QStringList suff_poss = suff.POS.split('/');
        if(suff_poss.count() != 2) {
            continue;
        }
        QString unvoc_suf_data = removeDiacritics(suff.raw_data);
        if(!(unvoc_suf_data.isEmpty())) {
            suffixes.append(unvoc_suf_data);
        }
        if(!(suff_poss[1].isEmpty())) {
            suffixPOSs.append(suff_poss[1]);
        }
        j++;
    }

    /** Pad on the right so that both suffix feature lists have 4 entries **/
    while(suffixes.count() < 4) {
        suffixes.append("ESUF");
    }

    while(suffixPOSs.count() < 4) {
        suffixPOSs.append("ESUFPOS");
    }

    /// Detach diacritics from raw_data and store in separate structure
    /// wordDiacritics[k] holds the diacritics that follow the k-th letter.
    int diacritic_Counter = 0;
    QVector<QVector<QChar> > wordDiacritics(length);
    /** -1 = no base letter seen yet; the scan starts at index 0 so that a word
        beginning with a diacritic cannot shift the letter positions **/
    int letterIndex = -1;
    for(int i=0; i<vocalizedWord.count(); i++) {
        QChar currentLetter= vocalizedWord[i];
        if(!isDiacritic(currentLetter)) {
            letterIndex++;
            continue;
        }
        /** A diacritic without a preceding letter, or one that would fall
            outside wordDiacritics, has no slot to go to: skip it instead of
            indexing out of range **/
        if((letterIndex < 0) || (letterIndex >= length)) {
            continue;
        }
        wordDiacritics[letterIndex].append(currentLetter);
        diacritic_Counter++;
    }

    if(diacritic_Counter == 0) {
        return true;
    }

    /// Get the number of solutions for each solution with one diacritic
    /// Select diacritic position leastambiguous = least number of morphological solutions
    QVector<QVector<int> > diacriticAmbiguity(length);
    int least_ambiguity = unvocalizedAmbiguity + 1;
    int diacritic_Index = -1;
    for(int i=0; i< wordDiacritics.count(); i++) {
        for(j=0; j< wordDiacritics.at(i).count(); j++) {
            /** Unvocalized word plus exactly one diacritic after letter i **/
            QString one_diacritic_word = unvocalizedWord;
            one_diacritic_word.insert(i+1,wordDiacritics[i][j]);

            int one_diacritic_Ambiguity = 0;
            WordAmbiguity wa(&one_diacritic_word, &one_diacritic_Ambiguity);
            wa();

            /** A reported ambiguity of 0 is replaced by the unvocalized ambiguity **/
            if(one_diacritic_Ambiguity == 0) {
                diacriticAmbiguity[i].append(unvocalizedAmbiguity);
            }
            else {
                diacriticAmbiguity[i].append(one_diacritic_Ambiguity);
            }

            if(diacriticAmbiguity[i][j] <  least_ambiguity) {
                least_ambiguity = diacriticAmbiguity[i][j];
                least_ambiguity_position = i;
                diacritic_Index = j;
            }
        }
    }

    /** This weirdly happens when a word partial diacritics has ambiguity more than the unvocalized word (ex. dAn) **/
    if((least_ambiguity_position == -1) || (diacritic_Index == -1)) {
        return diacriticRulesWantsMoreSolutions(number_of_solutions, solution_counter);
    }

    /** unvocalizedAmbiguity is at least 2 here, so the division is safe **/
    ambiguity_reduction = ((unvocalizedAmbiguity- diacriticAmbiguity[least_ambiguity_position][diacritic_Index]) * 1.0) / unvocalizedAmbiguity;

    /** Filter data to extract high ambiguity reduction instances **/
    if(ambiguity_reduction < 0.667) {
        return diacriticRulesWantsMoreSolutions(number_of_solutions, solution_counter);
    }
    filtered_items++;

    /** Print/Use data **/

    theSarf->out << number_of_morphemes << "  "
            << length << "  "
            << stem_length << "  ";

    //    prefixPOSs
    for(int i=0; i<prefixPOSs.count(); i++) {
        theSarf->out << prefixPOSs[i] << "  ";
    }

    //    prefixes
    for(int i=0; i< prefixes.count(); i++) {
        theSarf->out << prefixes[i] << "  ";
    }

    //    stemPOS
    theSarf->out << stemPOS << "  ";

    //    suffixPOSs
    for(int i=0; i<suffixPOSs.count(); i++) {
        theSarf->out << suffixPOSs[i] << "  ";
    }

    //    suffixes
    for(int i=0; i<suffixes.count(); i++) {
        theSarf->out << suffixes[i] << "  ";
    }

    //    least_ambiguity_position, expressed as one of
    //    prefixs , prefixm, prefixe , stems , stemm, steme , suffixs , suffixm, suffixe
    //    (start / middle / end of the prefix, stem or suffix letter range)
    QString diacritic_position;

    if((prefix_length != 0) && (least_ambiguity_position == 0)) {
        diacritic_position = "prefixs";
    }
    else if((prefix_length != 0) && (least_ambiguity_position > 0) && (least_ambiguity_position < (prefix_length-1))) {
        diacritic_position = "prefixm";
    }
    else if((prefix_length != 0) && (least_ambiguity_position == (prefix_length-1))) {
        diacritic_position = "prefixe";
    }
    else if(least_ambiguity_position == prefix_length) {
        diacritic_position = "stems";
    }
    else if((least_ambiguity_position > (prefix_length)) && (least_ambiguity_position < (prefix_length + stem_length - 1))) {
        diacritic_position = "stemm";
    }
    else if(least_ambiguity_position == (prefix_length + stem_length - 1)) {
        diacritic_position = "steme";
    }
    else if((suffix_length != 0) && (least_ambiguity_position == (prefix_length + stem_length))) {
        diacritic_position = "suffixs";
    }
    else if((suffix_length != 0) && (least_ambiguity_position > (prefix_length + stem_length)) && (least_ambiguity_position < (length - 1))) {
        diacritic_position = "suffixm";
    }
    else if((suffix_length != 0) && (least_ambiguity_position == (length -1))) {
        diacritic_position = "suffixe";
    }
    else {
        cout << "Couldn't set diacritic position!" << endl;
        return false;
    }
    theSarf->out << diacritic_position << '\n';

    /** Check for number of solutions requested **/
    return diacriticRulesWantsMoreSolutions(number_of_solutions, solution_counter);
}
예제 #4
0
파일: tree.cpp 프로젝트: ZoeLeBlanc/atmine
// Recursively grows the affix tree underneath `current`.
//
// For every compatibility rule that Search_Compatibility returns for cat_id1
// (rule kind AA when type is PREFIX, CC otherwise), every item of the
// compatible category cat_id2 is added below `current` with addElement(), and
// the function then recurses from the node addElement() returned, using the
// resulting category cat_r_id and `size` reduced by the length of the item's
// name.
//
// type     - affix type being built; selects the rule kind and the table
//            queried through interpret_type().
// cat_id1  - category whose compatible categories are expanded.
// size     - remaining letter budget; the recursion stops once it is <= 0.
// current  - node under which the new elements are attached.
//
// Always returns 0.
//
// NOTE: the opening brace of the inner while loop lives inside the
// preprocessor branches, while its closing brace is shared after the #endif.
int tree::build_helper(item_types type, long cat_id1, int size, node * current)
{
    if (size<=0)
		return 0;
    long cat_id2,cat_r_id;
    Search_Compatibility s2((type==PREFIX?AA:CC),cat_id1);
    while (s2.retrieve(cat_id2,cat_r_id))
    {
		QString inflections=s2.getInflectionRules();
		bool isAccept=isAcceptState(type,cat_r_id);
		if (isAccept || hasCompatibleAffixes(type,cat_r_id)) { //do not add branches that have no rules and connect to nothing else that may have rules
			Search_by_category s3(cat_id2);
		#if defined(MEMORY_EXHAUSTIVE) || defined(REDUCE_THRU_DIACRITICS)
			// These configurations fetch the full item record for each affix
			all_item_info inf;
			while(s3.retrieve(inf))	{
					QString name= getColumn(interpret_type(type),"name",inf.item_id);
					name=removeDiacritics(name);
					QString inflectedRawData=inf.raw_data;
					applyRawDataInflections(inflections,name,inflectedRawData); //name and inflectedRawData are changed
				#ifdef MEMORY_EXHAUSTIVE
					node * next=addElement(name,inf.item_id,cat_id2,cat_r_id,isAccept,inf.raw_data,inf.description,current);
				#elif defined(REDUCE_THRU_DIACRITICS)
					node * next;
				#if 0
					// Disabled experiment: handling of completely empty items
					if (name.isEmpty() && inf.raw_data.isEmpty() && inf.POS.isEmpty() && inf.description().isEmpty()) {
						result_node * n=dynamic_cast<result_node*>(current);
						inf.item_id=n->get_affix_id();
						inf.category_id=n->get_previous_category_id();
						for (int i=0;i<n->raw_datas.size();i++) {
							inf.raw_data=n->raw_datas[i];
							next=addElement(name,inf.item_id,cat_id2,cat_r_id,isAccept,inf.raw_data,current->parent); //check that
						}
					} else
				#endif
						next=addElement(name,inf.item_id,cat_id2,cat_r_id,isAccept,inf.raw_data,inflectedRawData,inflections,current);
				#endif
		#else
			// Default configuration: only the affix id is retrieved
			long long affix_id;
			while(s3.retrieve(affix_id))
			{
					QString name= getColumn(interpret_type(type),"name",affix_id);
					node * next=addElement(name,affix_id,cat_id2,cat_r_id,isAccept,current);

		#endif
					// Continue below the node that was just added, with the
					// letter budget reduced by the letters of this affix
					build_helper(type,cat_r_id,size-name.length(),next);
			}
		}
    }
    return 0;
}
// Adds one affix below `current` and returns the result node that represents it.
//
// Starting at `current` (which must be a result node or the tree's base node),
// the function follows the existing letter nodes that match `letters`, creates
// letter nodes for the remaining letters (a single '\0' letter node when
// `letters` is empty and `current` is not the base), and finally attaches a
// result node carrying affix_id, category_id, resulting_category_id and
// isAccept. If a result node with the same three ids already hangs off the
// reached node, that node is reused (and, depending on the configuration, the
// raw data / inflection rule is merged into it) instead of creating a new one.
//
// The extra parameters depend on the build configuration:
//   MEMORY_EXHAUSTIVE      - raw_data, description
//   REDUCE_THRU_DIACRITICS - raw_data, inflected_raw_data, descriptionInflectionRule
//
// When LOAD_FROM_FILE is defined and `file` is set, the arguments and the id of
// the returned node are also streamed to `file`.
//
// Returns NULL (after reporting an error) when `current` is a letter node other
// than the base; otherwise the reused or newly created result node.
//
// pre-condition: assumes category_id is added to the right place and results in
//                the appropriate resulting_category
#ifdef MEMORY_EXHAUSTIVE
node* tree::addElement(QString letters, long affix_id,long category_id, long resulting_category_id,bool isAccept,QString raw_data,QString description,node * current)
#elif defined (REDUCE_THRU_DIACRITICS)
node* tree::addElement(QString letters, long affix_id,long category_id, long resulting_category_id,bool isAccept,QString raw_data, QString inflected_raw_data, QString descriptionInflectionRule,node * current)
#else
node* tree::addElement(QString letters, long affix_id,long category_id, long resulting_category_id,bool isAccept,node * current)
#endif
{
#if 0
	if (!equal(letters,raw_data))
		raw_data.remove(" ");
#endif
#if !defined(MEMORY_EXHAUSTIVE) && defined(REDUCE_THRU_DIACRITICS)
	//inflected_raw_data is a parameter in this configuration only
	assert (current->isLetterNode() || equal(letters,inflected_raw_data));
#endif
#ifdef LOAD_FROM_FILE
	if (file!=NULL)
	{
		//the branch priority mirrors the one used to select the signature above
		(*file)<<letters<<affix_id<<category_id<<resulting_category_id<<isAccept
			#if defined (MEMORY_EXHAUSTIVE)
				<<raw_data<<description
			#elif defined (REDUCE_THRU_DIACRITICS)
				<<raw_data<<inflected_raw_data<<descriptionInflectionRule
			#endif
				<<generateNodeID(current);
	}
#endif
	QChar current_letter;
	letter_node* matching_letter_node=NULL;
	if (current->isLetterNode() && current!=base) {
	#if 1
		error << "Unexpected Error: provided node was a letter node and not a result one\n";
	#ifdef LOAD_FROM_FILE
		if (file!=NULL)
			(*file)<<generateNodeID(NULL);
	#endif
		return NULL;
	#else
		current_letter='\0';
		goto result;
	#endif
    }
	int i;
    if (letters.count()==0)
    {
		current_letter='\0';
		//an empty affix directly at the base needs no letter node at all
		if (current==base)
			goto result;
    }
    else
		current_letter=letters[0];
    i=0;
    //follow the letters that are already present in the tree
    do
    {
		matching_letter_node=current->getLetterChild(current_letter);
		if (matching_letter_node!=NULL)
		{
				current=matching_letter_node;
				i++;
				//only fetch the next letter if there is one; indexing at
				//letters.count() (or beyond, for an empty string) is out of range
				if (i<letters.count())
					current_letter=letters[i];
		}
		else
			break;
    }while(i<letters.count());
    if (letters.count()==0 && i==0)
    {
		//add null letter
		letter_node* new_node=new letter_node('\0');
		current->addChild(new_node);
		current=new_node;
		letter_nodes++;
    }
    for (;i<letters.count();i++)
    {
		//add necessary letters
		letter_node* new_node=new letter_node(letters[i]);
		current->addChild(new_node);
		current=new_node;
		letter_nodes++;
    }
result:
	int size=current->getResultChildren()->size();
	for (int i=0;i<size;i++) //check if this result node is already present
    {
		result_node * old_result=current->getResultChildren()->at(i);
		if (old_result->get_previous_category_id()==category_id && old_result->get_resulting_category_id()==resulting_category_id && old_result->get_affix_id()==affix_id)
		{
		#ifdef MEMORY_EXHAUSTIVE
			((result_node*)old_result)->addPair(raw_data,description);
		#elif defined(REDUCE_THRU_DIACRITICS)
			old_result->add_raw_data(raw_data, inflected_raw_data);
			old_result->setInflectionRule(descriptionInflectionRule);
		#endif
		#ifdef LOAD_FROM_FILE
			if (file!=NULL)
				(*file)<<generateNodeID(old_result);
		#endif
			return old_result;
		}
    }
#ifdef MEMORY_EXHAUSTIVE
	result_node * result=new result_node(affix_id,category_id,resulting_category_id,isAccept,raw_data,description);
#elif defined(REDUCE_THRU_DIACRITICS)
	result_node * result=new result_node(affix_id,category_id,resulting_category_id,isAccept,raw_data,inflected_raw_data);
	//descriptionInflectionRule is a parameter in this configuration only
	result->setInflectionRule(descriptionInflectionRule);
#else
	result_node * result=new result_node(affix_id,category_id,resulting_category_id,isAccept);
#endif
    current->addChild(result);
    current=result;
    result_nodes++;
#ifdef LOAD_FROM_FILE
	if (file!=NULL)
		(*file)<<generateNodeID(current);
#endif
    return current;
    //post-condition: returns node of resulting category reached after addition
}

// Creates an empty, non-affix tree: only the base letter node ('\0') exists,
// so the letter node count starts at 1 and the result node count at 0.
tree::tree()
{
#ifdef LOAD_FROM_FILE
	// no serialization stream attached yet
	this->file=NULL;
#endif
    this->isAffix=false;
    this->result_nodes=0;
    this->letter_nodes=1;
    this->base= new letter_node('\0');
}
// Creates an affix tree: sets up the same empty state as the default
// constructor (base letter node '\0', counters 1 and 0) but marks the tree as
// an affix tree, then populates it through build_affix_tree(type).
tree::tree(item_types type)
{
#ifdef LOAD_FROM_FILE
	// no serialization stream attached yet
	this->file=NULL;
#endif
    this->isAffix=true;
    this->result_nodes=0;
    this->letter_nodes=1;
    this->base= new letter_node('\0');

    // must run last: it works on the members initialised above
    build_affix_tree(type);
}