/*
 * Synchronizes the grammar's non-terminal list with the symbols that
 * actually occur in its productions.
 *
 * Components 0, 1 and 2 of every production are inspected and each distinct
 * symbol accepted by isNonTerminal() is collected. If the number of collected
 * symbols differs from the number of non-terminals currently registered in
 * the grammar, the collected list is installed with setNonTerminals().
 *
 * grammar: the grammar to update.
 */
void actualizeNonTerminals(GrammarADT grammar) {
    /* The reference count must be the NON-terminal count: the previous code
     * compared against getQuantTerminals(), so the update was skipped whenever
     * the number of non-terminals found happened to match the terminal count. */
    int nontermquant = getQuantNonTerminals(grammar);
    char * nontermsfounded = NULL;
    int nontermsfoundedsize = 0;
    ProductionsADT productions = getProductions(grammar);
    int productionquant = getQuant(productions);
    int i, component;

    /* detect current non terminals */
    for (i = 0; i < productionquant; i++) {
        ProductionADT p = getProduction(productions, i);
        for (component = 0; component < 3; component++) {
            char symbol = getProductionComponent(p, component);
            if (isNonTerminal(symbol) &&
                !containsChar(nontermsfounded, nontermsfoundedsize, symbol)) {
                addChar(&nontermsfounded, &nontermsfoundedsize, symbol);
            }
        }
    }

    /* actualize non terminals */
    if (nontermsfoundedsize != nontermquant) {
        /* there are less current non terminals */
        /* NOTE(review): whether setNonTerminals() copies the array or keeps
         * the pointer is not visible here, so the array is not freed on this
         * path - confirm ownership. */
        setNonTerminals(grammar, nontermsfounded, nontermsfoundedsize);
    } else {
        /* The scratch list was never handed to the grammar; release it.
         * Assumes addChar() allocates with malloc/realloc - TODO confirm. */
        free(nontermsfounded);
    }
}
/**
 * Builds the ProductionParts record for one production.
 *
 * @param name left-hand side symbol; asserted to satisfy isNonTerminal().
 * @param rhs  right-hand side text, converted with toSymbolSequence().
 * @return a ProductionParts whose `name` is @p name and whose `products`
 *         is the converted right-hand side.
 */
ProductionParts decompose(const std::string& name, const std::string& rhs) const
{
    assert(isNonTerminal(name));

    ProductionParts parts;
    parts.products = toSymbolSequence(rhs);
    parts.name = name;
    return parts;
}
/** * Find the closure of the given set of productions. * @param items -> The set of Items to be checked * @return the updated set of Items */ std::set<Item> Parser::findClosure(std::set<Item> items) { std::set<Item> closure; for (auto item : items) { closure.insert(item); } bool changed; do { changed = false; for (Item item : closure) { char next_char = item.production[item.dot]; if (isNonTerminal(next_char)) { for (auto production : _productions) { if (production[0] == next_char) { Item new_item = {production, START_POS}; auto result = closure.insert(new_item); if (result.second) { changed = true; } } } } } } while (changed); return closure; }
/** * Find the follow set for each symbol. */ void Parser::findFollow() { bool changed; do { changed = false; for (auto production : _productions) { size_t i = 0; char lhs = production[0]; std::string rhs = production.substr(START_POS); while (i < rhs.length()) { bool is_non_terminal = isNonTerminal(rhs[i]); if (is_non_terminal && i < rhs.length() - 1) { std::set<char> next_first = _first[rhs[i + 1]]; bool has_epsilon = next_first.erase(EPSILON[0]); addSetToFollow(rhs[i], next_first, &changed); if (has_epsilon) { addSetToFollow(rhs[i], _follow[lhs], &changed); } } else if (is_non_terminal && i == rhs.length() - 1) { addSetToFollow(rhs[i], _follow[lhs], &changed); } ++i; } } } while (changed); }
// Extends `state` with the closure of its items.
//
// For every item whose dot stands before a non-terminal B, an item
// "B -> . gamma" with lookahead first1(item) is considered for each production
// listed in B.m_leftSideOf. Missing items are added (or an exception is thrown
// when allowNewItems is false); for items already present the lookahead set is
// widened. Passes are repeated until one makes no change, then the items are
// sorted.
void Grammar::computeClosure(LR1State &state, bool allowNewItems) {
  bool modified;
  do {
    modified = false;
    // The size is re-read on every iteration because items are appended
    // while the loop runs.
    for(size_t itemIndex = 0; itemIndex < state.m_items.size(); itemIndex++) {
      const LR1Item    &current = state.m_items[itemIndex];
      const Production &production = getProduction(current);

      if(!(current.m_dot < production.getLength()) || !isNonTerminal(production.m_rightSide[current.m_dot])) {
        continue; // dot is at the end, or is followed by a terminal
      }

      // current is A -> alfa . B beta [la]
      const GrammarSymbol &B = getSymbol(production.m_rightSide[current.m_dot]);
      const BitSet         la(first1(current));

      for(size_t p = 0; p < B.m_leftSideOf.size(); p++) {
        // candidate is B -> . gamma [first1(beta la)] (nonkernelitem)
        const LR1Item candidate(false, B.m_leftSideOf[p], 0, la);
        LR1Item *existing = state.findItem(candidate);

        if(existing != NULL) {
          // Already present: only widen its lookahead when something is new.
          if(!(candidate.m_la - existing->m_la).isEmpty()) {
            existing->m_la += candidate.m_la;
            modified = true;
          }
          continue;
        }

        if(!allowNewItems) {
          throwException(_T("Grammar::computeClosure:No new items allowed"));
        }
        state.addItem(candidate);
        modified = true;
      }
    }
  } while(modified);

  state.sortItems();
}
// Check for equal non-terminal alignment in case of SCFG rules. // Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const { if (!hierarchicalFlag) return true; // all or none of the phrasePair's word alignment matrices match, so just pick one const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first; assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1); assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size()); // loop over all symbols but the left hand side of the rule for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) { if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) { size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin()); size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin()); if (thisTargetToSourceAlignment->at(i).size() != 1 || otherTargetToSourceAlignment->at(i).size() != 1 || thisAlign != otherAlign) { return false; } } } return true; }
bool Syntactic::step() throw (AnalysisError) { if (currentToken == 0) //Fim de Sentenca { int pos = 0; if (previousToken != 0) pos = previousToken->getPosition() + previousToken->getLexeme().size(); currentToken = new Token(DOLLAR, "$", pos, 1); } int a = currentToken->getId(); int x = stack.top(); stack.pop(); if (x == EPSILON) { return false; } else if (isTerminal(x)) { if (x == a) { if (stack.empty()) return true; else { if (previousToken != 0) delete previousToken; previousToken = currentToken; currentToken = scanner->nextToken(); return false; } } else { throw SyntacticError(PARSER_ERROR[x], currentToken); } } else if (isNonTerminal(x)) { if (pushProduction(x, a)) return false; else throw SyntacticError(PARSER_ERROR[x], currentToken); } else // isSemanticAction(x) { semanticAnalyser->executeAction(x-FIRST_SEMANTIC_ACTION, previousToken); return false; } }
/*
 * Tells whether the grammar is right sided.
 *
 * Returns 0 as soon as a production is found whose component 1 is a
 * non-terminal and whose component 2 is a terminal (a production like
 * A->Ba, which makes the grammar left sided); returns 1 otherwise.
 */
int isRight(GrammarADT grammar) {
    ProductionsADT prods = getProductions(grammar);
    int total = getQuant(prods);
    int idx = 0;

    while (idx < total) {
        ProductionADT current = getProduction(prods, idx);
        char middle = getProductionComponent(current, 1);
        char last = getProductionComponent(current, 2);

        /* A->Ba shape found: left sided grammar */
        if (isNonTerminal(middle) && isTerminal(last)) {
            return 0;
        }
        idx++;
    }
    return 1;
}
/*
 * Computes the set of productive left-hand symbols of the grammar and then
 * refreshes the grammar's terminals, non-terminals and productions.
 *
 * A left-hand symbol is recorded as productive when one of its productions
 * has a right side made of lambda, terminals, or one terminal/lambda plus an
 * already productive non-terminal. The scan is repeated until a pass records
 * nothing new.
 *
 * If memory for the productive set cannot be obtained, an error is printed
 * and the function returns without calling the actualize* functions.
 *
 * NOTE(review): the productive set is computed but never used to remove any
 * production in this function - confirm whether a removal step is missing.
 */
void removeUnproductiveProductions(GrammarADT grammar) {
    ProductionsADT productions = getProductions(grammar);
    int i;
    int quantproductions = getQuant(productions);
    int productivequant = 0;
    int lastproductivequant = -1;
    char * productives = NULL;
    char * aux1 = NULL;

    /* iterate until a whole pass adds no new productive symbol */
    while (productivequant != lastproductivequant) {
        lastproductivequant = productivequant;
        for (i = 0; i < quantproductions; i++) {
            ProductionADT p1 = getProduction(productions, i);
            char first1 = getProductionComponent(p1, 0);
            char sec1 = getProductionComponent(p1, 1);
            char third1 = getProductionComponent(p1, 2);

            if (containsChar(productives, productivequant, first1)) {
                continue; /* already known to be productive */
            }

            if ( ( sec1 == LAMDA && third1 == LAMDA ) || /*lamda*/
                 ( isTerminal(sec1) && isTerminal(third1) ) || /*both terminal*/
                 ( isTerminal(sec1) && third1 == LAMDA ) || /*one terminal*/
                 ( isTerminal(third1) && sec1 == LAMDA ) ||
                 /*one terminal and one productive*/
                 ( isTerminal(sec1) && ( isNonTerminal(third1) && containsChar(productives, productivequant, third1) ) ) ||
                 ( isTerminal(third1) && ( isNonTerminal(sec1) && containsChar(productives, productivequant, sec1) ) ) ||
                 /*lamda and one productive*/
                 ( sec1 == LAMDA && ( isNonTerminal(third1) && containsChar(productives, productivequant, third1) ) ) ||
                 ( third1 == LAMDA && ( isNonTerminal(sec1) && containsChar(productives, productivequant, sec1) ) ) ) {
                aux1 = realloc(productives, sizeof(char) * (productivequant + 1));
                if (aux1 == NULL) {
                    /* realloc leaves the old block allocated on failure:
                     * release it and stop instead of writing through NULL. */
                    fprintf(stderr, "Error doing realloc \n");
                    free(productives);
                    return;
                }
                productives = aux1;
                productives[productivequant++] = first1;
            }
        }
    }

    /* the set is only needed during the computation above */
    free(productives);

    /*remove non terminals and terminals that are no longer there */
    actualizeTerminals(grammar);
    actualizeNonTerminals(grammar);
    actualizeProductions(grammar);
}
// Writes a textual description of the grammar to f.
//
// First one line per symbol (name, precedence, type string and the
// reachable / terminate / derive-epsilon flags that are set; for
// non-terminals the m_first1 set is dumped as well), then one line per
// production.
void Grammar::dump(MarginFile *f) const {
  for(int symbolIndex = 0; symbolIndex < getSymbolCount(); symbolIndex++) {
    const GrammarSymbol &symbol = getSymbol(symbolIndex);

    f->printf(_T("Symbol:%-20s, %4d %-11s "), symbol.m_name.cstr(), symbol.m_precedence, symbol.getTypeString());

    if(symbol.m_reachable) {
      f->printf(_T("reachable "));
    }
    if(symbol.m_terminate) {
      f->printf(_T("terminate "));
    }
    if(symbol.m_deriveEpsilon) {
      f->printf(_T("derive e "));
    }
    if(isNonTerminal(symbolIndex)) {
      // NOTE(review): unlike the production dump below, f is not passed
      // here - confirm where this overload writes.
      dump(symbol.m_first1);
    }
    f->printf(_T("\n"));
  }

  for(int productionIndex = 0; productionIndex < getProductionCount(); productionIndex++) {
    dump(m_productions[productionIndex], f);
    f->printf(_T("\n"));
  }
}
void processFiles( const std::string& fileNameDirect, const std::string& fileNameIndirect, const std::string& fileNameConsolidated, const std::string& fileNameCountOfCounts, const std::string& fileNameSourceLabelSet, const std::string& fileNamePartsOfSpeechVocabulary ) { if (goodTuringFlag || kneserNeyFlag) loadCountOfCounts( fileNameCountOfCounts ); // open input files Moses::InputFileStream fileDirect(fileNameDirect); UTIL_THROW_IF2(fileDirect.fail(), "could not open phrase table file " << fileNameDirect); Moses::InputFileStream fileIndirect(fileNameIndirect); UTIL_THROW_IF2(fileIndirect.fail(), "could not open phrase table file " << fileNameIndirect); // open output file: consolidated phrase table Moses::OutputFileStream fileConsolidated; bool success = fileConsolidated.Open(fileNameConsolidated); UTIL_THROW_IF2(!success, "could not open output file " << fileNameConsolidated); // create properties consolidator // (in case any additional phrase property requires further processing) MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator(); if (sourceLabelsFlag) { propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet); } if (partsOfSpeechFlag) { propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary); } // loop through all extracted phrase translations int i=0; while(true) { i++; if (i%100000 == 0) std::cerr << "." << std::flush; std::vector< std::string > itemDirect, itemIndirect; if (! getLine(fileIndirect, itemIndirect) || ! 
getLine(fileDirect, itemDirect)) break; // direct: target source alignment probabilities // indirect: source target probabilities // consistency checks UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0, "target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'"); UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0, "source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'"); // SCORES ... std::string directScores, directSparseScores, indirectScores, indirectSparseScores; breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores ); breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores ); std::vector<std::string> directCounts; Moses::Tokenize( directCounts, itemDirect[4] ); std::vector<std::string> indirectCounts; Moses::Tokenize( indirectCounts, itemIndirect[4] ); float countF = Moses::Scan<float>(directCounts[0]); float countE = Moses::Scan<float>(indirectCounts[0]); float countEF = Moses::Scan<float>(indirectCounts[1]); float n1_F, n1_E; if (kneserNeyFlag) { n1_F = Moses::Scan<float>(directCounts[2]); n1_E = Moses::Scan<float>(indirectCounts[2]); } // Good Turing discounting float adjustedCountEF = countEF; if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1) adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)]; float adjustedCountEF_indirect = adjustedCountEF; // Kneser Ney discounting [Foster et al, 2006] if (kneserNeyFlag) { float D = kneserNey_D3; if (countEF < 2) D = kneserNey_D1; else if (countEF < 3) D = kneserNey_D2; if (D > countEF) D = countEF - 0.01; // sanity constraint float p_b_E = n1_E / totalCount; // target phrase prob based on distinct float alpha_F = D * n1_F / countF; // available mass adjustedCountEF = countEF - D + countF * alpha_F * p_b_E; // for indirect float p_b_F = n1_F / totalCount; // target phrase prob based on distinct float alpha_E = D * n1_E / countE; 
// available mass adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F; } // drop due to MinScore thresholding if ((minScore0 > 0 && adjustedCountEF_indirect/countE < minScore0) || (minScore2 > 0 && adjustedCountEF /countF < minScore2)) { continue; } // output phrase pair fileConsolidated << itemDirect[0] << " ||| "; if (partsOfSpeechFlag) { // write POS factor from property std::vector<std::string> targetTokens; Moses::Tokenize( targetTokens, itemDirect[1] ); std::vector<std::string> propertyValuePOS; propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS); size_t targetTerminalIndex = 0; for (std::vector<std::string>::const_iterator targetTokensIt=targetTokens.begin(); targetTokensIt!=targetTokens.end(); ++targetTokensIt) { fileConsolidated << *targetTokensIt; if (!isNonTerminal(*targetTokensIt)) { assert(propertyValuePOS.size() > targetTerminalIndex); fileConsolidated << "|" << propertyValuePOS[targetTerminalIndex]; ++targetTerminalIndex; } fileConsolidated << " "; } fileConsolidated << "|||"; } else { fileConsolidated << itemDirect[1] << " |||"; } // prob indirect if (!onlyDirectFlag) { fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE); fileConsolidated << " " << indirectScores; } // prob direct fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF); fileConsolidated << " " << directScores; // phrase count feature if (phraseCountFlag) { fileConsolidated << " " << maybeLogProb(2.718); } // low count feature if (lowCountFlag) { fileConsolidated << " " << maybeLogProb(std::exp(-1.0/countEF)); } // count bin feature (as a core feature) if (countBin.size()>0 && !sparseCountBinFeatureFlag) { bool foundBin = false; for(size_t i=0; i < countBin.size(); i++) { if (!foundBin && countEF <= countBin[i]) { fileConsolidated << " " << maybeLogProb(2.718); foundBin = true; } else { fileConsolidated << " " << maybeLogProb(1); } } fileConsolidated << " " << maybeLogProb( foundBin ? 
1 : 2.718 ); } // alignment fileConsolidated << " |||"; if (!itemDirect[2].empty()) { fileConsolidated << " " << itemDirect[2];; } // counts, for debugging fileConsolidated << " ||| " << countE << " " << countF << " " << countEF; // sparse features fileConsolidated << " |||"; if (directSparseScores.compare("") != 0) fileConsolidated << " " << directSparseScores; if (indirectSparseScores.compare("") != 0) fileConsolidated << " " << indirectSparseScores; // count bin feature (as a sparse feature) if (sparseCountBinFeatureFlag) { bool foundBin = false; for(size_t i=0; i < countBin.size(); i++) { if (!foundBin && countEF <= countBin[i]) { fileConsolidated << " cb_"; if (i == 0 && countBin[i] > 1) fileConsolidated << "1_"; else if (i > 0 && countBin[i-1]+1 < countBin[i]) fileConsolidated << (countBin[i-1]+1) << "_"; fileConsolidated << countBin[i] << " 1"; foundBin = true; } } if (!foundBin) { fileConsolidated << " cb_max 1"; } } // arbitrary key-value pairs fileConsolidated << " |||"; if (itemDirect.size() >= 6) { propertiesConsolidator.ProcessPropertiesString(itemDirect[5], fileConsolidated); } if (countsProperty) { fileConsolidated << " {{Counts " << countE << " " << countF << " " << countEF << "}}"; } fileConsolidated << std::endl; } fileDirect.Close(); fileIndirect.Close(); fileConsolidated.Close(); }
/*
 * Replaces the unitary productions of the grammar (productions whose right
 * side is a single non-terminal) by the non-unitary productions they lead to.
 *
 * Steps:
 *  1. collect the pairs (A,B) for every production A->B;
 *  2. close the pair list transitively: (A,B),(B,C) gives (A,C);
 *  3. for every unitary production A->B and every pair (A,X), add A->rhs for
 *     each non-unitary production X->rhs, then remove the unitary production;
 *  4. refresh terminals, non-terminals and productions of the grammar.
 */
void removeUnitaryProductions(GrammarADT grammar) {
    ProductionsADT productions = getProductions(grammar);
    int i, j, k;
    int productionquant = getQuant(productions);
    int unitaryquant = 0;
    int lastunitaryquant = 0;
    /*auxiliar array for unitary productions, stored as consecutive pairs*/
    char * unitaries = NULL;

    /*iterate over productions and determine first unitaries:
     * the productions that have only one non terminal symbol
     * on the right side */
    for (i = 0; i < productionquant; i++) {
        ProductionADT p = getProduction(productions, i);
        char first = getProductionComponent(p, 0);
        char sec = getProductionComponent(p, 1);
        char third = getProductionComponent(p, 2);
        if (isNonTerminal(sec) && third == LAMDA) {
            addPair(&unitaries, &unitaryquant, first, sec);
        } else if (isNonTerminal(third) && sec == LAMDA) {
            addPair(&unitaries, &unitaryquant, first, third);
        }
    }

    /*iterate over unitaries, adding the closure*/
    while (unitaryquant != lastunitaryquant) {
        lastunitaryquant = unitaryquant;
        for (i = 0; i < unitaryquant; i += 2) {
            char first1 = unitaries[i];
            char sec1 = unitaries[i+1];
            for (j = 0; j < unitaryquant; j += 2) {
                char first2 = unitaries[j];
                char sec2 = unitaries[j+1];
                /*(A,B)(B,C)-> (A,C)*/
                if (sec1 == first2) {
                    /*no sense in adding (A,A) unitaries*/
                    if (!containsPair(unitaries, unitaryquant, first1, sec2) && first1 != sec2) {
                        addPair(&unitaries, &unitaryquant, first1, sec2);
                    }
                }
            }
        }
    }

    /*Debug*/
    //printByPairs(unitaries,unitaryquant);
    //printf("unitaries quant: %d\n\n", unitaryquant/2);

    /*create the new productions and remove the unitaries*/
    /* NOTE(review): productions are added and removed while this loop walks
     * the container with the count captured at entry. Whether removal shifts
     * later entries, and whether removeParticularProduction() already frees
     * the production, is not visible here - confirm. */
    for (i = 0; i < productionquant; i++) {
        ProductionADT p1 = getProduction(productions, i);
        if (isUnitary(p1)) {
            char first1 = getProductionComponent(p1, 0);
            char sec1 = getProductionComponent(p1, 1);
            char third1 = getProductionComponent(p1, 2);
            for (j = 0; j < unitaryquant; j += 2) {
                char uni1 = unitaries[j];
                char uni2 = unitaries[j+1];
                //A->B and (A,B) (unitary production is localized)
                if ((first1 == uni1) && (sec1 == uni2 || third1 == uni2)) {
                    for (k = 0; k < productionquant; k++) {
                        ProductionADT p2 = getProduction(productions, k);
                        char first2 = getProductionComponent(p2, 0);
                        char sec2 = getProductionComponent(p2, 1);
                        char third2 = getProductionComponent(p2, 2);
                        if (!isUnitary(p2)) {
                            if (first2 == uni2) {
                                addProduction(productions, newProduction(first1, sec2, third2));
                            }
                        }
                    }
                }
            }
            removeParticularProduction(productions, p1);
            free(p1);
        }
    }

    /* The pair list is only needed above; release it.
     * Assumes addPair() allocates with malloc/realloc - TODO confirm. */
    free(unitaries);

    /*remove non terminals and terminals that are no longer there */
    actualizeTerminals(grammar);
    actualizeNonTerminals(grammar);
    actualizeProductions(grammar);
}
/*
 * Converts a left sided grammar into a right sided one. Does nothing when
 * isRight() already reports the grammar as right sided.
 *
 * Steps:
 *  1. every production whose component 1 is a non-terminal (A->Ba) is
 *     replaced by the reversed production (B->aA);
 *  2. the left symbols of the lambda productions are collected;
 *  3. a new distinguished symbol is created, with one unitary production to
 *     each collected symbol;
 *  4. the old lambda productions are removed and a lambda production for the
 *     old distinguished symbol is added;
 *  5. terminals, non-terminals and productions of the grammar are refreshed.
 */
void convertToRight(GrammarADT grammar) {
    int i;
    char oldistiguished = getDistinguished(grammar);

    /*if the grammar is already right there is no
     * reason to convert it*/
    if (isRight(grammar)) {
        return;
    }

    ProductionsADT productions = getProductions(grammar);
    int quantproductions = getQuant(productions);

    /* NOTE(review): productions are added and removed while the loop walks
     * the container with the count captured at entry; whether removal shifts
     * later entries is not visible here - confirm. */
    for (i = 0; i < quantproductions; i++) {
        ProductionADT p1 = getProduction(productions, i);
        char first = getProductionComponent(p1, 0);
        char sec = getProductionComponent(p1, 1);
        char third = getProductionComponent(p1, 2);
        if (isNonTerminal(sec)) {
            addProduction(productions, newProduction(sec, third, first));
            removeParticularProduction(productions, p1);
        }
    }
    setProductions(grammar, productions);

    /*a new nonTerminal should be created ,
     * that joint the non terminals that were joined to lambda*/
    char * leftnontermssymbols = NULL;
    int size = 0;
    for (i = 0; i < quantproductions; i++) {
        ProductionADT p1 = getProduction(productions, i);
        char first = getProductionComponent(p1, 0);
        char sec = getProductionComponent(p1, 1);
        char third = getProductionComponent(p1, 2);
        if (sec == LAMDA && third == LAMDA) {
            addChar(&leftnontermssymbols, &size, first);
        }
    }

    /*get a new distiguished symbol*/
    char newsymbol = getNewSymbol(grammar);
    setDistinguished(grammar, newsymbol);

    /*generate new unitary productions*/
    for (i = 0; i < size; i++) {
        ProductionADT newprod = newProduction(newsymbol, leftnontermssymbols[i], LAMDA);
        //printProduction(newprod);
        addProduction(productions, newprod);
    }

    /* Only the characters were copied into the new productions; the list
     * itself is no longer needed.
     * Assumes addChar() allocates with malloc/realloc - TODO confirm. */
    free(leftnontermssymbols);

    /*remove all old lambda productions*/
    /* NOTE(review): i still advances after a removal, so if removal shifts
     * the following entry into slot i that entry is skipped - confirm. */
    for (i = 0; i < getQuant(productions); i++) {
        ProductionADT p = getProduction(productions, i);
        char sec = getProductionComponent(p, 1);
        char third = getProductionComponent(p, 2);
        /*if it is a lamda productions : delete*/
        if (sec == LAMDA && third == LAMDA) {
            removeParticularProduction(productions, p);
        }
    }

    /* The old distinguished symbol always gets a lambda production (the flag
     * that used to guard this was a constant FALSE, so the guard was dead). */
    addProduction(productions, newProduction(oldistiguished, LAMDA, LAMDA));
    setProductions(grammar, productions);

    /*remove non terminals and terminals that are no longer there */
    actualizeTerminals(grammar);
    actualizeNonTerminals(grammar);
    actualizeProductions(grammar);
}
/*
 * Removes the productions of the non-terminals that cannot be reached from
 * the distinguished symbol.
 *
 * The reachable set starts with the distinguished symbol (when it occurs in
 * the current productions) and grows with the non-terminals found on the
 * right side of productions whose left symbol is already reachable, until a
 * pass adds nothing. Every non-terminal of the grammar that is not in the set
 * is then passed to removeProduction(), and the grammar's terminals,
 * non-terminals and productions are refreshed.
 *
 * If memory for the reachable set cannot be obtained, an error is printed and
 * the function returns without modifying the grammar.
 */
void removeUnreachableProductions(GrammarADT grammar) {
    ProductionsADT productions = getProductions(grammar);
    int i;
    int quantproductions = getQuant(productions);
    int reachablesquant = 0;
    int lastreachablesquant = 0;
    char * reachables = malloc(sizeof(char));
    char * aux1 = NULL;

    if (reachables == NULL) {
        fprintf(stderr, "Error doing malloc \n");
        return;
    }

    /*starts only with distinguished symbol, if it is in the current productions*/
    if (inCurrentProductions(productions, getDistinguished(grammar))) {
        reachables[reachablesquant++] = getDistinguished(grammar);
    }

    /*until the quantity of reachables stops varying*/
    while (reachablesquant != lastreachablesquant) {
        lastreachablesquant = reachablesquant;
        for (i = 0; i < quantproductions; i++) {
            ProductionADT p = getProduction(productions, i);
            char first = getProductionComponent(p, 0);
            char sec = getProductionComponent(p, 1);
            char third = getProductionComponent(p, 2);
            char found;

            /*if the symbol of the left is contained in the reachables, the
             * non terminal symbols of the right must be added*/
            if (!containsChar(reachables, reachablesquant, first)) {
                continue;
            }

            /* At most one symbol is added per production and pass (second
             * component first); the other one is picked up by a later pass. */
            if (isNonTerminal(sec) && !containsChar(reachables, reachablesquant, sec)) {
                found = sec;
            } else if (isNonTerminal(third) && !containsChar(reachables, reachablesquant, third)) {
                found = third;
            } else {
                continue;
            }

            aux1 = realloc(reachables, sizeof(char) * (reachablesquant + 1));
            if (aux1 == NULL) {
                /* realloc leaves the old block allocated on failure:
                 * release it and stop instead of writing through NULL. */
                fprintf(stderr, "Error doing realloc \n");
                free(reachables);
                return;
            }
            reachables = aux1;
            reachables[reachablesquant++] = found;
        }
    }

    /*TODO: delete debug printf*/
    printf("\nReachables!!: ");
    printArray(reachables, reachablesquant);

    int symsToRemovequant = 0;
    /*remove the unreachable productions*/
    /*If the quantity of reachables is equal to the quantity of nonterminals,
     * nothing should be removed*/
    if (reachablesquant != getQuantNonTerminals(grammar)) {
        char * symsToRemove = NULL;
        symsToRemovequant = getDifferents(getNonTerminals(grammar),
                                          getQuantNonTerminals(grammar), reachables, reachablesquant, &symsToRemove);
        printf("\nTO REMOVE:");
        printArray(symsToRemove, symsToRemovequant);
        for (i = 0; i < symsToRemovequant; i++) {
            removeProduction(productions, symsToRemove[i]);
        }
        /* NOTE(review): symsToRemove is filled in by getDifferents() and is
         * never released here; how it is allocated is not visible - confirm
         * and free it if it is heap memory owned by the caller. */
    }

    /* the reachable set is only needed above */
    free(reachables);

    /*remove non terminals and terminals that are no longer there */
    actualizeTerminals(grammar);
    actualizeNonTerminals(grammar);
    actualizeProductions(grammar);
}