void merge( const MetaTranslator *tor, const MetaTranslator *virginTor, MetaTranslator *outTor, bool verbose, bool noObsolete ) { int known = 0; int neww = 0; int obsoleted = 0; int UntranslatedObsoleted = 0; int similarTextHeuristicCount = 0; TML all = tor->messages(); TML::Iterator it; outTor->setLanguageCode(tor->languageCode()); outTor->setSourceLanguageCode(tor->sourceLanguageCode()); /* The types of all the messages from the vernacular translator are updated according to the virgin translator. */ for ( it = all.begin(); it != all.end(); ++it ) { MetaTranslatorMessage::Type newType = MetaTranslatorMessage::Finished; MetaTranslatorMessage m = *it; // skip context comment if ( !QByteArray(m.sourceText()).isEmpty() ) { MetaTranslatorMessage mv = virginTor->find(m.context(), m.sourceText(), m.comment()); if ( mv.isNull() ) { mv = virginTor->find(m.context(), m.comment(), m.fileName(), m.lineNumber()); if ( mv.isNull() ) { // did not find it in the virgin, mark it as obsolete newType = MetaTranslatorMessage::Obsolete; if ( m.type() != MetaTranslatorMessage::Obsolete ) obsoleted++; } else { // Do not just accept it if its on the same line number, but different source text. // Also check if the texts are more or less similar before we consider them to represent the same message... // ### The QString() cast is evil if (getSimilarityScore(QString(m.sourceText()), mv.sourceText()) >= textSimilarityThreshold) { // It is just slightly modified, assume that it is the same string m = MetaTranslatorMessage(m.context(), mv.sourceText(), m.comment(), m.fileName(), m.lineNumber(), m.translations()); m.setPlural(mv.isPlural()); // Mark it as unfinished. (Since the source text was changed it might require re-translating...) newType = MetaTranslatorMessage::Unfinished; ++similarTextHeuristicCount; } else { // The virgin and vernacular sourceTexts are so different that we could not find it. newType = MetaTranslatorMessage::Obsolete; if ( m.type() != MetaTranslatorMessage::Obsolete ) obsoleted++; } neww++; } } else { switch ( m.type() ) { case MetaTranslatorMessage::Finished: default: if (m.isPlural() == mv.isPlural()) { newType = MetaTranslatorMessage::Finished; } else { newType = MetaTranslatorMessage::Unfinished; } known++; break; case MetaTranslatorMessage::Unfinished: newType = MetaTranslatorMessage::Unfinished; known++; break; case MetaTranslatorMessage::Obsolete: newType = MetaTranslatorMessage::Unfinished; neww++; } // Always get the filename and linenumber info from the virgin Translator, in case it has changed location. // This should also enable us to read a file that does not have the <location> element. m.setFileName(mv.fileName()); m.setLineNumber(mv.lineNumber()); m.setPlural(mv.isPlural()); // ### why not use operator=? } if (newType == MetaTranslatorMessage::Obsolete && !m.isTranslated()) { ++UntranslatedObsoleted; } m.setType(newType); outTor->insert(m); } } /* Messages found only in the virgin translator are added to the vernacular translator. Among these are all the context comments. */ all = virginTor->messages(); for ( it = all.begin(); it != all.end(); ++it ) { MetaTranslatorMessage mv = *it; bool found = tor->contains(mv.context(), mv.sourceText(), mv.comment()); if (!found) { MetaTranslatorMessage m = tor->find(mv.context(), mv.comment(), mv.fileName(), mv.lineNumber()); if (!m.isNull()) { if (getSimilarityScore(QString(m.sourceText()), mv.sourceText()) >= textSimilarityThreshold) { found = true; } } else { found = false; } } if ( !found ) { outTor->insert( mv ); if ( !QByteArray(mv.sourceText()).isEmpty() ) neww++; } } /* The same-text heuristic handles cases where a message has an obsolete counterpart with a different context or comment. */ int sameTextHeuristicCount = applySameTextHeuristic( outTor ); /* The number heuristic handles cases where a message has an obsolete counterpart with mostly numbers differing in the source text. */ int sameNumberHeuristicCount = applyNumberHeuristic( outTor ); if ( verbose ) { int totalFound = neww + known; fprintf( stderr, " Found %d source text%s (%d new and %d already existing)\n", totalFound, totalFound == 1 ? "" : "s", neww, known); if (obsoleted) { if (noObsolete) { fprintf( stderr, " Removed %d obsolete entr%s\n", obsoleted, obsoleted == 1 ? "y" : "ies" ); } else { int total = obsoleted - UntranslatedObsoleted; fprintf( stderr, " Kept %d obsolete translation%s\n", total, total == 1 ? "" : "s" ); fprintf( stderr, " Removed %d obsolete untranslated entr%s\n", UntranslatedObsoleted, UntranslatedObsoleted == 1 ? "y" : "ies" ); } } if (sameNumberHeuristicCount) fprintf( stderr, " Number heuristic provided %d translation%s\n", sameNumberHeuristicCount, sameNumberHeuristicCount == 1 ? "" : "s" ); if (sameTextHeuristicCount) fprintf( stderr, " Same-text heuristic provided %d translation%s\n", sameTextHeuristicCount, sameTextHeuristicCount == 1 ? "" : "s" ); if (similarTextHeuristicCount) fprintf( stderr, " Similar-text heuristic provided %d translation%s\n", similarTextHeuristicCount, similarTextHeuristicCount == 1 ? "" : "s" ); } }
char* alignTwoMessages(t_message * resMessage, Bool doInternalSlick, t_message * message1, t_message * message2, Bool debugMode){ // local variables short int ** matrix = NULL; unsigned int i = 0; unsigned int j = 0; // Construction of the matrix short int elt1, elt2, elt3, max, eltL, eltD, eltT; // Levenshtein distance // float levenshtein = 0.0; float scoreAlignment = 0; unsigned int maxLen = 0; // Traceback unsigned char * contentMessage1 = NULL; unsigned int * mapMessage1 = NULL; unsigned char * maskMessage1 = NULL; unsigned char * contentMessage2 = NULL; unsigned char * maskMessage2 = NULL; unsigned int * mapMessage2 = NULL; unsigned int iReg1 = 0; unsigned int iReg2 = 0; // Computing resMessage unsigned char *tmpMessage = NULL; unsigned char *tmpMessageMask = NULL; t_semanticTag **tmpMessageTags = NULL; // Score computation unsigned int nbDynTotal = 0; unsigned int nbDynCommon = 0; // Regex returned by the function char * regex = NULL; // DEBUG DISPLAY OF MESSAGES if (debugMode == TRUE) { displayMessage(message1); displayMessage(message2); } //+------------------------------------------------------------------------+ // Create and initialize the matrix //+------------------------------------------------------------------------+ matrix = (short int**) malloc( sizeof(short int*) * (message1->len + 1) ); for (i = 0; i < (message1->len + 1); i++) { matrix[i] = (short int*) calloc( (message2->len + 1), sizeof(short int) ); } //+------------------------------------------------------------------------+ // Fullfill the matrix given the two messages //+------------------------------------------------------------------------+ // Parralelization: unsigned int nbDiag = 0; unsigned int nbBlock = 0; // Depends on which diagonal we are on unsigned int minLen = 0; unsigned int firsti = 0; unsigned int firstj = 0; unsigned int diagloop = 0; unsigned int blockLoop = 0; unsigned int iblock = 0; unsigned int jblock = 0; unsigned int maxLoopi = 0; unsigned int maxLoopj = 0; unsigned int lastRow = 0; unsigned int lastColumn = 0; int maxScoreMatrix = 0; lastRow = ((message1->len+1)/BLEN) * BLEN; lastColumn = ((message2->len+1)/BLEN) * BLEN; nbDiag = (message1->len+1)/BLEN + (message2->len+1)/BLEN + ((message1->len+1)%BLEN!=0); // reminder: BLEN = blocklength minLen = message1->len+1 <= message2->len+1 ? message1->len+1 : message2->len+1; maxLen = message1->len+1 > message2->len+1 ? message1->len+1 : message2->len+1; // Begin loop over diagonals for (diagloop = 0; diagloop < nbDiag; diagloop++){ //printf("Diag n %d\n",diagloop); for (blockLoop = 0;blockLoop <= nbBlock; blockLoop++){ //printf("Block n %d\n",blockLoop); //(iblock,jblock are moving from the bottom left of the current diagonal to the top right) iblock = firsti - blockLoop * BLEN; jblock = firstj + blockLoop * BLEN; maxLoopi = iblock + BLEN <= message1->len + 1? iblock + BLEN:message1->len + 1; maxLoopj = jblock + BLEN <= message2->len + 1? jblock + BLEN:message2->len + 1; for(i = iblock;i < maxLoopi; i++){ for(j = jblock; j < maxLoopj; j++){ if (i > 0 && j > 0){ elt1 = matrix[i - 1][j - 1]; elt1 += getSimilarityScore(message1, message2, i, j); elt2 = matrix[i][j - 1] + GAP; elt3 = matrix[i - 1][j] + GAP; max = elt1 > elt2 ? elt1 : elt2; max = max > elt3 ? max : elt3; matrix[i][j] = max; if (max > maxScoreMatrix) { maxScoreMatrix = max; } }//printf("%d,\t",matrix[i][j]); } //printf("\n"); }//End for iblock }//End for blockLoop //Actualize the number of block for the next time if (diagloop < minLen/BLEN){ nbBlock++; } else if (diagloop > maxLen/BLEN){ nbBlock--; } //Actualise the first position of the cursor (bottom left of the next diagonal) if (firsti != lastRow) // If we are not at the last row firsti = firsti + BLEN ; else if (firstj != lastColumn) // Else If we are not at the last column firstj += BLEN; }//End for diagloop // Compute score of the alignment (ratio regarding the max score these two payloads could have get if they were equals) unsigned int lenSmallestPayload = message1->len > message2->len ? message1->len : message2->len; float maxScore = lenSmallestPayload * MATCH; scoreAlignment = (100.0f / maxScore) * (float) maxScoreMatrix; if (scoreAlignment > 100.0f) { scoreAlignment = 100.0f; } else if (scoreAlignment < 0.0f) { scoreAlignment = 0.0f; } //levenshtein = MATCH*(float)matrix[message1->len][message2->len] / maxLen; //float levcop = matrix[message1->len][message2->len]; //levenshtein = levenshtein * 10 / maxLen; //+------------------------------------------------------------------------+ // Traceback into the matrix //+------------------------------------------------------------------------+ //finish = FALSE; contentMessage1 = calloc( message1->len + message2->len, sizeof(unsigned char)); mapMessage1 = calloc( message1->len + message2->len, sizeof(unsigned int)); maskMessage1 = calloc( message1->len + message2->len, sizeof(unsigned char)); contentMessage2 = calloc( message1->len + message2->len, sizeof(unsigned char)); mapMessage2 = calloc( message1->len + message2->len, sizeof(unsigned int)); maskMessage2 = calloc( message1->len + message2->len, sizeof(unsigned char)); if (contentMessage1 == NULL) { printf("Error while trying to allocate memory for variable : contentMessage1.\n"); goto end; } if (contentMessage2 == NULL) { printf("Error while trying to allocate memory for variable : contentMessage2.\n"); goto end; } if (maskMessage1 == NULL) { printf("Error while trying to allocate memory for variable : maskMessage1.\n"); goto end; } if (maskMessage2 == NULL) { printf("Error while trying to allocate memory for variable : maskMessage2.\n"); goto end; } // Fullfill the mask with END like filling it with a '\0' memset(maskMessage1, END, (message1->len + message2->len) * sizeof(unsigned char)); memset(maskMessage2, END, (message1->len + message2->len) * sizeof(unsigned char)); // Prepare variables for the traceback iReg1 = message1->len + message2->len - 1; iReg2 = iReg1; i = message1->len; j = message2->len; // DIAGONAL (almost) TRACEBACK while ((i > 0) && (j > 0)) { eltL = matrix[i][j - 1]; eltD = matrix[i - 1][j - 1]; eltT = matrix[i - 1][j]; if ((eltL > eltD) && (eltL > eltT)) { --j; contentMessage1[iReg1] = 0xf1; maskMessage1[iReg1] = DIFFERENT; if( message2->mask[j] == EQUAL) { contentMessage2[iReg2] = message2->alignment[j]; maskMessage2[iReg2] = EQUAL; } else { contentMessage2[iReg2] = 0xf1; maskMessage2[iReg2] = DIFFERENT; } } else if ((eltT >= eltL) && (eltT > eltD)) { --i; contentMessage2[iReg2] = 0xf2; maskMessage2[iReg2] = DIFFERENT; if( message1->mask[i] == EQUAL) { contentMessage1[iReg1] = message1->alignment[i]; maskMessage1[iReg1] = EQUAL; } else { contentMessage1[iReg1] = 0xf2; maskMessage1[iReg1] = DIFFERENT; } } else { --i; --j; if(message1->mask[i] == EQUAL) { contentMessage1[iReg1] = message1->alignment[i]; maskMessage1[iReg1] = EQUAL; } else { contentMessage1[iReg1] = 0xf2; maskMessage1[iReg1] = DIFFERENT; } if(message2->mask[j] == EQUAL) { contentMessage2[iReg2] = message2->alignment[j]; maskMessage2[iReg2] = EQUAL; } else { contentMessage2[iReg2] = 0xf2; maskMessage2[iReg2] = DIFFERENT; } } mapMessage1[iReg1]=i; mapMessage2[iReg2]=j; --iReg1; --iReg2; } // THE DIAGONAL IS FINISH WE CLOSE THE // TRACEBACK BY GOING TO THE EXTREME TOP while (i > 0) { --i; contentMessage2[iReg2] = 0xf3; maskMessage2[iReg2] = DIFFERENT; if(message1->mask[i] == EQUAL) { contentMessage1[iReg1] = message1->alignment[i]; maskMessage1[iReg1] = EQUAL; } else { contentMessage1[iReg1] = 0xf3; maskMessage1[iReg1] = DIFFERENT; } mapMessage1[iReg1]=i; mapMessage2[iReg2]=j; --iReg1; --iReg2; } // THE DIAGONAL IS FINISH WE CLOSE THE // TRACEBACK BY GOING TO THE EXTREME LEFT while (j > 0) { --j; contentMessage1[iReg1] = 0xf4; maskMessage1[iReg1] = DIFFERENT; if(message2->mask[j] == EQUAL) { contentMessage2[iReg2] = message2->alignment[j]; maskMessage2[iReg2] = EQUAL; } else { contentMessage2[iReg2] = 0xf4; maskMessage2[iReg2] = DIFFERENT; } mapMessage1[iReg1]=i; mapMessage2[iReg2]=j; --iReg1; --iReg2; } if (debugMode == TRUE) { // Display the mapping between alignement and message half-bytes printf("Mapping : "); for( i = 0; i < message1->len + message2->len; i++) { unsigned int iTag = mapMessage1[i]; unsigned int jTag = mapMessage2[i]; char * tagNameMessage1 = NULL; char * tagNameMessage2 = NULL; if (iTag >= message1->len || message1->semanticTags[iTag] == NULL || message1->semanticTags[iTag]->name == NULL) { tagNameMessage1 = "None"; } else { tagNameMessage1 = message1->semanticTags[iTag]->name; } if (jTag >= message2->len || message2->semanticTags[jTag] == NULL || message2->semanticTags[jTag]->name == NULL) { tagNameMessage2 = "None"; } else { tagNameMessage2 = message2->semanticTags[jTag]->name; } if (strcmp(tagNameMessage1, "None") != 0 || strcmp(tagNameMessage2, "None") != 0) { printf("%d) 1=%d [%s], 2=%d [%s], \n", i, iTag, tagNameMessage1, jTag, tagNameMessage2); } } } // For debug only if (debugMode == TRUE) { printf("(1)Alig : "); for( i = 0; i < message1->len + message2->len; i++) { if(maskMessage1[i] == EQUAL ) { printf("%02x", (unsigned char) contentMessage1[i]); } else if ( maskMessage2[i] == END ) { //printf("##"); } else { printf("--"); } } printf("\n"); printf("(2)Alig : "); for( i = 0; i < message1->len + message2->len; i++) { if( maskMessage2[i] == EQUAL ) { printf("%02x", (unsigned char) contentMessage2[i]); } else if ( maskMessage2[i] == END ) { //printf("##"); } else { printf("--"); } } printf("\n"); } // Compute the common alignment char hexrepr[3]; int sizereg = 100000;//(int)(levcop/10)*2+(int)(levcop/10)+2; int regind = 0; tmpMessage = calloc(message1->len + message2->len, sizeof(unsigned char)); tmpMessageMask = malloc((message1->len + message2->len) * sizeof(unsigned char)); memset(tmpMessageMask, END, (message1->len + message2->len) * sizeof(unsigned char)); tmpMessageTags = malloc((message1->len + message2->len) * sizeof(t_semanticTag*)); for (i=0; i<message1->len + message2->len; i++){ tmpMessageTags[i] = malloc(sizeof(t_semanticTag)); tmpMessageTags[i]->name = NULL; } regex= malloc( sizereg* sizeof(char)); memset(regex, 0, sizereg); if (debugMode == TRUE) { printf("Compute the common alignment:\n"); } i = 0; while (i < message1->len + message2->len) { // Fetch the semantic tag of the two messages unsigned int iTag = mapMessage1[i]; unsigned int jTag = mapMessage2[i]; char * tagNameMessage1 = NULL; char * tagNameMessage2 = NULL; char * tagNewMessage = NULL; if (iTag >= message1->len || message1->semanticTags[iTag] == NULL || message1->semanticTags[iTag]->name == NULL) { tagNameMessage1 = "None"; } else { tagNameMessage1 = message1->semanticTags[iTag]->name; } if (jTag >= message2->len || message2->semanticTags[jTag] == NULL || message2->semanticTags[jTag]->name == NULL) { tagNameMessage2 = "None"; } else { tagNameMessage2 = message2->semanticTags[jTag]->name; } if (strcmp(tagNameMessage1, tagNameMessage2) == 0) { tagNewMessage = tagNameMessage1; } else { tagNewMessage = "None"; } tmpMessageTags[i]->name = tagNewMessage; if ((maskMessage1[i] == END) || (maskMessage2[i] == END)) { if(regind==0){ regex[0] ='.'; regind++; } else if(regex[regind-1] !='.'){ regex[regind] ='.'; regind++; } tmpMessage[i] = 0xf9; tmpMessageMask[i] = END; } else if ((maskMessage1[i] == EQUAL) && (maskMessage2[i] == EQUAL) && (contentMessage1[i] == contentMessage2[i])) { tmpMessage[i] = contentMessage1[i]; sprintf(hexrepr,"%02x",contentMessage1[i]); sprintf(regex+regind,"%02x",contentMessage1[i]); //regex[regind] = hexrepr[1]; //regex[regind+1] = hexrepr[0]; regind+=2; tmpMessageMask[i] = EQUAL; } else { if(regind==0){ regex[0] ='.'; regind++; } else if(regex[regind-1] !='.'){ regex[regind] ='.'; regind++; } tmpMessage[i] = 0xf5; tmpMessageMask[i] = DIFFERENT; nbDynTotal += 1; if ((maskMessage1[i] == EQUAL) && (maskMessage2[i] == EQUAL)) { nbDynCommon += 1; } } i++; } //printf("%f\n",levcop); /*if(regex!=NULL){ printf("REGEX %s\n",regex); //free(regex); //printf("FREE \n"); }*/ // Try to (optionally) slick the alignment if(doInternalSlick == TRUE) { if(message1->len + message2->len > 0) { for(i = 1; i < message1->len + message2->len - 1; i++) { if( tmpMessageMask[i] == EQUAL ) { if( tmpMessageMask[i - 1] == DIFFERENT ) { if( tmpMessageMask[i + 1] == DIFFERENT ) { tmpMessage[i] = 0xf6; tmpMessageMask[i] = DIFFERENT; } } } } } } // Create the alignment based on obtained data // Remove the first # of the alignment (where mask = END) // Retrieve the shortest possible alignment i = 0; while( tmpMessageMask[i] == END ) i++; // Store the results resMessage->len = message1->len + message2->len - i; resMessage->alignment = malloc(resMessage->len * sizeof(unsigned char)); resMessage->mask = malloc(resMessage->len * sizeof(unsigned char)); resMessage->semanticTags = malloc(resMessage->len * sizeof(t_semanticTag *)); // default semantic tag is "None" for (j=0; j<resMessage->len; j++) { resMessage->semanticTags[j] = malloc(sizeof(t_semanticTag)); if (tmpMessageTags[i+j] == NULL || strcmp(tmpMessageTags[i+j]->name, "None") == 0) { resMessage->semanticTags[j]->name = "None"; } else { resMessage->semanticTags[j]->name = tmpMessageTags[i+j]->name; } //strcpy(resMessage->semanticTags[j]->name, tmpMessageTags[i+j]->name); } // TODO: (fgy) free resMessage.mask and resMessage.alignment memcpy(resMessage->alignment, tmpMessage + i, resMessage->len); memcpy(resMessage->mask, tmpMessageMask + i, resMessage->len); // Compute the scores of similarity, using the resMessage if (debugMode == TRUE) { displayMessage(resMessage); printf("Result : "); for( i = 0; i < resMessage->len; i++) { if(resMessage->mask[i] == EQUAL ) { printf("%02x", (unsigned char) resMessage->alignment[i]); } else if ( resMessage->mask[i] == END ) { //printf("##"); } else { printf("--"); } } printf("\n"); } // COMPUTE THE SCORES resMessage->score->s1 = getScoreRatio(resMessage); resMessage->score->s2 = getScoreDynSize(nbDynTotal, nbDynCommon); resMessage->score->s3 = scoreAlignment; if (debugMode == TRUE) { printf("Score ratio : %0.2f.\n", resMessage->score->s1); printf("Score DynSize : %0.2f.\n", resMessage->score->s2); printf("Score Rang : %0.2f.\n", resMessage->score->s3); } end: // Room service if(matrix) { for (i = 0; i < (message1->len + 1); i++) { if(matrix[i]) { free(matrix[i]); } } free(matrix); } if(contentMessage1) { free(contentMessage1); } if(contentMessage2) { free(contentMessage2); } if(maskMessage1) { free(maskMessage1); } if(maskMessage2) { free(maskMessage2); } if(mapMessage1) { free(mapMessage1); } if(mapMessage2) { free(mapMessage2); } if(tmpMessage) { free(tmpMessage); } if(tmpMessageMask) { free(tmpMessageMask); } if(tmpMessageTags) { for (i = 0; i < message1->len + message2->len; i++) { if(tmpMessageTags[i]) { free(tmpMessageTags[i]); } } free(tmpMessageTags); } return regex; }