// Computes the merge map for named objects.
//
// On return NewNmObjIdV has one entry per key of NmObjWordStrVToDocIdVH;
// NewNmObjIdV[NmObjId] is the id of the named object that NmObjId is
// collapsed into, or -1 if NmObjId was not touched by any pass (names that
// are not one, two or three words long are never assigned here).
//
// Passes, in order:
//   1. single-word names are clustered with IsMatchPfx (suffix limit MxSfxLen);
//   2. double-word names are clustered word-by-word (suffix limit MxSfxLen+1);
//   3. a triple-word name whose last two words form an existing name is mapped
//      to whatever that double-word name was mapped to
//      (prefix, first-name, last-name) -> (first-name, last-name);
//   4. remaining triple-word names are clustered word-by-word (MxSfxLen+1).
// Every cluster is collapsed into its most frequent member, where frequency
// is the length of the vector stored in NmObjWordStrVToDocIdVH for that name.
void TNmObjBs::GetMergedNmObj(TIntV& NewNmObjIdV){
  // matching constraints
  int MnPfxLen=3; int MxSfxLen=2;
  // create transformation vector (-1 marks "not assigned yet")
  int NmObjs=NmObjWordStrVToDocIdVH.Len();
  NewNmObjIdV.Gen(NmObjs); NewNmObjIdV.PutAll(-1);
  // process names by word count: single, double, then triple words
  for (int Words=1; Words<=3; Words++){
    if (Words==3){
      // merging triples to doubles; must run after the double-word pass so
      // that the double-word target already has its final assignment
      for (int NmObjId=0; NmObjId<NmObjs; NmObjId++){
        if (NewNmObjIdV[NmObjId]!=-1){continue;}
        const TStrV& WordStrV=NmObjWordStrVToDocIdVH.GetKey(NmObjId);
        if (WordStrV.Len()!=3){continue;}
        // look up the name formed by the last two words
        TStrV DbWordStrV(2, 0);
        DbWordStrV.Add(WordStrV[1]); DbWordStrV.Add(WordStrV[2]);
        int DbNmObjId=NmObjWordStrVToDocIdVH.GetKeyId(DbWordStrV);
        if (DbNmObjId!=-1){
          NewNmObjIdV[NmObjId]=NewNmObjIdV[DbNmObjId];
        }
      }
    }
    // single words are matched with MxSfxLen, multi-word names with MxSfxLen+1
    int PassMxSfxLen=(Words==1) ? MxSfxLen : MxSfxLen+1;
    // collect unassigned names of this word count according to prefix;
    // the key concatenates GetSubStr(0, 2) of every word of the name
    TStrIntVH PfxStrToNmObjIdVH;
    for (int NmObjId=0; NmObjId<NmObjs; NmObjId++){
      if (NewNmObjIdV[NmObjId]!=-1){continue;}
      const TStrV& WordStrV=NmObjWordStrVToDocIdVH.GetKey(NmObjId);
      if (WordStrV.Len()!=Words){continue;}
      TStr PfxStr;
      for (int WordN=0; WordN<Words; WordN++){
        PfxStr=PfxStr+WordStrV[WordN].GetSubStr(0, 2);
      }
      PfxStrToNmObjIdVH.AddDat(PfxStr).Add(NmObjId);
    }
    // traverse word-groups with the same prefix
    int Pfxs=PfxStrToNmObjIdVH.Len();
    for (int PfxId=0; PfxId<Pfxs; PfxId++){
      // get & traverse word-group
      TIntV& NmObjIdV=PfxStrToNmObjIdVH[PfxId];
      for (int NmObjIdN=0; NmObjIdN<NmObjIdV.Len(); NmObjIdN++){
        int NmObjId=NmObjIdV[NmObjIdN];
        if (NewNmObjIdV[NmObjId]!=-1){continue;}
        // the first unassigned name of the group seeds a new cluster
        NewNmObjIdV[NmObjId]=NmObjId;
        const TStrV& WordStrV=NmObjWordStrVToDocIdVH.GetKey(NmObjId);
        int Fq=NmObjWordStrVToDocIdVH[NmObjId].Len();
        TIntPrV FqNmObjIdPrV(NmObjIdV.Len(), 0);
        FqNmObjIdPrV.Add(TIntPr(Fq, NmObjId));
        // traverse rest of the word-group for matching names
        for (int SubNmObjIdN=NmObjIdN+1; SubNmObjIdN<NmObjIdV.Len(); SubNmObjIdN++){
          int SubNmObjId=NmObjIdV[SubNmObjIdN];
          if (NewNmObjIdV[SubNmObjId]!=-1){continue;}
          const TStrV& SubWordStrV=NmObjWordStrVToDocIdVH.GetKey(SubNmObjId);
          // test matching: every word must match its counterpart; words are
          // tested left to right and testing stops at the first mismatch
          bool IsMatch=true;
          for (int WordN=0; WordN<Words; WordN++){
            if (!IsMatchPfx(WordStrV[WordN], SubWordStrV[WordN], MnPfxLen, PassMxSfxLen)){
              IsMatch=false; break;
            }
          }
          if (IsMatch){
            NewNmObjIdV[SubNmObjId]=NmObjId;
            int SubFq=NmObjWordStrVToDocIdVH[SubNmObjId].Len();
            FqNmObjIdPrV.Add(TIntPr(SubFq, SubNmObjId));
          }
        }
        // collapse matching names into the most frequent one; after the
        // descending sort the (frequency, id) pair at position 0 is the target
        if (FqNmObjIdPrV.Len()>1){
          FqNmObjIdPrV.Sort(false);
          int MainNmObjId=FqNmObjIdPrV[0].Val2;
          NewNmObjIdV[MainNmObjId]=MainNmObjId;
          for (int FqNmObjIdPrN=1; FqNmObjIdPrN<FqNmObjIdPrV.Len(); FqNmObjIdPrN++){
            int SubNmObjId=FqNmObjIdPrV[FqNmObjIdPrN].Val2;
            NewNmObjIdV[SubNmObjId]=MainNmObjId;
          }
        }
      }
    }
  }
}