void TNmObjBs::FilterCandToNmObjIdV( const TStrV& CandWordStrV, TIntV& NmObjIdV, const bool& DumpP){ // prepare candidate traversal TVec<TStrV> NmObjIdWordStrVV; int CandWordStrN=0; int CandWordStrs=CandWordStrV.Len(); while (CandWordStrN<CandWordStrs){ // get candidate TStr WordStr=CandWordStrV[CandWordStrN]; //printf("%s ", WordStr.CStr()); // simple filters if (WordStr.Len()<=1){CandWordStrN++; continue;} if (WordStr==ParagraphTagStr){CandWordStrN++; continue;} if (WordStr==BreakTagStr){CandWordStrN++; continue;} if (WordStr==EofTagStr){CandWordStrN++; break;} if (IsNumStr(WordStr)){CandWordStrN++; continue;} TStr UcWordStr=ChDef->GetUcStr(WordStr); //if (SwSet->IsIn(UcWordStr, true)){ // CandWordStrN++; continue;} if ((WordStr==UcWordStr)&&((WordStr.Len()>4)&&(!IsNmObjAttr(WordStr, noaAcronym)))){ CandWordStrN++; continue;} // unperiod if (IsNmObjAttr(WordStr, noaUnperiod)&&(CandWordStrV[CandWordStrN+1]==PeriodTagStr)){ CandWordStrN+=1; } // period if (WordStr==PeriodTagStr){ CandWordStrN++; WordStr=CandWordStrV[CandWordStrN]; if (IsTagStr(WordStr)){continue;} if (IsNmObjAttr(WordStr, noaDefined)){ continue; } else if ((CandWordStrN>1)&&(IsNmObjAttr(CandWordStrV[CandWordStrN-2], noaUnperiod))){ continue; } else { TStr NextWordStr=CandWordStrV[CandWordStrN+1]; if (IsFirstCapWordStr(NextWordStr)||IsNmObjAttr(NextWordStr, noaAsCapitalized)){ continue; } else if (!IsNmObj(WordStr)){ CandWordStrN++; continue; } } } // if (WordStr=="British"){ // printf("");} // ignore if (IsNmObjAttr(WordStr, noaIgnore)){ CandWordStrN++; continue; } // collect named-object words TStrV WordStrV; forever { WordStrV.Add(WordStr); CandWordStrN++; WordStr=CandWordStrV[CandWordStrN]; if (IsTagStr(WordStr)){break;} if (WordStr.Len()<=1){break;} if (IsNmObjAttr(WordStr, noaIgnore)){CandWordStrN++; break;} if (IsNmObjAttr(WordStr, noaStandalone)){break;} if (IsNmObjAttr(WordStrV, noaStandalone)){break;} } // get normalized version of named-object TStrV NrWordStrV; GetNrNmObjStrV(WordStrV, NrWordStrV); // simple filters if (IsNmObjAttr(NrWordStrV, noaIgnore)){continue;} if (IsNmObjAttr(NrWordStrV, noaFirstName)){continue;} if (NrWordStrV.Len()>5){ while (NrWordStrV.Len()>2){NrWordStrV.Del(0);}} if (NrWordStrV.Len()==1){ TStr UcWordStr=ChDef->GetUcStr(NrWordStrV[0]); if (SwSet->IsIn(UcWordStr, true)){continue;} } // add named object NmObjIdWordStrVV.Add(NrWordStrV); } // merge similar words for (int NmObjN=0; NmObjN<NmObjIdWordStrVV.Len(); NmObjN++){ TStrV& WordStrV=NmObjIdWordStrVV[NmObjN]; if (WordStrV.Len()==1){ // merge single words for (int SubNmObjN=0; SubNmObjN<NmObjIdWordStrVV.Len(); SubNmObjN++){ TStrV& SubWordStrV=NmObjIdWordStrVV[SubNmObjN]; if (SubWordStrV.Len()==1){ if (WordStrV[0]!=SubWordStrV[0]){ if (IsMatchPfx(WordStrV[0], SubWordStrV[0], 3, 4)){ // normalize to shorter string if (WordStrV[0].Len()<SubWordStrV[0].Len()){SubWordStrV=WordStrV;} else {WordStrV=SubWordStrV;} } } } } } else if (WordStrV.Len()>=2){ TStr LastNm=WordStrV.Last(); for (int SubNmObjN=0; SubNmObjN<NmObjIdWordStrVV.Len(); SubNmObjN++){ TStrV& SubWordStrV=NmObjIdWordStrVV[SubNmObjN]; if (SubWordStrV.Len()==1){ // merge last-name with [first-name,last-name] pairs TStr SubLastNm=SubWordStrV[0]; if (LastNm!=SubLastNm){ if (IsMatchPfx(LastNm, SubLastNm, 3, 4)){ if (LastNm.Len()<SubLastNm.Len()){SubWordStrV=WordStrV;} else {WordStrV=SubWordStrV;} } } } else if (false&&(SubWordStrV.Len()==2)){ // merge [first-name,last-name] with [first-name,last-name] pairs if ((WordStrV[0]!=SubWordStrV[0])||(WordStrV[1]!=SubWordStrV[1])){ if ((IsMatchPfx(WordStrV[0], SubWordStrV[0], 3, 4))&& (IsMatchPfx(WordStrV[1], SubWordStrV[1], 3, 4))){ // normalize to shorter string (first word) if (WordStrV[0].Len()<SubWordStrV[0].Len()){ SubWordStrV[0]=WordStrV[0];} else {WordStrV[0]=SubWordStrV[0];} // normalize to shorter string (second word) if (WordStrV[1].Len()<SubWordStrV[1].Len()){ SubWordStrV[1]=WordStrV[1];} else {WordStrV[1]=SubWordStrV[1];} } } } } } } // get named-objects-ids NmObjIdV.Gen(NmObjIdWordStrVV.Len(), 0); {for (int NmObjN=0; NmObjN<NmObjIdWordStrVV.Len(); NmObjN++){ TStrV& NmObjWordStrV=NmObjIdWordStrVV[NmObjN]; int NmObjId=GetNmObjId(NmObjWordStrV, true); NmObjIdV.Add(NmObjId); }} // dump if (DumpP){ printf("Named-Objects: "); for (int NmObjN=0; NmObjN<NmObjIdV.Len(); NmObjN++){ int NmObjId=NmObjIdV[NmObjN]; TStr NmObjStr=GetNmObjStr(NmObjId); printf("%s ", NmObjStr.CStr()); } printf("\n"); } }