void TGgSchRef::GetAuthNmVPubStr( const TStr& AuthNmVPubStr, TStrV& AuthNmV, TStr& PubNm, TStr& PubYearStr){ // split input string into two parts TStr AuthNmVStr; TStr PubStr; AuthNmVPubStr.SplitOnStr(AuthNmVStr, " - ", PubStr); // author-names string AuthNmVStr.SplitOnAllCh(',', AuthNmV, true); for (int AuthN=0; AuthN<AuthNmV.Len(); AuthN++){ AuthNmV[AuthN].ToTrunc(); } if ((!AuthNmV.Empty())&& ((AuthNmV.Last().IsStrIn("..."))||(AuthNmV.Last().Len()<=2))){ AuthNmV.DelLast(); } // publication-name & publication-year string TStr OriginStr; TStr LinkStr; PubStr.SplitOnStr(OriginStr, " - ", LinkStr); OriginStr.SplitOnLastCh(PubNm, ',', PubYearStr); PubNm.ToTrunc(); PubYearStr.ToTrunc(); if ((PubYearStr.Len()>=4)&&(PubYearStr.GetSubStr(0, 3).IsInt())){ PubYearStr=PubYearStr.GetSubStr(0, 3); } else if ((PubNm.Len()>=4)&&(PubNm.GetSubStr(0, 3).IsInt())){ PubYearStr=PubNm.GetSubStr(0, 3); PubNm=""; } else { PubYearStr=""; } }
void TWebPgFetchEvent::ChangeLastUrlToLc(const PHttpResp& HttpResp){ static TStr MsNm="Microsoft"; static TStr HttpsNm="HTTPS"; TStr SrvNm=HttpResp->GetSrvNm(); if ((SrvNm.StartsWith(MsNm))||(SrvNm.StartsWith(HttpsNm))){ if (!UrlStrV.Last().IsLc()){ PUrl Url=TUrl::New(UrlStrV.Last()); Url->ToLcPath(); UrlStrV.Last()=Url->GetUrlStr(); } } }
// <last_name>_<first name innitial> TStr TStrUtil::GetStdName(TStr AuthorName) { TStr StdName; AuthorName.ToLc(); AuthorName.ChangeChAll('\n', ' '); AuthorName.ChangeChAll('.', ' '); // if there is a number in the name, remove it and everything after it int i, pos = 0; while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) { pos++; } if (pos < AuthorName.Len()) { AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); } if (AuthorName.Empty()) { return TStr::GetNullStr(); } // replace everything after '(' int b = AuthorName.SearchCh('('); if (b != -1) { AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); } // skip if contains ')' if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); } // skip if it is not a name if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1 || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) { return TStr::GetNullStr(); } // remove all non-letters (latex tags, ...) TChA NewName; for (i = 0; i < AuthorName.Len(); i++) { const char Ch = AuthorName[i]; if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; } } StdName = NewName; StdName.ToTrunc(); TStrV AuthNmV; StdName.SplitOnWs(AuthNmV); // too short -- not a name if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast(); if (AuthNmV.Len() < 2) return TStr::GetNullStr(); const TStr LastNm = AuthNmV.Last(); if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr(); IAssert(isalpha(AuthNmV[0][0])); return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]); }
// Collects the values of all command-line arguments that start with
// PrefixStr.
//   PrefixStr: argument prefix to look for (removed from each returned value).
//   DfValV:    default values, returned when no argument carries the prefix.
//   DNm:       descriptive name used in the printed messages.
// When the argument count is at most MnArgs, only a usage line listing the
// defaults is printed (unless SilentP) and an EMPTY vector is returned.
// Otherwise the collected values (or the defaults) are reported through
// TNotify::OnStatus (unless SilentP) and returned.
// NOTE(review): the usage test reads the global `Env` while the scan below
// uses this object's GetArgs()/GetArg(); left unchanged -- confirm intended.
TStrV TEnv::GetIfArgPrefixStrV(
    const TStr& PrefixStr, TStrV& DfValV, const TStr& DNm) const {
  TStrV ArgValV;

  if (Env.GetArgs() <= MnArgs) {
    // 'usage' argument message
    if (!SilentP) {
      printf(" %s%s (default:", PrefixStr.CStr(), DNm.CStr());
      for (int DfValN = 0; DfValN < DfValV.Len(); DfValN++) {
        if (DfValN > 0) { printf(", "); }
        printf("'%s'", DfValV[DfValN].CStr());
      }
      printf(")\n");
    }
    return ArgValV;
  }

  // gather the value of every argument carrying the prefix
  for (int ArgN = 0; ArgN < GetArgs(); ArgN++) {
    TStr ArgStr = GetArg(ArgN);
    if (ArgStr.StartsWith(PrefixStr)) {
      ArgStr.DelStr(PrefixStr);  // strip the prefix, keep the value
      ArgValV.Add(ArgStr);
    }
  }
  // fall back to the defaults when nothing was given
  if (ArgValV.Empty()) { ArgValV = DfValV; }

  // report the effective values; the message is built only when it is shown
  if (!SilentP) {
    TChA MsgChA;
    MsgChA += " ";
    MsgChA += DNm;
    MsgChA += " (";
    MsgChA += PrefixStr;
    MsgChA += ")=";
    for (int ArgValN = 0; ArgValN < ArgValV.Len(); ArgValN++) {
      if (ArgValN > 0) { MsgChA += ", "; }
      MsgChA += "'";
      MsgChA += ArgValV[ArgValN];
      MsgChA += "'";
    }
    TNotify::OnStatus(Notify, MsgChA);
  }
  return ArgValV;
}
///////////////////////////////////////////////// // EuPartner TStr TCordisEuPart::ExtrCountry(const TStr& AddrStr){ TStr CountryStr; TStrV LnV; AddrStr.SplitOnStr("<br>", LnV); if (LnV.Len()>0){ CountryStr=LnV.Last(); if (CountryStr.Empty()&&(LnV.Len()>1)){ CountryStr=LnV[LnV.Len()-2];} CountryStr.DelChAll('\r'); CountryStr.DelChAll('\n'); } if (CountryStr.Empty()){ printf("Country Field Not Found!\n");} return CountryStr; }
void TTransCorpus::SaveTxt(const TStr& OutFBase, const TStr& OutOrgFNm, const TStr& OutTransFNm, const TStr& OutRefTransFNm, TStrV& OrgFNmV, TStrV& TransFNmV, TStrV& RefTransFNmV, const int& LinesPerFile) { // prepare filenames OrgFNmV.Clr(); TransFNmV.Clr(); RefTransFNmV.Clr(); if (!OutOrgFNm.Empty()) { OrgFNmV.Add(GetOutFNm(OutFBase, 0, LinesPerFile, OutOrgFNm)); } if (!OutTransFNm.Empty()) { TransFNmV.Add(GetOutFNm(OutFBase, 0, LinesPerFile, OutTransFNm)); } if (!OutRefTransFNm.Empty()) { RefTransFNmV.Add(GetOutFNm(OutFBase, 0, LinesPerFile, OutRefTransFNm)); } // open files PSOut OrgSOut = !OutOrgFNm.Empty() ? TFOut::New(OrgFNmV.Last()) : PSOut(); PSOut TransSOut = !OutTransFNm.Empty() ? TFOut::New(TransFNmV.Last()) : PSOut(); PSOut RefTransSOut = !OutRefTransFNm.Empty() ? TFOut::New(RefTransFNmV.Last()) : PSOut(); // check which are given const bool IsOrgP = !OrgSOut.Empty() && IsOrg(); const bool IsTransP = !TransSOut.Empty() && IsTrans(); const bool IsRefTransP = !RefTransSOut.Empty() && IsRefTrans(); // print warnings if (!IsOrgP) { printf("No original sentences!\n"); } if (!IsTransP) { printf("No machine translation sentences!\n"); } if (!IsRefTransP) { printf("No reference translation sentences!\n"); } // go over all the sentences and store them in the file TIntV SentIdV; GetSentIdV(SentIdV); for (int SentIdN = 0; SentIdN < SentIdV.Len(); SentIdN++) { const int SentId = SentIdV[SentIdN]; if (IsOrgP) { OrgSOut->PutStrLn(GetOrgStr(SentId)); } if (IsTransP) { TransSOut->PutStrLn(GetTransStr(SentId)); } if (IsRefTransP) { RefTransSOut->PutStrLn(GetRefTransStrV(SentId)[0]); } // should we break and go to next file? 
if ((LinesPerFile!=-1) && (SentIdN>0) && (SentIdN % LinesPerFile == 0)) { // prepare filenames if (!OutOrgFNm.Empty()) { OrgFNmV.Add(GetOutFNm(OutFBase, SentIdN, LinesPerFile, OutOrgFNm)); } if (!OutTransFNm.Empty()) { TransFNmV.Add(GetOutFNm(OutFBase, SentIdN, LinesPerFile, OutTransFNm)); } if (!OutRefTransFNm.Empty()) { RefTransFNmV.Add(GetOutFNm(OutFBase, SentIdN, LinesPerFile, OutRefTransFNm)); } // open next files files OrgSOut = !OutOrgFNm.Empty() ? TFOut::New(OrgFNmV.Last()) : PSOut(); TransSOut = !OutTransFNm.Empty() ? TFOut::New(TransFNmV.Last()) : PSOut(); RefTransSOut = !OutRefTransFNm.Empty() ? TFOut::New(RefTransFNmV.Last()) : PSOut(); } } }
void TNmObjBs::ExtrCandWordStrV( const TStr& HtmlStr, TStrV& CandWordStrV, const bool& DumpP){ // prepare named-object vector CandWordStrV.Clr(); // prepare html parsing PSIn HtmlSIn=TStrIn::New(HtmlStr); PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn, hdtAll, false); PHtmlTok Tok; THtmlLxSym Sym; TStr Str; TStr NrStr; CandWordStrV.Add(PeriodTagStr); bool InTitle=false; bool InScript=false; int LastNmObjTokN=-1; // traverse html tokens if (DumpP){printf("Tokens: ");} for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){ PHtmlTok Tok=HtmlDoc->GetTok(TokN); HtmlDoc->GetTok(TokN, Sym, Str); switch (Sym){ case hsyUndef: break; case hsyStr: case hsyNum: if (InTitle){break;} if (InScript){break;} NrStr=GetNrWordStr(Str); if (DumpP){ if (Str==NrStr){printf("%s ", Str.CStr());} else {printf("%s(%s) ", Str.CStr(), NrStr.CStr());} } if (IsFirstCapWordStr(NrStr)||IsNmObjAttr(NrStr, noaAsCapitalized)){ if ((LastNmObjTokN!=-1)&&(LastNmObjTokN<TokN-1)){ if (CandWordStrV.Last()!=PeriodTagStr){ CandWordStrV.Add(BreakTagStr); } } CandWordStrV.Add(NrStr); LastNmObjTokN=TokN; } break; case hsySSym: if (InTitle){break;} if (InScript){break;} if (DumpP){ printf("%s ", Str.CStr());} if ( (Str==".")||(Str=="!")||(Str=="?")|| (Str=="\"")||(Str=="-")||(Str=="/")|| (Str==":")||(Str==";")){ if (CandWordStrV.Last()!=PeriodTagStr){ CandWordStrV.Add(PeriodTagStr); } } break; case hsyBTag: case hsyETag: if (Str=="<TITLE>"){ InTitle=(Sym==hsyBTag); } else if (Str=="<SCRIPT>"){ InScript=(Sym==hsyBTag); } else if (Str=="<P>"){ if ((!CandWordStrV.Empty())&&(CandWordStrV.Last()!=ParagraphTagStr)){ CandWordStrV.Add(ParagraphTagStr); CandWordStrV.Add(PeriodTagStr); } } else if ((Str=="<TD>")||(Str=="<BR>")){ CandWordStrV.Add(PeriodTagStr); } break; case hsyEof: break; default: break; } } CandWordStrV.Add(EofTagStr); if (DumpP){printf("\n");} if (DumpP){ printf("Candidates: "); for (int CandWordStrN=0; CandWordStrN<CandWordStrV.Len(); CandWordStrN++){ printf("%s ", CandWordStrV[CandWordStrN].CStr());} 
printf("\n"); } }