Example #1
0
void TGgSchRef::GetAuthNmVPubStr(
 const TStr& AuthNmVPubStr, TStrV& AuthNmV, TStr& PubNm, TStr& PubYearStr){
  // split input string into two parts
  TStr AuthNmVStr; TStr PubStr;
  AuthNmVPubStr.SplitOnStr(AuthNmVStr, " - ", PubStr);
  // author-names string
  AuthNmVStr.SplitOnAllCh(',', AuthNmV, true);
  for (int AuthN=0; AuthN<AuthNmV.Len(); AuthN++){
    AuthNmV[AuthN].ToTrunc();
  }
  if ((!AuthNmV.Empty())&&
   ((AuthNmV.Last().IsStrIn("..."))||(AuthNmV.Last().Len()<=2))){
    AuthNmV.DelLast();
  }
  // publication-name & publication-year string
  TStr OriginStr; TStr LinkStr;
  PubStr.SplitOnStr(OriginStr, " - ", LinkStr);
  OriginStr.SplitOnLastCh(PubNm, ',', PubYearStr);
  PubNm.ToTrunc(); PubYearStr.ToTrunc();
  if ((PubYearStr.Len()>=4)&&(PubYearStr.GetSubStr(0, 3).IsInt())){
    PubYearStr=PubYearStr.GetSubStr(0, 3);
  } else
  if ((PubNm.Len()>=4)&&(PubNm.GetSubStr(0, 3).IsInt())){
    PubYearStr=PubNm.GetSubStr(0, 3); PubNm="";
  } else {
    PubYearStr="";
  }
}
Example #2
0
void TWebPgFetchEvent::ChangeLastUrlToLc(const PHttpResp& HttpResp){
  static TStr MsNm="Microsoft";
  static TStr HttpsNm="HTTPS";
  TStr SrvNm=HttpResp->GetSrvNm();
  if ((SrvNm.StartsWith(MsNm))||(SrvNm.StartsWith(HttpsNm))){
    if (!UrlStrV.Last().IsLc()){
      PUrl Url=TUrl::New(UrlStrV.Last());
      Url->ToLcPath();
      UrlStrV.Last()=Url->GetUrlStr();
    }
  }
}
Example #3
0
File: util.cpp Project: pikma/Snap
// <last_name>_<first name initial>
// Normalizes a raw author string into the key "<last word>_<first letter of
// first word>", all lower-case. Returns TStr::GetNullStr() when the input
// does not look like a personal name:
//   - nothing is left before the first '#' or digit,
//   - it contains ')' after everything from '(' onwards has been removed,
//   - it contains "figures", "macros", "univ" or "institute",
//   - fewer than two words remain (after dropping a trailing "jr"),
//   - the last word is a single character or does not start with a letter,
//   - the first word does not start with a letter (e.g. it starts with '-').
TStr TStrUtil::GetStdName(TStr AuthorName) {
    TStr StdName;
    AuthorName.ToLc();
    AuthorName.ChangeChAll('\n', ' ');
    AuthorName.ChangeChAll('.', ' ');
    // if there is a '#' or a digit in the name, remove it and everything after it
    int pos = 0;
    while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
        pos++;
    }
    if (pos < AuthorName.Len()) {
        AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc();
    }
    if (AuthorName.Empty()) {
        return TStr::GetNullStr();
    }

    // remove '(' and everything after it
    const int b = AuthorName.SearchCh('(');
    if (b != -1) {
        AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc();
    }
    // skip if contains ')'
    if (AuthorName.SearchCh(')')!=-1) {
        return TStr::GetNullStr();
    }
    // skip if it is not a name
    if (AuthorName.SearchStr("figures")!=-1 || AuthorName.SearchStr("macros")!=-1
            || AuthorName.SearchStr("univ")!=-1 || AuthorName.SearchStr("institute")!=-1) {
        return TStr::GetNullStr();
    }
    // keep only letters, whitespace and '-' (drops latex tags, ...)
    TChA NewName;
    for (int i = 0; i < AuthorName.Len(); i++) {
        const char Ch = AuthorName[i];
        if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') {
            NewName += Ch;
        }
    }
    StdName = NewName;
    StdName.ToTrunc();
    TStrV AuthNmV;
    StdName.SplitOnWs(AuthNmV);
    // a trailing "jr" is not part of the name
    if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
    // too short -- not a name
    if (AuthNmV.Len() < 2) return TStr::GetNullStr();

    const TStr LastNm = AuthNmV.Last();
    if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();

    // '-' survives the filter above, so the first word may start with it;
    // reject such input like any other non-name instead of asserting
    const char FirstInitial = AuthNmV[0][0];
    if (! TCh::IsAlpha(FirstInitial)) return TStr::GetNullStr();

    return TStr::Fmt("%s_%c", LastNm.CStr(), FirstInitial);
}
Example #4
0
// Collects the values of all command-line arguments that start with
// PrefixStr (the prefix itself is removed from each value).
//
// When the global Env holds at most MnArgs arguments, only a usage line
// listing the defaults is printed (unless SilentP) and an empty vector is
// returned. Otherwise the matching values are returned, or a copy of DfValV
// when no argument matches; unless SilentP, the resulting values are reported
// through TNotify::OnStatus.
//
// PrefixStr - argument prefix to look for
// DfValV    - default values used when no argument carries the prefix
// DNm       - descriptive name shown in the usage / status message
TStrV TEnv::GetIfArgPrefixStrV(
    const TStr& PrefixStr, TStrV& DfValV, const TStr& DNm) const {
    TStrV ArgValV;
    if (Env.GetArgs()<=MnArgs) {
        // 'usage' argument message
        if (!SilentP) {
            printf("   %s%s (default:", PrefixStr.CStr(), DNm.CStr());
            for (int DfValN=0; DfValN<DfValV.Len(); DfValN++) {
                if (DfValN>0) {
                    printf(", ");
                }
                printf("'%s'", DfValV[DfValN].CStr());
            }
            printf(")\n");
        }
        return ArgValV;
    }
    // extract the value of every argument carrying the prefix
    for (int ArgN=0; ArgN<GetArgs(); ArgN++) {
        TStr ArgStr=GetArg(ArgN);
        if (ArgStr.StartsWith(PrefixStr)) {
            ArgStr.DelStr(PrefixStr);
            ArgValV.Add(ArgStr);
        }
    }
    // fall back to the defaults when nothing matched
    if (ArgValV.Empty()) {
        ArgValV=DfValV;
    }
    // report the effective values; the message is only built when it is shown
    if (!SilentP) {
        TChA MsgChA;
        MsgChA+="  "+DNm;
        MsgChA+=" (";
        MsgChA+=PrefixStr;
        MsgChA+=")=";
        for (int ArgValN=0; ArgValN<ArgValV.Len(); ArgValN++) {
            if (ArgValN>0) {
                MsgChA+=", ";
            }
            MsgChA+="'";
            MsgChA+=ArgValV[ArgValN];
            MsgChA+="'";
        }
        TNotify::OnStatus(Notify, MsgChA);
    }
    return ArgValV;
}
/////////////////////////////////////////////////
// EuPartner
// Returns the country line of an address whose lines are separated by "<br>":
// the last line, or the one before it when the last line is empty and there
// is more than one line. '\r' and '\n' characters are removed from the result.
// Prints a notice to stdout when the result is empty.
TStr TCordisEuPart::ExtrCountry(const TStr& AddrStr){
  TStrV AddrLnV; AddrStr.SplitOnStr("<br>", AddrLnV);
  TStr CountryStr;
  const int AddrLns=AddrLnV.Len();
  if (AddrLns>0){
    // step back one line if the address ends with an empty line
    const bool UsePrevLnP=AddrLnV.Last().Empty()&&(AddrLns>1);
    if (UsePrevLnP){
      CountryStr=AddrLnV[AddrLns-2];
    } else {
      CountryStr=AddrLnV.Last();
    }
    // strip line-break characters
    CountryStr.DelChAll('\r');
    CountryStr.DelChAll('\n');
  }
  if (CountryStr.Empty()){
    printf("Country Field Not Found!\n");
  }
  return CountryStr;
}
void TTransCorpus::SaveTxt(const TStr& OutFBase, const TStr& OutOrgFNm, 
        const TStr& OutTransFNm, const TStr& OutRefTransFNm, TStrV& OrgFNmV, 
        TStrV& TransFNmV, TStrV& RefTransFNmV, const int& LinesPerFile) {
    
    // prepare filenames
    OrgFNmV.Clr(); TransFNmV.Clr(); RefTransFNmV.Clr();
    if (!OutOrgFNm.Empty()) { OrgFNmV.Add(GetOutFNm(OutFBase, 0, LinesPerFile, OutOrgFNm)); }
    if (!OutTransFNm.Empty()) { TransFNmV.Add(GetOutFNm(OutFBase, 0, LinesPerFile, OutTransFNm)); }
    if (!OutRefTransFNm.Empty()) { RefTransFNmV.Add(GetOutFNm(OutFBase, 0, LinesPerFile, OutRefTransFNm)); }
    // open files
    PSOut OrgSOut = !OutOrgFNm.Empty() ? TFOut::New(OrgFNmV.Last()) : PSOut();
    PSOut TransSOut = !OutTransFNm.Empty() ? TFOut::New(TransFNmV.Last()) : PSOut();
    PSOut RefTransSOut = !OutRefTransFNm.Empty() ? TFOut::New(RefTransFNmV.Last()) : PSOut();
    // check which are given
    const bool IsOrgP = !OrgSOut.Empty() && IsOrg();
    const bool IsTransP = !TransSOut.Empty() && IsTrans();
    const bool IsRefTransP = !RefTransSOut.Empty() && IsRefTrans();
    // print warnings
    if (!IsOrgP) { printf("No original sentences!\n"); }
    if (!IsTransP) { printf("No machine translation sentences!\n"); }
    if (!IsRefTransP) { printf("No reference translation sentences!\n"); }
    // go over all the sentences and store them in the file
    TIntV SentIdV; GetSentIdV(SentIdV);
    for (int SentIdN = 0; SentIdN < SentIdV.Len(); SentIdN++) {
        const int SentId = SentIdV[SentIdN];
        if (IsOrgP) { OrgSOut->PutStrLn(GetOrgStr(SentId)); }
        if (IsTransP) { TransSOut->PutStrLn(GetTransStr(SentId)); }
        if (IsRefTransP) { RefTransSOut->PutStrLn(GetRefTransStrV(SentId)[0]); }
        // should we break and go to next file?
        if ((LinesPerFile!=-1) && (SentIdN>0) && (SentIdN % LinesPerFile == 0)) {
            // prepare filenames
            if (!OutOrgFNm.Empty()) { OrgFNmV.Add(GetOutFNm(OutFBase, SentIdN, LinesPerFile, OutOrgFNm)); }
            if (!OutTransFNm.Empty()) { TransFNmV.Add(GetOutFNm(OutFBase, SentIdN, LinesPerFile, OutTransFNm)); }
            if (!OutRefTransFNm.Empty()) { RefTransFNmV.Add(GetOutFNm(OutFBase, SentIdN, LinesPerFile, OutRefTransFNm)); }
            // open next files files
            OrgSOut = !OutOrgFNm.Empty() ? TFOut::New(OrgFNmV.Last()) : PSOut();
            TransSOut = !OutTransFNm.Empty() ? TFOut::New(TransFNmV.Last()) : PSOut();
            RefTransSOut = !OutRefTransFNm.Empty() ? TFOut::New(RefTransFNmV.Last()) : PSOut();
        }
    }
}
// Scans the tokens of an HTML string and fills CandWordStrV with candidate
// words for named objects, interleaved with structural marker strings.
//
// - CandWordStrV always starts with PeriodTagStr and ends with EofTagStr.
// - String/number tokens are normalized with GetNrWordStr(); a normalized
//   word is added when IsFirstCapWordStr() holds for it or it carries the
//   noaAsCapitalized attribute. If other tokens lie between it and the
//   previously added word, BreakTagStr is inserted first (unless the last
//   entry is already PeriodTagStr).
// - The symbols . ! ? " - / : ; add PeriodTagStr (not twice in a row).
// - <P> adds ParagraphTagStr followed by PeriodTagStr; <TD> and <BR> add
//   PeriodTagStr.
// - Text and symbols inside <TITLE> or <SCRIPT> are ignored.
//
// DumpP - when true, the visited tokens and the final candidate list are
//         printed to stdout
void TNmObjBs::ExtrCandWordStrV(
 const TStr& HtmlStr, TStrV& CandWordStrV, const bool& DumpP){
  // prepare named-object vector
  CandWordStrV.Clr();
  // prepare html parsing
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn, hdtAll, false);
  THtmlLxSym Sym; TStr Str; TStr NrStr;
  // the leading period marker guarantees Last() below is always valid
  CandWordStrV.Add(PeriodTagStr);
  bool InTitle=false; bool InScript=false; int LastNmObjTokN=-1;
  // traverse html tokens
  if (DumpP){printf("Tokens: ");}
  for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
    HtmlDoc->GetTok(TokN, Sym, Str);
    switch (Sym){
      case hsyUndef: break;
      case hsyStr:
      case hsyNum:
        if (InTitle){break;}
        if (InScript){break;}
        NrStr=GetNrWordStr(Str);
        if (DumpP){
          if (Str==NrStr){printf("%s ", Str.CStr());}
          else {printf("%s(%s) ", Str.CStr(), NrStr.CStr());}
        }
        if (IsFirstCapWordStr(NrStr)||IsNmObjAttr(NrStr, noaAsCapitalized)){
          // mark a gap when this word does not directly follow the previous one
          if ((LastNmObjTokN!=-1)&&(LastNmObjTokN<TokN-1)){
            if (CandWordStrV.Last()!=PeriodTagStr){
              CandWordStrV.Add(BreakTagStr);
            }
          }
          CandWordStrV.Add(NrStr); LastNmObjTokN=TokN;
        }
        break;
      case hsySSym:
        if (InTitle){break;}
        if (InScript){break;}
        if (DumpP){
          printf("%s ", Str.CStr());}
        // these symbols are treated as period markers
        if (
         (Str==".")||(Str=="!")||(Str=="?")||
         (Str=="\"")||(Str=="-")||(Str=="/")||
         (Str==":")||(Str==";")){
          if (CandWordStrV.Last()!=PeriodTagStr){
            CandWordStrV.Add(PeriodTagStr);
          }
        }
        break;
      case hsyBTag:
      case hsyETag:
        if (Str=="<TITLE>"){
          // inside the title only between its begin and end tag
          InTitle=(Sym==hsyBTag);
        } else
        if (Str=="<SCRIPT>"){
          InScript=(Sym==hsyBTag);
        } else
        if (Str=="<P>"){
          if ((!CandWordStrV.Empty())&&(CandWordStrV.Last()!=ParagraphTagStr)){
            CandWordStrV.Add(ParagraphTagStr);
            CandWordStrV.Add(PeriodTagStr);
          }
        } else
        if ((Str=="<TD>")||(Str=="<BR>")){
          CandWordStrV.Add(PeriodTagStr);
        }
        break;
      case hsyEof: break;
      default: break;
    }
  }
  CandWordStrV.Add(EofTagStr);
  // finish the token dump and list the collected candidates
  if (DumpP){
    printf("\n");
    printf("Candidates: ");
    for (int CandWordStrN=0; CandWordStrN<CandWordStrV.Len(); CandWordStrN++){
      printf("%s ", CandWordStrV[CandWordStrN].CStr());}
    printf("\n");
  }
}