示例#1
0
bool TBagOfWords::Update(const TStrV& TokenStrV) {    
    // Generate Ngrams if necessary
	TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);

    // process tokens to update DF counts
    bool UpdateP = false;
    if (IsHashing()) {  
        // consolidate tokens and get their hashed IDs
        TIntSet TokenIdH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TInt TokenId = TokenStr.GetHashTrick() % HashDim;
            TokenIdH.AddKey(TokenId);
            if (IsStoreHashWords()) { HashWordV[TokenId].AddKey(TokenStr); }
        }
        // update document counts
        int KeyId = TokenIdH.FFirstKeyId();
        while (TokenIdH.FNextKeyId(KeyId)) {
            const int TokenId = TokenIdH.GetKey(KeyId);
            // update DF
            DocFqV[TokenId]++;
        }
    } else {
        // consolidate tokens
        TStrH TokenStrH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TokenStrH.AddKey(TokenStr);
        }
        // update document counts and update vocabulary with new tokens
        int KeyId = TokenStrH.FFirstKeyId();
        while (TokenStrH.FNextKeyId(KeyId)) {
            // get token
            const TStr& TokenStr = TokenStrH.GetKey(KeyId);
            // different processing for hashing
            int TokenId = TokenSet.GetKeyId(TokenStr);
            if (TokenId == -1) {
                // new token, remember the dimensionality change
                UpdateP = true;
                // remember the new token
                TokenId = TokenSet.AddKey(TokenStr);
                // increase document count table
                const int TokenDfId = DocFqV.Add(0);
                // increase also the old count table
                OldDocFqV.Add(0.0);
                // make sure we DF vector and TokenSet still in sync
                IAssert(TokenId == TokenDfId);
                IAssert(DocFqV.Len() == OldDocFqV.Len());
            }
            // document count update
            DocFqV[TokenId]++;
        }
    }
    // update document count
    Docs++;
    // tell if dimension changed
    return UpdateP;
}
示例#2
0
文件: strut.cpp 项目: amrsobhy/qminer
/// Serializes a string-keyed hash table into a single string.
/// Each entry is written as key, FieldDelimiterStr, then the entry's value
/// rendered via GetStr(); consecutive entries are separated by DelimiterStr.
/// @return the joined string, or an empty TStr when StrH has no entries.
TStr TStrUtil::GetStr(const TStrH& StrH, const TStr& FieldDelimiterStr, const TStr& DelimiterStr) {
  // nothing to serialize
  if (StrH.Empty()) { return TStr(); }

  TChA OutChA;
  for (int KeyId = StrH.FFirstKeyId(); StrH.FNextKeyId(KeyId); ) {
    const TStr& KeyStr = StrH.GetKey(KeyId);
    const TStr DatStr = StrH[KeyId].GetStr();
    // put the entry delimiter only between entries, never in front
    if (!OutChA.Empty()) {
      OutChA += DelimiterStr;
    }
    OutChA += KeyStr;
    OutChA += FieldDelimiterStr;
    OutChA += DatStr;
  }
  return OutChA;
}
示例#3
0
void TFtrGenToken::Update(const TStr& Val) {
    TStrV TokenStrV; GetTokenV(Val, TokenStrV); TStrH TokenStrH;
    for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = TokenStrV[TokenStrN];
        TokenStrH.AddKey(TokenStr);
    }
    int KeyId = TokenStrH.FFirstKeyId();
    while (TokenStrH.FNextKeyId(KeyId)) {
        const TStr& TokenStr = TokenStrH.GetKey(KeyId);
        TokenH.AddDat(TokenStr)++;
    }
    Docs++;
}
示例#4
0
/// Collects the distinct relation names used by any edge in the base.
/// Walks every vertex and each of its edges, resolves the edge's relation id
/// to a name with GetVNm(), and de-duplicates the names in a hash table.
/// @param RelNmV receives the keys of that table (filled via GetKeyV;
///        presumably any previous content is replaced -- confirm against GetKeyV).
void TCycBs::GetRelNmV(TStrV& RelNmV){
  // pool of distinct relation names
  TStrH RelNmH;
  // traverse vertices
  for (int VId=0; VId<GetVIds(); VId++){
    TCycVrtx& Vrtx=GetVrtx(VId);
    // traverse edges per vertex
    for (int EdgeN=0; EdgeN<Vrtx.GetEdges(); EdgeN++){
      // relations are identified by a vertex id, so the name is a vertex name
      TCycEdge& Edge=Vrtx.GetEdge(EdgeN);
      RelNmH.AddKey(GetVNm(Edge.GetRelId()));
    }
  }
  // extract relation-name-vector
  RelNmH.GetKeyV(RelNmV);
}
示例#5
0
/// Converts all "*.Asfa" files found under InFPath into one Cpd output file.
///
/// Each record in an input file is expected to consist of a "Query" line, a
/// "RecordNo" line, and then tab-separated field lines of the form
/// Short-Name Tab Long-Name Tab Value-String, terminated by a "----" line.
/// Recognized short names and where their value ends up in the saved document:
///   TI -> TitleStr, AB -> first paragraph in ParStrV, PY -> DateStr,
///   AN -> DocNm, DE -> TopCdNmV, CL -> GeoCdNmV, AU -> IndCdNmV
/// (AU/DE/CL values are split on ';' and each part is passed through ToTrunc()).
/// Other field names are ignored. A record is saved only the first time its
/// accession id (AN) is seen; the seen-set is shared across all input files.
///
/// @param InFPath   directory searched for "*.Asfa" files.
/// @param OutCpdFNm name of the output file the documents are saved to.
void TCpDoc::SaveAsfaToCpd(const TStr& InFPath, const TStr& OutCpdFNm){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // accession ids already written (used to drop duplicate records)
  TStrH AccessionIdH;
  // traverse files
  TFFile FFile(TStr::GetNrFPath(InFPath)+"*.Asfa"); TStr AsfaFNm;
  while (FFile.Next(AsfaFNm)){
    printf("Processing file '%s'\n", AsfaFNm.CStr());
    PSIn SIn=TFIn::New(AsfaFNm);
    TILx Lx(SIn, TFSet(iloRetEoln, iloExcept));
    Lx.GetSym(syLn, syEof);
    while (Lx.Sym!=syEof){
      // Query Line
      TStr QueryLnStr=Lx.Str;
      TStrV QueryStrV; QueryLnStr.SplitOnAllCh('\t', QueryStrV, false);
      IAssert(QueryStrV[0]=="Query");
      // RecordNo Line
      Lx.GetSym(syLn); TStr RecNoLnStr=Lx.Str;
      TStrV RecNoStrV; RecNoLnStr.SplitOnAllCh('\t', RecNoStrV, false);
      IAssert(RecNoStrV[0]=="RecordNo");
      // fields (format: Short-Name Tab Long-Name Tab Value-String)
      TStr TitleStr, AbstractStr, PublicationYearStr, AccessionId;
      TStrV AuthorNmV; TStrV TermNmV1, TermNmV2;
      while (true){
        Lx.GetSym(syLn); TStr FldLnStr=Lx.Str;
        TStrV FldStrV; FldLnStr.SplitOnAllCh('\t', FldStrV, false);
        if (FldStrV[0]=="----"){
          // end of record: save it unless this accession id was already written
          if (!AccessionIdH.IsKey(AccessionId)){
            AccessionIdH.AddKey(AccessionId);
            // create & save cpd document
            PCpDoc CpDoc=TCpDoc::New();
            CpDoc->DocNm=AccessionId;
            CpDoc->DateStr=PublicationYearStr;
            CpDoc->TitleStr=TitleStr;
            CpDoc->ParStrV.Add(AbstractStr);
            CpDoc->TopCdNmV=TermNmV1;
            CpDoc->GeoCdNmV=TermNmV2;
            CpDoc->IndCdNmV=AuthorNmV;
            CpDoc->Save(*SOut);
          }
          break;
        } else if (FldStrV[0]=="TI"){
          // title
          TitleStr=FldStrV[2];
        } else if (FldStrV[0]=="AU"){
          // authors, ';'-separated
          FldStrV[2].SplitOnAllCh(';', AuthorNmV);
          for (int StrN=0; StrN<AuthorNmV.Len(); StrN++){AuthorNmV[StrN].ToTrunc();}
        } else if (FldStrV[0]=="AB"){
          // abstract
          AbstractStr=FldStrV[2];
        } else if (FldStrV[0]=="PY"){
          // publication year
          PublicationYearStr=FldStrV[2];
        } else if (FldStrV[0]=="DE"){
          // first term list, ';'-separated
          FldStrV[2].SplitOnAllCh(';', TermNmV1);
          for (int StrN=0; StrN<TermNmV1.Len(); StrN++){TermNmV1[StrN].ToTrunc();}
        } else if (FldStrV[0]=="CL"){
          // second term list, ';'-separated
          FldStrV[2].SplitOnAllCh(';', TermNmV2);
          for (int StrN=0; StrN<TermNmV2.Len(); StrN++){TermNmV2[StrN].ToTrunc();}
        } else if (FldStrV[0]=="AN"){
          // accession id (used as document name and duplicate key)
          AccessionId=FldStrV[2];
        }
      }
      // progress: number of distinct documents written so far
      printf("%d\r", AccessionIdH.Len());
      Lx.GetSym(syLn, syEof);
    }
  }
}