/// Returns the name of the month of this time, as produced by
/// TTmInfo::GetMonthNm for the language/locale selector Loc.
/// Stops via IAssert when the time is undefined or cannot be broken down
/// into a struct tm.
TStr TSecTm::GetMonthNm(const TLoc& Loc) const {
  struct tm Tm;
  IAssert(IsDef() && GetTmStruct(AbsSecs(), Tm));
  // tm_mon counts months since January (0..11); the lookup is given 1..12.
  const int MonthN = Tm.tm_mon + 1;
  return TTmInfo::GetMonthNm(MonthN, Loc);
}
int main(int argc, char* argv[]) { Env = TEnv(argc, argv, TNotify::StdNotify); Env.PrepArgs(TStr::Fmt("Flow. build: %s, %s. Time: %s", __TIME__, __DATE__, TExeTm::GetCurTm())); double NetPRTimeSum = 0; double NetEKTimeSum = 0; int NumWins = 0; Try const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "", "Input file"); const int Iters = Env.GetIfArgPrefixInt("-n:", 10, "Number of runs per thread"); const int Threads = Env.GetIfArgPrefixInt("-t:", 4, "Number of threads"); printf("Integer Flow Test\n"); printf("Filename: %s\n", InFNm.CStr()); printf("Building Network...\n"); TFIn InFile(InFNm); // If the input file is a binary, use the following line to load the network PNEANet Net = TNEANet::Load(InFile); // If the input file is a text file, use the following to load the network and save as binary // PNEANet Net; // int MaxEdgeCap = BuildCapacityNetwork(InFNm, Net); // const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "", "Output file"); // TFOut OutFile(OutFNm); // Net->Save(OutFile); printf("PNEANet Nodes: %d, Edges: %d\n\n", Net->GetNodes(), Net->GetEdges()); #pragma omp parallel for reduction(+:NetEKTimeSum,NetPRTimeSum,NumWins) schedule(static, 1) for (int t = 0; t < Threads; t++) { TRnd Random(t); for (int i = 0; i < Iters; i++) { int SrcNId = Net->GetRndNId(Random); int SnkNId = Net->GetRndNId(Random); double PRBeginTime = getcputime(); int NetMaxFlowPR = TSnap::GetMaxFlowIntPR(Net, SrcNId, SnkNId); double PREndTime = getcputime(); double NetPRFlowRunTime = PREndTime - PRBeginTime; double EKBeginTime = getcputime(); int NetMaxFlowEK = TSnap::GetMaxFlowIntEK(Net, SrcNId, SnkNId); double EKEndTime = getcputime(); double NetEKFlowRunTime = EKEndTime - EKBeginTime; IAssert(NetMaxFlowPR == NetMaxFlowEK); if (NetPRFlowRunTime < NetEKFlowRunTime) { NumWins++; } NetPRTimeSum += NetPRFlowRunTime; NetEKTimeSum += NetEKFlowRunTime; #pragma omp critical { #ifndef NOMP printf("Thread: %d\n", omp_get_thread_num()); #endif printf("Source: %d, Sink %d\n", SrcNId, SnkNId); 
printf("Max Flow: %d\n", NetMaxFlowEK); printf("PR CPU Time: %f\n", NetPRFlowRunTime); printf("EK CPU Time: %f\n", NetEKFlowRunTime); printf("\n"); } } } int TotalRuns = Iters*Threads; printf ("Avg PR PNEANet Time: %f\n", NetPRTimeSum/TotalRuns); printf ("Avg EK PNEANet Time: %f\n", NetEKTimeSum/TotalRuns); printf ("%d out of %d PR was faster\n", NumWins, TotalRuns); Catch return 0; }
/// Returns the uncompressed length of the archive ZipFNm as reported by the
/// listing command of 7-Zip ("7z.exe l" on Windows, "7za l" elsewhere).
///
/// The listing is read from the child's standard output, split on whitespace,
/// and the value is taken from the token seven positions before the last
/// token that starts with "-----".
///
/// Returns 0 (after notifying and logging) when the listing does not have
/// that shape. Throws via EAssert/EAssertR when the command cannot be started
/// or produces no output.
uint64 TZipIn::GetFLen(const TStr& ZipFNm) {
#ifdef GLib_WIN
  HANDLE ZipStdoutRd, ZipStdoutWr;
  // create pipes
  SECURITY_ATTRIBUTES saAttr;
  saAttr.nLength = sizeof(SECURITY_ATTRIBUTES);
  saAttr.bInheritHandle = TRUE;
  saAttr.lpSecurityDescriptor = NULL;
  // Create a pipe for the child process's STDOUT.
  const int PipeBufferSz = 32*1024;
  EAssertR(CreatePipe(&ZipStdoutRd, &ZipStdoutWr, &saAttr, PipeBufferSz), "Stdout pipe creation failed");
  // Ensure the read handle to the pipe for STDOUT is not inherited.
  SetHandleInformation(ZipStdoutRd, HANDLE_FLAG_INHERIT, 0);
  //CreateZipProcess(GetCmd(FNm), FNm);
  {
    const TStr CmdLine = TStr::Fmt("7z.exe l \"%s\"", ZipFNm.CStr());
    PROCESS_INFORMATION piProcInfo;
    STARTUPINFO siStartInfo;
    ZeroMemory( &piProcInfo, sizeof(PROCESS_INFORMATION));
    ZeroMemory( &siStartInfo, sizeof(STARTUPINFO));
    siStartInfo.cb = sizeof(STARTUPINFO);
    siStartInfo.hStdOutput = ZipStdoutWr;
    siStartInfo.dwFlags |= STARTF_USESTDHANDLES;
    // Create the child process.
    const BOOL FuncRetn = CreateProcess(NULL, (LPSTR) CmdLine.CStr(),
      NULL, NULL, TRUE, 0, NULL, NULL, &siStartInfo, &piProcInfo);
    // The parent never writes to the pipe: release its copy of the write end
    // so the handle is not leaked and the read below sees end-of-pipe once
    // the child exits.
    CloseHandle(ZipStdoutWr);
    if (FuncRetn == 0) { CloseHandle(ZipStdoutRd); } // nothing will be read
    EAssertR(FuncRetn!=0, TStr::Fmt("Can not execute '%s'", CmdLine.CStr()).CStr());
    CloseHandle(piProcInfo.hProcess);
    CloseHandle(piProcInfo.hThread);
  }
#else
  // NOTE(review): the file name is placed unquoted on a shell command line;
  // names with spaces or shell metacharacters will not work and untrusted
  // names are unsafe here -- confirm the callers only pass trusted paths.
  const TStr CmdLine = TStr::Fmt("7za l %s", ZipFNm.CStr());
  FILE* ZipStdoutRd = popen(CmdLine.CStr(), "r");
  if (ZipStdoutRd == NULL) { // try using SevenZipPath
    ZipStdoutRd = popen((TZipIn::SevenZipPath+"/"+CmdLine).CStr(), "r");
  }
  EAssertR(ZipStdoutRd != NULL, TStr::Fmt("Can not execute '%s'", CmdLine.CStr()).CStr());
#endif
  // Read output from the child process
  const int BfSz = 32*1024;
  // MxBfL is declared outside this function; clamp it so that the
  // terminating zero written below always fits into the buffer.
  const int MxReadLen = (int(MxBfL) < BfSz-1) ? int(MxBfL) : BfSz-1;
  char* Bf = new char [BfSz];
  memset(Bf, 0, BfSz);
#ifdef GLib_WIN
  DWORD BytesRead = 0;
  const bool ReadOk = (ReadFile(ZipStdoutRd, Bf, MxReadLen, &BytesRead, NULL) != 0);
  CloseHandle(ZipStdoutRd);
  const bool CloseOk = true;
#else
  const size_t BytesRead = fread(Bf, 1, MxReadLen, ZipStdoutRd);
  const bool ReadOk = (BytesRead != 0);
  const bool CloseOk = (pclose(ZipStdoutRd) != -1);
#endif
  const int BfL = (int) BytesRead;
  Bf[BfL] = 0;
  // Copy the text and release the buffer and the pipe BEFORE any assertion
  // can throw, so a failed read leaks neither of them.
  TStr Str(Bf);
  delete [] Bf;
  EAssert(ReadOk);
  EAssert(CloseOk);
  IAssert(BfL != 0);
  // find file length
  TStrV StrV; Str.SplitOnWs(StrV);
  int n = StrV.Len()-1;
  while (n > 0 && ! StrV[n].StartsWith("-----")) { n--; }
  if (n-7 <= 0) {
    WrNotify(TStr::Fmt("Corrupt file %s: MESSAGE:\n", ZipFNm.CStr()).CStr(), Str.CStr());
    SaveToErrLog(TStr::Fmt("Corrupt file %s. Message:\n:%s\n", ZipFNm.CStr(), Str.CStr()).CStr());
    return 0;
  }
  return StrV[n-7].GetInt64();
}
/// Registers the loader LoadF under the type name TypeNm in TypeToLoadFH.
/// A type name may be registered only once; a second registration stops in
/// IAssert. Always returns true.
bool TFtrGen::Reg(const TStr& TypeNm, const TFtrGenLoadF& LoadF) {
  // refuse duplicate registrations
  IAssert(!TypeToLoadFH.IsKey(TypeNm));
  TypeToLoadFH.AddDat(TypeNm, LoadF);
  return true;
}
/// Returns the index of the record at the current file position.
/// The position is measured from the end of the header (HdLen) and must lie
/// on a multiple of RecLen, otherwise EAssertR throws.
/// Requires record mode to be active (IAssert on RecAct).
int TFRnd::GetRecN() {
  IAssert(RecAct);
  // byte offset relative to the first record
  int FPos = GetFPos() - HdLen;
  EAssertR(FPos%RecLen==0, "Invalid position in file'"+FNm+"'.");
  const int RecN = FPos / RecLen;
  return RecN;
}
/// Returns an edge iterator positioned at the undirected edge between
/// SrcNId and DstNId. Stops in IAssert when the edge does not exist.
///
/// The edge is always looked up from the endpoint with the smaller id, so
/// GetEI(a, b) and GetEI(b, a) yield the same iterator for the same
/// undirected edge. Previously the iterator depended on the argument order.
/// NOTE(review): presumably this also matches the orientation in which
/// TEdgeI enumerates edges -- confirm against the TEdgeI increment operator.
TUNGraph::TEdgeI TUNGraph::GetEI(const int& SrcNId, const int& DstNId) const {
  const int MnNId = (SrcNId < DstNId) ? SrcNId : DstNId;
  const int MxNId = (SrcNId < DstNId) ? DstNId : SrcNId;
  const TNodeI SrcNI = GetNI(MnNId);
  // position of the larger endpoint within the neighbour vector of the smaller one
  const int NodeN = SrcNI.NodeHI.GetDat().NIdV.SearchBin(MxNId);
  IAssert(NodeN != -1);
  return TEdgeI(SrcNI, EndNI(), NodeN);
}
/////////////////////////////////////////////////
// Roget-Base

// Walks all pages of the web-base stored at WebBaseFPath and parses those
// whose URL contains "RG.sh".
//
// For each such page the HTML body is tokenized and read with a running
// token index (TokN):
//   1. tokens are skipped up to the first <h2> begin tag;
//   2. the heading is parsed as a number, an optional 'A' or 'B', and a '.';
//      the remaining heading tokens up to </h2> are appended to the category
//      name, with any "[...]" part collected separately and appended last;
//   3. the category name is reported through TNotify::OnNotify;
//   4. the rest of the page is scanned word by word until the token "PHR" or
//      the end of the token list.
//
// NOTE(review): Tok, CtgIdNm, PgStr and the collected WordStr are never used
// after being built; the function's only visible output is the notification.
// NOTE(review): the do/while loops that search for a tag or a closing
// symbol do not check TokN against GetToks() -- confirm how GetTok behaves
// when the index runs past the last token.
void TRBase::LoadArtfl(const TStr& WebBaseFPath){
  PWebBase WebBase=PWebBase(new TWebMemBase(WebBaseFPath));
  int WebPgP=WebBase->FFirstWebPg(); int WebPgId;
  while (WebBase->FNextWebPg(WebPgP, WebPgId)){
    TStr UrlStr=WebBase->GetUrlStr(WebPgId);
    static TStr RgShStr="RG.sh";
    // only pages whose URL contains "RG.sh" are parsed
    if (!UrlStr.IsStrIn(RgShStr)){continue;}
//    if (!UrlStr.IsStrIn("RG.sh?^544\\")){continue;}
    PWebPg WebPg=WebBase->GetWebPg(WebPgId);
    PSIn SIn=TStrIn::New(WebPg->GetBodyStr());
    PHtmlDoc HtmlDoc=THtmlDoc::New(SIn, hdtAll);
    // TokN is the index of the next token to fetch; Sym/Str hold the current token
    int TokN=0; PHtmlTok Tok; THtmlLxSym Sym; TStr Str;
    // move to <h2>
    do {HtmlDoc->GetTok(TokN++, Sym, Str);
    } while (!((Sym==hlsyBTag)&&(Str==THtmlTok::H2TagNm)));
    // parse "ddd[A|B]." : number, optional letter A or B, then a period
    TChA CtgNm; TChA CtgIdNm;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    IAssert(Sym==hlsyNum); CtgNm+=Str; CtgIdNm+=Str;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    if (Sym==hlsyStr){
      IAssert((Str=='A')||(Str=='B'));
      CtgNm+=Str; CtgIdNm+=Str;
      HtmlDoc->GetTok(TokN++, Sym, Str);
    }
    IAssert((Sym==hlsySSym)&&(Str=='.')); CtgNm+=Str;
    // parse to </h2>; text inside [...] is gathered in BracketStr and
    // appended after the rest of the heading
    TChA BracketStr;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    while (!((Sym==hlsyETag)&&(Str==THtmlTok::H2TagNm))){
      if ((Sym==hlsySSym)&&(Str=='[')){
        HtmlDoc->GetTok(TokN++, Sym, Str);
        while (!((Sym==hlsySSym)&&(Str==']'))){
          // separate consecutive words inside the brackets with one space
          if ((!BracketStr.Empty())&&(Sym==hlsyStr)){BracketStr+=' ';}
          BracketStr+=Str; HtmlDoc->GetTok(TokN++, Sym, Str);
        }
        BracketStr.Ins(0, " ["); BracketStr+=']';
      } else {
        if (Sym==hlsyStr){CtgNm+=' ';}
        CtgNm+=Str;
      }
      HtmlDoc->GetTok(TokN++, Sym, Str);
    }
    CtgNm+=BracketStr;
    TNotify::OnNotify(Notify, ntInfo, CtgNm);
    // parse words
    // marker tokens that are not themselves treated as words
    static TStr AdjStr="ADJ"; static TStr AdvStr="ADV";
    static TStr IntStr="INT"; static TStr PgStr="PAGE";
    static TStr PhrStr="PHR"; static TStr PrefStr="PREF";
    static TStr PronStr="PRON";
    HtmlDoc->GetTok(TokN++, Sym, Str);
    // the word section must start with the marker 'N' or "ADV"
    IAssert((Sym==hlsyStr)&&((Str=='N')||(Str==AdvStr)));
    while (TokN<HtmlDoc->GetToks()){
      if (Sym==hlsyStr){
        // "PHR" ends the scanned part of the page
        if (Str==PhrStr){break;}
        if ((Str!='N')&&(Str!='V')&&(Str!=AdjStr)&&(Str!=AdvStr)&&
         (Str!=IntStr)&&(Str!=PrefStr)&&(Str!=PronStr)){
          // a run of consecutive string tokens forms one (multi-word) entry
          TChA WordStr;
          do {
            if (!WordStr.Empty()){WordStr+=' ';}
            WordStr+=Str;
            HtmlDoc->GetTok(TokN++, Sym, Str);
          } while (Sym==hlsyStr);
//          TNotify::OnNotify(Notify, ntInfo, WordStr);
        } else {
          // marker token: skip it
          HtmlDoc->GetTok(TokN++, Sym, Str);
        }
      } else
      if (Sym==hlsySSym){
        // an opening bracket or quote: skip everything up to its closing partner
        TStr ExpectStr;
        if (Str=='('){ExpectStr=')';}
        else if (Str=='['){ExpectStr=']';}
        else if (Str=='{'){ExpectStr='}';}
        else if (Str=='"'){ExpectStr='"';}
        if (!ExpectStr.Empty()){
          do {HtmlDoc->GetTok(TokN++, Sym, Str);
          } while (!((Sym==hlsySSym)&&(Str==ExpectStr)));
        }
        HtmlDoc->GetTok(TokN++, Sym, Str);
      } else {
        // any other token kind is skipped
        HtmlDoc->GetTok(TokN++, Sym, Str);
      }
    }
  }
}
// Loads a transaction text file into a new bag-of-words document base.
//
// The file is read as a sequence of records "<transaction-number>
// <item-number>", where the two numbers are separated by spaces or tabs and
// anything that is not a digit is skipped between records. Each transaction
// number becomes a document (named by its decimal string) and each item
// number becomes a word; a document's sparse vector holds, per word, the
// number of records that named it.
//
// FNm    ... name of the input file
// MxDocs ... maximal number of documents to load, or -1 for no limit
// Returns the filled document base (after BowDocBs->AssertOk()).
//
// NOTE(review): a document is flushed when the transaction number changes,
// and IAssert(NewDId==CurDId) then requires the flushed vector to land at
// index CurDId -- this appears to assume that records of one transaction are
// contiguous in the file; confirm.
// NOTE(review): the number-reading loops call GetCh() without testing Eof(),
// so a file ending directly after the last digit may read past the end --
// confirm the behaviour of GetCh() at end of input.
PBowDocBs TBowFl::LoadTsactTxt(const TStr& FNm, const int& MxDocs){
  // prepare document set
  PBowDocBs BowDocBs=TBowDocBs::New();
  // open file
  PSIn SIn=TFIn::New(FNm);
  printf("Loading '%s' ...\n", FNm.CStr());
  if (!SIn->Eof()){
    // current document: its id and the word-id -> frequency table being built
    int CurDId=-1; TIntH CurDocWIdToFqH(100);
    // read first character
    char Ch=SIn->GetCh();
    // skip to the first digit or eof
    while ((!SIn->Eof())&&(!(('0'<=Ch)&&(Ch<='9')))){
      Ch=SIn->GetCh();}
    while (!SIn->Eof()){
      // notify (progress line, rewritten in place via '\r')
      if (BowDocBs->DocSpVV.Len()%1000==0){
        printf("%d transactions read\r", BowDocBs->DocSpVV.Len());}
      // read transaction number
      IAssert(('0'<=Ch)&&(Ch<='9'));
      int TsactN=Ch-'0'; Ch=SIn->GetCh();
      while (('0'<=Ch)&&(Ch<='9')){
        TsactN=TsactN*10+Ch-'0'; Ch=SIn->GetCh();}
      // skip space (only blanks and tabs may separate the two numbers)
      while (!(('0'<=Ch)&&(Ch<='9'))){
        IAssert((Ch==' ')||(Ch=='\t')); Ch=SIn->GetCh();}
      // read item number
      IAssert(('0'<=Ch)&&(Ch<='9'));
      int ItemN=Ch-'0'; Ch=SIn->GetCh();
      while (('0'<=Ch)&&(Ch<='9')){
        ItemN=ItemN*10+Ch-'0'; Ch=SIn->GetCh();}
      // skip to the next digit or eof
      while ((!SIn->Eof())&&(!(('0'<=Ch)&&(Ch<='9')))){
        Ch=SIn->GetCh();}
      // get document-id from transaction-id
      TStr DocNm=TInt::GetStr(TsactN);
      int DId=BowDocBs->DocNmToDescStrH.AddKey(DocNm);
      // get word-id from item-id
      TStr WordStr=TInt::GetStr(ItemN);
      int WId=BowDocBs->WordStrToDescH.AddKey(WordStr);
      BowDocBs->WordStrToDescH[WId].Fq++;
      // add word to document
      if (CurDId!=DId){
        // the transaction changed: flush the finished document first
        if (CurDId!=-1){
          // stop early when the limit is reached; the pending document is
          // still stored by the "save last document" step below
          if ((MxDocs!=-1)&&(BowDocBs->GetDocs()>=MxDocs-1)){break;}
          PBowSpV NewSpV=TBowSpV::New(CurDId, CurDocWIdToFqH.Len());
          int NewDId=BowDocBs->DocSpVV.Add(NewSpV);
          IAssert(NewDId==CurDId);
          for (int DocWIdN=0; DocWIdN<CurDocWIdToFqH.Len(); DocWIdN++){
            int WId=CurDocWIdToFqH.GetKey(DocWIdN);
            int WordFq=CurDocWIdToFqH[DocWIdN];
            NewSpV->AddWIdWgt(WId, WordFq);
          }
          NewSpV->Sort();
        }
        CurDId=DId; CurDocWIdToFqH.Clr(false);
      }
      // count this item for the current document
      CurDocWIdToFqH.AddDat(WId)++;
    }
    // save last document
    if (CurDId!=-1){
      PBowSpV NewSpV=TBowSpV::New(CurDId, CurDocWIdToFqH.Len());
      BowDocBs->DocSpVV.Add(NewSpV);
      for (int DocWIdN=0; DocWIdN<CurDocWIdToFqH.Len(); DocWIdN++){
        int WId=CurDocWIdToFqH.GetKey(DocWIdN);
        int WordFq=CurDocWIdToFqH[DocWIdN];
        NewSpV->AddWIdWgt(WId, WordFq);
      }
      NewSpV->Sort();
    }
    printf("%d transactions read\n", BowDocBs->DocSpVV.Len());
  }
  printf("... Done.\n");
  // return results
  BowDocBs->AssertOk();
  return BowDocBs;
}
// Loads a data set in SVM-light text form into a new bag-of-words document
// base with two categories, "-1" and "+1".
//
// DocDefFNm    ... optional document-definition file; used only if the name
//                  is non-empty and the file exists
// WordDefFNm   ... optional word-definition file; same condition
// TrainDataFNm ... optional training data; its documents go to TrainDIdV
// TestDataFNm  ... optional test data; its documents go to TestDIdV
// MxDocs       ... currently ignored (see TODO below)
//
// Data lines are read as "<class> <word-id>:<weight> ..."; a class value of
// -1 selects category "-1", every other value selects "+1". A trailing
// comment of the form "docDesc=N" renames the document to "D" followed by N.
// Word ids up to the largest one seen that are still unknown afterwards are
// added under the name "W<id>".
//
// Returns the filled document base (after BowDocBs->AssertOk()).
// Throws via EAssertR when ids in a definition file do not match the ids
// assigned while loading.
PBowDocBs TBowFl::LoadSvmLightTxt(
 const TStr& DocDefFNm, const TStr& WordDefFNm,
 const TStr& TrainDataFNm, const TStr& TestDataFNm,
 const int& MxDocs){
  //TODO: use MxDocs
  // prepare document set
  PBowDocBs BowDocBs=TBowDocBs::New();
  int MOneCId=BowDocBs->CatNmToFqH.AddKey("-1");
  int POneCId=BowDocBs->CatNmToFqH.AddKey("+1");
  // document definition
  bool DocDefP=false;
  if (!DocDefFNm.Empty()&&(TFile::Exists(DocDefFNm))){
    // file format as parsed below: (DId : "DocNm" <eoln>)*
    PSIn SIn=TFIn::New(DocDefFNm);
    TILx Lx(SIn, TFSet()|iloRetEoln|iloSigNum|iloExcept);
    Lx.GetSym(syInt, syEof);
    while (Lx.Sym==syInt){
      int DId=Lx.Int;
      Lx.GetSym(syColon);
      Lx.GetSym(syQStr); TStr DocNm=Lx.Str;
      Lx.GetSym(syEoln);
      Lx.GetSym(syInt, syEof);
      // the id assigned on insertion must equal the id given in the file
      int NewDId=BowDocBs->DocNmToDescStrH.AddKey(DocNm);
      EAssertR(DId==NewDId, "Document-Ids don't match.");
    }
    DocDefP=true;
  }
  // word definition
  if (!WordDefFNm.Empty()&&(TFile::Exists(WordDefFNm))){
    BowDocBs->WordStrToDescH.AddDat("Undef").Fq=0; // ... to have WId==0
    // file format as parsed below: ("WordStr" WId WordFq <eoln>)*
    PSIn SIn=TFIn::New(WordDefFNm);
    TILx Lx(SIn, TFSet()|iloRetEoln|iloSigNum|iloExcept);
    Lx.GetSym(syQStr, syEof);
    while (Lx.Sym==syQStr){
      TStr WordStr=Lx.Str;
      Lx.GetSym(syInt); int WId=Lx.Int;
      Lx.GetSym(syInt); int WordFq=Lx.Int;
      Lx.GetSym(syEoln);
      Lx.GetSym(syQStr, syEof);
      int NewWId=BowDocBs->WordStrToDescH.AddKey(WordStr);
      EAssertR(WId==NewWId, "Word-Ids don't match.");
      BowDocBs->WordStrToDescH[WId].Fq=WordFq;
    }
  }
  // train & test data
  // MxWId ... largest word id seen; WIdToFqH ... number of occurrences per word id
  int MxWId=-1; TIntIntH WIdToFqH;
  // train data
  if (!TrainDataFNm.Empty()){
    PSIn SIn=TFIn::New(TrainDataFNm);
    TILx Lx(SIn, TFSet()|iloCmtAlw|iloRetEoln|iloSigNum|iloExcept);
    // skip comment lines
    while (Lx.GetSym(syInt, syEoln, syEof)==syEoln){}
    // parse data lines
    while (Lx.Sym==syInt){
      // document (named by the current number of documents)
      TStr DocNm=TInt::GetStr(BowDocBs->GetDocs());
      int DId;
      if (DocDefP){
        DId=BowDocBs->DocNmToDescStrH.GetKeyId(DocNm);
      } else {
        DId=BowDocBs->DocNmToDescStrH.AddKey(DocNm);
      }
      BowDocBs->TrainDIdV.Add(DId);
      // category (class value)
      int CId=(Lx.Int==-1) ? MOneCId : POneCId;
      BowDocBs->DocCIdVV.Add();
      IAssert(DId==BowDocBs->DocCIdVV.Len()-1);
      BowDocBs->DocCIdVV.Last().Gen(1, 0);
      BowDocBs->DocCIdVV.Last().Add(CId);
      // words (attributes)
      PBowSpV SpV=TBowSpV::New(DId);
      BowDocBs->DocSpVV.Add(SpV);
      IAssert(DId==BowDocBs->DocSpVV.Len()-1);
      Lx.GetSym(syInt, syEoln);
      while (Lx.Sym==syInt){
        int WId=Lx.Int;
        Lx.GetSym(syColon);
        Lx.GetSym(syFlt); double WordFq=Lx.Flt;
        Lx.GetSym(syInt, syEoln);
        SpV->AddWIdWgt(WId, WordFq);
        if (MxWId==-1){MxWId=WId;} else {MxWId=TInt::GetMx(MxWId, WId);}
        WIdToFqH.AddDat(WId)++;
      }
      if (!Lx.CmtStr.Empty()){
        // change document name to 'N' if comment 'docDesc=N'
        TStr CmtStr=Lx.CmtStr;
        static TStr DocNmPrefixStr="docDesc=";
        if (CmtStr.IsPrefix(DocNmPrefixStr)){
          TStr NewDocNm=
           TStr("D")+CmtStr.GetSubStr(DocNmPrefixStr.Len(), CmtStr.Len()-1);
          // the renamed document must keep the same id
          BowDocBs->DocNmToDescStrH.DelKey(DocNm);
          int NewDId=BowDocBs->DocNmToDescStrH.AddKey(NewDocNm);
          IAssert(DId==NewDId);
        }
      }
      SpV->Trunc();
      // skip empty/comment lines up to the next data line or eof
      while (Lx.GetSym(syInt, syEoln, syEof)==syEoln){}
    }
  }
  // test data (parsed exactly like the train data; ids go to TestDIdV)
  if (!TestDataFNm.Empty()){
    PSIn SIn=TFIn::New(TestDataFNm);
    TILx Lx(SIn, TFSet()|iloCmtAlw|iloRetEoln|iloSigNum|iloExcept);
    while (Lx.GetSym(syInt, syEoln, syEof)==syEoln){}
    while (Lx.Sym==syInt){
      // document
      TStr DocNm=TInt::GetStr(BowDocBs->GetDocs());
      int DId;
      if (DocDefP){
        DId=BowDocBs->DocNmToDescStrH.GetKeyId(DocNm);
      } else {
        DId=BowDocBs->DocNmToDescStrH.AddKey(DocNm);
      }
      BowDocBs->TestDIdV.Add(DId);
      // category (class value)
      int CId=(Lx.Int==-1) ? MOneCId : POneCId;
      BowDocBs->DocCIdVV.Add();
      IAssert(DId==BowDocBs->DocCIdVV.Len()-1);
      BowDocBs->DocCIdVV.Last().Gen(1, 0);
      BowDocBs->DocCIdVV.Last().Add(CId);
      // words (attributes)
      PBowSpV SpV=TBowSpV::New(DId);
      BowDocBs->DocSpVV.Add(SpV);
      IAssert(DId==BowDocBs->DocSpVV.Len()-1);
      Lx.GetSym(syInt, syEoln);
      while (Lx.Sym==syInt){
        int WId=Lx.Int;
        Lx.GetSym(syColon);
        Lx.GetSym(syFlt); double WordFq=Lx.Flt;
        Lx.GetSym(syInt, syEoln);
        SpV->AddWIdWgt(WId, WordFq);
        if (MxWId==-1){MxWId=WId;} else {MxWId=TInt::GetMx(MxWId, WId);}
        WIdToFqH.AddDat(WId)++;
      }
      if (!Lx.CmtStr.Empty()){
        // change document name to 'N' if comment 'docDesc=N'
        TStr CmtStr=Lx.CmtStr;
        static TStr DocNmPrefixStr="docDesc=";
        if (CmtStr.IsPrefix(DocNmPrefixStr)){
          TStr NewDocNm=
           TStr("D")+CmtStr.GetSubStr(DocNmPrefixStr.Len(), CmtStr.Len()-1);
          BowDocBs->DocNmToDescStrH.DelKey(DocNm);
          int NewDId=BowDocBs->DocNmToDescStrH.AddKey(NewDocNm);
          IAssert(DId==NewDId);
        }
      }
      SpV->Trunc();
      while (Lx.GetSym(syInt, syEoln, syEof)==syEoln){}
    }
  }
  // add missing words: every id in 0..MxWId gets a word, named "W<id>"
  for (int WId=0; WId<=MxWId; WId++){
    if (!BowDocBs->IsWId(WId)){
      TStr WordStr=TInt::GetStr(WId, "W%d");
      int _WId=BowDocBs->AddWordStr(WordStr);
      IAssert(WId==_WId);
      TInt Fq;
      if (WIdToFqH.IsKeyGetDat(WId, Fq)){
        BowDocBs->PutWordFq(WId, Fq);
      }
    }
  }
  BowDocBs->AssertOk();
  return BowDocBs;
}
/// Returns the number of seconds from SecTm1 to SecTm2 (SecTm2 - SecTm1).
/// Both times must be defined (IAssert).
///
/// The result type is unsigned. When SecTm2 precedes SecTm1 the negative
/// difference wraps modulo 2^N (N = bit width of uint); callers that need a
/// magnitude should pass the earlier time first.
uint TSecTm::GetDSecs(const TSecTm& SecTm1, const TSecTm& SecTm2){
  IAssert(SecTm1.IsDef()&&SecTm2.IsDef());
  const time_t Time1= time_t(SecTm1.AbsSecs());
  const time_t Time2= time_t(SecTm2.AbsSecs());
  const double DSecs = difftime(Time2, Time1);
  // Converting a negative double straight to an unsigned type is undefined
  // behaviour; go through a signed integer first so the wrap-around is
  // well defined.
  return uint(static_cast<long long>(DSecs));
}
/// Writes the absolute number of seconds to Lx as a single integer.
/// NOTE(review): the assertion compares the value after conversion to int,
/// so it can only fail when the converted value equals TInt::Mx -- confirm
/// whether larger unsigned values (e.g. an undefined time) are meant to pass.
void TSecTm::SaveTxt(TOLx& Lx) const {
  IAssert(int(AbsSecs) < TInt::Mx);
  const int SecsAsInt = (int)AbsSecs;
  Lx.PutInt(SecsAsInt);
}
/// Returns the seconds field (tm_sec) of this time's broken-down form.
/// Stops via IAssert when the time is undefined or cannot be broken down.
int TSecTm::GetSecN() const {
  struct tm Tm;
  IAssert(IsDef() && GetTmStruct(AbsSecs(), Tm));
  const int SecN = Tm.tm_sec;
  return SecN;
}
/// Returns the name of the day of the week of this time, as produced by
/// TTmInfo::GetDayOfWeekNm for the language/locale selector Loc.
/// Stops via IAssert when the time is undefined or cannot be broken down.
TStr TSecTm::GetDayOfWeekNm(const TLoc& Loc) const {
  struct tm Tm;
  IAssert(IsDef() && GetTmStruct(AbsSecs(), Tm));
  // tm_wday counts days since Sunday (0..6); the lookup is given 1..7.
  const int DayOfWeekN = Tm.tm_wday + 1;
  return TTmInfo::GetDayOfWeekNm(DayOfWeekN, Loc);
}
/// Returns the day of the week of this time as a number in 1..7, where 1 is
/// Sunday (tm_wday shifted by one).
/// Stops via IAssert when the time is undefined or cannot be broken down.
int TSecTm::GetDayOfWeekN() const {
  struct tm Tm;
  IAssert(IsDef() && GetTmStruct(AbsSecs(), Tm));
  const int DaysSinceSunday = Tm.tm_wday;
  return DaysSinceSunday + 1;
}
/// R-MAT Generator. The modes is based on the recursive descent into a 2x2 /// matrix [A,B; C, 1-(A+B+C)]. /// See: R-MAT Generator: A Recursive Model for Graph Mining. /// D. Chakrabarti, Y. Zhan and C. Faloutsos, in SIAM Data Mining 2004. /// URL: http://www.cs.cmu.edu/~deepay/mywww/papers/siam04.pdf PNGraph GenRMat(const int& Nodes, const int& Edges, const double& A, const double& B, const double& C, TRnd& Rnd) { PNGraph GraphPt = TNGraph::New(); TNGraph& Graph = *GraphPt; Graph.Reserve(Nodes, Edges); IAssert(A+B+C < 1.0); int rngX, rngY, offX, offY; int Depth=0, Collisions=0, Cnt=0, PctDone=0; const int EdgeGap = Edges / 100 + 1; // sum of parameters (probabilities) TVec<double> sumA(128, 0), sumAB(128, 0), sumAC(128, 0), sumABC(128, 0); // up to 2^128 vertices ~ 3.4e38 for (int i = 0; i < 128; i++) { const double a = A * (Rnd.GetUniDev() + 0.5); const double b = B * (Rnd.GetUniDev() + 0.5); const double c = C * (Rnd.GetUniDev() + 0.5); const double d = (1.0 - (A+B+C)) * (Rnd.GetUniDev() + 0.5); const double abcd = a+b+c+d; sumA.Add(a / abcd); sumAB.Add((a+b) / abcd); sumAC.Add((a+c) / abcd); sumABC.Add((a+b+c) / abcd); } // nodes for (int node = 0; node < Nodes; node++) { IAssert(Graph.AddNode(-1) == node); } // edges for (int edge = 0; edge < Edges; ) { rngX = Nodes; rngY = Nodes; offX = 0; offY = 0; Depth = 0; // recurse the matrix while (rngX > 1 || rngY > 1) { const double RndProb = Rnd.GetUniDev(); if (rngX>1 && rngY>1) { if (RndProb < sumA[Depth]) { rngX/=2; rngY/=2; } else if (RndProb < sumAB[Depth]) { offX+=rngX/2; rngX-=rngX/2; rngY/=2; } else if (RndProb < sumABC[Depth]) { offY+=rngY/2; rngX/=2; rngY-=rngY/2; } else { offX+=rngX/2; offY+=rngY/2; rngX-=rngX/2; rngY-=rngY/2; } } else if (rngX>1) { // row vector if (RndProb < sumAC[Depth]) { rngX/=2; rngY/=2; } else { offX+=rngX/2; rngX-=rngX/2; rngY/=2; } } else if (rngY>1) { // column vector if (RndProb < sumAB[Depth]) { rngX/=2; rngY/=2; } else { offY+=rngY/2; rngX/=2; rngY-=rngY/2; } } else { Fail; 
} Depth++; } // add edge const int NId1 = offX; const int NId2 = offY; if (NId1 != NId2 && ! Graph.IsEdge(NId1, NId2)) { Graph.AddEdge(NId1, NId2); if (++Cnt > EdgeGap) { Cnt=0; printf("\r %d%% edges", ++PctDone); } edge++; } else { Collisions++; } } printf("\r RMat: nodes:%d, edges:%d, Iterations:%d, Collisions:%d (%.1f%%).\n", Nodes, Edges, Edges+Collisions, Collisions, 100*Collisions/double(Edges+Collisions)); Graph.Defrag(); return GraphPt; }
//////////////////////////////////////////////////////////////////////////
// Partial-Gram-Schmidt

// Builds the partial decomposition for the vectors of BigSet.
//
// BigSet ... the vectors; GetNorm2(i) and DotProduct(i, j) are queried
// Dim    ... maximal number of rows of R to compute; must be <= BigSet->Len()
// Eps    ... stop once the largest remaining residual norm drops below Eps;
//            must lie in [0, 1)
//
// In every step the not-yet-chosen vector with the largest residual norm is
// selected as pivot, IdV is permuted accordingly, one row of R is computed
// and the residual norms of the remaining vectors are reduced.
//
// Members written: R (one row per step, row j has Len-j entries), IdV (the
// permutation), NormV (final residual norms in permuted order) and VecNormV
// (GetKernel(i,i) per position).
//
// NOTE(review): _isnan is a compiler-specific function -- confirm it is
// available on every target this file is built for.
// NOTE(review): with an empty BigSet the final printf reads NiV[0] although
// NiV has no elements -- confirm empty input cannot occur.
TPartialGS::TPartialGS(PSVMTrainSet BigSet, const int& Dim, const double& Eps) {
    IAssert(Dim <= BigSet->Len() && 0.0 <= Eps && Eps < 1.0);
    int Len = BigSet->Len();

    // NiV[i].Key ... residual norm of vector i; NiV[i].Dat ... already chosen as pivot
    TVec<TKeyDat<TFlt, TBool> > NiV(Len);
    for (int i = 0; i < Len; i++) {
        //NiV[i].Key = BigSet->DotProduct(i, i);
        NiV[i].Key = BigSet->GetNorm2(i);
        NiV[i].Dat = false;
        // every vector must have a positive, valid norm
        IAssertR(NiV[i].Key.Val > 0.0 && _isnan(NiV[i].Key.Val) == 0,
                 TInt::GetStr(i) + TStr(":") + TFlt::GetStr(NiV[i].Key));
    }

    R.Gen(Dim, 0);
    //for (i = 0; i < Dim; i++) R[i].Gen(Len-i);

    // IdV starts as the identity permutation
    IdV.Gen(Len);
    for (int i = 0; i < Len; i++) IdV[i] = i;

    TFltV BlufV(Dim, 0); int max = -1;
    for (int j = 0; j < Dim; j++) {
        // find element with biggest residual norm among those not chosen yet
        max = -1;
        for (int t = 0, l = Len; t < l; t++)
            if (!NiV[t].Dat && (max == -1 || NiV[t].Key > NiV[max].Key)) max = t;

        // if max residual norm is reached
        if (NiV[max].Key.Val < Eps) break;
        //printf("(%.2f)", NiV[max].Key.Val);

        // permute j-th and max-th column of R
        NiV[max].Dat = true;
        int mid = IdV.SearchForw(max, j);
        { int tmp = IdV[j]; IdV[j] = max; IdV[mid] = tmp; }
        // row t of R stores columns t..Len-1, hence the index shift by t
        for (int t = 0; t < j; t++) {
            double tmp = R[t][j-t];
            R[t][j-t] = R[t][mid-t];
            R[t][mid-t] = tmp;
        }

        // calculate j-th row of R and update NiV (residual norms)
        // tiny negative residuals are clamped to zero before the square root
        if (-0.001 < NiV[max].Key.Val && NiV[max].Key.Val < 0) NiV[max].Key.Val = 0.0;
        IAssertR(NiV[max].Key.Val >= 0.0, TInt::GetStr(j) + TStr(":") + TFlt::GetStr(NiV[max].Key.Val));
        IAssert(R.Len() == j);
        R.Add(TFltV()); R[j].Gen(Len-j); // NEW
        R[j][0] = sqrt(NiV[max].Key.Val);
        BlufV.Add(NiV[IdV[j]].Key.Val);
        for (int i = j+1; i < Len; i++) {
            double RR = BigSet->DotProduct(IdV[i], IdV[j]);
            // subtract the contribution of the rows computed so far
            for (int t = 0; t < j; t++)
                RR -= R[t][j-t] * R[t][i-t];
            IAssertR(NiV[IdV[j]].Key.Val>0, TInt::GetStr(i));
            RR /= sqrt(NiV[IdV[j]].Key.Val);
            IAssertR(_isnan(RR) == 0, TInt::GetStr(IdV[j]) + TStr(":") + TFlt::GetStr(NiV[IdV[j]].Key.Val));
            R[j][i-j] = RR;
            NiV[IdV[i]].Key -= RR*RR;
        }
    }

    // max stays -1 only when the loop body never ran (Dim == 0)
    if (max == -1) max = 0;
    printf("stoped at %d/%d with residual norm %.3f\n", R.Len(), BigSet->Len(), NiV[max].Key.Val);

    NormV.Gen(Len); VecNormV.Gen(Len);
    for (int i = 0; i < Len; i++) {
        NormV[i] = NiV[IdV[i]].Key;
        VecNormV[i] = GetKernel(i,i);
    }
}
/// Assigns the character type ChTy to the character Ch in the table ChTyV.
/// The table is indexed by Ch-TCh::Mn, and the entry must still be
/// hpctUndef, i.e. a character may be classified only once (IAssert).
void THttpChDef::SetChTy(const THttpChTy& ChTy, const char& Ch) {
  IAssert(ChTyV[Ch-TCh::Mn]==int(hpctUndef));
  const int ChTyN = Ch - TCh::Mn;
  ChTyV[ChTyN] = TInt(ChTy);
}
/// Advances the lexer until the current character is SepCh or end-of-file
/// and returns the underlying input stream.
/// No pushed-back symbols may be pending (IAssert on PrevSymStStack).
PSIn TILx::GetSIn(const char& SepCh) {
  IAssert(PrevSymStStack.Empty());
  for (;;) {
    if ((Ch == TCh::EofCh) || (Ch == SepCh)) { break; }
    GetCh();
  }
  return SIn;
}
int main(int argc, char* argv[]){ Try; // create environment Env=TEnv(argc, argv, TNotify::StdNotify); // get command line parameters Env.PrepArgs("Crawl-Base to Text", 0); TStr InCrawlBsFNm=Env.GetIfArgPrefixStr("-i:", "", "Crawl-Base-FileName"); TStr OutTxtFNm=Env.GetIfArgPrefixStr("-ot:", "Crawl.Txt", "Output-Text-Filename"); TStr OutStatFNm=Env.GetIfArgPrefixStr("-os:", "Crawl.Stat.Txt", "Output-Statistics-Text-Filename"); bool SaveContP=Env.GetIfArgPrefixBool("-sc:", false, "Save-Content"); bool SaveContOutUrlP=Env.GetIfArgPrefixBool("-scou:", true, "Save-Content-Outgoing-Urls"); bool SaveContTagP=Env.GetIfArgPrefixBool("-sct:", true, "Save-Content-Tags"); bool SaveOutUrlP=Env.GetIfArgPrefixBool("-sou:", false, "Save-Outgoing-Urls"); bool SaveCTxtP=Env.GetIfArgPrefixBool("-sctc:", false, "Save-Continuos-Text-Content"); int MnCTxtToks=Env.GetIfArgPrefixInt("-mctt:", 100, "Minimal-Continuos-Text-Tokens"); TStrV BlockedDmNmV=Env.GetIfArgPrefixStrV("-bd:", "Blocked-Domain-Names (multiple)"); if (Env.IsEndOfRun()){return 0;} // -i:si -sc:t -scou:n -sct:n -sctc:t -bd:.delo.si -bd:.dnevnik.si -bd:.vecer.si TStr BlobBsFMid=InCrawlBsFNm.GetFMid(); // output file TFOut TxtFOut(OutTxtFNm); FILE* fTxt=TxtFOut.GetFileId(); fprintf(fTxt, "Comment:input=%s\n", InCrawlBsFNm.CStr()); fprintf(fTxt, "Comment:output=%s\n", OutTxtFNm.CStr()); fprintf(fTxt, "BlobBaseName:%s\n", BlobBsFMid.CStr()); // statistics TStrIntH HostNmToFqH; TStrIntH StatusCdToFqH; TStrIntH ContTypeToFqH; PMom HttpContLenMom=TMom::New(); PBlobBs CrawlBBs=TMBlobBs::New(InCrawlBsFNm); TBlobPt TrvCrawlBPt=CrawlBBs->FFirstBlobPt(); TBlobPt CrawlBPt; PSIn CrawlBlobSIn; int CrawlBlobN=0; while (CrawlBBs->FNextBlobPt(TrvCrawlBPt, CrawlBPt, CrawlBlobSIn)){ CrawlBlobN++; printf("%d\r", CrawlBlobN); TStr DateTimeStr(*CrawlBlobSIn); //TStr DateTimeStr; TStr UrlStr(*CrawlBlobSIn); PUrl Url=TUrl::New(UrlStr); IAssert(Url->IsOk(usHttp)); TMem HttpRespMem(*CrawlBlobSIn); PSIn HttpRespSIn=HttpRespMem.GetSIn(); PHttpResp 
HttpResp=THttpResp::New(HttpRespSIn); // statistics HostNmToFqH.AddDat(Url->GetHostNm())++; StatusCdToFqH.AddDat(TInt::GetStr(HttpResp->GetStatusCd()))++; ContTypeToFqH.AddDat(HttpResp->GetFldVal(THttp::ContTypeFldNm))++; int ContLen=HttpResp->GetFldVal(THttp::ContLenFldNm).GetInt(-1); if (ContLen!=-1){ HttpContLenMom->Add(ContLen);} // check blocked domain-names if (!BlockedDmNmV.Empty()){ TStr DmNm=Url->GetDmNm(); int BlockedDmP=false; for (int BDmNmN=0; BDmNmN<BlockedDmNmV.Len(); BDmNmN++){ if (DmNm.IsSuffix(BlockedDmNmV[BDmNmN])){ BlockedDmP=true; break; } } if (BlockedDmP){ continue; } } // check continuos-text if (SaveCTxtP&&IsCTxtHttpResp(Url, HttpResp, MnCTxtToks)){continue;} if (HttpResp->IsStatusCd_Ok()){ PWebPg WebPg=TWebPg::New(UrlStr, HttpResp); fprintf(fTxt, "Start:HttpOk\n"); fprintf(fTxt, "BlobBaseAddress:bb://%s/%d/%d\n", BlobBsFMid.CStr(), CrawlBPt.GetSeg(), CrawlBPt.GetAddr()); fprintf(fTxt, "DateTime:%s\n", DateTimeStr.CStr()); fprintf(fTxt, "Url:%s\n", UrlStr.CStr()); fprintf(fTxt, "UrlMd5:%s\n", TMd5Sig(UrlStr).GetStr().CStr()); fprintf(fTxt, "AtomName:%s\n", TUrl::GetTopDownDocNm(UrlStr).CStr()); for (int FldN=0; FldN<HttpResp->GetFlds(); FldN++){ TStr FldNm; TStr FldVal; HttpResp->GetFldNmVal(FldN, FldNm, FldVal); fprintf(fTxt, "HttpField:%s=%s\n", FldNm.CStr(), FldVal.CStr()); } TMem BodyMem=HttpResp->GetBodyAsMem(); fprintf(fTxt, "BodyMd5:%s\n", TMd5Sig(BodyMem).GetStr().CStr()); // text if (SaveContP){ if (HttpResp->IsContType(THttp::TextHtmlFldVal)){ TStr HtmlStr=BodyMem.GetAsStr(); TStr TxtStr=THtmlDoc::GetTxtLnDoc(HtmlStr, UrlStr, SaveContOutUrlP, SaveContTagP); fprintf(fTxt, "Content:%s\n", TxtStr.CStr()); } } // outgoing-urls if (SaveOutUrlP){ TUrlV OutUrlV; WebPg->GetOutUrlV(OutUrlV); for (int OutUrlN=0; OutUrlN<OutUrlV.Len(); OutUrlN++){ TStr OutUrlStr=OutUrlV[OutUrlN]->GetUrlStr(); fprintf(fTxt, "OutUrl:%s\n", OutUrlStr.CStr()); } } fprintf(fTxt, "End:HttpOk\n"); } else if (HttpResp->IsStatusCd_Redir()){ TStr 
RedirUrlStr=HttpResp->GetFldVal(THttp::LocFldNm); PUrl RedirUrl=TUrl::New(RedirUrlStr, UrlStr); if (RedirUrl->IsOk(usHttp)){ TStr RedirUrlStr=RedirUrl->GetUrlStr(); fprintf(fTxt, "Start:HttpRedirection\n"); fprintf(fTxt, "BlobBaseAddress:bb://%s/%d/%d\n", BlobBsFMid.CStr(), CrawlBPt.GetSeg(), CrawlBPt.GetAddr()); fprintf(fTxt, "DateTime:%s\n", DateTimeStr.CStr()); fprintf(fTxt, "Url:%s\n", UrlStr.CStr()); fprintf(fTxt, "UrlMd5:%s\n", TMd5Sig(UrlStr).GetStr().CStr()); fprintf(fTxt, "AtomName:%s\n", TUrl::GetTopDownDocNm(UrlStr).CStr()); fprintf(fTxt, "RedirectionUrl:%s\n", RedirUrlStr.CStr()); for (int FldN=0; FldN<HttpResp->GetFlds(); FldN++){ TStr FldNm; TStr FldVal; HttpResp->GetFldNmVal(FldN, FldNm, FldVal); fprintf(fTxt, "HttpField:%s=%s\n", FldNm.CStr(), FldVal.CStr()); } fprintf(fTxt, "End:HttpRedirection\n"); } } } // statistics HttpContLenMom->Def(); if (!OutStatFNm.Empty()){ TFOut StatFOut(OutStatFNm); FILE* fStat=StatFOut.GetFileId(); TIntStrPrV FqStatusCdPrV; StatusCdToFqH.GetDatKeyPrV(FqStatusCdPrV); TIntStrPrV FqContTypePrV; ContTypeToFqH.GetDatKeyPrV(FqContTypePrV); // hosts {fprintf(fStat, "================================================\n"); TIntStrPrV FqHostNmPrV; HostNmToFqH.GetDatKeyPrV(FqHostNmPrV); FqHostNmPrV.Sort(false); int HostNmsSum=0; fprintf(fStat, "Hosts (%d):\n", FqHostNmPrV.Len()); for (int HostNmN=0; HostNmN<FqHostNmPrV.Len(); HostNmN++){ fprintf(fStat, "%7d '%s'\n", FqHostNmPrV[HostNmN].Val1, FqHostNmPrV[HostNmN].Val2.CStr()); HostNmsSum+=FqHostNmPrV[HostNmN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", HostNmsSum, "Sum"); fprintf(fStat, "================================================\n");} // status-code {fprintf(fStat, "================================================\n"); TIntStrPrV FqStatusCdPrV; StatusCdToFqH.GetDatKeyPrV(FqStatusCdPrV); FqStatusCdPrV.Sort(false); int StatusCdsSum=0; fprintf(fStat, "Status-Codes (%d):\n", FqStatusCdPrV.Len()); for (int StatusCdN=0; StatusCdN<FqStatusCdPrV.Len(); 
StatusCdN++){ fprintf(fStat, "%7d '%s'\n", FqStatusCdPrV[StatusCdN].Val1, FqStatusCdPrV[StatusCdN].Val2.CStr()); StatusCdsSum+=FqStatusCdPrV[StatusCdN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", StatusCdsSum, "Sum"); fprintf(fStat, "================================================\n");} // content-type {fprintf(fStat, "================================================\n"); TIntStrPrV FqContTypePrV; ContTypeToFqH.GetDatKeyPrV(FqContTypePrV); FqContTypePrV.Sort(false); int ContTypesSum=0; fprintf(fStat, "Content-Types (%d):\n", FqContTypePrV.Len()); for (int ContTypeN=0; ContTypeN<FqContTypePrV.Len(); ContTypeN++){ fprintf(fStat, "%7d '%s'\n", FqContTypePrV[ContTypeN].Val1, FqContTypePrV[ContTypeN].Val2.CStr()); ContTypesSum+=FqContTypePrV[ContTypeN].Val1; } fprintf(fStat, "----------\n"); fprintf(fStat, "%7d %s\n", ContTypesSum, "Sum"); fprintf(fStat, "================================================\n");} // content-length {fprintf(fStat, "================================================\n"); fprintf(fStat, "Content-length:\n"); if (HttpContLenMom->IsUsable()){ TStr MomStr=HttpContLenMom->GetStr('\n', ':', true, false, "%g"); fprintf(fStat, "%s\n", MomStr.CStr()); } else { fprintf(fStat, "Statistics not usable.\n"); } fprintf(fStat, "================================================\n");} } return 0; Catch; return 1; }
// TILx::GetSym -- reads the next lexical symbol from the input and stores its
// kind in Sym, filling the matching value members (Str/UcStr, Int, Flt, Bool,
// CmtStr; CmtStr is cleared on entry).
//
// Expect: set of symbols the caller is prepared to accept. In the code shown
//   it selects whole-line mode (syLn), forces bool recognition (syBool),
//   enables fraction/exponent parsing (syFlt) and integer results (syInt),
//   and is passed on to the recursive calls made after comments and, when
//   IsRetEoln is off, after end-of-line.
//
// Branch order:
//   1. PrevSymStStack not empty: the saved state on top is restored and popped.
//   2. Expect contains syLn: the rest of the current line is collected in Str
//      and Sym=syLn (Sym=syEof if Ch is already TCh::EofCh).
//   3. IsTabSep: yields syTab for a tab, an eoln/eof symbol through a
//      recursive call with tab mode temporarily switched off, or the text up
//      to the next tab/terminator as syStr.
//   4. Otherwise: characters for which ChDef->IsSpace holds are skipped, the
//      symbol position (SymLnN/SymLnChN/SymChN) is recorded and the first
//      character decides:
//        - alpha: identifier (syIdStr, or syStr if IsUniStr), optionally
//          mapped to a reserved word through RwStrH (looked up by UcStr when
//          !IsCsSens), or syBool if Expect contains syBool;
//        - '"' or '\'': quoted string with backslash escapes, including
//          \uXXXX which is appended UTF-8 encoded;
//        - digit (or +/- when IsSigNum): syInt if no fraction/exponent was
//          read and Expect contains syInt, otherwise syFlt;
//        - CR/LF: syEoln (CR LF and LF CR pairs are consumed together);
//        - '/': '//' line comment (only if IsCmtAlw), '/* */' block comment,
//          otherwise sySlash;
//        - EOF: syEof;
//        - anything else: punctuation symbols; '#' starts a line comment if
//          IsCmtAlw, unknown characters give syUndef.
//
// Errors visible here: an unknown escape sequence throws PExcept; a non-hex
// digit inside \u fails EAssertR; EOF inside a quoted string sets Sym=syUndef
// and leaves the loop without throwing.
//
// NOTE(review): the text below ends after the punctuation switch with no
//   return statement and no closing brace for the function -- the remainder
//   of the definition is not visible here.
// NOTE(review): the tab-separated branch clears Str but not UcStr before
//   appending to both, so UcStr keeps whatever it held before -- confirm
//   whether a UcStr.Clr() is missing.
// NOTE(review): for the escapes b/f/n/r/t Str receives the control character
//   while UcStr receives ChDef->GetUc of the escape letter -- looks
//   inconsistent, confirm.
// NOTE(review): `case '/'` in the final switch cannot be reached because
//   Ch=='/' is handled by an earlier else-if.
// NOTE(review): the '/* */' loop has no TCh::EofCh check; behaviour on an
//   unterminated block comment depends on GetCh(), which is not shown.
TLxSym TILx::GetSym(const TFSet& Expect){ CmtStr.Clr(); if (!PrevSymStStack.Empty()){ // symbols already on the stack PrevSymStStack.Top().Restore(*this); PrevSymStStack.Pop(); } else if (Expect.In(syLn)){ // symbol is the whole line string if (Ch==TCh::EofCh){ Sym=syEof; } else { Str.Clr(); if (IsBof()){GetCh();} while (!ChDef->IsTerm(Ch)){Str.AddCh(Ch); GetCh();} bool _IsRetEoln=IsRetEoln; IsRetEoln=true; GetSym(TFSet()|syEoln|syEof); Sym=syLn; IsRetEoln=_IsRetEoln; } } else if (IsTabSep){ // symbol is between tab characters if (IsBof()){GetCh();} if (Ch==TCh::TabCh){ // tab character Sym=syTab; GetCh(); } else if (ChDef->IsTerm(Ch)){ // eoln & eof characters bool _IsRetEoln=IsRetEoln; IsRetEoln=true; IsTabSep=false; GetSym(TFSet()|syEoln|syEof); IsRetEoln=_IsRetEoln; IsTabSep=true; } else { Str.Clr(); while ((!ChDef->IsTerm(Ch))&&(Ch!=TCh::TabCh)){ Str.AddCh(Ch); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh();} Sym=syStr; QuoteP=false; } } else { // usual symbol while (ChDef->IsSpace(Ch)){GetCh();} SymLnN=LnN; SymLnChN=LnChN; SymChN=ChN; if (ChDef->IsAlpha(Ch)){ if (IsUniStr){Sym=syStr;} else {Sym=syIdStr;} Str.Clr(); UcStr.Clr(); QuoteP=false; do {Str.AddCh(Ch); UcStr.AddCh(ChDef->GetUc(Ch));} while (ChDef->IsAlNum(GetCh())); if (!RwStrH.Empty()){ TStr RwStr=Str; if (!IsCsSens){RwStr=UcStr;} int SymKeyId=RwStrH.GetKeyId(RwStr); if (SymKeyId!=-1){Sym=TLxSym(int(RwStrH[SymKeyId]));} } if (Expect.In(syBool)){ Sym=syBool; IAssert(TBool::IsValStr(Str)); Bool=TBool::GetValFromStr(Str); } } else if ((Ch=='"')||(Ch=='\'')){ if (IsUniStr){Sym=syStr;} else {Sym=syQStr;} Str.Clr(); UcStr.Clr(); QuoteP=true; QuoteCh=Ch; GetCh(); forever{ while ((Ch!=QuoteCh)&&(Ch!='\\')&&(Ch!=TCh::EofCh)){ Str.AddCh(Ch); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh();} if (Ch==TCh::EofCh){ Sym=syUndef; break; } else if (Ch==QuoteCh){ GetCh(); break; } else { GetCh(); switch (Ch){ case '"': Str.AddCh(Ch); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case '\\': Str.AddCh(Ch); 
UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case '\'': Str.AddCh(Ch); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case '/': Str.AddCh(Ch); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case 'b': Str.AddCh('\b'); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case 'f': Str.AddCh('\f'); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case 'n': Str.AddCh('\n'); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case 'r': Str.AddCh('\r'); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case 't': Str.AddCh('\t'); UcStr.AddCh(ChDef->GetUc(Ch)); GetCh(); break; case 'u': { // unicode character, represented using 4 hexadecimal digits GetCh(); EAssertR(TCh::IsHex(Ch), "Invalid hexadecimal digit in unicode escape"); int UChCd = TCh::GetHex(Ch); GetCh(); EAssertR(TCh::IsHex(Ch), "Invalid hexadecimal digit in unicode escape"); UChCd = 16 * UChCd + TCh::GetHex(Ch); GetCh(); EAssertR(TCh::IsHex(Ch), "Invalid hexadecimal digit in unicode escape"); UChCd = 16 * UChCd + TCh::GetHex(Ch); GetCh(); EAssertR(TCh::IsHex(Ch), "Invalid hexadecimal digit in unicode escape"); UChCd = 16 * UChCd + TCh::GetHex(Ch); // get as UTF8 encoded characters TUnicode::EncodeUtf8(UChCd, Str); TUnicode::EncodeUtf8(UChCd, UcStr); } GetCh(); break; default: Sym=syUndef; break; } if (Sym==syUndef){ throw PExcept(new TExcept("Invalid Escape Sequence in Quoted String"));} } } } else if ((ChDef->IsNum(Ch))||(IsSigNum&&((Ch=='+')||(Ch=='-')))){ Str.Clr(); bool IntP=true; do {Str.AddCh(Ch);} while (ChDef->IsNum(GetCh())); if (Expect.In(syFlt)){ if (Ch=='.'){ Str.AddCh(Ch); IntP=false; while (ChDef->IsNum(GetCh())){Str.AddCh(Ch);} } if ((Ch=='e')||(Ch=='E')){ Str.AddCh(Ch); GetCh(); IntP=false; if ((Ch=='+')||(Ch=='-')){Str.AddCh(Ch); GetCh();} while (ChDef->IsNum(Ch)){Str.AddCh(Ch); GetCh();} } } UcStr=Str; if (IntP&&(Expect.In(syInt))){ Sym=syInt; Int=atoi(Str.CStr()); } else { Sym=syFlt; Flt=atof(Str.CStr()); } } else if ((Ch==TCh::CrCh)||(Ch==TCh::LfCh)){ Sym=syEoln; if (Ch==TCh::CrCh){if 
(GetCh()==TCh::LfCh){GetCh();}} else if (Ch==TCh::LfCh){if (GetCh()==TCh::CrCh){GetCh();}} LnN++; LnChN=0; if (!IsRetEoln){GetSym(Expect);} } else if (Ch=='/'){ GetCh(); if ((IsCmtAlw)&&(Ch=='/')){ TChA _CmtStr; do {_CmtStr+=GetCh();} while (!ChDef->IsTerm(Ch)); _CmtStr.Pop(); _CmtStr.Trunc(); if (Ch==TCh::CrCh){ if (GetCh()==TCh::LfCh){GetCh();} } else if (Ch==TCh::LfCh){ if (GetCh()==TCh::CrCh){GetCh();} } if (IsRetEoln){Sym=syEoln;} else {GetSym(Expect);} CmtStr=_CmtStr; } else if (Ch=='*'){ TChA _CmtStr; do { while (GetCh()!='*'){_CmtStr+=Ch;} _CmtStr+=GetCh(); } while (Ch!='/'); _CmtStr.Pop(); _CmtStr.Pop(); _CmtStr.Trunc(); GetCh(); GetSym(Expect); CmtStr=_CmtStr; } else { Sym=sySlash; } } else if (Ch==TCh::EofCh){ Sym=syEof; } else { switch (Ch){ case '.': if (GetCh()=='.'){Sym=syDPeriod; GetCh();} else {Sym=syPeriod;} break; case ',': Sym=syComma; GetCh(); break; case ':': if (GetCh()==':'){Sym=syDColon; GetCh();} else {Sym=syColon;} break; case ';': Sym=sySemicolon; GetCh(); break; case '+': Sym=syPlus; GetCh(); break; case '-': Sym=syMinus; GetCh(); break; case '*': Sym=syAsterisk; GetCh(); break; case '/': Sym=sySlash; GetCh(); break; case '%': Sym=syPercent; GetCh(); break; case '!': Sym=syExclamation; GetCh(); break; case '|': Sym=syVBar; GetCh(); break; case '&': Sym=syAmpersand; GetCh(); break; case '=': Sym=syEq; GetCh(); break; case '<': GetCh(); if (Ch=='='){Sym=syLEq; GetCh();} else if (Ch=='>'){Sym=syNEq; GetCh();} else {Sym=syLss;} break; case '>': if (GetCh()=='='){Sym=syGEq; GetCh();} else {Sym=syGtr;} break; case '?': Sym=syQuestion; GetCh(); break; case '#': if (IsCmtAlw){ TChA _CmtStr; do {_CmtStr+=GetCh();} while (!ChDef->IsTerm(Ch)); _CmtStr.Pop(); _CmtStr.Trunc(); if (Ch==TCh::CrCh){ if (GetCh()==TCh::LfCh){GetCh();} } else if (Ch==TCh::LfCh){ if (GetCh()==TCh::CrCh){GetCh();} } if (IsRetEoln){Sym=syEoln;} else {GetSym(Expect);} CmtStr=_CmtStr; } else { Sym=syHash; GetCh(); } break; case '(': Sym=syLParen; GetCh(); break; case ')': 
Sym=syRParen; GetCh(); break; case '[': Sym=syLBracket; GetCh(); break; case ']': Sym=syRBracket; GetCh(); break; case '{': Sym=syLBrace; GetCh(); break; case '}': Sym=syRBrace; GetCh(); break; default: Sym=syUndef; GetCh(); break; } } }
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId, const TIntV& IgnoreIdV, const int& TrainLen) { // feature generators PFtrGenBs FtrGenBs = TFtrGenBs::New(); // CSV parsing stuff PSIn SIn = TFIn::New(FNm); char SsCh = ' '; TStrV FldValV; // read the headers and initialise the feature generators TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { if (FldVal == "NOM") { FtrGenBs->PutClsFtrGen(TFtrGenNominal::New()); } else if (FldVal == "MULTI-NOM") { FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New()); } else { TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!"); } } else if (!IgnoreIdV.IsIn(FldValN)) { if (FldVal == TFtrGenNumeric::GetType()) { FtrGenBs->AddFtrGen(TFtrGenNumeric::New()); } else if (FldVal == TFtrGenNominal::GetType()) { FtrGenBs->AddFtrGen(TFtrGenNominal::New()); } else if (FldVal == TFtrGenToken::GetType()) { FtrGenBs->AddFtrGen(TFtrGenToken::New( TSwSet::New(swstNone), TStemmer::New(stmtNone))); } else if (FldVal == TFtrGenSparseNumeric::GetType()) { FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New()); } else if (FldVal == TFtrGenMultiNom::GetType()) { FtrGenBs->AddFtrGen(TFtrGenMultiNom::New()); } else { TExcept::Throw("Wrong type '" + FldVal + "'!"); } } } const int Flds = FldValV.Len(); // read the lines and feed them to the feature generators int Recs = 0; while (!SIn->Eof()) { if (Recs == TrainLen) { break; } Recs++; printf("%7d\r", Recs); TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // make sure line still has the same number of fields as the header EAssertR(FldValV.Len() == Flds, TStr::Fmt("Wrong number of fields in line %d! 
Found %d and expected %d!", Recs + 1, FldValV.Len(), Flds)); // go over lines try { TStrV FtrValV; for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { FtrGenBs->UpdateCls(FldVal); } else if (!IgnoreIdV.IsIn(FldValN)) { FtrValV.Add(FldVal); } } FtrGenBs->Update(FtrValV); } catch (PExcept Ex) { TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", Recs+1, Ex->GetMsgStr().CStr())); } } // read the file again and feed it to the training set PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs(); // we read and ignore the headers since we parsed them already SIn = TFIn::New(FNm); SsCh = ' '; TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // read the lines and feed them to the training set Recs = 0; while (!SIn->Eof()){ Recs++; printf("%7d\r", Recs); TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false); // make sure line still has the same number of fields as the header EAssertR(FldValV.Len() == Flds, TStr::Fmt("Wrong number of fields in line %s! Found %d and expected %d!", Recs + 1, FldValV.Len(), Flds)); // go over lines and construct the sparse vector TStrV FtrValV; TStr ClsFtrVal; try { for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) { const TStr& FldVal = FldValV[FldValN]; if (FldValN == ClassId) { ClsFtrVal = FldVal; } else if (!IgnoreIdV.IsIn(FldValN)) { FtrValV.Add(FldVal); } } } catch (PExcept Ex) { TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", Recs+1, Ex->GetMsgStr().CStr())); } // add the feature vector to trainsets FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal); } // prepare training and testing doc ids TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted()); TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen); BowDocBs->PutTrainDIdV(TrainDIdV); TIntV TestDIdV = AllDIdV; TestDIdV.Minus(TrainDIdV); BowDocBs->PutTestDIdV(TestDIdV); return BowDocBs; }
// Creates a string pool. MxBfLen is the initial buffer size passed to malloc
// (no buffer is allocated when it is 0); _GrowBy is stored in GrowBy.
// The empty string is added to the pool before the constructor returns.
TBigStrPool::TBigStrPool(TSize MxBfLen, uint _GrowBy)
  : MxBfL(MxBfLen), BfL(0), GrowBy(_GrowBy), Bf(0) {
  //IAssert(MxBfL >= 0); IAssert(GrowBy >= 0);
  if (0 < MxBfL) {
    Bf = static_cast<char*>(malloc(MxBfL));
    // the pool cannot work without its buffer
    IAssert(Bf);
  }
  // add empty string
  AddStr("");
}
void TFRnd::SetRecN(const int& RecN){ IAssert(RecAct); SetFPos(HdLen+RecN*RecLen); }
///////////////////////////////////////////////// // System-Console TSysConsole::TSysConsole(){ Ok=(AllocConsole()!=0); IAssert(Ok); hStdOut=GetStdHandle(STD_OUTPUT_HANDLE); IAssert(hStdOut!=INVALID_HANDLE_VALUE); }
int TFRnd::GetRecs(){ IAssert(RecAct); int FLen=GetFLen()-HdLen; EAssertR(FLen%RecLen==0, "Invalid length of file'"+FNm+"'."); return FLen/RecLen; }
// Releases the console, but only if Ok is set.
// FreeConsole() is invoked outside the assertion macro (call first, assert on
// the stored result -- the same shape the constructor uses for AllocConsole),
// so the release does not depend on how IAssert treats its argument.
TSysConsole::~TSysConsole(){
  if (Ok){
    const bool Freed=(FreeConsole()!=0);
    IAssert(Freed);
  }
}
void TCpDoc::SaveAsfaToCpd(const TStr& InFPath, const TStr& OutCpdFNm){ // create output file PSOut SOut=TFOut::New(OutCpdFNm); // traverse files TStrH AccessionIdH; TFFile FFile(TStr::GetNrFPath(InFPath)+"*.Asfa"); TStr AsfaFNm; while (FFile.Next(AsfaFNm)){ printf("Processing file '%s'\n", AsfaFNm.CStr()); PSIn SIn=TFIn::New(AsfaFNm); TILx Lx(SIn, TFSet(iloRetEoln, iloExcept)); Lx.GetSym(syLn, syEof); while (Lx.Sym!=syEof){ // Query Line TStr QueryLnStr=Lx.Str; TStrV QueryStrV; QueryLnStr.SplitOnAllCh('\t', QueryStrV, false); IAssert(QueryStrV[0]=="Query"); // RecordNo Line Lx.GetSym(syLn); TStr RecNoLnStr=Lx.Str; TStrV RecNoStrV; RecNoLnStr.SplitOnAllCh('\t', RecNoStrV, false); IAssert(RecNoStrV[0]=="RecordNo"); //int RecN=RecNoStrV[1].GetInt(); // fields (format: Short-Name Tab Long-Name Tab Value-String) TStr TitleStr, AbstractStr, PublicationYearStr, AccessionId; TStrV AuthorNmV; TStrV TermNmV1, TermNmV2; while (true){ Lx.GetSym(syLn); TStr FldLnStr=Lx.Str; TStrV FldStrV; FldLnStr.SplitOnAllCh('\t', FldStrV, false); if (FldStrV[0]=="----"){ if (!AccessionIdH.IsKey(AccessionId)){ AccessionIdH.AddKey(AccessionId); // create & save cpd document PCpDoc CpDoc=TCpDoc::New(); CpDoc->DocNm=AccessionId; CpDoc->DateStr=PublicationYearStr; CpDoc->TitleStr=TitleStr; CpDoc->ParStrV.Add(AbstractStr); CpDoc->TopCdNmV=TermNmV1; CpDoc->GeoCdNmV=TermNmV2; CpDoc->IndCdNmV=AuthorNmV; CpDoc->Save(*SOut); } else {/*printf("[%s]", AccessionId.CStr());*/} break; } else if (FldStrV[0]=="TI"){ TitleStr=FldStrV[2]; } else if (FldStrV[0]=="TI"){ TitleStr=FldStrV[2]; } else if (FldStrV[0]=="AU"){ FldStrV[2].SplitOnAllCh(';', AuthorNmV); for (int StrN=0; StrN<AuthorNmV.Len(); StrN++){AuthorNmV[StrN].ToTrunc();} } else if (FldStrV[0]=="AB"){ AbstractStr=FldStrV[2]; } else if (FldStrV[0]=="PY"){ PublicationYearStr=FldStrV[2]; } else if (FldStrV[0]=="DE"){ FldStrV[2].SplitOnAllCh(';', TermNmV1); for (int StrN=0; StrN<TermNmV1.Len(); StrN++){TermNmV1[StrN].ToTrunc();} } else if 
(FldStrV[0]=="CL"){ FldStrV[2].SplitOnAllCh(';', TermNmV2); for (int StrN=0; StrN<TermNmV2.Len(); StrN++){TermNmV2[StrN].ToTrunc();} } else if (FldStrV[0]=="AN"){ AccessionId=FldStrV[2]; } } printf("%d\r", AccessionIdH.Len()); Lx.GetSym(syLn, syEof); } } }
void TSysConsole::Put(const TStr& Str){ DWORD ChsWritten; WriteConsole(hStdOut, Str.CStr(), Str.Len(), &ChsWritten, NULL); IAssert(ChsWritten==DWORD(Str.Len())); }
// Drops the first CutBfL bytes of the buffer; the remaining bytes are moved
// to the front and BfL is reduced accordingly.
// CutBfL must lie in [0, BfL].
void TMOut::CutBf(const int& CutBfL){
  IAssert((CutBfL>=0)&&(BfL>=CutBfL));
  const int RestBfL=BfL-CutBfL;
  // nothing to move when the whole buffer is cut
  if (RestBfL!=0){memmove(Bf, Bf+CutBfL, RestBfL);}
  BfL=RestBfL;
}
// Returns the month of this time as a number starting at 1 (tm_mon+1).
// Asserts that the time is defined and that GetTmStruct() succeeds for
// AbsSecs().
int TSecTm::GetMonthN() const {
  IAssert(IsDef());
  struct tm Tm;
  // GetTmStruct() fills Tm; it is called outside the assertion macro so that
  // Tm is populated independently of how IAssert treats its argument.
  const bool TmOk=GetTmStruct(AbsSecs(), Tm);
  IAssert(TmOk);
  return Tm.tm_mon+1;
}