/// Generates a random graph with exact degree sequence DegSeqV.
/// The generated graph has no self loops. The graph generation process
/// simulates the Configuration Model but if a duplicate edge occurs, we find a
/// random edge, break it and reconnect it with the duplicate.
PUNGraph GenDegSeq(const TIntV& DegSeqV, TRnd& Rnd) {
  const int Nodes = DegSeqV.Len();
  PUNGraph GraphPt = TUNGraph::New();
  TUNGraph& Graph = *GraphPt;
  Graph.Reserve(Nodes, -1);
  TIntH DegH(DegSeqV.Len(), true);
  IAssertR(DegSeqV.IsSorted(false), "DegSeqV must be sorted in descending order.");
  int DegSum = 0, edge = 0;
  for (int node = 0; node < Nodes; node++) {
    IAssert(Graph.AddNode(node) == node);
    DegH.AddDat(node, DegSeqV[node]);
    DegSum += DegSeqV[node];
  }
  IAssert(DegSum % 2 == 0);
  while (! DegH.Empty()) {
    // pick random nodes and connect
    const int NId1 = DegH.GetKey(DegH.GetRndKeyId(TInt::Rnd, 0.5));
    const int NId2 = DegH.GetKey(DegH.GetRndKeyId(TInt::Rnd, 0.5));
    IAssert(DegH.IsKey(NId1) && DegH.IsKey(NId2));
    if (NId1 == NId2) {
      if (DegH.GetDat(NId1) == 1) { continue; }
      // find rnd edge, break it, and connect the endpoints to the nodes
      const TIntPr Edge = TSnapDetail::GetRndEdgeNonAdjNode(GraphPt, NId1, -1);
      if (Edge.Val1 == -1) { continue; }
      Graph.DelEdge(Edge.Val1, Edge.Val2);
      Graph.AddEdge(Edge.Val1, NId1);
      Graph.AddEdge(NId1, Edge.Val2);
      if (DegH.GetDat(NId1) == 2) { DegH.DelKey(NId1); }
      else { DegH.GetDat(NId1) -= 2; }
    } else {
      if (! Graph.IsEdge(NId1, NId2)) { Graph.AddEdge(NId1, NId2); } // good edge
      else {
        // find rnd edge, break and cross-connect
        const TIntPr Edge = TSnapDetail::GetRndEdgeNonAdjNode(GraphPt, NId1, NId2);
        if (Edge.Val1 == -1) { continue; }
        Graph.DelEdge(Edge.Val1, Edge.Val2);
        Graph.AddEdge(NId1, Edge.Val1);
        Graph.AddEdge(NId2, Edge.Val2);
      }
      if (DegH.GetDat(NId1) == 1) { DegH.DelKey(NId1); } else { DegH.GetDat(NId1) -= 1; }
      if (DegH.GetDat(NId2) == 1) { DegH.DelKey(NId2); } else { DegH.GetDat(NId2) -= 1; }
    }
    if (++edge % 1000 == 0) {
      printf("\r %dk / %dk", edge/1000, DegSum/2000);
    }
  }
  return GraphPt;
}
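// Example usage (a minimal sketch, not part of the library source): build a
// short degree sequence -- sorted in descending order with an even degree sum,
// as the assertions above require -- and generate a graph from it. The
// "Snap.h" include and the TSnap:: qualification are assumptions about how
// GenDegSeq is exposed.
#include "Snap.h"

int main() {
  TIntV DegSeqV;
  DegSeqV.Add(3); DegSeqV.Add(2); DegSeqV.Add(2);
  DegSeqV.Add(2); DegSeqV.Add(1);                  // degree sum = 10 (even)
  TRnd Rnd(1);                                     // fixed seed for repeatability
  PUNGraph Graph = TSnap::GenDegSeq(DegSeqV, Rnd);
  printf("nodes: %d, edges: %d\n", Graph->GetNodes(), Graph->GetEdges());
  return 0;
}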
// Merges two sorted item sets into JoinItem (Apriori-style candidate join);
// if the merged set would exceed Item1.Len()+1 items, JoinItem is cleared.
void TTrawling::JoinItems(const TIntV& Item1, const TIntV& Item2, TIntV& JoinItem) {
  int i = 0, j = 0;
  JoinItem.Clr(false);
  const int MaxL = Item1.Len()+1;
  while (i < Item1.Len()) {
    // copy over items of Item2 that are smaller than the current item of Item1
    while (j < Item2.Len() && Item2[j] < Item1[i]) {
      JoinItem.Add(Item2[j]);  j++;
    }
    JoinItem.Add(Item1[i]);
    if (j < Item2.Len() && Item1[i] == Item2[j]) { j++; } // skip duplicates
    i++;
    if (JoinItem.Len() > MaxL) {
      // the two item sets differ in too many items
      JoinItem.Clr(false);  return;
    }
  }
  // copy the remaining items of Item2
  while (j < Item2.Len()) {
    JoinItem.Add(Item2[j]);  j++;
  }
  /*if (JoinItem.Len() > 3) {
    Dump(Item1, "\n1:");  Dump(Item2, "2:");  Dump(JoinItem, "J:"); }*/
  IAssert(JoinItem.IsSorted());
}
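// Example usage (illustrative sketch, not part of the library source): two
// sorted item sets that differ in a single item join into one candidate of
// size Item1.Len()+1, while sets that differ in more items come back empty.
// The TTrawling instance is passed in here, and JoinItems is assumed to be
// accessible on it, because the constructor arguments are not shown in this file.
void JoinItemsExample(TTrawling& Trawling) {
  TIntV Item1, Item2, JoinItem;
  Item1.Add(1); Item1.Add(3); Item1.Add(7);                // {1, 3, 7}
  Item2.Add(1); Item2.Add(3); Item2.Add(9);                // {1, 3, 9}
  Trawling.JoinItems(Item1, Item2, JoinItem);              // JoinItem = {1, 3, 7, 9}
  Item2.Clr(); Item2.Add(2); Item2.Add(4); Item2.Add(9);   // {2, 4, 9}
  Trawling.JoinItems(Item1, Item2, JoinItem);              // too many distinct items: JoinItem is cleared
}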
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId,
    const TIntV& IgnoreIdV, const int& TrainLen) {

  // feature generators
  PFtrGenBs FtrGenBs = TFtrGenBs::New();
  // CSV parsing stuff
  PSIn SIn = TFIn::New(FNm);
  char SsCh = ' '; TStrV FldValV;
  // read the headers and initialise the feature generators
  TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
  for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
    const TStr& FldVal = FldValV[FldValN];
    if (FldValN == ClassId) {
      if (FldVal == "NOM") {
        FtrGenBs->PutClsFtrGen(TFtrGenNominal::New());
      } else if (FldVal == "MULTI-NOM") {
        FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New());
      } else {
        TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!");
      }
    } else if (!IgnoreIdV.IsIn(FldValN)) {
      if (FldVal == TFtrGenNumeric::GetType()) {
        FtrGenBs->AddFtrGen(TFtrGenNumeric::New());
      } else if (FldVal == TFtrGenNominal::GetType()) {
        FtrGenBs->AddFtrGen(TFtrGenNominal::New());
      } else if (FldVal == TFtrGenToken::GetType()) {
        FtrGenBs->AddFtrGen(TFtrGenToken::New(
          TSwSet::New(swstNone), TStemmer::New(stmtNone)));
      } else if (FldVal == TFtrGenSparseNumeric::GetType()) {
        FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New());
      } else if (FldVal == TFtrGenMultiNom::GetType()) {
        FtrGenBs->AddFtrGen(TFtrGenMultiNom::New());
      } else {
        TExcept::Throw("Wrong type '" + FldVal + "'!");
      }
    }
  }
  const int Flds = FldValV.Len();
  // read the lines and feed them to the feature generators
  int Recs = 0;
  while (!SIn->Eof()) {
    if (Recs == TrainLen) { break; }
    Recs++; printf("%7d\r", Recs);
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
    // make sure the line still has the same number of fields as the header
    EAssertR(FldValV.Len() == Flds,
      TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
        Recs + 1, FldValV.Len(), Flds));
    // go over the fields
    try {
      TStrV FtrValV;
      for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
        const TStr& FldVal = FldValV[FldValN];
        if (FldValN == ClassId) {
          FtrGenBs->UpdateCls(FldVal);
        } else if (!IgnoreIdV.IsIn(FldValN)) {
          FtrValV.Add(FldVal);
        }
      }
      FtrGenBs->Update(FtrValV);
    } catch (PExcept Ex) {
      TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!",
        Recs+1, Ex->GetMsgStr().CStr()));
    }
  }
  // read the file again and feed it to the training set
  PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs();
  // we read and ignore the headers since we parsed them already
  SIn = TFIn::New(FNm); SsCh = ' ';
  TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
  // read the lines and feed them to the training set
  Recs = 0;
  while (!SIn->Eof()) {
    Recs++; printf("%7d\r", Recs);
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
    // make sure the line still has the same number of fields as the header
    EAssertR(FldValV.Len() == Flds,
      TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
        Recs + 1, FldValV.Len(), Flds));
    // go over the fields and construct the sparse vector
    TStrV FtrValV; TStr ClsFtrVal;
    try {
      for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
        const TStr& FldVal = FldValV[FldValN];
        if (FldValN == ClassId) {
          ClsFtrVal = FldVal;
        } else if (!IgnoreIdV.IsIn(FldValN)) {
          FtrValV.Add(FldVal);
        }
      }
    } catch (PExcept Ex) {
      TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!",
        Recs+1, Ex->GetMsgStr().CStr()));
    }
    // add the feature vector to the training set
    FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal);
  }
  // prepare training and testing doc ids
  TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted());
  TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen);
  BowDocBs->PutTrainDIdV(TrainDIdV);
  TIntV TestDIdV = AllDIdV; TestDIdV.Minus(TrainDIdV);
  BowDocBs->PutTestDIdV(TestDIdV);
  return BowDocBs;
}
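// Example usage (a minimal sketch, not part of the library source): the CSV
// header row carries one type tag per column (NOM or MULTI-NOM for the class
// column, the TFtrGen*::GetType() names elsewhere). The file name, column
// indices and train length below are placeholder values, and the static call
// syntax assumes LoadCsv is declared as a static member of TFtrGenBs.
void LoadCsvExample() {
  TStr CsvFNm = "data.csv";           // hypothetical input file
  TIntV IgnoreIdV; IgnoreIdV.Add(0);  // e.g. skip an id column (hypothetical)
  const int ClassId = 1;              // class label lives in the second column
  PBowDocBs BowDocBs = TFtrGenBs::LoadCsv(CsvFNm, ClassId, IgnoreIdV, 1000);
  printf("%d documents, %d words\n", BowDocBs->GetDocs(), BowDocBs->GetWords());
}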