/// Generates a random undirected graph whose node degrees follow DegSeqV.
/// Node ids are the positions in DegSeqV (0..Len-1); node i is given the
/// target degree DegSeqV[i]. DegSeqV must be sorted in descending order and
/// its sum must be even (both are asserted).
/// The generation process simulates the Configuration Model: two nodes that
/// still have unused degree are picked at random and connected. If the pick
/// would produce a self loop or a duplicate edge, a random existing edge is
/// broken and its endpoints are reconnected to the picked node(s) instead.
/// @param DegSeqV Desired degree of every node, in descending order.
/// @param Rnd Random generator used to pick the node pairs.
/// @return The generated graph.
PUNGraph GenDegSeq(const TIntV& DegSeqV, TRnd& Rnd) {
  const int Nodes = DegSeqV.Len();
  PUNGraph GraphPt = TUNGraph::New();
  TUNGraph& Graph = *GraphPt;
  Graph.Reserve(Nodes, -1);
  // DegH: node id -> number of edge endpoints the node still has to receive.
  // A node is removed from DegH as soon as its remaining degree reaches zero.
  TIntH DegH(DegSeqV.Len(), true);

  IAssertR(DegSeqV.IsSorted(false), "DegSeqV must be sorted in descending order.");
  int DegSum=0, edge=0;
  for (int node = 0; node < Nodes; node++) {
    IAssert(Graph.AddNode(node) == node);
    DegH.AddDat(node, DegSeqV[node]);
    DegSum += DegSeqV[node];
  }
  // every edge consumes two endpoints, so an odd sum can never be realized
  IAssert(DegSum % 2 == 0);
  while (! DegH.Empty()) {
    // pick random nodes and connect; use the caller's generator (not the
    // global TInt::Rnd) so that a seeded Rnd actually drives the selection
    const int NId1 = DegH.GetKey(DegH.GetRndKeyId(Rnd, 0.5));
    const int NId2 = DegH.GetKey(DegH.GetRndKeyId(Rnd, 0.5));
    IAssert(DegH.IsKey(NId1) && DegH.IsKey(NId2));
    // NOTE(review): GetRndEdgeNonAdjNode is not handed Rnd, so whatever
    // randomness it uses is outside the caller's control -- confirm.
    // NOTE(review): the 'continue' paths below retry without changing any
    // state; if no suitable edge ever exists the loop does not terminate.
    if (NId1 == NId2) {
      // same node picked twice: a self loop needs two free endpoints
      if (DegH.GetDat(NId1) == 1) { continue; }
      // find rnd edge, break it, and connect the endpoints to the nodes
      const TIntPr Edge = TSnapDetail::GetRndEdgeNonAdjNode(GraphPt, NId1, -1);
      if (Edge.Val1==-1) { continue; }  // Val1 == -1 is treated as "no edge found"
      Graph.DelEdge(Edge.Val1, Edge.Val2);
      Graph.AddEdge(Edge.Val1, NId1);
      Graph.AddEdge(NId1, Edge.Val2);
      // NId1 gained two edges
      if (DegH.GetDat(NId1) == 2) { DegH.DelKey(NId1); }
      else { DegH.GetDat(NId1) -= 2; }
    } else {
      if (! Graph.IsEdge(NId1, NId2)) {
        Graph.AddEdge(NId1, NId2); }  // good edge
      else {
        // find rnd edge, break and cross-connect
        const TIntPr Edge = TSnapDetail::GetRndEdgeNonAdjNode(GraphPt, NId1, NId2);
        if (Edge.Val1==-1) {continue; }  // Val1 == -1 is treated as "no edge found"
        Graph.DelEdge(Edge.Val1, Edge.Val2);
        Graph.AddEdge(NId1, Edge.Val1);
        Graph.AddEdge(NId2, Edge.Val2);
      }
      // both picked nodes gained exactly one edge
      if (DegH.GetDat(NId1)==1) { DegH.DelKey(NId1); }
      else { DegH.GetDat(NId1) -= 1; }
      if (DegH.GetDat(NId2)==1) { DegH.DelKey(NId2); }
      else { DegH.GetDat(NId2) -= 1; }
    }
    // progress report: edges placed so far / total edges, in thousands
    if (++edge % 1000 == 0) {
      printf("\r %dk / %dk", edge/1000, DegSum/2000); }
  }
  return GraphPt;
}
Beispiel #2
0
/// Merges two item vectors into their union, written to JoinItem.
/// Item1 and Item2 are walked in parallel as in a merge step (the final
/// assertion expects the result to be sorted, so both inputs are presumably
/// sorted ascending). A value present in both inputs is emitted once.
/// The result may hold at most Item1.Len()+1 values: as soon as the union
/// grows beyond that, JoinItem is cleared and the function returns, so an
/// empty JoinItem signals "the two items do not join".
/// The limit is enforced after every appended value, including the values
/// that remain in Item2 once Item1 is exhausted, so the outcome does not
/// depend on the order in which the two items are passed.
void TTrawling::JoinItems(const TIntV& Item1, const TIntV& Item2, TIntV& JoinItem) {
  JoinItem.Clr(false);
  const int Len1 = Item1.Len();
  const int Len2 = Item2.Len();
  const int MaxL = Len1 + 1;
  int i = 0, j = 0;
  while (i < Len1 || j < Len2) {
    const bool TakeFrom1 = (j >= Len2) || (i < Len1 && Item1[i] <= Item2[j]);
    if (TakeFrom1) {
      // value shared by both inputs: consume it from Item2 as well
      if (j < Len2 && Item1[i] == Item2[j]) { j++; }
      JoinItem.Add(Item1[i]); i++;
    } else {
      JoinItem.Add(Item2[j]); j++;
    }
    // too many distinct values: the items cannot be joined
    if (JoinItem.Len() > MaxL) { JoinItem.Clr(false); return; }
  }
  IAssert(JoinItem.IsSorted());
}
Beispiel #3
0
/// Loads a comma-separated file into a bag-of-words document base.
///
/// The first line of the file is a header holding one type tag per column.
/// The column with index ClassId must be tagged "NOM" or "MULTI-NOM" and
/// becomes the class feature. Columns listed in IgnoreIdV are skipped. Every
/// other column must carry the type tag of one of the supported feature
/// generators (numeric, nominal, token, sparse numeric, multi-nominal).
///
/// The file is read twice:
///  1. the first pass feeds records to the feature generators and stops once
///     TrainLen records were read;
///  2. the second pass re-opens the file and adds every record to the
///     document base under the name "Line-<record number>".
/// Finally the document ids are split: AllDIdV truncated with Trunc(TrainLen)
/// becomes the training set, the remaining ids become the test set.
///
/// @param FNm Name of the CSV file.
/// @param ClassId Zero-based index of the class column.
/// @param IgnoreIdV Zero-based indices of columns to skip.
/// @param TrainLen Number of leading records used for training.
///        NOTE(review): a negative value never matches the record counter,
///        so the first pass then reads the whole file -- confirm what
///        Trunc() does with such a value.
/// @return The populated document base.
/// Throws (via TExcept::Throw / EAssertR) on an unknown type tag, on a line
/// whose field count differs from the header, and on errors reported while
/// processing a line.
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId, 
        const TIntV& IgnoreIdV, const int& TrainLen) {

    // feature generators
    PFtrGenBs FtrGenBs = TFtrGenBs::New();
    // CSV parsing stuff
    PSIn SIn = TFIn::New(FNm); 
    char SsCh = ' '; TStrV FldValV;
    // read the headers and initialise the feature generators
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);  
    for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
        const TStr& FldVal = FldValV[FldValN];
        if (FldValN == ClassId) { 
            if (FldVal == "NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenNominal::New());
            } else if (FldVal == "MULTI-NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!");
            }
        } else if (!IgnoreIdV.IsIn(FldValN)) {
            if (FldVal == TFtrGenNumeric::GetType()) {
                FtrGenBs->AddFtrGen(TFtrGenNumeric::New());
            } else if (FldVal == TFtrGenNominal::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenNominal::New());
            } else if (FldVal == TFtrGenToken::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenToken::New(
                    TSwSet::New(swstNone), TStemmer::New(stmtNone)));
            } else if (FldVal == TFtrGenSparseNumeric::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New());
            } else if (FldVal == TFtrGenMultiNom::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong type '" + FldVal + "'!");
            }
        }
    }
    // number of columns every following line has to match
    const int Flds = FldValV.Len();
    // read the lines and feed them to the feature generators
    int Recs = 0;
    while (!SIn->Eof()) {
        if (Recs == TrainLen) { break; }
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure line still has the same number of fields as the header
        // (Recs + 1 because the header occupies the first line of the file)
        EAssertR(FldValV.Len() == Flds, 
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
            Recs + 1, FldValV.Len(), Flds));
        // go over lines
        try {
            TStrV FtrValV;
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) { 
                    FtrGenBs->UpdateCls(FldVal);
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
            FtrGenBs->Update(FtrValV);
        } catch (PExcept Ex) {
            // re-throw with the offending line number attached
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", 
                Recs+1, Ex->GetMsgStr().CStr()));
        }
    }
    // read the file again and feed it to the training set
    PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs();
    // we read and ignore the headers since we parsed them already 
    SIn = TFIn::New(FNm); SsCh = ' ';
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);  
    // read the lines and feed them to the training set
    Recs = 0;
    while (!SIn->Eof()){
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure line still has the same number of fields as the header;
        // the line number is an int, so it has to be formatted with %d
        EAssertR(FldValV.Len() == Flds, 
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
            Recs + 1, FldValV.Len(), Flds));
        // go over lines and construct the sparse vector
        TStrV FtrValV; TStr ClsFtrVal;
        try {
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) { 
                    ClsFtrVal = FldVal;
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
        } catch (PExcept Ex) {
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", 
                Recs+1, Ex->GetMsgStr().CStr()));
        }
        // add the feature vector to trainsets
        FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal);
    }
    // prepare training and testing doc ids
    TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted());
    TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen);
    BowDocBs->PutTrainDIdV(TrainDIdV);
    TIntV TestDIdV = AllDIdV; TestDIdV.Minus(TrainDIdV);
    BowDocBs->PutTestDIdV(TestDIdV);

    return BowDocBs;
}