TEST_F(BodyCentredNonRotatingDynamicFrameTest, GeometricAcceleration) {
  int const kSteps = 10;
  RelativeDegreesOfFreedom<ICRFJ2000Equator> const initial_big_to_small =
      small_initial_state_ - big_initial_state_;
  Length const big_to_small = initial_big_to_small.displacement().Norm();
  Acceleration const small_on_big =
      small_gravitational_parameter_ / (big_to_small * big_to_small);
  for (Length y = big_to_small / kSteps;
       y < big_to_small;
       y += big_to_small / kSteps) {
    Position<Big> const position(
        Big::origin +
        Displacement<Big>({0 * Kilo(Metre), y, 0 * Kilo(Metre)}));
    Acceleration const big_on_position =
        -big_gravitational_parameter_ / (y * y);
    Acceleration const small_on_position =
        small_gravitational_parameter_ /
            ((big_to_small - y) * (big_to_small - y));
    Vector<Acceleration, Big> const expected_acceleration(
        {0 * SIUnit<Acceleration>(),
         small_on_position + big_on_position - small_on_big,
         0 * SIUnit<Acceleration>()});
    EXPECT_THAT(AbsoluteError(
                    big_frame_->GeometricAcceleration(
                        t0_,
                        DegreesOfFreedom<Big>(position, Velocity<Big>())),
                    expected_acceleration),
                Lt(1E-10 * SIUnit<Acceleration>()));
  }
}
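// The expected value is the geometric acceleration of a point at rest in the
// non-rotating frame centred on Big: the gravitational pull of both bodies
// minus the acceleration of the frame itself (Big falling towards Small).
// On the Big-Small axis, at distance y from Big with separation d, the
// y-component is
//   a(y) = μ_small / (d - y)² - μ_big / y² - μ_small / d²,
// i.e., exactly small_on_position + big_on_position - small_on_big above.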
TFfGGen::TStopReason TUndirFFire::AddNodes(const int& GraphNodes, const bool& FloodStop) {
  printf("\n***Undirected GEO ForestFire: graph(%d,%d) add %d nodes, burn prob %.3f\n",
         Graph->GetNodes(), Graph->GetEdges(), GraphNodes, BurnProb);
  TExeTm ExeTm;
  int Burned1 = 0, Burned2 = 0, Burned3 = 0; // last 3 fire sizes
  TIntPrV NodesEdgesV;
  // create initial set of nodes
  if (Graph.Empty()) { Graph = PUNGraph::New(); }
  if (Graph->GetNodes() == 0) { Graph->AddNode(); }
  int NEdges = Graph->GetEdges();
  // forest fire
  for (int NNodes = Graph->GetNodes() + 1; NNodes <= GraphNodes; NNodes++) {
    const int NewNId = Graph->AddNode(-1);
    IAssert(NewNId == Graph->GetNodes() - 1); // node ids have to be 0...N
    const int StartNId = Rnd.GetUniDevInt(NewNId);
    const int NBurned = BurnGeoFire(StartNId);
    // add edges to burned nodes
    for (int e = 0; e < NBurned; e++) {
      Graph->AddEdge(NewNId, GetBurnedNId(e));
    }
    NEdges += NBurned;
    Burned1 = Burned2;  Burned2 = Burned3;  Burned3 = NBurned;
    if (NNodes % Kilo(1) == 0) {
      printf("(%d, %d) burned: [%d,%d,%d] [%s]\n", NNodes, NEdges,
             Burned1, Burned2, Burned3, ExeTm.GetStr());
      NodesEdgesV.Add(TIntPr(NNodes, NEdges));
    }
    if (FloodStop && NEdges > 1000 && NEdges/double(NNodes) > 100.0) { // more than 100 edges per node on average
      printf("!!! FLOOD. G(%6d, %6d)\n", NNodes, NEdges);
      return TFfGGen::srFlood;
    }
  }
  printf("\n");
  IAssert(Graph->GetEdges() == NEdges);
  return TFfGGen::srOk;
}
TFfGGen::TStopReason TFfGGen::AddNodes(const int& GraphNodes, const bool& FloodStop) {
  printf("\n***ForestFire: %s Nodes:%d StartNodes:%d Take2AmbProb:%g\n",
         BurnExpFire ? "ExpFire" : "GeoFire", GraphNodes, StartNodes(), Take2AmbProb());
  printf("  FwdBurnP:%g BckBurnP:%g ProbDecay:%g Orphan:%g\n",
         FwdBurnProb(), BckBurnProb(), ProbDecay(), OrphanProb());
  TExeTm ExeTm;
  int Burned1 = 0, Burned2 = 0, Burned3 = 0; // last 3 fire sizes
  // create initial set of nodes
  if (Graph.Empty()) { Graph = PNGraph::New(); }
  if (Graph->GetNodes() == 0) {
    for (int n = 0; n < StartNodes; n++) { Graph->AddNode(); }
  }
  int NEdges = Graph->GetEdges();
  // forest fire
  TRnd Rnd(0);
  TForestFire ForestFire(Graph, FwdBurnProb, BckBurnProb, ProbDecay, 0);
  // add nodes
  for (int NNodes = Graph->GetNodes() + 1; NNodes <= GraphNodes; NNodes++) {
    const int NewNId = Graph->AddNode(-1);
    IAssert(NewNId == Graph->GetNodes() - 1); // node ids have to be 0...N
    // not an orphan: burn a fire and link to the burned nodes
    if (OrphanProb == 0.0 || Rnd.GetUniDev() > OrphanProb) {
      // infect ambassadors
      if (Take2AmbProb == 0.0 || Rnd.GetUniDev() > Take2AmbProb || NewNId < 2) {
        ForestFire.Infect(Rnd.GetUniDevInt(NewNId)); // take 1 ambassador
      } else {
        const int AmbassadorNId1 = Rnd.GetUniDevInt(NewNId);
        int AmbassadorNId2 = Rnd.GetUniDevInt(NewNId);
        while (AmbassadorNId1 == AmbassadorNId2) {
          AmbassadorNId2 = Rnd.GetUniDevInt(NewNId);
        }
        ForestFire.Infect(TIntV::GetV(AmbassadorNId1, AmbassadorNId2)); // take 2 ambassadors
      }
      // burn fire
      if (BurnExpFire) { ForestFire.BurnExpFire(); }
      else { ForestFire.BurnGeoFire(); }
      // add edges to burned nodes
      for (int e = 0; e < ForestFire.GetBurned(); e++) {
        Graph->AddEdge(NewNId, ForestFire.GetBurnedNId(e));
        NEdges++;
      }
      Burned1 = Burned2;  Burned2 = Burned3;  Burned3 = ForestFire.GetBurned();
    } else {
      // orphan (zero out-links)
      Burned1 = Burned2;  Burned2 = Burned3;  Burned3 = 0;
    }
    if (NNodes % Kilo(1) == 0) {
      printf("(%d, %d) burned: [%d,%d,%d] [%s]\n", NNodes, NEdges,
             Burned1, Burned2, Burned3, ExeTm.GetStr());
    }
    if (FloodStop && NEdges > GraphNodes && NEdges/double(NNodes) > 1000.0) { // more than 1000 edges per node on average
      printf(". FLOOD. G(%6d, %6d)\n", NNodes, NEdges);
      return srFlood;
    }
    if (NNodes % 1000 == 0 && TimeLimitSec > 0 && ExeTm.GetSecs() > TimeLimitSec) {
      printf(". TIME LIMIT. G(%d, %d)\n", Graph->GetNodes(), Graph->GetEdges());
      return srTimeLimit;
    }
  }
  IAssert(Graph->GetEdges() == NEdges);
  return srOk;
}
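// The geometric fire used above is the burning step of the forest-fire model
// (Leskovec et al.): each burning node ignites a geometrically distributed
// number of its yet-unburned neighbours.  A minimal self-contained sketch of
// that step -- illustrative only; SNAP's TForestFire::BurnGeoFire is more
// elaborate (backward burning, probability decay) and its names differ:
#include <queue>
#include <random>
#include <set>
#include <vector>

std::vector<int> BurnGeoFireSketch(const std::vector<std::vector<int>>& adj,
                                   int ambassador, double FwdBurnProb,
                                   std::mt19937& rng) {
  // Number of neighbours ignited per burning node: geometric with
  // mean FwdBurnProb / (1 - FwdBurnProb).
  std::geometric_distribution<int> n_links(1.0 - FwdBurnProb);
  std::set<int> burned = {ambassador};
  std::queue<int> frontier;
  frontier.push(ambassador);
  while (!frontier.empty()) {
    const int u = frontier.front();
    frontier.pop();
    int to_burn = n_links(rng);
    for (int v : adj[u]) {
      if (to_burn == 0) { break; }
      if (burned.insert(v).second) {  // not burned yet
        frontier.push(v);
        --to_burn;
      }
    }
  }
  return std::vector<int>(burned.begin(), burned.end());
}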
void TGStat::TakeSpectral(const PNGraph& Graph, TFSet StatFSet, int _TakeSngVals) {
  if (_TakeSngVals == -1) { _TakeSngVals = TakeSngVals; }
  // singular values, vectors
  if (StatFSet.In(gsdSngVal)) {
    const int SngVals = TMath::Mn(_TakeSngVals, Graph->GetNodes()/2);
    TFltV SngValV1;
    TSnap::GetSngVals(Graph, SngVals, SngValV1);
    SngValV1.Sort(false);
    TFltPrV& SngValV = DistrStatH.AddDat(gsdSngVal);
    SngValV.Gen(SngValV1.Len(), 0);
    for (int i = 0; i < SngValV1.Len(); i++) {
      SngValV.Add(TFltPr(i+1, SngValV1[i]));
    }
  }
  if (StatFSet.In(gsdSngVec)) {
    TFltV LeftV, RightV;
    TSnap::GetSngVec(Graph, LeftV, RightV);
    LeftV.Sort(false);
    TFltPrV& SngVec = DistrStatH.AddDat(gsdSngVec);
    SngVec.Gen(LeftV.Len(), 0);
    for (int i = 0; i < TMath::Mn(Kilo(10), LeftV.Len()/2); i++) {
      if (LeftV[i] > 0) { SngVec.Add(TFltPr(i+1, LeftV[i])); }
    }
  }
}
void BM_BarycentricRotatingDynamicFrame(
    benchmark::State& state) {  // NOLINT(runtime/references)
  Time const Δt = 5 * Minute;
  int const steps = state.range_x();

  SolarSystem<ICRFJ2000Equator> solar_system;
  solar_system.Initialize(
      SOLUTION_DIR / "astronomy" / "gravity_model.proto.txt",
      SOLUTION_DIR / "astronomy" /
          "initial_state_jd_2433282_500000000.proto.txt");
  auto const ephemeris = solar_system.MakeEphemeris(
      McLachlanAtela1992Order5Optimal<Position<ICRFJ2000Equator>>(),
      45 * Minute,
      5 * Milli(Metre));
  ephemeris->Prolong(solar_system.epoch() + steps * Δt);

  not_null<MassiveBody const*> const earth =
      solar_system.massive_body(*ephemeris, "Earth");
  not_null<MassiveBody const*> const venus =
      solar_system.massive_body(*ephemeris, "Venus");

  MasslessBody probe;
  Position<ICRFJ2000Equator> probe_initial_position =
      ICRFJ2000Equator::origin + Displacement<ICRFJ2000Equator>(
                                     {0.5 * AstronomicalUnit,
                                      -1 * AstronomicalUnit,
                                      0 * AstronomicalUnit});
  Velocity<ICRFJ2000Equator> probe_velocity =
      Velocity<ICRFJ2000Equator>({0 * SIUnit<Speed>(),
                                  100 * Kilo(Metre) / Second,
                                  0 * SIUnit<Speed>()});
  DiscreteTrajectory<ICRFJ2000Equator> probe_trajectory;
  FillLinearTrajectory<ICRFJ2000Equator, DiscreteTrajectory>(
      probe_initial_position,
      probe_velocity,
      solar_system.epoch(),
      Δt,
      steps,
      &probe_trajectory);

  BarycentricRotatingDynamicFrame<ICRFJ2000Equator, Rendering> dynamic_frame(
      ephemeris.get(), earth, venus);
  while (state.KeepRunning()) {
    auto v = ApplyDynamicFrame(&probe,
                               &dynamic_frame,
                               probe_trajectory.Begin(),
                               probe_trajectory.End());
  }
}
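// As excerpted, the benchmark is only defined, not registered.  With the
// (older) Google Benchmark API used here (state.range_x()), registration
// would look along these lines -- the argument values are illustrative, not
// taken from the source:
BENCHMARK(BM_BarycentricRotatingDynamicFrame)->Arg(10)->Arg(100)->Arg(1000);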
/// load bipartite community affiliation graph from text file
/// (each row contains the member node IDs of one community)
void TAGMUtil::LoadCmtyVV(const TStr& InFNm, TVec<TIntV>& CmtyVV) {
  CmtyVV.Gen(Kilo(100), 0);
  TSsParser Ss(InFNm, ssfWhiteSep);
  while (Ss.Next()) {
    if (Ss.GetFlds() > 0) {
      TIntV CmtyV;
      for (int i = 0; i < Ss.GetFlds(); i++) {
        if (Ss.IsInt(i)) { CmtyV.Add(Ss.GetInt(i)); }
      }
      CmtyVV.Add(CmtyV);
    }
  }
  CmtyVV.Pack();
  printf("community loading completed (%d communities)\n", CmtyVV.Len());
}
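// The parser expects whitespace-separated integer node IDs, one community per
// line (non-integer fields are skipped).  An illustrative three-community
// input file:
//   0 1 2 3
//   2 3 4
//   5 6 7 8 9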
// Eve communication network
PWgtNet TWgtNet::LoadEveCommNet(const TStr& FNm) {
  PWgtNet Net = TWgtNet::New();
  TStrSet AuthorSet;
  TChA Ln;
  TVec<char*> WrdV;
  TFIn FIn(FNm);
  for (int c = 0; FIn.GetNextLn(Ln); c++) {
    TStrUtil::SplitOnCh(Ln, WrdV, ';');
    const int n1 = AuthorSet.AddKey(WrdV[0]);
    const int n2 = AuthorSet.AddKey(WrdV[1]);
    if (! Net->IsNode(n1)) { Net->AddNode(n1, WrdV[0]); }
    if (! Net->IsNode(n2)) { Net->AddNode(n2, WrdV[1]); }
    if (Net->IsEdge(n1, n2)) { Net->GetEDat(n1, n2) += 1; }
    else { Net->AddEdge(n1, n2, 1); }
    if (c % Kilo(10) == 0) { printf("\r%dk", c/1000); }
  }
  printf("\n");
  TGBase::PrintInfo(Net);
  printf("  Edge weight: %f\n", Net->GetEdgeWgt());
  return Net;
}
// Gradient ascent on p_c while fixing the community affiliation graph (CAG).
int TAGMFit::MLEGradAscentGivenCAG(const double& Thres, const int& MaxIter, const TStr PlotNm) {
  int Edges = G->GetEdges();
  TExeTm ExeTm;
  TFltV GradV(LambdaV.Len());
  int iter = 0;
  TIntFltPrV IterLV, IterGradNormV;
  double GradCutOff = 1000;
  for (iter = 0; iter < MaxIter; iter++) {
    GradLogLForLambda(GradV);
    // if the gradient would step out of the feasible box, cut it off
    for (int i = 0; i < LambdaV.Len(); i++) {
      if (GradV[i] < -GradCutOff) { GradV[i] = -GradCutOff; }
      if (GradV[i] > GradCutOff) { GradV[i] = GradCutOff; }
      if (LambdaV[i] <= MinLambda && GradV[i] < 0) { GradV[i] = 0.0; }
      if (LambdaV[i] >= MaxLambda && GradV[i] > 0) { GradV[i] = 0.0; }
    }
    double Alpha = 0.15, Beta = 0.2;
    if (Edges > Kilo(100)) { Alpha = 0.00015;  Beta = 0.3; }
    double LearnRate = GetStepSizeByLineSearchForLambda(GradV, GradV, Alpha, Beta);
    if (TLinAlg::Norm(GradV) < Thres) { break; }
    for (int i = 0; i < LambdaV.Len(); i++) {
      double Change = LearnRate * GradV[i];
      LambdaV[i] += Change;
      if (LambdaV[i] < MinLambda) { LambdaV[i] = MinLambda; }
      if (LambdaV[i] > MaxLambda) { LambdaV[i] = MaxLambda; }
    }
    if (! PlotNm.Empty()) {
      double L = Likelihood();
      IterLV.Add(TIntFltPr(iter, L));
      IterGradNormV.Add(TIntFltPr(iter, TLinAlg::Norm(GradV)));
    }
  }
  if (! PlotNm.Empty()) {
    TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
    TGnuPlot::PlotValV(IterGradNormV, PlotNm + ".gradnorm_Q");
    printf("MLE for Lambda completed with %d iterations (%s)\n", iter, ExeTm.GetTmStr());
  }
  return iter;
}
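// GetStepSizeByLineSearchForLambda is not shown here; below is a minimal
// sketch of a standard backtracking (Armijo) line search for gradient
// *ascent*, which is presumably what it implements -- names and signature are
// hypothetical, and SNAP's version may differ:
#include <functional>
#include <vector>

double BacktrackingStepSketch(
    const std::vector<double>& x, const std::vector<double>& grad,
    const std::function<double(const std::vector<double>&)>& objective,
    double Alpha, double Beta, double t = 1.0, int max_shrinks = 50) {
  // Shrink the step by Beta until the objective gain is at least
  // Alpha * t * ||grad||^2.
  double grad_sq = 0.0;
  for (double g : grad) { grad_sq += g * g; }
  const double f0 = objective(x);
  std::vector<double> x_new(x.size());
  for (int k = 0; k < max_shrinks; k++, t *= Beta) {
    for (size_t i = 0; i < x.size(); i++) { x_new[i] = x[i] + t * grad[i]; }
    if (objective(x_new) >= f0 + Alpha * t * grad_sq) { return t; }
  }
  return 0.0;  // no acceptable step found
}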
void TTrawling::CountSupport() {
  for (int c = 0; c < CandItemH.Len(); c++) {
    CandItemH[c] = GetSupport(CandItemH.GetKey(c));
    if (c % Kilo(100) == 0) { printf("."); }
  }
}
TEST_F(ManœuvreTest, Apollo8SIVB) {
  // Data from NASA's Saturn V Launch Vehicle, Flight Evaluation Report AS-503,
  // Apollo 8 Mission (1969),
  // http://ntrs.nasa.gov/archive/nasa/casi.ntrs.nasa.gov/19690015314.pdf.
  // We use the reconstructed or actual values.

  // Table 2-2. Significant Event Times Summary.
  Instant const range_zero;
  Instant const s_ivb_1st_90_percent_thrust = range_zero + 530.53 * Second;
  Instant const s_ivb_1st_eco = range_zero + 684.98 * Second;
  // Initiate S-IVB Restart Sequence and Start of Time Base 6 (T6).
  Instant const t6 = range_zero + 9659.54 * Second;
  Instant const s_ivb_2nd_90_percent_thrust = range_zero + 10'240.02 * Second;
  Instant const s_ivb_2nd_eco = range_zero + 10'555.51 * Second;

  // From Table 7-2. S-IVB Steady State Performance - First Burn.
  Force thrust_1st = 901'557 * Newton;
  Speed specific_impulse_1st = 4'204.1 * Newton * Second / Kilogram;
  Variation<Mass> lox_flowrate_1st = 178.16 * Kilogram / Second;
  Variation<Mass> fuel_flowrate_1st = 36.30 * Kilogram / Second;

  // From Table 7-7. S-IVB Steady State Performance - Second Burn.
  Force thrust_2nd = 897'548 * Newton;
  Speed specific_impulse_2nd = 4199.2 * Newton * Second / Kilogram;
  Variation<Mass> lox_flowrate_2nd = 177.70 * Kilogram / Second;
  Variation<Mass> fuel_flowrate_2nd = 36.01 * Kilogram / Second;

  // Table 21-5. Total Vehicle Mass, S-IVB First Burn Phase, Kilograms.
  Mass total_vehicle_at_s_ivb_1st_90_percent_thrust = 161143 * Kilogram;
  Mass total_vehicle_at_s_ivb_1st_eco = 128095 * Kilogram;

  // Table 21-7. Total Vehicle Mass, S-IVB Second Burn Phase, Kilograms.
  Mass total_vehicle_at_s_ivb_2nd_90_percent_thrust = 126780 * Kilogram;
  Mass total_vehicle_at_s_ivb_2nd_eco = 59285 * Kilogram;

  // An arbitrary direction, we're not testing this.
  Vector<double, World> e_y({0, 1, 0});

  Manœuvre<World> first_burn(thrust_1st,
                             total_vehicle_at_s_ivb_1st_90_percent_thrust,
                             specific_impulse_1st,
                             e_y);
  EXPECT_THAT(RelativeError(lox_flowrate_1st + fuel_flowrate_1st,
                            first_burn.mass_flow()),
              Lt(1E-4));

  first_burn.set_duration(s_ivb_1st_eco - s_ivb_1st_90_percent_thrust);
  EXPECT_THAT(
      RelativeError(total_vehicle_at_s_ivb_1st_eco, first_burn.final_mass()),
      Lt(1E-3));

  first_burn.set_initial_time(s_ivb_1st_90_percent_thrust);
  EXPECT_EQ(s_ivb_1st_eco, first_burn.final_time());

  // Accelerations from Figure 4-4. Ascent Trajectory Acceleration Comparison.
  // Final acceleration from Table 4-2. Comparison of Significant Trajectory
  // Events.
  EXPECT_THAT(first_burn.acceleration()(first_burn.initial_time()).Norm(),
              AllOf(Gt(5 * Metre / Pow<2>(Second)),
                    Lt(6.25 * Metre / Pow<2>(Second))));
  EXPECT_THAT(first_burn.acceleration()(range_zero + 600 * Second).Norm(),
              AllOf(Gt(6.15 * Metre / Pow<2>(Second)),
                    Lt(6.35 * Metre / Pow<2>(Second))));
  EXPECT_THAT(first_burn.acceleration()(first_burn.final_time()).Norm(),
              AllOf(Gt(7.03 * Metre / Pow<2>(Second)),
                    Lt(7.05 * Metre / Pow<2>(Second))));

  Manœuvre<World> second_burn(thrust_2nd,
                              total_vehicle_at_s_ivb_2nd_90_percent_thrust,
                              specific_impulse_2nd,
                              e_y);
  EXPECT_THAT(RelativeError(lox_flowrate_2nd + fuel_flowrate_2nd,
                            second_burn.mass_flow()),
              Lt(2E-4));

  second_burn.set_duration(s_ivb_2nd_eco - s_ivb_2nd_90_percent_thrust);
  EXPECT_THAT(
      RelativeError(total_vehicle_at_s_ivb_2nd_eco, second_burn.final_mass()),
      Lt(2E-3));

  second_burn.set_initial_time(s_ivb_2nd_90_percent_thrust);
  EXPECT_EQ(s_ivb_2nd_eco, second_burn.final_time());

  // Accelerations from Figure 4-9. Injection Phase Acceleration Comparison.
  // Final acceleration from Table 4-2. Comparison of Significant Trajectory
  // Events.
  EXPECT_THAT(second_burn.acceleration()(second_burn.initial_time()).Norm(),
              AllOf(Gt(7 * Metre / Pow<2>(Second)),
                    Lt(7.5 * Metre / Pow<2>(Second))));
  EXPECT_THAT(second_burn.acceleration()(t6 + 650 * Second).Norm(),
              AllOf(Gt(8 * Metre / Pow<2>(Second)),
                    Lt(8.02 * Metre / Pow<2>(Second))));
  EXPECT_THAT(second_burn.acceleration()(t6 + 700 * Second).Norm(),
              AllOf(Gt(8.8 * Metre / Pow<2>(Second)),
                    Lt(9 * Metre / Pow<2>(Second))));
  EXPECT_THAT(second_burn.acceleration()(t6 + 750 * Second).Norm(),
              AllOf(Gt(9.9 * Metre / Pow<2>(Second)),
                    Lt(10 * Metre / Pow<2>(Second))));
  EXPECT_THAT(second_burn.acceleration()(t6 + 850 * Second).Norm(),
              AllOf(Gt(12.97 * Metre / Pow<2>(Second)),
                    Lt(13 * Metre / Pow<2>(Second))));
  EXPECT_THAT(second_burn.acceleration()(second_burn.final_time()).Norm(),
              AllOf(Gt(15.12 * Metre / Pow<2>(Second)),
                    Lt(15.17 * Metre / Pow<2>(Second))));

  EXPECT_THAT(second_burn.Δv(),
              AllOf(Gt(3 * Kilo(Metre) / Second),
                    Lt(3.25 * Kilo(Metre) / Second)));

  // From the Apollo 8 flight journal.
  EXPECT_THAT(AbsoluteError(10'519.6 * Foot / Second, second_burn.Δv()),
              Lt(20 * Metre / Second));
}
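// The assertions above are consistent with the standard constant-thrust
// relations: mass flow ṁ = F / I_sp (with the specific impulse expressed as
// an effective exhaust velocity, in N·s/kg, as in the tables), final mass
// m_f = m_0 - ṁ Δt, and the Tsiolkovsky equation Δv = I_sp ln(m_0 / m_f).
// A rough check on the second burn using the table masses directly:
//   4199.2 × ln(126780 / 59285) ≈ 3.19 km/s,
// inside the (3, 3.25) km/s bounds and within 20 m/s of the flight journal's
// 10'519.6 ft/s ≈ 3.21 km/s.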
// Parse files downloaded from IMDB. Actors point to movies.
// Files: actors.list.gz, languages.list.gz, countries.list.gz
PImdbNet TImdbNet::LoadFromImdb(const TStr& DataDir) {
  PImdbNet Net = TImdbNet::New();
  Net->Reserve((int) Mega(2.5), -1);
  // ACTORS
  {
    TSsParser Ss(DataDir+"\\actors.list.gz", ssfTabSep);
    while (Ss.Next() && strcmp(Ss[0], "THE ACTORS LIST") != 0) { }
    Ss.Next();  Ss.Next();  Ss.Next();
    int ActorId = -1, NAct = 0;
    for (int i = 0; Ss.Next(); i++) {
      //printf("%s\n", Ss.DumpStr());
      int mPos = 0;
      if (Ss.Len() > 1) { // actor
        ActorId = Net->AddStr(Ss[0]);
        if (Net->IsNode(ActorId)) {
          printf("  actor '%s'(%d) is already a node %s\n", Ss[0], ActorId,
                 TImdbNet::GetMovieTyStr((TMovieTy) Net->GetNDat(ActorId).Type.Val).CStr());
          continue;
        } else {
          Net->AddNode(ActorId);
        }
        TImdbNode& Node = Net->GetNDat(ActorId);
        Node.Name = ActorId;
        NAct++;
        Node.Type = mtyActor;
        mPos = 1;
        while (strlen(Ss[mPos]) == 0) { mPos++; }
      }
      // movie: delete everything from the last '<' or '['
      // (the position <...> is also parsed, but it is a property of the
      // (actor, movie) edge)
      char *DelFrom;
      char *C1 = strrchr(Ss[mPos], '<');
      char *C2 = strrchr(Ss[mPos], '[');
      if (C1 == NULL) { DelFrom = C2; }
      else if (C2 == NULL) { DelFrom = C1; }
      else { DelFrom = TMath::Mn(C1, C2); }
      if (DelFrom != NULL) {
        char *mov = DelFrom;
        while (*mov) { *mov = 0;  mov++; }
        mov = (char *) DelFrom - 1;
        while (TCh::IsWs(*mov)) { *mov = 0;  mov--; }
      }
      const int MovNId = Net->AddStr(Ss[mPos]);
      if (! Net->IsNode(MovNId)) {
        Net->AddNode(MovNId);
        TImdbNode& Node = Net->GetNDat(MovNId);
        Node.Type = mtyMovie;
        Node.Year = GetYearFromTitle(Ss[mPos]);
      }
      if (Net->IsEdge(ActorId, MovNId)) {
        printf("  already an edge %d %d\n", ActorId, MovNId);
      } else {
        Net->AddEdge(ActorId, MovNId);
      }
      if ((i+1) % Kilo(10) == 0) {
        printf("\r  %d", (i+1)/1000);
        if ((i+1) % Kilo(100) == 0) {
          printf("  nodes: %d, edges: %d, actors: %d",
                 Net->GetNodes(), Net->GetEdges(), NAct);
        }
      }
    }
    printf("\n=== nodes: %d, edges: %d, actors: %d",
           Net->GetNodes(), Net->GetEdges(), NAct);
  }
  // MOVIE LANGUAGE
  {
    TSsParser Ss(DataDir+"\\language.list.gz", ssfTabSep);
    while (Ss.Next() && strcmp(Ss[0], "LANGUAGE LIST") != 0) { }
    Ss.Next();
    int LangCnt = 0, i;
    for (i = 0; Ss.Next(); i++) {
      char *Mov = Ss[0];
      char *Lang = Ss[Ss.Len()-1];
      if (Net->IsStr(Mov)) {
        const int NId = Net->GetStrId(Mov);
        Net->GetNDat(NId).Lang = Net->AddStr(Lang);
        LangCnt++;
      }
      //else { printf("movie not found: '%s'\n", Mov); }
      if ((i+1) % Kilo(10) == 0) { printf("\r  %d  found %d  ", (i+1), LangCnt); }
    }
    printf("\n  LANG: total movies: %d, found %d\n", (i+1), LangCnt);
  }
  // MOVIE COUNTRY
  {
    TSsParser Ss(DataDir+"\\countries.list.gz", ssfTabSep);
    while (Ss.Next() && strcmp(Ss[0], "COUNTRIES LIST") != 0) { }
    Ss.Next();
    int LangCnt = 0, i;
    for (i = 0; Ss.Next(); i++) {
      char *Mov = Ss[0];
      char *Cntry = Ss[Ss.Len()-1];
      if (Net->IsStr(Mov)) {
        const int NId = Net->GetStrId(Mov);
        Net->GetNDat(NId).Cntry = Net->AddStr(Cntry);
        LangCnt++;
      }
      //else { printf("country not found: '%s'\n", Mov); }
      if ((i+1) % Kilo(10) == 0) { printf("\r  %d  found %d  ", (i+1), LangCnt); }
    }
    printf("\n  CNTRY: total movies: %d, found %d\n", (i+1), LangCnt);
  }
  return Net;
}
void BigMain(int argc, char* argv[]) {
  TExeTm ExeTm;
  Env = TEnv(argc, argv, TNotify::StdNotify);
  Env.PrepArgs("QuotesApp");
  const TStr ToDo = Env.GetIfArgPrefixStr("-do:", "", "To do").GetLc();
  if (Env.IsEndOfRun()) {
    printf("To do:\n");
    printf("  MkDataset : Make memes dataset (extract quotes and save txt)\n");
    printf("  ExtractSubset : Extract a subset of memes containing particular words\n");
    printf("  MemesToQtBs : Load memes dataset and create quote base\n");
    printf("  MkClustNet : Build cluster network from the quote base\n");
    return;
  }
#pragma region mkdataset
  // extract quotes and links and make them into a single file
  if (ToDo == "mkdataset") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "files.txt", "Spinn3r input files (one file per line)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "Spinn3r-dataset.txt", "Output file");
    const int MinQtWrdLen = Env.GetIfArgPrefixInt("-w:", 3, "Minimum quote word length");
    const TStr UrlFNm = Env.GetIfArgPrefixStr("-u:", "", "Seen url set (THashSet<TMd5Sig>) file name");
    const bool UrlOnlyOnce = Env.GetIfArgPrefixBool("-q:", true, "Only keep unique Urls");
    //// parse directly from Spinn3r
    TStr Spinn3rFNm;
    THashSet<TMd5Sig> SeenUrlSet;
    if (UrlOnlyOnce && ! UrlFNm.Empty()) {
      // keep track of already seen urls (so that there are no duplicate urls)
      TFIn FIn(UrlFNm);
      SeenUrlSet.Load(FIn);
    }
    FILE *F = fopen(OutFNm.CStr(), "wt");
    TFIn FIn(InFNm);
    int Items = 0;
    for (int f = 0; FIn.GetNextLn(Spinn3rFNm); f++) {
      TQuoteExtractor QE(Spinn3rFNm.ToTrunc());
      printf("Processing %02d: %s [%s]\n", f+1, Spinn3rFNm.CStr(), TExeTm::GetCurTm());
      fflush(stdout);
      for (int item = 0; QE.Next(); item++) {
        const TMd5Sig PostMd5(QE.PostUrlStr);
        if (QE.QuoteV.Empty() && QE.LinkV.Empty()) { continue; } // no quotes, no links
        if (UrlOnlyOnce) {
          if (SeenUrlSet.IsKey(PostMd5)) { continue; }
          SeenUrlSet.AddKey(PostMd5);
        }
        fprintf(F, "P\t%s\n", QE.PostUrlStr.CStr());
        //if (QE.PubTm > TSecTm(2008,8,30) || QE.PubTm < TSecTm(2008,7,25)) { printf("%s\n", QE.PubTm.GetStr().CStr()); }
        fprintf(F, "T\t%s\n", QE.PubTm.GetYmdTmStr().CStr());
        for (int q = 0; q < QE.QuoteV.Len(); q++) {
          if (TStrUtil::CountWords(QE.QuoteV[q]) >= MinQtWrdLen) {
            fprintf(F, "Q\t%s\n", QE.QuoteV[q].CStr());
          }
        }
        for (int l = 0; l < QE.LinkV.Len(); l++) {
          fprintf(F, "L\t%s\n", QE.LinkV[l].CStr());
        }
        fprintf(F, "\n");
        if (item > 0 && item % Kilo(100) == 0) { QE.DumpStat();  QE.ExeTm.Tick(); }
        Items++;
      }
      printf("file done. Total %d all posts, %d all items\n", SeenUrlSet.Len(), Items);
      fflush(stdout);
    }
    printf("all done. Saving %d post urls\n", SeenUrlSet.Len());
    fflush(stdout);
    if (! SeenUrlSet.Empty()) {
      TFOut FOut(OutFNm.GetFMid()+".SeenUrlSet");
      SeenUrlSet.Save(FOut);
    }
    fclose(F);
  }
#pragma endregion mkdataset
#pragma region extractsubset
  // save posts with memes containing particular words
  else if (ToDo == "extractsubset") {
    const TStr InFNmWc = Env.GetIfArgPrefixStr("-i:", "memes_*.rar", "Input file prefix");
    const bool IsInFNmWc = Env.GetIfArgPrefixBool("-w:", true, "Input is wildcard (else a file with list of input files)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "memes-subset.txt", "Output memes file");
    const TStr WordsFNm = Env.GetIfArgPrefixStr("-p:", "phrases-in.txt", "Phrases that memes have to contain");
    TChAV CatchMemeV; // = TStr::GetV("great depression", "economic meltdown", "recession had bottomed out", "green shoots", "slow recovery", "gradual recovery");
    printf("Loading %s\n", WordsFNm.CStr());
    {
      TFIn FIn(WordsFNm);
      for (TStr Ln; FIn.GetNextLn(Ln); ) {
        printf("  %s\n", Ln.GetLc().CStr());
        CatchMemeV.Add(Ln.GetLc());
      }
    }
    printf("%d strings loaded\n", CatchMemeV.Len());
    TFOut FOut(OutFNm);
    TMemesDataLoader Memes(InFNmWc, IsInFNmWc);
    for (int posts = 0, nsave = 0; Memes.LoadNext(); posts++) {
      bool DoSave = false;
      for (int m = 0; m < Memes.MemeV.Len(); m++) {
        for (int i = 0; i < CatchMemeV.Len(); i++) {
          if (Memes.MemeV[m].SearchStr(CatchMemeV[i]) != -1) { DoSave = true;  break; }
        }
        if (DoSave) { break; }
      }
      if (DoSave) { Memes.SaveTxt(FOut);  nsave++; }
      if (posts % Mega(1) == 0) {
        printf("%dm posts, %d saved\n", posts/Mega(1), nsave);
        FOut.Flush();
      }
    }
  }
#pragma endregion extractsubset
#pragma region memestoqtbs
  // load memes dataset (MkDataset) and create quote base
  else if (ToDo == "memestoqtbs") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201007_201107.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "File with news media urls");
    const TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-l:", 4, "Min quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
    const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20100714", "Min time of quotes, format = YYYYMMDD");
    const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20110728", "Max time of quotes, format = YYYYMMDD");
    TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),
                 atoi(MinTmStr.GetSubStr(4,5).CStr()),
                 atoi(MinTmStr.GetSubStr(6,7).CStr()));
    TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),
                 atoi(MaxTmStr.GetSubStr(4,5).CStr()),
                 atoi(MaxTmStr.GetSubStr(6,7).CStr()));
    PQuoteBs QtBs = TQuoteBs::New();
    int HashTableSize = 100; // 100 for each quarter; for one year of data, use 400
    int UrlSetSize = 4 * HashTableSize;
    QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm,
                             MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
  }
#pragma endregion memestoqtbs
#pragma region mkclustnet
  // make cluster network
  else if (ToDo == "mkclustnet") {
    TStr InQtBsNm = Env.GetIfArgPrefixStr("-i:", "", "Input quote base file name");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output network/updated QtBs filename");
    TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
    bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
    bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
    double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
    double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
    double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
    double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote clusters in the refining process");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
    const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-mf:", 5, "Min meme frequency");
    const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
    // Load quote base
    PQuoteBs QtBs;
    if (TZipIn::IsZipFNm(InQtBsNm)) {
      TZipIn ZipIn(InQtBsNm);
      QtBs = TQuoteBs::Load(ZipIn);
    } else {
      TFIn FIn(InQtBsNm);
      QtBs = TQuoteBs::Load(FIn);
    }
    // Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref,
                        IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);
    // Dump the clusters
    bool SkipUrl = true, FlashDisp = true;
    QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
  }
#pragma endregion mkclustnet
#pragma region memeclust
  else if (ToDo.SearchStr(TStr("memeclust")) >= 0) {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201101.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "File with news media urls");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
    const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
    const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
    TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
    bool IsQtBsReady = Env.GetIfArgPrefixBool("-qtbsready:", false, "Indicate whether quote base is ready and can be loaded readily");
    bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
    bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
    double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
    double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
    double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
    double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote clusters in the refining process");
    const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20010101", "Min time of quotes, format = YYYYMMDD");
    const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20200101", "Max time of quotes, format = YYYYMMDD");
    TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),
                 atoi(MinTmStr.GetSubStr(4,5).CStr()),
                 atoi(MinTmStr.GetSubStr(6,7).CStr()));
    TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),
                 atoi(MaxTmStr.GetSubStr(4,5).CStr()),
                 atoi(MaxTmStr.GetSubStr(6,7).CStr()));
    // Construct the quote base from Zarya data
    PQuoteBs QtBs = TQuoteBs::New();
    if (! IsQtBsReady) {
      int HashTableSize = 100; // 100 for each quarter; for one year of data, use 400
      if (ToDo == "memeclustzarya") {
        int UrlSetSize = 4 * HashTableSize;
        QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm,
                                 MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
      } else if (ToDo == "memeclustqtonly") {
        QtBs->ConstructQtBsQtOnly(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
      } else if (ToDo == "memeclustqttime") {
        QtBs->ConstructQtBsQtTime(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
      } else {
        printf("Please specify one of the three options for -do : memeclustzarya, memeclustqtonly, memeclustqttime!\n");
        return;
      }
    } else {
      TStr InQtBsNm = TStr::Fmt("%s-w%dmfq%d.QtBs", Pref.CStr(), MinWrdLen, MinMemeFq);
      if (TZipIn::IsZipFNm(InQtBsNm)) {
        TZipIn ZipIn(InQtBsNm);
        QtBs = TQuoteBs::Load(ZipIn);
      } else {
        TFIn FIn(InQtBsNm);
        QtBs = TQuoteBs::Load(FIn);
      }
    }
    // Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref,
                        IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);
    // Dump the clusters
    bool SkipUrl = true, FlashDisp = true;
    QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
  }
#pragma endregion memeclust
}
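// Illustrative invocations, based only on the argument prefixes defined
// above (file names are examples, not taken from the source):
//   QuotesApp -do:mkdataset -i:files.txt -o:Spinn3r-dataset.txt -w:3
//   QuotesApp -do:memestoqtbs -i:201007_201107.txt -u:news_media.txt -o:qt
//   QuotesApp -do:mkclustnet -i:qt-w4mfq5.QtBs -o:qt -b:quote_blacklist.txt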