// Parse files downloaded from IMDB. Actors point to movies. // Files: actors.list.gz, languages.list.gz, countries.list.gz PImdbNet TImdbNet::LoadFromImdb(const TStr& DataDir) { PImdbNet Net = TImdbNet::New(); Net->Reserve((int) Mega(2.5), -1); // ACTORS { TSsParser Ss(DataDir+"\\actors.list.gz", ssfTabSep); while (Ss.Next() && strcmp(Ss[0],"THE ACTORS LIST")!=0) { } Ss.Next(); Ss.Next(); Ss.Next(); int ActorId = -1, NAct=0; for (int i = 0; Ss.Next(); i++) { //printf("%s\n", Ss.DumpStr()); int mPos = 0; if (Ss.Len() > 1) { // actor ActorId = Net->AddStr(Ss[0]); if (Net->IsNode(ActorId)) { printf(" actor '%s'(%d) is already a node %s\n", Ss[0], ActorId, TImdbNet::GetMovieTyStr((TMovieTy) Net->GetNDat(ActorId).Type.Val).CStr()); continue; } else { Net->AddNode(ActorId); } TImdbNode& Node = Net->GetNDat(ActorId); Node.Name = ActorId; NAct++; Node.Type = mtyActor; mPos = 1; while (strlen(Ss[mPos])==0) { mPos++; } } // movie (delete everything last [) // also parse the position <>, but is a property of an edge (actor, movie) pair char *DelFrom; char *C1 = strrchr(Ss[mPos], '<'); char *C2 = strrchr(Ss[mPos], '['); if (C1==NULL) { DelFrom=C2; } else if (C2==NULL) { DelFrom=C1; } else { DelFrom = TMath::Mn(C1, C2); } if (DelFrom != NULL) { char *mov = DelFrom; while (*mov) { *mov=0; mov++; } mov = (char *) DelFrom-1; while (TCh::IsWs(*mov)) { *mov=0; mov--; } } const int MovNId = Net->AddStr(Ss[mPos]); if (! Net->IsNode(MovNId)) { Net->AddNode(MovNId); TImdbNode& Node = Net->GetNDat(MovNId); Node.Type = mtyMovie; Node.Year = GetYearFromTitle(Ss[mPos]); } if (Net->IsEdge(ActorId, MovNId)) { printf(" already an edge %d %d\n", ActorId, MovNId); } else { Net->AddEdge(ActorId, MovNId); } if ((i+1) % Kilo(10) == 0) { printf("\r %d", (i+1)/1000); if ((i+1) % Kilo(100) == 0) { printf(" nodes: %d, edges: %d, actors: %d", Net->GetNodes(), Net->GetEdges(), NAct); } } } printf("\n=== nodes: %d, edges: %d, actors: %d", Net->GetNodes(), Net->GetEdges(), NAct); } // MOVIE LANGUAGE */ { TSsParser Ss(DataDir+"\\language.list.gz", ssfTabSep); while (Ss.Next() && strcmp(Ss[0],"LANGUAGE LIST")!=0) { } Ss.Next(); int LangCnt=0, i; for (i = 0; Ss.Next(); i++) { char *Mov = Ss[0]; char *Lang = Ss[Ss.Len()-1]; if (Net->IsStr(Mov)) { const int NId = Net->GetStrId(Mov); Net->GetNDat(NId).Lang = Net->AddStr(Lang); LangCnt++; } //else { printf("movie not found: '%s'\n", Mov); } if ((i+1) % Kilo(10) == 0) { printf("\r %d found %d ", (i+1), LangCnt); } } printf("\n LANG: total movies: %d, found %d\n", (i+1), LangCnt); } // MOVIE COUNTRY { TSsParser Ss(DataDir+"\\countries.list.gz", ssfTabSep); while (Ss.Next() && strcmp(Ss[0],"COUNTRIES LIST")!=0) { } Ss.Next(); int LangCnt=0, i; for (i = 0; Ss.Next(); i++) { char *Mov = Ss[0]; char *Cntry = Ss[Ss.Len()-1]; if (Net->IsStr(Mov)) { const int NId = Net->GetStrId(Mov); Net->GetNDat(NId).Cntry = Net->AddStr(Cntry); LangCnt++; } //else { printf("country not found: '%s'\n", Mov); } if ((i+1) % Kilo(10) == 0) { printf("\n %d found %d ", (i+1), LangCnt); } } printf("\r CNTRY: total movies: %d, found %d\n", (i+1), LangCnt); } return Net; }
// load from allactors.zip that was prepared by Brad Malin in 2005 PImdbNet TImdbNet::LoadTxt(const TStr& ActorFNm) { PImdbNet Net = TImdbNet::New(); TStrV ColV; char line [2024]; int NLines=0, DupEdge=0, Year, Position, ActorNId, MovieNId; TIntH ActorNIdH; THash<TIntPr, TInt> MovieNIdH; FILE *F = fopen(ActorFNm.CStr(), "rt"); fgets(line, 2024, F); while (! feof(F)) { memset(line, 0, 2024); fgets(line, 2024, F); if (strlen(line) == 0) break; TStr(line).SplitOnAllCh('|', ColV, false); IAssert(ColV.Len() == 7); const int NameStrId = Net->AddStr(ColV[0].GetTrunc().GetLc()+" "+ColV[1].GetTrunc().GetLc()); const int MovieStrId = Net->AddStr(ColV[2].GetTrunc().GetLc()); TStr YearStr = ColV[3].GetTrunc(); if (YearStr.Len() > 4) YearStr = YearStr.GetSubStr(0, 3); Year = 1; YearStr.IsInt(Year); const TMovieTy MovieTy = TImdbNet::GetMovieTy(ColV[4]); Position = TInt::Mx; ColV[5].GetTrunc().IsInt(Position); IAssert(ColV[6].GetTrunc()[0] == 'M' || ColV[6].GetTrunc()[0]=='F'); const bool IsMale = ColV[6].GetTrunc()[0] == 'M'; // create nodes if (ActorNIdH.IsKey(NameStrId)) { ActorNId = ActorNIdH.GetDat(NameStrId); } else { ActorNId = Net->AddNode(-1, TImdbNode(NameStrId, Year, Position, IsMale)); ActorNIdH.AddDat(NameStrId, ActorNId); } if (MovieNIdH.IsKey(TIntPr(MovieStrId, Year))) { MovieNId = MovieNIdH.GetDat(TIntPr(MovieStrId, Year)); } else { MovieNId = Net->AddNode(-1, TImdbNode(NameStrId, Year, MovieTy)); MovieNIdH.AddDat(TIntPr(MovieStrId, Year), MovieNId); } if (! Net->IsEdge(ActorNId, MovieNId)) { Net->AddEdge(ActorNId, MovieNId); } else { DupEdge++; } if (++NLines % 100000 == 0) printf("\r %dk ", NLines/1000); } fclose(F); printf("duplicate edges: %d\n", DupEdge); printf("nodes: %d\n", Net->GetNodes()); printf("edges: %d\n", Net->GetEdges()); printf("actors: %d\n", ActorNIdH.Len()); printf("movies: %d\n", MovieNIdH.Len()); // set the actor year to the year of his first movie int NUpdates=0; for (TNet::TNodeI NI = Net->BegNI(); NI < Net->EndNI(); NI++) { if (NI().IsActor()) { int MinYear = NI().GetYear(); for (int e = 0; e < NI.GetOutDeg(); e++) { const TImdbNode& NodeDat = Net->GetNDat(NI.GetOutNId(e)); if (NodeDat.IsMovie()) MinYear = TMath::Mn(MinYear, NodeDat.GetYear()); } if (NI().Year != MinYear) NUpdates++; NI().Year = MinYear; } } printf("updated actor times: %d\n", NUpdates); return Net; }