Пример #1
0
// Builds an actor -> movie network from the plain-text IMDB list dumps.
// Files read from DataDir: actors.list.gz, language.list.gz, countries.list.gz
//
// Pass 1 (actors): a line with more than one tab-separated field starts a new
//   actor (field 0 is the actor name, the first non-empty later field is a
//   movie); a line with at most one field is another movie of the current
//   actor. Actors and movies become nodes whose id is the id Net->AddStr()
//   returns for their name, and an actor -> movie edge is added.
// Pass 2/3 (language, country): for each line whose first field names an
//   existing node, the last field is stored as that node's Lang / Cntry
//   string id.
//
// DataDir: directory holding the three list files.
// Returns the populated network.
// NOTE(review): paths are joined with '\\', so this only works as-is where
//   backslash is a valid path separator.
PImdbNet TImdbNet::LoadFromImdb(const TStr& DataDir) {
  PImdbNet Net = TImdbNet::New();
  Net->Reserve(static_cast<int>(Mega(2.5)), -1);
  // ACTORS
  { TSsParser Ss(DataDir+"\\actors.list.gz", ssfTabSep);
  // skip everything up to the list banner, then three more lines after it
  while (Ss.Next() && strcmp(Ss[0],"THE ACTORS LIST")!=0) { }
  Ss.Next();  Ss.Next();  Ss.Next();
  int ActorId = -1, NAct=0;
  for (int i = 0; Ss.Next(); i++) {
    //printf("%s\n", Ss.DumpStr());
    int mPos = 0;
    if (Ss.Len() > 1) { // actor
      ActorId = Net->AddStr(Ss[0]);
      if (Net->IsNode(ActorId)) {
        printf("  actor '%s'(%d) is already a node %s\n", Ss[0], 
          ActorId, TImdbNet::GetMovieTyStr(static_cast<TMovieTy>(Net->GetNDat(ActorId).Type.Val)).CStr());
        continue;
      } else { Net->AddNode(ActorId); }
      TImdbNode& Node = Net->GetNDat(ActorId);
      Node.Name = ActorId;  NAct++;
      Node.Type = mtyActor;
      // the movie is the first non-empty field after the name; stay inside
      // the parsed fields instead of scanning past the end of the line
      mPos = 1;
      while (mPos < Ss.Len() && strlen(Ss[mPos])==0) { mPos++; }
      if (mPos >= Ss.Len()) { continue; } // actor line without a movie
    }
    // a movie line before any actor line has nothing to attach to
    if (ActorId == -1) { continue; }
    // movie: cut the title at the earlier of the last '<' and the last '['
    //  ('<>' holds the billing position, which would be a property of the
    //  (actor, movie) edge and is not stored)
    char *Title = Ss[mPos];
    char *DelFrom;
    char *C1 = strrchr(Title, '<');
    char *C2 = strrchr(Title, '[');
    if (C1==NULL) { DelFrom=C2; }
    else if (C2==NULL) { DelFrom=C1; }
    else { DelFrom = TMath::Mn(C1, C2); }
    if (DelFrom != NULL) {
      *DelFrom = 0;
      // strip whitespace left in front of the cut, never stepping before
      // the start of the title buffer
      char *End = DelFrom;
      while (End > Title && TCh::IsWs(*(End-1))) { End--;  *End = 0; }
    }
    // NOTE(review): unlike actor nodes, movie nodes do not get Node.Name set
    // here; the node id already equals the title's string id.
    const int MovNId = Net->AddStr(Title);
    if (! Net->IsNode(MovNId)) {
      Net->AddNode(MovNId);
      TImdbNode& Node = Net->GetNDat(MovNId);
      Node.Type = mtyMovie;  Node.Year = GetYearFromTitle(Title);
    }
    if (Net->IsEdge(ActorId, MovNId)) {
      printf("  already an edge %d %d\n", ActorId, MovNId); }
    else { Net->AddEdge(ActorId, MovNId); }
    // progress: line count (in thousands) every 10k lines, stats every 100k
    if ((i+1) % Kilo(10) == 0) { 
      printf("\r   %d", (i+1)/1000); 
      if ((i+1) % Kilo(100) == 0) { 
        printf(" nodes: %d, edges: %d, actors: %d", 
          Net->GetNodes(), Net->GetEdges(), NAct); }
    }
  } 
  printf("\n=== nodes: %d, edges: %d, actors: %d", Net->GetNodes(), Net->GetEdges(), NAct); }
  // MOVIE LANGUAGE
  { TSsParser Ss(DataDir+"\\language.list.gz", ssfTabSep);
  while (Ss.Next() && strcmp(Ss[0],"LANGUAGE LIST")!=0) { }
  Ss.Next();
  int LangCnt=0, i;
  for (i = 0; Ss.Next(); i++) {
    char *Mov = Ss[0];
    char *Lang = Ss[Ss.Len()-1];
    if (Net->IsStr(Mov)) {
      const int NId = Net->GetStrId(Mov);
      // language names are registered with AddStr() below without a node,
      // so a known string is not necessarily a node
      if (Net->IsNode(NId)) {
        Net->GetNDat(NId).Lang = Net->AddStr(Lang);
        LangCnt++;
      }
    } //else { printf("movie not found: '%s'\n", Mov); }
    if ((i+1) % Kilo(10) == 0) { 
      printf("\r   %d found %d ", (i+1), LangCnt); }
  } 
  // after the loop i already equals the number of lines processed
  printf("\n  LANG: total movies: %d,  found %d\n", i, LangCnt); }
  // MOVIE COUNTRY
  { TSsParser Ss(DataDir+"\\countries.list.gz", ssfTabSep);
  while (Ss.Next() && strcmp(Ss[0],"COUNTRIES LIST")!=0) { }
  Ss.Next();
  int CntryCnt=0, i;
  for (i = 0; Ss.Next(); i++) {
    char *Mov = Ss[0];
    char *Cntry = Ss[Ss.Len()-1];
    if (Net->IsStr(Mov)) {
      const int NId = Net->GetStrId(Mov);
      // same guard as in the language pass: country names are plain strings
      if (Net->IsNode(NId)) {
        Net->GetNDat(NId).Cntry = Net->AddStr(Cntry);
        CntryCnt++;
      }
    } //else { printf("country not found: '%s'\n", Mov); }
    // progress rewrites one line in place ('\r'), as in the language pass
    if ((i+1) % Kilo(10) == 0) { 
      printf("\r   %d found %d ", (i+1), CntryCnt); }
  } 
  printf("\n  CNTRY: total movies: %d,  found %d\n", i, CntryCnt);  }
  return Net;
}
Пример #2
0
// Load from allactors.zip that was prepared by Brad Malin in 2005.
//
// ActorFNm: path of the '|'-separated text file. The first line is skipped;
//   every following line must split into exactly 7 columns:
//     0,1  joined with a space (trimmed, lower-cased) into the actor name
//     2    movie title (trimmed, lower-cased)
//     3    year; strings longer than 4 chars are cut with GetSubStr(0, 3)
//     4    movie type, mapped through TImdbNet::GetMovieTy
//     5    position
//     6    'M' or 'F'
// One node is created per distinct actor name and per distinct
// (movie title, year) pair, with an actor -> movie edge for each line.
// Afterwards every actor's Year is lowered to the smallest year among the
// movie nodes it points to.
// Returns the populated network. Asserts if the file cannot be opened.
PImdbNet TImdbNet::LoadTxt(const TStr& ActorFNm) {
  PImdbNet Net = TImdbNet::New();
  TStrV ColV;
  char line [2024];
  int NLines=0, DupEdge=0, Year, Position, ActorNId, MovieNId;
  TIntH ActorNIdH;                 // actor name string id -> node id
  THash<TIntPr, TInt> MovieNIdH;   // (movie title string id, year) -> node id
  FILE *F = fopen(ActorFNm.CStr(), "rt");
  // fail here rather than hand a NULL stream to fgets()
  IAssert(F != NULL);
  fgets(line, sizeof(line), F);  // skip the first line
  // loop on the read itself; fgets() returns NULL at end of file
  while (fgets(line, sizeof(line), F) != NULL) {
    if (line[0] == 0) break;
    TStr(line).SplitOnAllCh('|', ColV, false);  IAssert(ColV.Len() == 7);
    const int NameStrId = Net->AddStr(ColV[0].GetTrunc().GetLc()+" "+ColV[1].GetTrunc().GetLc());
    const int MovieStrId = Net->AddStr(ColV[2].GetTrunc().GetLc());
    TStr YearStr = ColV[3].GetTrunc();
    if (YearStr.Len() > 4) YearStr = YearStr.GetSubStr(0, 3);
    // presets act as fallbacks; presumably IsInt() leaves them untouched when
    // the text is not an integer -- TODO confirm
    Year = 1;  YearStr.IsInt(Year);
    const TMovieTy MovieTy = TImdbNet::GetMovieTy(ColV[4]);
    Position = TInt::Mx;  ColV[5].GetTrunc().IsInt(Position);
    const TStr GenderStr = ColV[6].GetTrunc();
    IAssert(GenderStr[0] == 'M' || GenderStr[0]=='F');
    const bool IsMale = GenderStr[0] == 'M';
    // create nodes  
    if (ActorNIdH.IsKey(NameStrId)) { 
      ActorNId = ActorNIdH.GetDat(NameStrId); }
    else { 
      ActorNId = Net->AddNode(-1, TImdbNode(NameStrId, Year, Position, IsMale));
      ActorNIdH.AddDat(NameStrId, ActorNId);
    }
    const TIntPr MovieKey(MovieStrId, Year);
    if (MovieNIdH.IsKey(MovieKey)) {
      MovieNId = MovieNIdH.GetDat(MovieKey); }
    else {
      // the movie node carries the movie's own title id, not the actor's name
      MovieNId = Net->AddNode(-1, TImdbNode(MovieStrId, Year, MovieTy)); 
      MovieNIdH.AddDat(MovieKey, MovieNId); 
    }
    if (! Net->IsEdge(ActorNId, MovieNId)) { 
      Net->AddEdge(ActorNId, MovieNId); }
    else { DupEdge++; }
    if (++NLines % 100000 == 0) printf("\r  %dk  ", NLines/1000);
  }
  fclose(F);
  printf("duplicate edges: %d\n", DupEdge);
  printf("nodes:  %d\n", Net->GetNodes());
  printf("edges:  %d\n", Net->GetEdges());
  printf("actors: %d\n", ActorNIdH.Len());
  printf("movies: %d\n", MovieNIdH.Len());
  // set the actor year to the year of his first movie
  int NUpdates=0;
  for (TNet::TNodeI NI = Net->BegNI(); NI < Net->EndNI(); NI++) {
    if (NI().IsActor()) {
      int MinYear = NI().GetYear();
      for (int e = 0; e < NI.GetOutDeg(); e++) {
        const TImdbNode& NodeDat = Net->GetNDat(NI.GetOutNId(e));
        if (NodeDat.IsMovie()) MinYear = TMath::Mn(MinYear, NodeDat.GetYear());
      }
      if (NI().Year != MinYear) NUpdates++;
      NI().Year = MinYear;
    }
  }
  printf("updated actor times: %d\n", NUpdates);
  return Net;
}