Exemplo n.º 1
0
// Parse files downloaded from IMDB. Actors point to movies.
// Files: actors.list.gz, language.list.gz, countries.list.gz
//
// DataDir is the directory holding the three files; each file name is appended
// to it with a backslash separator.
// NOTE(review): the "\\" separator is Windows-style -- confirm whether this
// loader is expected to run on other platforms.
//
// Pass 1 (actors.list.gz): a row with more than one field starts a new actor
// (field 0 is the name); a row with a single field is a further title for the
// most recent actor. Every title is cut at its last '<' or '[' (whichever comes
// first) and right-trimmed before it is used as the movie's string key. An edge
// actor -> movie is added for every credit.
// Pass 2 / 3 (language.list.gz, countries.list.gz): field 0 is looked up in the
// network's string pool; when it names an existing node, the string id of the
// row's last field is stored in that node's Lang / Cntry member.
//
// Returns the populated network.
PImdbNet TImdbNet::LoadFromImdb(const TStr& DataDir) {
  PImdbNet Net = TImdbNet::New();
  Net->Reserve(static_cast<int>(Mega(2.5)), -1);
  // ACTORS
  { TSsParser Ss(DataDir+"\\actors.list.gz", ssfTabSep);
  // Skip everything up to the list banner; rows without any field cannot be the banner.
  while (Ss.Next() && (Ss.Len() == 0 || strcmp(Ss[0],"THE ACTORS LIST")!=0)) { }
  // Skip the three rows that follow the banner
  // (presumably column headings -- confirm against the data file).
  Ss.Next();  Ss.Next();  Ss.Next();
  int ActorId = -1, NAct = 0;
  for (int i = 0; Ss.Next(); i++) {
    //printf("%s\n", Ss.DumpStr());
    if (Ss.Len() == 0) { continue; } // no field to read on this row
    int mPos = 0;
    if (Ss.Len() > 1) { // actor
      ActorId = Net->AddStr(Ss[0]);
      if (Net->IsNode(ActorId)) {
        // NOTE(review): this skips the credit on the current row, yet ActorId
        // keeps pointing at the existing node, so later continuation rows are
        // still attached to it -- confirm that this is intended.
        printf("  actor '%s'(%d) is already a node %s\n", Ss[0], 
          ActorId, TImdbNet::GetMovieTyStr((TMovieTy) Net->GetNDat(ActorId).Type.Val).CStr());
        continue;
      }
      Net->AddNode(ActorId);
      TImdbNode& Node = Net->GetNDat(ActorId);
      Node.Name = ActorId;  NAct++;
      Node.Type = mtyActor;
      // The title is the first non-empty field after the name. Stop at the
      // last field so the scan never indexes past the end of the row.
      mPos = 1;
      while (mPos < Ss.Len()-1 && strlen(Ss[mPos])==0) { mPos++; }
    }
    // A title row seen before any actor row has nothing to attach to.
    if (ActorId == -1) { continue; }
    // movie (delete everything from the last '<' or '[', whichever is first)
    //  the position <> is a property of the (actor, movie) edge and is not kept
    char *Title = Ss[mPos];
    char *C1 = strrchr(Title, '<');
    char *C2 = strrchr(Title, '[');
    char *DelFrom;
    if (C1==NULL) { DelFrom=C2; }
    else if (C2==NULL) { DelFrom=C1; }
    else { DelFrom = TMath::Mn(C1, C2); }
    if (DelFrom != NULL) {
      *DelFrom = 0;
      // Trim trailing whitespace, never stepping in front of the title itself
      // (the title may consist of nothing but the removed suffix).
      while (DelFrom > Title && TCh::IsWs(DelFrom[-1])) { DelFrom--;  *DelFrom = 0; }
    }
    const int MovNId = Net->AddStr(Title);
    if (! Net->IsNode(MovNId)) {
      Net->AddNode(MovNId);
      TImdbNode& Node = Net->GetNDat(MovNId);
      Node.Type = mtyMovie;  Node.Year = GetYearFromTitle(Title);
    }
    if (Net->IsEdge(ActorId, MovNId)) {
      printf("  already an edge %d %d\n", ActorId, MovNId); }
    else { Net->AddEdge(ActorId, MovNId); }
    // Progress: row count in thousands every 10k rows, full stats every 100k.
    if ((i+1) % Kilo(10) == 0) { 
      printf("\r   %d", (i+1)/1000); 
      if ((i+1) % Kilo(100) == 0) { 
        printf(" nodes: %d, edges: %d, actors: %d", 
          Net->GetNodes(), Net->GetEdges(), NAct); }
    }
  } 
  // Terminate the summary line so the next section's "\r" progress does not overwrite it.
  printf("\n=== nodes: %d, edges: %d, actors: %d\n", Net->GetNodes(), Net->GetEdges(), NAct); }
  // MOVIE LANGUAGE
  { TSsParser Ss(DataDir+"\\language.list.gz", ssfTabSep);
  while (Ss.Next() && (Ss.Len() == 0 || strcmp(Ss[0],"LANGUAGE LIST")!=0)) { }
  Ss.Next(); // skip the row after the banner
  int LangCnt=0, i;
  for (i = 0; Ss.Next(); i++) {
    if (Ss.Len() == 0) { continue; } // no field to read on this row
    char *Mov = Ss[0];
    char *Lang = Ss[Ss.Len()-1];
    if (Net->IsStr(Mov)) {
      const int NId = Net->GetStrId(Mov);
      // The string pool also holds language/country strings that were never
      // added as nodes; only touch node data for ids that really are nodes.
      if (Net->IsNode(NId)) {
        Net->GetNDat(NId).Lang = Net->AddStr(Lang);
        LangCnt++;
      }
    } //else { printf("movie not found: '%s'\n", Mov); }
    if ((i+1) % Kilo(10) == 0) { 
      printf("\r   %d found %d ", (i+1), LangCnt); }
  } 
  // After the loop i already equals the number of rows read.
  printf("\n  LANG: total movies: %d,  found %d\n", i, LangCnt); }
  // MOVIE COUNTRY
  { TSsParser Ss(DataDir+"\\countries.list.gz", ssfTabSep);
  while (Ss.Next() && (Ss.Len() == 0 || strcmp(Ss[0],"COUNTRIES LIST")!=0)) { }
  Ss.Next(); // skip the row after the banner
  int CntryCnt=0, i;
  for (i = 0; Ss.Next(); i++) {
    if (Ss.Len() == 0) { continue; } // no field to read on this row
    char *Mov = Ss[0];
    char *Cntry = Ss[Ss.Len()-1];
    if (Net->IsStr(Mov)) {
      const int NId = Net->GetStrId(Mov);
      // Same node check as in the language pass.
      if (Net->IsNode(NId)) {
        Net->GetNDat(NId).Cntry = Net->AddStr(Cntry);
        CntryCnt++;
      }
    } //else { printf("country not found: '%s'\n", Mov); }
    // Progress rewrites the current line ("\r"), matching the language pass.
    if ((i+1) % Kilo(10) == 0) { 
      printf("\r   %d found %d ", (i+1), CntryCnt); }
  } 
  // After the loop i already equals the number of rows read.
  printf("\n  CNTRY: total movies: %d,  found %d\n", i, CntryCnt);  }
  return Net;
}