예제 #1
0
// Creates a TAmazonItem from an XML document whose root tag is <AmazonItem>.
// Fields are filled in this order: ItemId, Title, every Authors/Name entry,
// and every XSell/ItemId entry (stored as the "next" item-ids).
// The ItemId and Title tokens are dereferenced without a presence check.
PAmazonItem TAmazonItem::New(const PXmlDoc& XmlDoc){
  PAmazonItem NewItem=PAmazonItem(new TAmazonItem());

  // scalar fields: item-id and title
  NewItem->ItemId=XmlDoc->GetTagTok("AmazonItem|ItemId")->GetTokStr(false);
  NewItem->TitleStr=XmlDoc->GetTagTok("AmazonItem|Title")->GetTokStr(false);

  // author names, one per <Name> tag
  TXmlTokV AuthorTokV;
  XmlDoc->GetTagTokV("AmazonItem|Authors|Name", AuthorTokV);
  const int AuthorToks=AuthorTokV.Len();
  for (int TokN=0; TokN<AuthorToks; TokN++){
    NewItem->AuthorNmV.Add(AuthorTokV[TokN]->GetTokStr(false));
  }

  // cross-sell item-ids, one per <ItemId> tag under <XSell>
  TXmlTokV XSellTokV;
  XmlDoc->GetTagTokV("AmazonItem|XSell|ItemId", XSellTokV);
  const int XSellToks=XSellTokV.Len();
  for (int TokN=0; TokN<XSellToks; TokN++){
    NewItem->NextItemIdV.Add(XSellTokV[TokN]->GetTokStr(false));
  }

  return NewItem;
}
예제 #2
0
void TDzsBsDoc::GetDocParts(
 const TStr& FNm, const PXmlDoc& XmlDoc,
 const TStr& FPath, const TStr& WebAlias,
 bool& Ok, TStr& IdStr, TStr& TitleStr, TStr& DataStr, int& YearN){
  Ok=false;
  if (!XmlDoc->IsOk()){return;}
  // id
  IdStr=FNm;
  IdStr.ChangeStr(FPath, WebAlias);
//  PXmlTok IdTok;
//  if (XmlDoc->IsTagTok("term|metadata|identifier", IdTok)){
//    IdStr=IdTok->GetTokStr(false);}
//  else {return;}
  // title
  PXmlTok TitleTok;
  if (XmlDoc->IsTagTok("term|metadata|title", TitleTok)){
    TitleStr=TitleTok->GetTokStr(false);}
  else {return;}
  // timedata
  TXmlTokV TimeDataTokV;
  XmlDoc->GetTagTokV("term|data|frame|timedata|fromyear", TimeDataTokV);
  TStr TimeDataStr=TXmlTok::GetTokVStr(TimeDataTokV, false);
  if (TimeDataStr.IsInt(YearN)){} else {YearN=0;}
  // locdata
  TXmlTokV LocDataTokV;
  XmlDoc->GetTagTokV("term|data|frame|locdata", LocDataTokV);
  TStr LocDataStr=TXmlTok::GetTokVStr(LocDataTokV, true);
  // pages
  TXmlTokV PageTokV; XmlDoc->GetTagTokV("term|data|frame|page", PageTokV);
  DataStr=GetDataTokVStr(PageTokV, "\n")+" "+LocDataStr;
  // character-set transformation
  TitleStr=THtmlLxChDef::GetCSZFromWin1250(TitleStr);
  DataStr=THtmlLxChDef::GetCSZFromWin1250(DataStr);
  // success
  Ok=true;
}
// Builds a translation corpus from the "tmx" files enumerated for InTmxFPath.
//
// Every <tu> element under tmx|body is expected to hold exactly two <tuv>
// children (asserted). The <seg> text of the child whose xml:lang equals
// OrgLang becomes the original sentence, the one matching RefTransLang the
// reference translation; a side with no matching language stays empty.
// Sentence ids are assigned consecutively from 0 across all files.
// Files whose XML fails to load are reported on stdout and skipped.
PTransCorpus TTransCorpus::LoadTMX(const TStr& InTmxFPath,
        const TStr& OrgLang, const TStr& RefTransLang) {

    PTransCorpus Corpus = TTransCorpus::New();
    int NextSentId = 0;
    TFFile TmxFileIter(InTmxFPath, "tmx", false);
    for (TStr TmxFNm; TmxFileIter.Next(TmxFNm); ) {
        printf("Loading %s ...\n", TmxFNm.CStr());
        // the XML parser is fed the file produced by CleanTmx (defined
        // elsewhere), written next to the source with an extra ".xml" suffix
        TStr CleanFNm = TmxFNm + ".xml";
        CleanTmx(TmxFNm, CleanFNm);
        PSIn CleanSIn = TFIn::New(CleanFNm);
        PXmlDoc TmxDoc = TXmlDoc::LoadTxt(CleanSIn);
        if (!TmxDoc->IsOk()) {
            // report the parse problem and move on to the next file
            printf(" error: %s\n", TmxDoc->GetMsgStr().CStr());
            continue;
        }
        // walk over the translation units
        TXmlTokV UnitTokV;
        TmxDoc->GetTagTokV("tmx|body|tu", UnitTokV);
        const int Units = UnitTokV.Len();
        for (int UnitN = 0; UnitN < Units; UnitN++) {
            // progress indicator, refreshed every 100 units
            if (UnitN % 100 == 0) { printf(" %d / %d\r", UnitN, Units); }
            TXmlTokV VariantTokV;
            UnitTokV[UnitN]->GetTagTokV("tuv", VariantTokV);
            IAssert(VariantTokV.Len() == 2);
            TStr OrgSent, RefTransSent;
            const int Variants = VariantTokV.Len();
            for (int VariantN = 0; VariantN < Variants; VariantN++) {
                PXmlTok VariantTok = VariantTokV[VariantN];
                TStr Lang = VariantTok->GetStrArgVal("xml:lang", "");
                TStr Sent = CleanRtf(VariantTok->GetTagTok("seg")->GetTokStr(false));
                if (Lang == OrgLang) {
                    OrgSent = Sent;
                } else if (Lang == RefTransLang) {
                    RefTransSent = Sent;
                }
            }
            Corpus->AddSentenceNoTrans(NextSentId, OrgSent, RefTransSent);
            NextSentId++;
        }
        printf(" %d / %d\n", Units, Units);
    }
    return Corpus;
}
예제 #4
0
// Loads a Cyc base from four XML dump files found in directory FPath:
//   lexicon-dump.xml, taxonomy-dump.xml, relevance-dump.xml, kb-dump.xml.
//
// Each file is consumed as a sequence of XML documents read one after another
// from the same input stream; reading a file stops at a document whose top
// token is the tag <end>. The sections are processed in this order:
//   lexicon   - <word string=.. cycl=..> adds "#$nameString" edges (both ways)
//   taxonomy  - <term cycl=..> with <isa value=..>/<genl value=..> children
//               adds "#$isa"/"#$genls" edges and their "~" back-links
//   relevance - <term cyc=..> sets human-relevance flags on known vertices
//   kbase     - <sentence cycl=..> with <arg> children; three-argument
//               sentences add a predicate edge and its "~" back-link
//
// A document that fails to parse prints the last successfully read record of
// that section and aborts via Fail. As a side effect, predicate/arity
// frequencies from the kbase section are written to the relative path
// "CycKB-CycLFq.Stat.Txt". Progress is reported on stdout.
//
// Returns the populated Cyc base.
PCycBs TCycBs::LoadCycXmlDump(const TStr& FPath){
  // file-names
  TStr NrFPath=TStr::GetNrFPath(FPath);
  TStr CycLexiconFNm=NrFPath+"lexicon-dump.xml";
  TStr CycTaxonomyFNm=NrFPath+"taxonomy-dump.xml";
  TStr CycRelevanceFNm=NrFPath+"relevance-dump.xml";
  TStr CycKBaseFNm=NrFPath+"kb-dump.xml";

  // create cyc-base
  PCycBs CycBs=TCycBs::New();

  // lexicon
  {printf("Processing Lexicon %s ...\n", CycLexiconFNm.CStr());
  PSIn CycLexiconSIn=TFIn::New(CycLexiconFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully read word/cycl pair, printed when the next document is broken
  TStr PrevCycWStr; TStr PrevCycLStr;
  forever{
    // statistics
    XmlDocs++; if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycLexiconSIn);
    if (!XmlDoc->IsOk()){
      printf("%s - %s\n", PrevCycWStr.CStr(), PrevCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("word"));
    TStr CycWStr=TopTok->GetArgVal("string");
    TStr CycLStr=TopTok->GetArgVal("cycl");
    // remember both fields for the failure diagnostic above
    // (the cycl string was previously never stored, so it always printed empty)
    PrevCycWStr=CycWStr; PrevCycLStr=CycLStr;
    // insert data
    CycBs->AddEdge(CycLStr, "#$nameString", CycWStr);
    CycBs->AddEdge(CycWStr, "~#$nameString", CycLStr);
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // taxonomy
  {printf("Processing Taxonomy %s ...\n", CycTaxonomyFNm.CStr());
  PSIn CycTaxonomySIn=TFIn::New(CycTaxonomyFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully read source term, printed when the next document is broken
  TStr PrevSrcCycLStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycTaxonomySIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevSrcCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr SrcCycLStr=TopTok->GetArgVal("cycl");
    PrevSrcCycLStr=SrcCycLStr;
    // every child must be <isa> or <genl>; anything else aborts
    for (int SubTokN=0; SubTokN<TopTok->GetSubToks(); SubTokN++){
      PXmlTok SubTok=TopTok->GetSubTok(SubTokN);
      TStr DstCycLStr=SubTok->GetTagNm();
      if (SubTok->IsTag("isa")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$isa", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$isa", SrcCycLStr);
      } else
      if (SubTok->IsTag("genl")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$genls", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$genls", SrcCycLStr);
      } else {
        Fail;
      }
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // relevance
  {printf("Processing Relevance %s ...\n", CycRelevanceFNm.CStr());
  PSIn CycRelevanceSIn=TFIn::New(CycRelevanceFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully read term, printed when the next document is broken
  TStr PrevCycStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycRelevanceSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr CycStr=TopTok->GetArgVal("cyc");
    PrevCycStr=CycStr;
    // terms that are not vertices of the base are silently ignored
    //IAssert(CycBs->IsVNm(CycStr));
    if (CycBs->IsVNm(CycStr)){
      if (TopTok->GetArgVal("thcl")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanRelevant, true);}
      if (TopTok->GetArgVal("irrel")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanIrrelevant, true);}
      if (TopTok->GetArgVal("clarifying")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanClarifying, true);}
      // "ok" is set when the term is either relevant or clarifying
      if ((TopTok->GetArgVal("thcl")=="T")||(TopTok->GetArgVal("clarifying")=="T")){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanOk, true);}
    } else {
      //printf("%s\n", CycStr.CStr());
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // knowledge-base
  {printf("Processing KBase %s ...\n", CycKBaseFNm.CStr());
  PSIn CycKBaseSIn=TFIn::New(CycKBaseFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully read sentence and its arguments, printed when the next document is broken
  TStr PrevCycLStr; TStrV PrevArgCycLStrV;
  // frequency of "<first-arg> - <number of remaining args>" keys
  TStrIntH HdCycLToFq;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    //if (XmlDocs>10000){break;}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycKBaseSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycLStr.CStr());
      for (int ArgN=0; ArgN<PrevArgCycLStrV.Len(); ArgN++){
        printf(" [%s]", PrevArgCycLStrV[ArgN].CStr());}
      printf("\n");
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("sentence"));
    TStr CycLStr=TopTok->GetArgVal("cycl");
    TXmlTokV ArgXmlTokV; XmlDoc->GetTagTokV("sentence|arg", ArgXmlTokV);
    // collect argument strings; an <arg> without a cycl attribute becomes "Empty"
    TStrV ArgCycLStrV;
    for (int ArgN=0; ArgN<ArgXmlTokV.Len(); ArgN++){
      PXmlTok Tok=ArgXmlTokV[ArgN];
      IAssert(Tok->IsTag("arg"));
      if (Tok->IsArg("cycl")){
        TStr ArgCycLStr=Tok->GetArgVal("cycl");
        ArgCycLStrV.Add(ArgCycLStr);
      } else {
        ArgCycLStrV.Add("Empty");
      }
    }
    PrevCycLStr=CycLStr;
    PrevArgCycLStrV=ArgCycLStrV;
    if (ArgCycLStrV.Len()>0){
      HdCycLToFq.AddDat(ArgCycLStrV[0]+" - "+TInt::GetStr(ArgCycLStrV.Len()-1))++;}
    // insert: only three-argument sentences (predicate, arg1, arg2) become edges;
    // #$isa, #$termOfUnit and #$genls predicates are skipped here
    if (ArgCycLStrV.Len()==3){
      TStr PredNm=ArgCycLStrV[0];
      if ((PredNm!="#$isa")&&(PredNm!="#$termOfUnit")&&(PredNm!="#$genls")){
        TStr BackLinkPredNm=TStr("~")+PredNm;
        TStr Arg1=ArgCycLStrV[1];
        TStr Arg2=ArgCycLStrV[2];
        CycBs->AddEdge(Arg1, PredNm, Arg2);
        CycBs->AddEdge(Arg2, BackLinkPredNm, Arg1);
      }
    }
  }
  // output top cycl relations, sorted by descending frequency
  // NOTE(review): the printed number is the stored value plus one - confirm
  // that this offset is intended.
  {TFOut CycLSOut("CycKB-CycLFq.Stat.Txt"); FILE* fCycL=CycLSOut.GetFileId();
  TIntStrPrV FqCycLStrPrV; HdCycLToFq.GetDatKeyPrV(FqCycLStrPrV);
  FqCycLStrPrV.Sort(false);
  for (int CycLN=0; CycLN<FqCycLStrPrV.Len(); CycLN++){
    fprintf(fCycL, "%6d. %s\n", 1+FqCycLStrPrV[CycLN].Val1, FqCycLStrPrV[CycLN].Val2.CStr());
  }}
  printf("%d Docs\nDone.\n", XmlDocs);}

  // return cyc-base
  return CycBs;
}