/// Loads an aligned sentence corpus from an XML file.
///
/// The reader first steps past the wrapper elements (named TEI, teiHeader,
/// text and body by the original comments) and then consumes the remaining
/// top-level elements one at a time. For each <div> element it collects the
/// tokens matching "linkGrp|link"; every link whose "type" attribute equals
/// "1:1" must contain exactly one <s1> and one <s2> token, and the strings
/// obtained from them are stored as one sentence pair.
///
/// @param InXmlFNm  Name of the XML file to read.
/// @param MxSents   Upper bound on the number of sentences to load;
///                  -1 disables the limit.
/// @return          Newly created corpus holding the loaded sentence pairs.
PTransCorpus TTransCorpus::LoadAC(const TStr& InXmlFNm, const int& MxSents) {
    // prepare result structure
    PTransCorpus TransCorpus = TTransCorpus::New();
    // -1 is the "no limit" sentinel
    const bool IsLimited = (MxSents != -1);
    // we load xml by skipping the first tags
    PSIn XmlSIn=TFIn::New(InXmlFNm);
    TXmlDoc::SkipTopTag(XmlSIn); // ignore TEI
    printf("Ignoring: %s\n", TXmlDoc::LoadTxt(XmlSIn)->GetTok()->GetTagNm().CStr()); // ignore teiHeader
    TXmlDoc::SkipTopTag(XmlSIn); // ignore text
    TXmlDoc::SkipTopTag(XmlSIn); // ignore body
    PXmlDoc XmlDoc; int SentId = 0;
    forever{
        // progress report (carriage return keeps it on one console line)
        printf("%7d Sentences \r", SentId);
        // load the next xml tree
        XmlDoc=TXmlDoc::LoadTxt(XmlSIn);
        // stop once no further well-formed element can be read
        if (!XmlDoc->IsOk()) { /*printf("Error: %s\n", XmlDoc->GetMsgStr().CStr());*/ break; }
        // extract sentence pairs from the xml tree
        PXmlTok TopTok=XmlDoc->GetTok();
        if (TopTok->IsTag("div")){
            // document id; read but not used any further in this function
            TStr DocNm = TopTok->GetArgVal("n");
            // alignment links
            TXmlTokV LinkTokV; TopTok->GetTagTokV("linkGrp|link", LinkTokV);
            for (int LinkTokN = 0; LinkTokN < LinkTokV.Len(); LinkTokN++) {
                // enforce the cap before every insertion, so a large <div>
                // cannot push the corpus past MxSents
                if (IsLimited && (TransCorpus->GetSentences() >= MxSents)) { break; }
                PXmlTok LinkTok = LinkTokV[LinkTokN];
                TStr LinkType = LinkTok->GetArgVal("type");
                // only one-to-one alignments are kept; all other link types are skipped
                if (LinkType == "1:1") {
                    TXmlTokV S1TokV; LinkTok->GetTagTokV("s1", S1TokV);
                    TXmlTokV S2TokV; LinkTok->GetTagTokV("s2", S2TokV);
                    IAssert(S1TokV.Len() == 1); IAssert(S2TokV.Len() == 1);
                    TStr ParaStr1 = S1TokV[0]->GetTagTokStr("");
                    TStr ParaStr2 = S2TokV[0]->GetTagTokStr("");
                    TransCorpus->AddSentenceNoTrans(SentId, ParaStr1, ParaStr2); SentId++;
                }
            }
        } else {
            printf("Unknow tag: %s\n", TopTok->GetTagNm().CStr());
        }
        // stop reading further elements as soon as the cap is reached
        if (IsLimited && (TransCorpus->GetSentences() >= MxSents)) { break; }
    }
    printf("\n");
    // finish
    return TransCorpus;
}
예제 #2
0
/// Builds a Cyc base from four XML dump files located in one directory.
///
/// The files are read in this order, each as a stream of top-level XML
/// elements terminated by an <end> element:
///   - lexicon-dump.xml   : <word string=... cycl=...> elements, stored as
///                          "#$nameString" edges plus "~#$nameString" back-links;
///   - taxonomy-dump.xml  : <term cycl=...> elements whose <isa>/<genl>
///                          children become "#$isa"/"#$genls" edges plus
///                          "~"-prefixed back-links;
///   - relevance-dump.xml : <term cyc=...> elements whose "thcl", "irrel" and
///                          "clarifying" attributes set flags on vertices
///                          that already exist in the base;
///   - kb-dump.xml        : <sentence> elements whose <arg> children, when
///                          there are exactly three, become an edge plus a
///                          "~"-prefixed back-link.
///
/// As a side effect, per-predicate statistics are written to the file
/// "CycKB-CycLFq.Stat.Txt". If an element cannot be parsed, the last
/// successfully read record is printed and Fail is invoked.
///
/// @param FPath  Directory containing the dump files.
/// @return       The populated Cyc base.
PCycBs TCycBs::LoadCycXmlDump(const TStr& FPath){
  // file-names
  TStr NrFPath=TStr::GetNrFPath(FPath);
  TStr CycLexiconFNm=NrFPath+"lexicon-dump.xml";
  TStr CycTaxonomyFNm=NrFPath+"taxonomy-dump.xml";
  TStr CycRelevanceFNm=NrFPath+"relevance-dump.xml";
  TStr CycKBaseFNm=NrFPath+"kb-dump.xml";

  // create cyc-base
  PCycBs CycBs=TCycBs::New();

  // lexicon
  {printf("Processing Lexicon %s ...\n", CycLexiconFNm.CStr());
  PSIn CycLexiconSIn=TFIn::New(CycLexiconFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully parsed record, reported when parsing fails
  TStr PrevCycWStr; TStr PrevCycLStr;
  forever{
    // statistics
    XmlDocs++; if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycLexiconSIn);
    if (!XmlDoc->IsOk()){
      printf("%s - %s\n", PrevCycWStr.CStr(), PrevCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("word"));
    TStr CycWStr=TopTok->GetArgVal("string");
    TStr CycLStr=TopTok->GetArgVal("cycl");
    // remember both fields; previously PrevCycLStr was never assigned, so
    // the failure diagnostic above always printed an empty CycL string
    PrevCycWStr=CycWStr; PrevCycLStr=CycLStr;
    // insert data (forward edge and back-link)
    CycBs->AddEdge(CycLStr, "#$nameString", CycWStr);
    CycBs->AddEdge(CycWStr, "~#$nameString", CycLStr);
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // taxonomy
  {printf("Processing Taxonomy %s ...\n", CycTaxonomyFNm.CStr());
  PSIn CycTaxonomySIn=TFIn::New(CycTaxonomyFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully parsed term, reported when parsing fails
  TStr PrevSrcCycLStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycTaxonomySIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevSrcCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr SrcCycLStr=TopTok->GetArgVal("cycl");
    PrevSrcCycLStr=SrcCycLStr;
    // every child must be either <isa> or <genl>; anything else is fatal
    for (int SubTokN=0; SubTokN<TopTok->GetSubToks(); SubTokN++){
      PXmlTok SubTok=TopTok->GetSubTok(SubTokN);
      TStr DstCycLStr=SubTok->GetTagNm();
      if (SubTok->IsTag("isa")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$isa", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$isa", SrcCycLStr);
      } else
      if (SubTok->IsTag("genl")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$genls", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$genls", SrcCycLStr);
      } else {
        Fail;
      }
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // relevance
  {printf("Processing Relevance %s ...\n", CycRelevanceFNm.CStr());
  PSIn CycRelevanceSIn=TFIn::New(CycRelevanceFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully parsed term, reported when parsing fails
  TStr PrevCycStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycRelevanceSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr CycStr=TopTok->GetArgVal("cyc");
    PrevCycStr=CycStr;
    //IAssert(CycBs->IsVNm(CycStr));
    // terms that are not already vertices of the base are silently skipped
    if (CycBs->IsVNm(CycStr)){
      // read each attribute once; "T" marks the attribute as set
      const bool IsThcl=(TopTok->GetArgVal("thcl")=="T");
      const bool IsIrrel=(TopTok->GetArgVal("irrel")=="T");
      const bool IsClarifying=(TopTok->GetArgVal("clarifying")=="T");
      if (IsThcl){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanRelevant, true);}
      if (IsIrrel){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanIrrelevant, true);}
      if (IsClarifying){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanClarifying, true);}
      if (IsThcl||IsClarifying){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanOk, true);}
    } else {
      //printf("%s\n", CycStr.CStr());
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // knowledge-base
  {printf("Processing KBase %s ...\n", CycKBaseFNm.CStr());
  PSIn CycKBaseSIn=TFIn::New(CycKBaseFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  // last successfully parsed sentence, reported when parsing fails
  TStr PrevCycLStr; TStrV PrevArgCycLStrV;
  // counter keyed by "<first arg> - <number of remaining args>"
  TStrIntH HdCycLToFq;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    //if (XmlDocs>10000){break;}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycKBaseSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycLStr.CStr());
      for (int ArgN=0; ArgN<PrevArgCycLStrV.Len(); ArgN++){
        printf(" [%s]", PrevArgCycLStrV[ArgN].CStr());}
      printf("\n");
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("sentence"));
    TStr CycLStr=TopTok->GetArgVal("cycl");
    TXmlTokV ArgXmlTokV; XmlDoc->GetTagTokV("sentence|arg", ArgXmlTokV);
    // collect the "cycl" value of every <arg>; args without one become "Empty"
    TStrV ArgCycLStrV;
    for (int ArgN=0; ArgN<ArgXmlTokV.Len(); ArgN++){
      PXmlTok Tok=ArgXmlTokV[ArgN];
      IAssert(Tok->IsTag("arg"));
      if (Tok->IsArg("cycl")){
        TStr ArgCycLStr=Tok->GetArgVal("cycl");
        ArgCycLStrV.Add(ArgCycLStr);
      } else {
        ArgCycLStrV.Add("Empty");
      }
    }
    PrevCycLStr=CycLStr;
    PrevArgCycLStrV=ArgCycLStrV;
    // bump the counter stored under "<first arg> - <number of remaining args>"
    if (ArgCycLStrV.Len()>0){
      HdCycLToFq.AddDat(ArgCycLStrV[0]+" - "+TInt::GetStr(ArgCycLStrV.Len()-1))++;}
    // insert: only three-argument sentences (predicate + two arguments) are
    // stored; #$isa, #$termOfUnit and #$genls are excluded here
    if (ArgCycLStrV.Len()==3){
      TStr PredNm=ArgCycLStrV[0];
      if ((PredNm!="#$isa")&&(PredNm!="#$termOfUnit")&&(PredNm!="#$genls")){
        TStr BackLinkPredNm=TStr("~")+PredNm;
        TStr Arg1=ArgCycLStrV[1];
        TStr Arg2=ArgCycLStrV[2];
        CycBs->AddEdge(Arg1, PredNm, Arg2);
        CycBs->AddEdge(Arg2, BackLinkPredNm, Arg1);
      }
    }
  }
  // output top cycl relations
  {TFOut CycLSOut("CycKB-CycLFq.Stat.Txt"); FILE* fCycL=CycLSOut.GetFileId();
  TIntStrPrV FqCycLStrPrV; HdCycLToFq.GetDatKeyPrV(FqCycLStrPrV);
  // presumably sorts by descending count (Asc=false) -- TODO confirm
  FqCycLStrPrV.Sort(false);
  for (int CycLN=0; CycLN<FqCycLStrPrV.Len(); CycLN++){
    // NOTE(review): the printed number is the stored counter plus one; it is
    // unclear whether the raw count or the rank (1+CycLN) was intended -- confirm
    fprintf(fCycL, "%6d. %s\n", 1+FqCycLStrPrV[CycLN].Val1, FqCycLStrPrV[CycLN].Val2.CStr());
  }}
  printf("%d Docs\nDone.\n", XmlDocs);}

  // return cyc-base
  return CycBs;
}