Ejemplo n.º 1
0
// Builds a TAmazonItem from a parsed XML document.
// Reads the item id and the title from "AmazonItem|ItemId" and
// "AmazonItem|Title", then copies the token strings of every
// "AmazonItem|Authors|Name" into AuthorNmV and of every
// "AmazonItem|XSell|ItemId" into NextItemIdV.
// The tokens returned for the id and title paths are dereferenced without a
// check, so both tags are expected to be present in XmlDoc.
PAmazonItem TAmazonItem::New(const PXmlDoc& XmlDoc){
  PAmazonItem Item=PAmazonItem(new TAmazonItem());
  // scalar fields: item-id and title
  PXmlTok ItemIdTok=XmlDoc->GetTagTok("AmazonItem|ItemId");
  Item->ItemId=ItemIdTok->GetTokStr(false);
  PXmlTok TitleTok=XmlDoc->GetTagTok("AmazonItem|Title");
  Item->TitleStr=TitleTok->GetTokStr(false);
  // author names
  TXmlTokV AuthorTokV;
  XmlDoc->GetTagTokV("AmazonItem|Authors|Name", AuthorTokV);
  const int Authors=AuthorTokV.Len();
  for (int AuthorN=0; AuthorN<Authors; AuthorN++){
    Item->AuthorNmV.Add(AuthorTokV[AuthorN]->GetTokStr(false));
  }
  // cross-sell item-ids
  TXmlTokV XSellTokV;
  XmlDoc->GetTagTokV("AmazonItem|XSell|ItemId", XSellTokV);
  const int XSells=XSellTokV.Len();
  for (int XSellN=0; XSellN<XSells; XSellN++){
    Item->NextItemIdV.Add(XSellTokV[XSellN]->GetTokStr(false));
  }
  return Item;
}
Ejemplo n.º 2
0
/////////////////////////////////////////////////
// SkyGrid-Document
void TSkyGridBinDoc::SaveBinDocV(
 const TStr& InXmlFPath, const TStr& OutBinFNm, const int& MxDocs){
  printf("Processing SkyGrid-News-Xml files from '%s'...\n", InXmlFPath.CStr());
  TFOut SOut(OutBinFNm);
  TFFile FFile(InXmlFPath, true); TStr FNm;
  int Docs=0; int DateDocs=0; uint64 PrevTm=0;
  while (FFile.Next(FNm)){
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    //printf("  Processing '%s' ...", FNm.CStr());
    PXmlDoc XmlDoc=TXmlDoc::LoadTxt(FNm);
    PXmlTok ContentTok=XmlDoc->GetTagTok("item|content");
    TStr SwIdStr=ContentTok->GetTagTok("swid")->GetArgVal("value");
    TStr UrlStr=ContentTok->GetTagTok("url")->GetTokStr(false);
    TStr TitleStr=ContentTok->GetTagTok("title")->GetTokStr(false);
    TStr FetchedValStr=ContentTok->GetTagTok("fetched")->GetArgVal("value");
    TXmlTokV EntityTokV; ContentTok->GetTagTokV("annotations|entity", EntityTokV);
    TStr BodyStr=ContentTok->GetTagTok("body")->GetTokStr(false);
    // extract date
    TStr DateStr=SwIdStr.GetSubStr(0, 7);
    TStr YearStr=DateStr.GetSubStr(0, 3);
    TStr MonthStr=DateStr.GetSubStr(4, 5);
    TStr DayStr=DateStr.GetSubStr(6, 7);
    TTm DateTm(YearStr.GetInt(), MonthStr.GetInt(), DayStr.GetInt());
    uint64 Tm=TTm::GetMSecsFromTm(DateTm);
    // extract entities
    TStrIntH EntNmToFqH;
    for (int EntityTokN=0; EntityTokN<EntityTokV.Len(); EntityTokN++){
      PXmlTok EntityTok=EntityTokV[EntityTokN];
      if (!EntityTok->IsTag("entity")){continue;}
      TStr CanonicalNm=EntityTok->GetArgVal("canonical", "");
      TStr TextStr=EntityTok->GetArgVal("text", "");
      TStr TypeNm=EntityTok->GetArgVal("type", "");
      TStr EntNm=CanonicalNm.Empty() ? TextStr : CanonicalNm;
      EntNmToFqH.AddDat(EntNm)++;
    }
    TIntStrPrV FqEntNmPrV; EntNmToFqH.GetDatKeyPrV(FqEntNmPrV); FqEntNmPrV.Sort(false);
    // extract headline
    TChA HeadlineChA=BodyStr.GetSubStr(0, 250);
    while ((HeadlineChA.Len()>0)&&(HeadlineChA.LastCh()!=' ')){
      HeadlineChA.Trunc(HeadlineChA.Len()-1);}
    HeadlineChA+="...";
    // create document
    TSkyGridBinDoc Doc(SwIdStr, Tm, TitleStr, HeadlineChA, FqEntNmPrV);
    // save document
    Doc.Save(SOut);
    // screen log
    if (PrevTm!=Tm){
      if (PrevTm!=0){printf("\n");}
      PrevTm=Tm; DateDocs=0;
    }
    Docs++; DateDocs++;
    printf("  %s [Day:%d / All:%d]\r", DateStr.CStr(), DateDocs, Docs);
  }
  printf("\nDone.\n");
}
Ejemplo n.º 3
0
// Loads Reuters-21578 ".SGM" files from FPath into a bag-of-words document base.
//   FPath         - directory scanned for ".SGM" files
//   MxDocs        - maximal number of documents to load; -1 means no limit
//   SwSetTypeNm   - stop-word set type passed to TSwSet::GetSwSet
//   StemmerTypeNm - stemmer type passed to TStemmer::GetStemmer
//   MxNGramLen, MnNGramFq - n-gram settings; an n-gram base is built unless
//                   both are 1
//   SaveDocP      - forwarded to TBowDocBs::AddHtmlDoc
//   Notify        - not used by this function
// Each document is named by its NEWID attribute, categorised by the
// "REUTERS|TOPICS|D" tags and filled from "REUTERS|TEXT". Documents whose
// TOPICS attribute is "YES" are registered as train or test documents
// according to their LEWISSPLIT attribute.
// Returns the populated document base after calling AssertOk() on it.
PBowDocBs TBowFl::LoadReuters21578Txt(
 const TStr& FPath, const int& MxDocs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq, const bool& SaveDocP,
 const PNotify& Notify){
  // prepare stop-words
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  // prepare stemmer
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    NGramBs=TNGramBs::GetNGramBsFromReuters21578(
     FPath, MxDocs,
     MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  // traverse directory
  TFFile FFile(FPath, ".SGM", false); TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      // test the limit before counting the document, so that exactly MxDocs
      // documents are loaded (counting first dropped the last one)
      if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
      Docs++; if (Docs%100==0){printf("%d\r", Docs);}
      // get document-name
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      TStr DocNm=DocTok->GetArgVal("NEWID");
      // get document-categories
      TStrV CatNmV;
      TXmlTokV TopicsTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopicsTokV);
      for (int TopicsTokN=0; TopicsTokN<TopicsTokV.Len(); TopicsTokN++){
        TStr CatNm=TopicsTokV[TopicsTokN]->GetTokStr(false);
        CatNmV.Add(CatNm);
      }
      // get document-contents
      PXmlTok DocStrTok=Doc->GetTagTok("REUTERS|TEXT");
      TStr DocStr=DocStrTok->GetTokStr(false);
      // add document to bow
      int DId=BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP);
      // train & test data: TOPICS is consulted only for TRAIN/TEST documents
      const TStr SplitNm=DocTok->GetArgVal("LEWISSPLIT");
      const bool TrainP=(SplitNm=="TRAIN");
      const bool TestP=(SplitNm=="TEST");
      if ((TrainP||TestP)&&(DocTok->GetArgVal("TOPICS")=="YES")){
        if (TrainP){BowDocBs->AddTrainDId(DId);}
        else {BowDocBs->AddTestDId(DId);}
      }
    }
    // do not open further files once the limit has been reached
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
  }
  // return results
  BowDocBs->AssertOk();
  return BowDocBs;
}
Ejemplo n.º 4
0
void TCpDoc::SaveReuters21578ToCpd(
 const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // traverse directory with .sgm files
  TFFile FFile(InFPath, ".SGM", false); TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      Docs++; if (Docs%100==0){printf("%d\r", Docs);}
      if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
      // create reuters document
      PCpDoc CpDoc=TCpDoc::New();
      // load xml document
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      // document id
      CpDoc->DocNm=DocTok->GetArgVal("NEWID");
      // date
      CpDoc->DateStr=Doc->GetTagTok("REUTERS|DATE")->GetTokStr(false);
      // document title
      PXmlTok TitleTok=Doc->GetTagTok("REUTERS|TEXT|TITLE");
      if (!TitleTok.Empty()){
        CpDoc->TitleStr=TitleTok->GetTokStr(false);}
      // dateline
      PXmlTok DatelineTok=Doc->GetTagTok("REUTERS|TEXT|DATELINE");
      if (!DatelineTok.Empty()){
        CpDoc->DatelineStr=DatelineTok->GetTokStr(false);}
      // get text string
      TStr TextStr;
      PXmlTok BodyTok=Doc->GetTagTok("REUTERS|TEXT|BODY");
      if (!BodyTok.Empty()){
        TextStr=BodyTok->GetTokStr(false);
      } else {
        // if <BODY> doesn't exist, take the whole <TEXT>
        PXmlTok TextTok=Doc->GetTagTok("REUTERS|TEXT");
        if (!TextTok.Empty()){
          TextStr=TextTok->GetTokStr(false);
        }
      }
      CpDoc->ParStrV.Add(TextStr, 1);
      // topic categories
      TXmlTokV TopCatTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopCatTokV);
      for (int TokN=0; TokN<TopCatTokV.Len(); TokN++){
        TStr CatNm=TopCatTokV[TokN]->GetTokStr(false);
        CpDoc->TopCdNmV.Add(CatNm);
      }
      // save cpd document
      CpDoc->Save(*SOut);
    }
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
  }
}
// Collects the distinct integer "id" attributes of all tokens found under
// TagPath in QueryXml. Tokens whose "id" resolves to -1 (the default passed
// to GetIntArgVal) are skipped. Ids are returned in order of first occurrence.
TIntV GetEmailIds(const PXmlTok& QueryXml, const TStr& TagPath)
{
	TXmlTokV IdTokV;
	QueryXml->GetTagTokV(TagPath, IdTokV);
	TIntV EmailIdV;
	const int IdToks = IdTokV.Len();
	for (int IdTokN = 0; IdTokN < IdToks; IdTokN++)
	{
		const TInt EmailId = IdTokV[IdTokN]->GetIntArgVal("id", -1);
		if (EmailId == -1) { continue; }
		EmailIdV.AddUnique(EmailId);
	}
	return EmailIdV;
}
void GetKeywords(const PXmlTok& QueryXml, const TStr& TagPath, TStrV& KeywordsV, TStrV& IgnoreKeywordsV)
{
	TXmlTokV KwsXmlV;
	QueryXml->GetTagTokV(TagPath, KwsXmlV);
	for (int KwInd = 0; KwInd < KwsXmlV.Len(); KwInd++) 
	{
		TStr Kw = KwsXmlV[KwInd]->GetTokStr(false);
		int hide = KwsXmlV[KwInd]->GetIntArgVal("hide", 0);
		if (hide)
			IgnoreKeywordsV.Add(Kw);
		else
			KeywordsV.Add(Kw);
	}
}
Ejemplo n.º 7
0
// Joins the GetDataTokStr() strings of all tokens in TokV, placing SepStr
// between consecutive entries. Returns an empty string for an empty vector.
TStr TDzsBsDoc::GetDataTokVStr(const TXmlTokV& TokV, const TStr& SepStr){
  TChA JoinedChA;
  const int Toks=TokV.Len();
  // first entry goes in without a leading separator
  if (Toks>0){JoinedChA+=GetDataTokStr(TokV[0]);}
  for (int TokN=1; TokN<Toks; TokN++){
    JoinedChA+=SepStr;
    JoinedChA+=GetDataTokStr(TokV[TokN]);
  }
  return JoinedChA;
}
Ejemplo n.º 8
0
// Loads a sentence-aligned parallel corpus from the XML file InXmlFNm.
//   InXmlFNm - input XML file
//   MxSents  - maximal number of sentences to load; -1 means no limit
// The leading wrapper tags are skipped, then top-level elements are read one
// at a time. For every <div>, each "linkGrp|link" of type "1:1" contributes
// one sentence pair taken from its <s1> and <s2> sub-tags; links of any other
// type are ignored. Loading stops at the first element that fails to parse
// or as soon as MxSents sentences have been collected.
// Returns the corpus holding the loaded sentence pairs.
PTransCorpus TTransCorpus::LoadAC(const TStr& InXmlFNm, const int& MxSents) {
    // prepare prset structures
    PTransCorpus TransCorpus = TTransCorpus::New();
    // we load xml by skipping first tags
    PSIn XmlSIn=TFIn::New(InXmlFNm);
    TXmlDoc::SkipTopTag(XmlSIn); // ignore TEI
    // the next element is parsed (and thereby consumed) only to report its tag name
    printf("Ignoring: %s\n", TXmlDoc::LoadTxt(XmlSIn)->GetTok()->GetTagNm().CStr()); // ignore teiHeader
    TXmlDoc::SkipTopTag(XmlSIn); // ignore text
    TXmlDoc::SkipTopTag(XmlSIn); // ignore body
    PXmlDoc XmlDoc; int SentId = 0;
    // the cap is re-evaluated after every added sentence so it is never exceeded
    const bool LimitP = (MxSents != -1);
    bool LimitReachedP = LimitP && (TransCorpus->GetSentences() >= MxSents);
    while (!LimitReachedP) {
        // load xml tree
        printf("%7d Sentences \r", SentId);
        XmlDoc=TXmlDoc::LoadTxt(XmlSIn);
        // stop if at the last tag
        if (!XmlDoc->IsOk()) { break; }
        // extract documents from xml-trees
        PXmlTok TopTok=XmlDoc->GetTok();
        if (TopTok->IsTag("div")){
            // walk over the alignment links of this div
            TXmlTokV LinkTokV; TopTok->GetTagTokV("linkGrp|link", LinkTokV);
            for (int LinkTokN = 0; (LinkTokN < LinkTokV.Len()) && !LimitReachedP; LinkTokN++) {
                PXmlTok LinkTok = LinkTokV[LinkTokN];
                TStr LinkType = LinkTok->GetArgVal("type");
                // only one-to-one alignments are used
                if (LinkType == "1:1") {
                    TXmlTokV S1TokV; LinkTok->GetTagTokV("s1", S1TokV);
                    TXmlTokV S2TokV; LinkTok->GetTagTokV("s2", S2TokV);
                    IAssert(S1TokV.Len() == 1); IAssert(S2TokV.Len() == 1);
                    TStr ParaStr1 = S1TokV[0]->GetTagTokStr("");
                    TStr ParaStr2 = S2TokV[0]->GetTagTokStr("");
                    TransCorpus->AddSentenceNoTrans(SentId, ParaStr1, ParaStr2); SentId++;
                    LimitReachedP = LimitP && (TransCorpus->GetSentences() >= MxSents);
                }
            }
        } else {
            printf("Unknown tag: %s\n", TopTok->GetTagNm().CStr());
        }
    }
    printf("\n");
    // finish
    return TransCorpus;
}
Ejemplo n.º 9
0
void TCpDoc::LoadReuters2000DocFromXml(const TStr& FNm,
 TStr& DocId, TStr& DateStr, TStr& TitleStr,
 TStr& HeadlineStr, TStr& BylineStr, TStr& DatelineStr,
 TStrV& ParStrV,
 TStrV& TopCdNmV, TStrV& GeoCdNmV, TStrV& IndCdNmV){
  PXmlDoc Doc=TXmlDoc::LoadTxt(FNm);
  // get text strings
  // general document data
  DocId=Doc->GetTagTok("newsitem")->GetArgVal("itemid");
  DateStr=Doc->GetTagTok("newsitem")->GetArgVal("date");
  TitleStr=Doc->GetTagTok("newsitem|title")->GetTokStr(false);
  HeadlineStr=Doc->GetTagTok("newsitem|headline")->GetTokStr(false);
  BylineStr=""; PXmlTok BylineTok;
  if (Doc->IsTagTok("newsitem|byline", BylineTok)){
    BylineStr=BylineTok->GetTokStr(false);}
  DatelineStr=""; PXmlTok DatelineTok;
  if (Doc->IsTagTok("newsitem|dateline", DatelineTok)){
    DatelineStr=DatelineTok->GetTokStr(false);}
  // text paragraphs
  ParStrV.Clr(); TXmlTokV ParTokV; Doc->GetTagTokV("newsitem|text|p", ParTokV);
  for (int ParTokN=0; ParTokN<ParTokV.Len(); ParTokN++){
    TStr ParStr=ParTokV[ParTokN]->GetTokStr(false);
    ParStrV.Add(ParStr);
  }
  // codes
  TopCdNmV.Clr(); GeoCdNmV.Clr(); IndCdNmV.Clr();
  TXmlTokV CdsTokV; Doc->GetTagTokV("newsitem|metadata|codes", CdsTokV);
  for (int CdsTokN=0; CdsTokN<CdsTokV.Len(); CdsTokN++){
    PXmlTok CdsTok=CdsTokV[CdsTokN];
    TXmlTokV CdTokV; CdsTok->GetTagTokV("code", CdTokV);
    if (CdsTok->GetArgVal("class")=="bip:topics:1.0"){
      for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
        TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
        TopCdNmV.Add(CdNm);
      }
    } else
    if (CdsTok->GetArgVal("class")=="bip:countries:1.0"){
      for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
        TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
        GeoCdNmV.Add(CdNm);
      }
    } else
    if (CdsTok->GetArgVal("class")=="bip:industries:1.0"){
      for (int CdTokN=0; CdTokN<CdTokV.Len(); CdTokN++){
        TStr CdNm=CdTokV[CdTokN]->GetArgVal("code");
        IndCdNmV.Add(CdNm);
      }
    } else {
      Fail;
    }
  }
}
Ejemplo n.º 10
0
Archivo: exp.cpp Proyecto: Accio/snap
/////////////////////////////////////////////////
// Expression-Help
// Loads expression help from an XML stream.
// Every "ExpHelp|Obj" element becomes one TExpHelpObj built from its <Type>,
// <Cat>, <Head> (Name/Type/Desc) and the list of "Args|Arg" elements
// (Name/Type/Desc/Default). The tokens for these sub-tags are dereferenced
// without a check, so each of them is expected to be present.
// Returns the help object holding all loaded entries.
PExpHelp TExpHelp::LoadXml(const PSIn& SIn){
  PExpHelp Help=TExpHelp::New();
  PXmlDoc XmlDoc=TXmlDoc::LoadTxt(SIn);
  // iterate over the described objects
  TXmlTokV ObjTokV; XmlDoc->GetTagTokV("ExpHelp|Obj", ObjTokV);
  const int Objs=ObjTokV.Len();
  for (int ObjN=0; ObjN<Objs; ObjN++){
    PXmlTok ObjTok=ObjTokV[ObjN];
    // type and category
    TStr TypeStr=ObjTok->GetTagTok("Type")->GetTokStr(false);
    TStr CatNm=ObjTok->GetTagTok("Cat")->GetTokStr(false);
    // header item; it carries an empty default value
    TStr HdNm=ObjTok->GetTagTok("Head|Name")->GetTokStr(false);
    TStr HdTypeStr=ObjTok->GetTagTok("Head|Type")->GetTokStr(false);
    TStr HdDescStr=ObjTok->GetTagTok("Head|Desc")->GetTokStr(false);
    PExpHelpItem HdItem=TExpHelpItem::New(HdNm, HdTypeStr, HdDescStr, "");
    // argument items
    TExpHelpItemV ArgItemV;
    TXmlTokV ArgTokV; ObjTok->GetTagTokV("Args|Arg", ArgTokV);
    const int Args=ArgTokV.Len();
    for (int ArgN=0; ArgN<Args; ArgN++){
      PXmlTok ArgTok=ArgTokV[ArgN];
      TStr ArgNm=ArgTok->GetTagTok("Name")->GetTokStr(false);
      TStr ArgTypeStr=ArgTok->GetTagTok("Type")->GetTokStr(false);
      TStr ArgDescStr=ArgTok->GetTagTok("Desc")->GetTokStr(false);
      TStr ArgDfValStr=ArgTok->GetTagTok("Default")->GetTokStr(false);
      ArgItemV.Add(TExpHelpItem::New(ArgNm, ArgTypeStr, ArgDescStr, ArgDfValStr));
    }
    // assemble the object and register it
    TExpHelpObjType ObjType=TExpHelpObj::GetObjTypeFromStr(TypeStr);
    Help->AddObj(TExpHelpObj::New(ObjType, CatNm, HdItem, ArgItemV));
  }
  return Help;
}
Ejemplo n.º 11
0
// Loads a parallel corpus from all "tmx" files found in InTmxFPath.
//   InTmxFPath   - directory scanned for tmx files
//   OrgLang      - "xml:lang" value identifying the original sentence
//   RefTransLang - "xml:lang" value identifying the reference translation
// Every file is first passed through CleanTmx into "<name>.xml", which is
// then parsed; a file that fails to parse is reported and skipped. Each
// "tmx|body|tu" unit must hold exactly two <tuv> variants; the <seg> text of
// each variant is run through CleanRtf and assigned by language. A sentence
// whose language matches neither argument is ignored, leaving the
// corresponding side empty.
PTransCorpus TTransCorpus::LoadTMX(const TStr& InTmxFPath,
        const TStr& OrgLang, const TStr& RefTransLang) {

    PTransCorpus Corpus = TTransCorpus::New();
    // sentence ids run continuously across all files
    int SentId = 0;
    TFFile TmxFFile(InTmxFPath, "tmx", false); TStr TmxFNm;
    while (TmxFFile.Next(TmxFNm)) {
        printf("Loading %s ...\n", TmxFNm.CStr());
        // write a cleaned copy next to the original and parse that one
        TStr CleanTmxFNm = TmxFNm + ".xml";
        CleanTmx(TmxFNm, CleanTmxFNm);
        PSIn CleanSIn = TFIn::New(CleanTmxFNm);
        PXmlDoc TmxDoc = TXmlDoc::LoadTxt(CleanSIn);
        // report unparsable files and move on to the next one
        if (!TmxDoc->IsOk()) {
            printf(" error: %s\n", TmxDoc->GetMsgStr().CStr());
            continue;
        }
        // one translation unit yields one sentence pair
        TXmlTokV TuTokV; TmxDoc->GetTagTokV("tmx|body|tu", TuTokV);
        const int TuToks = TuTokV.Len();
        for (int TuTokN = 0; TuTokN < TuToks; TuTokN++) {
            if (TuTokN % 100 == 0) { printf(" %d / %d\r", TuTokN, TuToks); }
            TXmlTokV TuvTokV; TuTokV[TuTokN]->GetTagTokV("tuv", TuvTokV);
            IAssert(TuvTokV.Len() == 2);
            TStr OrgSent, RefTransSent;
            const int TuvToks = TuvTokV.Len();
            for (int TuvTokN = 0; TuvTokN < TuvToks; TuvTokN++) {
                PXmlTok TuvTok = TuvTokV[TuvTokN];
                TStr Lang = TuvTok->GetStrArgVal("xml:lang", "");
                TStr Sent = CleanRtf(TuvTok->GetTagTok("seg")->GetTokStr(false));
                if (Lang == OrgLang) {
                    OrgSent = Sent;
                } else if (Lang == RefTransLang) {
                    RefTransSent = Sent;
                }
            }
            Corpus->AddSentenceNoTrans(SentId, OrgSent, RefTransSent);
            SentId++;
        }
        printf(" %d / %d\n", TuToks, TuToks);
    }
    return Corpus;
}
Ejemplo n.º 12
0
// Builds a light-weight ontology from the ASFA thesaurus XML dump
// "asfa_xml_20060522.xml" located in directory FPath.
// The ontology gets one language ("EN"), two term-types (Descriptor,
// Non-descriptor) and five link-types (BT, NT, RT, UF, USE). Every
// "THESAURUS|CONCEPT" element yields one term, named by its NON-DESCRIPTOR
// or DESCRIPTOR sub-tag, typed by its TYP sub-tag and optionally described by
// its SN sub-tag. In a second pass, every sub-tag of a concept whose tag name
// is a registered link-type yields a link from the concept's term to the term
// named by that sub-tag's text.
// Progress is printed to stdout. Returns the populated ontology.
PLwOnto TLwOnto::LoadAsfaVoc(const TStr& FPath){
  // normalize path
  TStr NrFPath=TStr::GetNrFPath(FPath);
  // create ontology
  PLwOnto LwOnto=TLwOnto::New();

  // create language object
  int EnLangId=LwOnto->GetLangBs()->AddLang("EN", "English");

  // create term-types
  {PLwTermType D_TermType=TLwTermType::New(0, "Descriptor", EnLangId);
  PLwTermType ND_TermType=TLwTermType::New(1, "Non-descriptor", EnLangId);
  LwOnto->GetTermTypeBs()->AddTermType(D_TermType);
  LwOnto->GetTermTypeBs()->AddTermType(ND_TermType);}

  // create link-types
  {PLwLinkType BT_LinkType=TLwLinkType::New(0, "BT", EnLangId, "Broader-Term");
  PLwLinkType NT_LinkType=TLwLinkType::New(1, "NT", EnLangId, "Narrower-Term");
  PLwLinkType RT_LinkType=TLwLinkType::New(2, "RT", EnLangId, "Related-Term");
  PLwLinkType UF_LinkType=TLwLinkType::New(3, "UF", EnLangId, "Used-For");
  PLwLinkType USE_LinkType=TLwLinkType::New(4, "USE", EnLangId, "Used-By");
  LwOnto->GetLinkTypeBs()->AddLinkType(BT_LinkType);
  LwOnto->GetLinkTypeBs()->AddLinkType(NT_LinkType);
  LwOnto->GetLinkTypeBs()->AddLinkType(RT_LinkType);
  LwOnto->GetLinkTypeBs()->AddLinkType(UF_LinkType);
  LwOnto->GetLinkTypeBs()->AddLinkType(USE_LinkType);}

  // load ontology file
  TStr AsfaOntoFNm=NrFPath+"asfa_xml_20060522.xml";
  printf("Loading '%s' ...", AsfaOntoFNm.CStr());
  PXmlDoc AsfaXmlDoc=TXmlDoc::LoadTxt(AsfaOntoFNm);
  IAssert(AsfaXmlDoc->IsOk());
  TXmlTokV ConceptXmlTokV;
  AsfaXmlDoc->GetTagTokV("THESAURUS|CONCEPT", ConceptXmlTokV);
  printf(" Done.\n");

  // create terms
  {printf("Creating terms ...");
  for (int ConceptN=0; ConceptN<ConceptXmlTokV.Len(); ConceptN++){
    PXmlTok ConceptXmlTok=ConceptXmlTokV[ConceptN];
    // term-name: NON-DESCRIPTOR takes precedence over DESCRIPTOR
    TStr TermNm;
    if (ConceptXmlTok->IsSubTag("NON-DESCRIPTOR")){
      TermNm=ConceptXmlTok->GetTagTokStr("NON-DESCRIPTOR");}
    else if (ConceptXmlTok->IsSubTag("DESCRIPTOR")){
      TermNm=ConceptXmlTok->GetTagTokStr("DESCRIPTOR");}
    // term-type
    TStr TermTypeNm=ConceptXmlTok->GetTagTokStr("TYP");
    int TermTypeId=LwOnto->GetTermTypeBs()->GetTermTypeId(TermTypeNm, EnLangId);
    // description: line breaks and double spaces are replaced by single spaces
    TStr DescStr;
    if (ConceptXmlTok->IsSubTag("SN")){
      DescStr=ConceptXmlTok->GetTagTokStr("SN");
      DescStr.ChangeChAll('\r', ' '); DescStr.ChangeChAll('\n', ' ');
      DescStr.ChangeStrAll("  ", " "); DescStr.ToTrunc();
    }
    // create term
    PLwTerm Term=TLwTerm::New(-1, TermNm, EnLangId, TermTypeId, DescStr);
    LwOnto->GetTermBs()->AddTermGetTermId(Term);
  }
  printf(" Done. (%d)\n", LwOnto->GetTermBs()->GetTerms());}

  // create links
  {printf("Creating links ...");
  for (int ConceptN=0; ConceptN<ConceptXmlTokV.Len(); ConceptN++){
    PXmlTok ConceptXmlTok=ConceptXmlTokV[ConceptN];
    // source-term-name
    TStr TermNm1;
    if (ConceptXmlTok->IsSubTag("NON-DESCRIPTOR")){
      TermNm1=ConceptXmlTok->GetTagTokStr("NON-DESCRIPTOR");}
    else if (ConceptXmlTok->IsSubTag("DESCRIPTOR")){
      TermNm1=ConceptXmlTok->GetTagTokStr("DESCRIPTOR");}
    int TermId1=LwOnto->GetTermBs()->GetTermId(TermNm1, EnLangId);
    // links
    for (int SubTokN=0; SubTokN<ConceptXmlTok->GetSubToks(); SubTokN++){
      PXmlTok SubTok=ConceptXmlTok->GetSubTok(SubTokN);
      if (SubTok->IsTag()){
        TStr LinkTypeNm=SubTok->GetTagNm();
        if (LwOnto->GetLinkTypeBs()->IsLinkType(LinkTypeNm, EnLangId)){
          // destination-term-name: read it from the current sub-token itself;
          // looking the tag up by name from the concept would not distinguish
          // between several sub-tags of the same link-type
          TStr TermNm2=SubTok->GetTokStr(false);
          int TermId2=LwOnto->GetTermBs()->GetTermId(TermNm2, EnLangId);
          int LinkTypeId=LwOnto->GetLinkTypeBs()->GetLinkTypeId(LinkTypeNm, EnLangId);
          LwOnto->GetLinkBs()->AddLink(TermId1, LinkTypeId, TermId2);
        }
      }
    }
  }
  printf(" Done. (%d)\n", LwOnto->GetLinkBs()->GetLinks());}

  // return ontology
  return LwOnto;
}
Ejemplo n.º 13
0
// Builds a Cyc base from four XML dump files located in directory FPath:
// "lexicon-dump.xml", "taxonomy-dump.xml", "relevance-dump.xml" and
// "kb-dump.xml". Each file is read as a sequence of top-level XML elements
// terminated by an <end> element:
//   lexicon   - <word string=.. cycl=..> adds "#$nameString" edges both ways
//   taxonomy  - <term cycl=..> with <isa>/<genl> children adds "#$isa" and
//               "#$genls" edges both ways; any other child triggers Fail
//   relevance - <term cyc=..> sets human-relevance flags on known vertices
//   kbase     - <sentence> with exactly three <arg> children adds a
//               predicate edge and its "~" back-link, except for the
//               predicates "#$isa", "#$termOfUnit" and "#$genls"
// An element that fails to parse prints the previously read element's key
// fields and triggers Fail. Frequencies of (first argument, argument count)
// combinations seen in the kbase are written to "CycKB-CycLFq.Stat.Txt".
// Progress is printed to stdout. Returns the populated Cyc base.
PCycBs TCycBs::LoadCycXmlDump(const TStr& FPath){
  // file-names
  TStr NrFPath=TStr::GetNrFPath(FPath);
  TStr CycLexiconFNm=NrFPath+"lexicon-dump.xml";
  TStr CycTaxonomyFNm=NrFPath+"taxonomy-dump.xml";
  TStr CycRelevanceFNm=NrFPath+"relevance-dump.xml";
  TStr CycKBaseFNm=NrFPath+"kb-dump.xml";

  // create cyc-base
  PCycBs CycBs=TCycBs::New();

  // lexicon
  {printf("Processing Lexicon %s ...\n", CycLexiconFNm.CStr());
  PSIn CycLexiconSIn=TFIn::New(CycLexiconFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycWStr; TStr PrevCycLStr;
  forever{
    // statistics
    XmlDocs++; if (XmlDocs%1000==0){printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycLexiconSIn);
    if (!XmlDoc->IsOk()){
      // report the last successfully read entry to help locate the error
      printf("%s - %s\n", PrevCycWStr.CStr(), PrevCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("word"));
    TStr CycWStr=TopTok->GetArgVal("string");
    TStr CycLStr=TopTok->GetArgVal("cycl");
    // remember both fields for the error report above
    PrevCycWStr=CycWStr; PrevCycLStr=CycLStr;
    // insert data
    CycBs->AddEdge(CycLStr, "#$nameString", CycWStr);
    CycBs->AddEdge(CycWStr, "~#$nameString", CycLStr);
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // taxonomy
  {printf("Processing Taxonomy %s ...\n", CycTaxonomyFNm.CStr());
  PSIn CycTaxonomySIn=TFIn::New(CycTaxonomyFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevSrcCycLStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycTaxonomySIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevSrcCycLStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr SrcCycLStr=TopTok->GetArgVal("cycl");
    PrevSrcCycLStr=SrcCycLStr;
    for (int SubTokN=0; SubTokN<TopTok->GetSubToks(); SubTokN++){
      PXmlTok SubTok=TopTok->GetSubTok(SubTokN);
      TStr DstCycLStr=SubTok->GetTagNm();
      if (SubTok->IsTag("isa")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$isa", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$isa", SrcCycLStr);
      } else
      if (SubTok->IsTag("genl")){
        DstCycLStr=SubTok->GetArgVal("value");
        CycBs->AddEdge(SrcCycLStr, "#$genls", DstCycLStr);
        CycBs->AddEdge(DstCycLStr, "~#$genls", SrcCycLStr);
      } else {
        // only <isa> and <genl> children are expected
        Fail;
      }
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // relevance
  {printf("Processing Relevance %s ...\n", CycRelevanceFNm.CStr());
  PSIn CycRelevanceSIn=TFIn::New(CycRelevanceFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycStr;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycRelevanceSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycStr.CStr());
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("term"));
    TStr CycStr=TopTok->GetArgVal("cyc");
    PrevCycStr=CycStr;
    // terms that are not vertices of the base are silently skipped
    //IAssert(CycBs->IsVNm(CycStr));
    if (CycBs->IsVNm(CycStr)){
      if (TopTok->GetArgVal("thcl")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanRelevant, true);}
      if (TopTok->GetArgVal("irrel")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanIrrelevant, true);}
      if (TopTok->GetArgVal("clarifying")=="T"){
        CycBs->GetVrtx(CycStr).SetFlag(cfvHumanClarifying, true);}
      if ((TopTok->GetArgVal("thcl")=="T")||(TopTok->GetArgVal("clarifying")=="T")){
        CycBs->GetVrtx(CycStr).SetFlag(cvfHumanOk, true);}
    } else {
      //printf("%s\n", CycStr.CStr());
    }
  }
  printf("%d Docs\nDone.\n", XmlDocs);}

  // knowledge-base
  {printf("Processing KBase %s ...\n", CycKBaseFNm.CStr());
  PSIn CycKBaseSIn=TFIn::New(CycKBaseFNm);
  PXmlDoc XmlDoc; int XmlDocs=0;
  TStr PrevCycLStr; TStrV PrevArgCycLStrV;
  TStrIntH HdCycLToFq;
  forever{
    // statistics
    XmlDocs++;
    if (XmlDocs%1000==0){
      printf("%d Docs\r", XmlDocs);}
    //if (XmlDocs>10000){break;}
    // load xml-tree
    XmlDoc=TXmlDoc::LoadTxt(CycKBaseSIn);
    if (!XmlDoc->IsOk()){
      printf("%s\n", PrevCycLStr.CStr());
      for (int ArgN=0; ArgN<PrevArgCycLStrV.Len(); ArgN++){
        printf(" [%s]", PrevArgCycLStrV[ArgN].CStr());}
      printf("\n");
      Fail;
    }
    // extract fields from xml-tree
    PXmlTok TopTok=XmlDoc->GetTok();
    if (TopTok->IsTag("end")){break;}
    IAssert(TopTok->IsTag("sentence"));
    TStr CycLStr=TopTok->GetArgVal("cycl");
    TXmlTokV ArgXmlTokV; XmlDoc->GetTagTokV("sentence|arg", ArgXmlTokV);
    TStrV ArgCycLStrV;
    for (int ArgN=0; ArgN<ArgXmlTokV.Len(); ArgN++){
      PXmlTok Tok=ArgXmlTokV[ArgN];
      IAssert(Tok->IsTag("arg"));
      if (Tok->IsArg("cycl")){
        TStr ArgCycLStr=Tok->GetArgVal("cycl");
        ArgCycLStrV.Add(ArgCycLStr);
      } else {
        // arguments without a cycl attribute are kept as a placeholder
        ArgCycLStrV.Add("Empty");
      }
    }
    PrevCycLStr=CycLStr;
    PrevArgCycLStrV=ArgCycLStrV;
    // count occurrences per "first argument - number of remaining arguments"
    if (ArgCycLStrV.Len()>0){
      HdCycLToFq.AddDat(ArgCycLStrV[0]+" - "+TInt::GetStr(ArgCycLStrV.Len()-1))++;}
    // insert: only sentences with exactly three arguments become edges
    if (ArgCycLStrV.Len()==3){
      TStr PredNm=ArgCycLStrV[0];
      if ((PredNm!="#$isa")&&(PredNm!="#$termOfUnit")&&(PredNm!="#$genls")){
        TStr BackLinkPredNm=TStr("~")+PredNm;
        TStr Arg1=ArgCycLStrV[1];
        TStr Arg2=ArgCycLStrV[2];
        CycBs->AddEdge(Arg1, PredNm, Arg2);
        CycBs->AddEdge(Arg2, BackLinkPredNm, Arg1);
      }
    }
  }
  // output top cycl relations
  // NOTE(review): the first printed column is 1+frequency; confirm whether a
  // 1-based rank (1+CycLN) was intended instead
  {TFOut CycLSOut("CycKB-CycLFq.Stat.Txt"); FILE* fCycL=CycLSOut.GetFileId();
  TIntStrPrV FqCycLStrPrV; HdCycLToFq.GetDatKeyPrV(FqCycLStrPrV);
  FqCycLStrPrV.Sort(false);
  for (int CycLN=0; CycLN<FqCycLStrPrV.Len(); CycLN++){
    fprintf(fCycL, "%6d. %s\n", 1+FqCycLStrPrV[CycLN].Val1, FqCycLStrPrV[CycLN].Val2.CStr());
  }}
  printf("%d Docs\nDone.\n", XmlDocs);}

  // return cyc-base
  return CycBs;
}