Example #1
File: main.cpp Project: CCJY/coliru
void PrintWordsSorted(const char* str)
{
	char* buffer = (char*)malloc(256 * sizeof(char));
	struct WordList words;
	size_t i = 0;
	char c;
	char* value;
	assert(buffer != NULL);
	buffer[255] = '\0';
	CreateWordList(words);

	/* Iterate over the string, collecting one word at a time. */
	do {
		c = *str;
		if (c == ' ' || c == '\0') {
			if (i > 0) {
				/* Terminate the current word and add it to the list. */
				buffer[i] = '\0';
				value = SumCharsAndDigits(buffer);
				InsertWordList(words, buffer, value[0] - '0');
				i = 0;
			}
		} else if (i < 255) {
			/* Append the character, leaving room for the terminator. */
			buffer[i++] = c;
		}
		str++;
	} while (c != '\0');

	PrintWordList(words);

	DeleteWordList(words);
	free(buffer);
}
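Note: Example #1 only shows call sites, so below is a minimal, self-contained sketch of the word-list interface it appears to rely on. Everything here is an assumption made for illustration -- the WordEntry/WordList layouts, the fixed capacities, the value-ordered insert, the *Sketch names and the main() driver -- and pointer parameters are used where the original .cpp presumably passes the struct by reference; DeleteWordList is omitted because nothing is heap-allocated in the sketch.

#include <stdio.h>
#include <string.h>

struct WordEntry { char word[64]; int value; };
struct WordList  { struct WordEntry entry[128]; size_t count; };

static void CreateWordListSketch(struct WordList *w) { w->count = 0; }

/* insert keeping entries ordered by value (simple sorted-array insert) */
static void InsertWordListSketch(struct WordList *w, const char *word, int value)
{
   size_t i;
   if (w->count >= sizeof(w->entry) / sizeof(w->entry[0]))
      return;                                /* sketch: silently drop overflow */
   i = w->count++;
   while (i > 0 && w->entry[i - 1].value > value) {
      w->entry[i] = w->entry[i - 1];
      i--;
   }
   strncpy(w->entry[i].word, word, sizeof(w->entry[i].word) - 1);
   w->entry[i].word[sizeof(w->entry[i].word) - 1] = '\0';
   w->entry[i].value = value;
}

static void PrintWordListSketch(const struct WordList *w)
{
   size_t i;
   for (i = 0; i < w->count; i++)
      printf("%s (%d)\n", w->entry[i].word, w->entry[i].value);
}

int main(void)
{
   struct WordList words;
   CreateWordListSketch(&words);
   InsertWordListSketch(&words, "banana", 6);
   InsertWordListSketch(&words, "apple", 5);
   PrintWordListSketch(&words);             /* "apple" prints first: lower value */
   return 0;
}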
Example #2
/* Initialise: initialise global data structures */
void Initialise(void)
{
   int  i;
   char path[256];

   CreateHeap(&langHeap,"LModel mem",MSTAK,1,0.5,1000,20000);

   if (wlistFN!=NULL) {
      tgtVoc = &wlist;
      CreateWordList(wlistFN,tgtVoc,10);
   }

   if (processText) {
      /* init empty buffer */
      CreateWordMap(NULL,&wmap,newWords); 
      wmap.hasCnts = TRUE;
      wmap.name = defMapName;
      wmap.htkEsc = htkEscape;
      ++wmap.seqno;
      mapUpdated = FALSE;
     
      if (tgtVoc!=NULL) {      /* add words from word list to the map */
	 pruneWords = TRUE;
	 for (i=0; i<tgtVoc->used; i++) {
	    AddWordToMap(&wmap,tgtVoc->id[i]);
	 }
	 SortWordMap(&wmap);
	 unkId = GetLabId(unkStr,FALSE);  
      }
      
      /* init ngram buffer */
      MakeFN(rootFN,dbsDir,NULL,path);
      stdBuf.used = 0;
      stdBuf.ng[nSize] = 1;  /* count = 1 */
      stdBuf.ngb = CreateNGBuffer(&langHeap,nSize,ngbSize,path,&wmap);
   } else {
      CreateWordMap(omapFN,&wmap,1);
   }
   
   CreateInputSet(&gstack,&wmap,&inSet);
   binfo.wmap = &wmap;
   binfo.inSet = &inSet;
   binfo.nSize = nSize;
}
Example #3
/* CombineModels: load models and combine with the one in memory */
BackOffLM *CombineModels(MemHeap *heap,LMInfo *lmi,int nLModel,int nSize,WordMap *wl) 
{
   int i,j,nw;
   float x;
   LMInfo *li;
   BackOffLM *tgtLM;
   WordMap wordList;
   LabId lab;
   NameId *na;

   /* normalise weights */
   for (x=0.0, i=1; i<nLModel; i++)
      x += lmInfo[i].weight;
   lmInfo[0].weight = 1.0-x;

   /* load all models except the first one*/
   for (li=lmInfo+1, i=1; i<nLModel; i++, li++) {
      if (trace&T_TOP)
	 printf("Loading language model from %s\n",li->fn);
      li->lm = LoadLangModel(li->fn,wl,1.0,LMP_FLOAT,heap);
   }
   if (wl==NULL) {
      wl = &wordList;
      /* derive word list from LMs */
      for (li=lmInfo, i=0; i<nLModel; i++, li++) {  
	 na = li->lm->binMap;
	 for (j=0; j<li->lm->vocSize; j++) {
	    lab = GetLabId(na[j+1]->name,TRUE);
	    lab->aux=NULL; 
	 }
      }
      for (nw=0,li=lmInfo, i=0; i<nLModel; i++, li++) {  
	 na = li->lm->binMap;
	 for (j=0; j<li->lm->vocSize; j++) {
	    lab = GetLabId(na[j+1]->name,FALSE);
	    if (lab->aux==NULL) {
	       nw++; lab->aux = (Ptr) wl;
	    }
	 }
      }
      CreateWordList(NULL,wl,nw+10);
      for (nw=0,li=lmInfo, i=0; i<nLModel; i++, li++) {
	 na = li->lm->binMap;
	 for (j=0; j<li->lm->vocSize; j++) {
	    lab = GetLabId(na[j+1]->name,FALSE);
	    if (lab->aux==(Ptr) wl) {
	       wl->id[nw++]=lab; lab->aux = NULL;
	    }
	 }
      }
      wl->used = nw;
   }
   if (trace&T_TOP) {
      printf("Using language model(s): \n");
      for (li=lmInfo,i=0; i<nLModel; i++,li++)
	 printf("  %d-gram %s, weight %.2f\n",li->lm->nSize,li->fn,li->weight);
   }
   if (trace&T_TOP) {
      printf("Generating %d-gram model %s\n",nSize,outFN);
      fflush(stdout);
   }
   tgtLM = MergeModels(heap,lmInfo,nLModel,nSize,wl);
#ifdef HTK_CRYPT   
   if (tgtLM->encrypt && binfo.saveFmt==LMF_TEXT)
      binfo.saveFmt = LMF_BINARY;
#endif
   for (i=1; i<=nSize; i++) {
      tgtLM->gInfo[i].fmt = (i==1) ? LMF_TEXT : binfo.saveFmt;
   }
   return tgtLM;
}
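The vocabulary merge in CombineModels relies on each unique LabId carrying an aux pointer: a word is tagged the first time it is seen so duplicates across models are counted once, and the tagged words are then gathered into the list allocated by CreateWordList(NULL,wl,nw+10). Below is a self-contained sketch of that mark-and-collect pattern. The Symbol table and Intern() merely stand in for HTK's GetLabId() store, and the vocabularies are made up; the real code also needs a first pass to clear stale aux values, which the sketch can skip because Intern() starts every symbol at NULL.

#include <stdio.h>
#include <string.h>

typedef struct { const char *name; void *aux; } Symbol;

static Symbol symtab[16];
static int    nSyms = 0;

/* return the unique Symbol for a name, creating it on first use
   (a stand-in for HTK's GetLabId) */
static Symbol *Intern(const char *name)
{
   int i;
   for (i = 0; i < nSyms; i++)
      if (strcmp(symtab[i].name, name) == 0) return &symtab[i];
   symtab[nSyms].name = name;
   symtab[nSyms].aux  = NULL;
   return &symtab[nSyms++];
}

int main(void)
{
   const char *vocabA[] = { "the", "cat", "sat" };
   const char *vocabB[] = { "the", "dog", "sat" };
   const char **vocabs[] = { vocabA, vocabB };
   int sizes[] = { 3, 3 };
   Symbol *merged[16];
   int marker;                  /* its address is the "already counted" tag */
   int m, j, nw = 0;

   /* pass 1: count unique words, tagging each on first sight
      (in CombineModels this count sizes CreateWordList(NULL,wl,nw+10)) */
   for (m = 0; m < 2; m++)
      for (j = 0; j < sizes[m]; j++) {
         Symbol *s = Intern(vocabs[m][j]);
         if (s->aux == NULL) { nw++; s->aux = &marker; }
      }

   /* pass 2: collect each tagged word exactly once and clear its tag again */
   nw = 0;
   for (m = 0; m < 2; m++)
      for (j = 0; j < sizes[m]; j++) {
         Symbol *s = Intern(vocabs[m][j]);
         if (s->aux == &marker) { merged[nw++] = s; s->aux = NULL; }
      }

   printf("%d unique words:", nw);          /* 4: the cat sat dog */
   for (j = 0; j < nw; j++) printf(" %s", merged[j]->name);
   printf("\n");
   return 0;
}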
Example #4
static int       nSize = 0;              /* output ngram size */
static WordMap   wList;                  /* the word list */
static int       nLModel;                /* number of loaded LMs */
static LMInfo    lmInfo[MAX_LMODEL];     /* array of loaded LMs */
static BackOffLM *tgtLM;                 /* target lm */
static char      *tgtFN;                 /* output model name */
static MemHeap   langHeap;               /* Stores global stats */
static LMFileFmt saveFmt = DEF_SAVEFMT;  /* LM file format */

/* ---------------- Configuration Parameters --------------------- */

static ConfParam *cParm[MAXGLOBS];
static int nParm = 0;                    /* total num params */

/* ---------------- Function prototypes -------------------------- */

void Initialise(void);

/* ---------------- Process Command Line ------------------------- */

/* SetConfParms: set conf parms relevant to this tool */
void SetConfParms(void)
{
   int i;

   nParm = GetConfig("LMERGE", TRUE, cParm, MAXGLOBS);
   if (nParm>0){
      if (GetConfInt(cParm,nParm,"TRACE",&i)) trace = i;
   }
}

char *ReturnLMName(int fmt)
{
   switch(fmt) {
      case LMF_TEXT:
         return LM_TXT_TEXT;
      case LMF_BINARY:
         return LM_TXT_BINARY;
      case LMF_ULTRA:
         return LM_TXT_ULTRA;
      default:
         return LM_TXT_OTHER;
   }
}
Example #5
/* Initialise: perform global initialisations */
static void Initialise(void)
{
   int i,j,ndx;
   float x;
   LMInfo *li;
   Boolean inLM;
   LabId *wid,lab;
   NameId *na,nid;
   Boolean isPipe;

   nulClass = GetLabId(nulName,TRUE);

   /* normalise weights */
   for (x=0.0, i=1; i<nLModel; i++)
      x += lmInfo[i].weight;
   lmInfo[0].weight = 1.0-x;

   /* load all models */
   for (li=lmInfo, i=0; i<nLModel; i++, li++) {
      if (trace&T_TOP)
	 printf("Loading language model from %s\n",li->fn);
      li->lm = LoadLangModel(li->fn,NULL,1.0,LMP_LOG|LMP_COUNT,&permHeap);
      if (li->lm->probType==LMP_COUNT)
	 RebuildLM(li->lm,cutOff,wdThresh,LMP_LOG);
      AttachAccessInfo(li->lm);
   }

   if (trace&T_TOP) {
      printf("Using language model(s): \n");
      for (li=lmInfo,i=0; i<nLModel; i++,li++)
	 printf("  %d-gram %s, weight %.2f\n",li->lm->nSize,li->fn,li->weight);
   }
   if (numTests==0) {
      numTests=1; testInfo[0] = lmInfo[0].lm->nSize;
   }

   /* load or create word list */
   if (wlistFN!=NULL) {
      /* load word list from file */
      CreateWordList(wlistFN,&wList,nWords+10);
      nWords = wList.used;
      for (wid=wList.id, i=0; i<nWords; i++,wid++) /* assign lookup indices */
	 (*wid)->aux = (Ptr) (i+1);
   } else {
      /* derive word list from LMs */
      for (nWords=0,li=lmInfo, i=0; i<nLModel; i++, li++)
      {
 	 /* Obtain class-LM word list in a different way */
	 if (li->lm->classLM)
	 {
	   na = li->lm->classBM;

	   for (j=0; j<li->lm->classW; j++)
	   {
	     lab = GetLabId(na[j+1]->name, TRUE);
	     if (lab->aux==NULL)
	       lab->aux = (Ptr) (++nWords);
	   }
	 }
	 else
	 {
	   na = li->lm->binMap;

	   for (j=0; j<li->lm->vocSize; j++)
	   {
	     lab = GetLabId(na[j+1]->name,TRUE);
	     if (lab->aux==NULL)
	       lab->aux = (Ptr) (++nWords);
	   }
	 }
      }
      CreateWordList(NULL,&wList,nWords+10);
      for (li=lmInfo, i=0; i<nLModel; i++, li++) {
	/* Obtain class-LM word list in a different way */
	if (li->lm->classLM)
	{
	  na = li->lm->classBM;

	  for (j=0; j<li->lm->classW; j++)
	  {
	    lab = GetLabId(na[j+1]->name,TRUE);
	    ndx = ((int) lab->aux) - 1;
	    wList.id[ndx] = lab;
	  }
	}
	else
	{
	  na = li->lm->binMap;

	  for (j=0; j<li->lm->vocSize; j++)
	  {
	    lab = GetLabId(na[j+1]->name,TRUE);
	    ndx = ((int) lab->aux) - 1;
	    wList.id[ndx] = lab;
	  }
	}

      }
      wList.used = nWords;
   }
   if (trace&T_TOP) {
      printf("Found %d unique words in %d model(s)\n",nWords,nLModel);
      fflush(stdout);
   }
   if (unkId->aux==NULL && !skipOOV) {
      HError(16620,"LPlex: OOV class symbol %s not in word list",unkId->name);
   }
   if (sstId->aux==NULL) {
      HError(16620,"LPlex: sentence start symbol %s not in word list",sstId->name);
   }
   if (senId->aux==NULL) {
      HError(16620,"LPlex: sentence end symbol %s not in word list",senId->name);
   }

   /* create lookup table */
   l2nId = (NameId **) New(&permHeap,nLModel*sizeof(NameId *));
   /* create LabId -> NameId lookup arrays (one per LM) */
   for (li=lmInfo, i=0; i<nLModel; i++, li++) {
      na = (NameId *) New(&permHeap,(nWords+2)*sizeof(NameId));
      for (wid = wList.id, j=0; j<nWords; j++, wid++) {
	if (li->lm->classLM)
	{
	  nid = na[(int) ((*wid)->aux)] = GetNameId(li->lm->classH, (*wid)->name, FALSE);
	}
	else
	{
	  nid = na[(int) ((*wid)->aux)] = GetNameId(li->lm->htab, (*wid)->name, FALSE);
	}
#ifdef SANITY
	 if (nid==NULL)
	    HError(-16625,"Unable to find word %s in model %s\n",(*wid)->name,li->fn);
#endif
      }
      l2nId[i] = na;
   }

   /* ensure words present at least in one model */
   for (wid = wList.id, j=0; j<nWords; j++, wid++) {
      for (inLM=FALSE,i=0; i<nLModel; i++)
	 if (l2nId[i][(int) ((*wid)->aux)]!=NULL)
	    inLM = TRUE;
      if (!inLM)
	 HError(16625,"Unable to find word %s in any model\n",(*wid)->name);
   }

   /* create equivalence class lookup array */
   eqId = (LabId *) New(&permHeap,(nWords+NumEquiv()+2)*sizeof(NameId));
   for (wid = wList.id, i=0; i<nWords; i++, wid++) {
      eqId[(int) ((*wid)->aux)] = NULL;
   }

   /* link equivalence classes */
   LinkEquiv();

   /* open output stream */
   if (outStreamFN != NULL)
     if ((outStream = FOpen(outStreamFN,NoOFilter,&isPipe)) == NULL)
        HError(16610,"Initialise: unable to open output file %s",outStreamFN);

}
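Both CombineModels (Example #3) and this Initialise (Example #5) begin by normalising the interpolation weights so the mixture sums to one, with model 0 taking whatever the explicitly weighted models leave over. A tiny self-contained illustration of that step follows; the weight values are made up.

#include <stdio.h>

int main(void)
{
   float weight[3] = { 0.0, 0.3, 0.2 };   /* weight[0] is recomputed below */
   int   nLModel   = 3;
   float x;
   int   i;

   /* sum the user-supplied weights of models 1..nLModel-1 ... */
   for (x = 0.0, i = 1; i < nLModel; i++)
      x += weight[i];
   /* ... and give the remainder to model 0 so the mixture sums to 1.0 */
   weight[0] = 1.0 - x;

   printf("weights: %.2f %.2f %.2f\n", weight[0], weight[1], weight[2]);
   return 0;
}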
Example #6
int main(int argc, char *argv[])
{
   int i;
   char *s,*c;
   char fmt[256];
   dictList *dEntry,*d;

   InitShell(argc,argv,prog_version,prog_vc_id);
   InitMem();
   InitMath();
   InitWave();
   InitLabel();
   InitDict();
   InitWMap();
   InitLUtil();
   InitLModel();
   InitPCalc();

   if (!InfoPrinted() && NumArgs() == 0)
      ReportUsage();
   if (NumArgs() == 0) Exit(EXIT_SUCCESS);

   SetConfParms();

   CreateHeap(&langHeap,"langHeap",MSTAK,1,0.5,5000,40000);

   for (i=1; i<=LM_NSIZE; i++) cutOff[i] = 0;

   while (NextArg() == SWITCHARG) {
      s = GetSwtArg();
      if (strlen(s)!=1) 
         HError(16919,"Bad switch %s; must be single letter",s);
      switch(s[0]){
         case 'c':
           i = GetChkedInt(2,LM_NSIZE,s); 
	   cutOff[i] = GetChkedInt(1,1000,s);
	   break;
         case 'd':
	   if (NextArg()!=STRINGARG)
	     HError(16919,"LMCopy: Input dictionary file name expected");
	   dEntry=New(&gcheap,sizeof(dictList));
	   dEntry->fname=GetStrArg(); dEntry->next=NULL;
	   if (dList==NULL) dList=dEntry;
	   else {
	     for (d=dList;d->next!=NULL;d=d->next);
	     d->next=dEntry;
	   }
	   break;
         case 'f':
	   strcpy(fmt,GetStrArg());
	   for (c=fmt; *c!=0; *c=toupper(*c), c++);
	    if (strcmp(fmt, LM_TXT_TEXT)==0)
               saveFmt = LMF_TEXT;
	    else if (strcmp(fmt, LM_TXT_BINARY)==0)
               saveFmt = LMF_BINARY;
	    else if (strcmp(fmt, LM_TXT_ULTRA)==0)
               saveFmt = LMF_ULTRA;
	    else
	       HError(16919,"Unrecognised LM format, should be one of [%s, %s, %s]",
		      LM_TXT_TEXT, LM_TXT_BINARY, LM_TXT_ULTRA);
	   break;
	 case 'm':
	   remDup=FALSE;
	   break;
         case 'n':
            nSize = GetChkedInt(1,LM_NSIZE,s); break;
	 case 'o':
	   firstOnly=TRUE;
	   break;
	 case 'u':
	   if (NextArg()!=STRINGARG)
	     HError(16919,"LMCopy: Unigram file name expected");
	   uniFn = GetStrArg();
	   break;
         case 'v':
	   if (NextArg()!=STRINGARG)
	     HError(16919,"LMCopy: Dictionary output file name expected");
	   outDictFn = GetStrArg();
	   break;
         case 'w':
	    if (NextArg() != STRINGARG)
	       HError(16919,"LPlex: Word list file name expected");
	    wlistFN = GetStrArg();
	    break;
	 case 'T':
	    trace = GetChkedInt(0,077, s); break;
         default:
            HError(16919,"LMPlex: Unknown switch %s",s);
      }
   }
   if (NextArg()!=STRINGARG)  /* load the language model */
      HError(16919, "Input language model filename expected");
   srcFN = GetStrArg();

   if (NextArg()!=STRINGARG)  /* output language model name */
      HError(16919, "Output language model filename expected");
   tgtFN= GetStrArg();

   if (wlistFN!=NULL) {
      InitVocab(&vocab);   
      if(ReadDict(wlistFN,&vocab) < SUCCESS) 
	 HError(16913,"Could not read dict in %s", wlistFN);
      if (trace&T_TOP) {
	printf("Loaded %d words from %s\n",vocab.nwords,wlistFN); 
	fflush(stdout);
      }
      voc = &vocab;
      CreateWordList(wlistFN,&wlist,10);
      lm = LoadLangModel(srcFN,&wlist,1.0,LMP_FLOAT|LMP_COUNT,&langHeap);
   } else {
      voc = NULL;
      lm = LoadLangModel(srcFN,NULL,1.0,LMP_FLOAT|LMP_COUNT,&langHeap);
   }
   if (trace&T_TOP) {
     printf("Loaded model from %s\n",srcFN); 
     fflush(stdout);
   }
   if (lm->probType==LMP_COUNT) {
      RebuildLM(lm, cutOff, NULL, LMP_FLOAT); /* GLM there was no threshold before! */
   }
   if (uniFn!=NULL)
      ReplaceUnigrams(uniFn,lm);
   if (nSize>0 && nSize<lm->nSize)
      lm->nSize = nSize;
#ifdef HTK_CRYPT
   if (lm->encrypt && saveFmt==LMF_TEXT)
     saveFmt = LMF_BINARY;
#endif
   for (i=1;i<=lm->nSize;i++)
      lm->gInfo[i].fmt = (i==1) ? LMF_TEXT : saveFmt;
   SaveLangModel(tgtFN,lm);
   if (trace&T_TOP) {
     printf("Wrote model to %s\n",tgtFN); 
     fflush(stdout);
   }
   if (outDictFn) {
      MakeDictionary(outDictFn,dList,voc);
   }

   Exit(EXIT_SUCCESS);
   return EXIT_SUCCESS; /* never reached -- make compiler happy */
}   
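The -d branch above grows a singly linked list of dictionary file names by walking to the current tail before appending. Below is a self-contained sketch of that pattern: the dictList layout (fname, next) mirrors how Example #6 uses it, while malloc() stands in for HTK's New(&gcheap,...) and the file names are placeholders.

#include <stdio.h>
#include <stdlib.h>

typedef struct dictList { const char *fname; struct dictList *next; } dictList;

/* append a new entry at the tail and return the (possibly new) head */
static dictList *AppendDict(dictList *head, const char *fname)
{
   dictList *entry, *d;

   entry = malloc(sizeof(dictList));
   if (entry == NULL) { perror("malloc"); exit(EXIT_FAILURE); }
   entry->fname = fname;
   entry->next  = NULL;
   if (head == NULL) return entry;                 /* first entry becomes the head */
   for (d = head; d->next != NULL; d = d->next);   /* walk to the tail */
   d->next = entry;
   return head;
}

int main(void)
{
   dictList *dList = NULL, *d;

   dList = AppendDict(dList, "dict1.dct");
   dList = AppendDict(dList, "dict2.dct");
   for (d = dList; d != NULL; d = d->next)
      printf("%s\n", d->fname);
   return 0;
}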
Example #7
int main(int argc, char *argv[])
{
   int i;
   char *s,*c;
   char fmt[256];

   InitShell(argc,argv,lnorm_version,lnorm_vc_id);
   InitMem();
   InitMath();
   InitWave();
   InitLabel();
   InitWMap();
   InitLUtil();
   InitLModel();
   InitPCalc();

   if (!InfoPrinted() && NumArgs() == 0)
      ReportUsage();
   if (NumArgs() == 0) Exit(EXIT_SUCCESS);

   SetConfParms();

   CreateHeap(&langHeap,"langHeap",MSTAK,1,0.5,5000,40000);

   for (i=1; i<=LM_NSIZE; i++) cutOff[i] = 0, wdThresh[i] = 0.0;
   while (NextArg() == SWITCHARG) {
      s = GetSwtArg();
      if (strlen(s)!=1) 
         HError(16519,"Bad switch %s; must be single letter",s);
      switch(s[0]){
         case 'c':
            i = GetChkedInt(2,LM_NSIZE,s); 
	    cutOff[i] = GetChkedInt(1,1000,s);
	    break;
         case 'd':
            i = GetChkedInt(2,LM_NSIZE,s); 
	    wdThresh[i] = GetChkedFlt(0.0,1E10,s);
	    break;
         case 'f':
	   strcpy(fmt,GetStrArg());
	   for (c=fmt; *c!=0; *c=toupper(*c),c++);
           if (strcmp(fmt, LM_TXT_TEXT)==0)
              saveFmt = LMF_TEXT;
           else if (strcmp(fmt, LM_TXT_BINARY)==0)
              saveFmt = LMF_BINARY;
           else if (strcmp(fmt, LM_TXT_ULTRA)==0)
              saveFmt = LMF_ULTRA;
	   else
              HError(16519,"Unrecognised LM format, should be one of [%s, %s, %s]",
                     LM_TXT_TEXT, LM_TXT_BINARY, LM_TXT_ULTRA);
	   break;
         case 'n':
            nSize = GetChkedInt(1,LM_NSIZE,s); break;
         case 'w':
	    if (NextArg() != STRINGARG)
	       HError(16519,"LPlex: Word list file name expected");
	    wlistFN = GetStrArg();
	    break;
	 case 'T':
	    trace = GetChkedInt(0,077, s); break;
         default:
            HError(16519,"LMPlex: Unknown switch %s",s);
      }
   }
   if (NextArg()!=STRINGARG)  /* load the language model */
      HError(16519, "Input language model filename expected");
   srcFN = GetStrArg();

   if (NextArg()!=STRINGARG)  /* output language model name */
      HError(16519, "Output language model filename expected");
   tgtFN= GetStrArg();

   if (wlistFN!=NULL) {
      CreateWordList(wlistFN,&wlist,10);
      lm = LoadLangModel(srcFN,&wlist,1.0,LMP_FLOAT|LMP_COUNT,&langHeap);
   } else {
      lm = LoadLangModel(srcFN,NULL,1.0,LMP_FLOAT|LMP_COUNT,&langHeap);
   }
   if (lm->probType==LMP_COUNT) {
      RebuildLM(lm,cutOff,wdThresh,LMP_FLOAT);
   } else {
      NormaliseLM(lm);
   }
   if (nSize>0 && nSize<lm->nSize)
      lm->nSize = nSize;
   for (i=1;i<=lm->nSize;i++)
      lm->gInfo[i].fmt = (i==1) ? LMF_TEXT : saveFmt;
   SaveLangModel(tgtFN,lm);

   Exit(EXIT_SUCCESS);
   return EXIT_SUCCESS; /* never reached -- make compiler happy */
}
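Examples #3, #6 and #7 all finish the same way: the unigram section is forced to text format while every higher order gets the selected save format. A toy illustration of that per-order assignment follows; the enum values, array size and order count here are made up rather than taken from the HTK headers.

#include <stdio.h>

typedef enum { LMF_TEXT, LMF_BINARY, LMF_ULTRA } LMFileFmt;

int main(void)
{
   LMFileFmt saveFmt = LMF_BINARY;      /* e.g. selected with -f BIN */
   LMFileFmt fmt[5];                    /* index 1..nSize, like gInfo[i].fmt */
   int nSize = 4, i;

   for (i = 1; i <= nSize; i++)
      fmt[i] = (i == 1) ? LMF_TEXT : saveFmt;   /* unigrams stay human-readable */

   for (i = 1; i <= nSize; i++)
      printf("order %d: %s\n", i, fmt[i] == LMF_TEXT ? "text" : "binary");
   return 0;
}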