gk_i2cc2i_t *gk_i2cc2i_create_common(char *alphabet)
{
    
    
    int nsymbols;
    gk_idx_t i;
    gk_i2cc2i_t *t;

    nsymbols = strlen(alphabet);
    t        = gk_malloc(sizeof(gk_i2cc2i_t),"gk_i2c_create_common");
    t->n     = nsymbols;
    t->i2c   = gk_cmalloc(256, "gk_i2c_create_common");
    t->c2i   = gk_imalloc(256, "gk_i2c_create_common");
    

    gk_cset(256, -1, t->i2c);
    gk_iset(256, -1, t->c2i);
    
    for(i=0;i<nsymbols;i++){
	t->i2c[i] = alphabet[i];
	t->c2i[(int)alphabet[i]] = i;
    }

    return t;

}
gk_graph_t *gk_graph_ExtractPartition(gk_graph_t *graph, int *part, int pid)
{
  ssize_t i, j, nnz;
  gk_graph_t *ngraph;

  ngraph = gk_graph_Create();

  ngraph->nrows = 0;
  ngraph->ncols = graph->ncols;

  for (nnz=0, i=0; i<graph->nrows; i++) {
    if (part[i] == pid) {
      ngraph->nrows++;
      nnz += graph->rowptr[i+1]-graph->rowptr[i];
    }
  }

  ngraph->rowptr = gk_zmalloc(ngraph->nrows+1, "gk_graph_ExtractPartition: rowptr");
  ngraph->rowind = gk_imalloc(nnz, "gk_graph_ExtractPartition: rowind");
  ngraph->rowval = gk_fmalloc(nnz, "gk_graph_ExtractPartition: rowval");

  ngraph->rowptr[0] = 0;
  for (nnz=0, j=0, i=0; i<graph->nrows; i++) {
    if (part[i] == pid) {
      gk_icopy(graph->rowptr[i+1]-graph->rowptr[i], graph->rowind+graph->rowptr[i], ngraph->rowind+nnz);
      gk_fcopy(graph->rowptr[i+1]-graph->rowptr[i], graph->rowval+graph->rowptr[i], ngraph->rowval+nnz);
      nnz += graph->rowptr[i+1]-graph->rowptr[i];
      ngraph->rowptr[++j] = nnz;
    }
  }
  ASSERT(j == ngraph->nrows);

  return ngraph;
}
gk_graph_t *gk_graph_ExtractRows(gk_graph_t *graph, int nrows, int *rind)
{
  ssize_t i, ii, j, nnz;
  gk_graph_t *ngraph;

  ngraph = gk_graph_Create();

  ngraph->nrows = nrows;
  ngraph->ncols = graph->ncols;

  for (nnz=0, i=0; i<nrows; i++)  
    nnz += graph->rowptr[rind[i]+1]-graph->rowptr[rind[i]];

  ngraph->rowptr = gk_zmalloc(ngraph->nrows+1, "gk_graph_ExtractPartition: rowptr");
  ngraph->rowind = gk_imalloc(nnz, "gk_graph_ExtractPartition: rowind");
  ngraph->rowval = gk_fmalloc(nnz, "gk_graph_ExtractPartition: rowval");

  ngraph->rowptr[0] = 0;
  for (nnz=0, j=0, ii=0; ii<nrows; ii++) {
    i = rind[ii];
    gk_icopy(graph->rowptr[i+1]-graph->rowptr[i], graph->rowind+graph->rowptr[i], ngraph->rowind+nnz);
    gk_fcopy(graph->rowptr[i+1]-graph->rowptr[i], graph->rowval+graph->rowptr[i], ngraph->rowval+nnz);
    nnz += graph->rowptr[i+1]-graph->rowptr[i];
    ngraph->rowptr[++j] = nnz;
  }
  ASSERT(j == ngraph->nrows);

  return ngraph;
}
Exemple #4
0
void gk_find_frequent_itemsets(int ntrans, int *tranptr, int *tranind, 
        int minfreq, int maxfreq, int minlen, int maxlen, 
        void (*process_itemset)(void *stateptr, int nitems, int *itemids, 
                                int ntrans, int *transids),
        void *stateptr)
{
  ssize_t i;
  gk_csr_t *mat, *pmat;
  isparams_t params;
  int *pattern;

  /* Create the matrix */
  mat = gk_csr_Create();
  mat->nrows  = ntrans;
  mat->ncols  = tranind[gk_iargmax(tranptr[ntrans], tranind)]+1;
  mat->rowptr = gk_zmalloc(ntrans+1, "gk_find_frequent_itemsets: mat.rowptr");
  for (i=0; i<ntrans+1; i++)
    mat->rowptr[i] = tranptr[i];
  mat->rowind = gk_icopy(tranptr[ntrans], tranind, gk_imalloc(tranptr[ntrans], "gk_find_frequent_itemsets: mat.rowind"));
  mat->colids = gk_iincset(mat->ncols, 0, gk_imalloc(mat->ncols, "gk_find_frequent_itemsets: mat.colids"));

  /* Setup the parameters */
  params.minfreq  = minfreq;
  params.maxfreq  = (maxfreq == -1 ? mat->nrows : maxfreq);
  params.minlen   = minlen;
  params.maxlen   = (maxlen == -1 ? mat->ncols : maxlen);
  params.tnitems  = mat->ncols;
  params.callback = process_itemset;
  params.stateptr = stateptr;
  params.rmarker  = gk_ismalloc(mat->nrows, 0, "gk_find_frequent_itemsets: rmarker");
  params.cand     = gk_ikvmalloc(mat->ncols, "gk_find_frequent_itemsets: cand");

  /* Perform the initial projection */
  gk_csr_CreateIndex(mat, GK_CSR_COL);
  pmat = itemsets_project_matrix(&params, mat, -1);
  gk_csr_Free(&mat);

  pattern = gk_imalloc(pmat->ncols, "gk_find_frequent_itemsets: pattern");
  itemsets_find_frequent_itemsets(&params, pmat, 0, pattern); 

  gk_csr_Free(&pmat);
  gk_free((void **)&pattern, &params.rmarker, &params.cand, LTERM);

}
gk_seq_t *gk_seq_ReadGKMODPSSM(char *filename)
{
    gk_seq_t *seq;
    gk_idx_t i, j, ii;
    size_t ntokens, nbytes, len;
    FILE *fpin;
    
    
    gk_Tokens_t tokens;
    static char *AAORDER = "ARNDCQEGHILKMFPSTWYVBZX*";
    static int PSSMWIDTH = 20;
    char *header, line[MAXLINELEN];
    gk_i2cc2i_t *converter;

    header = gk_cmalloc(PSSMWIDTH, "gk_seq_ReadGKMODPSSM: header");
    
    converter = gk_i2cc2i_create_common(AAORDER);
    
    gk_getfilestats(filename, &len, &ntokens, NULL, &nbytes);
    len --;

    seq = gk_malloc(sizeof(gk_seq_t),"gk_seq_ReadGKMODPSSM");
    gk_seq_init(seq);
    
    seq->len = len;
    seq->sequence = gk_imalloc(len, "gk_seq_ReadGKMODPSSM");
    seq->pssm     = gk_iAllocMatrix(len, PSSMWIDTH, 0, "gk_seq_ReadGKMODPSSM");
    seq->psfm     = gk_iAllocMatrix(len, PSSMWIDTH, 0, "gk_seq_ReadGKMODPSSM");
    
    seq->nsymbols = PSSMWIDTH;
    seq->name     = gk_getbasename(filename);
    
    fpin = gk_fopen(filename,"r","gk_seq_ReadGKMODPSSM");


    /* Read the header line */
    if (fgets(line, MAXLINELEN-1, fpin) == NULL)
      errexit("Unexpected end of file: %s\n", filename);
    gk_strtoupper(line);
    gk_strtokenize(line, " \t\n", &tokens);

    for (i=0; i<PSSMWIDTH; i++)
	header[i] = tokens.list[i][0];
    
    gk_freetokenslist(&tokens);
    

    /* Read the rest of the lines */
    for (i=0, ii=0; ii<len; ii++) {
	if (fgets(line, MAXLINELEN-1, fpin) == NULL)
          errexit("Unexpected end of file: %s\n", filename);
	gk_strtoupper(line);
	gk_strtokenize(line, " \t\n", &tokens);
	
	seq->sequence[i] = converter->c2i[(int)tokens.list[1][0]];
	
	for (j=0; j<PSSMWIDTH; j++) {
	    seq->pssm[i][converter->c2i[(int)header[j]]] = atoi(tokens.list[2+j]);
	    seq->psfm[i][converter->c2i[(int)header[j]]] = atoi(tokens.list[2+PSSMWIDTH+j]);
	}
	
      
	
	gk_freetokenslist(&tokens);
	i++;
    }
    
    seq->len = i; /* Reset the length if certain characters were skipped */
    
    gk_free((void **)&header, LTERM);
    gk_fclose(fpin);

    return seq;
}
Exemple #6
0
gk_csr_t *itemsets_project_matrix(isparams_t *params, gk_csr_t *mat, int cid)
{
  ssize_t i, j, k, ii, pnnz;
  int nrows, ncols, pnrows, pncols;
  ssize_t *colptr, *pcolptr;
  int *colind, *colids, *pcolind, *pcolids, *rmarker;
  gk_csr_t *pmat;
  gk_ikv_t *cand;

  nrows  = mat->nrows;
  ncols  = mat->ncols;
  colptr = mat->colptr;
  colind = mat->colind;
  colids = mat->colids;

  rmarker = params->rmarker;
  cand    = params->cand;


  /* Allocate space for the projected matrix based on what you know thus far */
  pmat = gk_csr_Create();
  pmat->nrows  = pnrows = (cid == -1 ? nrows : colptr[cid+1]-colptr[cid]);


  /* Mark the rows that will be kept and determine the prowids */
  if (cid == -1) { /* Initial projection */
    gk_iset(nrows, 1, rmarker);
  }
  else { /* The other projections */
    for (i=colptr[cid]; i<colptr[cid+1]; i++) 
      rmarker[colind[i]] = 1;
  }


  /* Determine the length of each column that will be left in the projected matrix */
  for (pncols=0, pnnz=0, i=cid+1; i<ncols; i++) {
    for (k=0, j=colptr[i]; j<colptr[i+1]; j++) {
      k += rmarker[colind[j]];
    }
    if (k >= params->minfreq && k <= params->maxfreq) {
      cand[pncols].val   = i;
      cand[pncols++].key = k;
      pnnz += k;
    }
  }

  /* Sort the columns in increasing order */
  gk_ikvsorti(pncols, cand);


  /* Allocate space for the remaining fields of the projected matrix */
  pmat->ncols  = pncols;
  pmat->colids = pcolids = gk_imalloc(pncols, "itemsets_project_matrix: pcolids");
  pmat->colptr = pcolptr = gk_zmalloc(pncols+1, "itemsets_project_matrix: pcolptr");
  pmat->colind = pcolind = gk_imalloc(pnnz, "itemsets_project_matrix: pcolind");


  /* Populate the projected matrix */
  pcolptr[0] = 0;
  for (pnnz=0, ii=0; ii<pncols; ii++) {
    i = cand[ii].val;
    for (j=colptr[i]; j<colptr[i+1]; j++) {
      if (rmarker[colind[j]]) 
        pcolind[pnnz++] = colind[j];
    }

    pcolids[ii] = colids[i];
    pcolptr[ii+1] = pnnz;
  }


  /* Reset the rmarker array */
  if (cid == -1) { /* Initial projection */
    gk_iset(nrows, 0, rmarker);
  }
  else { /* The other projections */
    for (i=colptr[cid]; i<colptr[cid+1]; i++) 
      rmarker[colind[i]] = 0;
  }


  return pmat;
}
gk_graph_t *gk_graph_Prune(gk_graph_t *graph, int what, int minf, int maxf)
{
  ssize_t i, j, nnz;
  int nrows, ncols;
  ssize_t *rowptr, *nrowptr;
  int *rowind, *nrowind, *collen;
  float *rowval, *nrowval;
  gk_graph_t *ngraph;

  ngraph = gk_graph_Create();
  
  nrows = ngraph->nrows = graph->nrows;
  ncols = ngraph->ncols = graph->ncols;

  rowptr = graph->rowptr;
  rowind = graph->rowind;
  rowval = graph->rowval;

  nrowptr = ngraph->rowptr = gk_zmalloc(nrows+1, "gk_graph_Prune: nrowptr");
  nrowind = ngraph->rowind = gk_imalloc(rowptr[nrows], "gk_graph_Prune: nrowind");
  nrowval = ngraph->rowval = gk_fmalloc(rowptr[nrows], "gk_graph_Prune: nrowval");


  switch (what) {
    case GK_CSR_COL:
      collen = gk_ismalloc(ncols, 0, "gk_graph_Prune: collen");

      for (i=0; i<nrows; i++) {
        for (j=rowptr[i]; j<rowptr[i+1]; j++) {
          ASSERT(rowind[j] < ncols);
          collen[rowind[j]]++;
        }
      }
      for (i=0; i<ncols; i++)
        collen[i] = (collen[i] >= minf && collen[i] <= maxf ? 1 : 0);

      nrowptr[0] = 0;
      for (nnz=0, i=0; i<nrows; i++) {
        for (j=rowptr[i]; j<rowptr[i+1]; j++) {
          if (collen[rowind[j]]) {
            nrowind[nnz] = rowind[j];
            nrowval[nnz] = rowval[j];
            nnz++;
          }
        }
        nrowptr[i+1] = nnz;
      }
      gk_free((void **)&collen, LTERM);
      break;

    case GK_CSR_ROW:
      nrowptr[0] = 0;
      for (nnz=0, i=0; i<nrows; i++) {
        if (rowptr[i+1]-rowptr[i] >= minf && rowptr[i+1]-rowptr[i] <= maxf) {
          for (j=rowptr[i]; j<rowptr[i+1]; j++, nnz++) {
            nrowind[nnz] = rowind[j];
            nrowval[nnz] = rowval[j];
          }
        }
        nrowptr[i+1] = nnz;
      }
      break;

    default:
      gk_graph_Free(&ngraph);
      gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
      return NULL;
  }

  return ngraph;
}