gk_i2cc2i_t *gk_i2cc2i_create_common(char *alphabet) { int nsymbols; gk_idx_t i; gk_i2cc2i_t *t; nsymbols = strlen(alphabet); t = gk_malloc(sizeof(gk_i2cc2i_t),"gk_i2c_create_common"); t->n = nsymbols; t->i2c = gk_cmalloc(256, "gk_i2c_create_common"); t->c2i = gk_imalloc(256, "gk_i2c_create_common"); gk_cset(256, -1, t->i2c); gk_iset(256, -1, t->c2i); for(i=0;i<nsymbols;i++){ t->i2c[i] = alphabet[i]; t->c2i[(int)alphabet[i]] = i; } return t; }
gk_graph_t *gk_graph_ExtractPartition(gk_graph_t *graph, int *part, int pid) { ssize_t i, j, nnz; gk_graph_t *ngraph; ngraph = gk_graph_Create(); ngraph->nrows = 0; ngraph->ncols = graph->ncols; for (nnz=0, i=0; i<graph->nrows; i++) { if (part[i] == pid) { ngraph->nrows++; nnz += graph->rowptr[i+1]-graph->rowptr[i]; } } ngraph->rowptr = gk_zmalloc(ngraph->nrows+1, "gk_graph_ExtractPartition: rowptr"); ngraph->rowind = gk_imalloc(nnz, "gk_graph_ExtractPartition: rowind"); ngraph->rowval = gk_fmalloc(nnz, "gk_graph_ExtractPartition: rowval"); ngraph->rowptr[0] = 0; for (nnz=0, j=0, i=0; i<graph->nrows; i++) { if (part[i] == pid) { gk_icopy(graph->rowptr[i+1]-graph->rowptr[i], graph->rowind+graph->rowptr[i], ngraph->rowind+nnz); gk_fcopy(graph->rowptr[i+1]-graph->rowptr[i], graph->rowval+graph->rowptr[i], ngraph->rowval+nnz); nnz += graph->rowptr[i+1]-graph->rowptr[i]; ngraph->rowptr[++j] = nnz; } } ASSERT(j == ngraph->nrows); return ngraph; }
gk_graph_t *gk_graph_ExtractRows(gk_graph_t *graph, int nrows, int *rind) { ssize_t i, ii, j, nnz; gk_graph_t *ngraph; ngraph = gk_graph_Create(); ngraph->nrows = nrows; ngraph->ncols = graph->ncols; for (nnz=0, i=0; i<nrows; i++) nnz += graph->rowptr[rind[i]+1]-graph->rowptr[rind[i]]; ngraph->rowptr = gk_zmalloc(ngraph->nrows+1, "gk_graph_ExtractPartition: rowptr"); ngraph->rowind = gk_imalloc(nnz, "gk_graph_ExtractPartition: rowind"); ngraph->rowval = gk_fmalloc(nnz, "gk_graph_ExtractPartition: rowval"); ngraph->rowptr[0] = 0; for (nnz=0, j=0, ii=0; ii<nrows; ii++) { i = rind[ii]; gk_icopy(graph->rowptr[i+1]-graph->rowptr[i], graph->rowind+graph->rowptr[i], ngraph->rowind+nnz); gk_fcopy(graph->rowptr[i+1]-graph->rowptr[i], graph->rowval+graph->rowptr[i], ngraph->rowval+nnz); nnz += graph->rowptr[i+1]-graph->rowptr[i]; ngraph->rowptr[++j] = nnz; } ASSERT(j == ngraph->nrows); return ngraph; }
void gk_find_frequent_itemsets(int ntrans, int *tranptr, int *tranind, int minfreq, int maxfreq, int minlen, int maxlen, void (*process_itemset)(void *stateptr, int nitems, int *itemids, int ntrans, int *transids), void *stateptr) { ssize_t i; gk_csr_t *mat, *pmat; isparams_t params; int *pattern; /* Create the matrix */ mat = gk_csr_Create(); mat->nrows = ntrans; mat->ncols = tranind[gk_iargmax(tranptr[ntrans], tranind)]+1; mat->rowptr = gk_zmalloc(ntrans+1, "gk_find_frequent_itemsets: mat.rowptr"); for (i=0; i<ntrans+1; i++) mat->rowptr[i] = tranptr[i]; mat->rowind = gk_icopy(tranptr[ntrans], tranind, gk_imalloc(tranptr[ntrans], "gk_find_frequent_itemsets: mat.rowind")); mat->colids = gk_iincset(mat->ncols, 0, gk_imalloc(mat->ncols, "gk_find_frequent_itemsets: mat.colids")); /* Setup the parameters */ params.minfreq = minfreq; params.maxfreq = (maxfreq == -1 ? mat->nrows : maxfreq); params.minlen = minlen; params.maxlen = (maxlen == -1 ? mat->ncols : maxlen); params.tnitems = mat->ncols; params.callback = process_itemset; params.stateptr = stateptr; params.rmarker = gk_ismalloc(mat->nrows, 0, "gk_find_frequent_itemsets: rmarker"); params.cand = gk_ikvmalloc(mat->ncols, "gk_find_frequent_itemsets: cand"); /* Perform the initial projection */ gk_csr_CreateIndex(mat, GK_CSR_COL); pmat = itemsets_project_matrix(¶ms, mat, -1); gk_csr_Free(&mat); pattern = gk_imalloc(pmat->ncols, "gk_find_frequent_itemsets: pattern"); itemsets_find_frequent_itemsets(¶ms, pmat, 0, pattern); gk_csr_Free(&pmat); gk_free((void **)&pattern, ¶ms.rmarker, ¶ms.cand, LTERM); }
gk_seq_t *gk_seq_ReadGKMODPSSM(char *filename) { gk_seq_t *seq; gk_idx_t i, j, ii; size_t ntokens, nbytes, len; FILE *fpin; gk_Tokens_t tokens; static char *AAORDER = "ARNDCQEGHILKMFPSTWYVBZX*"; static int PSSMWIDTH = 20; char *header, line[MAXLINELEN]; gk_i2cc2i_t *converter; header = gk_cmalloc(PSSMWIDTH, "gk_seq_ReadGKMODPSSM: header"); converter = gk_i2cc2i_create_common(AAORDER); gk_getfilestats(filename, &len, &ntokens, NULL, &nbytes); len --; seq = gk_malloc(sizeof(gk_seq_t),"gk_seq_ReadGKMODPSSM"); gk_seq_init(seq); seq->len = len; seq->sequence = gk_imalloc(len, "gk_seq_ReadGKMODPSSM"); seq->pssm = gk_iAllocMatrix(len, PSSMWIDTH, 0, "gk_seq_ReadGKMODPSSM"); seq->psfm = gk_iAllocMatrix(len, PSSMWIDTH, 0, "gk_seq_ReadGKMODPSSM"); seq->nsymbols = PSSMWIDTH; seq->name = gk_getbasename(filename); fpin = gk_fopen(filename,"r","gk_seq_ReadGKMODPSSM"); /* Read the header line */ if (fgets(line, MAXLINELEN-1, fpin) == NULL) errexit("Unexpected end of file: %s\n", filename); gk_strtoupper(line); gk_strtokenize(line, " \t\n", &tokens); for (i=0; i<PSSMWIDTH; i++) header[i] = tokens.list[i][0]; gk_freetokenslist(&tokens); /* Read the rest of the lines */ for (i=0, ii=0; ii<len; ii++) { if (fgets(line, MAXLINELEN-1, fpin) == NULL) errexit("Unexpected end of file: %s\n", filename); gk_strtoupper(line); gk_strtokenize(line, " \t\n", &tokens); seq->sequence[i] = converter->c2i[(int)tokens.list[1][0]]; for (j=0; j<PSSMWIDTH; j++) { seq->pssm[i][converter->c2i[(int)header[j]]] = atoi(tokens.list[2+j]); seq->psfm[i][converter->c2i[(int)header[j]]] = atoi(tokens.list[2+PSSMWIDTH+j]); } gk_freetokenslist(&tokens); i++; } seq->len = i; /* Reset the length if certain characters were skipped */ gk_free((void **)&header, LTERM); gk_fclose(fpin); return seq; }
gk_csr_t *itemsets_project_matrix(isparams_t *params, gk_csr_t *mat, int cid) { ssize_t i, j, k, ii, pnnz; int nrows, ncols, pnrows, pncols; ssize_t *colptr, *pcolptr; int *colind, *colids, *pcolind, *pcolids, *rmarker; gk_csr_t *pmat; gk_ikv_t *cand; nrows = mat->nrows; ncols = mat->ncols; colptr = mat->colptr; colind = mat->colind; colids = mat->colids; rmarker = params->rmarker; cand = params->cand; /* Allocate space for the projected matrix based on what you know thus far */ pmat = gk_csr_Create(); pmat->nrows = pnrows = (cid == -1 ? nrows : colptr[cid+1]-colptr[cid]); /* Mark the rows that will be kept and determine the prowids */ if (cid == -1) { /* Initial projection */ gk_iset(nrows, 1, rmarker); } else { /* The other projections */ for (i=colptr[cid]; i<colptr[cid+1]; i++) rmarker[colind[i]] = 1; } /* Determine the length of each column that will be left in the projected matrix */ for (pncols=0, pnnz=0, i=cid+1; i<ncols; i++) { for (k=0, j=colptr[i]; j<colptr[i+1]; j++) { k += rmarker[colind[j]]; } if (k >= params->minfreq && k <= params->maxfreq) { cand[pncols].val = i; cand[pncols++].key = k; pnnz += k; } } /* Sort the columns in increasing order */ gk_ikvsorti(pncols, cand); /* Allocate space for the remaining fields of the projected matrix */ pmat->ncols = pncols; pmat->colids = pcolids = gk_imalloc(pncols, "itemsets_project_matrix: pcolids"); pmat->colptr = pcolptr = gk_zmalloc(pncols+1, "itemsets_project_matrix: pcolptr"); pmat->colind = pcolind = gk_imalloc(pnnz, "itemsets_project_matrix: pcolind"); /* Populate the projected matrix */ pcolptr[0] = 0; for (pnnz=0, ii=0; ii<pncols; ii++) { i = cand[ii].val; for (j=colptr[i]; j<colptr[i+1]; j++) { if (rmarker[colind[j]]) pcolind[pnnz++] = colind[j]; } pcolids[ii] = colids[i]; pcolptr[ii+1] = pnnz; } /* Reset the rmarker array */ if (cid == -1) { /* Initial projection */ gk_iset(nrows, 0, rmarker); } else { /* The other projections */ for (i=colptr[cid]; i<colptr[cid+1]; i++) rmarker[colind[i]] = 0; } return pmat; }
gk_graph_t *gk_graph_Prune(gk_graph_t *graph, int what, int minf, int maxf) { ssize_t i, j, nnz; int nrows, ncols; ssize_t *rowptr, *nrowptr; int *rowind, *nrowind, *collen; float *rowval, *nrowval; gk_graph_t *ngraph; ngraph = gk_graph_Create(); nrows = ngraph->nrows = graph->nrows; ncols = ngraph->ncols = graph->ncols; rowptr = graph->rowptr; rowind = graph->rowind; rowval = graph->rowval; nrowptr = ngraph->rowptr = gk_zmalloc(nrows+1, "gk_graph_Prune: nrowptr"); nrowind = ngraph->rowind = gk_imalloc(rowptr[nrows], "gk_graph_Prune: nrowind"); nrowval = ngraph->rowval = gk_fmalloc(rowptr[nrows], "gk_graph_Prune: nrowval"); switch (what) { case GK_CSR_COL: collen = gk_ismalloc(ncols, 0, "gk_graph_Prune: collen"); for (i=0; i<nrows; i++) { for (j=rowptr[i]; j<rowptr[i+1]; j++) { ASSERT(rowind[j] < ncols); collen[rowind[j]]++; } } for (i=0; i<ncols; i++) collen[i] = (collen[i] >= minf && collen[i] <= maxf ? 1 : 0); nrowptr[0] = 0; for (nnz=0, i=0; i<nrows; i++) { for (j=rowptr[i]; j<rowptr[i+1]; j++) { if (collen[rowind[j]]) { nrowind[nnz] = rowind[j]; nrowval[nnz] = rowval[j]; nnz++; } } nrowptr[i+1] = nnz; } gk_free((void **)&collen, LTERM); break; case GK_CSR_ROW: nrowptr[0] = 0; for (nnz=0, i=0; i<nrows; i++) { if (rowptr[i+1]-rowptr[i] >= minf && rowptr[i+1]-rowptr[i] <= maxf) { for (j=rowptr[i]; j<rowptr[i+1]; j++, nnz++) { nrowind[nnz] = rowind[j]; nrowval[nnz] = rowval[j]; } } nrowptr[i+1] = nnz; } break; default: gk_graph_Free(&ngraph); gk_errexit(SIGERR, "Unknown prunning type of %d\n", what); return NULL; } return ngraph; }