/************************************************************************* * This function writes weighted graph into a file **************************************************************************/ void WriteWgtGraph(char *filename, idxtype nvtxs, idxtype *xadj, idxtype *adjncy, idxtype *vwgt) { idxtype i, j; FILE *fpout; fpout = gk_fopen(filename, "w", __func__); mfprintf(fpout, "%D %D", nvtxs, xadj[nvtxs]/2); mfprintf(fpout, " %D", 10); for (i=0; i<nvtxs; i++) { fprintf(fpout, "\n"); fprintf(fpout, "%" PRIIDX, vwgt[i]); for (j=xadj[i]; j<xadj[i+1]; j++) fprintf(fpout, " %" PRIIDX, adjncy[j]+1); } gk_fclose(fpout); }
/************************************************************************ * This function reads the element node array of a i Mixed mesh **************************************************************************/ idxtype *ReadMgcnums(char *filename) { idxtype i; idxtype *mgc; FILE *fpin; fpin = gk_fopen(filename, "r", __func__); mgc = idxmalloc(36, "Readmgcnums: mgcnums"); for (i=0; i<36; i++) { if (i<6 || i%6==0) mgc[i]=-1; else mfscanf(fpin, "%D", mgc+i); } gk_fclose(fpin); return mgc; }
/************************************************************************* * This function reads the spd matrix **************************************************************************/ void ReadCoordinates(GraphType *graph, char *filename) { idxtype i, j, k, l, nvtxs, fmt, readew, readvw, ncon, edge, ewgt; FILE *fpin; char *line; fpin = gk_fopen(filename, "r", __func__); nvtxs = graph->nvtxs; graph->coords = gk_dsmalloc(3*nvtxs, 0.0, "ReadCoordinates: coords"); line = gk_cmalloc(MAXLINE+1, "ReadCoordinates: line"); for (i=0; i<nvtxs; i++) { fgets(line, MAXLINE, fpin); msscanf(line, "%lf %lf %lf", graph->coords+3*i+0, graph->coords+3*i+1, graph->coords+3*i+2); } gk_fclose(fpin); gk_free((void **)&line, LTERM); }
gk_seq_t *gk_seq_ReadGKMODPSSM(char *filename) { gk_seq_t *seq; gk_idx_t i, j, ii; size_t ntokens, nbytes, len; FILE *fpin; gk_Tokens_t tokens; static char *AAORDER = "ARNDCQEGHILKMFPSTWYVBZX*"; static int PSSMWIDTH = 20; char *header, line[MAXLINELEN]; gk_i2cc2i_t *converter; header = gk_cmalloc(PSSMWIDTH, "gk_seq_ReadGKMODPSSM: header"); converter = gk_i2cc2i_create_common(AAORDER); gk_getfilestats(filename, &len, &ntokens, NULL, &nbytes); len --; seq = gk_malloc(sizeof(gk_seq_t),"gk_seq_ReadGKMODPSSM"); gk_seq_init(seq); seq->len = len; seq->sequence = gk_imalloc(len, "gk_seq_ReadGKMODPSSM"); seq->pssm = gk_iAllocMatrix(len, PSSMWIDTH, 0, "gk_seq_ReadGKMODPSSM"); seq->psfm = gk_iAllocMatrix(len, PSSMWIDTH, 0, "gk_seq_ReadGKMODPSSM"); seq->nsymbols = PSSMWIDTH; seq->name = gk_getbasename(filename); fpin = gk_fopen(filename,"r","gk_seq_ReadGKMODPSSM"); /* Read the header line */ if (fgets(line, MAXLINELEN-1, fpin) == NULL) errexit("Unexpected end of file: %s\n", filename); gk_strtoupper(line); gk_strtokenize(line, " \t\n", &tokens); for (i=0; i<PSSMWIDTH; i++) header[i] = tokens.list[i][0]; gk_freetokenslist(&tokens); /* Read the rest of the lines */ for (i=0, ii=0; ii<len; ii++) { if (fgets(line, MAXLINELEN-1, fpin) == NULL) errexit("Unexpected end of file: %s\n", filename); gk_strtoupper(line); gk_strtokenize(line, " \t\n", &tokens); seq->sequence[i] = converter->c2i[(int)tokens.list[1][0]]; for (j=0; j<PSSMWIDTH; j++) { seq->pssm[i][converter->c2i[(int)header[j]]] = atoi(tokens.list[2+j]); seq->psfm[i][converter->c2i[(int)header[j]]] = atoi(tokens.list[2+PSSMWIDTH+j]); } gk_freetokenslist(&tokens); i++; } seq->len = i; /* Reset the length if certain characters were skipped */ gk_free((void **)&header, LTERM); gk_fclose(fpin); return seq; }
void ComputeNeighbors(params_t *params) { int i, j, nhits; gk_csr_t *mat; int32_t *marker; gk_fkv_t *hits, *cand; FILE *fpout; printf("Reading data for %s...\n", params->infstem); mat = gk_csr_Read(params->infstem, GK_CSR_FMT_CSR, 1, 0); printf("#docs: %d, #nnz: %d.\n", mat->nrows, mat->rowptr[mat->nrows]); /* compact the column-space of the matrices */ gk_csr_CompactColumns(mat); /* perform auxiliary normalizations/pre-computations based on similarity */ gk_csr_Normalize(mat, GK_CSR_ROW, 2); /* create the inverted index */ gk_csr_CreateIndex(mat, GK_CSR_COL); /* create the output file */ fpout = (params->outfile ? gk_fopen(params->outfile, "w", "ComputeNeighbors: fpout") : NULL); /* allocate memory for the necessary working arrays */ hits = gk_fkvmalloc(mat->nrows, "ComputeNeighbors: hits"); marker = gk_i32smalloc(mat->nrows, -1, "ComputeNeighbors: marker"); cand = gk_fkvmalloc(mat->nrows, "ComputeNeighbors: cand"); /* find the best neighbors for each query document */ gk_startwctimer(params->timer_1); for (i=0; i<mat->nrows; i++) { if (params->verbosity > 0) printf("Working on query %7d\n", i); /* find the neighbors of the ith document */ nhits = gk_csr_GetSimilarRows(mat, mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], mat->rowval+mat->rowptr[i], GK_CSR_COS, params->nnbrs, params->minsim, hits, marker, cand); /* write the results in the file */ if (fpout) { for (j=0; j<nhits; j++) fprintf(fpout, "%8d %8d %.3f\n", i, hits[j].val, hits[j].key); } } gk_stopwctimer(params->timer_1); /* cleanup and exit */ if (fpout) gk_fclose(fpout); gk_free((void **)&hits, &marker, &cand, LTERM); gk_csr_Free(&mat); return; }
/************************************************************************* * This function reads the spd matrix **************************************************************************/ void ReadGraph(GraphType *graph, char *filename, idxtype *wgtflag) { idxtype i, j, k, l, fmt, readew, readvw, ncon, edge, ewgt; idxtype *xadj, *adjncy, *vwgt, *adjwgt; char *line, *oldstr, *newstr; FILE *fpin; InitGraph(graph); line = gk_cmalloc(MAXLINE+1, "ReadGraph: line"); fpin = gk_fopen(filename, "r", __func__); do { fgets(line, MAXLINE, fpin); } while (line[0] == '%' && !feof(fpin)); if (feof(fpin)) { graph->nvtxs = 0; gk_free((void **)&line, LTERM); return; } fmt = ncon = 0; msscanf(line, "%D %D %D %D", &(graph->nvtxs), &(graph->nedges), &fmt, &ncon); readew = (fmt%10 > 0); readvw = ((fmt/10)%10 > 0); if (fmt >= 100) { mprintf("Cannot read this type of file format!"); exit(0); } *wgtflag = 0; if (readew) *wgtflag += 1; if (readvw) *wgtflag += 2; if (ncon > 0 && !readvw) { mprintf("------------------------------------------------------------------------------\n"); mprintf("*** I detected an error in your input file ***\n\n"); mprintf("You specified ncon=%D, but the fmt parameter does not specify vertex weights\n", ncon); mprintf("Make sure that the fmt parameter is set to either 10 or 11.\n"); mprintf("------------------------------------------------------------------------------\n"); exit(0); } graph->nedges *=2; ncon = graph->ncon = (ncon == 0 ? 1 : ncon); /*mprintf("%D %D %D %D %D [%D %D]\n", fmt, fmt%10, (fmt/10)%10, ncon, graph->ncon, readew, readvw);*/ if (graph->nvtxs > MAXIDX) errexit("\nThe matrix is too big: %d [%d %d]\n", graph->nvtxs, MAXIDX, sizeof(idxtype)); xadj = graph->xadj = idxsmalloc(graph->nvtxs+1, 0, "ReadGraph: xadj"); adjncy = graph->adjncy = idxmalloc(graph->nedges, "ReadGraph: adjncy"); vwgt = graph->vwgt = (readvw ? idxmalloc(ncon*graph->nvtxs, "ReadGraph: vwgt") : NULL); adjwgt = graph->adjwgt = (readew ? idxmalloc(graph->nedges, "ReadGraph: adjwgt") : NULL); /* Start reading the graph file */ for (xadj[0]=0, k=0, i=0; i<graph->nvtxs; i++) { do { fgets(line, MAXLINE, fpin); } while (line[0] == '%' && !feof(fpin)); oldstr = line; newstr = NULL; if (strlen(line) == MAXLINE) errexit("\nBuffer for fgets not big enough!\n"); if (readvw) { for (l=0; l<ncon; l++) { vwgt[i*ncon+l] = strtoidx(oldstr, &newstr, 10); oldstr = newstr; } } for (;;) { edge = strtoidx(oldstr, &newstr, 10) -1; oldstr = newstr; if (readew) { ewgt = strtoidx(oldstr, &newstr, 10); oldstr = newstr; } if (edge < 0) break; adjncy[k] = edge; if (readew) adjwgt[k] = ewgt; k++; } xadj[i+1] = k; } gk_fclose(fpin); if (k != graph->nedges) { mprintf("------------------------------------------------------------------------------\n"); mprintf("*** I detected an error in your input file ***\n\n"); mprintf("In the first line of the file, you specified that the graph contained\n%D edges. However, I only found %D edges in the file.\n", graph->nedges/2, k/2); if (2*k == graph->nedges) { mprintf("\n *> I detected that you specified twice the number of edges that you have in\n"); mprintf(" the file. Remember that the number of edges specified in the first line\n"); mprintf(" counts each edge between vertices v and u only once.\n\n"); } mprintf("Please specify the correct number of edges in the first line of the file.\n"); mprintf("------------------------------------------------------------------------------\n"); exit(0); } gk_free((void **)&line, LTERM); }
int main(int argc, char *argv[]) { ssize_t i, j, niter; params_t *params; gk_csr_t *mat; FILE *fpout; /* get command-line options */ params = parse_cmdline(argc, argv); /* read the data */ mat = gk_csr_Read(params->infile, GK_CSR_FMT_METIS, 1, 1); /* display some basic stats */ print_init_info(params, mat); if (params->ntvs != -1) { /* compute the pr for different randomly generated restart-distribution vectors */ float **prs; prs = gk_fAllocMatrix(params->ntvs, mat->nrows, 0.0, "main: prs"); /* generate the random restart vectors */ for (j=0; j<params->ntvs; j++) { for (i=0; i<mat->nrows; i++) prs[j][i] = RandomInRange(931); gk_fscale(mat->nrows, 1.0/gk_fsum(mat->nrows, prs[j], 1), prs[j], 1); niter = gk_rw_PageRank(mat, params->lamda, params->eps, params->niter, prs[j]); printf("tvs#: %zd; niters: %zd\n", j, niter); } /* output the computed pr scores */ fpout = gk_fopen(params->outfile, "w", "main: outfile"); for (i=0; i<mat->nrows; i++) { for (j=0; j<params->ntvs; j++) fprintf(fpout, "%.4e ", prs[j][i]); fprintf(fpout, "\n"); } gk_fclose(fpout); gk_fFreeMatrix(&prs, params->ntvs, mat->nrows); } else if (params->ppr != -1) { /* compute the personalized pr from the specified vertex */ float *pr; pr = gk_fsmalloc(mat->nrows, 0.0, "main: pr"); pr[params->ppr-1] = 1.0; niter = gk_rw_PageRank(mat, params->lamda, params->eps, params->niter, pr); printf("ppr: %d; niters: %zd\n", params->ppr, niter); /* output the computed pr scores */ fpout = gk_fopen(params->outfile, "w", "main: outfile"); for (i=0; i<mat->nrows; i++) fprintf(fpout, "%.4e\n", pr[i]); gk_fclose(fpout); gk_free((void **)&pr, LTERM); } else { /* compute the standard pr */ int jmax; float diff, maxdiff; float *pr; pr = gk_fsmalloc(mat->nrows, 1.0/mat->nrows, "main: pr"); niter = gk_rw_PageRank(mat, params->lamda, params->eps, params->niter, pr); printf("pr; niters: %zd\n", niter); /* output the computed pr scores */ fpout = gk_fopen(params->outfile, "w", "main: outfile"); for (i=0; i<mat->nrows; i++) { for (jmax=i, maxdiff=0.0, j=mat->rowptr[i]; j<mat->rowptr[i+1]; j++) { if ((diff = fabs(pr[i]-pr[mat->rowind[j]])) > maxdiff) { maxdiff = diff; jmax = mat->rowind[j]; } } fprintf(fpout, "%.4e %10zd %.4e %10d\n", pr[i], mat->rowptr[i+1]-mat->rowptr[i], maxdiff, jmax+1); } gk_fclose(fpout); gk_free((void **)&pr, LTERM); } gk_csr_Free(&mat); /* display some final stats */ print_final_info(params); }
gk_graph_t *gk_graph_Read(char *filename, int format, int isfewgts, int isfvwgts, int isfvsizes) { ssize_t i, k, l; size_t nfields, nvtxs, nedges, fmt, ncon, lnlen; int32_t ival; float fval; int readsizes=0, readwgts=0, readvals=0, numbering=0; char *line=NULL, *head, *tail, fmtstr[256]; FILE *fpin=NULL; gk_graph_t *graph=NULL; if (!gk_fexists(filename)) gk_errexit(SIGERR, "File %s does not exist!\n", filename); if (format == GK_GRAPH_FMT_METIS) { fpin = gk_fopen(filename, "r", "gk_graph_Read: fpin"); do { if (gk_getline(&line, &lnlen, fpin) <= 0) gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename); } while (line[0] == '%'); fmt = ncon = 0; nfields = sscanf(line, "%zu %zu %zu %zu", &nvtxs, &nedges, &fmt, &ncon); if (nfields < 2) gk_errexit(SIGERR, "Header line must contain at least 2 integers (#vtxs and #edges).\n"); nedges *= 2; if (fmt > 111) gk_errexit(SIGERR, "Cannot read this type of file format [fmt=%zu]!\n", fmt); sprintf(fmtstr, "%03zu", fmt%1000); readsizes = (fmtstr[0] == '1'); readwgts = (fmtstr[1] == '1'); readvals = (fmtstr[2] == '1'); numbering = 1; ncon = (ncon == 0 ? 1 : ncon); } else { gk_errexit(SIGERR, "Unrecognized format: %d\n", format); } graph = gk_graph_Create(); graph->nvtxs = nvtxs; graph->xadj = gk_zmalloc(nvtxs+1, "gk_graph_Read: xadj"); graph->adjncy = gk_i32malloc(nedges, "gk_graph_Read: adjncy"); if (readvals) { if (isfewgts) graph->fadjwgt = gk_fmalloc(nedges, "gk_graph_Read: fadjwgt"); else graph->iadjwgt = gk_i32malloc(nedges, "gk_graph_Read: iadjwgt"); } if (readsizes) { if (isfvsizes) graph->fvsizes = gk_fmalloc(nvtxs, "gk_graph_Read: fvsizes"); else graph->ivsizes = gk_i32malloc(nvtxs, "gk_graph_Read: ivsizes"); } if (readwgts) { if (isfvwgts) graph->fvwgts = gk_fmalloc(nvtxs*ncon, "gk_graph_Read: fvwgts"); else graph->ivwgts = gk_i32malloc(nvtxs*ncon, "gk_graph_Read: ivwgts"); } /*---------------------------------------------------------------------- * Read the sparse graph file *---------------------------------------------------------------------*/ numbering = (numbering ? - 1 : 0); for (graph->xadj[0]=0, k=0, i=0; i<nvtxs; i++) { do { if (gk_getline(&line, &lnlen, fpin) == -1) gk_errexit(SIGERR, "Pregraphure end of input file: file while reading row %d\n", i); } while (line[0] == '%'); head = line; tail = NULL; /* Read vertex sizes */ if (readsizes) { if (isfvsizes) { #ifdef __MSC__ graph->fvsizes[i] = (float)strtod(head, &tail); #else graph->fvsizes[i] = strtof(head, &tail); #endif if (tail == head) gk_errexit(SIGERR, "The line for vertex %zd does not have size information\n", i+1); if (graph->fvsizes[i] < 0) gk_errexit(SIGERR, "The size for vertex %zd must be >= 0\n", i+1); } else { graph->ivsizes[i] = strtol(head, &tail, 0); if (tail == head) gk_errexit(SIGERR, "The line for vertex %zd does not have size information\n", i+1); if (graph->ivsizes[i] < 0) gk_errexit(SIGERR, "The size for vertex %zd must be >= 0\n", i+1); } head = tail; } /* Read vertex weights */ if (readwgts) { for (l=0; l<ncon; l++) { if (isfvwgts) { #ifdef __MSC__ graph->fvwgts[i*ncon+l] = (float)strtod(head, &tail); #else graph->fvwgts[i*ncon+l] = strtof(head, &tail); #endif if (tail == head) gk_errexit(SIGERR, "The line for vertex %zd does not have enough weights " "for the %d constraints.\n", i+1, ncon); if (graph->fvwgts[i*ncon+l] < 0) gk_errexit(SIGERR, "The weight vertex %zd and constraint %zd must be >= 0\n", i+1, l); } else { graph->ivwgts[i*ncon+l] = strtol(head, &tail, 0); if (tail == head) gk_errexit(SIGERR, "The line for vertex %zd does not have enough weights " "for the %d constraints.\n", i+1, ncon); if (graph->ivwgts[i*ncon+l] < 0) gk_errexit(SIGERR, "The weight vertex %zd and constraint %zd must be >= 0\n", i+1, l); } head = tail; } } /* Read the rest of the row */ while (1) { ival = (int)strtol(head, &tail, 0); if (tail == head) break; head = tail; if ((graph->adjncy[k] = ival + numbering) < 0) gk_errexit(SIGERR, "Error: Invalid column number %d at row %zd.\n", ival, i); if (readvals) { if (isfewgts) { #ifdef __MSC__ fval = (float)strtod(head, &tail); #else fval = strtof(head, &tail); #endif if (tail == head) gk_errexit(SIGERR, "Value could not be found for edge! Vertex:%zd, NNZ:%zd\n", i, k); graph->fadjwgt[k] = fval; } else { ival = strtol(head, &tail, 0); if (tail == head) gk_errexit(SIGERR, "Value could not be found for edge! Vertex:%zd, NNZ:%zd\n", i, k); graph->iadjwgt[k] = ival; } head = tail; } k++; } graph->xadj[i+1] = k; } if (k != nedges) gk_errexit(SIGERR, "gk_graph_Read: Something wrong with the number of edges in " "the input file. nedges=%zd, Actualnedges=%zd.\n", nedges, k); gk_fclose(fpin); gk_free((void **)&line, LTERM); return graph; }