main(int argc, char **argv) { int i, j, k, l, m, n, centro[200], len_chroseq[100]; int maxi; int num_seq; char **chrname, **repnames; int num_chro; int *counts, maxc; SEGMENT *segment; int num_segment; int *repeats, num_repeats; char name[1000], temp[1000], c; double id; FILE *fp; readpar(); initenv(argc, argv); /* Input chromsomal information */ chrname = alloc_name(100, 100); fp = ckopen(lenfile, "r"); num_chro = read_chro_centro(chrname, len_chroseq, centro, fp); fclose(fp); nchro = findgenname(chriname, chrname, num_chro); /* input segments */ segment = (SEGMENT *) ckalloc(60000 * sizeof(SEGMENT)); fp = stdin; num_segment = input_segment(segment, chrname, num_chro, fp); /* sort the segments */ /* qsort((void *) segment, num_segment, sizeof(SEGMENT), (void *) segcompar); */ /* Define repeats from sub-repeats */ repeats = (int *) ckalloc(num_segment * sizeof(int)); num_repeats = 0; counts = (int *) ckalloc(10000 * num_segment * sizeof(int)); n = m = 0; j = 0; for(i = 0; i < num_segment; i ++) { if(i == num_segment - 1 || segment[i + 1].pos[0] > segment[i].pos[1] + min_length || segment[i + 1].chro != segment[i].chro) { repeats[num_repeats ++] = i; if(segment[i].chro == nchro) j ++; } if(segment[i].pos[1] - segment[i].pos[0] < min_length) continue; counts[segment[i].eq_pos[0]] ++; if(segment[i].eq_pos[0] > n) n = segment[i].eq_pos[0]; } m += n; k = maxc = 0; for(i = 0; i < m; i ++) { if(counts[i] > 1) k ++; if(counts[i] > maxc) { maxi = i; maxc = counts[i]; } } printf("m %d k %d\n", m, k); printf("%d repeats (%s %d) %d subrepeats %d max_multip subrepeats index %d.\n", num_repeats, chriname, j, k, maxc, maxi); free((void *) counts); chrname = free_name(chrname, 100); free((void *) repeats); free((void *) segment); }
/*--------------------------------------------------------*/ int main(int argc, char *argv[]) { char *parfname, *inpfname, *outfname; int i, nobj, nterm, ik, ik0, delta, niter; float *x, *nx, *nzx, *zx, *y, *ny, *nzy, *zy, dx, dy, rr, thresh, sx, sy; double *coeffx, *coeffy; FILE *outf; PARAMS par; /* IO stuff */ if (argc != 4) { printf("\n\tUSAGE: xygrid parameter_file input_list coeff_file\n\n"); exit(1); } parfname = argv[1]; inpfname = argv[2]; outfname = argv[3]; readpar(parfname, &par); nterm = (par.ndeg+1)*(par.ndeg+2)/2; nobj=readcoor(inpfname, &x, &y, &zx, &zy); if (!(coeffx=(double *)calloc(nterm, sizeof(double)))) errmess("calloc(coeffx)"); if (!(coeffy=(double *)calloc(nterm, sizeof(double)))) errmess("calloc(coeffy)"); /* make initial fit */ fit(x, y, zx, par.ndeg, nobj, coeffx); fit(x, y, zy, par.ndeg, nobj, coeffy); sx = sigma(x, y, zx, par.ndeg, nobj, coeffx); sy = sigma(x, y, zy, par.ndeg, nobj, coeffy); thresh = par.sigmaf*par.sigmaf*(sx*sx + sy*sy); ik = nobj; delta = 1; niter = 0; if (!(nx=(float *)calloc(nobj, sizeof(float)))) errmess("readcoor: calloc(nx)"); if (!(ny=(float *)calloc(nobj, sizeof(float)))) errmess("readcoor: calloc(ny)"); if (!(nzx=(float *)calloc(nobj, sizeof(float)))) errmess("readcoor: calloc(nzx)"); if (!(nzy=(float *)calloc(nobj, sizeof(float)))) errmess("readcoor: calloc(nzy)"); /* sigma clipping of the fit until MAX_NITER reached or nothing changes */ while ((delta > 0) && (niter < par.maxniter)) { ik0 = ik; ik = 0; for (i=0; i<nobj; i++) { dx = poly(x[i], y[i], par.ndeg, coeffx) - zx[i]; dy = poly(x[i], y[i], par.ndeg, coeffy) - zy[i]; rr = dx*dx + dy*dy; nx[ik] = x[i]; ny[ik] = y[i]; nzx[ik] = zx[i]; nzy[ik] = zy[i]; if (rr < thresh) ++ik; } delta = ik0 - ik; fit(nx, ny, nzx, par.ndeg, ik, coeffx); fit(nx, ny, nzy, par.ndeg, ik, coeffy); sx = sigma(nx, ny, nzx, par.ndeg, ik, coeffx); sy = sigma(nx, ny, nzy, par.ndeg, ik, coeffy); niter++; } free(x); free(y); free(zx); free(zy); free(nx); free(ny); free(nzx); free(nzy); /* print results and store coefficients in binary file */ if (par.verbose) { printf("\n"); for (i=0; i<nterm; i++) { printf("coeffx[%d] = %9.6f ", i, coeffx[i]); printf("coeffy[%d] = %9.6f \n", i, coeffy[i]); } } printf("%s: sigmax= %.4f sigmay= %.4f ndata= %d nleft= %d\n", outfname, sx, sy, nobj, ik); if (!(outf=fopen(outfname, "w"))) errmess(outfname); fwrite(&nterm, sizeof(int), 1, outf); fwrite(coeffx, sizeof(double), nterm, outf); fwrite(coeffy, sizeof(double), nterm, outf); fclose(outf); free(coeffx); free(coeffy); return(0); }
main(int argc, char **argv) { int i, j, k, l, m, n; char **src_seq, **src_name; int *len_seq, num_seq; char temp[100]; ALIGN **align, *aln, *aln0; FILE *fp; readpar(); random1(&idum); initenv(argc, argv); /* Input the length of the reads (required) */ len_seq = (int *) ckalloc(2 * sizeof(int)); src_seq = (char **) ckalloc(2 * sizeof(char *)); src_name = (char **) ckalloc(1 * sizeof(char *)); src_name[0] = (char *) ckalloc(100 * sizeof(char)); fp = ckopen(seqfile, "r"); num_seq = readseq1by1(src_seq, src_name, len_seq, fp); fclose(fp); printf("Genome length: %d\n", len_seq[0]); /* Make reverse complements of input sequences rev(i) --> i + num_seq */ len_seq[1] = len_seq[0]; src_seq[1] = (char *) ckalloc(len_seq[0] * sizeof(char)); for(j = 0; j < len_seq[0]; j ++) { src_seq[1][j] = rev(src_seq[0][len_seq[0] - j - 1]); } /* read in pairwise alignments by Reputer */ align = (ALIGN **) ckalloc(2 * sizeof(ALIGN *)); fp = ckopen(inpfile, "r"); n = readph(align, src_seq, len_seq, fp, min_leg, min_id); fclose(fp); printf("# alignments input: %d.\n", n); /* Write alignments */ fp = ckopen(outfile, "w"); for(m = 0; m < 2; m ++) { n = size_align(align[m]); fwrite(&n, sizeof(int), 1, fp); aln = align[m]; while(aln) { fwrite(&(aln -> reads[1]), sizeof(int), 1, fp); fwrite(&(aln -> mis_match), sizeof(int), 1, fp); fwrite(&(aln -> length), sizeof(int), 1, fp); fwrite(aln -> pos[0], sizeof(int), aln -> length, fp); fwrite(aln -> pos[1], sizeof(int), aln -> length, fp); aln0 = aln -> next; free((void *) aln -> pos[0]); free((void *) aln -> pos[1]); free((void *) aln); aln = aln0; } } fclose(fp); printf("Done...\n"); free((void **) align); for(i = 0; i < 2 * num_seq; i ++) { free((void *) src_seq[i]); } for(i = 0; i < num_seq; i ++) { free((void *) src_name[i]); } free((void **) src_seq); free((void **) src_name); free((void *) len_seq); }
main(int argc, char **argv) { int i, j, k, l, m, n; int dist[20]; int reads; int num_vertex, num_class, num_edge; int *len_seq, num_seq, num_remain; int **num_pa; char **src_seq, **src_name; char temp[100]; ALIGN **eq_class, *align; EDGE **edge, *edge1, *edge2, *bal_edge1, *bal_edge2; PATH *path; int num_path; NODES **vertex, *begin, *node, *node_next, **start_node; LIST **list; READINTERVAL *readinterval; POSITION *position; FILE *fp, *fp1; readpar(); random1(&idum); initenv(argc, argv); printf("%d %d %d\n", sizeof(POSITION), sizeof(NODES), sizeof(LIST)); /* Input the length of the genome (required) */ len_seq = (int *) ckalloc(2 * MAX_NUM * sizeof(int)); src_name = alloc_name(MAX_NUM, 100); fp = ckopen(lenfile, "r"); num_seq = readlen(fp, len_seq, src_name); fclose(fp); src_seq = (char **) ckalloc(2 * num_seq * sizeof(char *)); l = 0; printf("Genome length: "); for(i = 0; i < num_seq; i ++) { l += len_seq[i]; printf("%d ", len_seq[i]); } printf("\n"); printf("Total length: %d\n", l); /* Make reverse complements of input sequences rev(i) --> i + num_seq */ for(i = 0; i < num_seq; i ++) { len_seq[i + num_seq] = len_seq[i]; src_seq[i] = (char *) ckalloc(len_seq[i] * sizeof(char)); src_seq[i + num_seq] = (char *) ckalloc(len_seq[i] * sizeof(char)); for(j = 0; j < len_seq[i]; j ++) { src_seq[num_seq + i][j] = rev(src_seq[i][len_seq[i] - j - 1]); } } /* Input equivalent readintervales between reads -- see the format of the equivalent readinterval files */ printf("Read equivalent readintervales...\n"); eq_class = (ALIGN **) ckalloc(2 * num_seq * sizeof(ALIGN *)); fp = ckopen(inpfile, "r"); num_class = readclass(eq_class, num_seq, fp); fclose(fp); printf("# equivalent readintervales input: %d\n", num_class); /* for(i = 0; i < 2 * num_seq; i ++) { align = eq_class[i]; while(align) { printf("See: \n"); output_align(align, src_name, src_seq, len_seq, num_seq); getchar(); align = align -> next; } } */ /* Initialize the nodes: each position in each read is assigned as a new node. An array of "list" is set up for each read */ list = (LIST **) ckalloc(2 * num_seq * sizeof(LIST *)); for(i = 0; i < 2 * num_seq; i ++) { list[i] = (LIST *) ckalloc(len_seq[i] * sizeof(LIST)); } printf("intitialize nodes...\n"); initialize(list, len_seq, num_seq); printf("done.\n"); n = countnode(list, len_seq, 2 * num_seq); printf("# of nodes before merge: %d\n", n); /* Glue together two nodes if their corresponding positions are defined as equivalent in a pairwise alignment */ printf("Merge...\n"); merge(num_seq, len_seq, eq_class, num_class, list); printf("done.\n"); for(i = 0; i < num_seq; i ++) { while(eq_class[i]) { eq_class[i] = free_align(eq_class[i]); } } free((void **) eq_class); /* Compute the width of each node */ for(i = 0; i < 2 * num_seq; i ++) { for(j = 0; j < len_seq[i]; j ++) { if(!list[i][j].node -> visit) { list[i][j].node -> num_path = countthickness(list[i][j].node); list[i][j].node -> visit = 1; } } } cleannode(list, len_seq, 2 * num_seq); n = countnode(list, len_seq, 2 * num_seq); printf("# of nodes after merge: %d\n", n); /* Add edges to the graph */ edge = (EDGE **) ckalloc(n * sizeof(EDGE *)); num_edge = graph(num_seq, len_seq, list, edge); printf("# edges: %d\n", num_edge); start_node = (NODES **) ckalloc(num_seq * sizeof(NODES *)); for(i = 0; i < num_seq; i ++) { if(len_seq[i] > 0) { start_node[i] = list[i][0].node; } else { start_node[i] = (NODES *) NULL; } } for(i = 0; i < 2 * num_seq; i ++) { free((void *) list[i]); } free((void **) list); vertex = (NODES **) ckalloc(2 * num_edge * sizeof(NODES *)); num_vertex = count_vertex(edge, num_edge, vertex); free((void **) edge); num_pa = (int **) ckalloc(MAX_BRA * sizeof(int *)); for(i = 0; i < MAX_BRA; i ++) { num_pa[i] = (int *) ckalloc(MAX_BRA * sizeof(int)); } num_edge = count_edge_simp(vertex, num_vertex, num_pa); printf("%d vertices %d edges (%d source %d sinks) remained.\n", num_vertex, num_edge, num_pa[0][1], num_pa[1][0]); /* Assign the complementary edges of each edge */ for(i = 0; i < num_vertex; i ++) { for(j = 0; j < vertex[i] -> num_nextedge; j ++) { edge1 = vertex[i] -> nextedge[j]; edge1 -> bal_edge = find_bal_edge(edge1, len_seq, num_seq, i); } } /* Remove bulges in the graph */ printf("Shave...\n"); num_vertex = shave_graph(vertex, num_vertex); printf("done.\n"); /* Remove cycles shorter than some threshold in the graph */ /* printf("Shaving graph...\n"); num_vertex = rem_cycle(vertex, num_vertex); printf("done.\n%d vertices remained.\n", num_vertex); */ /* remove short edges */ /* printf("Remove shortedges...\n"); num_vertex = rem_short_edge(vertex, num_vertex, len_seq); printf("done.\n%d vertices remained.\n", num_vertex); fflush(stdout); */ num_edge = count_edge_simp(vertex, num_vertex, num_pa); printf("%d vertices %d edges (%d source %d sinks) remained.\n", num_vertex, num_edge, num_pa[0][1], num_pa[1][0]); fflush(stdout); /* Allocate the spaces for paths */ printf("Allocating paths...\n"); for(i = 0; i < num_vertex; i ++) { vertex[i] -> num_path = 0; } /* Build sequence paths */ printf("Define paths...\n"); m = 0; for(i = 0; i < num_vertex; i ++) { for(j = 0; j < vertex[i] -> num_nextedge; j ++) { m += vertex[i] -> nextedge[j] -> multip; } } path = (PATH *) ckalloc(2 * num_seq * sizeof(PATH)); for(i = 0; i < 2 * num_seq; i ++) { path[i].edge = (EDGE **) ckalloc(m * sizeof(EDGE *)); } num_path = readpath(start_node, path, num_seq); free((void **) start_node); num_edge = count_edge_simp(vertex, num_vertex, num_pa); m = l = 0; for(i = 0; i < num_vertex; i ++) { for(j = 0; j < vertex[i] -> num_nextedge; j ++) { l += vertex[i] -> nextedge[j] -> length; if(vertex[i] -> nextedge[j] -> length > m) { m = vertex[i] -> nextedge[j] -> length; } } } printf("%d vertics %d edges (%d source %d sinks) remained: total length %d (maximal %d).\n", num_vertex, num_edge, num_pa[0][1], num_pa[1][0], l, m); fflush(stdout); /* Make consensus of edges */ initial_edge(vertex, num_vertex, src_seq, num_seq); printf("edge initialed\n"); /* Output sequence path */ n = 0; for(i = 0; i < num_vertex; i ++) { vertex[i] -> visit = i; for(j = 0; j < vertex[i] -> num_nextedge; j ++) { vertex[i] -> nextedge[j] -> start_cover = n; n ++; } } for(m = 0; m < num_seq; m ++) { printf("len_path %d\n", path[m].len_path); printf("Sequence%d: ", m + 1); for(i = 0; i < path[m].len_path; i ++) { printf("%d -- %d(%d,%d) --> ", path[m].edge[i] -> begin -> visit, path[m].edge[i] -> start_cover, path[m].edge[i] -> multip, path[m].edge[i] -> length); if(i % 5 == 4) { printf("\n"); } } if(path[m].len_path > 0) { printf("%d\n", path[m].edge[i - 1] -> end -> visit); } else { printf("\n"); } fflush(stdout); } /* Output graph & contigs */ sprintf(temp, "%s.edge", seqfile); fp = ckopen(temp, "w"); sprintf(temp, "%s.graph", seqfile); fp1 = ckopen(temp, "w"); write_graph(vertex, num_vertex, fp, fp1); fclose(fp); fclose(fp1); /* Output read intervals in each edge */ sprintf(temp, "%s.intv", seqfile); fp = ckopen(temp, "w"); write_interval(vertex, num_vertex, fp); fclose(fp); /* Output graphviz format graph */ sprintf(temp, "%s", outfile); fp = ckopen(temp, "w"); output_graph(vertex, num_vertex, fp); fclose(fp); for(i = 0; i < MAX_BRA; i ++) { free((void *) num_pa[i]); } free((void **) num_pa); for(i = 0; i < 2 * num_seq; i ++) { if(path[i].len_path > 0) { free((void **) path[i].edge); } } free((void *) path); free_graph(vertex, num_vertex); for(i = 0; i < 2 * num_seq; i ++) { free((void *) src_seq[i]); } free((void **) src_seq); free_name(src_name, MAX_NUM); free((void *) len_seq); }
/*========================================== * main *========================================== */ int main(int argc, char* argv[]) { int c, iter, ITER=0, seed=0; enum dataType data = LdaC; enum dataType testdata = LdaC; int dots = 0; enum GibbsType fix_hold = GibbsNone; char *stem; char *resstem; int topwords = 20; int noerrorlog = 0; int displayed = 0; int load_vocab = 0; int checkpoint = 0; int restart = 0; int dopmi = 0; int restart_hca = 0; int load_phi = 0; int load_mu = 0; int procs = 1; int maxW = 0; enum ScoreType score=ST_idf; double BM0val=0, BM1val =0, BP0val=0, BP1val=0; clock_t t1=0, t2=0, t3=0; double tot_time = 0; double psample_time = 0; enum ParType par; /* * default values */ ddN.T = 10; ITER = 100; ddN.TEST = 0; pctl_init(); while ( (c=getopt(argc, argv,"b:c:C:d:ef:F:g:G:h:K:l:L:N:o:pq:vr:s:S:t:T:vVW:"))>=0 ) { switch ( c ) { case 'b': if ( !optarg || sscanf(optarg,"%d",&ddP.back)!=1 ) yap_quit("Need a valid 'b' argument\n"); break; case 'c': if ( !optarg || sscanf(optarg,"%d",&checkpoint)!=1 ) yap_quit("Need a valid 'c' argument\n"); break; case 'C': if ( !optarg || sscanf(optarg,"%d",&ITER)!=1 ) yap_quit("Need a valid 'C' argument\n"); break; case 'd': if ( !optarg || sscanf(optarg,"%d",&dots)!=1 ) yap_quit("Need a valid 'd' argument\n"); break; case 'e': noerrorlog++; break; case 'f': if ( strcmp(optarg,"witdit")==0 ) data = WitDit; else if ( strcmp(optarg,"docword")==0 ) data = Docword; else if ( strcmp(optarg,"ldac")==0 ) data = LdaC; else if ( strcmp(optarg,"bag")==0 ) data = TxtBag; else if ( strcmp(optarg,"lst")==0 ) data = SeqTxtBag; else yap_quit("Illegal data type for -f\n"); break; case 'F': if ( strcmp(optarg,"all")==0 ) { for (par=ParAM; par<=ParBB; par++) ddT[par].fix = 1; } else { par = findpar(optarg); if ( par==ParNone ) yap_quit("Illegal arg for -F\n"); ddT[par].fix = 1; } break; case 'g': { char var[100]; int st=0; if ( !optarg || sscanf(optarg,"%[^, ],%d", &var[0], &st)<1 ) yap_quit("Need a valid 'g' argument\n"); par = findpar(var); if ( par==ParBP1 ) ddP.kbatch = st; else yap_quit("Illegal var for -g\n"); } break; case 'G': { char var[100]; int st=0, cy=0; if ( !optarg || sscanf(optarg,"%[^, ],%d,%d", &var[0], &cy, &st)<2 || st<0 || cy<0 ) yap_quit("Need a valid 'G' argument\n"); par = findpar(var); if ( par==ParNone || par==ParB0P || par==ParB0M ) yap_quit("Illegal var for -G\n"); ddT[par].fix = 0; ddT[par].start = st; ddT[par].cycles = cy; } break; case 'h': { fix_hold = GibbsHold; if ( !optarg ) yap_quit("Need a valid 'h' argument\n"); if ( strncmp(optarg,"dict,",5)==0 ) { if ( sscanf(&optarg[5],"%d",&ddP.hold_dict)<1 || ddP.hold_dict<2 ) yap_quit("Need a valid 'hdict' argument\n"); } else if ( strncmp(optarg,"fract,",6)==0 ) { if ( sscanf(&optarg[6],"%lf",&ddP.hold_fraction)<1 || ddP.hold_fraction<=0 || ddP.hold_fraction>=1 ) yap_quit("Need a valid 'hfract' argument\n"); } else if ( strncmp(optarg,"doc,",4)==0 ) { if ( sscanf(&optarg[4],"%d",&ddP.hold_every)<1 || ddP.hold_every<2 ) yap_quit("Need a valid 'hdoc' argument\n"); } else yap_quit("Need a valid 'h' argument\n"); } break; case 'K': if ( !optarg || sscanf(optarg,"%d",&ddN.T)!=1 ) yap_quit("Need a valid 'K' argument\n"); break; case 'l': if ( !optarg ) yap_quit("Need a valid 'l ' argument\n"); if ( strncmp(optarg,"phi,",4)==0 ) { if ( sscanf(&optarg[4],"%d,%d",&ddP.phiiter, &ddP.phiburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"theta,",6)==0 ) { if ( sscanf(&optarg[6],"%d,%d",&ddP.thetaiter, &ddP.thetaburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"mu,",3)==0 ) { if ( sscanf(&optarg[3],"%d,%d",&ddP.muiter, &ddP.muburn)<2 ) yap_quit("Need a valid 'l word,' argument\n"); } else if ( strncmp(optarg,"prog,",5)==0 ) { if ( sscanf(&optarg[5],"%d,%d",&ddP.progiter, &ddP.progburn)<2 ) yap_quit("Need a valid 'l prog,' argument\n"); } else yap_quit("Need a valid DIAG code in 'l' argument\n"); break; case 'L': if ( !optarg ) yap_quit("Need a valid 'L ' argument\n"); if ( strncmp(optarg,"like,",5)==0 ) { if ( sscanf(&optarg[5],"%d,%d",&ddP.mltiter, &ddP.mltburn)<1 ) yap_quit("Need a valid 'L like' argument\n"); } else yap_quit("Need a valid DIAG code in 'L' argument\n"); break; case 'N': if ( !optarg || sscanf(optarg,"%d,%d", &ddP.maxN, &ddP.maxM)<1 ) yap_quit("Need a valid 'N' argument\n"); break; case 'o': { char *ptr = strchr(optarg, ','); int len = strlen(optarg); if ( ptr ) len = ptr - optarg; if ( strncmp(optarg,"idf",len)==0 ) score = ST_idf; else if ( strncmp(optarg,"count",len)==0 ) score = ST_count; else if ( strncmp(optarg,"Q",len)==0 ) score = ST_Q; else if ( strncmp(optarg,"cost",len)==0 ) score = ST_cost; else yap_quit("Need a valid parameter for 'o' argument\n"); if ( ptr ) { /* there was a second arg */ if ( sscanf(ptr+1, "%d", &topwords) != 1) yap_quit("Need a valid second 'o' argument\n"); } break; } break; case 'p': dopmi++; break; case 'q': if(!optarg || sscanf(optarg, "%d", &procs) != 1) yap_quit("Need a valid 'q' argument\n"); break; case 'r': if(!optarg ) yap_quit("Need a valid 'r' argument\n"); if ( strcmp(optarg,"tca")==0 ) restart++; else if ( strcmp(optarg,"hca")==0 ) restart_hca++; else if ( strcmp(optarg,"phi")==0 ) load_phi++; else if ( strcmp(optarg,"mu")==0 ) load_mu++; else yap_quit("Need a valid 'r' argument\n"); break; case 's': if ( !optarg || sscanf(optarg,"%d",&seed)!=1 ) yap_quit("Need a valid 's' argument\n"); break; case 'S': { char var[100]; double vin=0; if ( !optarg || sscanf(optarg,"%[^=, ]=%lf", &var[0], &vin)<2 ) yap_quit("Need a valid 'S' argument\n"); par = findpar(var); if ( par==ParNone ) yap_quit("Illegal var for -S\n"); else if ( par==ParBM0 ) BM0val = vin; else if ( par==ParBM1 ) BM1val = vin; else if ( par==ParBP0 ) BP0val = vin; else if ( par==ParBP1 ) BP1val = vin; else *(ddT[par].ptr) = vin; } break; case 't': if ( !optarg || sscanf(optarg,"%d",&ddP.training)!=1 ) yap_quit("Need a valid 't' argument\n"); break; case 'T': if ( !optarg ) yap_quit("Need a valid 'T' argument\n"); { char *tname = data_name(optarg,data); FILE *fp = fopen(tname,"r"); if ( fp==NULL ) { free(tname); tname = data_name(optarg,testdata); fp = fopen(tname,"r"); } else { testdata = data; } free(tname); if ( fp!=NULL ) { /* its a valid test filename */ ddP.teststem = optarg; fclose(fp); } else if ( sscanf(optarg,"%d",&ddN.TEST)!=1 ) yap_quit("Need a valid 'T' argument\n"); } break; case 'v': verbose++; break; case 'V': load_vocab = 1; break; case 'W': if ( !optarg || sscanf(optarg,"%d",&maxW)<1 ) yap_quit("Need a valid 'W' argument\n"); break; default: yap_quit("Unknown option '%c'\n", c); } } if (argc-optind != 2) { usage(); exit(-1); } if ( optind>=argc ) { yap_quit("No arguments given\n"); } stem = strdup(argv[optind++]); resstem = strdup(argv[optind++]); if ( dopmi ) load_vocab = 1; if ( dopmi && verbose !=2 ) { /* * due to the use of the ".top" file * its really multi-purpose */ yap_quit("When computing PMI verbose must be exactly 2\n"); } if ( noerrorlog==0 ) { char *wname = yap_makename(resstem, ".log"); yap_file(wname); free(wname); } yap_commandline(argc, argv); #ifdef H_THREADS yap_message(" Threads,"); #endif if ( restart || restart_hca ) { char *fname = yap_makename(resstem,".par"); FILE *fp = fopen(fname,"r"); char *buf; if ( !fp ) yap_quit("Parameter file '%s' doesn't exist\n", fname); fclose(fp); free(fname); buf = readpar(resstem,"T",50); if ( !buf ) yap_quit("Parameter file '%s' has no T\n", fname); ddN.T = atoi(buf); free(buf); if ( restart ) { buf = readpar(resstem,"E",50); if ( !buf ) yap_quit("Parameter file '%s' has no E\n", fname); ddN.E = atoi(buf); free(buf); pctl_read(resstem); } if ( maxW==0 ) { buf = readpar(resstem,"W",50); if ( buf ) { maxW = atoi(buf); free(buf); } } if ( ddP.training==0 ) { buf = readpar(resstem,"TRAIN",50); if ( buf ) { ddP.training = atoi(buf); free(buf); } } if ( ddN.TEST==0 ) { buf = readpar(resstem,"TEST",50); if ( buf ) { ddN.TEST = atoi(buf); free(buf); } } } assert(ddN.T>0); assert(ddN.TEST>=0); assert(restart || restart_hca || ITER>0); if ( load_phi && ddP.phiiter>0 ) yap_quit("Options '-l phi,...' and '-r phi' incompatible\n"); if ( load_mu && ddP.muiter>0 ) yap_quit("Options '-l mu,...' and '-r mu' incompatible\n"); /* * set random number generator */ if ( seed ) { rng_seed(rngp,seed); } else { rng_time(rngp,&seed); } yap_message("Setting seed = %lu\n", seed); /* * read data and get dimensions */ { D_bag_t *dbp = data_read(stem, data); int training = pctl_training(dbp->D); if ( ddP.teststem ) { D_bag_t *dbpt = data_read(ddP.teststem, testdata); /* need to load a separate test set, strip to bare training */ data_shrink(dbp, training); ddN.TEST = dbpt->D; data_append(dbp, dbpt); free(dbpt->w); free(dbpt->d); free(dbpt); } if ( maxW>0 ) { if ( dbp->W <= maxW ) dbp->W = maxW; if ( dbp->W > maxW ) data_vocabshrink(dbp, maxW); } /* * transfer into system */ ddN.D = dbp->D; ddN.W = dbp->W; ddN.N = dbp->N; ddN.NT = dbp->N; ddN.DT = training; ddD.w = dbp->w; ddD.d = dbp->d; free(dbp); if ( ddN.DT<ddN.D ) { /* recompute NT */ int i; for (i=0; i<ddN.N; i++) if ( ddD.d[i]>=ddN.DT ) break; ddN.NT = i; } } data_read_epoch(stem); /* * at this point, dimensions are fixed, so load phi and mu if needed */ if ( load_phi ) pctl_loadphi(resstem); if ( load_mu ) pctl_loadmu(resstem); /* * correct parameters after command line */ pctl_fix(ITER); if ( BM0val>0 ) { ddP.b_mu[0] = BM0val; } if ( BM1val>0 ) { int i; for (i=1; i<ddN.E; i++) ddP.b_mu[i] = BM1val; } if ( BP0val>0 ) { int i; for (i=0; i<ddN.T; i++) ddP.b_phi[0][i] = BP0val; } if ( BP1val>0 ) { int i; if ( ddN.E==1 ) yap_quit("b_phi[1] invalid when epochs==1\n"); for (i=0; i<ddN.T; i++) ddP.b_phi[1][i] = BP1val; } pctl_samplereport(); /* * all data structures */ data_alloc(); if ( ddP.phiiter>0 ) phi_init(resstem); else ddS.phi = NULL; if ( ddP.muiter>0 ) mu_init(resstem); else ddS.mu = NULL; if ( ddP.thetaiter>0 ) theta_init(resstem); else ddS.theta = NULL; tca_alloc(); if ( PCTL_BURSTY() ) dmi_init(&ddM, ddS.z, ddD.w, ddD.N_dTcum, ddN.T, ddN.N, ddN.W, ddN.D, ddN.DT, (fix_hold==GibbsHold)?pctl_hold:NULL); if ( load_vocab ) { data_vocab(stem); } cache_init(); /* * yap some details */ data_report(ITER, seed); pctl_report(); /* * load/init topic assignments and prepare statistics */ if ( restart || restart_hca) { tca_read_z(resstem, 0, ddN.DT); tca_rand_z(ddN.T, ddN.DT, ddN.D); } else { tca_rand_z(ddN.T, 0, ddN.D); } tca_reset_stats(resstem, restart, 0); if ( (restart || restart_hca ) && ITER ) yap_message("Initial log_2(perp)=%lf\n", -M_LOG2E * likelihood()/ddN.NT); if ( ITER ) yap_report("cycles: "); for (iter=0; iter<ITER; iter++) { int pro; double thislp = 0; int thisNd = 0; int doc; #ifdef H_THREADS pthread_t thread[procs]; #endif D_pargs_p parg[procs]; #ifdef MU_CACHE mu_side_fact_reinit(); #endif #ifdef PHI_CACHE phi_cache_reinit(); #endif t1 = clock(); /* * sampling */ #ifdef IND_STATS ddP.doc_ind_stats = u32tri(ddN.T,ddN.E,ddN.E); ddP.word_ind_stats = u32tri(ddN.T,ddN.E,ddN.E); #endif /* a bit complex if no threads! */ doc = 0; for (pro = 0 ; pro < procs ; pro++){ parg[pro].dots=dots; parg[pro].procs=procs; parg[pro].doc = &doc; #ifndef H_THREADS sampling_p(&parg[pro]); #else if ( procs==1 ) sampling_p(&parg[pro]); else if( pthread_create(&thread[pro],NULL,sampling_p,(void*) &parg[pro]) != 0){ yap_message("thread failed %d\n",pro+1 ); } #endif } #ifdef H_THREADS if ( procs>1 ) { //waiting for threads to finish for (pro = 0; pro < procs; pro++){ pthread_join(thread[pro], NULL); } } #endif // getting lp, Nd and clock for(pro = 0; pro < procs; pro++){ thislp += parg[pro].thislp; thisNd += parg[pro].thisNd; tot_time += parg[pro].tot_time; } #ifdef H_THREADS if ( procs>1 ) tca_reset_stats(NULL,1,1); #endif /* * full check */ #ifndef NDEBUG { int e, d; check_cp_et(); for (e=0; e<ddN.E; e++) check_m_vte(e); for (d=0; d<ddN.DT; d++) check_n_dt(d); } #endif #ifdef IND_STATS { char *fname = yap_makename(resstem,".istats"); FILE *ifp = fopen(fname,"a"); int e1, e2, kk; fprintf(ifp,"Iteration %d\n", iter); for (kk=0; kk<ddN.T; kk++) { fprintf(ifp," Topic %d\n", kk); for (e1=0; e1<ddN.E; e1++) { fprintf(ifp," Epoch %d\n ", e1); for (e2=0; e2<ddN.E; e2++) fprintf(ifp," %u", (unsigned)ddP.doc_ind_stats[kk][e1][e2]); fprintf(ifp,"\n "); for (e2=0; e2<ddN.E; e2++) fprintf(ifp," %u", (unsigned)ddP.word_ind_stats[kk][e1][e2]); fprintf(ifp,"\n"); } } fclose(ifp); free(ddP.doc_ind_stats[0][0]); free(ddP.doc_ind_stats[0]); free(ddP.doc_ind_stats); free(ddP.word_ind_stats[0][0]); free(ddP.word_ind_stats[0]); free(ddP.word_ind_stats); free(fname); } #endif /* * sample hyperparameters */ t3 = clock(); pctl_sample(iter, procs); /* * do time calcs here to remove diagnostics+reporting */ t2 = clock(); tot_time += (double)(t2 - t1) / CLOCKS_PER_SEC; psample_time += (double)(t2 - t3) / CLOCKS_PER_SEC; /* * progress reports */ if ( ( iter>ddP.progburn && (iter%ddP.progiter)==0 ) || iter+1>=ITER ) { yap_message(" %d\nlog_2(perp)=%lf,%lf", iter, -M_LOG2E * likelihood()/ddN.NT, -M_LOG2E * thislp/thisNd); pctl_update(iter); if ( verbose && iter%10==0 ) yap_probs(); if ( iter>0 && verbose>1 ) { if ( ddN.tokens ) { tca_displaytopics(resstem,topwords,score); displayed++; } } if ( iter+1<ITER ) { // yap_message("\n"); yap_report("cycles: "); } } else { yap_message(" %d", iter); if ( verbose>1) yap_message("\n"); } if ( checkpoint>0 && iter>0 && iter%checkpoint==0 ) { data_checkpoint(resstem, stem, iter+1); yap_message(" checkpointed\n"); tca_report(resstem, stem, ITER, procs, fix_hold, (dopmi&&displayed>0)?1:0); } if ( ddP.phiiter>0 && iter>ddP.phiburn && (iter%ddP.phiiter)==0 ) phi_update(); if ( ddP.thetaiter>0 && iter>ddP.thetaburn && (iter%ddP.thetaiter)==0 ) theta_update(); if ( ddP.muiter>0 && iter>ddP.muburn && (iter%ddP.muiter)==0 ) mu_update(); } // over iter if ( ITER ) yap_report("Finished after %d cycles on average of %lf+%lf(s) per cycle\n", iter, (tot_time-psample_time)/iter, psample_time/iter); if ( ( verbose==1 || ((iter+1)%5!=0 && verbose>1) ) ) { if ( ddN.tokens ) { tca_displaytopics(resstem,topwords,score); displayed++; } } yap_probs(); if ( ITER>0 ) data_checkpoint(resstem, stem, ITER); tca_report(resstem, stem, ITER, procs, fix_hold, (dopmi&&displayed>0)?1:0); if ( ddP.phiiter>0 ) phi_save(resstem); if ( ddP.thetaiter>0 ) theta_save(resstem); if ( ddP.muiter>0 ) mu_save(resstem); /* * free */ phi_free(); theta_free(); mu_free(); cache_free(); pctl_free(); data_free(); dmi_free(&ddM); tca_free(); free(stem); free(resstem); rng_free(rngp); return 0; }