main(int argc, char **argv) { int i, j, k, l, m, n; char **src_seq, **src_name; int *len_seq, num_seq; char temp[100]; ALIGN **align, *aln, *aln0; FILE *fp; readpar(); random1(&idum); initenv(argc, argv); /* Input the length of the reads (required) */ len_seq = (int *) ckalloc(2 * sizeof(int)); src_seq = (char **) ckalloc(2 * sizeof(char *)); src_name = (char **) ckalloc(1 * sizeof(char *)); src_name[0] = (char *) ckalloc(100 * sizeof(char)); fp = ckopen(seqfile, "r"); num_seq = readseq1by1(src_seq, src_name, len_seq, fp); fclose(fp); printf("Genome length: %d\n", len_seq[0]); /* Make reverse complements of input sequences rev(i) --> i + num_seq */ len_seq[1] = len_seq[0]; src_seq[1] = (char *) ckalloc(len_seq[0] * sizeof(char)); for(j = 0; j < len_seq[0]; j ++) { src_seq[1][j] = rev(src_seq[0][len_seq[0] - j - 1]); } /* read in pairwise alignments by Reputer */ align = (ALIGN **) ckalloc(2 * sizeof(ALIGN *)); fp = ckopen(inpfile, "r"); n = readph(align, src_seq, len_seq, fp, min_leg, min_id); fclose(fp); printf("# alignments input: %d.\n", n); /* Write alignments */ fp = ckopen(outfile, "w"); for(m = 0; m < 2; m ++) { n = size_align(align[m]); fwrite(&n, sizeof(int), 1, fp); aln = align[m]; while(aln) { fwrite(&(aln -> reads[1]), sizeof(int), 1, fp); fwrite(&(aln -> mis_match), sizeof(int), 1, fp); fwrite(&(aln -> length), sizeof(int), 1, fp); fwrite(aln -> pos[0], sizeof(int), aln -> length, fp); fwrite(aln -> pos[1], sizeof(int), aln -> length, fp); aln0 = aln -> next; free((void *) aln -> pos[0]); free((void *) aln -> pos[1]); free((void *) aln); aln = aln0; } } fclose(fp); printf("Done...\n"); free((void **) align); for(i = 0; i < 2 * num_seq; i ++) { free((void *) src_seq[i]); } for(i = 0; i < num_seq; i ++) { free((void *) src_name[i]); } free((void **) src_seq); free((void **) src_name); free((void *) len_seq); }
boolean prlContig2nodes (char *grapfile, int len_cut) { long long i, num_seq; char name[256], *next_name; FILE *fp; pthread_t threads[thrd_num]; time_t start_t, stop_t; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; int maxCtgLen, minCtgLen, nameLen; unsigned int lenSum, contigId; WORDFILTER = createFilter (overlaplen); time (&start_t); sprintf (name, "%s.contig", grapfile); fp = ckopen (name, "r"); maxCtgLen = nameLen = 10; minCtgLen = 1000; num_seq = readseqpar (&maxCtgLen, &minCtgLen, &nameLen, fp); printf ("\nthere're %lld contigs in file: %s, max seq len %d, min seq len %d, max name len %d\n", num_seq, grapfile, maxCtgLen, minCtgLen, nameLen); maxReadLen = maxCtgLen; fclose (fp); time (&stop_t); printf ("time spent on parse contigs file %ds\n", (int) (stop_t - start_t)); next_name = (char *) ckalloc ((maxNameLen + 1) * sizeof (char)); // extract all the EDONs seq_buffer_size = buffer_size * 2; max_read_c = seq_buffer_size / 20; kmerBuffer = (Kmer *) ckalloc (buffer_size * sizeof (Kmer)); hashBanBuffer = (ubyte8 *) ckalloc (buffer_size * sizeof (ubyte8)); smallerBuffer = (boolean *) ckalloc (buffer_size * sizeof (boolean)); seqBuffer = (char *) ckalloc (seq_buffer_size * sizeof (char)); lenBuffer = (int *) ckalloc (max_read_c * sizeof (int)); indexArray = (unsigned int *) ckalloc ((max_read_c + 1) * sizeof (unsigned int)); seqBreakers = (unsigned int *) ckalloc ((max_read_c + 1) * sizeof (unsigned int)); ctgIdArray = (int *) ckalloc (max_read_c * sizeof (int)); fp = ckopen (name, "r"); //node_mem_manager = createMem_manager(EDONBLOCKSIZE,sizeof(EDON)); rcSeq = (char **) ckalloc ((thrd_num + 1) * sizeof (char *)); if (1) { kmerCounter = (long long *) ckalloc ((thrd_num + 1) * sizeof (long long)); KmerSets = (KmerSet **) ckalloc (thrd_num * sizeof (KmerSet *)); for (i = 0; i < thrd_num; i++) { KmerSets[i] = init_kmerset (1024, 0.77f); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; kmerCounter[i + 1] = 0; rcSeq[i + 1] = (char *) ckalloc (maxCtgLen * sizeof (char)); } creatThrds (threads, paras); } kmer_c = thrdSignal[0] = kmerCounter[0] = 0; time (&start_t); read_c = lenSum = i = seqBreakers[0] = indexArray[0] = 0; readseq1by1 (seqBuffer + seqBreakers[read_c], next_name, &(lenBuffer[read_c]), fp, -1); while (!feof (fp)) { contigId = getID (next_name); readseq1by1 (seqBuffer + seqBreakers[read_c], next_name, &(lenBuffer[read_c]), fp, 1); if ((++i) % 10000000 == 0) { printf ("--- %lldth contigs\n", i); } if (lenBuffer[read_c] < overlaplen + 1 || lenBuffer[read_c] < len_cut) { contigId = getID (next_name); continue; } //printf("len of seq %d is %d, ID %d\n",read_c,lenBuffer[read_c],contigId); ctgIdArray[read_c] = contigId > 0 ? contigId : i; lenSum += lenBuffer[read_c]; kmer_c += lenBuffer[read_c] - overlaplen + 1; read_c++; seqBreakers[read_c] = lenSum; indexArray[read_c] = kmer_c; //printf("seq %d start at %d\n",read_c,seqBreakers[read_c]); if (read_c == max_read_c || (lenSum + maxCtgLen) > seq_buffer_size || (kmer_c + maxCtgLen - overlaplen + 1) > buffer_size) { kmerCounter[0] += kmer_c; sendWorkSignal (2, thrdSignal); sendWorkSignal (1, thrdSignal); kmer_c = read_c = lenSum = 0; } } if (read_c) { kmerCounter[0] += kmer_c; sendWorkSignal (2, thrdSignal); sendWorkSignal (1, thrdSignal); } sendWorkSignal (3, thrdSignal); thread_wait (threads); time (&stop_t); printf ("time spent on hash reads: %ds\n", (int) (stop_t - start_t)); if (1) { unsigned long long alloCounter = 0; unsigned long long allKmerCounter = 0; for (i = 0; i < thrd_num; i++) { alloCounter += count_kmerset ((KmerSets[i])); allKmerCounter += kmerCounter[i + 1]; free ((void *) rcSeq[i + 1]); } printf ("%lli nodes allocated, %lli kmer in reads, %lli kmer processed\n", alloCounter, kmerCounter[0], allKmerCounter); } free ((void *) rcSeq); free ((void *) kmerCounter); free ((void *) seqBuffer); free ((void *) lenBuffer); free ((void *) indexArray); free ((void *) seqBreakers); free ((void *) ctgIdArray); free ((void *) kmerBuffer); free ((void *) hashBanBuffer); free ((void *) smallerBuffer); free ((void *) next_name); fclose (fp); return 1; }
main(int argc, char **argv) { int i, j, k, l, m, n; int *len_seq, num_seq; int len1, len2, pos1, pos2; int **num_pa; char **src_seq, **src_name; char str[300], dir[3]; int num_ins, *insertpos, *insertlen; FILE *fp, *fp1; if(argc < 7) { printf("Usage: insencode seq_file reput_file insert_reg_file out_seq_file out_reput_file out_reg_file\n"); exit(-1); } len_seq = (int *) ckalloc(2 * sizeof(int)); src_seq = (char **) ckalloc(2 * sizeof(char *)); src_name = (char **) ckalloc(1 * sizeof(char *)); src_name[0] = (char *) ckalloc(100 * sizeof(char)); fp = ckopen(argv[1], "r"); num_seq = readseq1by1(src_seq, src_name, len_seq, fp); fclose(fp); insertpos = (int *) ckalloc(10000 * sizeof(int)); insertlen = (int *) ckalloc(10000 * sizeof(int)); fp = ckopen(argv[3], "r"); num_ins = readins(insertpos, insertlen, fp); fclose(fp); fp = ckopen(argv[2], "r"); fp1 = ckopen(argv[5], "w"); while(fgets(str, 290, fp)) { if(str[0] != '#') { sscanf(str, "%d%d%s%d%d", &len1, &pos1, dir, &len2, &pos2); k = len1 + pos1 - 1; pos1 = reculate_pos(pos1, insertpos, insertlen); k = reculate_pos(k, insertpos, insertlen, num_ins); len1 = k - pos1 + 1; k = len2 + pos2 - 1; pos2 = reculate_pos(pos2, insertpos, insertlen); k = reculate_pos(k, insertpos, insertlen, num_ins); len2 = k - pos2 + 1; fprintf(fp1, "%d %d %s %d %d\n", len1, pos1, len2, pos2, dir); } } fclose(fp); fclose(fp1); fp = ckopen(argv[6], "w"); l = 0; for(i = 0; i < num_ins; i ++) { fprintf(fp, "%d %d\n", insertpos[i] - l, insertlen[i]); l += insertlen[i]; } fclose(fp); fp = ckopen(argv[4], "w"); fprintf(fp, ">seq_no_common_repeat\n"); k = n = 0; for(i = 0; i < len_seq[0]; i ++) { while(n < num_ins && i == insertpos[n] + 1) { i += insertlen[n]; n ++; } fprintf(fp, "%c", na_name[src_seq[0][i]]); if(k % 50 == 49) { fprintf(fp, "\n"); } k ++; } if(k % 50 != 0) { fprintf(fp, "\n"); } fclose(fp); printf("Genome length after removal: %d\n", k); free((void *) insertpos); free((void *) insertlen); for(i = 0; i < 2 * num_seq; i ++) { free((void *) src_seq[i]); } for(i = 0; i < num_seq; i ++) { free((void *) src_name[i]); } free((void **) src_seq); free((void **) src_name); free((void *) len_seq); }