Beispiel #1
0
/*
 *	Program entry point.
 *
 *	Reads one sequence from `seqfile`, builds a reversed copy of it with
 *	rev() applied to every symbol, loads the pairwise alignments from
 *	`inpfile` through readph(), and dumps the two alignment lists
 *	align[0] and align[1] to `outfile` as raw binary records.
 *
 *	Record layout written for each of the two lists:
 *	    int  n                     number of alignments in the list
 *	    n times:
 *	        int  reads[1]
 *	        int  mis_match
 *	        int  length
 *	        int  pos[0][0 .. length-1]
 *	        int  pos[1][0 .. length-1]
 *
 *	NOTE(review): src_seq has two slots and src_name has one, so the
 *	clean-up loops at the end are only in bounds when readseq1by1()
 *	returns 1 -- confirm against readseq1by1().
 *
 *	Returns 0 on completion.
 */
int main(int argc, char **argv)
{
	int	i, j, m, n;
	char	**src_seq, **src_name;
	int	*len_seq, num_seq;
	ALIGN	**align, *aln, *aln0;
	FILE	*fp;

	readpar();
	random1(&idum);
	initenv(argc, argv);

/*	Allocate the sequence tables and read the input sequence	*/

	len_seq = (int *) ckalloc(2 * sizeof(int));
	src_seq = (char **) ckalloc(2 * sizeof(char *));
	src_name = (char **) ckalloc(1 * sizeof(char *));
	src_name[0] = (char *) ckalloc(100 * sizeof(char));

	fp = ckopen(seqfile, "r");
	num_seq = readseq1by1(src_seq, src_name, len_seq, fp);
	fclose(fp);
	printf("Genome length: %d\n", len_seq[0]);

/*	Make the reverse complement of the input sequence and keep it in slot 1	*/

	len_seq[1] = len_seq[0];
	src_seq[1] = (char *) ckalloc(len_seq[0] * sizeof(char));
	for(j = 0; j < len_seq[0]; j ++)	{
		src_seq[1][j] = rev(src_seq[0][len_seq[0] - j - 1]);
	}

/*      read in pairwise alignments by Reputer	*/

	align = (ALIGN **) ckalloc(2 * sizeof(ALIGN *));
	fp = ckopen(inpfile, "r");
	n = readph(align, src_seq, len_seq, fp, min_leg, min_id);
	fclose(fp);
	printf("# alignments input: %d.\n", n);

/*	Write alignments; the records are raw ints, so the stream is opened
	in binary mode to keep the bytes untranslated on every platform	*/

	fp = ckopen(outfile, "wb");
	for(m = 0; m < 2; m ++)	{
		n = size_align(align[m]);
		fwrite(&n, sizeof(int), 1, fp);
		aln = align[m];
		while(aln)	{
			fwrite(&(aln -> reads[1]), sizeof(int), 1, fp);
			fwrite(&(aln -> mis_match), sizeof(int), 1, fp);
			fwrite(&(aln -> length), sizeof(int), 1, fp);
			fwrite(aln -> pos[0], sizeof(int), aln -> length, fp);
			fwrite(aln -> pos[1], sizeof(int), aln -> length, fp);
/*			Remember the successor before the node is released	*/
			aln0 = aln -> next;
			free((void *) aln -> pos[0]);
			free((void *) aln -> pos[1]);
			free((void *) aln);
			aln = aln0;
		}
	}
	fclose(fp);
	printf("Done...\n");

/*	Release everything that is still owned here	*/

	free((void **) align);
	for(i = 0; i < 2 * num_seq; i ++)	{
		free((void *) src_seq[i]);
	}
	for(i = 0; i < num_seq; i ++)	{
		free((void *) src_name[i]);
	}
	free((void **) src_seq);
	free((void **) src_name);
	free((void *) len_seq);
	return 0;
}
Beispiel #2
0
/*
 * Read the contigs stored in "<grapfile>.contig" and pass them, batch by
 * batch, to the worker threads started through creatThrds().
 *
 * The file is scanned twice: first by readseqpar() to obtain the number of
 * contigs and the maximum/minimum contig length, then contig by contig with
 * readseq1by1(). Contigs shorter than overlaplen + 1 or shorter than len_cut
 * are skipped. Accepted contigs are appended to seqBuffer; once one of the
 * shared buffers could not hold another contig of maximal length, work
 * signals 2 and 1 are sent (in that order) and the batch counters are reset.
 * After the last batch signal 3 is sent and the threads are joined.
 * NOTE(review): the meaning of the signal values 1/2/3 is defined by the
 * worker routine, which is not part of this block.
 *
 * grapfile: prefix of the contig file name (".contig" is appended).
 * len_cut:  minimum contig length to be processed.
 *
 * Returns 1 on completion, 0 if "<grapfile>.contig" does not fit into the
 * local file name buffer.
 */
boolean prlContig2nodes (char *grapfile, int len_cut)
{
	long long i, num_seq;
	char name[256], *next_name;
	FILE *fp;
	pthread_t threads[thrd_num];
	time_t start_t, stop_t;
	unsigned char thrdSignal[thrd_num + 1];
	PARAMETER paras[thrd_num];
	int maxCtgLen, minCtgLen, nameLen;
	int written;
	unsigned int lenSum, contigId;

	WORDFILTER = createFilter (overlaplen);
	time (&start_t);
	// bounded formatting: an over-long prefix must not overrun name[]
	written = snprintf (name, sizeof (name), "%s.contig", grapfile);

	if (written < 0 || (size_t) written >= sizeof (name))
	{
		fprintf (stderr, "contig file name is too long: %s.contig\n", grapfile);
		return 0;
	}

	// first pass: collect contig statistics only
	fp = ckopen (name, "r");
	maxCtgLen = nameLen = 10;
	minCtgLen = 1000;
	num_seq = readseqpar (&maxCtgLen, &minCtgLen, &nameLen, fp);
	printf ("\nthere're %lld contigs in file: %s, max seq len %d, min seq len %d, max name len %d\n", num_seq, grapfile, maxCtgLen, minCtgLen, nameLen);
	maxReadLen = maxCtgLen;
	fclose (fp);
	time (&stop_t);
	printf ("time spent on parse contigs file %ds\n", (int) (stop_t - start_t));
	// NOTE(review): sized from the global maxNameLen, not from the nameLen
	// just measured above -- confirm that maxNameLen covers contig names
	next_name = (char *) ckalloc ((maxNameLen + 1) * sizeof (char));
	// extract all the EDONs
	seq_buffer_size = buffer_size * 2;
	max_read_c = seq_buffer_size / 20;
	kmerBuffer = (Kmer *) ckalloc (buffer_size * sizeof (Kmer));
	hashBanBuffer = (ubyte8 *) ckalloc (buffer_size * sizeof (ubyte8));
	smallerBuffer = (boolean *) ckalloc (buffer_size * sizeof (boolean));
	seqBuffer = (char *) ckalloc (seq_buffer_size * sizeof (char));
	lenBuffer = (int *) ckalloc (max_read_c * sizeof (int));
	indexArray = (unsigned int *) ckalloc ((max_read_c + 1) * sizeof (unsigned int));
	seqBreakers = (unsigned int *) ckalloc ((max_read_c + 1) * sizeof (unsigned int));
	ctgIdArray = (int *) ckalloc (max_read_c * sizeof (int));
	// second pass: the file is reopened to read the sequences themselves
	fp = ckopen (name, "r");
	//node_mem_manager = createMem_manager(EDONBLOCKSIZE,sizeof(EDON));
	rcSeq = (char **) ckalloc ((thrd_num + 1) * sizeof (char *));

	if (1)
	{
		kmerCounter = (long long *) ckalloc ((thrd_num + 1) * sizeof (long long));
		KmerSets = (KmerSet **) ckalloc (thrd_num * sizeof (KmerSet *));

		// slot 0 of thrdSignal/kmerCounter belongs to this (main) thread,
		// slot i + 1 to worker i
		for (i = 0; i < thrd_num; i++)
		{
			KmerSets[i] = init_kmerset (1024, 0.77f);
			thrdSignal[i + 1] = 0;
			paras[i].threadID = i;
			paras[i].mainSignal = &thrdSignal[0];
			paras[i].selfSignal = &thrdSignal[i + 1];
			kmerCounter[i + 1] = 0;
			rcSeq[i + 1] = (char *) ckalloc (maxCtgLen * sizeof (char));
		}

		creatThrds (threads, paras);
	}

	kmer_c = thrdSignal[0] = kmerCounter[0] = 0;
	time (&start_t);
	read_c = lenSum = i = seqBreakers[0] = indexArray[0] = 0;
	// priming call: fetches the name of the first contig into next_name
	readseq1by1 (seqBuffer + seqBreakers[read_c], next_name, &(lenBuffer[read_c]), fp, -1);

	while (!feof (fp))
	{
		// next_name still holds the name of the contig about to be read
		contigId = getID (next_name);
		readseq1by1 (seqBuffer + seqBreakers[read_c], next_name, &(lenBuffer[read_c]), fp, 1);

		if ((++i) % 10000000 == 0)
		{
			printf ("--- %lldth contigs\n", i);
		}

		// too short: the slot read_c is reused by the next contig
		if (lenBuffer[read_c] < overlaplen + 1 || lenBuffer[read_c] < len_cut)
		{
			// value is recomputed at the top of the loop; call kept as in the original
			contigId = getID (next_name);
			continue;
		}

		//printf("len of seq %d is %d, ID %d\n",read_c,lenBuffer[read_c],contigId);
		// fall back to the running contig number when no positive ID was parsed
		ctgIdArray[read_c] = contigId > 0 ? contigId : i;
		lenSum += lenBuffer[read_c];
		kmer_c += lenBuffer[read_c] - overlaplen + 1;
		read_c++;
		seqBreakers[read_c] = lenSum;
		indexArray[read_c] = kmer_c;

		//printf("seq %d start at %d\n",read_c,seqBreakers[read_c]);
		// flush the batch when another maximal contig might not fit
		if (read_c == max_read_c || (lenSum + maxCtgLen) > seq_buffer_size || (kmer_c + maxCtgLen - overlaplen + 1) > buffer_size)
		{
			kmerCounter[0] += kmer_c;
			sendWorkSignal (2, thrdSignal);
			sendWorkSignal (1, thrdSignal);
			kmer_c = read_c = lenSum = 0;
		}
	}

	// hand over the last, partially filled batch
	if (read_c)
	{
		kmerCounter[0] += kmer_c;
		sendWorkSignal (2, thrdSignal);
		sendWorkSignal (1, thrdSignal);
	}

	sendWorkSignal (3, thrdSignal);
	thread_wait (threads);
	time (&stop_t);
	printf ("time spent on hash reads: %ds\n", (int) (stop_t - start_t));

	if (1)
	{
		unsigned long long alloCounter = 0;
		unsigned long long allKmerCounter = 0;

		for (i = 0; i < thrd_num; i++)
		{
			alloCounter += count_kmerset ((KmerSets[i]));
			allKmerCounter += kmerCounter[i + 1];
			free ((void *) rcSeq[i + 1]);
		}

		// conversion specifiers match the argument types (unsigned, signed, unsigned)
		printf ("%llu nodes allocated, %lld kmer in reads, %llu kmer processed\n", alloCounter, kmerCounter[0], allKmerCounter);
	}

	free ((void *) rcSeq);
	free ((void *) kmerCounter);
	free ((void *) seqBuffer);
	free ((void *) lenBuffer);
	free ((void *) indexArray);
	free ((void *) seqBreakers);
	free ((void *) ctgIdArray);
	free ((void *) kmerBuffer);
	free ((void *) hashBanBuffer);
	free ((void *) smallerBuffer);
	free ((void *) next_name);
	fclose (fp);
	return 1;
}
Beispiel #3
0
/*
 *	Program entry point of "insencode".
 *
 *	Command line (all six arguments are required):
 *	    argv[1]  seq_file         input sequence
 *	    argv[2]  reput_file       input repeat list, one record per line:
 *	                              len1 pos1 dir len2 pos2 ('#' lines are skipped)
 *	    argv[3]  insert_reg_file  insert regions, read by readins()
 *	    argv[4]  out_seq_file     sequence with the insert regions left out
 *	    argv[5]  out_reput_file   repeat list with recalculated coordinates
 *	    argv[6]  out_reg_file     insert regions, positions shifted by the
 *	                              total length of the preceding inserts
 *
 *	Every repeat record is mapped through reculate_pos() for both its
 *	start and its end, and the length is rebuilt from the mapped ends.
 *	Records that do not contain all five fields are skipped.
 *
 *	Returns 0 on completion; exits with -1 when arguments are missing.
 */
int main(int argc, char **argv)
{
	int	i, k, l, n;
	int	*len_seq, num_seq;
	int	len1, len2, pos1, pos2;
	char	**src_seq, **src_name;
	char	str[300], dir[3];
	int	num_ins, *insertpos, *insertlen;
	FILE	*fp, *fp1;

	if(argc < 7)	{
		printf("Usage: insencode seq_file reput_file insert_reg_file out_seq_file out_reput_file out_reg_file\n");
		exit(-1);
	}

/*	Read the input sequence	*/

	len_seq = (int *) ckalloc(2 * sizeof(int));
	src_seq = (char **) ckalloc(2 * sizeof(char *));
/*	Slot 1 is not filled in this function but is passed to free() at the
	end; start from NULL so that free() is a no-op for an untouched slot	*/
	src_seq[0] = NULL;
	src_seq[1] = NULL;
	src_name = (char **) ckalloc(1 * sizeof(char *));
	src_name[0] = (char *) ckalloc(100 * sizeof(char));
	fp = ckopen(argv[1], "r");
	num_seq = readseq1by1(src_seq, src_name, len_seq, fp);
	fclose(fp);

/*	Read the insert regions	*/

	insertpos = (int *) ckalloc(10000 * sizeof(int));
	insertlen = (int *) ckalloc(10000 * sizeof(int));
	fp = ckopen(argv[3], "r");
	num_ins = readins(insertpos, insertlen, fp);
	fclose(fp);

/*	Translate the repeat coordinates	*/

	fp = ckopen(argv[2], "r");
	fp1 = ckopen(argv[5], "w");
	while(fgets(str, 290, fp))	{
		if(str[0] != '#')	{
/*			dir holds at most two characters plus the terminator	*/
			if(sscanf(str, "%d%d%2s%d%d", &len1, &pos1, dir, &len2, &pos2) != 5)	{
				continue;
			}
			k = len1 + pos1 - 1;
			pos1 = reculate_pos(pos1, insertpos, insertlen, num_ins);
			k = reculate_pos(k, insertpos, insertlen, num_ins);
			len1 = k - pos1 + 1;
			k = len2 + pos2 - 1;
			pos2 = reculate_pos(pos2, insertpos, insertlen, num_ins);
			k = reculate_pos(k, insertpos, insertlen, num_ins);
			len2 = k - pos2 + 1;
/*			Same field order as in the input record	*/
			fprintf(fp1, "%d %d %s %d %d\n", len1, pos1, dir, len2, pos2);
		}
	}
	fclose(fp);
	fclose(fp1);

/*	Write the insert regions; l is the total length of the inserts
	already written and is subtracted from each position	*/

	fp = ckopen(argv[6], "w");
	l = 0;
	for(i = 0; i < num_ins; i ++)	{
		fprintf(fp, "%d %d\n", insertpos[i] - l, insertlen[i]);
		l += insertlen[i];
	}
	fclose(fp);

/*	Write the sequence without the insert regions, 50 symbols per line	*/

	fp = ckopen(argv[4], "w");
	fprintf(fp, ">seq_no_common_repeat\n");
	k = n = 0;
	for(i = 0; i < len_seq[0]; i ++)	{
		while(n < num_ins && i == insertpos[n] + 1)	{
			i += insertlen[n];
			n ++;
		}
/*		An insert reaching the end of the sequence leaves nothing to print	*/
		if(i >= len_seq[0])	{
			break;
		}
		fprintf(fp, "%c", na_name[src_seq[0][i]]);
		if(k % 50 == 49)	{
			fprintf(fp, "\n");
		}
		k ++;
	}
	if(k % 50 != 0)	{
		fprintf(fp, "\n");
	}
	fclose(fp);
	printf("Genome length after removal: %d\n", k);

/*	Release all buffers	*/

	free((void *) insertpos);
	free((void *) insertlen);
	for(i = 0; i < 2 * num_seq; i ++)	{
		free((void *) src_seq[i]);
	}
	for(i = 0; i < num_seq; i ++)	{
		free((void *) src_name[i]);
	}
	free((void **) src_seq);
	free((void **) src_name);
	free((void *) len_seq);
	return 0;
}