示例#1
0
/*
 * Read a sequence from the file `fname` through seq_freadnib().
 *
 * The file is opened in binary mode with ckopen(); `rbase`, `rlen` and
 * `slen` are forwarded unchanged to seq_freadnib().  The stream is closed
 * before returning, and the pointer produced by seq_freadnib() is handed
 * back to the caller as-is.
 */
unsigned char *seq_readnib(const char *fname, int32_t rbase, int32_t rlen, int32_t *slen)
{
	unsigned char *bases;
	FILE *nib;

	nib = ckopen(fname, "rb");
	bases = seq_freadnib(nib, rbase, rlen, slen);
	fclose(nib);

	return bases;
}
示例#2
0
/*
 * Write summed k-mer frequency counts to "<outfile>.kmerFreq" and release
 * the kmerFreq tables.
 *
 * For every index i in 1..255 the entries kmerFreq[0..thrd_num-1][i] are
 * added up and the total is written on its own line.  Each kmerFreq[t] row
 * and the kmerFreq array itself are freed before returning, whether or not
 * the file could be written, so the tables must not be used afterwards.
 *
 * outfile: output prefix.  The file name is built in a 256-byte buffer; a
 *          prefix that does not fit is reported on stderr and no file is
 *          written (the original sprintf would have overrun the buffer).
 */
static void freqStat ( char * outfile )
{
	FILE * fo = NULL;
	char name[256];
	int i, j, written;
	long long sum;
	written = snprintf ( name, sizeof ( name ), "%s.kmerFreq", outfile );

	if ( written < 0 || written >= ( int ) sizeof ( name ) )
	{
		fprintf ( stderr, "freqStat: output prefix is too long, %s.kmerFreq is not written.\n", outfile );
	}
	else
	{
		fo = ckopen ( name, "w" );
	}

	if ( fo )
	{
		for ( i = 1; i < 256; i++ )
		{
			sum = 0;

			for ( j = 0; j < thrd_num; j++ )
			{
				sum += kmerFreq[j][i];
			}

			fprintf ( fo, "%lld\n", sum );
		}

		fclose ( fo );
	}

	for ( i = 0; i < thrd_num; i++ )
	{
		free ( ( void * ) kmerFreq[i] );
	}

	free ( ( void * ) kmerFreq );
}
示例#3
0
文件: species.c 项目: ma-compbio/RACA
/*
 * Load the species table from the first section of `configfile` whose
 * header line starts with '>' and contains "species".
 *
 * Each following line up to the first empty line must hold a species name
 * and an integer tag; the pair is appended to Spename[] / Spetag[] and
 * Spesz is incremented.  Exactly one entry must carry tag 0 (the reference
 * species); otherwise fatal() is called.
 *
 * Compared with the previous version:
 *   - the name is read with a field width so it cannot overrun sn[20]; a
 *     longer name is rejected through the existing "cannot parse" error;
 *   - the MAXSPE limit is checked BEFORE an entry is stored, instead of
 *     after the whole section has already been written.
 *
 * NOTE(review): the limit check assumes Spename[] and Spetag[] hold MAXSPE
 * entries, as the original "Spesz > MAXSPE" test implies -- confirm against
 * their declarations.
 */
void get_spename(char *configfile) {
	FILE *fp;
	char buf[500], sn[20];
	int tag, found, nref, used, i;

	fp = ckopen(configfile, "r");
	found = 0;
	while (!found && fgets(buf, sizeof buf, fp)) {
		if (buf[0] != '>' || strstr(buf, "species") == NULL)
			continue;
		found = 1;
		while (fgets(buf, sizeof buf, fp)) {
			if (buf[0] == '\n')
				break;
			/*
			 * %n records where the name token stopped.  If that position
			 * is not a blank, the name was cut by the width limit and the
			 * line is rejected instead of being mis-parsed.
			 */
			used = 0;
			if (sscanf(buf, "%19s%n %d", sn, &used, &tag) != 2 ||
			    (buf[used] != ' ' && buf[used] != '\t'))
				fatalf("cannot parse species %s", buf);
			if (Spesz >= MAXSPE)
				fatalf("MAXSPE %d too small (%d)", MAXSPE, Spesz + 1);
			strcpy(Spename[Spesz], sn);
			Spetag[Spesz] = tag;
			++Spesz;
		}
	}
	fclose(fp);

	/* Count the entries tagged 0: there must be exactly one reference. */
	for (i = nref = 0; i < Spesz; i++)
		if (Spetag[i] == 0)
			++nref;
	if (nref == 0)
		fatal("ref species not specified");
	if (nref > 1)
		fatal("ref speices more than one");
}
示例#4
0
/*
 * Dump the edge graph in Graphviz DOT syntax to "<outfile>.edge.gvz".
 *
 * Edges num_ed down to 1 are visited; edges whose `deleted` field is set
 * are skipped.  Every remaining edge becomes one line
 *     V<from_vt> -> V<to_vt>[label ="<edge index>(<length>)"];
 *
 * outfile: output prefix.  The file name is built in a 256-byte buffer; a
 *          prefix that does not fit is reported on stderr and nothing is
 *          written (the original sprintf would have overrun the buffer).
 */
void output_graph (char *outfile) 
{
	char name[256];
	FILE * fp;
	unsigned int i;
	int written;

	written = snprintf (name, sizeof (name), "%s.edge.gvz", outfile);
	if (written < 0 || written >= (int) sizeof (name))
	{
		fprintf (stderr, "output_graph: output prefix is too long, %s.edge.gvz is not written.\n", outfile);
		return;
	}

	fp = ckopen (name, "w");
	fprintf (fp, "digraph G{\n");
	fprintf (fp, "\tsize=\"512,512\";\n");
	for (i = num_ed; i > 0; i--)
	{
		if (edge_array[i].deleted)
		{
			continue;
		}

		/* i is unsigned, so it is printed with %u rather than %d. */
		fprintf (fp, "\tV%d -> V%d[label =\"%u(%d)\"];\n", edge_array[i].from_vt, edge_array[i].to_vt, i, edge_array[i].length);
	}
	fprintf (fp, "}\n");
	fclose (fp);
}
示例#5
0
/*
 * Write all edges 1..num_ed to "<outfile>.updated.edge".
 *
 * The file starts with "EDGEs <count>".  Each edge then produces one line:
 *     >length <length>,<from k-mer>,<to k-mer>,<dir>,<cvg>
 * where the two k-mers are printed by print_kmer() and <dir> is 1 when
 * EdSmallerThanTwin(i) holds, -1 when EdLargerThanTwin(i) holds, and 0
 * otherwise.
 *
 * outfile: output prefix.  The file name is built in a 256-byte buffer; a
 *          prefix that does not fit is reported on stderr and nothing is
 *          written (the original sprintf would have overrun the buffer).
 */
void output_updated_edges ( char * outfile )
{
	FILE * fp;
	char name[256];
	unsigned int i, edgeCount;
	int written;
	EDGE * edge;
	written = snprintf ( name, sizeof ( name ), "%s.updated.edge", outfile );

	if ( written < 0 || written >= ( int ) sizeof ( name ) )
	{
		fprintf ( stderr, "output_updated_edges: output prefix is too long, %s.updated.edge is not written.\n", outfile );
		return;
	}

	fp = ckopen ( name, "w" );
	/* No edge is filtered out below, so the header count is simply num_ed
	   (the former loop only counted from 1 to num_ed). */
	edgeCount = num_ed;
	fprintf ( fp, "EDGEs %u\n", edgeCount );

	for ( i = 1; i <= num_ed; i++ )
	{
		edge = &edge_array[i];
		fprintf ( fp, ">length %d,", edge->length );
		print_kmer ( fp, vt_array[edge->from_vt].kmer, ',' );
		print_kmer ( fp, vt_array[edge->to_vt].kmer, ',' );

		if ( EdSmallerThanTwin ( i ) )
			{ fprintf ( fp, "1," ); }
		else if ( EdLargerThanTwin ( i ) )
			{ fprintf ( fp, "-1," ); }
		else
			{ fprintf ( fp, "0," ); }

		fprintf ( fp, "%d\n", edge->cvg );
	}

	fclose ( fp );
}
示例#6
0
文件: conv_head.c 项目: gtsong/CHAP2
/* Print `text` to stdout, appending a newline when it does not end in one. */
static void conv_head_put_line(const char *text)
{
	size_t len = strlen(text);

	fputs(text, stdout);
	if (len == 0 || text[len - 1] != '\n')
		putchar('\n');
}

/*
 * conv_head: rewrite the header line(s) of a FASTA-like file on stdout.
 *
 * Usage: conv_head seq-file(fasta) seq_name
 *
 * A line is a header when it starts with '>' or '<'.  The file is scanned
 * once to count headers, rewound, and then copied:
 *   - at most one header: a header containing seq_name is replaced by
 *     ">seq_name"; a header not containing it is dropped;
 *   - several headers: a header containing seq_name is kept (with a '>'
 *     marker); any other header becomes ">seq_name.<original header>".
 * Non-header lines are copied unchanged, with a newline appended when the
 * chunk read does not end in one.
 *
 * Changes from the previous version:
 *   - seq_name is used in place instead of being strcpy'd into a 100-byte
 *     buffer that a long argument overflowed;
 *   - the read size can no longer exceed sizeof buf;
 *   - the several-headers case now actually loops over the file (it used
 *     to process only the stale last line of the counting pass);
 *   - rewritten headers no longer carry the original marker after the new
 *     '>' (">>name" / ">seq.>name");
 *   - the input stream is closed.
 */
int main(int argc, char **argv) {
	FILE *fp;
	char buf[BIG];
	const char *name;
	int num_scafs = 0;
	int is_header;
	/* Keep the historical 1000-byte read size unless buf is smaller. */
	const int chunk = (int) (sizeof buf < 1000 ? sizeof buf : 1000);

	if (argc != 3)
		fatal("args: seq-file(fasta) seq_name");

	name = argv[2];
	fp = ckopen(argv[1], "r");

	/* Pass 1: count header lines. */
	while (fgets(buf, chunk, fp)) {
		if ((buf[0] == '>') || (buf[0] == '<'))
			num_scafs++;
	}

	fseek(fp, 0, SEEK_SET);

	/* Pass 2: copy the file, rewriting headers. */
	while (fgets(buf, chunk, fp)) {
		is_header = (buf[0] == '>') || (buf[0] == '<');

		if (!is_header) {
			conv_head_put_line(buf);
		}
		else if (num_scafs <= 1) {
			if (strstr(buf, name) != NULL)
				printf(">%s\n", name);
		}
		else if (strstr(buf, name) != NULL) {
			/* buf + 1 skips the original '>' / '<' marker. */
			putchar('>');
			conv_head_put_line(buf + 1);
		}
		else {
			printf(">%s.", name);
			conv_head_put_line(buf + 1);
		}
	}

	fclose(fp);
	return EXIT_SUCCESS;
}
示例#7
0
/*
 * Create "<outfile>.edge" via make_edge() and publish the edge count.
 *
 * The file is opened for writing, handed to make_edge(), and closed;
 * afterwards num_ed is set to edge_c.
 *
 * outfile: output prefix.  The file name is built in a 256-byte buffer with
 *          snprintf, so a long prefix can no longer overrun it.  Because the
 *          call to make_edge() and the num_ed update must still happen, a
 *          prefix that does not fit is reported on stderr and the truncated
 *          name is used.
 */
void kmer2edges ( char * outfile )
{
	FILE * fp;
	char temp[256];
	int written;
	written = snprintf ( temp, sizeof ( temp ), "%s.edge", outfile );

	if ( written < 0 || written >= ( int ) sizeof ( temp ) )
	{
		temp[sizeof ( temp ) - 1] = '\0';
		fprintf ( stderr, "kmer2edges: output prefix is too long, edge file name truncated to %s\n", temp );
	}

	fp = ckopen ( temp, "w" );
	make_edge ( fp );
	fclose ( fp );
	num_ed = edge_c;
}
示例#8
0
/*
 * Load a graph from `edgefile` and `graphfile`.
 *
 * Allocates a table of MAX_NODES node pointers and a table of MAX_EDGE edge
 * pointers with ckalloc(), stores them in *vertex and *edge, and lets
 * read_graph() fill them from the two opened files.  read_graph()'s return
 * value is stored in *num_vertex; *num_edge is written by read_graph()
 * itself.  A one-line summary is printed to stdout and both files are
 * closed before returning.
 */
void read_graph_file(char *edgefile,
		     char *graphfile,
		     int *num_vertex,
		     NODES ***vertex,
		     int *num_edge,
		     EDGE ***edge)
{
  FILE *edge_fp = ckopen(edgefile, "r");
  FILE *graph_fp = ckopen(graphfile, "r");
  NODES **node_table = (NODES **) ckalloc(MAX_NODES * sizeof(NODES *));
  EDGE **edge_table = (EDGE **) ckalloc(MAX_EDGE * sizeof(EDGE *));

  /* Publish the tables before read_graph() runs, as before. */
  *vertex = node_table;
  *edge = edge_table;

  *num_vertex = read_graph(node_table, edge_table, num_edge, edge_fp, graph_fp);
  printf("Input graph ... done. %d vertices and %d edges.\n",
	 *num_vertex, *num_edge);

  fclose(graph_fp);
  fclose(edge_fp);
}
示例#9
0
/*
 * Write one "<edge index>\t<pool value>" line per edge 1..num_ed to
 * `fileName`.  The pool value for edge e is pool[flag_array[e]].
 */
void output_pool(char * fileName)
{
	unsigned int edge_no = 1;
	unsigned int slot;
	FILE * out = ckopen(fileName,"w");

	while (edge_no <= num_ed)
	{
		slot = flag_array[edge_no];
		fprintf(out,"%d\t%d\n", edge_no,pool[slot]);
		++edge_no;
	}

	fclose(out);
}
示例#10
0
/*************************************************
 Function:
    loadContig
 Description:
    Loads contig sequences from "<graphfile>.contig".
    Every record starts with a header line ">edgeno tag length"; the
    following lines hold the bases.  Letters (either case) are converted
    with base2int() and packed with writeChar2tightString() into a buffer
    of length/4 + 1 bytes, which is stored in
    contig_array[index_array[edgeno]].seq.  All other characters are
    ignored.

    Hardening over the previous version:
      - the file name is built with snprintf (no overrun of name[]);
      - the header is parsed with a bounded %s and its result is checked;
        a malformed header stops loading with a message on stderr;
      - sequence lines that appear before any header are ignored instead
        of being written through a NULL buffer;
      - bases beyond the capacity of the allocated buffer are dropped.
        NOTE(review): the capacity is taken to be 4 bases per byte, as the
        length/4 + 1 allocation suggests -- confirm against
        writeChar2tightString().
 Input:
    1. graphfile:       prefix of graph file
 Output:
    None.
 Return:
    None.
 *************************************************/
void loadContig ( char * graphfile )
{
	char c, ch, name[256], tag[256], line[1024], *tightSeq = NULL;
	FILE * fp;
	int n = 0, length = 0, loaded = 0, edgeno = 0;
	int written, pending = 0;
	long long capacity = 0;
	size_t i, lineLen;
	unsigned int newIndex;
	written = snprintf ( name, sizeof ( name ), "%s.contig", graphfile );

	if ( written < 0 || written >= ( int ) sizeof ( name ) )
	{
		name[sizeof ( name ) - 1] = '\0';
		fprintf ( stderr, "loadContig: graph prefix is too long, contig file name truncated to %s\n", name );
	}

	fp = ckopen ( name, "r" );

	while ( fgets ( line, sizeof ( line ), fp ) != NULL )
	{
		if ( line[0] == '>' )
		{
			/* A new header closes the record read so far. */
			if ( pending )
			{
				newIndex = index_array[edgeno];
				contig_array[newIndex].seq = tightSeq;
				pending = 0;
			}

			if ( sscanf ( line + 1, "%d %255s %d", &edgeno, tag, &length ) != 3 || length < 0 )
			{
				fprintf ( stderr, "loadContig: malformed contig header, loading stopped: %s", line );
				break;
			}

			n = 0;
			loaded++;
			pending = 1;
			capacity = ( long long ) ( length / 4 + 1 ) * 4;
			tightSeq = ( char * ) ckalloc ( ( length / 4 + 1 ) * sizeof ( char ) );
		}
		else if ( pending )
		{
			lineLen = strlen ( line );

			for ( i = 0; i < lineLen; i++ )
			{
				ch = line[i];

				if ( ch >= 'a' && ch <= 'z' )
				{
					ch = ch - 'a' + 'A';
				}
				else if ( ch < 'A' || ch > 'Z' )
				{
					continue;
				}

				if ( n >= capacity )
				{
					continue;
				}

				c = base2int ( ch );
				writeChar2tightString ( c, tightSeq, n++ );
			}
		}
	}

	/* Store the last record, which has no following header to close it. */
	if ( pending )
	{
		newIndex = index_array[edgeno];
		contig_array[newIndex].seq = tightSeq;
	}

	fprintf ( stderr, "%d contig(s) loaded.\n", loaded );
	fclose ( fp );
}
/*
 * Advance the "hot" file past every record that belongs to sequence `sid`.
 *
 * Does nothing when no hot file is configured (hotfile == NULL).  The file
 * is opened on first use.  While the current record id (hot_id) equals
 * `sid`, the next line is read; lines starting with '#' are skipped, any
 * other line is parsed into hot_id / hot_pos.  Reading stops at the first
 * record with a different id or at end of file.
 */
static void clearCurSeq(const char *sid)
{
    char row[5000];

    if (hotfile == NULL)
	return;
    if (hot_fp == NULL)
	hot_fp = ckopen(hotfile, "r");

    for (;;) {
	if (strcmp(sid, hot_id) != 0)
	    break;
	if (fgets(row, sizeof row, hot_fp) == NULL)
	    return;
	if (row[0] != '#') {
	    sscanf(row, "%s %d", hot_id, &hot_pos);
	    if (debug)
		printf("INFO: clearCurSeq   %s\t%s\t%d\n", sid, hot_id, hot_pos);
	}
    }
}
示例#12
0
/*
 * Dump the contig connection graph in Graphviz DOT syntax to
 * "<outfile>.scaffold.gvz".
 *
 * For every contig i (num_ctg down to 1) the list starting at
 * contig_array[i].downwardConnect is walked; connections with `deleted`
 * set are skipped.  Each remaining connection is written as
 *     C<i>_<len i> -> C<id>_<len id> [label = "<gapLen>(<inScaf>_<weight>)"];
 * where <inScaf> is 1 when the connection has prevInScaf or nextInScaf set
 * and 0 otherwise.  Connections with `mask` set additionally carry
 * ", color = red".
 *
 * outfile: output prefix.  The file name is built in a 256-byte buffer; a
 *          prefix that does not fit is reported on stderr and nothing is
 *          written (the original sprintf would have overrun the buffer).
 */
void output_cntGVZ ( char * outfile )
{
	char name[256];
	FILE * fp;
	unsigned int i;
	int written, inScaf;
	CONNECT * connect;
	written = snprintf ( name, sizeof ( name ), "%s.scaffold.gvz", outfile );

	if ( written < 0 || written >= ( int ) sizeof ( name ) )
	{
		fprintf ( stderr, "output_cntGVZ: output prefix is too long, %s.scaffold.gvz is not written.\n", outfile );
		return;
	}

	fp = ckopen ( name, "w" );
	fprintf ( fp, "digraph G{\n" );
	fprintf ( fp, "\tsize=\"512,512\";\n" );

	for ( i = num_ctg; i > 0; i-- )
	{
		for ( connect = contig_array[i].downwardConnect; connect; connect = connect->next )
		{
			if ( connect->deleted )
			{
				continue;
			}

			inScaf = ( connect->prevInScaf || connect->nextInScaf ) ? 1 : 0;
			/* One format for both cases; masked connections only add the
			   colour attribute.  i is unsigned, hence %u. */
			fprintf ( fp, "\tC%u_%d -> C%d_%d [label = \"%d(%d_%d)\"%s];\n", i, contig_array[i].length, connect->contigID, contig_array[connect->contigID].length,
			          connect->gapLen, inScaf, connect->weight, connect->mask ? ", color = red" : "" );
		}
	}

	fprintf ( fp, "}\n" );
	fclose ( fp );
}
示例#13
0
/*
 * Write the vertex file and the basic pre-graph statistics.
 *
 * 1. "<outfile>.vertex": every non-null entry of every KmerSets[i]
 *    (i < thrd_num) is passed to output1vt(); a final newline is added and
 *    the value of outvCounter is reported on stdout.
 * 2. "<outfile>.preGraphBasic": records outvCounter, overlaplen, num_ed,
 *    maxReadLen4all, minReadLen and maxNameLen.
 *
 * The iteration uses and advances each set's own iter_ptr field.
 *
 * outfile: output prefix.  Both file names are built in a 256-byte buffer
 *          with snprintf; when a name does not fit, the problem is reported
 *          on stderr and the function returns without writing that file
 *          (the original sprintf would have overrun the buffer).
 */
void output_vertex ( char * outfile )
{
	char    temp[256];
	FILE * fp;
	int i, written;
	kmer_t * node;
	KmerSet * set;
	written = snprintf ( temp, sizeof ( temp ), "%s.vertex", outfile );

	if ( written < 0 || written >= ( int ) sizeof ( temp ) )
	{
		fprintf ( stderr, "output_vertex: output prefix is too long, %s.vertex is not written.\n", outfile );
		return;
	}

	fp = ckopen ( temp, "w" );

	for ( i = 0; i < thrd_num; i++ )
	{
		set = KmerSets[i];
		set->iter_ptr = 0;

		while ( set->iter_ptr < set->size )
		{
			if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
			{
				node = set->array + set->iter_ptr;
				output1vt ( node, fp );
			}

			set->iter_ptr ++;
		}
	}

	fprintf ( fp, "\n" );
	printf ( "%d vertex outputed\n", outvCounter );
	fclose ( fp );
	written = snprintf ( temp, sizeof ( temp ), "%s.preGraphBasic", outfile );

	if ( written < 0 || written >= ( int ) sizeof ( temp ) )
	{
		fprintf ( stderr, "output_vertex: output prefix is too long, %s.preGraphBasic is not written.\n", outfile );
		return;
	}

	fp = ckopen ( temp, "w" );
	fprintf ( fp, "VERTEX %d K %d\n", outvCounter, overlaplen );
	fprintf ( fp, "\nEDGEs %d\n", num_ed );
	fprintf ( fp, "\nMaxReadLen %d MinReadLen %d MaxNameLen %d\n", maxReadLen4all, minReadLen, maxNameLen );
	fclose ( fp );
}
示例#14
0
文件: species.c 项目: ma-compbio/RACA
/*
 * Read the "tree2" setting from `configfile` into Treestr2.
 *
 * The first line that starts with '>' and contains "tree2" marks the
 * section; the first whitespace-delimited token of the line after it is
 * copied into Treestr2.  fatalf() is called when that line holds no token,
 * or when Treestr2 is still empty once the file has been processed.
 */
void get_treestr2(char *configfile) {
	char line[500];
	FILE *cfg = ckopen(configfile, "r");

	for (;;) {
		if (fgets(line, sizeof line, cfg) == NULL)
			break;
		if (line[0] != '>' || strstr(line, "tree2") == NULL)
			continue;
		/* The value sits on the line right after the section header. */
		if (fgets(line, sizeof line, cfg) != NULL && sscanf(line, "%s", Treestr2) != 1)
			fatalf("missing tree string in config file.");
		break;
	}
	fclose(cfg);

	if (Treestr2[0] == '\0')
		fatalf("missing tree string in config file.");
}
示例#15
0
文件: species.c 项目: ma-compbio/RACA
/*
 * Read the "numchr" setting from `configfile` into HSACHR.
 *
 * Scanning stops at the first line that starts with '>' and contains
 * "numchr"; the line after it is parsed as a decimal integer.  fatalf() is
 * called when that line is not a number, or when HSACHR is 0 after the
 * file has been processed.
 */
void get_numchr(char *configfile) {
	char line[500];
	int section_seen = 0;
	FILE *cfg = ckopen(configfile, "r");

	while (!section_seen && fgets(line, sizeof line, cfg) != NULL) {
		section_seen = (line[0] == '>' && strstr(line, "numchr") != NULL);
		if (!section_seen)
			continue;
		if (fgets(line, sizeof line, cfg) != NULL && sscanf(line, "%d", &HSACHR) != 1)
			fatalf("missing numchr string in config file.");
	}
	fclose(cfg);

	if (HSACHR == 0)
		fatalf("missing numchr string in config file.");
}
示例#16
0
文件: species.c 项目: ma-compbio/RACA
/*
 * Read the "resolution" setting from `configfile` into MINLEN.
 *
 * The line following the first '>' header that contains "resolution" is
 * parsed as a decimal integer.  fatalf() is called when that line is not a
 * number, or when MINLEN is 0 after the file has been processed.
 */
void get_minlen(char *configfile) {
	char line[500];
	char *got;
	FILE *cfg;

	cfg = ckopen(configfile, "r");
	while ((got = fgets(line, sizeof line, cfg)) != NULL) {
		if (line[0] != '>')
			continue;
		if (strstr(line, "resolution") == NULL)
			continue;

		/* Section header found: the value is on the next line. */
		got = fgets(line, sizeof line, cfg);
		if (got != NULL && sscanf(line, "%d", &MINLEN) != 1)
			fatalf("missing resolution string in config file.");
		break;
	}
	fclose(cfg);

	if (MINLEN == 0)
		fatalf("missing resolution string in config file.");
}
示例#17
0
文件: species.c 项目: ma-compbio/RACA
/*
 * Read the "netdir" setting from `configfile` into Netdir.
 *
 * The first whitespace-delimited token on the line following the first
 * '>' header that contains "netdir" is copied into Netdir.  fatalf() is
 * called when that line holds no token, or when Netdir is still empty once
 * the file has been processed.
 */
void get_netdir(char *configfile) {
	char line[500];
	int is_header;
	FILE *cfg = ckopen(configfile, "r");

	while (fgets(line, sizeof line, cfg)) {
		is_header = (line[0] == '>') && (strstr(line, "netdir") != NULL);
		if (is_header) {
			if (fgets(line, sizeof line, cfg)) {
				if (sscanf(line, "%s", Netdir) != 1)
					fatalf("missing netdir string in config file.");
			}
			break;
		}
	}
	fclose(cfg);

	if (Netdir[0] == '\0')
		fatalf("missing netdir string in config file.");
}
示例#18
0
文件: seq.c 项目: cestmoi7/AGAPE
/*
 * Allocate a SEQ for `fname` and open its file.
 *
 * `fname` is split by parse_fname() into the file name, start position,
 * length and mask name stored in the new SEQ; a return of -1 from
 * parse_fname() is reported through fatalf().  parse_fname()'s result is
 * passed through check_flags() to become the SEQ's flags.  The file is
 * opened in binary mode and the count/offset fields start at 0.
 *
 * Returns the new SEQ, obtained from ckallocz().
 */
SEQ* seq_open_type(const char *fname, int type)
{
	const int extra_flags = 0;
	int parsed;
	SEQ *seq;

	seq = ckallocz(sizeof(SEQ));
	parsed = parse_fname(fname,
		&seq->fname, &seq->from, &seq->slen, &seq->maskname);
	if (parsed == -1)
		fatalf("improper positions specification: %s", fname);

	seq->type = type;
	seq->flags = check_flags(parsed | extra_flags);
	seq->fp = ckopen(seq->fname, "rb");
	seq->offset = 0;
	seq->count = 0;

	return seq;
}
示例#19
0
文件: species.c 项目: ma-compbio/RACA
/*
 * Read the "chaindir" setting from `configfile` into Chaindir.
 *
 * Lines starting with '#' and empty lines are skipped while searching for
 * the first '>' header that contains "chaindir".  The first
 * whitespace-delimited token of the line after that header is copied into
 * Chaindir.  fatalf() is called when that line holds no token, or when
 * Chaindir is still empty once the file has been processed.
 */
void get_chaindir(char *configfile) {
	char line[500];
	FILE *cfg;

	cfg = ckopen(configfile, "r");
	for (;;) {
		if (!fgets(line, sizeof line, cfg))
			break;

		switch (line[0]) {
		case '#':
		case '\n':
			/* comment or empty line */
			continue;
		default:
			break;
		}

		if (line[0] != '>' || strstr(line, "chaindir") == NULL)
			continue;

		if (fgets(line, sizeof line, cfg) && sscanf(line, "%s", Chaindir) != 1)
			fatalf("missing chaindir string in config file.");
		break;
	}
	fclose(cfg);

	if (Chaindir[0] == '\0')
		fatalf("missing chaindir string in config file.");
}
示例#20
0
/*
 * Dump the contig graph in Graphviz DOT syntax to "<outfile>.contig.gvz".
 *
 * Contigs num_ctg down to 1 each produce one line
 *     V<from_vt> -> V<to_vt>[label ="<contig index>(<length>)"];
 *
 * outfile: output prefix.  The file name is built in a 256-byte buffer; a
 *          prefix that does not fit is reported on stderr and nothing is
 *          written (the original sprintf would have overrun the buffer).
 */
void output_contig_graph ( char * outfile )
{
	char name[256];
	FILE * fp;
	unsigned int i;
	int written;
	written = snprintf ( name, sizeof ( name ), "%s.contig.gvz", outfile );

	if ( written < 0 || written >= ( int ) sizeof ( name ) )
	{
		fprintf ( stderr, "output_contig_graph: output prefix is too long, %s.contig.gvz is not written.\n", outfile );
		return;
	}

	fp = ckopen ( name, "w" );
	fprintf ( fp, "digraph G{\n" );
	fprintf ( fp, "\tsize=\"512,512\";\n" );

	for ( i = num_ctg; i > 0; i-- )
	{
		/* i is unsigned, so it is printed with %u rather than %d. */
		fprintf ( fp, "\tV%d -> V%d[label =\"%u(%d)\"];\n", contig_array[i].from_vt, contig_array[i].to_vt, i, contig_array[i].length );
	}

	fprintf ( fp, "}\n" );
	fclose ( fp );
}
示例#21
0
/*************************************************
 Function:
    loadArcs
 Description:
    Loads arc information of contigs from "<graphfile>.Arc" and calculates
    the average weight of arcs.
    Each line is "from target weight [target weight ...]".  Every
    (target, weight) pair is registered with add1Arc(); pairs whose two
    contigs are both unmasked contribute to the average.

    Hardening over the previous version:
      - the file name is built with snprintf (no overrun of name[]);
      - blank lines and a trailing target without a weight no longer pass
        a NULL token to atoi();
      - tokens are also split on tab, CR and LF, so a trailing blank before
        the newline does not yield a bogus extra target;
      - the weight sum is accumulated in 64 bits so it cannot wrap.
 Input:
    1. graphfile:       prefix of graph file
 Output:
    None.
 Return:
    The average weight of arcs (0 when no arc was counted).
 *************************************************/
static unsigned int loadArcs ( char * graphfile )
{
	static const char delims[] = " \t\r\n";
	FILE * fp;
	char name[256], line[1024];
	unsigned int target, weight;
	unsigned int from_ed;
	char * seg, * weightSeg;
	unsigned int avg_weight = 0, arc_num = 0;
	unsigned long long weight_sum = 0;
	int written;
	written = snprintf ( name, sizeof ( name ), "%s.Arc", graphfile );

	if ( written < 0 || written >= ( int ) sizeof ( name ) )
	{
		name[sizeof ( name ) - 1] = '\0';
		fprintf ( stderr, "loadArcs: graph prefix is too long, arc file name truncated to %s\n", name );
	}

	fp = ckopen ( name, "r" );
	createPreArcMemManager ();
	arcCounter = 0;

	while ( fgets ( line, sizeof ( line ), fp ) != NULL )
	{
		seg = strtok ( line, delims );

		if ( seg == NULL )
		{
			continue;   /* blank line */
		}

		from_ed = atoi ( seg );

		while ( ( seg = strtok ( NULL, delims ) ) != NULL )
		{
			weightSeg = strtok ( NULL, delims );

			if ( weightSeg == NULL )
			{
				fprintf ( stderr, "loadArcs: target %s of edge %u has no weight, entry ignored.\n", seg, from_ed );
				break;
			}

			target = atoi ( seg );
			weight = atoi ( weightSeg );
			add1Arc ( from_ed, target, weight );

			if ( !contig_array[index_array[from_ed]].mask && !contig_array[index_array[target]].mask )
			{
				weight_sum += weight;
				++arc_num;
			}
		}
	}

	if ( arc_num )
	{
		avg_weight = ( unsigned int ) ( weight_sum / arc_num );
	}

	fprintf ( stderr, "%lld arc(s) loaded, average weight is %u.\n", arcCounter, avg_weight );
	fclose ( fp );
	return avg_weight;
}
示例#22
0
/*
 * Write the arcs of all edges to "<outfile>.Arc", then release
 * index_array and flag_array.
 *
 * For i in 1..num_ed the edge edge_array[flag_array[i]] is considered.  It
 * is skipped when it is deleted with length < 1, or when it has no arcs.
 * Otherwise a line starting with i is written, followed by one
 * " <index_array[to_ed]> <multiplicity>" pair per arc whose target edge is
 * not deleted with length < 1.  After every tenth pair the line is broken
 * and restarted with i.
 *
 * index_array and flag_array are freed before returning, whether or not
 * the file could be written, so they must not be used afterwards.
 *
 * outfile: output prefix.  The file name is built in a 256-byte buffer; a
 *          prefix that does not fit is reported on stderr and no file is
 *          written (the original sprintf would have overrun the buffer).
 */
void output_heavyArcs (char *outfile) 
{
	unsigned int i, j;
	char name[256];
	int written;

	FILE * outfp = NULL;
	ARC * parc;
	written = snprintf (name, sizeof (name), "%s.Arc", outfile);
	if (written < 0 || written >= (int) sizeof (name))
	{
		fprintf (stderr, "output_heavyArcs: output prefix is too long, %s.Arc is not written.\n", outfile);
	}
	else
	{
		outfp = ckopen (name, "w");
	}
	for (i = 1; outfp && i <= num_ed; i++)
	{
		if(edge_array[flag_array[i]].deleted==1 && edge_array[flag_array[i]].length <1)
			continue;
		parc = edge_array[flag_array[i]].arcs;
		if (!parc)
		{
			continue;
		}
		j = 0;
		fprintf (outfp, "%u", i);

		for (; parc; parc = parc->next)
		{
			if(edge_array[parc->to_ed].deleted==1 && edge_array[parc->to_ed].length <1)
			{
				continue;
			}
			fprintf (outfp, " %u %u", index_array[parc->to_ed], parc->multiplicity);
			/* Start a continuation line after every tenth pair. */
			if ((++j) % 10 == 0)
			{
				fprintf (outfp, "\n%u", i);
			}
		}
		fprintf (outfp, "\n");
	}
	if (outfp)
	{
		fclose (outfp);
	}
	free ((void*) index_array);
	free ((void*) flag_array);
}
示例#23
0
/*
 * Read a configuration file made of "IL" and "RC" records.
 *
 *   IL <library> <min> <max>   insert-length range of a library; stored in
 *                              `insertlengths` under the library name as a
 *                              newly allocated int32_t[2] {min, max}.
 *   RC <chrom> <mean>          mean coverage of a chromosome; stored in
 *                              averagecoverage[] at the index that
 *                              must_find_hashtable_int() returns for the
 *                              chromosome name in `id2chroms`.
 *
 * RC records are only recognised when `averagecoverage` is non-NULL.  Any
 * other line, or a record that cannot be parsed, ends in fatalf().
 *
 * The name fields are read with a width limit so that an arbitrarily long
 * line can no longer overrun the 128-byte name buffers; a name that does
 * not fit is rejected through the existing parse errors.
 *
 * NOTE(review): when `averagecoverage` is NULL an "RC" line falls through
 * to the "unknown tag" error -- confirm that this is intended rather than
 * the line being skipped.
 */
void read_configuration(const char* const filename, 
                        hashtable* const id2chroms,
                        hashtable* insertlengths,
                        uint* const averagecoverage)
{
    size_t n = 1;
    char* fptr = ckalloc(n + 1);

    char rgname[128]; // name of the library
    uint minsize; // minimum size in a proper pair
    uint maxsize; // maximum size in a proper pair

    char chrom[128]; // chromosome
    uint meancov;   // mean coverage on that chromosome

    int used; // offset just past the name token, recorded by %n

    FILE* fp = ckopen(filename,"r");
    while(getline(&fptr, &n, fp) != -1){
        used = 0;
        if(strncmp(fptr, "IL", 2) == 0){
            // a name cut short by the width limit is not followed by a blank
            if(sscanf(fptr, "IL %127s%n %u %u\n", rgname, &used, &minsize, &maxsize) != 3 ||
               (fptr[used] != ' ' && fptr[used] != '\t')){
                fatalf("error in reading the insert length range: %s", fptr);
            }
            int32_t* range = ckalloc(2*sizeof(int32_t));
            range[0] = minsize;
            range[1] = maxsize;
            add_hashtable(insertlengths,rgname,strlen(rgname),range);
        } else if(averagecoverage && (strncmp(fptr, "RC", 2) == 0)){
            if(sscanf(fptr, "RC %127s%n %u\n", chrom, &used, &meancov) != 2 ||
               (fptr[used] != ' ' && fptr[used] != '\t')){
                fatalf("error in reading the mean coverage: %s", fptr);
            }
            averagecoverage[must_find_hashtable_int(id2chroms, 
                                                    chrom, 
                                                    strlen(chrom))] = meancov;
        } else {
            fatalf("unknown tag in configuration: %s", fptr);
        }
    }
    
    ckfree(fptr);
    fclose(fp);
}
示例#24
0
文件: maf.c 项目: gtsong/CHAP2
/*
 * Open a MAF file and return a freshly allocated handle for it.
 *
 * The first line must match "##maf version=<int>"; an empty file or a
 * different first line is reported through fatalf().  When the header
 * contains "scoring=", the remainder of the line after that key is copied
 * into the handle's `scoring` field; otherwise the field is NULL.  The
 * handle also receives a copy of `fileName`, the `verbose` flag, an empty
 * alignment list and a line counter of 0.
 */
struct mafFile *mafOpen(char *fileName, int verbose) {
	char header[500];
	char *key;
	struct mafFile *maf;

	maf = ckalloc(sizeof(struct mafFile));
	maf->next = NULL;
	maf->fp = ckopen(fileName, "r");

	if (fgets(header, sizeof header, maf->fp) == NULL)
		fatalf("empty file %s", fileName);
	if (sscanf(header, "##maf version=%d", &(maf->version)) != 1)
		fatalf("improper maf header line: %s", header);

	key = strstr(header, "scoring=");
	/* 8 == strlen("scoring="): copy only the value part. */
	maf->scoring = (key != NULL) ? copy_string(key + 8) : NULL;

	maf->alignments = NULL;
	maf->fileName = copy_string(fileName);
	maf->line_nbr = 0;
	maf->verbose = verbose;
	return maf;
}
示例#25
0
/*
 * Rebuild an alignment edit script in `sapp` from the records of
 * `filename`.
 *
 * Lines starting with 'a' are skipped.  From every other line the 5th to
 * 8th whitespace-separated fields are read as pos11, pos12, pos21, pos22.
 * The cursors i (first sequence), j (second sequence) and k (index into
 * sapp) advance in lock-step until i reaches pos11 or j reaches pos21;
 * those sapp entries are left untouched.  Then
 *   - if i == pos11: j jumps to pos22 and sapp[k++] = pos22 - pos21 + 1;
 *   - else if j == pos21: i jumps to pos12 and
 *     sapp[k++] = -(pos12 - pos11 + 1).
 * After the last record the cursors advance in lock-step to the sequence
 * ends; if they do not end exactly at (len1, len2) a diagnostic is printed
 * and the program exits.
 *
 * Lines from which the four positions cannot be parsed are now skipped;
 * previously they were processed with uninitialised or stale positions.
 *
 * seq1, seq2 are not used by this function.
 *
 * NOTE(review): nothing here bounds k against the size of sapp -- the
 * caller must provide a large enough array.
 * NOTE(review): the mismatch path ends with exit(0), i.e. a success status;
 * kept as-is because callers may depend on it -- confirm.
 */
void getaln(char *seq1, char *seq2, int len1, int len2, int *sapp, char *filename)
{
	int	i, j, k;
	int	pos11, pos12, pos21, pos22;
	FILE	*fp;
	char	str[500];

	i = j = k = 0;
	fp = ckopen(filename, "r");
	while(fgets(str, 490, fp))	{
		if(str[0] == 'a')	continue;
		if(sscanf(str, "%*s%*s%*s%*s%d%d%d%d", &pos11, &pos12, &pos21, &pos22) != 4)
			continue;
		while(i < pos11 && j < pos21)	{
			k ++;
			i ++;
			j ++; 
		}
		if(i == pos11)	{
			j = pos22;
			sapp[k ++] = pos22 - pos21 + 1;	
		} else if(j == pos21)	{
			i = pos12;
			sapp[k ++] = -(pos12 - pos11 + 1);
		}
	}
	fclose(fp);
	while(i < len1 && j < len2)	{
		k ++;
		i ++;
		j ++; 
	}
	if(i != len1 || j != len2)	{
		printf("File %s len1 %d len2 %d i %d j %d\n", filename, len1, len2, i, j);
		exit(0);
	}
}
示例#26
0
/*
 * Write the arcs of all edges to "<outfile>.Arc".
 *
 * Every edge i in 1..num_ed that has at least one arc produces a line that
 * starts with i and is followed by one " <to_ed> <multiplicity>" pair per
 * arc.  After every tenth pair the line is broken and restarted with i.
 *
 * outfile: output prefix.  The file name is built in a 256-byte buffer; a
 *          prefix that does not fit is reported on stderr and nothing is
 *          written (the original sprintf would have overrun the buffer).
 */
void output_heavyArcs ( char * outfile )
{
	unsigned int i, j;
	char name[256];
	int written;
	FILE * outfp;
	ARC * parc;
	written = snprintf ( name, sizeof ( name ), "%s.Arc", outfile );

	if ( written < 0 || written >= ( int ) sizeof ( name ) )
	{
		fprintf ( stderr, "output_heavyArcs: output prefix is too long, %s.Arc is not written.\n", outfile );
		return;
	}

	outfp = ckopen ( name, "w" );

	for ( i = 1; i <= num_ed; i++ )
	{
		parc = edge_array[i].arcs;

		if ( !parc )
			{ continue; }

		j = 0;
		fprintf ( outfp, "%u", i );

		for ( ; parc; parc = parc->next )
		{
			fprintf ( outfp, " %u %u", parc->to_ed, parc->multiplicity );

			/* Start a continuation line after every tenth pair. */
			if ( ( ++j ) % 10 == 0 )
			{
				fprintf ( outfp, "\n%u", i );
			}
		}

		fprintf ( outfp, "\n" );
	}

	fclose ( outfp );
}
示例#27
0
文件: read_maf.c 项目: gtsong/CHAP2
/*
 * Read pairwise alignment blocks from the MAF file `fname` into `algns`.
 *
 * The file must start with a "##maf" line.  Each block is expected to be
 * an 'a' line, two sequence lines and one empty line.  Blocks that pass
 * the mode-dependent filters below are appended to algns[]; the number of
 * stored blocks is returned in *num_algns.  *size1 and *size2 receive the
 * source lengths parsed from the last block read (0 if none was read).
 * Format violations are reported through fatalf()/fatal().
 *
 * The lines are read into the buffers S and T, which are not declared in
 * this function.
 *
 * NOTE(review): the %s conversions below are unbounded, so an over-long
 * token in the file can overrun token[50], strand/len1/len2[100] or
 * name1/name2[LEN_NAME].
 * NOTE(review): `count` is never checked against the capacity of algns[].
 */
void read_maf(char *fname, int mode, struct DotList *algns, int *num_algns, int *size1, int *size2) {
	FILE *fp;
	char *status;
	int i = 0;
	int count = 0;
	int temp;
	int a_pid;
	int b1, e1, b2, e2;
	char strand[100], len1[100], len2[100];
	char *s, *t;
	int algn_type = SELF1 - 1;
	int j = 0;
	int srcblock = -1;
	char token[50];
	char name1[LEN_NAME], name2[LEN_NAME];

	strcpy(name1, "");
	strcpy(name2, "");
	strcpy(len1, "0");
	strcpy(len2, "0");
	strcpy(strand, "+");
	strcpy(token, "");

	fp = ckopen(fname, "r");
	if (((status = fgets(S, BIG, fp)) == NULL) || strncmp(S, "##maf", 5))
		fatalf("%s is not a maf file", fname);
/*
	while (S[0] == '#')
		if ((status = fgets(S, BIG, fp)) == NULL)
			fatalf("no alignments in %s", fname);
*/

	/* Process blocks until end of file or a line containing "eof". */
	while ((status != NULL) && (strstr(S, "eof") == NULL)) {
		if(S[0] == '#') {
			/* In C_MODE and S_MODE every "##maf" header line advances
			   algn_type by one; in all modes '#' lines are skipped. */
			if((mode == C_MODE) || (mode == S_MODE)) {
				while((status != NULL) && (S[0] == '#')) {
					if( strncmp(S, "##maf", 5) == 0 ) algn_type++;
					status = fgets(S, BIG, fp);
				}	
				if( algn_type > PAIR ) fatal("too many alignments are combined\n");
			}
			else {
				while ((status != NULL ) && (S[0] == '#')) {
					status = fgets(S, BIG, fp);
				}
			}
			/* j restarts after every run of '#' lines. */
			j = 0;
		}

		srcblock = -1;
		if ( status == NULL ) {
		}
		else {
			if (S[0] != 'a')
				fatalf("expecting an a-line in %s, saw %s",
				  fname, S);

			/* In O_MODE the second token of the a-line is mapped to a
			   block id by cat_srcblock(). */
			if( mode == O_MODE ) {
				sscanf(S, "%*s %s", token);
				srcblock = cat_srcblock(token);
			}
	
			/* The two lines after the a-line go to S and T. */
			if ((fgets(S, BIG, fp) == NULL) || (fgets(T, BIG, fp) == NULL))
				fatalf("cannot find alignment in %s", fname);
			if ((sscanf(S, "%*s %s %d %d %*s %s", name1, &b1, &e1, len1) != 4) || (sscanf(T, "%*s %s %d %d %s %s", name2, &b2, &e2, strand, len2) != 5))
			{
				fatalf("bad alignment info of 2 in %s", fname);
			}
			// aligned interval given as base-0 start and length
			e1 += b1;
			e2 += b2;
	
			/* For strand "-" the second interval is mirrored within len2. */
			if( strcmp(strand, "-") == 0) {
				temp = b2;
				b2 = atoi(len2) - e2;
				e2 = atoi(len2) - temp;	
			}			
	
			/* Shift all four coordinates by one. */
			b1++;
			b2++;
			e1++;
			e2++;
	
			s = nucs(S);
			t = nucs(T);
			a_pid = cal_pid(s, t, strlen(s)-1);
	
			/* The three empty branches below discard the block; only the
			   final else stores it. */
			if( ((mode == D_MODE) || ((mode == C_MODE) && (algn_type <= PAIR))) && (( (algn_type != PAIR) && (b1 >= b2)) || ((algn_type != PAIR) && (abs(b1-b2) <= DEL_TH) && (abs(e1-e2) <=DEL_TH)) || ((e1-b1) < ALT_EFFEC_VALUE) || (a_pid <= PID_TH) )) {}
			else if( (mode == S_MODE) && ( algn_type != PAIR  ) ) {}
			else if( (abs(e1-b1) <= ERR_SM_TH) || (abs(e2-b2) <= ERR_SM_TH) ) {}
			else  {
				algns[count].x = assign_I(b1, e1);
				/* y is always stored with the smaller bound first. */
				if( b2 < e2 ) algns[count].y = assign_I(b2, e2);
				else algns[count].y = assign_I(e2, b2);
				algns[count].identity = a_pid;
				algns[count].m_pid = a_pid;
	
				/* sign: 0 for "+", 1 for "-", DELETED for anything else. */
				if( strcmp(strand, "+") == 0 ) {
					algns[count].sign = 0;
					algns[count].init_sign = 0;
				}	
				else if( strcmp(strand, "-") == 0 ) {
					algns[count].sign = 1;
					algns[count].init_sign = 1;
				}
				else {
					algns[count].sign = DELETED;
					algns[count].init_sign = DELETED;
				}
	
				if( mode == O_MODE ) {
					algns[count].indiv_fid = srcblock; // block id from cat_srcblock()
				}
				else {
					algns[count].indiv_fid = j; // block number since the last run of '#' lines
				}
				algns[count].fid = i; // number of this block in the file
				algns[count].index = count; // position in algns[]
 	   		algns[count].c_id = -1; // not chained alignment
 	    	algns[count].m_id = -1; // not chained alignment
 	    	algns[count].rp1_id = -1; // the inserted repeat id of the chained alignment in first seq
      	algns[count].rp2_id = -1; // the inserted repeat id of the chained alignment in second seq 
   	  	algns[count].l_id = -1;
      	algns[count].lock = -1;  
      	algns[count].m_x = assign_I(0,1);
      	algns[count].m_y = assign_I(0,1);
      	algns[count].xl_diff = 0; // the offset of the left end
      	algns[count].yl_diff = 0; // the offset of the left end
      	algns[count].xr_diff = 0; // the offset of the right end
      	algns[count].yr_diff = 0; // the offset of the right end
      	algns[count].pair_self = -1;
      	algns[count].l_pid = -1;
				if( (mode == O_MODE) || (mode == PAIR_MODE) ) {
					algns[count].sp_id = PAIR;
				}
				else {
					algns[count].sp_id = algn_type; // SELF1 for first self-alignment, SELF2 for second self-alignment and PAIR for pairwise alignment
				}
 	    	algns[count].xl_offset = 0; // the offset of low of x
      	algns[count].yl_offset = 0; // the offset of up of x
      	algns[count].xr_offset = 0; // the offset of low of y 
				if( algn_type == PAIR ) algns[count].pair_self = PAIR;
				else algns[count].pair_self = SELF;
      	strcpy(algns[count].name1, name1);  
      	strcpy(algns[count].name2, name2);  
      	algns[count].len1 = atoi(len1);  
      	algns[count].len2 = atoi(len2);  
      	algns[count].ctg_id1 = -1;  
      	algns[count].ctg_id2 = -1;  

				count++;
			}

			/* Every block must be followed by an empty line. */
			if ((fgets(S, BIG, fp) == NULL) || (S[0] != '\n'))
				fatalf("bad alignment end in %s", fname);
			status = fgets(S, BIG, fp);
			i++; // ith alignment 
			j++;
		}
	}

	*size1 = atoi(len1);
	*size2 = atoi(len2);
	*num_algns = count;
	fclose(fp);
}
示例#28
0
/*************************************************
Function:
    prlRead2HashTable
Description:
    1. Imports the reads from the lib file one by one.
    2. Chops the reads into kmers and store them in KmerSets.
    3. Removes the kmers with low coverage (only when deLowKmer is set).
    4. Marks the linear kmers.
    5. Counts the kmer frequencies (written by freqStat).

    Reads are collected in seqBuffer/lenBuffer.  Whenever the buffer is
    flushed, the worker threads are signalled with sendWorkSignal(2, ...)
    followed by sendWorkSignal(1, ...), and the read/kmer counters are
    reset.  All working buffers allocated here are freed before the
    function returns.
Input:
    1. libfile :            the reads config file
    2. outfile :        the output file prefix
Output:
    None.
Return:
    1 if exits normally.
*************************************************/
boolean prlRead2HashTable ( char * libfile, char * outfile )
{
	char * cach1;
	char * cach2;
	unsigned char asm_ctg = 1;
	long long i;
	char * next_name, name[256];
	FILE * fo;
	time_t start_t, stop_t;
	int maxReadNum;
	int libNo;
	pthread_t threads[thrd_num];
	// thrdSignal[0] is the main signal, thrdSignal[t + 1] belongs to thread t (see the paras[] setup below)
	unsigned char thrdSignal[thrd_num + 1];
	PARAMETER paras[thrd_num];
	// pairs is never changed below, so the "if ( pairs )" section further down does not run
	boolean flag, pairs = 0;
	WORDFILTER = createFilter ( overlaplen );
	maxReadLen = 0;
	maxNameLen = 256;
	scan_libInfo ( libfile );
	alloc_pe_mem ( num_libs );

	// fall back to 100 when maxReadLen is still 0 after scan_libInfo
	if ( !maxReadLen )
	{
		maxReadLen = 100;
	}

	// NOTE(review): gLineLen is not updated and a previous gStr is not released here - confirm intended
	if ( gLineLen < maxReadLen )
	{
		gStr = ( char * ) ckalloc ( ( maxReadLen + 1 ) * sizeof ( char ) );
	}

	//init
	maxReadLen4all = maxReadLen;
	fprintf ( stderr, "In %s, %d lib(s), maximum read length %d, maximum name length %d.\n\n", libfile, num_libs, maxReadLen, maxNameLen );
	next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) );
	kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
	hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) );
	prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
	// number of reads the read buffers are sized for
	// NOTE(review): the divisor is 0 when maxReadLen == overlaplen - 1 - confirm this cannot happen
	maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 );
	//printf("buffer size %d, max read len %d, max read num %d\n",buffer_size,maxReadLen,maxReadNum);
	int maxAIOSize = 32768;
	aioBuffer1 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) );
	aioBuffer2 = ( char * ) ckalloc ( ( maxAIOSize ) * sizeof ( char ) );
	readBuffer1 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) ); //(char *)ckalloc(maxAIOSize*sizeof(char)); //1024
	readBuffer2 = ( char * ) ckalloc ( ( maxAIOSize + ( maxReadLen * 4 + 1024 ) ) * sizeof ( char ) ); //1024
	cach1 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) ); //1024
	cach2 = ( char * ) ckalloc ( ( maxReadLen * 4 + 1024 ) * sizeof ( char ) ); //1024
	memset ( cach1, '\0', ( maxReadLen * 4 + 1024 ) ); //1024
	memset ( cach2, '\0', ( maxReadLen * 4 + 1024 ) ); //1024
	seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) );
	lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );
	indexArray = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) );

	for ( i = 0; i < maxReadNum; i++ )
	{
		seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
	}

	rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) );

	// set up one KmerSet, one signal slot and one rcSeq buffer per thread, then start the threads
	if ( 1 )
	{
		kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) );
		KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) );
		ubyte8 init_size = 1024;
		ubyte8 k = 0;

		// when initKmerSetSize is given, the initial set size becomes the smallest multiple of 0xFFFFFF that is >= init_size
		if ( initKmerSetSize )
		{
#ifdef MER127
			init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 40 );
#else
			init_size = ( ubyte8 ) ( ( double ) initKmerSetSize * 1024.0f * 1024.0f * 1024.0f / ( double ) thrd_num / 24 ); //is it true?
#endif

			do
			{
				++k;
			}
			while ( k * 0xFFFFFFLLU < init_size );
		}

		for ( i = 0; i < thrd_num; i++ )
		{
			//KmerSets[i] = init_kmerset(1024,0.77f);
			KmerSets[i] = init_kmerset ( ( ( initKmerSetSize ) ? ( k * 0xFFFFFFLLU ) : ( init_size ) ), 0.77f );
			thrdSignal[i + 1] = 0;
			paras[i].threadID = i;
			paras[i].mainSignal = &thrdSignal[0];
			paras[i].selfSignal = &thrdSignal[i + 1];
			kmerCounter[i + 1] = 0;
			rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
		}

		creatThrds ( threads, paras );
	}

	thrdSignal[0] = kmerCounter[0] = 0;
	time ( &start_t );
	kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0;

	// main reading loop: one iteration per input file handed out by openNextFile
	while ( openNextFile ( &libNo, pairs, asm_ctg ) )
	{
		//read bam file
		if ( lib_array[libNo].curr_type == 4 )
		{
			int type = 0;   //deside the PE reads is good or bad

			while ( ( flag = read1seqInLibBam ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), &libNo, pairs, 1, &type ) ) != 0 )
			{
				if ( type == -1 ) //if the reads is bad, go back.
				{
					i--;

					// NOTE(review): read_c may be 0 here (first read, or right after a flush), which makes lenBuffer[read_c - 1] an out-of-bounds read - confirm
					if ( lenBuffer[read_c - 1] >= overlaplen + 1 )
					{
						kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1;
						read_c--;
					}

					n_solexa -= 2;
					continue;
				}

				if ( ( ++i ) % 100000000 == 0 )
					{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

				if ( lenBuffer[read_c] < 0 )
					{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

				// reads shorter than overlaplen + 1 are not kept
				if ( lenBuffer[read_c] < overlaplen + 1 )
					{ continue; }

				/*
				   if(lenBuffer[read_c]>70)
				   lenBuffer[read_c] = 50;
				   else if(lenBuffer[read_c]>40)
				   lenBuffer[read_c] = 40;
				 */
				// indexArray records the running kmer count at which this read starts
				indexArray[read_c] = kmer_c;
				kmer_c += lenBuffer[read_c] - overlaplen + 1;
				read_c++;

				// buffer full: hand it to the worker threads and start over
				if ( read_c == maxReadNum )
				{
					kmerCounter[0] += kmer_c;
					sendWorkSignal ( 2, thrdSignal ); //chopKmer4read
					sendWorkSignal ( 1, thrdSignal ); //singleKmer
					kmer_c = read_c = 0;
				}
			}
		}
		//read PE fasta or fastq
		else if ( lib_array[libNo].curr_type == 1 || lib_array[libNo].curr_type == 2 )
		{
			initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize );
			initAIO ( &aio2, aioBuffer2, fileno ( lib_array[libNo].fp2 ), maxAIOSize );
			int offset1, offset2, flag1, flag2, rt1, rt2;
			offset1 = offset2 = 0;
			rt1 = aio_read ( &aio1 );
			rt2 = aio_read ( &aio2 );
			flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
			flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );

			if ( flag1 && flag2 )
			{
				// `turn` alternates between 1 and 2 so that reads are taken from the two files in turn
				int start1, start2, turn;
				start1 = start2 = 0;
				turn = 1;

				while ( start1 < offset1 || start2 < offset2 )
				{
					if ( turn == 1 )
					{
						turn = 2;
						readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start1, offset1, libNo );

						if ( ( ++i ) % 100000000 == 0 )
							{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

						if ( lenBuffer[read_c] < 0 )
							{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

						// short read: not kept, but readBuffer1 is still refilled once it is used up
						if ( lenBuffer[read_c] < overlaplen + 1 )
						{
							if ( start1 >= offset1 )
							{
								start1 = 0;
								offset1 = 0;
								flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
							}

							continue;
						}

						indexArray[read_c] = kmer_c;
						kmer_c += lenBuffer[read_c] - overlaplen + 1;
						read_c++;

						// readBuffer1 used up: fetch the next chunk of the first file
						if ( start1 >= offset1 )
						{
							start1 = 0;
							offset1 = 0;
							flag1 = AIORead ( &aio1, &offset1, readBuffer1, cach1, &rt1, lib_array[libNo].curr_type );
						}

						if ( read_c == maxReadNum )
						{
							kmerCounter[0] += kmer_c;
							sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
							sendWorkSignal ( 1, thrdSignal );   //singleKmer
							kmer_c = read_c = 0;
						}

						continue;
					}

					if ( turn == 2 )
					{
						turn = 1;
						readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer2, &start2, offset2, libNo );

						if ( ( ++i ) % 100000000 == 0 )
							{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

						if ( lenBuffer[read_c] < 0 )
							{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

						if ( lenBuffer[read_c] < overlaplen + 1 )
						{
							// the loop ends once flag2 is 2 and readBuffer2 is used up
							// (presumably 2 marks the last chunk of the file - TODO confirm against AIORead)
							if ( ( flag2 == 2 ) && ( start2 >= offset2 ) )
								{ break; }

							if ( start2 >= offset2 )
							{
								start2 = 0;
								offset2 = 0;
								flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );
							}

							continue;
						}

						indexArray[read_c] = kmer_c;
						kmer_c += lenBuffer[read_c] - overlaplen + 1;
						read_c++;

						if ( ( flag2 == 2 ) && ( start2 >= offset2 ) )
							{ break; }

						if ( start2 >= offset2 )
						{
							start2 = 0;
							offset2 = 0;
							flag2 = AIORead ( &aio2, &offset2, readBuffer2, cach2, &rt2, lib_array[libNo].curr_type );
						}

						if ( read_c == maxReadNum )
						{
							kmerCounter[0] += kmer_c;
							sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
							sendWorkSignal ( 1, thrdSignal );   //singleKmer
							kmer_c = read_c = 0;
						}

						continue;
					}
				}
			}
			else
			{
				fprintf(stderr, "Error: aio_read error.\n");
			}
		}
		//read single fasta, single fastq and PE fasta in one file
		else
		{
			initAIO ( &aio1, aioBuffer1, fileno ( lib_array[libNo].fp1 ), maxAIOSize );
			int offset, flag1, rt;
			offset = 0;
			rt = aio_read ( &aio1 );

			while ( ( flag1 = AIORead ( &aio1, &offset, readBuffer1, cach1, &rt, lib_array[libNo].curr_type ) ) )
			{
				int start = 0;

				// take every read out of the chunk that was just fetched
				while ( start < offset )
				{
					readseqInLib ( seqBuffer[read_c], next_name, & ( lenBuffer[read_c] ), readBuffer1, &start, offset, libNo );

					if ( ( ++i ) % 100000000 == 0 )
						{ fprintf ( stderr, "--- %lldth reads.\n", i ); }

					if ( lenBuffer[read_c] < 0 )
						{ fprintf ( stderr, "Read len %d.\n", lenBuffer[read_c] ); }

					if ( lenBuffer[read_c] < overlaplen + 1 )
						{ continue; }

					indexArray[read_c] = kmer_c;
					kmer_c += lenBuffer[read_c] - overlaplen + 1;
					read_c++;
				}

				// flush once fewer than 1024 free slots remain; read_c is not checked inside the loop above
				// NOTE(review): this relies on one chunk never yielding more than 1024 kept reads - confirm
				if ( read_c > maxReadNum - 1024 )
				{
					kmerCounter[0] += kmer_c;
					sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
					sendWorkSignal ( 1, thrdSignal );   //singleKmer
					kmer_c = read_c = 0;
				}

				if ( flag1 == 2 )
					{ break; }
			}
		}
	}

	// process the reads that are still buffered after the last file
	if ( read_c )
	{
		kmerCounter[0] += kmer_c;
		sendWorkSignal ( 2, thrdSignal );   //chopKmer4read
		sendWorkSignal ( 1, thrdSignal );   //singleKmer
	}

	time ( &stop_t );
	fprintf ( stderr, "Time spent on hashing reads: %ds, %lld read(s) processed.\n", ( int ) ( stop_t - start_t ), i );

	//record insert size info
	if ( pairs )
	{
		if ( gradsCounter )
			{ fprintf ( stderr, "%d pe insert size, the largest boundary is %lld.\n\n", gradsCounter, pes[gradsCounter - 1].PE_bound ); }
		else
		{
			fprintf ( stderr, "No paired reads found.\n" );
		}

		// NOTE(review): sprintf into name[256] is unbounded - a long outfile prefix would overrun it
		sprintf ( name, "%s.peGrads", outfile );
		fo = ckopen ( name, "w" );
		fprintf ( fo, "grads&num: %d\t%lld\n", gradsCounter, n_solexa );

		for ( i = 0; i < gradsCounter; i++ )
		{
			fprintf ( fo, "%d\t%lld\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank );
		}

		fclose ( fo );
	}

	free_pe_mem ();
	free_libs ();

	// sum up the per-thread statistics and release the per-thread rcSeq buffers
	if ( 1 )
	{
		unsigned long long alloCounter = 0;
		unsigned long long allKmerCounter = 0;

		for ( i = 0; i < thrd_num; i++ )
		{
			alloCounter += count_kmerset ( ( KmerSets[i] ) );
			allKmerCounter += kmerCounter[i + 1];
			free ( ( void * ) rcSeq[i + 1] );
		}

		fprintf ( stderr, "%lli node(s) allocated, %lli kmer(s) in reads, %lli kmer(s) processed.\n", alloCounter, kmerCounter[0], allKmerCounter );
	}

	free ( ( void * ) rcSeq );
	free ( ( void * ) kmerCounter );

	for ( i = 0; i < maxReadNum; i++ )
	{
		free ( ( void * ) seqBuffer[i] );
	}

	free ( ( void * ) seqBuffer );
	free ( ( void * ) lenBuffer );
	free ( ( void * ) indexArray );
	free ( ( void * ) kmerBuffer );
	free ( ( void * ) hashBanBuffer );
	free ( ( void * ) nextcBuffer );
	free ( ( void * ) prevcBuffer );
	free ( ( void * ) next_name );
	free ( ( void * ) aioBuffer1 );
	free ( ( void * ) aioBuffer2 );
	free ( ( void * ) readBuffer1 );
	free ( ( void * ) readBuffer2 );
	free ( ( void * ) cach1 );
	free ( ( void * ) cach2 );
	fprintf ( stderr, "done hashing nodes\n" );

	// optional low-coverage removal
	if ( deLowKmer )
	{
		time ( &start_t );
		deLowCov ( thrdSignal );
		time ( &stop_t );
		fprintf ( stderr, "Time spent on delowcvgNode: %ds.\n", ( int ) ( stop_t - start_t ) );
	}

	time ( &start_t );
	Mark1in1outNode ( thrdSignal );
	freqStat ( outfile );
	time ( &stop_t );
	fprintf ( stderr, "Time spent on marking linear nodes: %ds.\n", ( int ) ( stop_t - start_t ) );
	// tell the worker threads to exit and wait for them
	sendWorkSignal ( 3, thrdSignal );   //exit
	thread_wait ( threads );
	return 1;
}
示例#29
0
/*
 * Write the contigs of an edge array to disk and report length statistics.
 *
 * Produces two files:
 *   "<outfile>.contig"       - one record per live edge, via output_1contig()
 *   "<outfile>.ContigIndex"  - index / length / twin-orientation table
 * and prints the number of long contigs, their total and average length,
 * the longest one, N50 and N90 to stdout.
 *
 * ed_array : edge array, 1-based; entries 1..ed_num are sorted in place
 *            with cmp_edge before anything is written
 * ed_num   : number of edges in ed_array
 * outfile  : prefix used to build both output file names
 * cut_len  : only echoed in the final summary line; it does not filter
 *            the contigs that are written
 *
 * Uses the globals overlaplen, len_bar, kmerSeq, num_ed and edge_array.
 */
void output_contig ( EDGE * ed_array, unsigned int ed_num, char * outfile, int cut_len )
{
	char    temp[256];
	FILE * fp, *fp_contig;
	int flag, count, len_c;
	int signI;
	unsigned int i;
	long long sum = 0, N90, N50;
	unsigned int * length_array;
	boolean tip;

	/* snprintf: a long outfile prefix must not overrun temp[] */
	snprintf ( temp, sizeof ( temp ), "%s.contig", outfile );
	fp = ckopen ( temp, "w" );
	qsort ( &ed_array[1], ed_num, sizeof ( EDGE ), cmp_edge );
	length_array = ( unsigned int * ) ckalloc ( ed_num * sizeof ( unsigned int ) );
	/* scratch buffer in a global; NOTE(review): presumably consumed by output_1contig - confirm */
	kmerSeq = ( char * ) ckalloc ( overlaplen * sizeof ( char ) );
	//first scan for number counting
	count = len_c = 0;

	for ( i = 1; i <= ed_num; i++ )
	{
		/* the length is recorded before the deleted/empty test below, so
		 * such edges still take part in the statistics when long enough */
		if ( ( ed_array[i].length + overlaplen ) >= len_bar )
			{ length_array[len_c++] = ed_array[i].length + overlaplen; }

		if ( ed_array[i].length < 1 || ed_array[i].deleted )
			{ continue; }

		count++;

		/* NOTE(review): presumably the twin edge sits at i + 1 and is skipped - confirm */
		if ( EdSmallerThanTwin ( i ) )
			{ i++; }
	}

	sum = 0;

	for ( signI = len_c - 1; signI >= 0; signI-- )
		{ sum += length_array[signI]; }

	/*
	 * All length statistics need at least one recorded contig. Without
	 * this guard length_array[len_c - 1] would read index -1 whenever no
	 * contig reaches len_bar.
	 */
	if ( len_c > 0 )
	{
		printf ( "%d ctgs longer than %d, sum up %lldbp, with average length %lld\n", len_c, len_bar, sum, sum / len_c );
		/* the code below treats the last element as the longest, i.e. it
		 * relies on cmp_int ordering the array ascending */
		qsort ( length_array, len_c, sizeof ( length_array[0] ), cmp_int );
		printf ( "the longest is %ubp, ", length_array[len_c - 1] );
		N50 = sum * 0.5;
		N90 = sum * 0.9;
		sum = flag = 0;

		/* walk from longest to shortest until the running total crosses
		 * 50% and then 90% of the overall length */
		for ( signI = len_c - 1; signI >= 0; signI-- )
		{
			sum += length_array[signI];

			if ( !flag && sum >= N50 )
			{
				printf ( "contig N50 is %u bp,", length_array[signI] );
				flag = 1;
			}

			if ( sum >= N90 )
			{
				printf ( "contig N90 is %u bp\n", length_array[signI] );
				break;
			}
		}
	}

	/* second scan: emit every live edge once */
	for ( i = 1; i <= ed_num; i++ )
	{
		if ( ed_array[i].deleted || ed_array[i].length < 1 )
			{ continue; }

		/* an edge is flagged as a tip unless both it and its twin have arcs */
		if ( ed_array[i].arcs && ed_array[getTwinEdge ( i )].arcs )
			{ tip = 0; }
		else
			{ tip = 1; }

		output_1contig ( i, & ( ed_array[i] ), fp, tip );

		if ( EdSmallerThanTwin ( i ) )
			{ i++; }
	}

	fclose ( fp );
	free ( ( void * ) kmerSeq );
	free ( ( void * ) length_array );
	printf ( "%d contigs longer than %d output\n", count, cut_len );
	snprintf ( temp, sizeof ( temp ), "%s.ContigIndex", outfile );
	fp_contig = ckopen ( temp, "w" );
	fprintf ( fp_contig, "Edge_num %u %d\n", ed_num, count );
	fprintf ( fp_contig, "index\tlength\treverseComplement\n" );

	/*
	 * NOTE(review): this loop walks the globals num_ed / edge_array rather
	 * than the ed_num / ed_array parameters used everywhere above. Left
	 * unchanged; confirm callers always pass the globals.
	 */
	for ( i = 1; i <= num_ed; i++ )
	{
		fprintf ( fp_contig, "%u\t%d\t", i, edge_array[i].length + overlaplen );

		if ( EdSmallerThanTwin ( i ) )
		{
			fprintf ( fp_contig, "1\n" );
			i++;
		}
		else if ( EdLargerThanTwin ( i ) )
			{ fprintf ( fp_contig, "-1\n" ); }
		else
			{ fprintf ( fp_contig, "0\n" ); }
	}

	fclose ( fp_contig );
}
示例#30
0
int main(int argc, char *argv[])
{
	SEQ *sf;
	uchar *s;
	FILE *f;
	char buf[10000];
	char head[MAX_LEN];
	char cur[LEN_NAME], chr_name[LEN_NAME], annot[LEN_NAME], gname[LEN_NAME], filter[LEN_NAME];
	int gid = -1;
	int rid = -1;
	int i = 0;
	int b = 0, e = 1, num_cds = 0;
	char dir[3];
	struct exons_list *exons;
	char annot_name[LEN_NAME];
	float qual = (float)0;
	char ref[LEN_NAME], alt[LEN_NAME];
	int rest = 0;
	char codon[4], alt_codon[4];
	char aa1 = '\0', aa2 = '\0';
	int num_rmsk = 0;
	struct exons_list *rmsk;
	int num_snps = 0, num_pass = 0, num_filter = 0, num_coding1 = 0, num_syn1 = 0, num_non1 = 0, num_repeats1 = 0, num_coding_repeats1 = 0;
	int num_coding = 0, num_syn = 0, num_non = 0, num_repeats = 0, num_coding_repeats = 0;
	bool is_num_print = false;

	strcpy(buf, "");
	strcpy(head, "");
	strcpy(cur, "");
	strcpy(chr_name, "");
	strcpy(annot, "");
	strcpy(gname, "");
	strcpy(annot_name, "");
	strcpy(ref, "");
	strcpy(alt, "");
	strcpy(codon, "");
	strcpy(alt_codon, "");
	strcpy(dir, "");
	codon[3] = '\0';
	alt_codon[3] = '\0';

	if( argc != 7 ) {
		printf("link_to_annot vcf_file gff_file seq_file annot_type(exon, gene, ...) rmsk_file print_mode(NUM or SITES)\n");
		return EXIT_FAILURE;
	}
	else {
		if(!(f = ckopen(argv[2], "r"))) {
			printf("no file %s exists\n", argv[2]);
			return EXIT_FAILURE;
		}

		strcpy(annot_name, argv[4]);
		if( strcmp(annot_name, "exon") != 0 ) {
			fatalf("seq file is required only when the annot type is exon, but %s here\n", annot_name);
		}
		sf = seq_get(argv[3]);
		s = SEQ_CHARS(sf) - 1;
		if( strcmp(argv[6], "NUM") == 0 ) {
			is_num_print = true;
		}
		else if( strcmp(argv[6], "SITES") == 0 ) {
			is_num_print = false;
		}
		else {
			fatalf("unsupported print option: %s\n", argv[6]);
		}
	}

  compl['a'] = compl['A'] = 'T';
  compl['c'] = compl['C'] = 'G';
  compl['g'] = compl['G'] = 'C';
  compl['t'] = compl['T'] = 'A';

	while(fgets(buf, 10000, f))
	{
		if( (buf[0] == '#') || (buf[0] == '>') ) {}
		else if( sscanf(buf, "%*s %*s %s %d %d %*s", annot, &b, &e) != 3 ) {
			fatalf("line in wrong gff format: %s\n", buf);
		}
		else {
			if( strcmp(annot, annot_name) == 0 ) {
				num_cds++;
			}
		}
	}

	if( num_cds > 0 ) exons = (struct exons_list *) ckalloc(num_cds * sizeof(struct exons_list));

	initialize_exons_list(exons, 0, num_cds);

	fseek(f, 0, SEEK_SET);

	i = 0;
	
	while(fgets(buf, 10000, f))
	{
		if( (buf[0] == '#') || (buf[0] == '>') ) {}
		else if( sscanf(buf, "%s %*s %s %d %d %*s %s %*s %s", chr_name, annot, &b, &e, dir, cur) != 6 ) {
			fatalf("line in wrong gff format: %s\n", buf);
		}
		else {
			if( strcmp(annot, annot_name) == 0 ) {
				get_gene_name(cur, gname);
				strcpy(exons[i].name, gname);
				exons[i].reg = assign_I(b, e);
				exons[i].dir = dir[0];
				strcpy(exons[i].chr, chr_name);
				i++;
			}	
		}
	}

	if( i != num_cds ) {
		fatalf("%s counting error: %d - %d\n", annot_name, num_cds, i);
	}
	fclose(f);

	if(!(f = ckopen(argv[5], "r"))) {
		fatalf("%s file not found\n", argv[5]);
	}

	rmsk = 0;
	while(fgets(buf, 10000, f))
	{
		if( (buf[0] == '#') || (buf[0] == '>') ) {}
		else if( sscanf(buf, "%*s %*s %s %d %d %*s", annot, &b, &e) != 3 ) {
			fatalf("line in wrong gff format: %s\n", buf);
		}
		else {
			num_rmsk++;
		}
	}

	if( num_rmsk > 0 ) rmsk = (struct exons_list *) ckalloc(num_rmsk * sizeof(struct exons_list));

	initialize_exons_list(rmsk, 0, num_rmsk);

	fseek(f, 0, SEEK_SET);

	i = 0;
	
	while(fgets(buf, 10000, f))
	{
		if( (buf[0] == '#') || (buf[0] == '>') ) {}
		else if( sscanf(buf, "%s %*s %s %d %d %*s %s %*s %s", chr_name, annot, &b, &e, dir, cur) != 6 ) {
			fatalf("line in wrong gff format: %s\n", buf);
		}
		else {
			strcpy(rmsk[i].name, annot);
			rmsk[i].reg = assign_I(b, e);
			rmsk[i].dir = dir[0];
			strcpy(rmsk[i].chr, chr_name);
			i++;
		}
	}

	if( i != num_rmsk ) {
		fatalf("%s counting error: %d - %d\n", annot_name, num_cds, i);
	}
	fclose(f);

	if(!(f = ckopen(argv[1], "r"))) {
		printf("no file %s exists\n", argv[1]);
		return EXIT_FAILURE;
	}

	i = 0;
	while(fgets(buf, 10000, f))
	{
		if( buf[0] != '#' ) {
			num_snps++;
			if( sscanf(buf, "%s %d %*s %s %s %f %s %*s", chr_name, &b, ref, alt, &qual, filter) != 6 ) {
				fatalf("bad format in %s\n", buf);
			}
			else 
			{
				if( strstr(filter, "PASS") == 0 ) {
					num_pass++;
				}
				else if( strstr(filter, "filter") == 0 ) {
					num_filter++;
				}

				rid = -1;
				rid = find_overlap_gene(chr_name, b, rmsk, num_rmsk);
				if( rid != -1 ) {
					num_repeats++;
					if( strstr(filter, "filter") == 0 ) {}
					else if( strstr(filter, "PASS") == 0 ) {
						num_repeats1++;
					}
					else {
						fatalf("unexpected filter option: %s\n", filter);
					}
				}

				if( (gid = find_overlap_gene(chr_name, b, exons, num_cds)) != -1 ) {
					num_coding++;
					if( strstr(filter, "PASS") == 0 ) {
						num_coding1++;
					}

					if( ref[0] != s[b] ) {
						fatalf("nucleotides not match: %c - %c\n", alt, s[b]);
					}

					if( exons[gid].dir == '+' ) {
						rest = (b - exons[gid].reg.lower)%3;
						if( rest == 0 ) {
							sprintf(codon, "%c%c%c", s[b], s[b+1], s[b+2]);
							sprintf(alt_codon, "%c%c%c", alt[0], s[b+1], s[b+2]);
						}
						else if( rest == 1 ) {
							sprintf(codon, "%c%c%c", s[b-1], s[b], s[b+1]);
							sprintf(alt_codon, "%c%c%c", s[b-1], alt[0], s[b+1]);
						}
						else {
							sprintf(codon, "%c%c%c", s[b-2], s[b-1], s[b]);
							sprintf(alt_codon, "%c%c%c", s[b-2], s[b-1], alt[0]);
						}
					}
					else if( exons[gid].dir == '-' ) {
						rest = (b - exons[gid].reg.upper)%3;
						if( rest == 0 ) {
							sprintf(codon, "%c%c%c", compl[s[b]], compl[s[b-1]], compl[s[b-2]]);
							sprintf(alt_codon, "%c%c%c", compl[alt[0]], compl[s[b-1]], compl[s[b-2]]);
						}
						else if( rest == 1 ) {
							sprintf(codon, "%c%c%c", compl[s[b+1]], compl[s[b]], compl[s[b-1]]);
							sprintf(alt_codon, "%c%c%c", compl[s[b+1]], compl[alt[0]], compl[s[b-1]]);
						}
						else {
							sprintf(codon, "%c%c%c", compl[s[b+2]], compl[s[b+1]], compl[s[b]]);
							sprintf(alt_codon, "%c%c%c", compl[s[b+2]], compl[s[b+1]], compl[alt[0]]);
						}
					}
					else {
						fatalf("%c unsupported\n", exons[gid].dir);
					}
					aa1 = dna2oneaa(codon);
					aa2 = dna2oneaa(alt_codon);
					
					if( aa1 == aa2 ) {
						num_syn++;
						if( strstr(filter, "filter") == 0) {
						}
						else if( strstr(filter, "PASS") == 0 ) {
							num_syn1++;
						}
						else {
							fatalf("unexpected filter option: %s\n", filter);
						}
					}
					else {
						num_non++;
						if( strstr(filter, "filter") == 0) {
						}
						else if( strstr(filter, "PASS") == 0 ) {
							num_non1++;
						}
						else {
							fatalf("unexpected filter option: %s\n", filter);
						}
					}
					
					if( rid != -1 ) {
						num_coding_repeats++;
						if( strstr(filter, "PASS") == 0 ) {
							num_coding_repeats1++;
						}
					}

					if( is_num_print == false ) {
						if( rid == -1 ) {
							printf("%s\t%d\t%s\t%s\t%f\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t.\n", chr_name, b, ref, alt, qual, filter, exons[gid].name, exons[gid].reg.lower, exons[gid].reg.upper, exons[gid].dir, aa1, aa2);
						}
						else {
							printf("%s\t%d\t%s\t%s\t%f\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n", chr_name, b, ref, alt, qual, filter, exons[gid].name, exons[gid].reg.lower, exons[gid].reg.upper, exons[gid].dir, aa1, aa2, rmsk[rid].name);
						}
					}
				}
				else {
				}
			}
		}
	}
	
	if( is_num_print == true ) {
		printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", chr_name, num_snps, num_pass, num_filter, num_coding, num_coding1, num_non, num_syn, num_non1, num_syn1, num_repeats, num_repeats1, num_coding_repeats, num_coding_repeats1);
	}

	if( num_cds > 0 ) {
		free(exons);
	}
	fclose(f);

	return EXIT_SUCCESS;
}