Ejemplo n.º 1
0
// Builds one Transcript out of the records items[sp..ep] (both ends inclusive)
// and appends it to `transcripts`.
//
// Transcript ID, gene ID, strand, sequence name and the "left" field are taken
// from items[sp]. Every record in the range must agree with items[sp] on strand
// and sequence name, and all non-empty gene / transcript names in the range
// must be identical; any violation is reported through general_assert.
//
// Exon coordinates are folded into intervals: a record whose start is at most
// one past the end of the running interval extends that interval, anything
// further away closes it and opens a new one.
// NOTE(review): this presumably relies on the records being ordered by start
// coordinate -- confirm against the caller.
void buildTranscript(int sp, int ep) {
	string transcript_id = items[sp].getTranscriptID();
	string gene_id = items[sp].getGeneID();
	char strand = items[sp].getStrand();
	string seqname = items[sp].getSeqName();
	string left = items[sp].getLeft();

	// Names are only cross-checked for consistency; they are not appended to
	// the IDs (the original code that did so is disabled).
	string gene_name = "";
	string transcript_name = "";

	// Running interval; -1 means "nothing opened yet".
	int run_begin = -1;
	int run_end = -1;

	vec.clear();
	for (int idx = sp; idx <= ep; ++idx) {
		int start = items[idx].getStart();
		int end = items[idx].getEnd();

		general_assert(strand == items[idx].getStrand(), "According to the GTF file given, a transcript has exons from different orientations!");
		general_assert(seqname == items[idx].getSeqName(), "According to the GTF file given, a transcript has exons on multiple chromosomes!");

		// The first non-empty gene name wins; later ones have to match it.
		const string& rec_gene_name = items[idx].getGeneName();
		if (rec_gene_name != "") {
			if (gene_name != "") {
				general_assert(gene_name == rec_gene_name, "A transcript is associated with multiple gene names!");
			} else {
				gene_name = rec_gene_name;
			}
		}

		// Same rule for the transcript name.
		const string& rec_transcript_name = items[idx].getTranscriptName();
		if (rec_transcript_name != "") {
			if (transcript_name != "") {
				general_assert(transcript_name == rec_transcript_name, "A transcript is associated with multiple transcript names!");
			} else {
				transcript_name = rec_transcript_name;
			}
		}

		// A gap of at least one base closes the running interval.
		if (run_end + 1 < start) {
			if (run_begin > 0) {
				vec.push_back(Interval(run_begin, run_end));
			}
			run_begin = start;
		}
		if (run_end < end) {
			run_end = end;
		}
	}

	// Flush the interval that is still open after the last record.
	if (run_begin > 0) {
		vec.push_back(Interval(run_begin, run_end));
	}

	transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left));
}
Ejemplo n.º 2
0
// Builds one Transcript out of the records items[sp..ep] (both ends inclusive)
// and appends it to `transcripts`. Always returns true; inconsistent input
// terminates the process instead of being reported through the return value.
//
// Transcript ID, gene ID, strand, sequence name and the "left" field are taken
// from items[sp]. If any record in the range disagrees with items[sp] on strand
// or sequence name, a message is written to stderr and the program exits with
// status -1.
//
// Exon coordinates are folded into intervals: a record whose start is at most
// one past the end of the running interval extends that interval, anything
// further away closes it and opens a new one.
// NOTE(review): this presumably relies on the records being ordered by start
// coordinate -- confirm against the caller.
bool buildTranscript(int sp, int ep) {
	string transcript_id = items[sp].getTranscriptID();
	string gene_id = items[sp].getGeneID();
	char strand = items[sp].getStrand();
	string seqname = items[sp].getSeqName();
	string left = items[sp].getLeft();

	vector<Interval> vec;

	// Running interval; -1 means "nothing opened yet".
	int run_begin = -1;
	int run_end = -1;

	for (int idx = sp; idx <= ep; idx++) {
		int start = items[idx].getStart();
		int end = items[idx].getEnd();

		if (!(strand == items[idx].getStrand())) {
			fprintf(stderr, "According to the GTF file given, a transcript has exons from different orientations!\n");
			exit(-1);
		}
		if (!(seqname == items[idx].getSeqName())) {
			fprintf(stderr, "According to the GTF file given, a transcript has exons on multiple chromosomes!\n");
			exit(-1);
		}

		// A gap of at least one base closes the running interval.
		if (run_end + 1 < start) {
			if (run_begin > 0) {
				vec.push_back(Interval(run_begin, run_end));
			}
			run_begin = start;
		}
		if (run_end < end) {
			run_end = end;
		}
	}

	// Flush the interval that is still open after the last record.
	if (run_begin > 0) {
		vec.push_back(Interval(run_begin, run_end));
	}

	transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left));

	return true;
}
Ejemplo n.º 3
0
// Entry point of synthesisRef: builds the reference from one or more FASTA
// files in which every record is one transcript.
//
// Command line:
//   argv[1]   refName, forwarded to writeResults()
//   argv[2]   quiet flag; verbose output is on when it parses to 0
//   argv[3]   hasMappingFile: 0 = no, 1 = yes, 2 = allele-specific
//   argv[4]   mapping file, present only when hasMappingFile != 0
//   argv[..]  reference FASTA files (at least one)
//
// For every FASTA record the first whitespace-delimited token of the header
// becomes the sequence name, the sequence lines are concatenated and passed
// character by character through check(), the result is stored in name2seq,
// and a '+'-strand Transcript covering [1, length] is added to `transcripts`.
// Without a mapping file the sequence name doubles as gene and transcript ID;
// with one, the gene ID (and in allele-specific mode the transcript ID) is
// looked up in mi_table / mi_table2.
//
// Returns 0 on success. Exits with -1 on a usage error or when no transcript
// was read; other input errors are reported through general_assert.
int main(int argc, char* argv[]) {
	if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) {
		printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n");
		exit(-1);
	}

	verbose = !atoi(argv[2]);

	if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); }

	// allele-specific
	if (hasMappingFile == 2) { transcripts.setType(2); }

	// Index of the first reference file: shifted by one when a mapping file is given.
	int start = hasMappingFile ? 5 : 4;

	ifstream fin;
	string line, gseq;
	string seqname, gene_id, transcript_id;

	vector<Interval> vec;

	M = 0;
	name2seq.clear();
	for (int i = start; i < argc; i++) {
		// The stream object is reused for every file. Reading the previous file
		// to its end left eofbit/failbit set, and before C++11 neither close()
		// nor a successful open() resets them, which would make every file
		// after the first look empty. Clear the state explicitly.
		fin.clear();
		fin.open(argv[i]);
		general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");

		unsigned long int line_no = 0; // 1-based number of the line most recently read from this file
		getline(fin, line);
		line_no += 1;

		// Invariant at the top of the loop: `line` holds the header of the next record.
		// empty() is tested first so that line[0] is never evaluated on an empty string.
		while ((fin) && !line.empty() && line[0] == '>') {
			istringstream strin(line.substr(1));
			strin>>seqname;

			// Collect sequence lines up to the next header or end of file.
			// Every line that is read is counted, including the header that
			// ends this record, so line_no stays in step with the file.
			gseq = "";
			while (getline(fin, line)) {
				line_no += 1;
				if (!line.empty() && line[0] == '>') break;

				// Validate/normalise each character while its own line number
				// is still known, so check() receives the line the character
				// is actually on.
				const string::size_type line_len = line.length();
				for (string::size_type j = 0; j < line_len; j++) line[j] = check(line[j], line_no);
				gseq += line;
			}

			// An empty record is an input error, not an internal invariant, so
			// it must be rejected even when assert() is compiled out.
			int len = gseq.length();
			general_assert(len > 0, "The sequence of " + seqname + " is empty!");

			name2seq[seqname] = gseq;

			transcript_id = seqname;
			gene_id = seqname;

			if (hasMappingFile) {
				mi_iter = mi_table.find(seqname);
				general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!");
				gene_id = mi_iter->second;
				if (hasMappingFile == 2) {
					mi_iter2 = mi_table2.find(seqname);
					general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!");
					transcript_id = mi_iter2->second;
				}
			}

			// The whole sequence is a single interval on the '+' strand.
			vec.clear();
			vec.push_back(Interval(1, len));
			transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, ""));
			++M;

			if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); }
		}
		fin.close();
	}

	if (M < 1) {
		fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
		exit(-1);
	}

	// Internal consistency check: one Transcript was added per counted record.
	assert(M == transcripts.getM());
	transcripts.sort();

	writeResults(hasMappingFile, argv[1]);

	return 0;
}