Пример #1
0
/**
 * Read exon records from a GTF file and group them into transcripts.
 *
 * Each line that skip() does not reject is parsed into a GTFItem. Only
 * records whose feature is "exon" are considered; an exon whose start is
 * greater than its end, or whose start is below 1, is reported on stdout and
 * dropped. For every kept exon the attributes are parsed and, when
 * mappingType > 0, its gene ID is overwritten with the value stored in
 * mi_table for its transcript ID (general_assert is invoked if no such entry
 * exists).
 *
 * The kept exons are then sorted, every run of consecutive items that share a
 * transcript ID is passed to buildTranscript(), and the value of
 * transcripts.getM() read right afterwards is recorded in sn2tr under the
 * transcript's sequence name.
 *
 * File-level state touched here: items (cleared, filled, cleared again),
 * mi_iter, sn2tr and sn2tr_iter.
 *
 * @param gtfF path of the GTF file; checked with general_assert after opening.
 */
void parse_gtf_file(char* gtfF) {
	ifstream fin(gtfF);
	string line, tid, gid;
	GTFItem item;

	general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist.");

	items.clear();

	// Counts only the lines that got past skip(); used for progress output.
	int nParsed = 0;

	while (getline(fin, line)) {
		if (skip(line)) continue;

		item.parse(line);

		if (item.getFeature() == "exon") {
			// The second test is evaluated only when the first one is false.
			const bool reversed = item.getStart() > item.getEnd();
			const bool beforeOrigin = !reversed && item.getStart() < 1;

			if (reversed || beforeOrigin) {
				if (reversed) {
					printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n");
				}
				else {
					printf("Warning: exon's start position is less than 1! This exon is discarded.\n");
				}
				printf("\t%s\n\n", line.c_str());
			}
			else {
				item.parseAttributes(line);

				if (mappingType > 0) {
					// Replace the gene ID with the one the mapping table
					// gives for this transcript.
					tid = item.getTranscriptID();
					mi_iter = mi_table.find(tid);
					general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!");
					gid = mi_iter->second;
					item.setGeneID(gid);
				}

				items.push_back(item);
			}
		}

		++nParsed;
		if (verbose && nParsed % 200000 == 0) {
			printf("Parsed %d lines\n", nParsed);
		}
	}
	fin.close();

	// NOTE(review): the grouping below relies on GTFItem's ordering placing
	// items with the same transcript ID next to each other - confirm against
	// GTFItem::operator<.
	sort(items.begin(), items.end());

	const int nItems = items.size();
	sn2tr.clear();

	// [first, last] is the inclusive index range of one transcript's exons.
	for (int first = 0, last = 0; first < nItems; first = last + 1) {
		tid = items[first].getTranscriptID();

		last = first;
		while (last + 1 < nItems && items[last + 1].getTranscriptID() == tid) ++last;

		buildTranscript(first, last);

		// Presumably buildTranscript appended to `transcripts`, making getM()
		// the index of the transcript just built - TODO confirm.
		int sid = transcripts.getM();
		const Transcript& transcript = transcripts.getTranscriptAt(sid);

		sn2tr_iter = sn2tr.find(transcript.getSeqName());
		if (sn2tr_iter != sn2tr.end()) {
			sn2tr_iter->second.push_back(sid);
		}
		else {
			sn2tr[transcript.getSeqName()] = vector<int>(1, sid);
		}
	}

	items.clear();

	if (verbose) {
		printf("Parsing GTF File is done!\n");
	}
}
Пример #2
0
/**
 * Read exon records from a GTF file and group them into transcripts.
 *
 * Blank lines and lines starting with '#' are ignored. Every other line is
 * parsed into a GTFItem; only records whose feature is "exon" are considered.
 * An exon whose start is greater than its end, or whose start is below 1, is
 * reported on stderr and dropped. When hasMappingFile is set, the gene ID of
 * each kept exon is overwritten with the value stored in mi_table for its
 * transcript ID; a missing entry terminates the program with exit(-1).
 *
 * The kept exons are sorted, every run of consecutive items that share a
 * transcript ID is passed to buildTranscript(), and the value of
 * transcripts.getM() read right afterwards is
 *   - appended to `starts` whenever the gene ID differs from the previous
 *     run's gene ID, and
 *   - recorded in sn2tr under the transcript's sequence name.
 * After the loop M is set to transcripts.getM() and M + 1 is appended to
 * `starts` as a final entry. The program exits with -1 if M < 1.
 *
 * File-level state touched here: items, mi_iter, starts, sn2tr, iter, M.
 *
 * @param gtfF path of the GTF file; the program exits with -1 if it cannot
 *             be opened.
 */
void parse_gtf_file(char* gtfF) {
	ifstream fin(gtfF);
	string line, curgid, tid, gid; // curgid: gene ID of the previous transcript
	GTFItem item;

	if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); }

	int cnt = 0;

	items.clear();
	while (getline(fin, line)) {
		// Skip blank lines as well as comments. The emptiness check must come
		// first so that line[0] is never read on an empty string and an empty
		// record is never handed to the parser.
		if (line.empty() || line[0] == '#') continue;

		item.parse(line);
		string feature = item.getFeature();
		if (feature == "exon") {
			if (item.getStart() > item.getEnd()) {
				fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n");
				fprintf(stderr, "\t%s\n\n", line.c_str());
			}
			else if (item.getStart() < 1) {
				fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n");
				fprintf(stderr, "\t%s\n\n", line.c_str());
			}
			else {
				if (hasMappingFile) {
					// Replace the gene ID with the one the mapping table
					// gives for this transcript.
					tid = item.getTranscriptID();
					mi_iter = mi_table.find(tid);
					if (mi_iter == mi_table.end()) {
						fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str());
						exit(-1);
					}
					gid = mi_iter->second;
					item.setGeneID(gid);
				}
				items.push_back(item);
			}
		}

		// Counts only the lines that were actually parsed.
		++cnt;
		if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); }
	}
	fin.close();

	// NOTE(review): the grouping below relies on GTFItem's ordering placing
	// items with the same transcript ID (and, for `starts`, the same gene ID)
	// next to each other - confirm against GTFItem::operator<.
	sort(items.begin(), items.end());

	starts.clear();
	sn2tr.clear();
	curgid = "";

	int sp = 0, ep; // inclusive start / end index of one transcript's exons
	int nItems = items.size();

	while (sp < nItems) {
		tid = items[sp].getTranscriptID();
		gid = items[sp].getGeneID();

		// Advance ep to the last item that still belongs to transcript tid.
		ep = sp + 1;
		while (ep < nItems && items[ep].getTranscriptID() == tid) ++ep;
		--ep;

		buildTranscript(sp, ep);

		// Presumably buildTranscript appended to `transcripts`, making getM()
		// the index of the transcript just built - TODO confirm.
		int sid = transcripts.getM();
		const Transcript& transcript = transcripts.getTranscriptAt(sid);

		// A change of gene ID marks the first transcript of a new gene.
		if (curgid != gid) {
			starts.push_back(sid);
			curgid = gid;
		}

		iter = sn2tr.find(transcript.getSeqName());
		if (iter == sn2tr.end()) {
			vector<int> vec(1, sid);
			sn2tr[transcript.getSeqName()] = vec;
		}
		else {
			iter->second.push_back(sid);
		}

		sp = ep + 1;
	}

	M = transcripts.getM();
	starts.push_back(M + 1); // final entry, one past the last transcript index
	items.clear();

	if (M < 1) {
		fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
		exit(-1);
	}

	if (verbose) { printf("Parsing gtf File is done!\n"); }
}