Esempio n. 1
0
// Entry point of the reference builder (named PROBer-build-reference in the usage string).
//
// Visible behaviour:
//   1. Parse the options that follow refName (argv[1]).
//   2. If --mapping or --allele-specific was given, load the mapping information.
//   3. If --gtf was given, parse the GTF file and extract every transcript's sequence
//      from the genome FASTA files listed after --files, then compact the transcript
//      list so that only transcripts with an extracted sequence remain.
//
// NOTE: this listing is truncated. The `else` branch (no GTF file) and the end of the
// function are not visible here.
int main(int argc, char* argv[]) {
	if (argc < 2) {
		printf("Usage: PROBer-build-reference refName [--gtf gtfF] [--mapping mappingF] [--allele-specific] [--files num_of_files file_1 file_2 ...] [--n2g-index] [-q]\n");
		exit(-1);
	}

	// Defaults for the option flags (these variables are declared outside this function).
	hasGTF = false;
	mappingType = 0;
	n2g_idx = false;
	
	// argv[1] is refName, so option parsing starts at argv[2].
	// NOTE(review): the tests below are independent `if`s, not an else-if chain, so after an
	// option consumes its value (argpos advanced), that value is itself compared against the
	// remaining option names in the same iteration.
	// NOTE(review): none of the argv[++argpos] reads are checked against argc. If --gtf,
	// --mapping or --files is the last argument, argv[argc] (a null pointer) is read.
	int argpos = 2;
	while (argpos < argc) {
		if (!strcmp(argv[argpos], "--gtf")) {
			hasGTF = true;
			// NOTE(review): unbounded copy; gtfF's capacity is declared outside this view.
			strcpy(gtfF, argv[++argpos]);
		}
		if (!strcmp(argv[argpos], "--mapping")) {
			mappingType = 1;
			// Remember where the mapping file name sits in argv; it is used after the loop.
			mappingPos = ++argpos;
		}
		// NOTE(review): --allele-specific sets mappingType = 2 but does not set mappingPos;
		// whether mappingPos is valid then depends on --mapping having been seen as well.
		if (!strcmp(argv[argpos], "--allele-specific")) mappingType = 2;
		if (!strcmp(argv[argpos], "--files")) {
			// NOTE(review): atoi gives no error indication; the value is not range-checked.
			num_files = atoi(argv[++argpos]);
			file_pos = argpos + 1; // the position in argv for the first file
			// Jump to the last file name; the ++argpos at the end of the loop body then
			// moves past it.
			argpos += num_files;
		}
		if (!strcmp(argv[argpos], "--n2g-index")) n2g_idx = true;
		if (!strcmp(argv[argpos], "-q")) verbose = false;
		++argpos;
	}

	if (mappingType > 0) loadMappingInfo(mappingType, argv[mappingPos]);

	ifstream fin;
	// NOTE: tseq is not used in the visible part of this function.
	string line, gseq, tseq; // gseq, genomic sequence; tseq, transcript sequence
	string seqname, gene_id, transcript_id;
	
	if (hasGTF) {
		transcripts.setType(0);
		// mappingType 2 (--allele-specific) is not accepted together with a GTF file.
		assert(mappingType < 2);
		parse_gtf_file(gtfF);

		M = transcripts.getM();
		general_assert(M > 0, "The reference contains no transcripts!");
		// One slot per transcript; transcripts are indexed 1..M below, so slot 0 stays unused.
		seqs.assign(M + 1, "");
		
		chrvec.clear();
		
		// Read each genome FASTA file listed after --files.
		for (int i = 0; i < num_files; ++i, ++file_pos) {
			fin.open(argv[file_pos]);
			general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[file_pos]) + "! It may not exist.");
			getline(fin, line);
			// One iteration per FASTA record; `line` holds the record's '>' header on entry.
			while ((fin) && (line[0] == '>')) {
	// The sequence name is the first whitespace-delimited token after '>'.
	istringstream strin(line.substr(1));
	strin>>seqname;
	
	// Concatenate the record's lines until the next header or end of file.
	gseq = "";
	while((getline(fin, line)) && (line[0] != '>')) {
		gseq += line;
	}
	assert(gseq.length() > 0);
			
	// Skip sequences that no transcript refers to. `line` already holds the next
	// header (or the stream has failed), so `continue` just re-tests the loop condition.
	sn2tr_iter = sn2tr.find(seqname);
	if (sn2tr_iter == sn2tr.end()) continue;
	
	chrvec.push_back(ChrInfo(seqname, gseq.length()));
	
	// Extract the sequence of every transcript recorded for this sequence name.
	vector<int>& vec = sn2tr_iter->second;
	int s = vec.size();
	for (int j = 0; j < s; ++j) {
		assert(vec[j] > 0 && vec[j] <= M);
		transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]);
	}
			}
			fin.close();

			if (verbose) { printf("%s is processed!\n", argv[file_pos]); } 
		}
		
		sort(chrvec.begin(), chrvec.end());

		// Shrink and build up Refs
		// Transcripts whose sequence is still empty are reported and dropped; the others are
		// added to refs and passed to transcripts.move(i, curp) (presumably relocating
		// transcript i to slot curp -- move() is defined elsewhere, confirm).
		int curp = 0;
		for (int i = 1; i <= M; ++i) {
			const Transcript& transcript = transcripts.getTranscriptAt(i);
			if (seqs[i] == "") 
	printf("Warning: Cannot extract transcript %s because the chromosome it locates -- %s -- is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str());
			else {
	refs.addRef(transcript.getTranscriptID(), seqs[i]); // insert RefSeqs
	++curp;
	transcripts.move(i, curp);
			}
		}
		printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp);
		
		// From here on M is the number of transcripts that were actually extracted.
		transcripts.setM(curp);
		M = transcripts.getM();
		general_assert(M > 0, "The reference contains no transcripts!");
		assert(refs.getM() == M);
	}
	else {
Esempio n. 2
0
/**
 * Parse the GTF file `gtfF` and build the transcript list from its exon records.
 *
 * Pass 1 reads the file line by line. Lines starting with '#' are skipped (and not
 * counted). Of the remaining lines only those whose feature is "exon" are kept; an
 * exon whose start is greater than its end, or whose start is below 1, is reported on
 * stderr and discarded. When hasMappingFile is set, the exon's gene id is replaced by
 * the one mi_table holds for its transcript id; a missing entry terminates the program.
 *
 * Pass 2 sorts the kept exons, groups consecutive exons sharing a transcript id, and
 * calls buildTranscript() on each group (presumably appending one transcript to
 * `transcripts`; the newest one is read back via transcripts.getM()). It also fills
 *   - starts: the index of the first transcript of every run of equal gene ids,
 *             followed by a final M + 1 entry;
 *   - sn2tr:  for every sequence name, the indices of the transcripts located on it.
 *
 * Terminates with exit(-1) if the file cannot be opened or no transcript results.
 */
void parse_gtf_file(char* gtfF) {
	ifstream fin(gtfF);
	string line, tid, gid;
	GTFItem item;

	if (!fin.is_open()) {
		fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF);
		exit(-1);
	}

	// ---- Pass 1: collect the valid exon records ----
	int cnt = 0; // number of non-comment lines seen, for progress reporting
	items.clear();
	while (getline(fin, line)) {
		// Comment lines are neither parsed nor counted.
		if (line[0] == '#') continue;

		item.parse(line);
		const string kind = item.getFeature();
		if (kind == "exon") {
			// Decide whether the exon has to be rejected, and why.
			const char* problem = NULL;
			if (item.getStart() > item.getEnd()) {
				problem = "Warning: exon's start position is larger than its end position! This exon is discarded.\n";
			}
			else if (item.getStart() < 1) {
				problem = "Warning: exon's start position is less than 1! This exon is discarded.\n";
			}

			if (problem != NULL) {
				fprintf(stderr, "%s", problem);
				fprintf(stderr, "\t%s\n\n", line.c_str());
			}
			else {
				if (hasMappingFile) {
					// Override the gene id with the one from the mapping table.
					tid = item.getTranscriptID();
					mi_iter = mi_table.find(tid);
					if (mi_iter == mi_table.end()) {
						fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str());
						exit(-1);
					}
					gid = mi_iter->second;
					item.setGeneID(gid);
				}
				items.push_back(item);
			}
		}

		++cnt;
		if (verbose && cnt % 200000 == 0) {
			printf("Parsed %d lines\n", cnt);
		}
	}
	fin.close();

	// ---- Pass 2: group the sorted exons into transcripts ----
	sort(items.begin(), items.end());

	starts.clear();
	sn2tr.clear();
	string curgid = ""; // gene id of the previously built transcript

	const int nItems = items.size();
	int last; // index of the last exon of the current group
	for (int first = 0; first < nItems; first = last + 1) {
		tid = items[first].getTranscriptID();
		gid = items[first].getGeneID();

		// Extend [first, last] over all following exons with the same transcript id.
		last = first;
		while (last + 1 < nItems && items[last + 1].getTranscriptID() == tid) ++last;

		buildTranscript(first, last);

		const int sid = transcripts.getM();
		const Transcript& transcript = transcripts.getTranscriptAt(sid);

		// A change of gene id marks the first transcript of a new gene.
		if (curgid != gid) {
			starts.push_back(sid);
			curgid = gid;
		}

		// Register the transcript under the sequence it is located on.
		iter = sn2tr.find(transcript.getSeqName());
		if (iter != sn2tr.end()) {
			iter->second.push_back(sid);
		}
		else {
			sn2tr[transcript.getSeqName()] = vector<int>(1, sid);
		}
	}

	M = transcripts.getM();
	starts.push_back(M + 1); // closing entry after the last gene
	items.clear();

	if (M < 1) {
		fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
		exit(-1);
	}

	if (verbose) {
		printf("Parsing gtf File is done!\n");
	}
}
Esempio n. 3
0
/**
 * Parse the GTF file `gtfF` and build the transcript list from its exon records.
 *
 * Reading: lines for which skip() returns true are ignored and not counted. Of the
 * rest only records whose feature is "exon" are kept. An exon whose start is greater
 * than its end, or whose start is below 1, is reported on stdout and discarded. For
 * an accepted exon the attributes are parsed and, when mappingType > 0, its gene id
 * is replaced by the entry mi_table holds for its transcript id (general_assert
 * guards against a missing entry).
 *
 * Building: the kept exons are sorted, consecutive exons sharing a transcript id are
 * handed to buildTranscript() as one group (presumably appending one transcript to
 * `transcripts`; the newest one is read back via transcripts.getM()), and sn2tr
 * records, per sequence name, the indices of the transcripts located on it.
 */
void parse_gtf_file(char* gtfF) {
	ifstream fin(gtfF);
	string line, tid, gid;
	GTFItem item;

	general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist.");

	// ---- Read and filter the exon records ----
	int cnt = 0; // number of non-skipped lines seen, for progress reporting
	items.clear();
	while (getline(fin, line)) {
		if (skip(line)) continue;

		item.parse(line);
		const string feature = item.getFeature();
		if (feature == "exon") {
			const bool startPastEnd = item.getStart() > item.getEnd();
			const bool startBelowOne = !startPastEnd && item.getStart() < 1;

			if (!startPastEnd && !startBelowOne) {
				item.parseAttributes(line);
				if (mappingType > 0) {
					// Override the gene id with the one from the mapping table.
					tid = item.getTranscriptID();
					mi_iter = mi_table.find(tid);
					general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!");
					gid = mi_iter->second;
					item.setGeneID(gid);
				}
				items.push_back(item);
			}
			else {
				if (startPastEnd) {
					printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n");
				}
				else {
					printf("Warning: exon's start position is less than 1! This exon is discarded.\n");
				}
				printf("\t%s\n\n", line.c_str());
			}
		}

		++cnt;
		if (verbose && cnt % 200000 == 0) {
			printf("Parsed %d lines\n", cnt);
		}
	}
	fin.close();

	// ---- Group the sorted exons into transcripts ----
	sort(items.begin(), items.end());

	const int nItems = items.size();
	sn2tr.clear();

	int sp = 0; // first exon of the current group
	while (sp < nItems) {
		tid = items[sp].getTranscriptID();

		// `stop` ends up one past the last exon carrying the same transcript id.
		int stop = sp + 1;
		while (stop < nItems && items[stop].getTranscriptID() == tid) ++stop;

		buildTranscript(sp, stop - 1);

		const int sid = transcripts.getM();
		const Transcript& transcript = transcripts.getTranscriptAt(sid);

		// Register the transcript under the sequence it is located on.
		sn2tr_iter = sn2tr.find(transcript.getSeqName());
		if (sn2tr_iter != sn2tr.end()) {
			sn2tr_iter->second.push_back(sid);
		}
		else {
			sn2tr[transcript.getSeqName()] = vector<int>(1, sid);
		}

		sp = stop;
	}

	items.clear();

	if (verbose) {
		printf("Parsing GTF File is done!\n");
	}
}
Esempio n. 4
0
/**
 * Build a reference directly from FASTA files of transcript sequences.
 *
 * Command line (see the usage string):
 *   argv[1]  refName         passed on to writeResults()
 *   argv[2]  quiet           non-zero disables progress output
 *   argv[3]  hasMappingFile  0 = no, 1 = yes, 2 = allele-specific
 *   argv[4]  mappingFile     only present when hasMappingFile != 0
 *   rest     one or more FASTA files
 *
 * Every FASTA record becomes one transcript covering the interval [1, length].
 * Its transcript id and gene id default to the sequence name and are replaced by
 * the entries of mi_table2 / mi_table when mapping information was loaded.
 *
 * Exits with -1 on bad usage or when no sequence was read; returns 0 on success.
 */
int main(int argc, char* argv[]) {
	// hasMappingFile is assigned inside the condition; argv[3] is only read when argc >= 5.
	if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) {
		printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n");
		exit(-1);
	}

	verbose = !atoi(argv[2]);

	if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); }

	// allele-specific
	if (hasMappingFile == 2) { transcripts.setType(2); }

	// Index of the first FASTA file: shifted by one when a mapping file precedes it.
	int start = hasMappingFile ? 5 : 4;

	ifstream fin;
	string line, gseq;
	string seqname, gene_id, transcript_id;

	vector<Interval> vec;

	M = 0;
	name2seq.clear();
	for (int i = start; i < argc; i++) {
		fin.open(argv[i]);
		general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");

		// Number of lines read from this file so far. Every successful getline is
		// counted, including the header line that terminates the previous record;
		// counting only sequence lines would make the number drift low by one for
		// every record after the first.
		unsigned long int line_no = 0;
		if (getline(fin, line)) { line_no += 1; }

		// One iteration per FASTA record; `line` holds the record's '>' header on entry.
		while ((fin) && (line[0] == '>')) {
			// The sequence name is the first whitespace-delimited token after '>'.
			istringstream strin(line.substr(1));
			strin>>seqname;

			// Concatenate the record's lines until the next header or end of file.
			// seq_line_no tracks the file line of the most recent sequence line
			// (the header line itself if the record has none).
			gseq = "";
			unsigned long int seq_line_no = line_no;
			while (getline(fin, line)) {
				line_no += 1;
				if (line[0] == '>') break;
				gseq += line;
				seq_line_no = line_no;
			}

			int len = gseq.length();
			assert(len > 0);
			// check() receives the line number of the record's last sequence line;
			// what it does with the number is defined outside this function.
			for (int j = 0; j < len; j++) gseq[j] = check(gseq[j], seq_line_no);

			name2seq[seqname] = gseq;

			// Without mapping information the sequence name doubles as both ids.
			transcript_id = seqname;
			gene_id = seqname;

			if (hasMappingFile) {
				mi_iter = mi_table.find(seqname);
				general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!");
				gene_id = mi_iter->second;
				if (hasMappingFile == 2) {
					mi_iter2 = mi_table2.find(seqname);
					general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!");
					transcript_id = mi_iter2->second;
				}
			}

			// The transcript spans the whole sequence as a single interval.
			vec.clear();
			vec.push_back(Interval(1, len));
			transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, ""));
			++M;

			if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); }
		}
		fin.close();
	}

	if (M < 1) {
		fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
		exit(-1);
	}

	assert(M == transcripts.getM());
	transcripts.sort();

	writeResults(hasMappingFile, argv[1]);

	return 0;
}