Ejemplo n.º 1
0
/**
 * Write all output files of the extracted reference, using refName as the
 * common path prefix:
 *   refName.grp            -- every element of `starts`, one per line
 *   refName.ti             -- transcript information, via transcripts.writeTo()
 *   refName.chrlist        -- "name<TAB>len" for every element of `chrvec`
 *   refName.transcripts.fa -- one FASTA record for each transcript 1..M
 *
 * The program exits with -1 if one of the files opened directly here cannot
 * be opened, instead of announcing a file that was never written.
 *
 * @param refName  path prefix of the reference files to create
 */
void writeResults(char* refName) {
	ofstream fout;

	sprintf(groupF, "%s.grp", refName);
	sprintf(tiF, "%s.ti", refName);
	sprintf(refFastaF, "%s.transcripts.fa", refName);
	sprintf(chromListF, "%s.chrlist", refName);

	// Group file. '\n' is used instead of endl so that the stream is not
	// flushed after every line; close() flushes once at the end.
	fout.open(groupF);
	if (!fout.is_open()) { fprintf(stderr, "Cannot open %s for writing!\n", groupF); exit(-1); }
	for (size_t i = 0; i < starts.size(); i++) fout<<starts[i]<<'\n';
	fout.close();
	if (verbose) { printf("Group File is generated!\n"); }

	transcripts.writeTo(tiF);
	if (verbose) { printf("Transcript Information File is generated!\n"); }

	// Chromosome list: name and length, tab separated.
	fout.open(chromListF);
	if (!fout.is_open()) { fprintf(stderr, "Cannot open %s for writing!\n", chromListF); exit(-1); }
	for (size_t i = 0; i < chrvec.size(); i++) {
		fout<<chrvec[i].name<<'\t'<<chrvec[i].len<<'\n';
	}
	fout.close();
	if (verbose) { printf("Chromosome List File is generated!\n"); }

	// Transcript sequences; transcripts and seqs are indexed from 1 to M.
	fout.open(refFastaF);
	if (!fout.is_open()) { fprintf(stderr, "Cannot open %s for writing!\n", refFastaF); exit(-1); }
	for (int i = 1; i <= M; i++) {
		fout<<">"<<transcripts.getTranscriptAt(i).getTranscriptID()<<'\n';
		fout<<seqs[i]<<'\n';
	}
	fout.close();
	if (verbose) { printf("Extracted Sequences File is generated!\n"); }
}
Ejemplo n.º 2
0
/**
 * Build one transcript from the GTF items items[sp..ep] (both inclusive) and
 * append it to `transcripts`.
 *
 * IDs, strand, sequence name and the "left" field are taken from items[sp].
 * Every item in the range must agree with items[sp] on strand and sequence
 * name, and all non-empty gene / transcript names in the range must agree
 * with each other; each of these is enforced with general_assert.
 *
 * Exon coordinates are collapsed into the file-level vector `vec`: an exon
 * that overlaps or directly touches the running interval extends it,
 * otherwise the running interval is stored and a new one is started.
 */
void buildTranscript(int sp, int ep) {
	string transcript_id = items[sp].getTranscriptID();
	string gene_id = items[sp].getGeneID();
	string seqname = items[sp].getSeqName();
	string left = items[sp].getLeft();
	char strand = items[sp].getStrand();

	string gene_name, transcript_name; // stay empty until an item provides a name

	int run_start = -1, run_end = -1; // interval currently being grown

	vec.clear();
	int idx = sp;
	while (idx <= ep) {
		int start = items[idx].getStart();
		int end = items[idx].getEnd();

		general_assert(strand == items[idx].getStrand(), "According to the GTF file given, a transcript has exons from different orientations!");
		general_assert(seqname == items[idx].getSeqName(), "According to the GTF file given, a transcript has exons on multiple chromosomes!");

		// The first non-empty name seen is recorded; later ones must match it.
		if (items[idx].getGeneName() != "") {
			if (gene_name != "") general_assert(gene_name == items[idx].getGeneName(), "A transcript is associated with multiple gene names!");
			else gene_name = items[idx].getGeneName();
		}
		if (items[idx].getTranscriptName() != "") {
			if (transcript_name != "") general_assert(transcript_name == items[idx].getTranscriptName(), "A transcript is associated with multiple transcript names!");
			else transcript_name = items[idx].getTranscriptName();
		}

		// A gap of at least one base closes the running interval.
		if (start > run_end + 1) {
			if (run_start > 0) vec.push_back(Interval(run_start, run_end));
			run_start = start;
		}
		if (end > run_end) run_end = end;

		++idx;
	}
	// Store the last interval that is still open.
	if (run_start > 0) vec.push_back(Interval(run_start, run_end));

	//  if (gene_name != "") gene_id += "_" + gene_name;
	//  if (transcript_name != "") transcript_id += "_" + transcript_name;

	transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left));
}
Ejemplo n.º 3
0
bool buildTranscript(int sp, int ep) {
	int cur_s, cur_e; // current_start, current_end
	vector<Interval> vec;

	string transcript_id = items[sp].getTranscriptID();
	string gene_id = items[sp].getGeneID();
	char strand = items[sp].getStrand();
	string seqname = items[sp].getSeqName();
	string left = items[sp].getLeft();

	vec.clear();
	cur_s = cur_e = -1;
	for (int i = sp; i <= ep; i++) {
		int start = items[i].getStart();
		int end = items[i].getEnd();

		if (strand != items[i].getStrand()) {
		  fprintf(stderr, "According to the GTF file given, a transcript has exons from different orientations!\n");
		  exit(-1);
		}
		if (seqname != items[i].getSeqName()) {
		  fprintf(stderr, "According to the GTF file given, a transcript has exons on multiple chromosomes!\n");
		  exit(-1);
		}

		if (cur_e + 1 < start) {
			if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e));
			cur_s = start;
		}
		cur_e = (cur_e < end ? end : cur_e);
	}
	if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e));

	transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left));

	return true;
}
Ejemplo n.º 4
0
/**
 * Entry point of rsem-run-em.
 *
 * Positional arguments: refName read_type sampleName sampleToken, followed by
 * the optional flags listed in the usage text. The function loads the
 * reference (refName.seq), the group information (refName.grp), the
 * transcript information (refName.ti), the read counts (statName.cnt) and
 * the model parameters (imdName.mparams), runs EM for the requested read
 * type and finally reports the elapsed time.
 *
 * Options that take values are checked against argc with general_assert
 * before their values are read, so a truncated command line no longer reads
 * past the end of argv.
 */
int main(int argc, char* argv[]) {
	ifstream fin;
	bool quiet = false;

	if (argc < 5) {
		printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
		printf("  refName: reference name\n");
		printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
		printf("  sampleName: sample's name, including the path\n");
		printf("  sampleToken: sampleName excludes the path\n");
		printf("  -p: number of threads which user wants to use. (default: 1)\n");
		printf("  -b: produce bam format output file. (default: off)\n");
		printf("  -q: set it quiet\n");
		printf("  --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n");
		printf("  --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
		printf("// model parameters should be in imdName.mparams.\n");
		exit(-1);
	}

	time_t a = time(NULL);

	strcpy(refName, argv[1]);
	read_type = atoi(argv[2]);
	strcpy(outName, argv[3]);
	// Intermediate and statistics files live in <sampleName>.temp/ and <sampleName>.stat/.
	sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
	sprintf(statName, "%s.stat/%s", argv[3], argv[4]);

	nThreads = 1;

	genBamF = false;
	bamSampling = false;
	genGibbsOut = false;
	pt_fn_list = pt_chr_list = NULL;

	for (int i = 5; i < argc; i++) {
		if (!strcmp(argv[i], "-p")) {
			general_assert(i + 1 < argc, "Option -p requires the number of threads!");
			nThreads = atoi(argv[i + 1]);
		}
		if (!strcmp(argv[i], "-b")) {
			// -b is followed by samInpType, samInpF and has_fn_list_?, plus fn_list when the latter is 1.
			general_assert(i + 3 < argc, "Option -b requires samInpType, samInpF and has_fn_list_?!");
			genBamF = true;
			inpSamType = argv[i + 1][0];
			strcpy(inpSamF, argv[i + 2]);
			if (atoi(argv[i + 3]) == 1) {
				general_assert(i + 4 < argc, "Option -b requires fn_list when has_fn_list_? is 1!");
				strcpy(fn_list, argv[i + 4]);
				pt_fn_list = (char*)(&fn_list);
			}
		}
		if (!strcmp(argv[i], "-q")) { quiet = true; }
		if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
		if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
	}

	general_assert(nThreads > 0, "Number of threads should be bigger than 0!");

	verbose = !quiet;

	//basic info loading
	sprintf(refF, "%s.seq", refName);
	refs.loadRefs(refF);
	M = refs.getM();
	sprintf(groupF, "%s.grp", refName);
	gi.load(groupF);
	m = gi.getm();

	sprintf(tiF, "%s.ti", refName);
	transcripts.readFrom(tiF);

	sprintf(cntF, "%s.cnt", statName);
	fin.open(cntF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist.");

	fin>>N0>>N1>>N2>>N_tot;
	fin.close();

	general_assert(N1 > 0, "There are no alignable reads!");

	// Never use more threads than there are alignable reads.
	if ((READ_INT_TYPE)nThreads > N1) nThreads = N1;

	//set model parameters
	mparams.M = M;
	mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2;
	mparams.refs = &refs;

	sprintf(mparamsF, "%s.mparams", imdName);
	fin.open(mparamsF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "! It may not exist.");

	fin>> mparams.minL>> mparams.maxL>> mparams.probF;
	int val; // 0 or 1 , for estRSPD
	fin>>val;
	mparams.estRSPD = (val != 0);
	fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd;
	fin>> mparams.seedLen;
	fin.close();

	//run EM
	switch(read_type) {
	case 0 : EM<SingleRead, SingleHit, SingleModel>(); break;
	case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break;
	case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break;
	case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break;
	default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1);
	}

	time_t b = time(NULL);

	printTimeUsed(a, b, "EM.cpp");

	return 0;
}
Ejemplo n.º 5
0
/**
 * Write the learned model and the expression estimates.
 *
 *   statName.model   -- written by model.write()
 *   imdName.iso_res  -- four tab-separated rows over transcripts 1..M:
 *                       transcript IDs, counts, tau values, gene IDs
 *   imdName.gene_res -- four tab-separated rows over the m genes: gene IDs,
 *                       summed counts, summed tau values and, per gene, the
 *                       comma-separated IDs of its transcripts
 *
 * tau[i] is theta[i] / eel[i] for every transcript with eel[i] >= EPSILON
 * (0 otherwise), divided by the sum of these ratios.
 *
 * Both result files are checked after fopen(); a file that cannot be opened
 * fails a general_assert instead of handing a NULL FILE* to fprintf.
 *
 * @param model   model to serialize
 * @param counts  per-transcript counts, read at indices 1..M
 */
void writeResults(ModelType& model, double* counts) {
	double denom;
	char outF[STRLEN];
	FILE *fo;

	sprintf(modelF, "%s.model", statName);
	model.write(modelF);

	//calculate tau values
	double *tau = new double[M + 1];
	memset(tau, 0, sizeof(double) * (M + 1));

	denom = 0.0;
	for (int i = 1; i <= M; i++) 
	  if (eel[i] >= EPSILON) {
	    tau[i] = theta[i] / eel[i];
	    denom += tau[i];
	  }   

	general_assert(denom > 0, "No alignable reads?!");

	for (int i = 1; i <= M; i++) {
		tau[i] /= denom;
	}

	//isoform level results
	sprintf(outF, "%s.iso_res", imdName);
	fo = fopen(outF, "w");
	general_assert(fo != NULL, "Cannot open " + string(outF) + " for writing!");
	for (int i = 1; i <= M; i++) {
		const Transcript& transcript = transcripts.getTranscriptAt(i);
		fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
	}
	for (int i = 1; i <= M; i++)
		fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
	for (int i = 1; i <= M; i++)
		fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
	for (int i = 1; i <= M; i++) {
		const Transcript& transcript = transcripts.getTranscriptAt(i);
		fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
	}
	fclose(fo);

	//gene level results
	// Gene i covers the transcripts in [gi.spAt(i), gi.spAt(i + 1)).
	sprintf(outF, "%s.gene_res", imdName);
	fo = fopen(outF, "w");
	general_assert(fo != NULL, "Cannot open " + string(outF) + " for writing!");
	for (int i = 0; i < m; i++) {
		const string& gene_id = transcripts.getTranscriptAt(gi.spAt(i)).getGeneID();
		fprintf(fo, "%s%c", gene_id.c_str(), (i < m - 1 ? '\t' : '\n'));
	}
	for (int i = 0; i < m; i++) {
		double sumC = 0.0; // sum of counts
		int b = gi.spAt(i), e = gi.spAt(i + 1);
		for (int j = b; j < e; j++) sumC += counts[j];
		fprintf(fo, "%.2f%c", sumC, (i < m - 1 ? '\t' : '\n'));
	}
	for (int i = 0; i < m; i++) {
		double sumT = 0.0; // sum of tau values
		int b = gi.spAt(i), e = gi.spAt(i + 1);
		for (int j = b; j < e; j++) sumT += tau[j];
		fprintf(fo, "%.15g%c", sumT, (i < m - 1 ? '\t' : '\n'));
	}
	for (int i = 0; i < m; i++) {
		int b = gi.spAt(i), e = gi.spAt(i + 1);
		for (int j = b; j < e; j++) {
			// ',' between transcripts of one gene, '\t' between genes, '\n' after the last gene
			fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : (i < m - 1 ? '\t' :'\n')));
		}
	}
	fclose(fo);

	delete[] tau;

	if (verbose) { printf("Expression Results are written!\n"); }
}
Ejemplo n.º 6
0
/**
 * Entry point of rsem-parse-alignments.
 *
 * Positional arguments: refName imdName statName alignFType alignF, followed
 * by the optional flags -t, -l, -tag and -q. Alignments are parsed into
 * imdName.dat and alignment statistics are written to statName.cnt.
 *
 * Compared to a plain scan of argv, this version
 *   - rejects -t / -l / -tag when their value is missing instead of reading
 *     past the end of argv,
 *   - rejects a read type outside 0..3 instead of silently parsing nothing,
 *   - stops with an error if an output file cannot be opened.
 */
int main(int argc, char* argv[]) {
	bool quiet = false;

	if (argc < 6) {
		printf("Usage : rsem-parse-alignments refName imdName statName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
		exit(-1);
	}

	strcpy(fn_list, "");
	read_type = 0;
	for (int i = 6; i < argc; i++) {
		const bool takesValue = !strcmp(argv[i], "-t") || !strcmp(argv[i], "-l") || !strcmp(argv[i], "-tag");
		if (takesValue && i + 1 >= argc) {
			fprintf(stderr, "Option %s requires an argument!\n", argv[i]);
			exit(-1);
		}

		if (!strcmp(argv[i], "-t")) {
			read_type = atoi(argv[i + 1]);
		}
		if (!strcmp(argv[i], "-l")) {
			strcpy(fn_list, argv[i + 1]);
		}
		if (!strcmp(argv[i], "-tag")) {
			SamParser::setReadTypeTag(argv[i + 1]);
		}
		if (!strcmp(argv[i], "-q")) { quiet = true; }
	}

	// Only the four read types handled by the switch below are valid.
	if (read_type < 0 || read_type > 3) {
		fprintf(stderr, "Unknown Read Type!\n");
		exit(-1);
	}

	verbose = !quiet;

	sprintf(groupF, "%s.grp", argv[1]);
	gi.load(groupF);
	sprintf(tiF, "%s.ti", argv[1]);
	transcripts.readFrom(tiF);

	sprintf(datF, "%s.dat", argv[2]);
	sprintf(cntF, "%s.cnt", argv[3]);

	init(argv[2], argv[4][0], argv[5]);

	hit_out.open(datF);
	if (!hit_out.is_open()) {
		fprintf(stderr, "Cannot open %s for writing!\n", datF);
		exit(-1);
	}

	// Reserve a 100-byte first line (99 spaces + '\n'). The real header is
	// only known after parsing and is written over it below, so it has to
	// fit into 99 characters.
	string firstLine(99, ' ');
	firstLine.append(1, '\n');		//May be dangerous!
	hit_out<<firstLine;

	switch(read_type) {
	case 0 : parseIt<SingleRead, SingleHit>(parser); break;
	case 1 : parseIt<SingleReadQ, SingleHit>(parser); break;
	case 2 : parseIt<PairedEndRead, PairedEndHit>(parser); break;
	case 3 : parseIt<PairedEndReadQ, PairedEndHit>(parser); break;
	}

	// Go back to the start of the file and fill in the header.
	hit_out.seekp(0, ios_base::beg);
	hit_out<<N[1]<<" "<<nHits<<" "<<read_type;

	hit_out.close();

	//cntF for statistics of alignments file
	ofstream fout(cntF);
	if (!fout.is_open()) {
		fprintf(stderr, "Cannot open %s for writing!\n", cntF);
		exit(-1);
	}
	fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
	fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
	fout<<nHits<<" "<<read_type<<endl;
	fout<<"0\t"<<N[0]<<endl;
	for (iter = counter.begin(); iter != counter.end(); iter++) {
		fout<<iter->first<<'\t'<<iter->second<<endl;
	}
	fout<<"Inf\t"<<N[2]<<endl;
	fout.close();

	release();

	if (verbose) { printf("Done!\n"); }

	return 0;
}
Ejemplo n.º 7
0
int main(int argc, char* argv[]) {
	if (argc < 2) {
		printf("Usage: PROBer-build-reference refName [--gtf gtfF] [--mapping mappingF] [--allele-specific] [--files num_of_files file_1 file_2 ...] [--n2g-index] [-q]\n");
		exit(-1);
	}

	hasGTF = false;
	mappingType = 0;
	n2g_idx = false;
	
	int argpos = 2;
	while (argpos < argc) {
		if (!strcmp(argv[argpos], "--gtf")) {
			hasGTF = true;
			strcpy(gtfF, argv[++argpos]);
		}
		if (!strcmp(argv[argpos], "--mapping")) {
			mappingType = 1;
			mappingPos = ++argpos;
		}
		if (!strcmp(argv[argpos], "--allele-specific")) mappingType = 2;
		if (!strcmp(argv[argpos], "--files")) {
			num_files = atoi(argv[++argpos]);
			file_pos = argpos + 1; // the position in argv for the first file
			argpos += num_files;
		}
		if (!strcmp(argv[argpos], "--n2g-index")) n2g_idx = true;
		if (!strcmp(argv[argpos], "-q")) verbose = false;
		++argpos;
	}

	if (mappingType > 0) loadMappingInfo(mappingType, argv[mappingPos]);

	ifstream fin;
	string line, gseq, tseq; // gseq, genomic sequence; tseq, transcript sequence
	string seqname, gene_id, transcript_id;
	
	if (hasGTF) {
		transcripts.setType(0);
		assert(mappingType < 2);
		parse_gtf_file(gtfF);

		M = transcripts.getM();
		general_assert(M > 0, "The reference contains no transcripts!");
		seqs.assign(M + 1, "");
		
		chrvec.clear();
		
		for (int i = 0; i < num_files; ++i, ++file_pos) {
			fin.open(argv[file_pos]);
			general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[file_pos]) + "! It may not exist.");
			getline(fin, line);
			while ((fin) && (line[0] == '>')) {
	istringstream strin(line.substr(1));
	strin>>seqname;
	
	gseq = "";
	while((getline(fin, line)) && (line[0] != '>')) {
		gseq += line;
	}
	assert(gseq.length() > 0);
			
	sn2tr_iter = sn2tr.find(seqname);
	if (sn2tr_iter == sn2tr.end()) continue;
	
	chrvec.push_back(ChrInfo(seqname, gseq.length()));
	
	vector<int>& vec = sn2tr_iter->second;
	int s = vec.size();
	for (int j = 0; j < s; ++j) {
		assert(vec[j] > 0 && vec[j] <= M);
		transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]);
	}
			}
			fin.close();

			if (verbose) { printf("%s is processed!\n", argv[file_pos]); } 
		}
		
		sort(chrvec.begin(), chrvec.end());

		// Shrink and build up Refs
		int curp = 0;
		for (int i = 1; i <= M; ++i) {
			const Transcript& transcript = transcripts.getTranscriptAt(i);
			if (seqs[i] == "") 
	printf("Warning: Cannot extract transcript %s because the chromosome it locates -- %s -- is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str());
			else {
	refs.addRef(transcript.getTranscriptID(), seqs[i]); // insert RefSeqs
	++curp;
	transcripts.move(i, curp);
			}
		}
		printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp);
		
		transcripts.setM(curp);
		M = transcripts.getM();
		general_assert(M > 0, "The reference contains no transcripts!");
		assert(refs.getM() == M);
	}
	else {
Ejemplo n.º 8
0
void writeToDisk(char* refName) {
	ofstream fout;

	sprintf(tiF, "%s.ti", refName);
	transcripts.writeTo(tiF);
	if (verbose) { printf("Transcript Information File is generated!\n"); }
	
	sprintf(refFastaF, "%s.transcripts.fa", refName);
	refs.writeTo(refFastaF);

	sprintf(transListF, "%s.translist", refName);
	refs.writeTransListTo(transListF);

	sprintf(chromListF, "%s.chrlist", refName);
	fout.open(chromListF);
	for (int i = 0; i < (int)chrvec.size(); ++i)
		fout<< chrvec[i].name<< '\t'<< chrvec[i].len<< endl;
	fout.close();
	if (verbose) { printf("Chromosome List File is generated!\n"); }
	
	string cur_gene_id, cur_transcript_id, name;
	vector<int> gi, gt, ta;

	cur_gene_id = ""; gi.clear(); 
	if (mappingType == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); }
	for (int i = 1; i <= M; ++i) {
		const Transcript& transcript = transcripts.getTranscriptAt(i);
		if (cur_gene_id != transcript.getGeneID()) {
			gi.push_back(i);
			if (mappingType == 2) gt.push_back((int)ta.size());
			cur_gene_id = transcript.getGeneID();
		}
		if ((mappingType == 2) && (cur_transcript_id != transcript.getTranscriptID())) {
			ta.push_back(i);
			cur_transcript_id = transcript.getTranscriptID();
		}
	}
	
	gi.push_back(M + 1);
	if (mappingType == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); }

	sprintf(groupF, "%s.grp", refName);
	fout.open(groupF);
	for (int i = 0; i < (int)gi.size(); ++i) fout<< gi[i]<< endl;
	fout.close();
	if (verbose) { printf("Group File is generated!\n"); }

	if (mappingType == 2) {
		sprintf(gtF, "%s.gt", refName);
		fout.open(gtF);
		for (int i = 0; i < (int)gt.size(); ++i) fout<< gt[i]<< endl;
		fout.close();
		sprintf(taF, "%s.ta", refName);
		fout.open(taF);
		for (int i = 0; i < (int)ta.size(); ++i) fout<< ta[i]<< endl;
		fout.close();
		if (verbose) { printf("Allele-specific group files are generated!\n"); }
	}

	if (n2g_idx) {
		sprintf(n2g_idxF, "%s.n2g.idx.fa", refName);
		fout.open(n2g_idxF);
		for (int i = 1; i <= M; ++i) 
			fout<< '>'<< refs.getRef(i)->getName()<< endl<< n2g(refs.getRef(i)->getSeq())<< endl;
		fout.close();
		if (verbose) printf("%s is generated!\n", n2g_idxF);
	}
}
Ejemplo n.º 9
0
void parse_gtf_file(char* gtfF) {
	ifstream fin(gtfF);
	string line, tid, gid;
	GTFItem item;

	general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist.");

	int cnt = 0;
	
	items.clear();
	while (getline(fin, line)) {
		if (skip(line)) continue;
		item.parse(line);
		string feature = item.getFeature();
		if (feature == "exon") {
			if (item.getStart() > item.getEnd()) {
	printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n");
	printf("\t%s\n\n", line.c_str());
			}
			else if (item.getStart() < 1) {
	printf("Warning: exon's start position is less than 1! This exon is discarded.\n");
	printf("\t%s\n\n", line.c_str());
			}
			else {
	item.parseAttributes(line);
	if (mappingType > 0) {
		tid = item.getTranscriptID();
		mi_iter = mi_table.find(tid);
		general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!");
		gid = mi_iter->second;
		item.setGeneID(gid);
	}
	items.push_back(item);
			}
		}
		
		++cnt;
		if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); }
	}
	fin.close();
	
	sort(items.begin(), items.end());
	
	int sp = 0, ep; // start pointer, end pointer
	int nItems = items.size();
	
	sn2tr.clear();
	while (sp < nItems) {
		tid = items[sp].getTranscriptID();
		
		ep = sp + 1;
		while (ep < nItems && items[ep].getTranscriptID() == tid) ++ep;
		--ep;
		
		buildTranscript(sp, ep);
		
		int sid = transcripts.getM();
		const Transcript& transcript = transcripts.getTranscriptAt(sid);
		
		sn2tr_iter = sn2tr.find(transcript.getSeqName());
		if (sn2tr_iter == sn2tr.end()) {
			vector<int> vec(1, sid);
			sn2tr[transcript.getSeqName()] = vec;
		}
		else {
			sn2tr_iter->second.push_back(sid);
		}
		
		sp = ep + 1;
	}
	
	items.clear();
	
	if (verbose) { printf("Parsing GTF File is done!\n"); }
}
Ejemplo n.º 10
0
int main(int argc, char* argv[]) {
  if (argc < 6 || ((hasMappingFile = atoi(argv[4])) && argc < 7)) {
		printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n");
		exit(-1);
	}

	verbose = !atoi(argv[2]);
	if (hasMappingFile) {
		loadMappingInfo(argv[5]);
	}
	parse_gtf_file(argv[3]);

	ifstream fin;
	string line, gseq, seqname;

	chrvec.clear();

	seqs.clear();
	seqs.resize(M + 1, "");
	int start = hasMappingFile ? 6 : 5;
	for (int i = start; i < argc; i++) {
		fin.open(argv[i]);
		if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); }
		getline(fin, line);
		while ((fin) && (line[0] == '>')) {
			istringstream strin(line.substr(1));
			strin>>seqname;

			gseq = "";
			while((getline(fin, line)) && (line[0] != '>')) {
			  gseq += line;
			}

			size_t len = gseq.length();
			assert(len > 0);
			for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]);
			
			iter = sn2tr.find(seqname);
			if (iter == sn2tr.end()) continue;
			
			chrvec.push_back(ChrInfo(seqname, len));
			
			vector<int>& vec = iter->second;
			int s = vec.size();
			for (int j = 0; j < s; j++) {
			  assert(vec[j] > 0 && vec[j] <= M);
			  transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]);
			}
		}
		fin.close();

		if (verbose) { printf("%s is processed!\n", argv[i]); }
	}

	for (int i = 1; i <= M; i++) {
		if (seqs[i] == "") {
			const Transcript& transcript = transcripts.getTranscriptAt(i);

			fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s! Loading chromosome %s's sequence is failed. Please check if 1) the chromosome directory is set correctly; 2) the list of chromosome files is complete; 3) the FASTA files containing chromosome sequences are not truncated or having wrong format.\n", \
				transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str(), transcript.getSeqName().c_str());
			exit(-1);
		}
	}

	sort(chrvec.begin(), chrvec.end());

	if (verbose) { printf("Extracting sequences is done!\n"); }

	writeResults(argv[1]);

	return 0;
}
Ejemplo n.º 11
0
void parse_gtf_file(char* gtfF) {
	ifstream fin(gtfF);
	string line, curgid, tid, gid; //  curgid: current gene id;
	GTFItem item;

	if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); }

	int cnt = 0;

	items.clear();
 	while (getline(fin, line)) {
 		if (line[0] == '#') continue; // if this line is comment, jump it
 		item.parse(line);
 		string feature = item.getFeature();
 		if (feature == "exon") {
 			if (item.getStart() > item.getEnd()) {
 				fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n");
 				fprintf(stderr, "\t%s\n\n", line.c_str());
 			}
 			else if (item.getStart() < 1) {
 				fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n");
 				fprintf(stderr, "\t%s\n\n", line.c_str());
 			}
 			else {
 		 		if (hasMappingFile) {
 		 			tid = item.getTranscriptID();
					mi_iter = mi_table.find(tid);
					if (mi_iter == mi_table.end()) {
					  fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str());
					  exit(-1);
					}
					//assert(iter != table.end());
					gid = mi_iter->second;
					item.setGeneID(gid);
 		 		}
 				items.push_back(item);
 			}
 		}

 		++cnt;
 		if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); }
	}
	fin.close();

	sort(items.begin(), items.end());

	starts.clear();
	sn2tr.clear();
	curgid = "";

	int sp = 0, ep; // start pointer, end pointer
	int nItems = items.size();

	while (sp < nItems) {
		tid = items[sp].getTranscriptID();
		gid = items[sp].getGeneID();

		ep = sp + 1;
		while (ep < nItems && items[ep].getTranscriptID() == tid) ep++;
		ep--;

		buildTranscript(sp, ep);

		int sid = transcripts.getM();
		const Transcript& transcript = transcripts.getTranscriptAt(sid);

		if (curgid != gid) {
			starts.push_back(sid);
			curgid = gid;
		}
		iter = sn2tr.find(transcript.getSeqName());
		if (iter == sn2tr.end()) {
			vector<int> vec(1, sid);
			sn2tr[transcript.getSeqName()] = vec;
		}
		else {
			iter->second.push_back(sid);
		}

		sp = ep + 1;
	}

	M = transcripts.getM();
	starts.push_back(M + 1);
	items.clear();

	if (M < 1) {
		fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
		exit(-1);
	}

	if (verbose) { printf("Parsing gtf File is done!\n"); }
}
Ejemplo n.º 12
0
/**
 * Entry point of rsem-run-em.
 *
 * Positional arguments: refName read_type sampleName imdName statName,
 * followed by the optional flags listed in the usage text. The function
 * loads the reference (refName.seq), the transcript information
 * (refName.ti), the read counts (statName.cnt) and the model parameters
 * (imdName.mparams), runs EM for the requested read type and, when
 * --calc-evaluation-score is given, writes evaluation scores to
 * sampleName.score and generates the expression files from them.
 *
 * Options that take values are checked against argc with general_assert
 * before their values are read, and the --seed value must consist of
 * decimal digits only.
 */
int main(int argc, char* argv[]) {
	ifstream fin;
	bool quiet = false;

	if (argc < 6) {
		printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling] [--seed seed] [--calc-evaluation-score nb_r nb_p L w]\n\n");
		printf("  refName: reference name\n");
		printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
		printf("  sampleName: sample's name, including the path\n");
		printf("  imdName: name of the intermediate files, including the path (e.g. imdName.mparams)\n");
		printf("  statName: name of the statistics files, including the path (e.g. statName.cnt)\n");
		printf("  -p: number of threads which user wants to use. (default: 1)\n");
		printf("  -b: produce bam format output file. (default: off)\n");
		printf("  -q: set it quiet\n");
		printf("  --gibbs-out: generate output file use by Gibbs sampler. (default: off)\n");
		printf("  --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
		printf("  --seed uint32: the seed used for the BAM sampling. (default: off)\n");
		printf("  --calc-evaluation-score nb_r nb_p L w: "
				"nb_r and nb_p are parameters for the true transcript length distribution, which is modeled by a negative binomial distribution; "
				"L is the read length and w is the mininum overlap required for joining two contigs.\n");
		printf("// model parameters should be in imdName.mparams.\n");
		exit(-1);
	}

	time_t a = time(NULL);

	strcpy(refName, argv[1]);
	read_type = atoi(argv[2]);
	strcpy(outName, argv[3]);
	strcpy(imdName, argv[4]);
	strcpy(statName, argv[5]);

	nThreads = 1;

	genBamF = false;
	bamSampling = false;
	genGibbsOut = false;
	calcEvalScore = false;
	pt_fn_list = NULL;
	hasSeed = false;

	for (int i = 6; i < argc; i++) {
		if (!strcmp(argv[i], "-p")) {
			general_assert(i + 1 < argc, "Option -p requires the number of threads!");
			nThreads = atoi(argv[i + 1]);
		}
		if (!strcmp(argv[i], "-b")) {
			// -b is followed by samInpType, samInpF and has_fn_list_?, plus fn_list when the latter is 1.
			general_assert(i + 3 < argc, "Option -b requires samInpType, samInpF and has_fn_list_?!");
			genBamF = true;
			inpSamType = argv[i + 1][0];
			strcpy(inpSamF, argv[i + 2]);
			if (atoi(argv[i + 3]) == 1) {
				general_assert(i + 4 < argc, "Option -b requires fn_list when has_fn_list_? is 1!");
				strcpy(fn_list, argv[i + 4]);
				pt_fn_list = (char*)(&fn_list);
			}
		}
		if (!strcmp(argv[i], "-q")) { quiet = true; }
		if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
		if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
		if (!strcmp(argv[i], "--seed")) {
			general_assert(i + 1 < argc, "Option --seed requires a seed value!");
			hasSeed = true;
			int len = strlen(argv[i + 1]);
			seed = 0;
			// Decimal string to integer; anything but a digit would corrupt the value.
			for (int k = 0; k < len; k++) {
				general_assert(argv[i + 1][k] >= '0' && argv[i + 1][k] <= '9', "The seed should be a non-negative integer!");
				seed = seed * 10 + (argv[i + 1][k] - '0');
			}
		}
		if (!strcmp(argv[i], "--calc-evaluation-score")) {
			general_assert(i + 4 < argc, "Option --calc-evaluation-score requires nb_r, nb_p, L and w!");
			calcEvalScore = true;
			nb_r = atof(argv[i + 1]);
			nb_p = atof(argv[i + 2]);
			L = atoi(argv[i + 3]);
			w = atoi(argv[i + 4]);
		}
	}

	general_assert(nThreads > 0, "Number of threads should be bigger than 0!");

	verbose = !quiet;

	//basic info loading
	sprintf(refF, "%s.seq", refName);
	refs.loadRefs(refF);
	M = refs.getM();

	sprintf(tiF, "%s.ti", refName);
	transcripts.readFrom(tiF);

	sprintf(cntF, "%s.cnt", statName);
	fin.open(cntF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist.");

	fin>>N0>>N1>>N2>>N_tot;
	fin.close();

	general_assert(N1 > 0, "There are no alignable reads!");

	// Never use more threads than there are alignable reads.
	if ((READ_INT_TYPE)nThreads > N1) nThreads = N1;

	//set model parameters
	mparams.M = M;
	mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2;
	mparams.refs = &refs;

	sprintf(mparamsF, "%s.mparams", imdName);
	fin.open(mparamsF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "! It may not exist.");

	fin>> mparams.minL>> mparams.maxL>> mparams.probF;
	int val; // 0 or 1 , for estRSPD
	fin>>val;
	mparams.estRSPD = (val != 0);
	fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd;
	fin>> mparams.seedLen;
	fin.close();

	//run EM
	switch(read_type) {
	case 0 : EM<SingleRead, SingleHit, SingleModel>(); break;
	case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break;
	case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break;
	case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break;
	default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1);
	}

	if (calcEvalScore) {
		CalcEvalScore ces(refs, nb_r, nb_p, L, w, statName);
		sprintf(scoreF, "%s.score", outName);
		ces.writeScoresTo(scoreF);
		
		// The group information is only needed here, so it is loaded locally.
		char groupF[STRLEN];
		GroupInfo gi;
		sprintf(groupF, "%s.grp", argv[1]);
		gi.load(groupF);

		ces.generateExpressionFiles(gi, transcripts, scoreF);
	}

	time_t b = time(NULL);

	printTimeUsed(a, b, "EM.cpp");

	return 0;
}
Ejemplo n.º 13
0
void writeResults(int option, char* refName) {
	ofstream fout, fout2;
	string cur_gene_id, cur_transcript_id, name;
	vector<int> gi, gt, ta;

	sprintf(tiF, "%s.ti", refName);
	transcripts.writeTo(tiF);
	if (verbose) { printf("Transcript Information File is generated!\n"); }

	cur_gene_id = ""; gi.clear(); 
	if (option == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); }
	for (int i = 1; i <= M; i++) {
		const Transcript& transcript = transcripts.getTranscriptAt(i);
		if (cur_gene_id != transcript.getGeneID()) {
		  gi.push_back(i);
		  if (option == 2) gt.push_back((int)ta.size());
		  cur_gene_id = transcript.getGeneID();
		}
		if ((option == 2) && (cur_transcript_id != transcript.getTranscriptID())) {
		    ta.push_back(i);
		    cur_transcript_id = transcript.getTranscriptID();
		}
	}
	gi.push_back(M + 1);
	if (option == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); }

	sprintf(groupF, "%s.grp", refName);
	fout.open(groupF);
	for (int i = 0; i < (int)gi.size(); i++) fout<< gi[i]<< endl;
	fout.close();
	if (verbose) { printf("Group File is generated!\n"); }

	if (option == 2) {
	  sprintf(gtF, "%s.gt", refName);
	  fout.open(gtF);
	  for (int i = 0; i < (int)gt.size(); i++) fout<< gt[i]<< endl;
	  fout.close();
	  sprintf(taF, "%s.ta", refName);
	  fout.open(taF);
	  for (int i = 0; i < (int)ta.size(); i++) fout<< ta[i]<< endl;
	  fout.close();
	  if (verbose) { printf("Allele-specific group files are generated!\n"); }
	}

	sprintf(refFastaF, "%s.transcripts.fa", refName);
	sprintf(chromListF, "%s.chrlist", refName);
	fout2.open(chromListF);
	fout.open(refFastaF);
	for (int i = 1; i <= M; i++) {
		name = transcripts.getTranscriptAt(i).getSeqName();
		iter = name2seq.find(name);
		general_assert(iter != name2seq.end(), "Cannot recognize sequence ID" + name + "!");
		fout<<">"<<name<<endl;
		fout<<iter->second<<endl;

		fout2<<name<<'\t'<<iter->second.length()<<endl;
	}
	fout.close();
	fout2.close();
	
	if (verbose) { 
	  printf("Chromosome List File is generated!\n"); 
	  printf("Extracted Sequences File is generated!\n"); 
	}
}
Ejemplo n.º 14
0
/**
 * Entry point of synthesisRef.
 *
 * Arguments: refName quiet hasMappingFile [mappingFile]
 *            reference_file_1 [reference_file_2 ...]
 *
 * Every record of every FASTA file becomes one single-interval transcript
 * on the '+' strand whose sequence is stored in name2seq. Without a mapping
 * file the gene ID and transcript ID are the sequence name; with one the
 * gene ID comes from mi_table and, for hasMappingFile == 2, the transcript
 * ID comes from mi_table2.
 *
 * line_no counts every line read from the current file, including header
 * lines, so the value handed to check() is the number of the last line of
 * the record being checked.
 */
int main(int argc, char* argv[]) {
	if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) {
		printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n");
		exit(-1);
	}

	verbose = !atoi(argv[2]);

	if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); }

	// allele-specific
	if (hasMappingFile == 2) { transcripts.setType(2); }

	// The reference files follow the optional mapping file.
	int start = hasMappingFile ? 5 : 4;

	ifstream fin;
	string line, gseq;
	string seqname, gene_id, transcript_id;

	vector<Interval> vec;

	M = 0;
	name2seq.clear();
	for (int i = start; i < argc; i++) {
		fin.open(argv[i]);
		general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); 
		unsigned long int line_no = 0; //Keep track of file line number
		getline(fin, line);
		line_no += 1;
		while ((fin) && (line[0] == '>')) {
			// The sequence name is the first token after '>'.
			istringstream strin(line.substr(1));
			strin>>seqname;

			gseq = "";
			while((getline(fin, line)) && (line[0] != '>')) {
				line_no += 1;
				gseq += line;
			}

			int len = gseq.length();
			assert(len > 0);
			for (int j = 0; j < len; j++) gseq[j] = check(gseq[j],line_no);

			// If the stream is still good, the loop above stopped on the
			// header of the next record. That line was read but not counted
			// yet; count it now so the numbers of later records do not drift.
			if (fin) line_no += 1;

			name2seq[seqname] = gseq;

			transcript_id = seqname;
			gene_id = seqname;

			if (hasMappingFile) {
				mi_iter = mi_table.find(seqname);
				general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!");
				gene_id = mi_iter->second;
				if (hasMappingFile == 2) {
					mi_iter2 = mi_table2.find(seqname);
					general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!");
					transcript_id = mi_iter2->second;
				}
			}
			
			// The whole sequence is a single interval.
			vec.clear();
			vec.push_back(Interval(1, len));
			transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, ""));
			++M;

			if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); }
		}
		fin.close();
	}

	if (M < 1) {
		fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
		exit(-1);
	}

	assert(M == transcripts.getM());
	transcripts.sort();

	writeResults(hasMappingFile, argv[1]);

	return 0;
}