Ejemplo n.º 1
0
int main(int argc, char* argv[]) {
	ifstream fin;
	bool quiet = false;

	if (argc < 5) {
		printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
		printf("  refName: reference name\n");
		printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
		printf("  sampleName: sample's name, including the path\n");
		printf("  sampleToken: sampleName excludes the path\n");
		printf("  -p: number of threads which user wants to use. (default: 1)\n");
		printf("  -b: produce bam format output file. (default: off)\n");
		printf("  -q: set it quiet\n");
		printf("  --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n");
		printf("  --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
		printf("// model parameters should be in imdName.mparams.\n");
		exit(-1);
	}

	time_t a = time(NULL);

	strcpy(refName, argv[1]);
	read_type = atoi(argv[2]);
	strcpy(outName, argv[3]);
	sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
	sprintf(statName, "%s.stat/%s", argv[3], argv[4]);

	nThreads = 1;

	genBamF = false;
	bamSampling = false;
	genGibbsOut = false;
	pt_fn_list = pt_chr_list = NULL;

	for (int i = 5; i < argc; i++) {
		if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); }
		if (!strcmp(argv[i], "-b")) {
			genBamF = true;
			inpSamType = argv[i + 1][0];
			strcpy(inpSamF, argv[i + 2]);
			if (atoi(argv[i + 3]) == 1) {
				strcpy(fn_list, argv[i + 4]);
				pt_fn_list = (char*)(&fn_list);
			}
		}
		if (!strcmp(argv[i], "-q")) { quiet = true; }
		if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
		if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
	}

	general_assert(nThreads > 0, "Number of threads should be bigger than 0!");

	verbose = !quiet;

	//basic info loading
	sprintf(refF, "%s.seq", refName);
	refs.loadRefs(refF);
	M = refs.getM();
	sprintf(groupF, "%s.grp", refName);
	gi.load(groupF);
	m = gi.getm();

	sprintf(tiF, "%s.ti", refName);
	transcripts.readFrom(tiF);

	sprintf(cntF, "%s.cnt", statName);
	fin.open(cntF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist.");

	fin>>N0>>N1>>N2>>N_tot;
	fin.close();

	general_assert(N1 > 0, "There are no alignable reads!");

	if ((READ_INT_TYPE)nThreads > N1) nThreads = N1;

	//set model parameters
	mparams.M = M;
	mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2;
	mparams.refs = &refs;

	sprintf(mparamsF, "%s.mparams", imdName);
	fin.open(mparamsF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "It may not exist.");

	fin>> mparams.minL>> mparams.maxL>> mparams.probF;
	int val; // 0 or 1 , for estRSPD
	fin>>val;
	mparams.estRSPD = (val != 0);
	fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd;
	fin>> mparams.seedLen;
	fin.close();

	//run EM
	switch(read_type) {
	case 0 : EM<SingleRead, SingleHit, SingleModel>(); break;
	case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break;
	case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break;
	case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break;
	default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1);
	}

	time_t b = time(NULL);

	printTimeUsed(a, b, "EM.cpp");

	return 0;
}
Ejemplo n.º 2
0
int main(int argc, char* argv[]) {
	if (argc < 8) {
		printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nCV nSpC nMB [-p #Threads] [-q]\n");
		exit(-1);
	}

	confidence = atof(argv[4]);
	nCV = atoi(argv[5]);
	nSpC = atoi(argv[6]);
	nMB = atoi(argv[7]);

	nThreads = 1;
	quiet = false;
	for (int i = 8; i < argc; i++) {
		if (!strcmp(argv[i], "-p")) nThreads = atoi(argv[i + 1]);
		if (!strcmp(argv[i], "-q")) quiet = true;
	}
	verbose = !quiet;

	sprintf(refF, "%s.seq", argv[1]);
	refs.loadRefs(refF, 1);
	M = refs.getM();
	sprintf(groupF, "%s.grp", argv[1]);
	gi.load(groupF);
	m = gi.getm();

	nSamples = nCV * nSpC;
	cvlen = M + 1;
	assert(nSamples > 0 && cvlen > 1); // for Buffter.h: (bufsize_type)nSamples

	sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
	sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
	sprintf(tmpF, "%s.tmp", imdName);
	sprintf(cvsF, "%s.countvectors", imdName);

	sprintf(modelF, "%s.model", statName);
	FILE *fi = fopen(modelF, "r");
	general_assert(fi != NULL, "Cannot open " + cstrtos(modelF) + "!");
	assert(fscanf(fi, "%d", &model_type) == 1);
	fclose(fi);

	// Phase I
	switch(model_type) {
	case 0 : sample_theta_vectors_from_count_vectors<SingleModel>(); break;
	case 1 : sample_theta_vectors_from_count_vectors<SingleQModel>(); break;
	case 2 : sample_theta_vectors_from_count_vectors<PairedEndModel>(); break;
	case 3 : sample_theta_vectors_from_count_vectors<PairedEndQModel>(); break;
	}

	// Phase II
	calculate_credibility_intervals(imdName);

	/*
	sprintf(command, "rm -f %s", tmpF);
	int status = system(command);
	if (status != 0) {
		fprintf(stderr, "Cannot delete %s!\n", tmpF);
		exit(-1);
	}
	*/

	return 0;
}
Ejemplo n.º 3
0
int main(int argc, char* argv[]) {
	if (argc < 2) {
		printf("Usage: PROBer-build-reference refName [--gtf gtfF] [--mapping mappingF] [--allele-specific] [--files num_of_files file_1 file_2 ...] [--n2g-index] [-q]\n");
		exit(-1);
	}

	hasGTF = false;
	mappingType = 0;
	n2g_idx = false;
	
	int argpos = 2;
	while (argpos < argc) {
		if (!strcmp(argv[argpos], "--gtf")) {
			hasGTF = true;
			strcpy(gtfF, argv[++argpos]);
		}
		if (!strcmp(argv[argpos], "--mapping")) {
			mappingType = 1;
			mappingPos = ++argpos;
		}
		if (!strcmp(argv[argpos], "--allele-specific")) mappingType = 2;
		if (!strcmp(argv[argpos], "--files")) {
			num_files = atoi(argv[++argpos]);
			file_pos = argpos + 1; // the position in argv for the first file
			argpos += num_files;
		}
		if (!strcmp(argv[argpos], "--n2g-index")) n2g_idx = true;
		if (!strcmp(argv[argpos], "-q")) verbose = false;
		++argpos;
	}

	if (mappingType > 0) loadMappingInfo(mappingType, argv[mappingPos]);

	ifstream fin;
	string line, gseq, tseq; // gseq, genomic sequence; tseq, transcript sequence
	string seqname, gene_id, transcript_id;
	
	if (hasGTF) {
		transcripts.setType(0);
		assert(mappingType < 2);
		parse_gtf_file(gtfF);

		M = transcripts.getM();
		general_assert(M > 0, "The reference contains no transcripts!");
		seqs.assign(M + 1, "");
		
		chrvec.clear();
		
		for (int i = 0; i < num_files; ++i, ++file_pos) {
			fin.open(argv[file_pos]);
			general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[file_pos]) + "! It may not exist.");
			getline(fin, line);
			while ((fin) && (line[0] == '>')) {
	istringstream strin(line.substr(1));
	strin>>seqname;
	
	gseq = "";
	while((getline(fin, line)) && (line[0] != '>')) {
		gseq += line;
	}
	assert(gseq.length() > 0);
			
	sn2tr_iter = sn2tr.find(seqname);
	if (sn2tr_iter == sn2tr.end()) continue;
	
	chrvec.push_back(ChrInfo(seqname, gseq.length()));
	
	vector<int>& vec = sn2tr_iter->second;
	int s = vec.size();
	for (int j = 0; j < s; ++j) {
		assert(vec[j] > 0 && vec[j] <= M);
		transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]);
	}
			}
			fin.close();

			if (verbose) { printf("%s is processed!\n", argv[file_pos]); } 
		}
		
		sort(chrvec.begin(), chrvec.end());

		// Shrink and build up Refs
		int curp = 0;
		for (int i = 1; i <= M; ++i) {
			const Transcript& transcript = transcripts.getTranscriptAt(i);
			if (seqs[i] == "") 
	printf("Warning: Cannot extract transcript %s because the chromosome it locates -- %s -- is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str());
			else {
	refs.addRef(transcript.getTranscriptID(), seqs[i]); // insert RefSeqs
	++curp;
	transcripts.move(i, curp);
			}
		}
		printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp);
		
		transcripts.setM(curp);
		M = transcripts.getM();
		general_assert(M > 0, "The reference contains no transcripts!");
		assert(refs.getM() == M);
	}
	else {
Ejemplo n.º 4
0
int main(int argc, char* argv[]) {
	ifstream fin;
	bool quiet = false;

	if (argc < 6) {
		printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling] [--seed seed] [--calc-evaluation-score nb_r nb_p L w]\n\n");
		printf("  refName: reference name\n");
		printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
		printf("  sampleName: sample's name, including the path\n");
		printf("  sampleToken: sampleName excludes the path\n");
		printf("  -p: number of threads which user wants to use. (default: 1)\n");
		printf("  -b: produce bam format output file. (default: off)\n");
		printf("  -q: set it quiet\n");
		printf("  --gibbs-out: generate output file use by Gibbs sampler. (default: off)\n");
		printf("  --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
		printf("  --seed uint32: the seed used for the BAM sampling. (default: off)\n");
		printf("  --calc-evaluation-score nb_r nb_p L w: "
				"nb_r and nb_p are parameters for the true transcript length distribution, which is modeled by a negative binomial distribution; "
				"L is the read length and w is the mininum overlap required for joining two contigs.\n");
		printf("// model parameters should be in imdName.mparams.\n");
		exit(-1);
	}

	time_t a = time(NULL);

	strcpy(refName, argv[1]);
	read_type = atoi(argv[2]);
	strcpy(outName, argv[3]);
	strcpy(imdName, argv[4]);
	strcpy(statName, argv[5]);

	nThreads = 1;

	genBamF = false;
	bamSampling = false;
	genGibbsOut = false;
	calcEvalScore = false;
	pt_fn_list = NULL;
	hasSeed = false;

	for (int i = 6; i < argc; i++) {
		if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); }
		if (!strcmp(argv[i], "-b")) {
			genBamF = true;
			inpSamType = argv[i + 1][0];
			strcpy(inpSamF, argv[i + 2]);
			if (atoi(argv[i + 3]) == 1) {
				strcpy(fn_list, argv[i + 4]);
				pt_fn_list = (char*)(&fn_list);
			}
		}
		if (!strcmp(argv[i], "-q")) { quiet = true; }
		if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
		if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
		if (!strcmp(argv[i], "--seed")) {
		  hasSeed = true;
		  int len = strlen(argv[i + 1]);
		  seed = 0;
		  for (int k = 0; k < len; k++) seed = seed * 10 + (argv[i + 1][k] - '0');
		}
		if (!strcmp(argv[i], "--calc-evaluation-score")) {
			calcEvalScore = true;
			nb_r = atof(argv[i + 1]);
			nb_p = atof(argv[i + 2]);
			L = atoi(argv[i + 3]);
			w = atoi(argv[i + 4]);
		}
	}

	general_assert(nThreads > 0, "Number of threads should be bigger than 0!");

	verbose = !quiet;

	//basic info loading
	sprintf(refF, "%s.seq", refName);
	refs.loadRefs(refF);
	M = refs.getM();

	sprintf(tiF, "%s.ti", refName);
	transcripts.readFrom(tiF);

	sprintf(cntF, "%s.cnt", statName);
	fin.open(cntF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist.");

	fin>>N0>>N1>>N2>>N_tot;
	fin.close();

	general_assert(N1 > 0, "There are no alignable reads!");

	if ((READ_INT_TYPE)nThreads > N1) nThreads = N1;

	//set model parameters
	mparams.M = M;
	mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2;
	mparams.refs = &refs;

	sprintf(mparamsF, "%s.mparams", imdName);
	fin.open(mparamsF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "It may not exist.");

	fin>> mparams.minL>> mparams.maxL>> mparams.probF;
	int val; // 0 or 1 , for estRSPD
	fin>>val;
	mparams.estRSPD = (val != 0);
	fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd;
	fin>> mparams.seedLen;
	fin.close();

	//run EM
	switch(read_type) {
	case 0 : EM<SingleRead, SingleHit, SingleModel>(); break;
	case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break;
	case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break;
	case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break;
	default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1);
	}

	if (calcEvalScore) {
		CalcEvalScore ces(refs, nb_r, nb_p, L, w, statName);
		sprintf(scoreF, "%s.score", outName);
		ces.writeScoresTo(scoreF);
		
		char groupF[STRLEN];
		GroupInfo gi;
		sprintf(groupF, "%s.grp", argv[1]);
		gi.load(groupF);

		ces.generateExpressionFiles(gi, transcripts, scoreF);
	}

	time_t b = time(NULL);

	printTimeUsed(a, b, "EM.cpp");

	return 0;
}