void writeResults(char* refName) { int s; ofstream fout; sprintf(groupF, "%s.grp", refName); sprintf(tiF, "%s.ti", refName); sprintf(refFastaF, "%s.transcripts.fa", refName); sprintf(chromListF, "%s.chrlist", refName); fout.open(groupF); s = starts.size(); for (int i = 0; i < s; i++) fout<<starts[i]<<endl; fout.close(); if (verbose) { printf("Group File is generated!\n"); } transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } fout.open(chromListF); s = chrvec.size(); for (int i = 0; i < s; i++) { fout<<chrvec[i].name<<'\t'<<chrvec[i].len<<endl; } fout.close(); if (verbose) { printf("Chromosome List File is generated!\n"); } fout.open(refFastaF); for (int i = 1; i <= M; i++) { fout<<">"<<transcripts.getTranscriptAt(i).getTranscriptID()<<endl; fout<<seqs[i]<<endl; } fout.close(); if (verbose) { printf("Extracted Sequences File is generated!\n"); } }
// Write the model file (statName.model) and the isoform-level
// (imdName.iso_res) and gene-level (imdName.gene_res) result tables.
//
// Each result file consists of tab-separated rows:
//   iso_res : transcript IDs, counts, tau values, gene IDs
//   gene_res: gene IDs, summed counts, summed tau values, and for every gene
//             the comma-separated IDs of its transcripts
// tau[i] is theta[i] / eel[i] for transcripts with eel[i] >= EPSILON (0
// otherwise), normalised so that all tau values sum to one.
//
// model : model object; only its write() method is used here.
// counts: per-transcript counts, read at indices 1..M.
//
// Terminates through general_assert if no transcript has a positive tau or if
// a result file cannot be opened for writing.
void writeResults(ModelType& model, double* counts) {
	char outF[STRLEN];
	FILE *fo;

	sprintf(modelF, "%s.model", statName);
	model.write(modelF);

	//calculate tau values
	double *tau = new double[M + 1](); // value-initialised, so every entry starts at 0.0

	double denom = 0.0;
	for (int i = 1; i <= M; i++)
		if (eel[i] >= EPSILON) {
			tau[i] = theta[i] / eel[i];
			denom += tau[i];
		}

	general_assert(denom > 0, "No alignable reads?!");

	for (int i = 1; i <= M; i++) {
		tau[i] /= denom;
	}

	//isoform level results
	// snprintf bounds the write to the local buffer instead of overrunning it.
	snprintf(outF, sizeof(outF), "%s.iso_res", imdName);
	fo = fopen(outF, "w");
	// Without this check a failed fopen would hand a null FILE* to fprintf.
	general_assert(fo != NULL, "Cannot open the isoform-level results file for writing!");
	for (int i = 1; i <= M; i++) {
		const Transcript& transcript = transcripts.getTranscriptAt(i);
		fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
	}
	for (int i = 1; i <= M; i++)
		fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
	for (int i = 1; i <= M; i++)
		fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
	for (int i = 1; i <= M; i++) {
		const Transcript& transcript = transcripts.getTranscriptAt(i);
		fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
	}
	fclose(fo);

	//gene level results
	snprintf(outF, sizeof(outF), "%s.gene_res", imdName);
	fo = fopen(outF, "w");
	general_assert(fo != NULL, "Cannot open the gene-level results file for writing!");
	// Gene i covers the transcripts [gi.spAt(i), gi.spAt(i + 1)).
	for (int i = 0; i < m; i++) {
		const string& gene_id = transcripts.getTranscriptAt(gi.spAt(i)).getGeneID();
		fprintf(fo, "%s%c", gene_id.c_str(), (i < m - 1 ? '\t' : '\n'));
	}
	for (int i = 0; i < m; i++) {
		double sumC = 0.0; // sum of counts
		int b = gi.spAt(i), e = gi.spAt(i + 1);
		for (int j = b; j < e; j++) sumC += counts[j];
		fprintf(fo, "%.2f%c", sumC, (i < m - 1 ? '\t' : '\n'));
	}
	for (int i = 0; i < m; i++) {
		double sumT = 0.0; // sum of tau values
		int b = gi.spAt(i), e = gi.spAt(i + 1);
		for (int j = b; j < e; j++) sumT += tau[j];
		fprintf(fo, "%.15g%c", sumT, (i < m - 1 ? '\t' : '\n'));
	}
	for (int i = 0; i < m; i++) {
		int b = gi.spAt(i), e = gi.spAt(i + 1);
		for (int j = b; j < e; j++) {
			// ',' between transcripts of one gene, '\t' between genes, '\n' after the last gene
			fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : (i < m - 1 ? '\t' : '\n')));
		}
	}
	fclose(fo);

	delete[] tau;

	if (verbose) { printf("Expression Results are written!\n"); }
}
int main(int argc, char* argv[]) { if (argc < 2) { printf("Usage: PROBer-build-reference refName [--gtf gtfF] [--mapping mappingF] [--allele-specific] [--files num_of_files file_1 file_2 ...] [--n2g-index] [-q]\n"); exit(-1); } hasGTF = false; mappingType = 0; n2g_idx = false; int argpos = 2; while (argpos < argc) { if (!strcmp(argv[argpos], "--gtf")) { hasGTF = true; strcpy(gtfF, argv[++argpos]); } if (!strcmp(argv[argpos], "--mapping")) { mappingType = 1; mappingPos = ++argpos; } if (!strcmp(argv[argpos], "--allele-specific")) mappingType = 2; if (!strcmp(argv[argpos], "--files")) { num_files = atoi(argv[++argpos]); file_pos = argpos + 1; // the position in argv for the first file argpos += num_files; } if (!strcmp(argv[argpos], "--n2g-index")) n2g_idx = true; if (!strcmp(argv[argpos], "-q")) verbose = false; ++argpos; } if (mappingType > 0) loadMappingInfo(mappingType, argv[mappingPos]); ifstream fin; string line, gseq, tseq; // gseq, genomic sequence; tseq, transcript sequence string seqname, gene_id, transcript_id; if (hasGTF) { transcripts.setType(0); assert(mappingType < 2); parse_gtf_file(gtfF); M = transcripts.getM(); general_assert(M > 0, "The reference contains no transcripts!"); seqs.assign(M + 1, ""); chrvec.clear(); for (int i = 0; i < num_files; ++i, ++file_pos) { fin.open(argv[file_pos]); general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[file_pos]) + "! 
It may not exist."); getline(fin, line); while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; while((getline(fin, line)) && (line[0] != '>')) { gseq += line; } assert(gseq.length() > 0); sn2tr_iter = sn2tr.find(seqname); if (sn2tr_iter == sn2tr.end()) continue; chrvec.push_back(ChrInfo(seqname, gseq.length())); vector<int>& vec = sn2tr_iter->second; int s = vec.size(); for (int j = 0; j < s; ++j) { assert(vec[j] > 0 && vec[j] <= M); transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); } } fin.close(); if (verbose) { printf("%s is processed!\n", argv[file_pos]); } } sort(chrvec.begin(), chrvec.end()); // Shrink and build up Refs int curp = 0; for (int i = 1; i <= M; ++i) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (seqs[i] == "") printf("Warning: Cannot extract transcript %s because the chromosome it locates -- %s -- is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str()); else { refs.addRef(transcript.getTranscriptID(), seqs[i]); // insert RefSeqs ++curp; transcripts.move(i, curp); } } printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp); transcripts.setM(curp); M = transcripts.getM(); general_assert(M > 0, "The reference contains no transcripts!"); assert(refs.getM() == M); } else {
void writeToDisk(char* refName) { ofstream fout; sprintf(tiF, "%s.ti", refName); transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } sprintf(refFastaF, "%s.transcripts.fa", refName); refs.writeTo(refFastaF); sprintf(transListF, "%s.translist", refName); refs.writeTransListTo(transListF); sprintf(chromListF, "%s.chrlist", refName); fout.open(chromListF); for (int i = 0; i < (int)chrvec.size(); ++i) fout<< chrvec[i].name<< '\t'<< chrvec[i].len<< endl; fout.close(); if (verbose) { printf("Chromosome List File is generated!\n"); } string cur_gene_id, cur_transcript_id, name; vector<int> gi, gt, ta; cur_gene_id = ""; gi.clear(); if (mappingType == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); } for (int i = 1; i <= M; ++i) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (cur_gene_id != transcript.getGeneID()) { gi.push_back(i); if (mappingType == 2) gt.push_back((int)ta.size()); cur_gene_id = transcript.getGeneID(); } if ((mappingType == 2) && (cur_transcript_id != transcript.getTranscriptID())) { ta.push_back(i); cur_transcript_id = transcript.getTranscriptID(); } } gi.push_back(M + 1); if (mappingType == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); } sprintf(groupF, "%s.grp", refName); fout.open(groupF); for (int i = 0; i < (int)gi.size(); ++i) fout<< gi[i]<< endl; fout.close(); if (verbose) { printf("Group File is generated!\n"); } if (mappingType == 2) { sprintf(gtF, "%s.gt", refName); fout.open(gtF); for (int i = 0; i < (int)gt.size(); ++i) fout<< gt[i]<< endl; fout.close(); sprintf(taF, "%s.ta", refName); fout.open(taF); for (int i = 0; i < (int)ta.size(); ++i) fout<< ta[i]<< endl; fout.close(); if (verbose) { printf("Allele-specific group files are generated!\n"); } } if (n2g_idx) { sprintf(n2g_idxF, "%s.n2g.idx.fa", refName); fout.open(n2g_idxF); for (int i = 1; i <= M; ++i) fout<< '>'<< refs.getRef(i)->getName()<< endl<< n2g(refs.getRef(i)->getSeq())<< endl; fout.close(); if 
(verbose) printf("%s is generated!\n", n2g_idxF); } }
void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); string line, tid, gid; GTFItem item; general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist."); int cnt = 0; items.clear(); while (getline(fin, line)) { if (skip(line)) continue; item.parse(line); string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n"); printf("\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { printf("Warning: exon's start position is less than 1! This exon is discarded.\n"); printf("\t%s\n\n", line.c_str()); } else { item.parseAttributes(line); if (mappingType > 0) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!"); gid = mi_iter->second; item.setGeneID(gid); } items.push_back(item); } } ++cnt; if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); } } fin.close(); sort(items.begin(), items.end()); int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); sn2tr.clear(); while (sp < nItems) { tid = items[sp].getTranscriptID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ++ep; --ep; buildTranscript(sp, ep); int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); sn2tr_iter = sn2tr.find(transcript.getSeqName()); if (sn2tr_iter == sn2tr.end()) { vector<int> vec(1, sid); sn2tr[transcript.getSeqName()] = vec; } else { sn2tr_iter->second.push_back(sid); } sp = ep + 1; } items.clear(); if (verbose) { printf("Parsing GTF File is done!\n"); } }
int main(int argc, char* argv[]) { if (argc < 6 || ((hasMappingFile = atoi(argv[4])) && argc < 7)) { printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n"); exit(-1); } verbose = !atoi(argv[2]); if (hasMappingFile) { loadMappingInfo(argv[5]); } parse_gtf_file(argv[3]); ifstream fin; string line, gseq, seqname; chrvec.clear(); seqs.clear(); seqs.resize(M + 1, ""); int start = hasMappingFile ? 6 : 5; for (int i = start; i < argc; i++) { fin.open(argv[i]); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); } getline(fin, line); while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; while((getline(fin, line)) && (line[0] != '>')) { gseq += line; } size_t len = gseq.length(); assert(len > 0); for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]); iter = sn2tr.find(seqname); if (iter == sn2tr.end()) continue; chrvec.push_back(ChrInfo(seqname, len)); vector<int>& vec = iter->second; int s = vec.size(); for (int j = 0; j < s; j++) { assert(vec[j] > 0 && vec[j] <= M); transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); } } fin.close(); if (verbose) { printf("%s is processed!\n", argv[i]); } } for (int i = 1; i <= M; i++) { if (seqs[i] == "") { const Transcript& transcript = transcripts.getTranscriptAt(i); fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s! Loading chromosome %s's sequence is failed. Please check if 1) the chromosome directory is set correctly; 2) the list of chromosome files is complete; 3) the FASTA files containing chromosome sequences are not truncated or having wrong format.\n", \ transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str(), transcript.getSeqName().c_str()); exit(-1); } } sort(chrvec.begin(), chrvec.end()); if (verbose) { printf("Extracting sequences is done!\n"); } writeResults(argv[1]); return 0; }
void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); string line, curgid, tid, gid; // curgid: current gene id; GTFItem item; if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); } int cnt = 0; items.clear(); while (getline(fin, line)) { if (line[0] == '#') continue; // if this line is comment, jump it item.parse(line); string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n"); fprintf(stderr, "\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n"); fprintf(stderr, "\t%s\n\n", line.c_str()); } else { if (hasMappingFile) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); if (mi_iter == mi_table.end()) { fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str()); exit(-1); } //assert(iter != table.end()); gid = mi_iter->second; item.setGeneID(gid); } items.push_back(item); } } ++cnt; if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); } } fin.close(); sort(items.begin(), items.end()); starts.clear(); sn2tr.clear(); curgid = ""; int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); while (sp < nItems) { tid = items[sp].getTranscriptID(); gid = items[sp].getGeneID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ep++; ep--; buildTranscript(sp, ep); int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); if (curgid != gid) { starts.push_back(sid); curgid = gid; } iter = sn2tr.find(transcript.getSeqName()); if (iter == sn2tr.end()) { vector<int> vec(1, sid); sn2tr[transcript.getSeqName()] = vec; } else { iter->second.push_back(sid); } sp = ep + 1; } M = transcripts.getM(); starts.push_back(M + 1); items.clear(); if (M < 1) { fprintf(stderr, 
"Number of transcripts in the reference is less than 1!\n"); exit(-1); } if (verbose) { printf("Parsing gtf File is done!\n"); } }
void writeResults(int option, char* refName) { ofstream fout, fout2; string cur_gene_id, cur_transcript_id, name; vector<int> gi, gt, ta; sprintf(tiF, "%s.ti", refName); transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } cur_gene_id = ""; gi.clear(); if (option == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); } for (int i = 1; i <= M; i++) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (cur_gene_id != transcript.getGeneID()) { gi.push_back(i); if (option == 2) gt.push_back((int)ta.size()); cur_gene_id = transcript.getGeneID(); } if ((option == 2) && (cur_transcript_id != transcript.getTranscriptID())) { ta.push_back(i); cur_transcript_id = transcript.getTranscriptID(); } } gi.push_back(M + 1); if (option == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); } sprintf(groupF, "%s.grp", refName); fout.open(groupF); for (int i = 0; i < (int)gi.size(); i++) fout<< gi[i]<< endl; fout.close(); if (verbose) { printf("Group File is generated!\n"); } if (option == 2) { sprintf(gtF, "%s.gt", refName); fout.open(gtF); for (int i = 0; i < (int)gt.size(); i++) fout<< gt[i]<< endl; fout.close(); sprintf(taF, "%s.ta", refName); fout.open(taF); for (int i = 0; i < (int)ta.size(); i++) fout<< ta[i]<< endl; fout.close(); if (verbose) { printf("Allele-specific group files are generated!\n"); } } sprintf(refFastaF, "%s.transcripts.fa", refName); sprintf(chromListF, "%s.chrlist", refName); fout2.open(chromListF); fout.open(refFastaF); for (int i = 1; i <= M; i++) { name = transcripts.getTranscriptAt(i).getSeqName(); iter = name2seq.find(name); general_assert(iter != name2seq.end(), "Cannot recognize sequence ID" + name + "!"); fout<<">"<<name<<endl; fout<<iter->second<<endl; fout2<<name<<'\t'<<iter->second.length()<<endl; } fout.close(); fout2.close(); if (verbose) { printf("Chromosome List File is generated!\n"); printf("Extracted Sequences File is generated!\n"); } }