int main(int argc, char* argv[]) { if (argc < 2) { printf("Usage: PROBer-build-reference refName [--gtf gtfF] [--mapping mappingF] [--allele-specific] [--files num_of_files file_1 file_2 ...] [--n2g-index] [-q]\n"); exit(-1); } hasGTF = false; mappingType = 0; n2g_idx = false; int argpos = 2; while (argpos < argc) { if (!strcmp(argv[argpos], "--gtf")) { hasGTF = true; strcpy(gtfF, argv[++argpos]); } if (!strcmp(argv[argpos], "--mapping")) { mappingType = 1; mappingPos = ++argpos; } if (!strcmp(argv[argpos], "--allele-specific")) mappingType = 2; if (!strcmp(argv[argpos], "--files")) { num_files = atoi(argv[++argpos]); file_pos = argpos + 1; // the position in argv for the first file argpos += num_files; } if (!strcmp(argv[argpos], "--n2g-index")) n2g_idx = true; if (!strcmp(argv[argpos], "-q")) verbose = false; ++argpos; } if (mappingType > 0) loadMappingInfo(mappingType, argv[mappingPos]); ifstream fin; string line, gseq, tseq; // gseq, genomic sequence; tseq, transcript sequence string seqname, gene_id, transcript_id; if (hasGTF) { transcripts.setType(0); assert(mappingType < 2); parse_gtf_file(gtfF); M = transcripts.getM(); general_assert(M > 0, "The reference contains no transcripts!"); seqs.assign(M + 1, ""); chrvec.clear(); for (int i = 0; i < num_files; ++i, ++file_pos) { fin.open(argv[file_pos]); general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[file_pos]) + "! It may not exist."); getline(fin, line); while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; while((getline(fin, line)) && (line[0] != '>')) { gseq += line; } assert(gseq.length() > 0); sn2tr_iter = sn2tr.find(seqname); if (sn2tr_iter == sn2tr.end()) continue; chrvec.push_back(ChrInfo(seqname, gseq.length())); vector<int>& vec = sn2tr_iter->second; int s = vec.size(); for (int j = 0; j < s; ++j) { assert(vec[j] > 0 && vec[j] <= M); transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); } } fin.close(); if (verbose) { printf("%s is processed!\n", argv[file_pos]); } } sort(chrvec.begin(), chrvec.end()); // Shrink and build up Refs int curp = 0; for (int i = 1; i <= M; ++i) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (seqs[i] == "") printf("Warning: Cannot extract transcript %s because the chromosome it locates -- %s -- is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str()); else { refs.addRef(transcript.getTranscriptID(), seqs[i]); // insert RefSeqs ++curp; transcripts.move(i, curp); } } printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp); transcripts.setM(curp); M = transcripts.getM(); general_assert(M > 0, "The reference contains no transcripts!"); assert(refs.getM() == M); } else {
void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); string line, curgid, tid, gid; // curgid: current gene id; GTFItem item; if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); } int cnt = 0; items.clear(); while (getline(fin, line)) { if (line[0] == '#') continue; // if this line is comment, jump it item.parse(line); string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n"); fprintf(stderr, "\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n"); fprintf(stderr, "\t%s\n\n", line.c_str()); } else { if (hasMappingFile) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); if (mi_iter == mi_table.end()) { fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str()); exit(-1); } //assert(iter != table.end()); gid = mi_iter->second; item.setGeneID(gid); } items.push_back(item); } } ++cnt; if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); } } fin.close(); sort(items.begin(), items.end()); starts.clear(); sn2tr.clear(); curgid = ""; int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); while (sp < nItems) { tid = items[sp].getTranscriptID(); gid = items[sp].getGeneID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ep++; ep--; buildTranscript(sp, ep); int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); if (curgid != gid) { starts.push_back(sid); curgid = gid; } iter = sn2tr.find(transcript.getSeqName()); if (iter == sn2tr.end()) { vector<int> vec(1, sid); sn2tr[transcript.getSeqName()] = vec; } else { iter->second.push_back(sid); } sp = ep + 1; } M = transcripts.getM(); starts.push_back(M + 1); items.clear(); if (M < 1) { fprintf(stderr, "Number of transcripts in the reference is less than 1!\n"); exit(-1); } if (verbose) { printf("Parsing gtf File is done!\n"); } }
void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); string line, tid, gid; GTFItem item; general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist."); int cnt = 0; items.clear(); while (getline(fin, line)) { if (skip(line)) continue; item.parse(line); string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n"); printf("\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { printf("Warning: exon's start position is less than 1! This exon is discarded.\n"); printf("\t%s\n\n", line.c_str()); } else { item.parseAttributes(line); if (mappingType > 0) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!"); gid = mi_iter->second; item.setGeneID(gid); } items.push_back(item); } } ++cnt; if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); } } fin.close(); sort(items.begin(), items.end()); int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); sn2tr.clear(); while (sp < nItems) { tid = items[sp].getTranscriptID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ++ep; --ep; buildTranscript(sp, ep); int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); sn2tr_iter = sn2tr.find(transcript.getSeqName()); if (sn2tr_iter == sn2tr.end()) { vector<int> vec(1, sid); sn2tr[transcript.getSeqName()] = vec; } else { sn2tr_iter->second.push_back(sid); } sp = ep + 1; } items.clear(); if (verbose) { printf("Parsing GTF File is done!\n"); } }
int main(int argc, char* argv[]) { if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) { printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n"); exit(-1); } verbose = !atoi(argv[2]); if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); } // allele-specific if (hasMappingFile == 2) { transcripts.setType(2); } int start = hasMappingFile ? 5 : 4; ifstream fin; string line, gseq; string seqname, gene_id, transcript_id; vector<Interval> vec; M = 0; name2seq.clear(); for (int i = start; i < argc; i++) { fin.open(argv[i]); general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); unsigned long int line_no = 0; //Keep track of file line number getline(fin, line); line_no += 1; while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; while((getline(fin, line)) && (line[0] != '>')) { line_no += 1; gseq += line; } int len = gseq.length(); assert(len > 0); for (int j = 0; j < len; j++) gseq[j] = check(gseq[j],line_no); name2seq[seqname] = gseq; transcript_id = seqname; gene_id = seqname; if (hasMappingFile) { mi_iter = mi_table.find(seqname); general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!"); gene_id = mi_iter->second; if (hasMappingFile == 2) { mi_iter2 = mi_table2.find(seqname); general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!"); transcript_id = mi_iter2->second; } } vec.clear(); vec.push_back(Interval(1, len)); transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, "")); ++M; if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); } } fin.close(); } if (M < 1) { fprintf(stderr, "Number of transcripts in the reference is less than 1!\n"); exit(-1); } assert(M == transcripts.getM()); transcripts.sort(); writeResults(hasMappingFile, argv[1]); return 0; }