//********************************************************************************************************************** int GetSeqsCommand::readCount(){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } map<string, string> variables; variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(countfile)); variables["[extension]"] = m->getExtension(countfile); string outputFileName = getOutputFileName("count", variables); ofstream out; m->openOutputFile(outputFileName, out); ifstream in; m->openInputFile(countfile, in); bool wroteSomething = false; int selectedCount = 0; string headers = m->getline(in); m->gobble(in); out << headers << endl; string test = headers; vector<string> pieces = m->splitWhiteSpace(test); string name, rest; int thisTotal; rest = ""; while (!in.eof()) { if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } in >> name; m->gobble(in); in >> thisTotal; m->gobble(in); if (pieces.size() > 2) { rest = m->getline(in); m->gobble(in); } if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } if (names.count(name) != 0) { out << name << '\t' << thisTotal << '\t' << rest << endl; wroteSomething = true; selectedCount+= thisTotal; } } in.close(); out.close(); //check for groups that have been eliminated CountTable ct; if (ct.testGroups(outputFileName)) { ct.readTable(outputFileName, true, false); ct.printTable(outputFileName); } if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); m->mothurOut("Selected " + toString(selectedCount) + " sequences from your count file."); m->mothurOutEndLine(); return 0; } catch(exception& e) { m->errorOut(e, "GetSeqsCommand", "readCount"); exit(1); } }
SequenceCountParser::SequenceCountParser(string countfile, string fastafile) { try { m = MothurOut::getInstance(); //read count file CountTable countTable; countTable.readTable(countfile, true, false); //initialize maps namesOfGroups = countTable.getNamesOfGroups(); for (int i = 0; i < namesOfGroups.size(); i++) { vector<Sequence> temp; map<string, int> tempMap; seqs[namesOfGroups[i]] = temp; countTablePerGroup[namesOfGroups[i]] = tempMap; } //read fasta file making sure each sequence is in the group file ifstream in; m->openInputFile(fastafile, in); int fastaCount = 0; while (!in.eof()) { if (m->control_pressed) { break; } Sequence seq(in); m->gobble(in); fastaCount++; if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } } if (seq.getName() != "") { allSeqsMap[seq.getName()] = seq.getName(); vector<int> groupCounts = countTable.getGroupCounts(seq.getName()); for (int i = 0; i < namesOfGroups.size(); i++) { if (groupCounts[i] != 0) { seqs[namesOfGroups[i]].push_back(seq); countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i]; } } } } in.close(); } catch(exception& e) { m->errorOut(e, "SequenceCountParser", "SequenceCountParser"); exit(1); } }
//********************************************************************************************************************** int ListSeqsCommand::readCount(){ try { CountTable ct; ct.readTable(countfile, false, false); if (m->control_pressed) { return 0; } names = ct.getNamesOfSeqs(); return 0; } catch(exception& e) { m->errorOut(e, "ListSeqsCommand", "readCount"); exit(1); } }
//********************************************************************************************************************** int SharedCommand::createSharedFromCount() { try { //getting output filename string filename = countfile; if (outputDir == "") { outputDir += util.hasPath(filename); } map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(filename)); filename = getOutputFileName("shared",variables); outputNames.push_back(filename); outputTypes["shared"].push_back(filename); ofstream out; bool printHeaders = true; util.openOutputFile(filename, out); CountTable ct; ct.readTable(countfile, true, false); map<string, string> seqNameToOtuName; SharedRAbundVectors* lookup = ct.getShared(Groups, seqNameToOtuName); lookup->setLabels(*labels.begin()); lookup->print(out, printHeaders); out.close(); delete lookup; string mapFilename = getOutputFileName("map",variables); outputNames.push_back(mapFilename); outputTypes["map"].push_back(mapFilename); ofstream outMap; util.openOutputFile(mapFilename, outMap); for (map<string, string>::iterator it = seqNameToOtuName.begin(); it != seqNameToOtuName.end(); it++) { outMap << it->first << '\t' << it->second << endl; } outMap.close(); return 0; } catch(exception& e) { m->errorOut(e, "SharedCommand", "createSharedFromCount"); exit(1); } }
int SeqSummaryCommand::execute(){ try{ if (abort == true) { if (calledHelp) { return 0; } return 2; } //set current fasta to fastafile m->setFastaFile(fastafile); map<string, string> variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile)); string summaryFile = getOutputFileName("summary",variables); int numSeqs = 0; vector<int> startPosition; vector<int> endPosition; vector<int> seqLength; vector<int> ambigBases; vector<int> longHomoPolymer; if (namefile != "") { nameMap = m->readNames(namefile); } else if (countfile != "") { CountTable ct; ct.readTable(countfile, false, false); nameMap = ct.getNameMap(); } if (m->control_pressed) { return 0; } #ifdef USE_MPI int pid, numSeqsPerProcessor; int tag = 2001; int startTag = 1; int endTag = 2; int lengthTag = 3; int baseTag = 4; int lhomoTag = 5; int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; vector<unsigned long long> MPIPos; MPI_Status status; MPI_Status statusOut; MPI_File inMPI; MPI_File outMPI; MPI_Comm_size(MPI_COMM_WORLD, &processors); MPI_Comm_rank(MPI_COMM_WORLD, &pid); char tempFileName[1024]; strcpy(tempFileName, fastafile.c_str()); char sumFileName[1024]; strcpy(sumFileName, summaryFile.c_str()); MPI_File_open(MPI_COMM_WORLD, tempFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer MPI_File_open(MPI_COMM_WORLD, sumFileName, outMode, MPI_INFO_NULL, &outMPI); if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); return 0; } if (pid == 0) { //you are the root process //print header string outputString = "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs\n"; int length = outputString.length(); char* buf2 = new char[length]; memcpy(buf2, outputString.c_str(), length); MPI_File_write_shared(outMPI, buf2, length, MPI_CHAR, &statusOut); delete buf2; MPIPos = m->setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs for(int i = 1; i < processors; i++) { MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); } //figure out how many sequences you have to do numSeqsPerProcessor = numSeqs / processors; int startIndex = pid * numSeqsPerProcessor; if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } //do your part MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos); }else { //i am the child process MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); MPIPos.resize(numSeqs+1); MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); //figure out how many sequences you have to align numSeqsPerProcessor = numSeqs / processors; int startIndex = pid * numSeqsPerProcessor; if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } //do your part MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos); } MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case if (pid == 0) { //get the info from the child processes for(int i = 1; i < processors; i++) { int size; MPI_Recv(&size, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); vector<int> temp; temp.resize(size+1); for(int j = 0; j < 5; j++) { MPI_Recv(&temp[0], (size+1), MPI_INT, i, 2001, MPI_COMM_WORLD, &status); int receiveTag = temp[temp.size()-1]; //child process added a int to the end to indicate what count this is for if (receiveTag == startTag) { for (int k = 0; k < size; k++) { startPosition.push_back(temp[k]); } }else if (receiveTag == endTag) { for (int k = 0; k < size; k++) { endPosition.push_back(temp[k]); } }else if (receiveTag == lengthTag) { for (int k = 0; k < size; k++) { seqLength.push_back(temp[k]); } }else if (receiveTag == baseTag) { for (int k = 0; k < size; k++) { ambigBases.push_back(temp[k]); } }else if (receiveTag == lhomoTag) { for (int k = 0; k < size; k++) { longHomoPolymer.push_back(temp[k]); } } } } }else{ //send my counts int size = startPosition.size(); MPI_Send(&size, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); startPosition.push_back(startTag); int ierr = MPI_Send(&(startPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); endPosition.push_back(endTag); ierr = MPI_Send (&(endPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); seqLength.push_back(lengthTag); ierr = MPI_Send(&(seqLength[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); ambigBases.push_back(baseTag); ierr = MPI_Send(&(ambigBases[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); longHomoPolymer.push_back(lhomoTag); ierr = MPI_Send(&(longHomoPolymer[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else vector<unsigned long long> positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) positions = m->divideFile(fastafile, processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } #else positions = m->setFilePosFasta(fastafile, numSeqs); if (positions.size() < processors) { processors = positions.size(); } //figure out how many sequences you have to process int numSeqsPerProcessor = numSeqs / processors; for (int i = 0; i < processors; i++) { int startIndex = i * numSeqsPerProcessor; if(i == (processors - 1)){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); } #endif if(processors == 1){ numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]); }else{ numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile); } if (m->control_pressed) { return 0; } #endif #ifdef USE_MPI if (pid == 0) { #endif sort(startPosition.begin(), startPosition.end()); sort(endPosition.begin(), endPosition.end()); sort(seqLength.begin(), seqLength.end()); sort(ambigBases.begin(), ambigBases.end()); sort(longHomoPolymer.begin(), longHomoPolymer.end()); int size = startPosition.size(); //find means unsigned long long meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer; meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0; for (int i = 0; i < size; i++) { meanStartPosition += startPosition[i]; meanEndPosition += endPosition[i]; meanSeqLength += seqLength[i]; meanAmbigBases += ambigBases[i]; meanLongHomoPolymer += longHomoPolymer[i]; } double meanstartPosition, meanendPosition, meanseqLength, meanambigBases, meanlongHomoPolymer; meanstartPosition = meanStartPosition / (double) size; meanendPosition = meanEndPosition /(double) size; meanlongHomoPolymer = meanLongHomoPolymer / (double) size; meanseqLength = meanSeqLength / (double) size; meanambigBases = meanAmbigBases /(double) size; int ptile0_25 = int(size * 0.025); int ptile25 = int(size * 0.250); int ptile50 = int(size * 0.500); int ptile75 = int(size * 0.750); int ptile97_5 = int(size * 0.975); int ptile100 = size - 1; //to compensate for blank sequences that would result in startPosition and endPostion equalling -1 if (startPosition[0] == -1) { startPosition[0] = 0; } if (endPosition[0] == -1) { endPosition[0] = 0; } if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } m->mothurOutEndLine(); m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine(); m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0]) + "\t" + toString(1)); m->mothurOutEndLine(); m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25]) + "\t" + toString(ptile0_25+1)); m->mothurOutEndLine(); m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25]) + "\t" + toString(ptile25+1)); m->mothurOutEndLine(); m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50]) + "\t" + toString(ptile50+1)); m->mothurOutEndLine(); m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine(); m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine(); m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine(); m->mothurOut("Mean:\t" + toString(meanstartPosition) + "\t" + toString(meanendPosition) + "\t" + toString(meanseqLength) + "\t" + toString(meanambigBases) + "\t" + toString(meanlongHomoPolymer)); m->mothurOutEndLine(); if ((namefile == "") && (countfile == "")) { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); } else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); } if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); m->mothurOut(summaryFile); m->mothurOutEndLine(); outputNames.push_back(summaryFile); outputTypes["summary"].push_back(summaryFile); m->mothurOutEndLine(); #ifdef USE_MPI } #endif //set fasta file as new current fastafile string current = ""; itTypes = outputTypes.find("summary"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSummaryFile(current); } } return 0; } catch(exception& e) { m->errorOut(e, "SeqSummaryCommand", "execute"); exit(1); } }
int RemoveGroupsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } //get groups you want to remove if (accnosfile != "") { m->readAccnos(accnosfile, Groups); m->setGroups(Groups); } if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); //make sure groups are valid //takes care of user setting groupNames that are invalid or setting groups=all vector<string> namesGroups = groupMap->getNamesOfGroups(); vector<string> checkedGroups; for (int i = 0; i < Groups.size(); i++) { if (m->inUsersGroups(Groups[i], namesGroups)) { checkedGroups.push_back(Groups[i]); } else { m->mothurOut("[WARNING]: " + Groups[i] + " is not a valid group in your groupfile, ignoring.\n"); } } if (checkedGroups.size() == 0) { m->mothurOut("[ERROR]: no valid groups, aborting.\n"); delete groupMap; return 0; } else { Groups = checkedGroups; m->setGroups(Groups); } //fill names with names of sequences that are from the groups we want to remove fillNames(); delete groupMap; }else if (countfile != ""){ if ((fastafile != "") || (listfile != "") || (taxfile != "")) { m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); } CountTable ct; ct.readTable(countfile, true, false); if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, aborting.\n"); return 0; } vector<string> gNamesOfGroups = ct.getNamesOfGroups(); SharedUtil util; util.setGroups(Groups, gNamesOfGroups); vector<string> namesOfSeqs = ct.getNamesOfSeqs(); sort(Groups.begin(), Groups.end()); for (int i = 0; i < namesOfSeqs.size(); i++) { vector<string> thisSeqsGroups = ct.getGroups(namesOfSeqs[i]); if (m->isSubset(Groups, thisSeqsGroups)) { //you only have seqs from these groups so remove you names.insert(namesOfSeqs[i]); } } } if (m->control_pressed) { return 0; } //read through the correct file and output lines you want to keep if (namefile != "") { readName(); } if (fastafile != "") { readFasta(); } if (groupfile != "") { readGroup(); } if (countfile != "") { readCount(); } if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } if (sharedfile != "") { readShared(); } if (designfile != "") { readDesign(); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (outputNames.size() != 0) { m->mothurOutEndLine(); m->mothurOut("Output File names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); //set fasta file as new current fastafile string current = ""; itTypes = outputTypes.find("fasta"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } } itTypes = outputTypes.find("name"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } itTypes = outputTypes.find("group"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } } itTypes = outputTypes.find("list"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); } } itTypes = outputTypes.find("taxonomy"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } itTypes = outputTypes.find("shared"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); } } itTypes = outputTypes.find("design"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setDesignFile(current); } } itTypes = outputTypes.find("count"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } } } return 0; } catch(exception& e) { m->errorOut(e, "RemoveGroupsCommand", "execute"); exit(1); } }
//********************************************************************************************************************** int RemoveRareCommand::processList(){ try { //you must provide a label because the names in the listfile need to be consistent string thisLabel = ""; if (allLines) { m->mothurOut("For the listfile you must select one label, using first label in your listfile."); m->mothurOutEndLine(); } else if (labels.size() > 1) { m->mothurOut("For the listfile you must select one label, using " + (*labels.begin()) + "."); m->mothurOutEndLine(); thisLabel = *labels.begin(); } else { thisLabel = *labels.begin(); } InputData input(listfile, "list"); ListVector* list = input.getListVector(); //get first one or the one we want if (thisLabel != "") { //use smart distancing set<string> userLabels; userLabels.insert(thisLabel); set<string> processedLabels; string lastLabel = list->getLabel(); while((list != NULL) && (userLabels.size() != 0)) { if(userLabels.count(list->getLabel()) == 1){ processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); break; } if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); delete list; list = input.getListVector(lastLabel); break; } lastLabel = list->getLabel(); delete list; list = input.getListVector(); } if (userLabels.size() != 0) { m->mothurOut("Your file does not include the label " + thisLabel + ". I will use " + lastLabel + "."); m->mothurOutEndLine(); list = input.getListVector(lastLabel); } } string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(listfile); } map<string, string> variables; variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(listfile)); variables["[extension]"] = m->getExtension(listfile); variables["[tag]"] = list->getLabel(); string outputFileName = getOutputFileName("list", variables); variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)); variables["[extension]"] = m->getExtension(groupfile); string outputGroupFileName = getOutputFileName("group", variables); variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(countfile)); variables["[extension]"] = m->getExtension(countfile); string outputCountFileName = getOutputFileName("count", variables); ofstream out, outGroup; m->openOutputFile(outputFileName, out); bool wroteSomething = false; //if groupfile is given then use it GroupMap* groupMap; CountTable ct; if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); SharedUtil util; vector<string> namesGroups = groupMap->getNamesOfGroups(); util.setGroups(Groups, namesGroups); m->openOutputFile(outputGroupFileName, outGroup); }else if (countfile != "") { ct.readTable(countfile, true, false); if (ct.hasGroupInfo()) { vector<string> namesGroups = ct.getNamesOfGroups(); SharedUtil util; util.setGroups(Groups, namesGroups); } } if (list != NULL) { vector<string> binLabels = list->getLabels(); vector<string> newLabels; //make a new list vector ListVector newList; newList.setLabel(list->getLabel()); //for each bin for (int i = 0; i < list->getNumBins(); i++) { if (m->control_pressed) { if (groupfile != "") { delete groupMap; outGroup.close(); m->mothurRemove(outputGroupFileName); } out.close(); m->mothurRemove(outputFileName); return 0; } //parse out names that are in accnos file string binnames = list->get(i); vector<string> names; string saveBinNames = binnames; m->splitAtComma(binnames, names); int binsize = names.size(); vector<string> newGroupFile; if (groupfile != "") { vector<string> newNames; saveBinNames = ""; for(int k = 0; k < names.size(); k++) { string group = groupMap->getGroup(names[k]); if (m->inUsersGroups(group, Groups)) { newGroupFile.push_back(names[k] + "\t" + group); newNames.push_back(names[k]); saveBinNames += names[k] + ","; } } names = newNames; binsize = names.size(); saveBinNames = saveBinNames.substr(0, saveBinNames.length()-1); }else if (countfile != "") { saveBinNames = ""; binsize = 0; for(int k = 0; k < names.size(); k++) { if (ct.hasGroupInfo()) { vector<string> thisSeqsGroups = ct.getGroups(names[k]); int thisSeqsCount = 0; for (int n = 0; n < thisSeqsGroups.size(); n++) { if (m->inUsersGroups(thisSeqsGroups[n], Groups)) { thisSeqsCount += ct.getGroupCount(names[k], thisSeqsGroups[n]); } } binsize += thisSeqsCount; //if you don't have any seqs from the groups the user wants, then remove you. if (thisSeqsCount == 0) { newGroupFile.push_back(names[k]); } else { saveBinNames += names[k] + ","; } }else { binsize += ct.getNumSeqs(names[k]); saveBinNames += names[k] + ","; } } saveBinNames = saveBinNames.substr(0, saveBinNames.length()-1); } if (binsize > nseqs) { //keep bin newList.push_back(saveBinNames); newLabels.push_back(binLabels[i]); if (groupfile != "") { for(int k = 0; k < newGroupFile.size(); k++) { outGroup << newGroupFile[k] << endl; } } else if (countfile != "") { for(int k = 0; k < newGroupFile.size(); k++) { ct.remove(newGroupFile[k]); } } }else { if (countfile != "") { for(int k = 0; k < names.size(); k++) { ct.remove(names[k]); } } } } //print new listvector if (newList.getNumBins() != 0) { wroteSomething = true; newList.setLabels(newLabels); newList.printHeaders(out); newList.print(out); } } out.close(); if (groupfile != "") { outGroup.close(); outputTypes["group"].push_back(outputGroupFileName); outputNames.push_back(outputGroupFileName); } if (countfile != "") { if (ct.hasGroupInfo()) { vector<string> allGroups = ct.getNamesOfGroups(); for (int i = 0; i < allGroups.size(); i++) { if (!m->inUsersGroups(allGroups[i], Groups)) { ct.removeGroup(allGroups[i]); } } } ct.printTable(outputCountFileName); outputTypes["count"].push_back(outputCountFileName); outputNames.push_back(outputCountFileName); } if (wroteSomething == false) { m->mothurOut("Your file contains only rare sequences."); m->mothurOutEndLine(); } outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName); return 0; } catch(exception& e) { m->errorOut(e, "RemoveRareCommand", "processList"); exit(1); } }
int ClusterDoturCommand::execute(){ try { if (abort) { if (calledHelp) { return 0; } return 2; } ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim); NameAssignment* nameMap = NULL; CountTable* ct = NULL; map<string, int> counts; if(namefile != "") { nameMap = new NameAssignment(namefile); nameMap->readMap(); cluster->readPhylipFile(phylipfile, nameMap); delete nameMap; }else if (countfile != "") { ct = new CountTable(); ct->readTable(countfile, false, false); cluster->readPhylipFile(phylipfile, ct); counts = ct->getNameMap(); delete ct; }else { cluster->readPhylipFile(phylipfile, nameMap); } tag = cluster->getTag(); if (m->getControl_pressed()) { delete cluster; return 0; } list = cluster->getListVector(); rabund = cluster->getRAbundVector(); if (outputDir == "") { outputDir += util.hasPath(phylipfile); } fileroot = outputDir + util.getRootName(util.getSimpleName(phylipfile)); map<string, string> variables; variables["[filename]"] = fileroot; variables["[clustertag]"] = tag; string sabundFileName = getOutputFileName("sabund", variables); string rabundFileName = getOutputFileName("rabund", variables); //if (countfile != "") { variables["[tag2]"] = "unique_list"; } string listFileName = getOutputFileName("list", variables); if (countfile == "") { util.openOutputFile(sabundFileName, sabundFile); util.openOutputFile(rabundFileName, rabundFile); outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); } util.openOutputFile(listFileName, listFile); outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); float previousDist = 0.00000; float rndPreviousDist = 0.00000; oldRAbund = *rabund; oldList = *list; bool printHeaders = true; int estart = time(NULL); int loop = 0; while ((cluster->getSmallDist() <= cutoff) && (cluster->getNSeqs() > 1)){ if (m->getControl_pressed()) { delete cluster; delete list; delete rabund; if(countfile == "") {rabundFile.close(); sabundFile.close(); util.mothurRemove((fileroot+ tag + ".rabund")); util.mothurRemove((fileroot+ tag + ".sabund")); } listFile.close(); util.mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0; } cluster->update(cutoff); float dist = cluster->getSmallDist(); float rndDist = util.ceilDist(dist, precision); //cout << loop << '\t' << dist << '\t' << oldList.getNumBins() << endl; loop++; if(previousDist <= 0.0000 && dist != previousDist) { printData("unique", counts, printHeaders); } else if(rndDist != rndPreviousDist) { printData(toString(rndPreviousDist, length-1), counts, printHeaders); } previousDist = dist; rndPreviousDist = rndDist; oldRAbund = *rabund; oldList = *list; } if(previousDist <= 0.0000) { printData("unique", counts, printHeaders); } else if(rndPreviousDist<cutoff) { printData(toString(rndPreviousDist, length-1), counts, printHeaders); } if (countfile == "") { sabundFile.close(); rabundFile.close(); } listFile.close(); delete cluster; delete list; delete rabund; //set list file as new current listfile string currentName = ""; itTypes = outputTypes.find("list"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setListFile(currentName); } } //set rabund file as new current rabundfile itTypes = outputTypes.find("rabund"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setRabundFile(currentName); } } //set sabund file as new current sabundfile itTypes = outputTypes.find("sabund"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setSabundFile(currentName); } } m->mothurOut("\nOutput File Names: \n"); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine(); return 0; } catch(exception& e) { m->errorOut(e, "ClusterDoturCommand", "execute"); exit(1); } }
int MergeGroupsCommand::processCountFile(DesignMap*& designMap){ try { CountTable countTable; if (!countTable.testGroups(countfile)) { m->mothurOut("[ERROR]: your countfile contains no group information, please correct.\n"); m->setControl_pressed(true); return 0; } //read countTable countTable.readTable(countfile, true, false); //fill Groups - checks for "all" and for any typo groups vector<string> nameGroups = countTable.getNamesOfGroups(); if (Groups.size() == 0) { Groups = nameGroups; } vector<string> dnamesGroups = designMap->getNamesGroups(); //sanity check bool error = false; if (nameGroups.size() == dnamesGroups.size()) { //at least there are the same number //is every group in counttable also in designmap for (int i = 0; i < nameGroups.size(); i++) { if (m->getControl_pressed()) { break; } if (!util.inUsersGroups(nameGroups[i], dnamesGroups)) { error = true; break; } } } if (error) { m->mothurOut("[ERROR]: Your countfile does not contain the same groups as your design file, please correct\n"); m->setControl_pressed(true); return 0; } //user selected groups - remove some groups from table if (Groups.size() != nameGroups.size()) { for (int i = 0; i < nameGroups.size(); i++) { if (!util.inUsersGroups(nameGroups[i], Groups)) { countTable.removeGroup(nameGroups[i]); } } } //ask again in case order changed nameGroups = countTable.getNamesOfGroups(); int numGroups = nameGroups.size(); //create new table CountTable newTable; vector<string> treatments = designMap->getCategory(); map<string, vector<int> > clearedMap; for (int i = 0; i < treatments.size(); i++) { newTable.addGroup(treatments[i]); vector<int> temp; clearedMap[treatments[i]] = temp; } treatments = newTable.getNamesOfGroups(); set<string> namesToRemove; vector<string> namesOfSeqs = countTable.getNamesOfSeqs(); for (int i = 0; i < namesOfSeqs.size(); i++) { if (m->getControl_pressed()) { break; } vector<int> thisSeqsCounts = countTable.getGroupCounts(namesOfSeqs[i]); map<string, vector<int> > thisSeqsMap = clearedMap; for (int j = 0; j < numGroups; j++) { thisSeqsMap[designMap->get(nameGroups[j])].push_back(thisSeqsCounts[j]); } //create new counts for seq for new table vector<int> newCounts; int totalAbund = 0; for (int j = 0; j < treatments.size(); j++){ int abund = mergeAbund(thisSeqsMap[treatments[j]]); newCounts.push_back(abund); //order matters, add in count for each treatment in new table. totalAbund += abund; } //add seq to new table if(totalAbund == 0) { namesToRemove.insert(namesOfSeqs[i]); }else { newTable.push_back(namesOfSeqs[i], newCounts); } } if (error) { m->setControl_pressed(true); return 0; } //remove sequences zeroed out by median method if (namesToRemove.size() != 0) { //print names ofstream out; string accnosFile = "accnosFile.temp"; util.openOutputFile(accnosFile, out); //output to .accnos file for (set<string>::iterator it = namesToRemove.begin(); it != namesToRemove.end(); it++) { if (m->getControl_pressed()) { out.close(); util.mothurRemove(accnosFile); return 0; } out << *it << endl; } out.close(); //run remove.seqs string inputString = "accnos=" + accnosFile + ", fasta=" + fastafile; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: remove.seqs(" + inputString + ")"); m->mothurOutEndLine(); current->setMothurCalling(true); Command* removeCommand = new RemoveSeqsCommand(inputString); removeCommand->execute(); map<string, vector<string> > filenames = removeCommand->getOutputFiles(); delete removeCommand; current->setMothurCalling(false); m->mothurOut("/******************************************/"); m->mothurOutEndLine(); util.mothurRemove(accnosFile); } string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += util.hasPath(countfile); } map<string, string> variables; variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(countfile)); variables["[extension]"] = util.getExtension(countfile); string outputFileName = getOutputFileName("count", variables); outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); newTable.printTable(outputFileName); return 0; } catch(exception& e) { m->errorOut(e, "MergeGroupsCommand", "processCountFile"); exit(1); } }
int TreeGroupCommand::execute(){ try { if (abort) { if (calledHelp) { return 0; } return 2; } if (format == "sharedfile") { InputData input(sharedfile, "sharedfile", Groups); SharedRAbundVectors* lookup = input.getSharedRAbundVectors(); lastLabel = lookup->getLabel(); Groups = lookup->getNamesGroups(); if (lookup->size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command.\n"); return 0; } //create treemap class from groupmap for tree class to use CountTable ct; set<string> nameMap; map<string, string> groupMap; set<string> gps; for (int i = 0; i < Groups.size(); i++) { nameMap.insert(Groups[i]); gps.insert(Groups[i]); groupMap[Groups[i]] = Groups[i]; } ct.createTable(nameMap, groupMap, gps); //fills tree names with shared files groups Treenames = lookup->getNamesGroups(); if (m->getControl_pressed()) { return 0; } //create tree file makeSimsShared(input, lookup, ct); if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; } }else{ //read in dist file filename = inputfile; ReadMatrix* readMatrix; if (format == "column") { readMatrix = new ReadColumnMatrix(filename); } else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(filename); } readMatrix->setCutoff(cutoff); ListVector* list; if(namefile != ""){ NameAssignment* nameMap = new NameAssignment(namefile); nameMap->readMap(); readMatrix->read(nameMap); list = readMatrix->getListVector(); delete nameMap; }else if (countfile != "") { CountTable* ct = new CountTable(); ct->readTable(countfile, true, false); readMatrix->read(ct); list = readMatrix->getListVector(); delete ct; }else { NameAssignment* nameMap = NULL; readMatrix->read(nameMap); list = readMatrix->getListVector(); } SparseDistanceMatrix* dMatrix = readMatrix->getDMatrix(); Treenames.clear(); //make treemap CountTable ct; set<string> nameMap; map<string, string> groupMap; set<string> gps; for (int i = 0; i < list->getNumBins(); i++) { string bin = list->get(i); nameMap.insert(bin); gps.insert(bin); groupMap[bin] = bin; Treenames.push_back(bin); } ct.createTable(nameMap, groupMap, gps); vector<string> namesGroups = ct.getNamesOfGroups(); if (m->getControl_pressed()) { return 0; } vector< vector<double> > matrix = makeSimsDist(dMatrix, list->getNumBins()); delete readMatrix; delete dMatrix; if (m->getControl_pressed()) { return 0; } //create a new filename map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(inputfile)); string outputFile = getOutputFileName("tree",variables); outputNames.push_back(outputFile); outputTypes["tree"].push_back(outputFile); Tree* newTree = new Tree(&ct, matrix, Treenames); if (m->getControl_pressed()) { delete newTree; newTree = NULL; } else { newTree->assembleTree(); } if (newTree != NULL) { newTree->createNewickFile(outputFile); delete newTree; } if (m->getControl_pressed()) { return 0; } m->mothurOut("Tree complete.\n"); } //set tree file as new current treefile string currentName = ""; itTypes = outputTypes.find("tree"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setTreeFile(currentName); } } m->mothurOut("\nOutput File Names: \n"); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine(); return 0; } catch(exception& e) { m->errorOut(e, "TreeGroupCommand", "execute"); exit(1); } }
int ClusterCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } //phylip file given and cutoff not given - use cluster.classic because it uses less memory and is faster if ((format == "phylip") && (cutoff > 10.0)) { m->mothurOutEndLine(); m->mothurOut("You are using a phylip file and no cutoff. I will run cluster.classic to save memory and time."); m->mothurOutEndLine(); //run unique.seqs for deconvolute results string inputString = "phylip=" + distfile; if (namefile != "") { inputString += ", name=" + namefile; } else if (countfile != "") { inputString += ", count=" + countfile; } inputString += ", precision=" + toString(precision); inputString += ", method=" + method; if (hard) { inputString += ", hard=T"; } else { inputString += ", hard=F"; } if (sim) { inputString += ", sim=T"; } else { inputString += ", sim=F"; } m->mothurOutEndLine(); m->mothurOut("/------------------------------------------------------------/"); m->mothurOutEndLine(); m->mothurOut("Running command: cluster.classic(" + inputString + ")"); m->mothurOutEndLine(); Command* clusterClassicCommand = new ClusterDoturCommand(inputString); clusterClassicCommand->execute(); delete clusterClassicCommand; m->mothurOut("/------------------------------------------------------------/"); m->mothurOutEndLine(); return 0; } ReadMatrix* read; if (format == "column") { read = new ReadColumnMatrix(columnfile, sim); } //sim indicates whether its a similarity matrix else if (format == "phylip") { read = new ReadPhylipMatrix(phylipfile, sim); } read->setCutoff(cutoff); NameAssignment* nameMap = NULL; CountTable* ct = NULL; map<string, int> counts; if(namefile != ""){ nameMap = new NameAssignment(namefile); nameMap->readMap(); read->read(nameMap); }else if (countfile != "") { ct = new CountTable(); ct->readTable(countfile, false, false); read->read(ct); counts = ct->getNameMap(); }else { read->read(nameMap); } list = read->getListVector(); matrix = read->getDMatrix(); if(countfile != "") { rabund = new RAbundVector(); createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list delete ct; }else { rabund = new RAbundVector(list->getRAbundVector()); } delete read; if (m->control_pressed) { //clean up delete list; delete matrix; delete rabund; if(countfile == ""){rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0; } //create cluster if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); } else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); } else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); } else if(method == "weighted"){ cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method, adjust); } tag = cluster->getTag(); if (outputDir == "") { outputDir += m->hasPath(distfile); } fileroot = outputDir + m->getRootName(m->getSimpleName(distfile)); map<string, string> variables; variables["[filename]"] = fileroot; variables["[clustertag]"] = tag; string sabundFileName = getOutputFileName("sabund", variables); string rabundFileName = getOutputFileName("rabund", variables); if (countfile != "") { variables["[tag2]"] = "unique_list"; } string listFileName = getOutputFileName("list", variables); if (countfile == "") { m->openOutputFile(sabundFileName, sabundFile); m->openOutputFile(rabundFileName, rabundFile); outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); } m->openOutputFile(listFileName, listFile); outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); list->printHeaders(listFile); time_t estart = time(NULL); float previousDist = 0.00000; float rndPreviousDist = 0.00000; oldRAbund = *rabund; oldList = *list; print_start = true; start = time(NULL); loops = 0; double saveCutoff = cutoff; while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){ if (m->control_pressed) { //clean up delete list; delete matrix; delete rabund; delete cluster; if(countfile == "") {rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0; } if (print_start && m->isTrue(timing)) { m->mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/" + toString(m->roundDist(matrix->getSmallDist(), precision)) + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")"); cout.flush(); print_start = false; } loops++; cluster->update(cutoff); float dist = matrix->getSmallDist(); float rndDist; if (hard) { rndDist = m->ceilDist(dist, precision); }else{ rndDist = m->roundDist(dist, precision); } if(previousDist <= 0.0000 && dist != previousDist){ printData("unique", counts); } else if(rndDist != rndPreviousDist){ printData(toString(rndPreviousDist, length-1), counts); } previousDist = dist; rndPreviousDist = rndDist; oldRAbund = *rabund; oldList = *list; } if (print_start && m->isTrue(timing)) { m->mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")"); cout.flush(); print_start = false; } if(previousDist <= 0.0000){ printData("unique", counts); } else if(rndPreviousDist<cutoff){ printData(toString(rndPreviousDist, length-1), counts); } delete matrix; delete list; delete rabund; delete cluster; if (countfile == "") { sabundFile.close(); rabundFile.close(); } listFile.close(); if (saveCutoff != cutoff) { if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); } else { saveCutoff = m->roundDist(saveCutoff, precision); } m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); } //set list file as new current listfile string current = ""; itTypes = outputTypes.find("list"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); } } //set rabund file as new current rabundfile itTypes = outputTypes.find("rabund"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); } } //set sabund file as new current sabundfile itTypes = outputTypes.find("sabund"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); } } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); //if (m->isTrue(timing)) { m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine(); //} return 0; } catch(exception& e) { m->errorOut(e, "ClusterCommand", "execute"); exit(1); } }
int ClassifySeqsCommand::execute(){ try { if (abort) { if (calledHelp) { return 0; } return 2; } string outputMethodTag = method; if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, util.getRandomNumber(), flip, writeShortcuts, current->getVersion()); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, util.getRandomNumber(), current->getVersion()); } else if(method == "zap"){ outputMethodTag = search + "_" + outputMethodTag; if (search == "kmer") { classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); } else { classify = new AlignTree(templateFileName, taxonomyFileName, cutoff); } } else { m->mothurOut(search + " is not a valid method option. I will run the command using wang."); m->mothurOutEndLine(); classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, util.getRandomNumber(), flip, writeShortcuts, current->getVersion()); } if (m->getControl_pressed()) { delete classify; return 0; } m->mothurOut("Classifying sequences from " + fastafile + " ...\n" ); string baseTName = util.getSimpleName(taxonomyFileName); //set rippedTaxName to string RippedTaxName = ""; bool foundDot = false; for (int i = baseTName.length()-1; i >= 0; i--) { if (foundDot && (baseTName[i] != '.')) { RippedTaxName = baseTName[i] + RippedTaxName; } else if (foundDot && (baseTName[i] == '.')) { break; } else if (!foundDot && (baseTName[i] == '.')) { foundDot = true; } } if (outputDir == "") { outputDir += util.hasPath(fastafile); } map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(fastafile)); variables["[tag]"] = RippedTaxName; variables["[tag2]"] = outputMethodTag; string newTaxonomyFile = getOutputFileName("taxonomy", variables); string newaccnosFile = getOutputFileName("accnos", variables); string tempTaxonomyFile = outputDir + util.getRootName(util.getSimpleName(fastafile)) + "taxonomy.temp"; string taxSummary = getOutputFileName("taxsummary", variables); if ((method == "knn") && (search == "distance")) { string DistName = getOutputFileName("matchdist", variables); classify->setDistName(DistName); outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName); } outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile); outputNames.push_back(taxSummary); outputTypes["taxsummary"].push_back(taxSummary); long start = time(NULL); int numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastafile); if (!util.isBlank(newaccnosFile)) { m->mothurOut("\n[WARNING]: mothur reversed some your sequences for a better classification. If you would like to take a closer look, please check " + newaccnosFile + " for the list of the sequences.\n"); outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile); }else { util.mothurRemove(newaccnosFile); } m->mothurOut("\nIt took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences.\n\n"); start = time(NULL); //read namefile map<string, vector<string> > nameMap; map<string, vector<string> >::iterator itNames; if(namefile != "") { m->mothurOut("Reading " + namefile + "..."); cout.flush(); nameMap.clear(); //remove old names util.readNames(namefile, nameMap); m->mothurOut(" Done.\n"); } //output taxonomy with the unclassified bins added ifstream inTax; util.openInputFile(newTaxonomyFile, inTax); ofstream outTax; string unclass = newTaxonomyFile + ".unclass.temp"; util.openOutputFile(unclass, outTax); //get maxLevel from phylotree so you know how many 'unclassified's to add int maxLevel = classify->getMaxLevel(); //read taxfile - this reading and rewriting is done to preserve the confidence scores. string name, taxon; GroupMap* groupMap = NULL; CountTable* ct = NULL; PhyloSummary* taxaSum; if (hasCount) { ct = new CountTable(); ct->readTable(countfile, true, false); taxaSum = new PhyloSummary(ct, relabund, printlevel); }else { if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); } taxaSum = new PhyloSummary(groupMap, relabund, printlevel); } while (!inTax.eof()) { if (m->getControl_pressed()) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } delete classify; return 0; } inTax >> name; util.gobble(inTax); taxon = util.getline(inTax); util.gobble(inTax); string newTax = util.addUnclassifieds(taxon, maxLevel, probs); outTax << name << '\t' << newTax << endl; if (namefile != "") { itNames = nameMap.find(name); if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file please correct.\n"); exit(1); }else{ //add it as many times as there are identical seqs for (int i = 0; i < itNames->second.size(); i++) { taxaSum->addSeqToTree(itNames->second[i], newTax); } itNames->second.clear(); nameMap.erase(itNames->first); } }else { taxaSum->addSeqToTree(name, newTax); } } inTax.close(); outTax.close(); util.mothurRemove(newTaxonomyFile); util.renameFile(unclass, newTaxonomyFile); if (m->getControl_pressed()) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } delete classify; return 0; } //print summary file ofstream outTaxTree; util.openOutputFile(taxSummary, outTaxTree); taxaSum->print(outTaxTree, output); outTaxTree.close(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; util.mothurRemove(tempTaxonomyFile); delete classify; m->mothurOut("\nIt took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences.\n\n"); m->mothurOut("\nOutput File Names: \n"); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); //set taxonomy file as new current taxonomyfile string currentName = ""; itTypes = outputTypes.find("taxonomy"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setTaxonomyFile(currentName); } } currentName = ""; itTypes = outputTypes.find("accnos"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setAccnosFile(currentName); } } return 0; } catch(exception& e) { m->errorOut(e, "ClassifySeqsCommand", "execute"); exit(1); } }
int SeqSummaryCommand::execute(){ try{ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); //set current fasta to fastafile m->setFastaFile(fastafile); map<string, string> variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile)); string summaryFile = getOutputFileName("summary",variables); long long numSeqs = 0; long long size = 0; long long numUniques = 0; map<int, long long> startPosition; map<int, long long> endPosition; map<int, long long> seqLength; map<int, long long> ambigBases; map<int, long long> longHomoPolymer; if (namefile != "") { nameMap = m->readNames(namefile); numUniques = nameMap.size(); } else if (countfile != "") { CountTable ct; ct.readTable(countfile, false, false); nameMap = ct.getNameMap(); size = ct.getNumSeqs(); numUniques = ct.getNumUniqueSeqs(); } if (m->control_pressed) { return 0; } vector<unsigned long long> positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) positions = m->divideFile(fastafile, processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } #else positions = m->setFilePosFasta(fastafile, numSeqs); if (numSeqs < processors) { processors = numSeqs; } //figure out how many sequences you have to process int numSeqsPerProcessor = numSeqs / processors; for (int i = 0; i < processors; i++) { int startIndex = i * numSeqsPerProcessor; if(i == (processors - 1)){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); } #endif if(processors == 1){ numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]); }else{ numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile); } if (m->control_pressed) { return 0; } //set size if (countfile != "") {}//already set else if (namefile == "") { size = numSeqs; } else { for (map<int, long long>::iterator it = startPosition.begin(); it != startPosition.end(); it++) { size += it->second; } } if ((namefile != "") || (countfile != "")) { string type = "count"; if (namefile != "") { type = "name"; } if (numSeqs != numUniques) { // do fasta and name/count files match m->mothurOut("[ERROR]: Your " + type + " file contains " + toString(numUniques) + " unique sequences, but your fasta file contains " + toString(numSeqs) + ". File mismatch detected, quitting command.\n"); m->control_pressed = true; } } if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } long long ptile0_25 = 1+(long long)(size * 0.025); //number of sequences at 2.5% long long ptile25 = 1+(long long)(size * 0.250); //number of sequences at 25% long long ptile50 = 1+(long long)(size * 0.500); long long ptile75 = 1+(long long)(size * 0.750); long long ptile97_5 = 1+(long long)(size * 0.975); long long ptile100 = (long long)(size); vector<int> starts; starts.resize(7,0); vector<int> ends; ends.resize(7,0); vector<int> ambigs; ambigs.resize(7,0); vector<int> lengths; lengths.resize(7,0); vector<int> homops; homops.resize(7,0); //find means long long meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer; meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0; //minimum if ((startPosition.begin())->first == -1) { starts[0] = 0; } else {starts[0] = (startPosition.begin())->first; } long long totalSoFar = 0; //set all values to min starts[1] = starts[0]; starts[2] = starts[0]; starts[3] = starts[0]; starts[4] = starts[0]; starts[5] = starts[0]; int lastValue = 0; for (map<int, long long>::iterator it = startPosition.begin(); it != startPosition.end(); it++) { int value = it->first; if (value == -1) { value = 0; } meanStartPosition += (value*it->second); totalSoFar += it->second; if (((totalSoFar <= ptile0_25) && (totalSoFar > 1)) || ((lastValue < ptile0_25) && (totalSoFar > ptile0_25))){ starts[1] = value; } //save value if (((totalSoFar <= ptile25) && (totalSoFar > ptile0_25)) || ((lastValue < ptile25) && (totalSoFar > ptile25))) { starts[2] = value; } //save value if (((totalSoFar <= ptile50) && (totalSoFar > ptile25)) || ((lastValue < ptile50) && (totalSoFar > ptile50))) { starts[3] = value; } //save value if (((totalSoFar <= ptile75) && (totalSoFar > ptile50)) || ((lastValue < ptile75) && (totalSoFar > ptile75))) { starts[4] = value; } //save value if (((totalSoFar <= ptile97_5) && (totalSoFar > ptile75)) || ((lastValue < ptile97_5) && (totalSoFar > ptile97_5))) { starts[5] = value; } //save value if ((totalSoFar <= ptile100) && (totalSoFar > ptile97_5)) { starts[6] = value; } //save value lastValue = totalSoFar; } starts[6] = (startPosition.rbegin())->first; if ((endPosition.begin())->first == -1) { ends[0] = 0; } else {ends[0] = (endPosition.begin())->first; } totalSoFar = 0; //set all values to min ends[1] = ends[0]; ends[2] = ends[0]; ends[3] = ends[0]; ends[4] = ends[0]; ends[5] = ends[0]; lastValue = 0; for (map<int, long long>::iterator it = endPosition.begin(); it != endPosition.end(); it++) { int value = it->first; if (value == -1) { value = 0; } meanEndPosition += (value*it->second); totalSoFar += it->second; if (((totalSoFar <= ptile0_25) && (totalSoFar > 1)) || ((lastValue < ptile0_25) && (totalSoFar > ptile0_25))){ ends[1] = value; } //save value if (((totalSoFar <= ptile25) && (totalSoFar > ptile0_25)) || ((lastValue < ptile25) && (totalSoFar > ptile25))) { ends[2] = value; } //save value if (((totalSoFar <= ptile50) && (totalSoFar > ptile25)) || ((lastValue < ptile50) && (totalSoFar > ptile50))) { ends[3] = value; } //save value if (((totalSoFar <= ptile75) && (totalSoFar > ptile50)) || ((lastValue < ptile75) && (totalSoFar > ptile75))) { ends[4] = value; } //save value if (((totalSoFar <= ptile97_5) && (totalSoFar > ptile75)) || ((lastValue < ptile97_5) && (totalSoFar > ptile97_5))) { ends[5] = value; } //save value if ((totalSoFar <= ptile100) && (totalSoFar > ptile97_5)) { ends[6] = value; } //save value lastValue = totalSoFar; } ends[6] = (endPosition.rbegin())->first; if ((seqLength.begin())->first == -1) { lengths[0] = 0; } else {lengths[0] = (seqLength.begin())->first; } //set all values to min lengths[1] = lengths[0]; lengths[2] = lengths[0]; lengths[3] = lengths[0]; lengths[4] = lengths[0]; lengths[5] = lengths[0]; totalSoFar = 0; lastValue = 0; for (map<int, long long>::iterator it = seqLength.begin(); it != seqLength.end(); it++) { int value = it->first; meanSeqLength += (value*it->second); totalSoFar += it->second; if (((totalSoFar <= ptile0_25) && (totalSoFar > 1)) || ((lastValue < ptile0_25) && (totalSoFar > ptile0_25))){ lengths[1] = value; } //save value if (((totalSoFar <= ptile25) && (totalSoFar > ptile0_25)) || ((lastValue < ptile25) && (totalSoFar > ptile25))) { lengths[2] = value; } //save value if (((totalSoFar <= ptile50) && (totalSoFar > ptile25)) || ((lastValue < ptile50) && (totalSoFar > ptile50))) { lengths[3] = value; } //save value if (((totalSoFar <= ptile75) && (totalSoFar > ptile50)) || ((lastValue < ptile75) && (totalSoFar > ptile75))) { lengths[4] = value; } //save value if (((totalSoFar <= ptile97_5) && (totalSoFar > ptile75)) || ((lastValue < ptile97_5) && (totalSoFar > ptile97_5))) { lengths[5] = value; } //save value if ((totalSoFar <= ptile100) && (totalSoFar > ptile97_5)) { lengths[6] = value; } //save value lastValue = totalSoFar; } lengths[6] = (seqLength.rbegin())->first; if ((ambigBases.begin())->first == -1) { ambigs[0] = 0; } else {ambigs[0] = (ambigBases.begin())->first; } //set all values to min ambigs[1] = ambigs[0]; ambigs[2] = ambigs[0]; ambigs[3] = ambigs[0]; ambigs[4] = ambigs[0]; ambigs[5] = ambigs[0]; totalSoFar = 0; lastValue = 0; for (map<int, long long>::iterator it = ambigBases.begin(); it != ambigBases.end(); it++) { int value = it->first; meanAmbigBases += (value*it->second); totalSoFar += it->second; if (((totalSoFar <= ptile0_25) && (totalSoFar > 1)) || ((lastValue < ptile0_25) && (totalSoFar > ptile0_25))){ ambigs[1] = value; } //save value if (((totalSoFar <= ptile25) && (totalSoFar > ptile0_25)) || ((lastValue < ptile25) && (totalSoFar > ptile25))) { ambigs[2] = value; } //save value if (((totalSoFar <= ptile50) && (totalSoFar > ptile25)) || ((lastValue < ptile50) && (totalSoFar > ptile50))) { ambigs[3] = value; } //save value if (((totalSoFar <= ptile75) && (totalSoFar > ptile50)) || ((lastValue < ptile75) && (totalSoFar > ptile75))) { ambigs[4] = value; } //save value if (((totalSoFar <= ptile97_5) && (totalSoFar > ptile75)) || ((lastValue < ptile97_5) && (totalSoFar > ptile97_5))) { ambigs[5] = value; } //save value if ((totalSoFar <= ptile100) && (totalSoFar > ptile97_5)) { ambigs[6] = value; } //save value lastValue = totalSoFar; } ambigs[6] = (ambigBases.rbegin())->first; if ((longHomoPolymer.begin())->first == -1) { homops[0] = 0; } else {homops[0] = (longHomoPolymer.begin())->first; } //set all values to min homops[1] = homops[0]; homops[2] = homops[0]; homops[3] = homops[0]; homops[4] = homops[0]; homops[5] = homops[0]; totalSoFar = 0; lastValue = 0; for (map<int, long long>::iterator it = longHomoPolymer.begin(); it != longHomoPolymer.end(); it++) { int value = it->first; meanLongHomoPolymer += (it->first*it->second); totalSoFar += it->second; if (((totalSoFar <= ptile0_25) && (totalSoFar > 1)) || ((lastValue < ptile0_25) && (totalSoFar > ptile0_25))){ homops[1] = value; } //save value if (((totalSoFar <= ptile25) && (totalSoFar > ptile0_25)) || ((lastValue < ptile25) && (totalSoFar > ptile25))) { homops[2] = value; } //save value if (((totalSoFar <= ptile50) && (totalSoFar > ptile25)) || ((lastValue < ptile50) && (totalSoFar > ptile50))) { homops[3] = value; } //save value if (((totalSoFar <= ptile75) && (totalSoFar > ptile50)) || ((lastValue < ptile75) && (totalSoFar > ptile75))) { homops[4] = value; } //save value if (((totalSoFar <= ptile97_5) && (totalSoFar > ptile75)) || ((lastValue < ptile97_5) && (totalSoFar > ptile97_5))) { homops[5] = value; } //save value if ((totalSoFar <= ptile100) && (totalSoFar > ptile97_5)) { homops[6] = value; } //save value lastValue = totalSoFar; } homops[6] = (longHomoPolymer.rbegin())->first; double meanstartPosition, meanendPosition, meanseqLength, meanambigBases, meanlongHomoPolymer; meanstartPosition = meanStartPosition / (double) size; meanendPosition = meanEndPosition /(double) size; meanlongHomoPolymer = meanLongHomoPolymer / (double) size; meanseqLength = meanSeqLength / (double) size; meanambigBases = meanAmbigBases /(double) size; if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } m->mothurOutEndLine(); m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine(); m->mothurOut("Minimum:\t" + toString(starts[0]) + "\t" + toString(ends[0]) + "\t" + toString(lengths[0]) + "\t" + toString(ambigs[0]) + "\t" + toString(homops[0]) + "\t" + toString(1)); m->mothurOutEndLine(); m->mothurOut("2.5%-tile:\t" + toString(starts[1]) + "\t" + toString(ends[1]) + "\t" + toString(lengths[1]) + "\t" + toString(ambigs[1]) + "\t" + toString(homops[1]) + "\t" + toString(ptile0_25)); m->mothurOutEndLine(); m->mothurOut("25%-tile:\t" + toString(starts[2]) + "\t" + toString(ends[2]) + "\t" + toString(lengths[2]) + "\t" + toString(ambigs[2]) + "\t" + toString(homops[2]) + "\t" + toString(ptile25)); m->mothurOutEndLine(); m->mothurOut("Median: \t" + toString(starts[3]) + "\t" + toString(ends[3]) + "\t" + toString(lengths[3]) + "\t" + toString(ambigs[3]) + "\t" + toString(homops[3]) + "\t" + toString(ptile50)); m->mothurOutEndLine(); m->mothurOut("75%-tile:\t" + toString(starts[4]) + "\t" + toString(ends[4]) + "\t" + toString(lengths[4]) + "\t" + toString(ambigs[4]) + "\t" + toString(homops[4]) + "\t" + toString(ptile75)); m->mothurOutEndLine(); m->mothurOut("97.5%-tile:\t" + toString(starts[5]) + "\t" + toString(ends[5]) + "\t" + toString(lengths[5]) + "\t" + toString(ambigs[5]) + "\t" + toString(homops[5]) + "\t" + toString(ptile97_5)); m->mothurOutEndLine(); m->mothurOut("Maximum:\t" + toString(starts[6]) + "\t" + toString(ends[6]) + "\t" + toString(lengths[6]) + "\t" + toString(ambigs[6]) + "\t" + toString(homops[6]) + "\t" + toString(ptile100)); m->mothurOutEndLine(); m->mothurOut("Mean:\t" + toString(meanstartPosition) + "\t" + toString(meanendPosition) + "\t" + toString(meanseqLength) + "\t" + toString(meanambigBases) + "\t" + toString(meanlongHomoPolymer)); m->mothurOutEndLine(); if ((namefile == "") && (countfile == "")) { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); } else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(size)); m->mothurOutEndLine(); } if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); m->mothurOut(summaryFile); m->mothurOutEndLine(); outputNames.push_back(summaryFile); outputTypes["summary"].push_back(summaryFile); m->mothurOutEndLine(); if ((namefile == "") && (countfile == "")) { m->mothurOut("It took " + toString(time(NULL) - start) + " secs to summarize " + toString(numSeqs) + " sequences.\n"); } else{ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to summarize " + toString(size) + " sequences.\n"); } //set fasta file as new current fastafile string current = ""; itTypes = outputTypes.find("summary"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSummaryFile(current); } } return 0; } catch(exception& e) { m->errorOut(e, "SeqSummaryCommand", "execute"); exit(1); } }
int OptiBlastMatrix::readBlast(){ try { Utils util; map<string, long long> nameAssignment; if (namefile != "") { util.readNames(namefile, nameAssignment); } else if (countfile != "") { CountTable ct; ct.readTable(countfile, false, true); map<string, int> temp = ct.getNameMap(); for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) { nameAssignment[it->first] = it->second; } } else { readBlastNames(nameAssignment); } int count = 0; for (map<string, long long>::iterator it = nameAssignment.begin(); it!= nameAssignment.end(); it++) { it->second = count; count++; nameMap.push_back(it->first); overlapNameMap.push_back(it->first); } m->mothurOut("Reading Blast File... "); cout.flush(); string firstName, secondName, eScore, currentRow; currentRow = ""; string repeatName = ""; float distance, thisoverlap, refScore; float percentId; float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq; map<string, float> thisRowsBlastScores; ///////////////////// Read to eliminate singletons /////////////////////// ifstream fileHandle; util.openInputFile(distFile, fileHandle); map<int, int> singletonIndexSwap; map<int, int> blastSingletonIndexSwap; vector<bool> singleton; singleton.resize(nameAssignment.size(), true); vector<bool> overlapSingleton; overlapSingleton.resize(nameAssignment.size(), true); vector< map<string,float> > dists; dists.resize(nameAssignment.size()); if (!fileHandle.eof()) { //read in line from file fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; util.gobble(fileHandle); currentRow = firstName; lengthThisSeq = numBases; repeatName = firstName + secondName; if (firstName == secondName) { refScore = score; } else{ thisRowsBlastScores[secondName] = score; //calc overlap score thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) { //convert name to number map<string,long long>::iterator itA = nameAssignment.find(firstName); map<string,long long>::iterator itB = nameAssignment.find(secondName); if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } int indexA = (itA->second); int indexB = (itB->second); overlapSingleton[indexA] = false; overlapSingleton[indexB] = false; blastSingletonIndexSwap[indexA] = indexA; blastSingletonIndexSwap[indexB] = indexB; } } }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); } while(fileHandle){ //let's assume it's a triangular matrix... if (m->getControl_pressed()) { fileHandle.close(); return 0; } //read in line from file fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; util.gobble(fileHandle); string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file //if this is a new pairing if (temp != repeatName) { repeatName = temp; if (currentRow == firstName) { if (firstName == secondName) { refScore = score; } else{ //save score thisRowsBlastScores[secondName] = score; //calc overlap score thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) { //convert name to number map<string,long long>::iterator itA = nameAssignment.find(firstName); map<string,long long>::iterator itB = nameAssignment.find(secondName); if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } int indexA = (itA->second); int indexB = (itB->second); overlapSingleton[indexA] = false; overlapSingleton[indexB] = false; blastSingletonIndexSwap[indexA] = indexA; blastSingletonIndexSwap[indexB] = indexB; } } //end else }else { //end row //convert blast scores to distance and add cell to sparse matrix if we can map<string, float>::iterator it; map<string, float>::iterator itDist; for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) { distance = 1.0 - (it->second / refScore); //do we already have the distance calculated for b->a map<string,long long>::iterator itA = nameAssignment.find(currentRow); map<string,long long>::iterator itB = nameAssignment.find(it->first); itDist = dists[itB->second].find(itA->first); //if we have it then compare if (itDist != dists[itB->second].end()) { //if you want the minimum blast score ratio, then pick max distance if(minWanted) { distance = max(itDist->second, distance); } else{ distance = min(itDist->second, distance); } //is this distance below cutoff if (distance <= cutoff) { int indexA = (itA->second); int indexB = (itB->second); singleton[indexA] = false; singleton[indexB] = false; singletonIndexSwap[indexA] = indexA; singletonIndexSwap[indexB] = indexB; } //not going to need this again dists[itB->second].erase(itDist); }else { //save this value until we get the other ratio dists[itA->second][it->first] = distance; } } //clear out last rows info thisRowsBlastScores.clear(); currentRow = firstName; lengthThisSeq = numBases; //add this row to thisRowsBlastScores if (firstName == secondName) { refScore = score; } else{ //add this row to thisRowsBlastScores thisRowsBlastScores[secondName] = score; //calc overlap score thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) { //convert name to number map<string,long long>::iterator itA = nameAssignment.find(firstName); map<string,long long>::iterator itB = nameAssignment.find(secondName); if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } int indexA = (itA->second); int indexB = (itB->second); overlapSingleton[indexA] = false; overlapSingleton[indexB] = false; blastSingletonIndexSwap[indexA] = indexA; blastSingletonIndexSwap[indexB] = indexB; } } }//end if current row }//end if repeat } fileHandle.close(); //convert blast scores to distance and add cell to sparse matrix if we can map<string, float>::iterator it; map<string, float>::iterator itDist; for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) { distance = 1.0 - (it->second / refScore); //do we already have the distance calculated for b->a map<string,long long>::iterator itA = nameAssignment.find(currentRow); map<string,long long>::iterator itB = nameAssignment.find(it->first); itDist = dists[itB->second].find(itA->first); //if we have it then compare if (itDist != dists[itB->second].end()) { //if you want the minimum blast score ratio, then pick max distance if(minWanted) { distance = max(itDist->second, distance); } else{ distance = min(itDist->second, distance); } //is this distance below cutoff if (distance <= cutoff) { int indexA = (itA->second); int indexB = (itB->second); singleton[indexA] = false; singleton[indexB] = false; singletonIndexSwap[indexA] = indexA; singletonIndexSwap[indexB] = indexB; } //not going to need this again dists[itB->second].erase(itDist); }else { //save this value until we get the other ratio dists[itA->second][it->first] = distance; } } //clear out info thisRowsBlastScores.clear(); dists.clear(); ////////////////////////////////////////////////////////////////////////// int nonSingletonCount = 0; for (int i = 0; i < singleton.size(); i++) { if (!singleton[i]) { //if you are a singleton singletonIndexSwap[i] = nonSingletonCount; nonSingletonCount++; }else { singletons.push_back(nameMap[i]); } } singleton.clear(); int overlapNonSingletonCount = 0; for (int i = 0; i < overlapSingleton.size(); i++) { if (!overlapSingleton[i]) { //if you are a singleton blastSingletonIndexSwap[i] = overlapNonSingletonCount; overlapNonSingletonCount++; } } overlapSingleton.clear(); ifstream in; util.openInputFile(distFile, in); dists.resize(nameAssignment.size()); closeness.resize(nonSingletonCount); blastOverlap.resize(overlapNonSingletonCount); map<string, string> names; if (namefile != "") { util.readNames(namefile, names); for (int i = 0; i < singletons.size(); i++) { singletons[i] = names[singletons[i]]; } } m->mothurOut(" halfway ... "); cout.flush(); if (!in.eof()) { //read in line from file in >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; util.gobble(fileHandle); currentRow = firstName; lengthThisSeq = numBases; repeatName = firstName + secondName; if (firstName == secondName) { refScore = score; } else{ //convert name to number map<string,long long>::iterator itA = nameAssignment.find(firstName); map<string,long long>::iterator itB = nameAssignment.find(secondName); if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } thisRowsBlastScores[secondName] = score; if (namefile != "") { firstName = names[firstName]; //redundant names secondName = names[secondName]; //redundant names } nameMap[singletonIndexSwap[itA->second]] = firstName; nameMap[singletonIndexSwap[itB->second]] = secondName; //calc overlap score thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) { int indexA = (itA->second); int indexB = (itB->second); int newB = blastSingletonIndexSwap[indexB]; int newA = blastSingletonIndexSwap[indexA]; blastOverlap[newA].insert(newB); blastOverlap[newB].insert(newA); overlapNameMap[newA] = firstName; overlapNameMap[newB] = secondName; } } }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); }
//********************************************************************************************************************** int GetRAbundCommand::processList(ofstream& out){ try { CountTable ct; ct.readTable(countfile, false, false); InputData input(inputfile, format); ListVector* list = input.getListVector(); string lastLabel = list->getLabel(); //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set<string> processedLabels; set<string> userLabels = labels; if (m->control_pressed) { delete list; return 0; } while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { if(allLines == 1 || labels.count(list->getLabel()) == 1){ m->mothurOut(list->getLabel()); m->mothurOutEndLine(); if (m->control_pressed) { delete list; return 0; } RAbundVector* rabund = new RAbundVector(); createRabund(ct, list, rabund); if(sorted) { rabund->print(out); } else { rabund->nonSortedPrint(out); } delete rabund; processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); } if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { string saveLabel = list->getLabel(); delete list; list = input.getListVector(lastLabel); m->mothurOut(list->getLabel()); m->mothurOutEndLine(); if (m->control_pressed) { delete list; return 0; } RAbundVector* rabund = new RAbundVector(); createRabund(ct, list, rabund); if(sorted) { rabund->print(out); } else { rabund->nonSortedPrint(out); } delete rabund; processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); //restore real lastlabel to save below list->setLabel(saveLabel); } lastLabel = list->getLabel(); delete list; list = input.getListVector(); } //output error messages about any remaining user labels set<string>::iterator it; bool needToRun = false; for (it = userLabels.begin(); it != userLabels.end(); it++) { m->mothurOut("Your file does not include the label " + *it); if (processedLabels.count(lastLabel) != 1) { m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); needToRun = true; }else { m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); } } //run last label if you need to if (needToRun == true) { if (list != NULL) { delete list; } list = input.getListVector(lastLabel); m->mothurOut(list->getLabel()); m->mothurOutEndLine(); if (m->control_pressed) { delete list; return 0; } RAbundVector* rabund = new RAbundVector(); createRabund(ct, list, rabund); if(sorted) { rabund->print(out); } else { rabund->nonSortedPrint(out); } delete rabund; delete list; } return 0; } catch(exception& e) { m->errorOut(e, "GetRAbundCommand", "processList"); exit(1); } }
int DeconvoluteCommand::execute() { try { if (abort) { if (calledHelp) { return 0; } return 2; } //prepare filenames and open files map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(fastafile)); string outNameFile = getOutputFileName("name", variables); string outCountFile = getOutputFileName("count", variables); variables["[extension]"] = util.getExtension(fastafile); string outFastaFile = getOutputFileName("fasta", variables); map<string, string> nameMap; map<string, string>::iterator itNames; if (namefile != "") { util.readNames(namefile, nameMap); if (namefile == outNameFile){ //prepare filenames and open files map<string, string> mvariables; mvariables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(fastafile)); mvariables["[tag]"] = "unique"; outNameFile = getOutputFileName("name", mvariables); } } CountTable ct; if (countfile != "") { ct.readTable(countfile, true, false); if (countfile == outCountFile){ //prepare filenames and open files map<string, string> mvariables; mvariables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(fastafile)); mvariables["[tag]"] = "unique"; outCountFile = getOutputFileName("count", mvariables); } } if (m->getControl_pressed()) { return 0; } ifstream in; util.openInputFile(fastafile, in); ofstream outFasta; util.openOutputFile(outFastaFile, outFasta); map<string, string> sequenceStrings; //sequenceString -> list of names. "atgc...." -> seq1,seq2,seq3. map<string, string>::iterator itStrings; set<string> nameInFastaFile; //for sanity checking set<string>::iterator itname; vector<string> nameFileOrder; CountTable newCt; int count = 0; while (!in.eof()) { if (m->getControl_pressed()) { in.close(); outFasta.close(); util.mothurRemove(outFastaFile); return 0; } Sequence seq(in); if (seq.getName() != "") { //sanity checks itname = nameInFastaFile.find(seq.getName()); if (itname == nameInFastaFile.end()) { nameInFastaFile.insert(seq.getName()); } else { m->mothurOut("[ERROR]: You already have a sequence named " + seq.getName() + " in your fasta file, sequence names must be unique, please correct."); m->mothurOutEndLine(); } itStrings = sequenceStrings.find(seq.getAligned()); if (itStrings == sequenceStrings.end()) { //this is a new unique sequence //output to unique fasta file seq.printSequence(outFasta); if (namefile != "") { itNames = nameMap.find(seq.getName()); if (itNames == nameMap.end()) { //namefile and fastafile do not match m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); }else { if (format == "name") { sequenceStrings[seq.getAligned()] = itNames->second; nameFileOrder.push_back(seq.getAligned()); }else { newCt.push_back(seq.getName(), util.getNumNames(itNames->second)); sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } } }else if (countfile != "") { if (format == "name") { int numSeqs = ct.getNumSeqs(seq.getName()); string expandedName = seq.getName()+"_0"; for (int i = 1; i < numSeqs; i++) { expandedName += "," + seq.getName() + "_" + toString(i); } sequenceStrings[seq.getAligned()] = expandedName; nameFileOrder.push_back(seq.getAligned()); }else { ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } }else { if (format == "name") { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } else { newCt.push_back(seq.getName()); sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } } }else { //this is a dup if (namefile != "") { itNames = nameMap.find(seq.getName()); if (itNames == nameMap.end()) { //namefile and fastafile do not match m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); }else { if (format == "name") { sequenceStrings[seq.getAligned()] += "," + itNames->second; } else { int currentReps = newCt.getNumSeqs(itStrings->second); newCt.setNumSeqs(itStrings->second, currentReps+(util.getNumNames(itNames->second))); } } }else if (countfile != "") { if (format == "name") { int numSeqs = ct.getNumSeqs(seq.getName()); string expandedName = seq.getName()+"_0"; for (int i = 1; i < numSeqs; i++) { expandedName += "," + seq.getName() + "_" + toString(i); } sequenceStrings[seq.getAligned()] += "," + expandedName; }else { int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table if (num != 0) { //its in the table ct.mergeCounts(itStrings->second, seq.getName()); //merges counts and saves in uniques name } } }else { if (format == "name") { sequenceStrings[seq.getAligned()] += "," + seq.getName(); } else { int currentReps = newCt.getNumSeqs(itStrings->second); newCt.setNumSeqs(itStrings->second, currentReps+1); } } } count++; } util.gobble(in); if(count % 1000 == 0) { m->mothurOutJustToScreen(toString(count) + "\t" + toString(sequenceStrings.size()) + "\n"); } } if(count % 1000 != 0) { m->mothurOut(toString(count) + "\t" + toString(sequenceStrings.size())); m->mothurOutEndLine(); } in.close(); outFasta.close(); if (m->getControl_pressed()) { util.mothurRemove(outFastaFile); return 0; } //print new names file ofstream outNames; if (format == "name") { util.openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile); } else { util.openOutputFile(outCountFile, outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); } if ((countfile != "") && (format == "count")) { ct.printHeaders(outNames); } else if ((countfile == "") && (format == "count")) { newCt.printHeaders(outNames); } for (int i = 0; i < nameFileOrder.size(); i++) { if (m->getControl_pressed()) { outputTypes.clear(); util.mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { util.mothurRemove(outputNames[j]); } return 0; } itStrings = sequenceStrings.find(nameFileOrder[i]); if (itStrings != sequenceStrings.end()) { if (format == "name") { //get rep name int pos = (itStrings->second).find_first_of(','); if (pos == string::npos) { // only reps itself outNames << itStrings->second << '\t' << itStrings->second << endl; }else { outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; } }else { if (countfile != "") { ct.printSeq(outNames, itStrings->second); } else if (format == "count") { newCt.printSeq(outNames, itStrings->second); } } }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->setControl_pressed(true); } } outNames.close(); if (m->getControl_pressed()) { outputTypes.clear(); util.mothurRemove(outFastaFile); for (int j = 0; j < outputNames.size(); j++) { util.mothurRemove(outputNames[j]); } return 0; } m->mothurOut("\nOutput File Names: \n"); outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine(); //set fasta file as new current fastafile string currentName = ""; itTypes = outputTypes.find("fasta"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setFastaFile(currentName); } } itTypes = outputTypes.find("name"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setNameFile(currentName); } } itTypes = outputTypes.find("count"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setCountFile(currentName); } } return 0; } catch(exception& e) { m->errorOut(e, "DeconvoluteCommand", "execute"); exit(1); } }
//********************************************************************************************************************** int SharedCommand::createSharedFromListGroup() { try { GroupMap* groupMap = NULL; CountTable* countTable = NULL; pickedGroups = false; if (groupfile != "") { groupMap = new GroupMap(groupfile); int groupError = groupMap->readMap(); if (groupError == 1) { delete groupMap; return 0; } vector<string> allGroups = groupMap->getNamesOfGroups(); if (Groups.size() == 0) { Groups = allGroups; } else { pickedGroups = true; } }else{ countTable = new CountTable(); countTable->readTable(countfile, true, false); vector<string> allGroups = countTable->getNamesOfGroups(); if (Groups.size() == 0) { Groups = allGroups; } else { pickedGroups = true; } } int numGroups = Groups.size(); if (m->getControl_pressed()) { return 0; } ofstream out; string filename = ""; if (!pickedGroups) { string filename = listfile; if (outputDir == "") { outputDir += util.hasPath(filename); } map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(filename)); filename = getOutputFileName("shared",variables); outputNames.push_back(filename); outputTypes["shared"].push_back(filename); util.openOutputFile(filename, out); } //set fileroot fileroot = outputDir + util.getRootName(util.getSimpleName(listfile)); map<string, string> variables; variables["[filename]"] = fileroot; string errorOff = "no error"; InputData input(listfile, "shared", Groups); SharedListVector* SharedList = input.getSharedListVector(); string lastLabel = SharedList->getLabel(); SharedRAbundVectors* lookup; if (m->getControl_pressed()) { delete SharedList; if (groupMap != NULL) { delete groupMap; } if (countTable != NULL) { delete countTable; } out.close(); if (!pickedGroups) { util.mothurRemove(filename); } return 0; } //sanity check vector<string> namesSeqs; int numGroupNames = 0; if (current->getGroupMode() == "group") { namesSeqs = groupMap->getNamesSeqs(); numGroupNames = groupMap->getNumSeqs(); } else { namesSeqs = countTable->getNamesOfSeqs(); numGroupNames = countTable->getNumUniqueSeqs(); } int error = ListGroupSameSeqs(namesSeqs, SharedList); if ((!pickedGroups) && (SharedList->getNumSeqs() != numGroupNames)) { //if the user has not specified any groups and their files don't match exit with error m->mothurOut("Your group file contains " + toString(numGroupNames) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct.\n"); m->setControl_pressed(true); out.close(); if (!pickedGroups) { util.mothurRemove(filename); } //remove blank shared file you made //delete memory delete SharedList; if (groupMap != NULL) { delete groupMap; } if (countTable != NULL) { delete countTable; } return 0; } if (error == 1) { m->setControl_pressed(true); } //if user has specified groups make new groupfile for them if ((pickedGroups) && (current->getGroupMode() == "group")) { //make new group file string groups = ""; if (numGroups < 4) { for (int i = 0; i < numGroups-1; i++) { groups += Groups[i] + "."; } groups+=Groups[numGroups-1]; }else { groups = "merge"; } map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(listfile)); variables["[group]"] = groups; string newGroupFile = getOutputFileName("group",variables); outputTypes["group"].push_back(newGroupFile); outputNames.push_back(newGroupFile); ofstream outGroups; util.openOutputFile(newGroupFile, outGroups); vector<string> names = groupMap->getNamesSeqs(); string groupName; for (int i = 0; i < names.size(); i++) { groupName = groupMap->getGroup(names[i]); if (isValidGroup(groupName, Groups)) { outGroups << names[i] << '\t' << groupName << endl; } } outGroups.close(); } //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set<string> processedLabels; set<string> userLabels = labels; bool printHeaders = true; while((SharedList != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { if (m->getControl_pressed()) { delete SharedList; if (groupMap != NULL) { delete groupMap; } if (countTable != NULL) { delete countTable; } if (!pickedGroups) { out.close(); util.mothurRemove(filename); } return 0; } if(allLines == 1 || labels.count(SharedList->getLabel()) == 1){ lookup = SharedList->getSharedRAbundVector(); m->mothurOut(lookup->getLabel()+"\n"); if (m->getControl_pressed()) { delete SharedList; if (groupMap != NULL) { delete groupMap; } if (countTable != NULL) { delete countTable; } delete lookup; if (!pickedGroups) { out.close(); util.mothurRemove(filename); } return 0; } //if picked groups must split the shared file by label if (pickedGroups) { string filename = listfile; if (outputDir == "") { outputDir += util.hasPath(filename); } map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(filename)); variables["[distance]"] = lookup->getLabel(); filename = getOutputFileName("shared",variables); outputNames.push_back(filename); outputTypes["shared"].push_back(filename); ofstream out2; util.openOutputFile(filename, out2); lookup->eliminateZeroOTUS(); printSharedData(lookup, out2, printHeaders); out2.close(); }else { printSharedData(lookup, out, printHeaders); //prints info to the .shared file } delete lookup; processedLabels.insert(SharedList->getLabel()); userLabels.erase(SharedList->getLabel()); } if ((util.anyLabelsToProcess(SharedList->getLabel(), userLabels, errorOff) ) && (processedLabels.count(lastLabel) != 1)) { string saveLabel = SharedList->getLabel(); delete SharedList; SharedList = input.getSharedListVector(lastLabel); //get new list vector to process lookup = SharedList->getSharedRAbundVector(); m->mothurOut(lookup->getLabel()+"\n"); if (m->getControl_pressed()) { delete SharedList; if (groupMap != NULL) { delete groupMap; } if (countTable != NULL) { delete countTable; } delete lookup; if (!pickedGroups) { out.close(); util.mothurRemove(filename); } return 0; } //if picked groups must split the shared file by label if (pickedGroups) { string filename = listfile; if (outputDir == "") { outputDir += util.hasPath(filename); } map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(filename)); variables["[distance]"] = lookup->getLabel(); filename = getOutputFileName("shared",variables); outputNames.push_back(filename); outputTypes["shared"].push_back(filename); ofstream out2; util.openOutputFile(filename, out2); lookup->eliminateZeroOTUS(); printSharedData(lookup, out2, printHeaders); out2.close(); }else { printSharedData(lookup, out, printHeaders); //prints info to the .shared file } delete lookup; processedLabels.insert(SharedList->getLabel()); userLabels.erase(SharedList->getLabel()); //restore real lastlabel to save below SharedList->setLabel(saveLabel); } lastLabel = SharedList->getLabel(); delete SharedList; SharedList = input.getSharedListVector(); //get new list vector to process } //output error messages about any remaining user labels set<string>::iterator it; bool needToRun = false; for (it = userLabels.begin(); it != userLabels.end(); it++) { if (processedLabels.count(lastLabel) != 1) { needToRun = true; } } //run last label if you need to if (needToRun ) { if (SharedList != NULL) { delete SharedList; } SharedList = input.getSharedListVector(lastLabel); //get new list vector to process lookup = SharedList->getSharedRAbundVector(); m->mothurOut(lookup->getLabel()+"\n"); if (m->getControl_pressed()) { if (groupMap != NULL) { delete groupMap; } if (countTable != NULL) { delete countTable; } if (!pickedGroups) { out.close(); util.mothurRemove(filename); } return 0; } //if picked groups must split the shared file by label if (pickedGroups) { string filename = listfile; if (outputDir == "") { outputDir += util.hasPath(filename); } map<string, string> variables; variables["[filename]"] = outputDir + util.getRootName(util.getSimpleName(filename)); variables["[distance]"] = lookup->getLabel(); filename = getOutputFileName("shared",variables); outputNames.push_back(filename); outputTypes["shared"].push_back(filename); ofstream out2; util.openOutputFile(filename, out2); lookup->eliminateZeroOTUS(); printSharedData(lookup, out2, printHeaders); out2.close(); }else { printSharedData(lookup, out, printHeaders); //prints info to the .shared file } delete lookup; delete SharedList; } if (!pickedGroups) { out.close(); } if (groupMap != NULL) { delete groupMap; } if (countTable != NULL) { delete countTable; } if (m->getControl_pressed()) { if (!pickedGroups) { util.mothurRemove(filename); } return 0; } return 0; } catch(exception& e) { m->errorOut(e, "SharedCommand", "createSharedFromListGroup"); exit(1); } }
//********************************************************************************************************************** int SplitGroupCommand::runCount(){ try { CountTable ct; ct.readTable(countfile, true, false); if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, cannot split by group.\n"); m->control_pressed = true; } if (m->control_pressed) { return 0; } vector<string> namesGroups = ct.getNamesOfGroups(); SharedUtil util; util.setGroups(Groups, namesGroups); //fill filehandles with neccessary ofstreams map<string, string> ffiles; //group -> filename map<string, string> cfiles; //group -> filename for (int i=0; i<Groups.size(); i++) { ofstream ftemp, ctemp; map<string, string> variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile)); variables["[group]"] = Groups[i]; string newFasta = getOutputFileName("fasta",variables); outputNames.push_back(newFasta); outputTypes["fasta"].push_back(newFasta); ffiles[Groups[i]] = newFasta; m->openOutputFile(newFasta, ftemp); ftemp.close(); variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile)); string newCount = getOutputFileName("count",variables); outputNames.push_back(newCount); outputTypes["count"].push_back(newCount); cfiles[Groups[i]] = newCount; m->openOutputFile(newCount, ctemp); ctemp << "Representative_Sequence\ttotal\t" << Groups[i] << endl; ctemp.close(); } ifstream in; m->openInputFile(fastafile, in); while (!in.eof()) { Sequence seq(in); m->gobble(in); if (m->control_pressed) { break; } if (seq.getName() != "") { vector<string> thisSeqsGroups = ct.getGroups(seq.getName()); for (int i = 0; i < thisSeqsGroups.size(); i++) { if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //if this sequence belongs to a group we want them print ofstream outf, outc; m->openOutputFileAppend(ffiles[thisSeqsGroups[i]], outf); seq.printSequence(outf); outf.close(); int numSeqs = ct.getGroupCount(seq.getName(), thisSeqsGroups[i]); m->openOutputFileAppend(cfiles[thisSeqsGroups[i]], outc); outc << seq.getName() << '\t' << numSeqs << '\t' << numSeqs << endl; outc.close(); } } } } in.close(); return 0; } catch(exception& e) { m->errorOut(e, "SplitGroupCommand", "runCount"); exit(1); } }
//********************************************************************************************************************** SharedCommand::SharedCommand(string option) { try { abort = false; calledHelp = false; pickedGroups=false; allLines = 1; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } else if(option == "citation") { citation(); abort = true; calledHelp = true;} else { vector<string> myArray = setParameters(); OptionParser parser(option); map<string, string> parameters = parser.getParameters(); ValidParameters validParameter; map<string, string>::iterator it; //check to make sure all parameters are valid for command for (it = parameters.begin(); it != parameters.end(); it++) { if (!validParameter.isValidParameter(it->first, myArray, it->second)) { abort = true; } } //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.valid(parameters, "inputdir"); if (inputDir == "not found"){ inputDir = ""; } else { string path; it = parameters.find("list"); //user has given a template file if(it != parameters.end()){ path = util.hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["list"] = inputDir + it->second; } } it = parameters.find("group"); //user has given a template file if(it != parameters.end()){ path = util.hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } it = parameters.find("count"); //user has given a template file if(it != parameters.end()){ path = util.hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["count"] = inputDir + it->second; } } it = parameters.find("biom"); //user has given a template file if(it != parameters.end()){ path = util.hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["biom"] = inputDir + it->second; } } } vector<string> tempOutNames; outputTypes["shared"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["map"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.valid(parameters, "outputdir"); if (outputDir == "not found"){ outputDir = ""; } //check for required parameters listfile = validParameter.validFile(parameters, "list"); if (listfile == "not open") { listfile = ""; abort = true; } else if (listfile == "not found") { listfile = ""; } else { current->setListFile(listfile); } biomfile = validParameter.validFile(parameters, "biom"); if (biomfile == "not open") { biomfile = ""; abort = true; } else if (biomfile == "not found") { biomfile = ""; } else { current->setBiomFile(biomfile); } ordergroupfile = validParameter.validFile(parameters, "ordergroup"); if (ordergroupfile == "not open") { abort = true; } else if (ordergroupfile == "not found") { ordergroupfile = ""; } groupfile = validParameter.validFile(parameters, "group"); if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { current->setGroupFile(groupfile); } countfile = validParameter.validFile(parameters, "count"); if (countfile == "not open") { countfile = ""; abort = true; } else if (countfile == "not found") { countfile = ""; } else { current->setCountFile(countfile); CountTable temp; if (!temp.testGroups(countfile)) { m->mothurOut("\n[WARNING]: Your count file does not have group info, all reads will be assigned to mothurGroup.\n"); temp.readTable(countfile, false, false); //dont read groups map<string, int> seqs = temp.getNameMap(); CountTable newCountTable; newCountTable.addGroup("mothurGroup"); for (map<string, int>::iterator it = seqs.begin(); it != seqs.end(); it++) { vector<int> counts; counts.push_back(it->second); newCountTable.push_back(it->first, counts); } string newCountfileName = util.getRootName(countfile) + "mothurGroup" + util.getExtension(countfile); newCountTable.printTable(newCountfileName); current->setCountFile(newCountfileName); countfile = newCountfileName; outputNames.push_back(newCountfileName); } } if ((biomfile == "") && (listfile == "") && (countfile == "")) { //you must provide at least one of the following //is there are current file available for either of these? //give priority to list, then biom, then count listfile = current->getListFile(); if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter.\n"); } else { biomfile = current->getBiomFile(); if (biomfile != "") { m->mothurOut("Using " + biomfile + " as input file for the biom parameter.\n"); } else { countfile = current->getCountFile(); if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter.\n"); } else { m->mothurOut("[ERROR]: No valid current files. You must provide a list or biom or count file before you can use the make.shared command.\n"); abort = true; } } } } else if ((biomfile != "") && (listfile != "")) { m->mothurOut("When executing a make.shared command you must enter ONLY ONE of the following: list or biom.\n"); abort = true; } if (listfile != "") { if ((groupfile == "") && (countfile == "")) { groupfile = current->getGroupFile(); if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter.\n"); } else { countfile = current->getCountFile(); if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter.\n"); } else { m->mothurOut("[ERROR]: You need to provide a groupfile or countfile if you are going to use the list format.\n"); abort = true; } } } } string groups = validParameter.valid(parameters, "groups"); if (groups == "not found") { groups = ""; } else { pickedGroups=true; util.splitAtDash(groups, Groups); if (Groups.size() != 0) { if (Groups[0]== "all") { Groups.clear(); } } } //check for optional parameter and set defaults // ...at some point should added some additional type checking... string label = validParameter.valid(parameters, "label"); if (label == "not found") { label = ""; } else { if(label != "all") { util.splitAtDash(label, labels); allLines = 0; } else { allLines = 1; } } if ((listfile == "") && (biomfile == "") && (countfile != "")) { //building a shared file from a count file, require label if (labels.size() == 0) { m->mothurOut("[ERROR]: You must provide a label when converting a count file to a shared file, please correct.\n"); abort = true; } } } } catch(exception& e) { m->errorOut(e, "SharedCommand", "SharedCommand"); exit(1); } }
//*************************************************************************************************************** int SummaryQualCommand::execute(){ try{ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); int numSeqs = 0; vector<int> position; vector<int> averageQ; vector< vector<int> > scores; if (m->control_pressed) { return 0; } if (namefile != "") { nameMap = m->readNames(namefile); } else if (countfile != "") { CountTable ct; ct.readTable(countfile, false, false); nameMap = ct.getNameMap(); } vector<unsigned long long> positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) positions = m->divideFile(qualfile, processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } #else if (processors == 1) { lines.push_back(linePair(0, 1000)); }else { positions = m->setFilePosFasta(qualfile, numSeqs); if (numSeqs < processors) { processors = numSeqs; } //figure out how many sequences you have to process int numSeqsPerProcessor = numSeqs / processors; for (int i = 0; i < processors; i++) { int startIndex = i * numSeqsPerProcessor; if(i == (processors - 1)){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); } } #endif if(processors == 1){ numSeqs = driverCreateSummary(position, averageQ, scores, qualfile, lines[0]); } else{ numSeqs = createProcessesCreateSummary(position, averageQ, scores, qualfile); } if (m->control_pressed) { return 0; } //print summary file map<string, string> variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qualfile)); string summaryFile = getOutputFileName("summary",variables); printQual(summaryFile, position, averageQ, scores); if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } //output results to screen cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint); m->mothurOutEndLine(); m->mothurOut("Position\tNumSeqs\tAverageQ"); m->mothurOutEndLine(); for (int i = 0; i < position.size(); i+=100) { float average = averageQ[i] / (float) position[i]; cout << i << '\t' << position[i] << '\t' << average; m->mothurOutJustToLog(toString(i) + "\t" + toString(position[i]) + "\t" + toString(average)); m->mothurOutEndLine(); } m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); m->mothurOut(summaryFile); m->mothurOutEndLine(); outputNames.push_back(summaryFile); outputTypes["summary"].push_back(summaryFile); m->mothurOutEndLine(); return 0; } catch(exception& e) { m->errorOut(e, "SummaryQualCommand", "execute"); exit(1); } }
int ClassifySeqsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } string outputMethodTag = method; if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); } else if(method == "zap"){ outputMethodTag = search + "_" + outputMethodTag; if (search == "kmer") { classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); } else { classify = new AlignTree(templateFileName, taxonomyFileName, cutoff); } } else { m->mothurOut(search + " is not a valid method option. I will run the command using wang."); m->mothurOutEndLine(); classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } if (m->control_pressed) { delete classify; return 0; } for (int s = 0; s < fastaFileNames.size(); s++) { m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); string baseTName = m->getSimpleName(taxonomyFileName); //set rippedTaxName to string RippedTaxName = ""; bool foundDot = false; for (int i = baseTName.length()-1; i >= 0; i--) { if (foundDot && (baseTName[i] != '.')) { RippedTaxName = baseTName[i] + RippedTaxName; } else if (foundDot && (baseTName[i] == '.')) { break; } else if (!foundDot && (baseTName[i] == '.')) { foundDot = true; } } //if (RippedTaxName != "") { RippedTaxName += "."; } if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); } map<string, string> variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])); variables["[tag]"] = RippedTaxName; variables["[tag2]"] = outputMethodTag; string newTaxonomyFile = getOutputFileName("taxonomy", variables); string newaccnosFile = getOutputFileName("accnos", variables); string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; string taxSummary = getOutputFileName("taxsummary", variables); if ((method == "knn") && (search == "distance")) { string DistName = getOutputFileName("matchdist", variables); classify->setDistName(DistName); outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName); } outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile); outputNames.push_back(taxSummary); outputTypes["taxsummary"].push_back(taxSummary); int start = time(NULL); int numFastaSeqs = 0; for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); vector<unsigned long long> positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) positions = m->divideFile(fastaFileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } #else if (processors == 1) { lines.push_back(new linePair(0, 1000)); }else { positions = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); if (numFastaSeqs < processors) { processors = numFastaSeqs; } //figure out how many sequences you have to process int numSeqsPerProcessor = numFastaSeqs / processors; for (int i = 0; i < processors; i++) { int startIndex = i * numSeqsPerProcessor; if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); } } #endif if(processors == 1){ numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); }else{ numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); } if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur reversed some your sequences for a better classification. If you would like to take a closer look, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile); }else { m->mothurRemove(newaccnosFile); } m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); start = time(NULL); //read namefile if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush(); nameMap.clear(); //remove old names m->readNames(namefileNames[s], nameMap); m->mothurOut(" Done."); m->mothurOutEndLine(); } //output taxonomy with the unclassified bins added ifstream inTax; m->openInputFile(newTaxonomyFile, inTax); ofstream outTax; string unclass = newTaxonomyFile + ".unclass.temp"; m->openOutputFile(unclass, outTax); //get maxLevel from phylotree so you know how many 'unclassified's to add int maxLevel = classify->getMaxLevel(); //read taxfile - this reading and rewriting is done to preserve the confidence scores. string name, taxon; string group = ""; GroupMap* groupMap = NULL; CountTable* ct = NULL; PhyloSummary* taxaSum; if (hasCount) { ct = new CountTable(); ct->readTable(countfileNames[s], true, false); taxaSum = new PhyloSummary(ct, relabund, printlevel); }else { if (groupfile != "") { group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); } taxaSum = new PhyloSummary(groupMap, relabund, printlevel); } while (!inTax.eof()) { if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } inTax >> name >> taxon; m->gobble(inTax); string newTax = m->addUnclassifieds(taxon, maxLevel, probs); outTax << name << '\t' << newTax << endl; if (namefile != "") { itNames = nameMap.find(name); if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); }else{ for (int i = 0; i < itNames->second.size(); i++) { taxaSum->addSeqToTree(itNames->second[i], newTax); //add it as many times as there are identical seqs } itNames->second.clear(); nameMap.erase(itNames->first); } }else { taxaSum->addSeqToTree(name, newTax); } } inTax.close(); outTax.close(); m->mothurRemove(newTaxonomyFile); rename(unclass.c_str(), newTaxonomyFile.c_str()); if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } //print summary file ofstream outTaxTree; m->openOutputFile(taxSummary, outTaxTree); taxaSum->print(outTaxTree, output); outTaxTree.close(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; m->mothurRemove(tempTaxonomyFile); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); } delete classify; m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); //set taxonomy file as new current taxonomyfile string current = ""; itTypes = outputTypes.find("taxonomy"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } current = ""; itTypes = outputTypes.find("accnos"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); } } return 0; } catch(exception& e) { m->errorOut(e, "ClassifySeqsCommand", "execute"); exit(1); } }