// module.names // -o output_path // -s skip_module.names int parseTask(std::string& content, Task* task, int left, int right) { int space = findSymbol(content, ' ', left, right); splitNames(task->moduleNames, content, left, space); if (space == right) return ERROR_OK; // no more data // parse task options int cursor = space + 1; int hyphen; while (cursor < right) { hyphen = findSymbol(content, '-', cursor, right); if (hyphen == right) break; // no options space = findSymbol(content, ' ', hyphen + 3, right); switch (content[hyphen + 1]) { case 'o': task->output = content.substr(hyphen + 3, space - (hyphen + 3)); break; case 's': splitNames(task->skipModuleNames, content, hyphen + 3, space); break; default: return PROJECT_UNKNOWN_TASK_OPTION_ERROR; } cursor = space + 1; } return ERROR_OK; }
/*
 * Collapse "." and ".." components of the given path in place.
 *
 * A leading '/' is kept as is; only the names after it are examined.
 * collapsible() supplies the number of names, splitNames() fills an index of
 * pointers to them, and joinNames() rebuilds the path from the entries that
 * are still non-null after the pass below.
 */
static void collapse(char *path)
{
    char *names = (path[0] == '/') ? path + 1 : path;   /* Preserve first '/' */
    int count = collapsible(names);
    char **parts;
    int cur;

    if (count < 2)
        return;                                         /* Nothing to do */

    parts = (char **)alloca(count * sizeof(char *));
    splitNames(names, parts);

    for (cur = 0; cur < count; cur++) {
        const char *name = parts[cur];
        int prev;

        /* Only "." and ".." are of interest; skip every other name */
        if (name[0] != '.')
            continue;

        if (name[1] == '\0') {
            /* ".": simply drop it */
            parts[cur] = 0;
            continue;
        }

        if (name[1] != '.' || name[2] != '\0')
            continue;

        /* "..": look backwards for the nearest name that is still present */
        prev = cur - 1;
        while (prev >= 0 && !parts[prev])
            prev--;

        /* With no preceding name the ".." is left as is; otherwise both go */
        if (prev >= 0) {
            parts[prev] = 0;
            parts[cur] = 0;
        }
    }

    joinNames(names, count, parts);
}
/*
 * Build an expanded copy of a name list.
 *
 * list        : input entries, terminated by an entry whose name is NULL.
 * list_size_p : receives the number of entries written to the result.
 *
 * Returns a newly malloc'ed array. For every input entry it holds the
 * space-delimited name, optionally a nameWithSpace() variant, the pieces
 * produced by repeated splitNames() calls and, for pieces containing a '.',
 * an extra entry cut at the first '.'. All original entries are appended
 * after these. No NULL-name terminator is written to the result; the count
 * in *list_size_p is the only length information.
 *
 * Notes: the results of malloc()/realloc() are not checked, and the working
 * copy `name` made for each entry is not freed in this function.
 */
NAMELIST *edit_list(NAMELIST *list, size_t *list_size_p){
  long list_size=MAXLETTER, i, j;   /* list_size: capacity of list2; j: next free slot */
  NAMELIST *list2, tmp;
  char *name, *periodName;
  long from, to, lengthLimit;
  fpos_t fposition;
  char *periodPosition;

  list2=(NAMELIST *)malloc(list_size*sizeof(NAMELIST));

  /* delimit by space */
  j=0;
  for(i=0;list[i].name!=NULL;i++){
    tmp=seqnameDelimitBySpace(list[i]);
    /*fprintf(stderr, "after delimit by space: [%s]\n", tmp.name);*/
    if(tmp.name[0]!='\0'){
      /* grow early: up to two entries are stored before the next check */
      if(j+10 >= list_size){
	list_size+=MAXLETTER;
	list2=(NAMELIST *)realloc(list2,list_size*sizeof(NAMELIST));
      }
      list2[j]=tmp;
      j++;
      /* a name starting with whitespace gets an additional variant entry */
      if(isspace(tmp.name[0])){
	list2[j]=nameWithSpace(&tmp);
	j++;
      }

      /* work on a private copy of the name while splitting it */
      name=(char *)malloc((strlen(tmp.name)+1)*sizeof(char));
      strcpy(name, tmp.name);
      lengthLimit=strlen(name)-1;
      /*printf(" input: %s, length: %ld\n",name, lengthLimit); */
      /* presumably splitNames() advances from/to through the pointers - TODO confirm */
      for(from=tmp.from-1, to=tmp.from-1, fposition=tmp.fposition; name!=NULL && from < lengthLimit; j++){
	/* keep room for this piece plus a possible '.'-truncated extra entry */
	if(j>=list_size-2){
	  list_size+=MAXLETTER;
	  list2=(NAMELIST *)realloc(list2,list_size*sizeof(NAMELIST));
	}
	/* printf("Input: %s %ld-%ld::%lld\n", name, from, to, fposition); *//*debug */
	tmp=splitNames(&from, &to, name, fposition);
	/* printf("result; %s %ld-%ld::%lld / %ld\n", tmp.name, from, to, fposition, lengthLimit); */ /* debug */
	if(tmp.name!=NULL && tmp.name[0]!='\0'){
	  list2[j]=tmp;
	  /*	  name+=list2[j].to+1;*/
	}else{
	  /* no further piece: leave without advancing j */
	  break;
	}
	/* a piece containing '.' also yields an entry cut at the first '.' */
	if(strstr(tmp.name, ".")!=NULL){
	  j++;
	  periodName=(char *)malloc((strlen(tmp.name)+1)*sizeof(char));
	  strcpy(periodName, tmp.name);
	  periodPosition=strstr(periodName, ".");
	  *periodPosition='\0';
	  list2[j].name=periodName;
	  list2[j].from =list2[j-1].from;
	  /* shorten the end position by the number of characters cut off */
	  list2[j].to =list2[j-1].to-(strlen(tmp.name)-strlen(periodName));
	  list2[j].fposition=list2[j-1].fposition;
	  /*	  printf("found period: %s %d-%d::%lld\n", list2[j].name, list2[j].from, list2[j].to, list2[j].fposition);*/
	}
      }
    }
  }

  /* append the original entries of list to list2 */
  for(i=0;list[i].name!=NULL;j++, i++){
    if(j>=list_size){
      list_size+=MAXLETTER;
      list2=(NAMELIST *)realloc(list2,list_size*sizeof(NAMELIST));
    }
    list2[j]=list[i];
  }

  /* report the number of entries stored in list2 */
  *list_size_p=j;
  return list2;
}
//******************************************************************************************************************** int SplitMatrix::splitDistanceRAM() { try { vector<set<string> > groups; vector<string> outputs; int numGroups = 0; ifstream dFile; m->openInputFile(distFile, dFile); while(dFile) { string seqA, seqB; float dist; dFile >> seqA >> seqB >> dist; if (m->control_pressed) { dFile.close(); for(int i=0; i<numGroups; i++) { if(groups[i].size() > 0) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; } if(dist < cutoff) { //cout << "in cutoff: " << dist << endl; int groupIDA = -1; int groupIDB = -1; int groupID = -1; for(int i=0; i<numGroups; i++) { set<string>::iterator aIt = groups[i].find(seqA); set<string>::iterator bIt = groups[i].find(seqB); if(groupIDA == -1 && aIt != groups[i].end()) { //seqA is not already assigned to a group and is in group[i], so assign seqB to group[i] groups[i].insert(seqB); groupIDA = i; groupID = groupIDA; //cout << "in aIt: " << groupID << endl; // break; } else if(groupIDB == -1 && bIt != groups[i].end()) { //seqB is not already assigned to a group and is in group[i], so assign seqA to group[i] groups[i].insert(seqA); groupIDB = i; groupID = groupIDB; // cout << "in bIt: " << groupID << endl; // break; } if(groupIDA != -1 && groupIDB != -1) { //both ifs above have been executed, so we need to decide who to assign them to if(groupIDA < groupIDB) { // cout << "A: " << groupIDA << "\t" << groupIDB << endl; groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA groups[groupIDB].clear(); groupID = groupIDA; } else { // cout << "B: " << groupIDA << "\t" << groupIDB << endl; groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB groups[groupIDA].clear(); groupID = groupIDB; } break; } } //windows is gonna gag on the reuse of outFile, will need to make it local... 
if(groupIDA == -1 && groupIDB == -1) { //we need a new group set<string> newGroup; newGroup.insert(seqA); newGroup.insert(seqB); groups.push_back(newGroup); string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; outputs.push_back(tempOut); numGroups++; } else { outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; if(groupIDA != -1 && groupIDB != -1) { //merge distance files of two groups you merged above string row, column, distance; if(groupIDA<groupIDB) { //merge memory outputs[groupID] += outputs[groupIDB]; outputs[groupIDB] = ""; } else { outputs[groupID] += outputs[groupIDA]; outputs[groupIDA] = ""; } } } } m->gobble(dFile); } dFile.close(); vector<string> tempDistFiles; for (int i = 0; i < numGroups; i++) { string fileName = distFile + "." + toString(i) + ".temp"; tempDistFiles.push_back(fileName); if (outputs[i] != "") { ofstream outFile; outFile.open(fileName.c_str(), ios::ate); outFile << outputs[i]; outFile.close(); } } map<string, int> seqGroup; for (int i = 0; i < groups.size(); i++) { for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) { seqGroup[*itNames] = i; groups[i].erase(itNames++); } } splitNames(seqGroup, numGroups, tempDistFiles); return 0; } catch(exception& e) { m->errorOut(e, "SplitMatrix", "splitDistanceRAM"); exit(1); } }
int SplitMatrix::splitDistanceLarge() { try { vector<set<string> > groups; //for buffering the io to improve speed //allow for 30 dists to be stored, then output. vector<string> outputs; vector<int> numOutputs; vector<bool> wroteOutPut; int numGroups = 0; //ofstream outFile; ifstream dFile; m->openInputFile(distFile, dFile); while(dFile) { string seqA, seqB; float dist; dFile >> seqA >> seqB >> dist; if (m->control_pressed) { dFile.close(); for(int i=0; i<numGroups; i++) { if(groups[i].size() > 0) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; } if(dist < cutoff) { //cout << "in cutoff: " << dist << endl; int groupIDA = -1; int groupIDB = -1; int groupID = -1; for(int i=0; i<numGroups; i++) { set<string>::iterator aIt = groups[i].find(seqA); set<string>::iterator bIt = groups[i].find(seqB); if(groupIDA == -1 && aIt != groups[i].end()) { //seqA is not already assigned to a group and is in group[i], so assign seqB to group[i] groups[i].insert(seqB); groupIDA = i; groupID = groupIDA; //cout << "in aIt: " << groupID << endl; // break; } else if(groupIDB == -1 && bIt != groups[i].end()) { //seqB is not already assigned to a group and is in group[i], so assign seqA to group[i] groups[i].insert(seqA); groupIDB = i; groupID = groupIDB; // cout << "in bIt: " << groupID << endl; // break; } if(groupIDA != -1 && groupIDB != -1) { //both ifs above have been executed, so we need to decide who to assign them to if(groupIDA < groupIDB) { // cout << "A: " << groupIDA << "\t" << groupIDB << endl; groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA groups[groupIDB].clear(); groupID = groupIDA; } else { // cout << "B: " << groupIDA << "\t" << groupIDB << endl; groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB groups[groupIDA].clear(); groupID = groupIDB; } break; } } //windows is gonna gag on the reuse of outFile, will need to make it local... 
if(groupIDA == -1 && groupIDB == -1) { //we need a new group set<string> newGroup; newGroup.insert(seqA); newGroup.insert(seqB); groups.push_back(newGroup); string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; outputs.push_back(tempOut); numOutputs.push_back(1); wroteOutPut.push_back(false); numGroups++; } else { string fileName = distFile + "." + toString(groupID) + ".temp"; //have we reached the max buffer size if (numOutputs[groupID] > 60) { //write out sequence ofstream outFile; outFile.open(fileName.c_str(), ios::app); outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl; outFile.close(); outputs[groupID] = ""; numOutputs[groupID] = 0; wroteOutPut[groupID] = true; } else { outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; numOutputs[groupID]++; } if(groupIDA != -1 && groupIDB != -1) { //merge distance files of two groups you merged above string row, column, distance; if(groupIDA<groupIDB) { //merge memory numOutputs[groupID] += numOutputs[groupIDB]; outputs[groupID] += outputs[groupIDB]; outputs[groupIDB] = ""; numOutputs[groupIDB] = 0; //if groupB is written to file it is above buffer size so read and write to new merged file if (wroteOutPut[groupIDB]) { string fileName2 = distFile + "." + toString(groupIDB) + ".temp"; m->appendFiles(fileName2, fileName); m->mothurRemove(fileName2); //write out the merged memory if (numOutputs[groupID] > 60) { ofstream tempOut; m->openOutputFile(fileName, tempOut); tempOut << outputs[groupID]; outputs[groupID] = ""; numOutputs[groupID] = 0; tempOut.close(); } //outFile.close(); wroteOutPut[groupID] = true; wroteOutPut[groupIDB] = false; } else { } //just merge b's memory with a's memory } else { numOutputs[groupID] += numOutputs[groupIDA]; outputs[groupID] += outputs[groupIDA]; outputs[groupIDA] = ""; numOutputs[groupIDA] = 0; if (wroteOutPut[groupIDA]) { string fileName2 = distFile + "." 
+ toString(groupIDA) + ".temp"; m->appendFiles(fileName2, fileName); m->mothurRemove(fileName2); //write out the merged memory if (numOutputs[groupID] > 60) { ofstream tempOut; m->openOutputFile(fileName, tempOut); tempOut << outputs[groupID]; outputs[groupID] = ""; numOutputs[groupID] = 0; tempOut.close(); } //outFile.close(); wroteOutPut[groupID] = true; wroteOutPut[groupIDA] = false; } else { } //just merge memory } } } } m->gobble(dFile); } dFile.close(); vector<string> tempDistFiles; for (int i = 0; i < numGroups; i++) { string fileName = distFile + "." + toString(i) + ".temp"; tempDistFiles.push_back(fileName); //remove old names files just in case if (numOutputs[i] > 0) { ofstream outFile; outFile.open(fileName.c_str(), ios::app); outFile << outputs[i]; outFile.close(); } } map<string, int> seqGroup; for (int i = 0; i < groups.size(); i++) { for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) { seqGroup[*itNames] = i; groups[i].erase(itNames++); } } splitNames(seqGroup, numGroups, tempDistFiles); return 0; } catch(exception& e) { m->errorOut(e, "SplitMatrix", "splitDistanceLarge"); exit(1); } }
int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups) { try { map<string, int>::iterator it; map<string, int>::iterator it2; ofstream outFile; ifstream dFile; m->openInputFile(distFile, dFile); for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case m->mothurRemove((distFile + "." + toString(i) + ".temp")); } //for buffering the io to improve speed //allow for 10 dists to be stored, then output. vector<string> outputs; outputs.resize(numGroups, ""); vector<int> numOutputs; numOutputs.resize(numGroups, 0); //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value vector<bool> validDistances; validDistances.resize(numGroups, false); //for each distance while(dFile) { string seqA, seqB; float dist; if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } dFile >> seqA >> seqB >> dist; m->gobble(dFile); //if both sequences are in the same group then they are within the cutoff it = seqGroup.find(seqA); it2 = seqGroup.find(seqB); if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons if (it->second == it2->second) { //they are from the same group so add the distance if (numOutputs[it->second] > 30) { m->openOutputFileAppend((distFile + "." 
+ toString(it->second) + ".temp"), outFile); outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl; outFile.close(); outputs[it->second] = ""; numOutputs[it->second] = 0; validDistances[it->second] = true; } else { outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; numOutputs[it->second]++; } } } } dFile.close(); string inputFile = namefile; if (countfile != "") { inputFile = countfile; } vector<string> tempDistFiles; for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case string tempDistFile = distFile + "." + toString(i) + ".temp"; tempDistFiles.push_back(tempDistFile); m->mothurRemove((inputFile + "." + toString(i) + ".temp")); //write out any remaining buffers if (numOutputs[i] > 0) { m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile); outFile << outputs[i]; outFile.close(); outputs[i] = ""; numOutputs[i] = 0; validDistances[i] = true; } } splitNames(seqGroup, numGroups, tempDistFiles); if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } return 0; } catch(exception& e) { m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax"); exit(1); } }
int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups) { try { map<string, int> copyGroups = seqGroup; map<string, int>::iterator it; set<string> names; ifstream in; m->openInputFile(fastafile, in); for (int i = 0; i < numGroups; i++) { m->mothurRemove((fastafile + "." + toString(i) + ".temp")); } //parse fastafile while (!in.eof()) { Sequence query(in); m->gobble(in); if (query.getName() != "") { it = seqGroup.find(query.getName()); //save names in case no namefile is given if ((namefile == "") && (countfile == "")) { names.insert(query.getName()); } if (it != seqGroup.end()) { //not singleton ofstream outFile; m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); query.printSequence(outFile); outFile.close(); copyGroups.erase(query.getName()); } } } in.close(); bool error = false; //warn about sequence in groups that are not in fasta file for(it = copyGroups.begin(); it != copyGroups.end(); it++) { m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate an error."); m->mothurOutEndLine(); error = true; } copyGroups.clear(); if (error) { exit(1); } if (outputType == "distance") { //create distance matrices for each fasta file //process each distance file for (int i = 0; i < numGroups; i++) { string options = ""; if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; } else { options = "fasta=" + (fastafile + "." 
+ toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); } if (outputDir != "") { options += ", outputdir=" + outputDir; } m->mothurCalling = true; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: dist.seqs(" + options + ")"); m->mothurOutEndLine(); m->mothurCalling = true; Command* command = new DistanceCommand(options); m->mothurOut("/******************************************/"); m->mothurOutEndLine(); command->execute(); delete command; m->mothurCalling = false; m->mothurRemove((fastafile + "." + toString(i) + ".temp")); //remove old names files just in case if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); } else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); } } } //restore old fasta file name since dist.seqs overwrites it with the temp files m->setFastaFile(fastafile); vector<string> tempDistFiles; for(int i=0; i<numGroups; i++) { if (outputDir == "") { outputDir = m->hasPath(fastafile); } string tempDistFile = (fastafile + "." + toString(i) + ".temp"); if (outputType == "distance") { if (classic) { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist"; } else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; } } tempDistFiles.push_back(tempDistFile); } if (method == "vsearch") { splitNamesVsearch(seqGroup, numGroups, tempDistFiles); } else { splitNames(seqGroup, numGroups, tempDistFiles); } if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } return 0; } catch(exception& e) { m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax"); exit(1); } }