Пример #1
0
// Parses one task description located in content between `left` and `right`.
//
// Expected layout:
//   module.names [-o output_path] [-s skip_module.names]
//
// The leading token (up to the first space) is handed to splitNames() to fill
// task->moduleNames. Each following option is introduced by '-', identified by
// the character right after it, and its value is taken to start three
// characters past the '-' and to run up to the next space:
//   -o  value is stored in task->output
//   -s  value is split into task->skipModuleNames
//
// Returns ERROR_OK on success, or PROJECT_UNKNOWN_TASK_OPTION_ERROR when an
// option letter other than 'o' / 's' is met.
//
// NOTE(review): findSymbol() appears to return `right` when the symbol is not
// found (both "nothing left" checks below rely on that) -- confirm against its
// definition.
int parseTask(std::string& content, Task* task, int left, int right) {
  // The module-name list ends at the first space (or at `right`).
  int tokenEnd = findSymbol(content, ' ', left, right);
  splitNames(task->moduleNames, content, left, tokenEnd);

  if (tokenEnd == right) {
    return ERROR_OK;  // the names were the whole task, no options follow
  }

  // Walk the options; every pass resumes just after the previous value.
  for (int pos = tokenEnd + 1; pos < right; pos = tokenEnd + 1) {
    const int dash = findSymbol(content, '-', pos, right);
    if (dash == right) {
      break;  // no further option marker
    }

    // Layout is "-X value": the value begins 3 characters after the dash.
    const int valueBegin = dash + 3;
    tokenEnd = findSymbol(content, ' ', valueBegin, right);

    const char option = content[dash + 1];
    if (option == 'o') {
      task->output = content.substr(valueBegin, tokenEnd - valueBegin);
    } else if (option == 's') {
      splitNames(task->skipModuleNames, content, valueBegin, tokenEnd);
    } else {
      return PROJECT_UNKNOWN_TASK_OPTION_ERROR;
    }
  }

  return ERROR_OK;
}
/*
 * Collapse "." and ".." components of path in place.
 *
 * A leading '/' is kept as is; only the part after it is processed. The part
 * is split into components with splitNames(), every "." component is dropped,
 * every ".." component is dropped together with the closest preceding
 * component that is still present, and the survivors are written back with
 * joinNames(). A ".." that has no surviving predecessor is left untouched.
 *
 * NOTE(review): a ".." that was left untouched still counts as a "preceding
 * component" for a later "..", so the two cancel each other out -- confirm
 * this is intended for the paths callers pass in.
 */
static void
collapse(char *path)
{
    char *names = (path[0] == '/') ? path + 1 : path; /* keep the first '/' */
    int count = collapsible(names);
    char **parts;
    int cur, prev;

    if (count < 2) return;      /* fewer than two components: nothing to do */

    /* One slot per component; a slot set to 0 marks a removed component. */
    parts = (char **)alloca(count * sizeof(char *));
    splitNames(names, parts);

    for (cur = 0; cur < count; cur++) {
        const char *name = parts[cur];

        if (name[0] != '.') continue;           /* ordinary name */

        if (name[1] == '\0') {                  /* "." : simply drop it */
            parts[cur] = 0;
            continue;
        }

        if (name[1] != '.' || name[2] != '\0') continue; /* ".x", "..x", ... */

        /* "..": look backwards for the nearest component still present */
        prev = cur - 1;
        while (prev >= 0 && !parts[prev]) {
            prev--;
        }
        if (prev >= 0) {
            /* drop both that component and this ".." */
            parts[prev] = 0;
            parts[cur] = 0;
        }
        /* otherwise there is nothing to cancel against; keep the ".." */
    }

    joinNames(names, count, parts);
}
Пример #3
0
/*
 * Build an expanded copy of a name list.
 *
 * list         input array, terminated by an entry whose name is NULL.
 * list_size_p  receives the number of entries stored in the returned array.
 *
 * For every input entry the result receives, in order:
 *   - the entry as returned by seqnameDelimitBySpace() (skipped entirely,
 *     together with everything below, when that name is empty);
 *   - when that name starts with whitespace, the result of nameWithSpace();
 *   - every piece produced by repeatedly calling splitNames() on a copy of
 *     the name, until it yields a NULL/empty name or `from` reaches the last
 *     character;
 *   - for every piece containing '.', an extra entry holding the text before
 *     the first '.', with `to` shortened by the number of characters cut off.
 * Finally all original input entries are appended unchanged.
 *
 * The returned array is heap-allocated and is not NULL-terminated; use
 * *list_size_p for its length.
 *
 * NOTE(review): malloc()/realloc() results are not checked, and the per-entry
 * `name` copy is never freed here. Whether splitNames() keeps pointers into
 * that buffer is not visible from this function -- confirm before adding a
 * free().
 */
NAMELIST *edit_list(NAMELIST *list, size_t *list_size_p){
  long list_size=MAXLETTER, i, j;
  NAMELIST *list2, tmp;
  char *name, *periodName;
  long from, to, lengthLimit;
  fpos_t fposition;
  char *periodPosition;

  list2=(NAMELIST *)malloc(list_size*sizeof(NAMELIST));

  /* delimit by space */
  j=0;
  for(i=0;list[i].name!=NULL;i++){
    tmp=seqnameDelimitBySpace(list[i]);
    if(tmp.name[0]!='\0'){
      /* keep some headroom: up to two entries are stored before the next
         capacity check inside the splitting loop */
      if(j+10 >= list_size){
        list_size+=MAXLETTER;
        list2=(NAMELIST *)realloc(list2,list_size*sizeof(NAMELIST));
      }
      list2[j]=tmp;
      j++;
      /* isspace() requires a value representable as unsigned char (or EOF);
         a plain char holding a byte >= 0x80 may be negative, which is
         undefined behaviour, hence the cast */
      if(isspace((unsigned char)tmp.name[0])){
        list2[j]=nameWithSpace(&tmp);
        j++;
      }

      /* work on a private copy of the name while splitting it */
      name=(char *)malloc((strlen(tmp.name)+1)*sizeof(char));
      strcpy(name, tmp.name);
      lengthLimit=strlen(name)-1;
      for(from=tmp.from-1, to=tmp.from-1, fposition=tmp.fposition;
          name!=NULL && from < lengthLimit;
          j++){
        /* each pass may store two entries (the piece and its pre-period part) */
        if(j>=list_size-2){
          list_size+=MAXLETTER;
          list2=(NAMELIST *)realloc(list2,list_size*sizeof(NAMELIST));
        }
        tmp=splitNames(&from, &to, name, fposition);
        if(tmp.name!=NULL && tmp.name[0]!='\0'){
          list2[j]=tmp;
        }else{
          break; /* nothing more to split; j is not advanced */
        }

        if(strstr(tmp.name, ".")!=NULL){
          /* add a second entry holding only the text before the first '.' */
          j++;
          periodName=(char *)malloc((strlen(tmp.name)+1)*sizeof(char));
          strcpy(periodName, tmp.name);
          periodPosition=strstr(periodName, ".");
          *periodPosition='\0';
          list2[j].name=periodName;
          list2[j].from     =list2[j-1].from;
          list2[j].to       =list2[j-1].to-(strlen(tmp.name)-strlen(periodName));
          list2[j].fposition=list2[j-1].fposition;
        }
      }
    }
  }

  /* append the original entries after the derived ones */
  for(i=0;list[i].name!=NULL;j++, i++){
    if(j>=list_size){
      list_size+=MAXLETTER;
      list2=(NAMELIST *)realloc(list2,list_size*sizeof(NAMELIST));
    }
    list2[j]=list[i];
  }

  *list_size_p=j;
  return list2;
}
Пример #4
0
//********************************************************************************************************************
int SplitMatrix::splitDistanceRAM() {
    try {
        vector<set<string> > groups;
        vector<string> outputs;

        int numGroups = 0;

        ifstream dFile;
        m->openInputFile(distFile, dFile);

        while(dFile) {
            string seqA, seqB;
            float dist;

            dFile >> seqA >> seqB >> dist;

            if (m->control_pressed) {
                dFile.close();
                for(int i=0; i<numGroups; i++) {
                    if(groups[i].size() > 0) {
                        m->mothurRemove((distFile + "." + toString(i) + ".temp"));
                    }
                }
                return 0;
            }

            if(dist < cutoff) {
                //cout << "in cutoff: " << dist << endl;
                int groupIDA = -1;
                int groupIDB = -1;
                int groupID = -1;

                for(int i=0; i<numGroups; i++) {
                    set<string>::iterator aIt = groups[i].find(seqA);
                    set<string>::iterator bIt = groups[i].find(seqB);

                    if(groupIDA == -1 && aIt != groups[i].end()) { //seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
                        groups[i].insert(seqB);
                        groupIDA = i;
                        groupID = groupIDA;

                        //cout << "in aIt: " << groupID << endl;
                        //					break;
                    }
                    else if(groupIDB == -1 && bIt != groups[i].end()) { //seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
                        groups[i].insert(seqA);
                        groupIDB = i;
                        groupID = groupIDB;

                        //	cout << "in bIt: " << groupID << endl;
                        //					break;
                    }

                    if(groupIDA != -1 && groupIDB != -1) { //both ifs above have been executed, so we need to decide who to assign them to
                        if(groupIDA < groupIDB) {
                            //	cout << "A: " << groupIDA << "\t" << groupIDB << endl;
                            groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
                            groups[groupIDB].clear();
                            groupID = groupIDA;
                        }
                        else {
                            //	cout << "B: " << groupIDA << "\t" << groupIDB << endl;
                            groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
                            groups[groupIDA].clear();
                            groupID = groupIDB;
                        }
                        break;
                    }
                }

                //windows is gonna gag on the reuse of outFile, will need to make it local...

                if(groupIDA == -1 && groupIDB == -1) { //we need a new group
                    set<string> newGroup;
                    newGroup.insert(seqA);
                    newGroup.insert(seqB);
                    groups.push_back(newGroup);

                    string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
                    outputs.push_back(tempOut);
                    numGroups++;
                }
                else {

                    outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';

                    if(groupIDA != -1 && groupIDB != -1) { //merge distance files of two groups you merged above
                        string row, column, distance;
                        if(groupIDA<groupIDB) {
                            //merge memory
                            outputs[groupID] += outputs[groupIDB];
                            outputs[groupIDB] = "";
                        } else {
                            outputs[groupID] += outputs[groupIDA];
                            outputs[groupIDA] = "";
                        }
                    }
                }
            }
            m->gobble(dFile);
        }
        dFile.close();

        vector<string> tempDistFiles;
        for (int i = 0; i < numGroups; i++) {
            string fileName = distFile + "." + toString(i) + ".temp";
            tempDistFiles.push_back(fileName);
            if (outputs[i] != "") {
                ofstream outFile;
                outFile.open(fileName.c_str(), ios::ate);
                outFile << outputs[i];
                outFile.close();
            }
        }

        map<string, int> seqGroup;
        for (int i = 0; i < groups.size(); i++) {
            for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
                seqGroup[*itNames] = i;
                groups[i].erase(itNames++);
            }
        }

        splitNames(seqGroup, numGroups, tempDistFiles);

        return 0;
    }
    catch(exception& e) {
        m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
        exit(1);
    }
}
Пример #5
0
int SplitMatrix::splitDistanceLarge() {
    try {
        vector<set<string> > groups;

        //for buffering the io to improve speed
        //allow for 30 dists to be stored, then output.
        vector<string> outputs;
        vector<int> numOutputs;
        vector<bool> wroteOutPut;

        int numGroups = 0;

        //ofstream outFile;
        ifstream dFile;
        m->openInputFile(distFile, dFile);

        while(dFile) {
            string seqA, seqB;
            float dist;

            dFile >> seqA >> seqB >> dist;

            if (m->control_pressed) {
                dFile.close();
                for(int i=0; i<numGroups; i++) {
                    if(groups[i].size() > 0) {
                        m->mothurRemove((distFile + "." + toString(i) + ".temp"));
                    }
                }
                return 0;
            }

            if(dist < cutoff) {
                //cout << "in cutoff: " << dist << endl;
                int groupIDA = -1;
                int groupIDB = -1;
                int groupID = -1;

                for(int i=0; i<numGroups; i++) {
                    set<string>::iterator aIt = groups[i].find(seqA);
                    set<string>::iterator bIt = groups[i].find(seqB);

                    if(groupIDA == -1 && aIt != groups[i].end()) { //seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
                        groups[i].insert(seqB);
                        groupIDA = i;
                        groupID = groupIDA;

                        //cout << "in aIt: " << groupID << endl;
                        //					break;
                    }
                    else if(groupIDB == -1 && bIt != groups[i].end()) { //seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
                        groups[i].insert(seqA);
                        groupIDB = i;
                        groupID = groupIDB;

                        //	cout << "in bIt: " << groupID << endl;
                        //					break;
                    }

                    if(groupIDA != -1 && groupIDB != -1) { //both ifs above have been executed, so we need to decide who to assign them to
                        if(groupIDA < groupIDB) {
                            //	cout << "A: " << groupIDA << "\t" << groupIDB << endl;
                            groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
                            groups[groupIDB].clear();
                            groupID = groupIDA;
                        }
                        else {
                            //	cout << "B: " << groupIDA << "\t" << groupIDB << endl;
                            groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
                            groups[groupIDA].clear();
                            groupID = groupIDB;
                        }
                        break;
                    }
                }

                //windows is gonna gag on the reuse of outFile, will need to make it local...

                if(groupIDA == -1 && groupIDB == -1) { //we need a new group
                    set<string> newGroup;
                    newGroup.insert(seqA);
                    newGroup.insert(seqB);
                    groups.push_back(newGroup);

                    string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
                    outputs.push_back(tempOut);
                    numOutputs.push_back(1);
                    wroteOutPut.push_back(false);

                    numGroups++;
                }
                else {
                    string fileName = distFile + "." + toString(groupID) + ".temp";

                    //have we reached the max buffer size
                    if (numOutputs[groupID] > 60) { //write out sequence
                        ofstream outFile;
                        outFile.open(fileName.c_str(), ios::app);
                        outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
                        outFile.close();

                        outputs[groupID] = "";
                        numOutputs[groupID] = 0;
                        wroteOutPut[groupID] = true;
                    } else {
                        outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
                        numOutputs[groupID]++;
                    }

                    if(groupIDA != -1 && groupIDB != -1) { //merge distance files of two groups you merged above
                        string row, column, distance;
                        if(groupIDA<groupIDB) {

                            //merge memory
                            numOutputs[groupID] += numOutputs[groupIDB];
                            outputs[groupID] += outputs[groupIDB];

                            outputs[groupIDB] = "";
                            numOutputs[groupIDB] = 0;

                            //if groupB is written to file it is above buffer size so read and write to new merged file
                            if (wroteOutPut[groupIDB]) {
                                string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
                                m->appendFiles(fileName2, fileName);
                                m->mothurRemove(fileName2);


                                //write out the merged memory
                                if (numOutputs[groupID] > 60) {
                                    ofstream tempOut;
                                    m->openOutputFile(fileName, tempOut);
                                    tempOut << outputs[groupID];
                                    outputs[groupID] = "";
                                    numOutputs[groupID] = 0;
                                    tempOut.close();
                                }

                                //outFile.close();

                                wroteOutPut[groupID] = true;
                                wroteOutPut[groupIDB] = false;
                            } else { } //just merge b's memory with a's memory
                        }
                        else {
                            numOutputs[groupID] += numOutputs[groupIDA];
                            outputs[groupID] += outputs[groupIDA];

                            outputs[groupIDA] = "";
                            numOutputs[groupIDA] = 0;

                            if (wroteOutPut[groupIDA]) {
                                string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
                                m->appendFiles(fileName2, fileName);
                                m->mothurRemove(fileName2);

                                //write out the merged memory
                                if (numOutputs[groupID] > 60) {
                                    ofstream tempOut;
                                    m->openOutputFile(fileName, tempOut);
                                    tempOut << outputs[groupID];
                                    outputs[groupID] = "";
                                    numOutputs[groupID] = 0;
                                    tempOut.close();
                                }

                                //outFile.close();

                                wroteOutPut[groupID] = true;
                                wroteOutPut[groupIDA] = false;
                            } else { } //just merge memory
                        }
                    }
                }
            }
            m->gobble(dFile);
        }
        dFile.close();

        vector<string> tempDistFiles;
        for (int i = 0; i < numGroups; i++) {
            string fileName = distFile + "." + toString(i) + ".temp";
            tempDistFiles.push_back(fileName);
            //remove old names files just in case

            if (numOutputs[i] > 0) {
                ofstream outFile;
                outFile.open(fileName.c_str(), ios::app);
                outFile << outputs[i];
                outFile.close();
            }
        }

        map<string, int> seqGroup;
        for (int i = 0; i < groups.size(); i++) {
            for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
                seqGroup[*itNames] = i;
                groups[i].erase(itNames++);
            }
        }

        splitNames(seqGroup, numGroups, tempDistFiles);

        return 0;
    }
    catch(exception& e) {
        m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
        exit(1);
    }
}
Пример #6
0
/**
 * Distributes the rows of the distance file ("seqA seqB dist") over per-group
 * temp files, using an already computed sequence -> group assignment.
 *
 * A row is kept only when both sequences are present in seqGroup and map to
 * the same group; it then goes to "<distFile>.<group>.temp". Rows are buffered
 * per group and appended to the file once a group already holds more than 30
 * buffered rows; leftovers are appended after the whole file has been read.
 * Finally splitNames() is called with the list of temp file names.
 *
 * @param seqGroup   sequence name -> group index
 * @param numGroups  number of groups; the values in seqGroup are used as
 *                   indices into vectors of this size
 * @return 0, both on completion and when m->control_pressed is seen while
 *         reading. Any exception is reported through m->errorOut() and ends
 *         the process.
 */
int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups) {
    try {
        ifstream dFile;
        m->openInputFile(distFile, dFile);

        for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
            m->mothurRemove((distFile + "." + toString(i) + ".temp"));
        }

        // per-group buffering of the io to improve speed
        vector<string> outputs(numGroups, "");
        vector<int> numOutputs(numGroups, 0);

        //for each distance
        while (dFile) {
            if (m->control_pressed) {
                dFile.close();
                for (int i = 0; i < numGroups; i++) {
                    m->mothurRemove((distFile + "." + toString(i) + ".temp"));
                }
                // Stop here, as the sibling split functions do. Carrying on would
                // flush the buffers below and recreate the files just removed.
                return 0;
            }

            string seqA, seqB;
            float dist = 0;

            dFile >> seqA >> seqB >> dist;

            // extraction failed (end of data or malformed row): nothing to process
            if (dFile.fail()) { break; }

            m->gobble(dFile);

            map<string, int>::iterator itA = seqGroup.find(seqA);
            map<string, int>::iterator itB = seqGroup.find(seqB);

            // both sequences must be assigned, and to the same group
            if ((itA == seqGroup.end()) || (itB == seqGroup.end())) { continue; }
            if (itA->second != itB->second) { continue; }

            const int group = itA->second;
            if (numOutputs[group] > 30) {
                // buffer is full: append it, plus the current row, to the group's file
                ofstream outFile;
                m->openOutputFileAppend((distFile + "." + toString(group) + ".temp"), outFile);
                outFile << outputs[group] << seqA << '\t' << seqB << '\t' << dist << endl;
                outFile.close();
                outputs[group] = "";
                numOutputs[group] = 0;
            } else {
                outputs[group] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
                numOutputs[group]++;
            }
        }
        dFile.close();

        string inputFile = namefile;
        if (countfile != "") {
            inputFile = countfile;
        }

        vector<string> tempDistFiles;
        for (int i = 0; i < numGroups; i++) {
            string tempDistFile = distFile + "." + toString(i) + ".temp";
            tempDistFiles.push_back(tempDistFile);

            //remove old name/count temp files, just in case
            m->mothurRemove((inputFile + "." + toString(i) + ".temp"));

            //write out any remaining buffers
            if (numOutputs[i] > 0) {
                ofstream outFile;
                m->openOutputFileAppend(tempDistFile, outFile);
                outFile << outputs[i];
                outFile.close();
                outputs[i] = "";
                numOutputs[i] = 0;
            }
        }

        splitNames(seqGroup, numGroups, tempDistFiles);

        if (m->control_pressed) {
            // cancelled meanwhile: remove both files of every entry in dists
            for (int i = 0; i < static_cast<int>(dists.size()); i++) {
                m->mothurRemove((dists[i].begin()->first));
                m->mothurRemove((dists[i].begin()->second));
            }
            dists.clear();
        }

        return 0;
    }
    catch(exception& e) {
        m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
        exit(1);
    }
}
Пример #7
0
/**
 * Splits the fasta file by group and, when outputType is "distance", runs
 * dist.seqs on every per-group fasta file.
 *
 * Steps:
 *  1. Every sequence of fastafile whose name is in seqGroup is appended to
 *     "<fastafile>.<group>.temp".
 *  2. Every name in seqGroup that was not met in the fasta file is reported;
 *     if there is at least one, the process exits with status 1.
 *  3. For outputType "distance", a DistanceCommand is executed per group and
 *     the per-group fasta / names / count temp files are removed.
 *  4. The per-group result file names are collected and handed to
 *     splitNamesVsearch() (method "vsearch") or splitNames().
 *
 * @param seqGroup   sequence name -> group index
 * @param numGroups  number of groups
 * @return 0. Any exception is reported through m->errorOut() and ends the
 *         process.
 */
int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups) {
    try {
        // names still waiting to be found in the fasta file
        map<string, int> unseen = seqGroup;
        // NOTE(review): filled below but not read again inside this function
        set<string> names;

        ifstream in;
        m->openInputFile(fastafile, in);

        // start from a clean slate: drop per-group fasta files of earlier runs
        for (int g = 0; g < numGroups; g++) {
            m->mothurRemove((fastafile + "." + toString(g) + ".temp"));
        }

        //parse fastafile
        while (!in.eof()) {
            Sequence query(in);
            m->gobble(in);

            const string seqName = query.getName();
            if (seqName == "") { continue; }

            map<string, int>::iterator found = seqGroup.find(seqName);

            //save names in case no namefile is given
            if ((namefile == "") && (countfile == "")) {
                names.insert(seqName);
            }

            if (found != seqGroup.end()) { //not singleton
                ofstream groupOut;
                m->openOutputFileAppend((fastafile + "." + toString(found->second) + ".temp"), groupOut);
                query.printSequence(groupOut);
                groupOut.close();
                unseen.erase(seqName);
            }
        }
        in.close();

        //complain about grouped sequences that never showed up in the fasta file
        bool error = false;
        for (map<string, int>::iterator missing = unseen.begin(); missing != unseen.end(); missing++) {
            m->mothurOut("ERROR: " + missing->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate an error.");
            m->mothurOutEndLine();
            error = true;
        }
        unseen.clear();

        if (error) {
            exit(1);
        }

        if (outputType == "distance") { //create a distance matrix for each per-group fasta file
            for (int g = 0; g < numGroups; g++) {
                const string groupFasta = fastafile + "." + toString(g) + ".temp";

                // assemble the dist.seqs option string
                string options = "fasta=" + groupFasta + ", processors=" + toString(processors);
                if (classic) {
                    options += ", output=lt";
                }
                else {
                    options += ", cutoff=" + toString(distCutoff);
                }
                if (outputDir != "") {
                    options += ", outputdir=" + outputDir;
                }

                m->mothurCalling = true;
                m->mothurOut("/******************************************/");
                m->mothurOutEndLine();
                m->mothurOut("Running command: dist.seqs(" + options + ")");
                m->mothurOutEndLine();
                m->mothurCalling = true; // set a second time after logging, as before

                Command* command = new DistanceCommand(options);

                m->mothurOut("/******************************************/");
                m->mothurOutEndLine();

                command->execute();
                delete command;
                m->mothurCalling = false;

                m->mothurRemove(groupFasta);

                //remove old names / count files just in case
                const string listFile = (namefile != "") ? namefile : countfile;
                m->mothurRemove((listFile + "." + toString(g) + ".temp"));
            }
        }

        //restore old fasta file name since dist.seqs overwrites it with the temp files
        m->setFastaFile(fastafile);

        vector<string> tempDistFiles;
        for (int g = 0; g < numGroups; g++) {
            if (outputDir == "") {
                outputDir = m->hasPath(fastafile);
            }

            const string groupFasta = fastafile + "." + toString(g) + ".temp";
            string tempDistFile = groupFasta;
            if (outputType == "distance") {
                // name of the file expected from dist.seqs for this group
                const char* extension = classic ? "phylip.dist" : "dist";
                tempDistFile = outputDir + m->getRootName(m->getSimpleName(groupFasta)) + extension;
            }
            tempDistFiles.push_back(tempDistFile);
        }

        if (method == "vsearch") {
            splitNamesVsearch(seqGroup, numGroups, tempDistFiles);
        }
        else {
            splitNames(seqGroup, numGroups, tempDistFiles);
        }

        if (m->control_pressed) {
            // cancelled: remove both files of every entry in dists
            for (int d = 0; d < dists.size(); d++) {
                m->mothurRemove((dists[d].begin()->first));
                m->mothurRemove((dists[d].begin()->second));
            }
            dists.clear();
        }

        return 0;
    }
    catch(exception& e) {
        m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
        exit(1);
    }
}