Ejemplo n.º 1
0
/** Read cluster matrix file. Can only get here if file has already been
  * determined to be in the proper format, so do no further error checking.
  * Expected format:
  *   <int> <int> <name>
  */
int DataIO_Std::ReadCmatrix(FileName const& fname,
                            DataSetList& datasetlist, std::string const& dsname)
{
  // Allocate output data set
  DataSet* ds = datasetlist.AddSet( DataSet::CMATRIX, dsname );
  if (ds == 0) return 1;
  DataSet_Cmatrix_MEM& Mat = static_cast<DataSet_Cmatrix_MEM&>( *ds );
  // Buffer file
  BufferedLine buffer;
  if (buffer.OpenFileRead( fname )) return 1;
  // Read past title. See if optional 'nframes' key is there.
  const char* ptr = buffer.Line();
  ArgList header;
  header.SetList(ptr+1, SEPARATORS );
  int nframes = header.getKeyInt("nframes", -1);
  // Need to keep track of frame indices so we can check for sieving.
  std::vector<char> sieveStatus;
  if (nframes > 0)
    sieveStatus.assign(nframes, 'T');
  // Keep track of matrix values.
  std::vector<float> Vals;
  // Read file
  bool checkSieve = true;
  int f1 = -1, f2 = -1, firstf1 = -1;
  float val = 0;
  while ( (ptr = buffer.Line()) != 0 )
  {
    if (checkSieve) {
      sscanf(ptr, "%i %i %f", &f1, &f2, &val);
      if (f2 > (int)sieveStatus.size())
        sieveStatus.resize(f2, 'T');
      if (firstf1 == -1) {
        // First values.
        sieveStatus[f1-1] = 'F';
        sieveStatus[f2-1] = 'F';
        firstf1 = f1;
      } else if (f1 > firstf1) {
          checkSieve = false;
      } else {
        sieveStatus[f2-1] = 'F';
      }
    } else {
      sscanf(ptr, "%*i %*i %f", &val);
    }
    Vals.push_back( val );
  }
  // DEBUG
  //mprintf("Sieved array:\n");
  //for (unsigned int i = 0; i < sieveStatus.size(); i++)
  //  mprintf("\t%6u %c\n", i+1, sieveStatus[i]);
  // Try to determine if sieve is random or not.
  int sieveDelta = 1;
  f1 = -1;
  f2 = -1;
  int actual_nrows = 0;
  for (int i = 0; i < (int)sieveStatus.size(); i++) {
    if (sieveStatus[i] == 'F') {
      actual_nrows++;
      if (sieveDelta != -2) {
        if (f1 == -1) {
          f1 = i;
        } else if (f2 == -1) {
          sieveDelta = i - f1;
          f1 = i;
          f2 = i;
        } else {
          int newDelta = i - f1;
          if (newDelta != sieveDelta) {
            // Random. No need to calculate sieveDelta anymore.
            sieveDelta = -2;
          }
          f1 = i;
        }
      }
    }
  }
  if (sieveDelta == -2) {
    // Random sieve. Try to figure out original sieve value.
    int o_frames = (int)sieveStatus.size();
    int o_sieve_value = o_frames / actual_nrows;
    if ( (o_frames % actual_nrows) != 0 )
      o_sieve_value++;
    sieveDelta = -o_sieve_value;
  }
  if (debug_ > 0)
    mprintf("DEBUG: sieve %i, actual_nrows= %i\n", sieveDelta, actual_nrows);
  if (sieveDelta != 1 && nframes == -1)
    mprintf("Warning: Pairwise distance matrix file contains sieved frames but\n"
            "Warning:   number of original frames is not present in file - this\n"
            "Warning:   may lead to ignored frames in cluster output. Please add\n"
            "Warning:   'nframes <# original frames>' to the pairwise distance\n"
            "Warning:   matrix file header, e.g. '#F1 F2 pw.dat nframes 1000'.\n");
  
  // Save cluster matrix
  if (Mat.Allocate( DataSet::SizeArray(1, actual_nrows) )) return 1;
  std::copy( Vals.begin(), Vals.end(), Mat.Ptr() );
  Mat.SetSieveFromArray(sieveStatus, sieveDelta);

  return 0;
}
Ejemplo n.º 2
0
// DataIO_Std::Read_1D()
int DataIO_Std::Read_1D(std::string const& fname, 
                        DataSetList& datasetlist, std::string const& dsname)
{
  ArgList labels;
  bool hasLabels = false;
  // Buffer file
  BufferedLine buffer;
  if (buffer.OpenFileRead( fname )) return 1;

  // Read the first line. Attempt to determine the number of columns
  const char* linebuffer = buffer.Line();
  if (linebuffer == 0) return 1;
  int ntoken = buffer.TokenizeLine( SEPARATORS );
  if ( ntoken == 0 ) {
    mprinterr("Error: No columns detected in %s\n", buffer.Filename().full());
    return 1;
  }

  // Try to skip past any comments. If line begins with a '#', assume it
  // contains labels. 
  bool isCommentLine = true;
  const char* ptr = linebuffer;
  while (isCommentLine) {
    // Skip past any whitespace
    while ( *ptr != '\0' && isspace(*ptr) ) ++ptr;
    // Assume these are column labels until proven otherwise.
    if (*ptr == '#') {
      labels.SetList(ptr+1, SEPARATORS );
      if (!labels.empty()) {
        hasLabels = true;
        // If first label is Frame assume it is the index column
        if (labels[0] == "Frame" && indexcol_ == -1)
          indexcol_ = 0;
      }
      linebuffer = buffer.Line();
      ptr = linebuffer;
      if (ptr == 0) {
        mprinterr("Error: No data found in file.\n");
        return 1;
      }
    } else 
      // Not a recognized comment character, assume data.
      isCommentLine = false;
  }
  // Special case: check if labels are '#F1   F2 <name> [nframes <#>]'. If so, assume
  // this is a cluster matrix file.
  if ((labels.Nargs() == 3 || labels.Nargs() == 5) && labels[0] == "F1" && labels[1] == "F2")
  {
    mprintf("Warning: Header format '#F1 F2 <name>' detected, assuming cluster pairwise matrix.\n");
    return IS_ASCII_CMATRIX;
  }
  // Column user args start from 1
  if (indexcol_ > -1)
    mprintf("\tUsing column %i as index column.\n", indexcol_ + 1);

  // Should be at first data line. Tokenize the line.
  ntoken = buffer.TokenizeLine( SEPARATORS );
  // If # of data columns does not match # labels, clear labels.
  if ( !labels.empty() && ntoken != labels.Nargs() ) {
    labels.ClearList();
    hasLabels = false;
  }
  // Index column checks
  if (indexcol_ != -1 ) {
    if (indexcol_ >= ntoken) {
      mprinterr("Error: Specified index column %i is out of range (%i columns).\n",
                indexcol_+1, ntoken);
      return 1;
    }
    if (!onlycols_.Empty() && !onlycols_.InRange(indexcol_)) {
      mprinterr("Error: Index column %i specified, but not in given column range '%s'\n",
                indexcol_+1, onlycols_.RangeArg());
      return 1;
    }
  }

  // Determine the type of data stored in each column. Assume numbers should
  // be read with double precision.
  MetaData md( dsname );
  DataSetList::DataListType inputSets;
  unsigned int nsets = 0;
  for (int col = 0; col != ntoken; ++col) {
    std::string token( buffer.NextToken() );
    if (!onlycols_.Empty() && !onlycols_.InRange( col )) {
      mprintf("\tSkipping column %i\n", col+1);
      inputSets.push_back( 0 );
    } else {
      md.SetIdx( col+1 );
      if (hasLabels) md.SetLegend( labels[col] );
      if ( col == indexcol_ ) {
        // Always save the index column as floating point
        inputSets.push_back( new DataSet_double() );
      } else if (validInteger(token)) {
        // Integer number
        inputSets.push_back( datasetlist.Allocate(DataSet::INTEGER) );
      } else if (validDouble(token)) {
        // Floating point number
        inputSets.push_back( new DataSet_double() );
      } else {
        // Assume string. Not allowed for index column.
        if (col == indexcol_) {
          mprintf("Warning: '%s' index column %i has string values. No indices will be read.\n", 
                    buffer.Filename().full(), indexcol_+1);
          indexcol_ = -1;
        }
        inputSets.push_back( new DataSet_string() );
      }
      inputSets.back()->SetMeta( md );
      nsets++;
    }
  }
  if (inputSets.empty() || nsets == 0) {
    mprinterr("Error: No data detected.\n");
    return 1;
  }

  // Read in data
  while (linebuffer != 0) {
    if ( buffer.TokenizeLine( SEPARATORS ) != ntoken ) {
      PrintColumnError(buffer.LineNumber());
      break;
    }
    // Convert data in columns
    for (int i = 0; i < ntoken; ++i) {
      const char* token = buffer.NextToken();
      if (inputSets[i] != 0) {
        if (inputSets[i]->Type() == DataSet::DOUBLE)
          ((DataSet_double*)inputSets[i])->AddElement( atof(token) );
        else if (inputSets[i]->Type() == DataSet::INTEGER)
          ((DataSet_integer*)inputSets[i])->AddElement( atoi(token) );
        else
          ((DataSet_string*)inputSets[i])->AddElement( std::string(token) );
      }
    }
    //Ndata++;
    linebuffer = buffer.Line();
  }
  buffer.CloseFile();
   mprintf("\tDataFile %s has %i columns, %i lines.\n", buffer.Filename().full(),
           ntoken, buffer.LineNumber());

  // Create list containing only data sets.
  DataSetList::DataListType mySets;
  DataSet_double* Xptr = 0;
  for (int idx = 0; idx != (int)inputSets.size(); idx++) {
    if (inputSets[idx] != 0) {
      if ( idx != indexcol_ )
        mySets.push_back( inputSets[idx] );
      else
        Xptr = (DataSet_double*)inputSets[idx];
    }
  }
  mprintf("\tRead %zu data sets.\n", mySets.size());
  std::string Xlabel;
  if (indexcol_ != -1 && indexcol_ < labels.Nargs())
    Xlabel = labels[indexcol_];
  if (Xptr == 0)
    datasetlist.AddOrAppendSets(Xlabel, DataSetList::Darray(), mySets);
  else {
    datasetlist.AddOrAppendSets(Xlabel, Xptr->Data(), mySets);
    delete Xptr;
  }

  return 0;
}