/** Read cluster matrix file. Can only get here if file has already been * determined to be in the proper format, so do no further error checking. * Expected format: * <int> <int> <name> */ int DataIO_Std::ReadCmatrix(FileName const& fname, DataSetList& datasetlist, std::string const& dsname) { // Allocate output data set DataSet* ds = datasetlist.AddSet( DataSet::CMATRIX, dsname ); if (ds == 0) return 1; DataSet_Cmatrix_MEM& Mat = static_cast<DataSet_Cmatrix_MEM&>( *ds ); // Buffer file BufferedLine buffer; if (buffer.OpenFileRead( fname )) return 1; // Read past title. See if optional 'nframes' key is there. const char* ptr = buffer.Line(); ArgList header; header.SetList(ptr+1, SEPARATORS ); int nframes = header.getKeyInt("nframes", -1); // Need to keep track of frame indices so we can check for sieving. std::vector<char> sieveStatus; if (nframes > 0) sieveStatus.assign(nframes, 'T'); // Keep track of matrix values. std::vector<float> Vals; // Read file bool checkSieve = true; int f1 = -1, f2 = -1, firstf1 = -1; float val = 0; while ( (ptr = buffer.Line()) != 0 ) { if (checkSieve) { sscanf(ptr, "%i %i %f", &f1, &f2, &val); if (f2 > (int)sieveStatus.size()) sieveStatus.resize(f2, 'T'); if (firstf1 == -1) { // First values. sieveStatus[f1-1] = 'F'; sieveStatus[f2-1] = 'F'; firstf1 = f1; } else if (f1 > firstf1) { checkSieve = false; } else { sieveStatus[f2-1] = 'F'; } } else { sscanf(ptr, "%*i %*i %f", &val); } Vals.push_back( val ); } // DEBUG //mprintf("Sieved array:\n"); //for (unsigned int i = 0; i < sieveStatus.size(); i++) // mprintf("\t%6u %c\n", i+1, sieveStatus[i]); // Try to determine if sieve is random or not. int sieveDelta = 1; f1 = -1; f2 = -1; int actual_nrows = 0; for (int i = 0; i < (int)sieveStatus.size(); i++) { if (sieveStatus[i] == 'F') { actual_nrows++; if (sieveDelta != -2) { if (f1 == -1) { f1 = i; } else if (f2 == -1) { sieveDelta = i - f1; f1 = i; f2 = i; } else { int newDelta = i - f1; if (newDelta != sieveDelta) { // Random. No need to calculate sieveDelta anymore. sieveDelta = -2; } f1 = i; } } } } if (sieveDelta == -2) { // Random sieve. Try to figure out original sieve value. int o_frames = (int)sieveStatus.size(); int o_sieve_value = o_frames / actual_nrows; if ( (o_frames % actual_nrows) != 0 ) o_sieve_value++; sieveDelta = -o_sieve_value; } if (debug_ > 0) mprintf("DEBUG: sieve %i, actual_nrows= %i\n", sieveDelta, actual_nrows); if (sieveDelta != 1 && nframes == -1) mprintf("Warning: Pairwise distance matrix file contains sieved frames but\n" "Warning: number of original frames is not present in file - this\n" "Warning: may lead to ignored frames in cluster output. Please add\n" "Warning: 'nframes <# original frames>' to the pairwise distance\n" "Warning: matrix file header, e.g. '#F1 F2 pw.dat nframes 1000'.\n"); // Save cluster matrix if (Mat.Allocate( DataSet::SizeArray(1, actual_nrows) )) return 1; std::copy( Vals.begin(), Vals.end(), Mat.Ptr() ); Mat.SetSieveFromArray(sieveStatus, sieveDelta); return 0; }
// DataIO_Std::Read_1D() int DataIO_Std::Read_1D(std::string const& fname, DataSetList& datasetlist, std::string const& dsname) { ArgList labels; bool hasLabels = false; // Buffer file BufferedLine buffer; if (buffer.OpenFileRead( fname )) return 1; // Read the first line. Attempt to determine the number of columns const char* linebuffer = buffer.Line(); if (linebuffer == 0) return 1; int ntoken = buffer.TokenizeLine( SEPARATORS ); if ( ntoken == 0 ) { mprinterr("Error: No columns detected in %s\n", buffer.Filename().full()); return 1; } // Try to skip past any comments. If line begins with a '#', assume it // contains labels. bool isCommentLine = true; const char* ptr = linebuffer; while (isCommentLine) { // Skip past any whitespace while ( *ptr != '\0' && isspace(*ptr) ) ++ptr; // Assume these are column labels until proven otherwise. if (*ptr == '#') { labels.SetList(ptr+1, SEPARATORS ); if (!labels.empty()) { hasLabels = true; // If first label is Frame assume it is the index column if (labels[0] == "Frame" && indexcol_ == -1) indexcol_ = 0; } linebuffer = buffer.Line(); ptr = linebuffer; if (ptr == 0) { mprinterr("Error: No data found in file.\n"); return 1; } } else // Not a recognized comment character, assume data. isCommentLine = false; } // Special case: check if labels are '#F1 F2 <name> [nframes <#>]'. If so, assume // this is a cluster matrix file. if ((labels.Nargs() == 3 || labels.Nargs() == 5) && labels[0] == "F1" && labels[1] == "F2") { mprintf("Warning: Header format '#F1 F2 <name>' detected, assuming cluster pairwise matrix.\n"); return IS_ASCII_CMATRIX; } // Column user args start from 1 if (indexcol_ > -1) mprintf("\tUsing column %i as index column.\n", indexcol_ + 1); // Should be at first data line. Tokenize the line. ntoken = buffer.TokenizeLine( SEPARATORS ); // If # of data columns does not match # labels, clear labels. if ( !labels.empty() && ntoken != labels.Nargs() ) { labels.ClearList(); hasLabels = false; } // Index column checks if (indexcol_ != -1 ) { if (indexcol_ >= ntoken) { mprinterr("Error: Specified index column %i is out of range (%i columns).\n", indexcol_+1, ntoken); return 1; } if (!onlycols_.Empty() && !onlycols_.InRange(indexcol_)) { mprinterr("Error: Index column %i specified, but not in given column range '%s'\n", indexcol_+1, onlycols_.RangeArg()); return 1; } } // Determine the type of data stored in each column. Assume numbers should // be read with double precision. MetaData md( dsname ); DataSetList::DataListType inputSets; unsigned int nsets = 0; for (int col = 0; col != ntoken; ++col) { std::string token( buffer.NextToken() ); if (!onlycols_.Empty() && !onlycols_.InRange( col )) { mprintf("\tSkipping column %i\n", col+1); inputSets.push_back( 0 ); } else { md.SetIdx( col+1 ); if (hasLabels) md.SetLegend( labels[col] ); if ( col == indexcol_ ) { // Always save the index column as floating point inputSets.push_back( new DataSet_double() ); } else if (validInteger(token)) { // Integer number inputSets.push_back( datasetlist.Allocate(DataSet::INTEGER) ); } else if (validDouble(token)) { // Floating point number inputSets.push_back( new DataSet_double() ); } else { // Assume string. Not allowed for index column. if (col == indexcol_) { mprintf("Warning: '%s' index column %i has string values. No indices will be read.\n", buffer.Filename().full(), indexcol_+1); indexcol_ = -1; } inputSets.push_back( new DataSet_string() ); } inputSets.back()->SetMeta( md ); nsets++; } } if (inputSets.empty() || nsets == 0) { mprinterr("Error: No data detected.\n"); return 1; } // Read in data while (linebuffer != 0) { if ( buffer.TokenizeLine( SEPARATORS ) != ntoken ) { PrintColumnError(buffer.LineNumber()); break; } // Convert data in columns for (int i = 0; i < ntoken; ++i) { const char* token = buffer.NextToken(); if (inputSets[i] != 0) { if (inputSets[i]->Type() == DataSet::DOUBLE) ((DataSet_double*)inputSets[i])->AddElement( atof(token) ); else if (inputSets[i]->Type() == DataSet::INTEGER) ((DataSet_integer*)inputSets[i])->AddElement( atoi(token) ); else ((DataSet_string*)inputSets[i])->AddElement( std::string(token) ); } } //Ndata++; linebuffer = buffer.Line(); } buffer.CloseFile(); mprintf("\tDataFile %s has %i columns, %i lines.\n", buffer.Filename().full(), ntoken, buffer.LineNumber()); // Create list containing only data sets. DataSetList::DataListType mySets; DataSet_double* Xptr = 0; for (int idx = 0; idx != (int)inputSets.size(); idx++) { if (inputSets[idx] != 0) { if ( idx != indexcol_ ) mySets.push_back( inputSets[idx] ); else Xptr = (DataSet_double*)inputSets[idx]; } } mprintf("\tRead %zu data sets.\n", mySets.size()); std::string Xlabel; if (indexcol_ != -1 && indexcol_ < labels.Nargs()) Xlabel = labels[indexcol_]; if (Xptr == 0) datasetlist.AddOrAppendSets(Xlabel, DataSetList::Darray(), mySets); else { datasetlist.AddOrAppendSets(Xlabel, Xptr->Data(), mySets); delete Xptr; } return 0; }