/// Reads a delimited text file into a newly allocated integer matrix.
///
/// Each line is split with get_lexemes(); lines that yield no items are
/// skipped. Every remaining line must contain as many items as the first
/// non-empty line, otherwise a diagnostic is printed and the program exits.
/// The collected tokens are then handed to typecast_stringarray_to_int_matrix()
/// (presumably converting them to integers and storing them row by row).
///
/// Lines are read with std::getline into a std::string, so there is no fixed
/// limit on the line length. The previous fixed-size buffer made the read loop
/// spin forever as soon as a line did not fit, because the stream entered the
/// fail state without ever reaching end-of-file.
///
/// NOTE(review): all failure paths terminate the process with exit status 0;
/// this is kept unchanged for compatibility with existing callers.
///
/// @param filename A filename to read.
/// @param delim A string of delimiters.
/// @return The matrix obtained from allocIntMatrix(rows, itemsPerRow) after it
///         has been filled from the file.
iMatrix *readmatrix(std::string filename,const std::string delim=",;: \t"){
  std::ifstream pFile(filename.c_str(), std::ios::in);
  if(!pFile){
    std::cout <<"Problems opening file" <<filename<<std::endl;
    exit(0);
  }

  // The same vector is passed to get_lexemes() for every line and is only
  // consumed after the loop, so it is expected to accumulate all rows.
  std::vector<std::string> tokens;
  std::string line;
  bool firstRowSeen = false;
  int itemsInFirstRow = 0;
  int numRows = 0;

  while(std::getline(pFile, line)){
    const int nItems = get_lexemes(line, tokens, delim);
    if(nItems == 0)
      continue; // blank line (also covers empty leading lines)

    numRows++;
    if(!firstRowSeen){
      // The first non-empty line defines the expected row width.
      itemsInFirstRow = nItems;
      firstRowSeen = true;
    }
    else if(nItems != itemsInFirstRow){
      printf("row length mismatch at line:%d numitems is:%d shouldn't be:%d\t will exit\n",numRows,itemsInFirstRow,nItems);
      exit(0);
    }
  }

  iMatrix *data_ = allocIntMatrix(numRows, itemsInFirstRow);
  // now we have a token array of strings; coerce the types
  typecast_stringarray_to_int_matrix(tokens, data_);
  printf("\t-> Dimension of genotype datafile is (%d,%d)\n",data_->x,data_->y);
  return data_;
}
/// Reads a delimited text file into a newly allocated integer matrix, keeping
/// the tokens of the whole file in memory before converting them.
///
/// Each line is split with get_lexemes(); lines that yield no items are
/// skipped. Every remaining line must contain as many items as the first
/// non-empty line, otherwise a diagnostic is printed and the program exits.
/// After reading, the tokens are handed to typecast_stringarray_to_int_matrix()
/// (presumably converting them to integers and storing them row by row).
///
/// Lines are read with std::getline into a std::string, so the line length is
/// no longer bounded by a fixed buffer. With the previous fixed-size buffer a
/// line that did not fit put the stream into the fail state without reaching
/// end-of-file, and the read loop never terminated.
///
/// NOTE(review): all failure paths terminate the process with exit status 0;
/// this is kept unchanged for compatibility with existing callers.
///
/// @param filename A filename to read.
/// @param delim A string of delimiters.
/// @return The matrix obtained from allocIntMatrix(rows, itemsPerRow) after it
///         has been filled from the file.
iMatrix *readmatrix_filty_memory(std::string filename,const std::string delim=",;: \t"){
  std::ifstream pFile(filename.c_str(), std::ios::in);
  if(!pFile){
    fileError(filename);
    exit(0);
  }

  // The same vector is passed to get_lexemes() for every line and is only
  // consumed after the loop, so it is expected to accumulate all rows.
  std::vector<std::string> tokens;
  std::string line;
  bool firstRowSeen = false;
  int itemsInFirstRow = 0;
  int numRows = 0;

  while(std::getline(pFile, line)){
    const int nItems = get_lexemes(line, tokens, delim);
    if(nItems == 0)
      continue; // blank line (also covers empty leading lines)

    numRows++;
    if(!firstRowSeen){
      // The first non-empty line defines the expected row width.
      itemsInFirstRow = nItems;
      firstRowSeen = true;
    }
    else if(nItems != itemsInFirstRow){
      printf("row length mismatch at line:%d numitems is:%d shouldn't be:%d\t will exit\n",numRows,itemsInFirstRow,nItems);
      exit(0);
    }
  }

  flush_print("\r\t-> File has been read in now, will now typecheck...       ");
  iMatrix *data_ = allocIntMatrix(numRows, itemsInFirstRow);
  // now we have a token array of strings; coerce the types
  typecast_stringarray_to_int_matrix(tokens, data_);
  return data_;
}
/// Reads a binary .bed genotype file into a newly allocated nrow x ncol matrix.
///
/// The file must start with the two bytes 0x6C 0x1B; the third header byte
/// selects the storage order (non-zero is treated as "snp major"). After the
/// header every byte carries four 2-bit codes, consumed from the least
/// significant bits upwards. In snp-major order the row index runs fastest and
/// the rest of the current byte is discarded whenever a column is complete;
/// otherwise the column index runs fastest and the rest of the byte is
/// discarded whenever a row is complete.
/// (Presumably this is the PLINK binary format - TODO confirm against caller.)
///
/// Stored value per 2-bit code: 0 -> 3, 1 -> 0, 2 -> 2, 3 -> 1. This is the
/// original two-step mapping (recode table {1,0,2,3} followed by swapping the
/// values 1 and 3) folded into a single table.
///
/// The file is opened in binary mode so that no byte translation or premature
/// end-of-file can occur on platforms that distinguish text and binary streams.
///
/// NOTE(review): all failure paths terminate the process with exit status 0;
/// this is kept unchanged for compatibility with existing callers.
///
/// @param file Path of the .bed file.
/// @param nrow Number of matrix rows; must be positive.
/// @param ncol Number of matrix columns; must be positive.
/// @return The matrix obtained from allocIntMatrix(nrow, ncol), fully filled.
iMatrix *bed_to_iMatrix(const char* file, int nrow,int ncol) {
  // Final matrix value for each 2-bit code (recode + 1<->3 swap combined).
  const int genotypeForCode[4] = {3, 0, 2, 1};
  const unsigned char mask = '\x03';

  FILE *in = fopen(file, "rb");
  if (!in){
    printf("Couln't open input file: %s", file);
    exit(0);
  }

  unsigned char start[3];
  if (fread(start, 1, 3, in)!=3){
    printf("Failed to read first 3 bytes");
    exit(0);
  }
  if (start[0]!='\x6C' || start[1]!='\x1B'){
    printf("Input file does not appear to be a .bed file (%X, %X)", start[0], start[1]);
    exit(0);
  }

  // The fill loop below always writes cell (0,0) first, so an empty matrix
  // would be written out of bounds.
  if (nrow<=0 || ncol<=0){
    printf("Invalid matrix dimensions (%d,%d) for input file: %s", nrow, ncol, file);
    exit(0);
  }

  /* Create output object */
  iMatrix *returnMat = allocIntMatrix(nrow,ncol);

  /* Read in data */
  const bool snp_major = (start[2] != 0);
  unsigned char byte = 0; // current data byte, shifted right as codes are consumed
  int part = 0;           // number of unread 2-bit codes left in `byte`
  int i = 0;              // current row
  int j = 0;              // current column
  while (true) {
    if (part == 0) {
      if (feof(in) || !fread(&byte, 1, 1, in)) {
        printf("Unexpected end of file reached");
        exit(0);
      }
      part = 4;
    }
    const unsigned char code = static_cast<unsigned char>(byte & mask);
    byte = static_cast<unsigned char>(byte >> 2);
    part--;
    returnMat->matrix[i][j] = genotypeForCode[code];

    if (snp_major) {
      i++;
      if (i == nrow) {
        // Column finished: drop the padding bits of the current byte.
        i = 0;
        part = 0;
        j++;
        if (j == ncol)
          break;
      }
    }
    else {
      j++;
      if (j == ncol) {
        // Row finished: drop the padding bits of the current byte.
        j = 0;
        part = 0;
        i++;
        if (i == nrow)
          break;
      }
    }
  }
  fclose(in);
  return returnMat;
}
/// Reads a delimited text file of genotype codes into a newly allocated
/// integer matrix, using two passes over the file to keep memory usage low.
///
/// Pass 1 splits every line with get_lexemes(), skips lines that yield no
/// items, counts the remaining rows and verifies that each has as many items
/// as the first non-empty line. Pass 2 reopens the file, converts every token
/// with to_int() and stores it; a stored value outside 0..3 is reported and
/// terminates the program. Progress messages are written to stdout.
///
/// Lines are read with std::getline into a std::string, so the line length is
/// no longer bounded by a fixed buffer. With the previous fixed-size buffer a
/// line that did not fit put the stream into the fail state without reaching
/// end-of-file, and the read loops never terminated. The same happened when
/// reopening the file for pass 2 failed, which is now checked.
///
/// NOTE(review): all failure paths terminate the process with exit status 0;
/// this is kept unchanged for compatibility with existing callers.
///
/// @param filename A filename to read.
/// @param delim A string of delimiters.
/// @return The matrix obtained from allocIntMatrix(rows, itemsPerRow), filled
///         with the values read from the file.
iMatrix *readmatrix(std::string filename,const std::string delim=",;: \t"){
  std::ifstream pFile (filename.c_str(),std::ios::in);
  if(!pFile){
    std::cout <<"Problems opening file" <<filename<<std::endl;
    exit(0);
  }

  std::string line;
  std::vector<std::string> tokens; // tokens of the current line only
  bool firstRowSeen = false;
  int itemsInFirstRow = 0;
  int numRows = 0;

  // Pass 1: count the rows and make sure they all have the same width.
  flush_print("Checking consistency of file...");
  while(std::getline(pFile, line)){
    tokens.clear();
    const int nItems = get_lexemes(line, tokens, delim);
    if(nItems == 0)
      continue; // blank line (also covers empty leading lines)

    numRows++;
    if(!firstRowSeen){
      // The first non-empty line defines the expected row width.
      itemsInFirstRow = nItems;
      firstRowSeen = true;
    }
    else if(nItems != itemsInFirstRow){
      printf("row length mismatch at line:%d numitems is:%d shouldn't be:%d\t will exit\n",numRows,itemsInFirstRow,nItems);
      exit(0);
    }

    if ((numRows%20 )==0){
      printf("\r\t-> Checking consistency of file: (checking number of items at line: %d )",numRows);
      fflush(stdout);
    }
  }
  pFile.close();
  fflush(stdout);

  iMatrix *mat = allocIntMatrix(numRows,itemsInFirstRow);

  // Pass 2: read the file again and fill the matrix row by row.
  std::ifstream pFile2 (filename.c_str(),std::ios::in);
  if(!pFile2){
    std::cout <<"Problems opening file" <<filename<<std::endl;
    exit(0);
  }

  int row = 0;
  while(std::getline(pFile2, line)){
    if ((row%5 )==0){
      printf("\r\t-> Checking consistency of file: (Now reading in data at line: %d/%d )       ",row,mat->x);
      fflush(stdout);
    }

    tokens.clear();
    const int itemsInRow = get_lexemes(line, tokens, delim);
    if (itemsInRow == 0)
      continue;

    const int nCols = static_cast<int>(tokens.size());
    for(int col = 0; col < nCols; col++){
      mat->matrix[row][col] = to_int(tokens[col]);
      if(mat->matrix[row][col]<0 || mat->matrix[row][col]>3){
        printf("\n\t-> Error in genotype data: (%d,%d)=%d, value should be between 0 and 3.\n",row,col,mat->matrix[row][col]);
        exit(0);
      }
    }
    row++;
  }
  pFile2.close();
  fflush(stdout);
  return mat;
}