void SplitFasta(const char* seqDataFile, const char *outpath, const char *ext)/*{{{*/ { char seqFile[MAX_PATH+1] = ""; //char command[MAX_COMMAND_LINE+1] = ""; FILE *fpSeqData; FILE *fpSeq; fpSeqData = fopen(seqDataFile,"r"); checkfilestream(fpSeqData, seqDataFile, "r"); char id[SIZE_ID+1] = ""; char str[SIZE_ID+10+1] = ""; char rtname[MAX_PATH+1] = ""; rootname(seqDataFile,rtname); int linesize; Array1D <char> line_1darray(maxline+1); char *line = line_1darray.array1D; char delim[] = " \t\r\n>"; char *pch = NULL; fpos_t pos; //int i; int cnt = 0 ; while((linesize = fgetline(fpSeqData,line,maxline)) != EOF) { if(linesize == 0 || IsBlankLine(line)) continue; if(line[0] == '>')//beginning of a record { cnt ++ ; my_strcpy(str, line, SIZE_ID+10); pch = strtok(str,delim); while(pch != NULL) //take the first word following '>' as id { my_strcpy(id, pch, SIZE_ID); break; } if(strcmp(id,"") == 0)// if no word following '>', set the id as rootname(seq-file)_cnt { sprintf(id,"%s_%d", rtname, cnt); } printf("%d\t:%s exported\n",cnt,id); sprintf(seqFile,"%s/%s.%s",outpath,id,ext); fpSeq = fopen(seqFile,"w"); checkfilestream(fpSeq, seqFile, "r"); fprintf(fpSeq,"%s\n",line); while(!feof(fpSeqData)) { fgetpos(fpSeqData,&pos); linesize = fgetline(fpSeqData,line,maxline); if(linesize == 0 || IsBlankLine(line)) continue; if(line[0] == '>') { fsetpos(fpSeqData,&pos); break; } else fprintf(fpSeq,"%s\n",line); } fclose(fpSeq); } } fclose(fpSeqData); printf("%d sequences exported!\n",cnt); }
/** Read examples into memory */ void CSeqFeature::LoadFeatures() { unsigned int tmpFidx = 0; Scalar tmpFval = 0; unsigned int featureCnt = 0; unsigned int seqNum = 0; unsigned int phiNum = 0; unsigned int posNum1 = 0, posNum2 = 0; std::string line = ""; std::string token = ""; std::ifstream featureFp; featureFp.open(featureFile.c_str()); if(!featureFp.good()) { string msg = "Cannot open feature file <" + featureFile + ">!"; throw CBMRMException(msg, "CSeqFeature::ScanFeatureFile()"); } // read header information int headerInfoCnt = 3; // min duration, max duration, feature dimension do { getline(featureFp, line); trim(line); if(IsBlankLine(line)) continue; // blank line if(line[0] == '#') continue; // comment line if(sscanf(line.c_str(),"maxDuration:%d",&maxDuration)==1) headerInfoCnt--; if(sscanf(line.c_str(),"minDuration:%d",&minDuration)==1) headerInfoCnt--; if(sscanf(line.c_str(),"globalFeatureDim:%d",&featureDimension)==1) headerInfoCnt--; } while(!featureFp.eof() && (headerInfoCnt != 0)); assert(maxDuration >= minDuration); assert(featureDimension < (1<<30)); // featureDimension is normally less then 1 billion if(featureFp.eof()) throw CBMRMException("Feature file does not contain valid examples","CSeqFeature::LoadFeatures()"); // read sequences nnz = 0; while(!featureFp.eof()) { // read sequence number do { getline(featureFp, line); trim(line); if(IsBlankLine(line)) continue; // blank line if(line[0] == '#') continue; // comment line if(sscanf(line.c_str(),"sequence:%d",&seqNum)==1) break; } while(!featureFp.eof()); if(featureFp.eof()) throw CBMRMException("Feature file does not contain valid phi:*","CSeqFeature::LoadFeatures()"); // read phi:1 tag phiNum = 0; do { getline(featureFp, line); trim(line); if(IsBlankLine(line)) continue; // blank line if(line[0] == '#') continue; // comment line if(sscanf(line.c_str(),"phi:%d",&phiNum)==1) break; } while(!featureFp.eof()); if(featureFp.eof() || (phiNum != 1)) throw CBMRMException("Feature file does not contain valid phi:1","CSeqFeature::LoadFeatures()"); // read phi:1 sparse vectors do { getline(featureFp, line); trim(line); if(IsBlankLine(line)) continue; // blank line if(line[0] == '#') continue; // comment line if(sscanf(line.c_str(),"phi:%d",&phiNum) == 1) break; istringstream iss(line); iss >> token; if((sscanf(token.c_str(),"pos:%d",&posNum1) != 1)) throw CBMRMException("Feature file does not contain valid pos tag in phi:1","CSeqFeature::LoadFeatures()"); TheMatrix svec(1,featureDimension,SML::SPARSE); featureCnt = 0; while(!iss.eof()) { iss >> token; if(sscanf(token.c_str(),svec_feature_index_and_value_format.c_str(),&tmpFidx, &tmpFval) != 2) { ostringstream msg; msg << "Invalid #" << featureCnt + 1 << " sparse vector element in phi:"<< phiNum << " seq:" << seqNum << " pos:" << posNum1; throw CBMRMException(msg.str(),"CSeqFeature::LoadFeatures()"); } svec.Set(0,tmpFidx,tmpFval); nnz++; } if(featureCnt == 0) throw CBMRMException("Feature file does not contain valid phi:2 sparse vector","CSeqFeature::LoadFeatures()"); phi_1.push_back(svec); } while(!featureFp.eof()); if(phi_1.size() < 1) throw CBMRMException("Feature file does not contain valid phi:1","CSeqFeature::LoadFeatures()"); numOfSeq = phi_1.size(); if(featureFp.eof() || (phiNum != 2)) throw CBMRMException("Feature file does not contain valid phi:2","CSeqFeature::LoadFeatures()"); // read phi:2 sparse vectors unsigned int prevPosNum1 = 0, prevPosNum2 = 0; vector<TheMatrix> tmp_phi_2_svecs; featureCnt = 0; do { getline(featureFp, line); trim(line); if(IsBlankLine(line)) continue; // blank line if(line[0] == '#') continue; // comment line if((sscanf(line.c_str(),"phi:%d",&phiNum) == 1)) break; istringstream iss(line); iss >> token; if((sscanf(token.c_str(),"pos:%d,%d",&posNum1,&posNum2) != 2)) throw CBMRMException("Feature file does not containt valid pos tag in phi:2","CSeqFeature::LoadFeatures()"); if(prevPosNum2 >= posNum2) { ostringstream msg; msg << "previous posNum2 must be > current posNum2 in phi:2 (phi:2 pos:" << posNum1 << "," << posNum2; throw CBMRMException(msg.str(),"CSeqFeature::LoadFeatures()"); } if(prevPosNum1 >= posNum1) { ostringstream msg; msg << "previous posNum1 must be > current posNum1 in phi:2 (phi:2 pos:" << posNum1 << "," << posNum2; throw CBMRMException(msg.str(),"CSeqFeature::LoadFeatures()"); } if(posNum1 != prevPosNum1) { phi_2.push_back(tmp_phi_2_svecs); tmp_phi_2_svecs.clear(); } TheMatrix svec(1,featureDimension,SML::SPARSE); featureCnt = 0; while(!iss.eof()) { iss >> token; if(sscanf(token.c_str(),svec_feature_index_and_value_format.c_str(),&tmpFidx, &tmpFval) != 2) { ostringstream msg; msg << "Invalid #" << featureCnt + 1 << " sparse vector element in phi:"<< phiNum << " seq:" << seqNum << " pos:" << posNum1; throw CBMRMException(msg.str(),"CSeqFeature::LoadFeatures()"); } svec.Set(0,tmpFidx,tmpFval); nnz++; } if(featureCnt == 0) throw CBMRMException("Feature file does not containt valid phi:2 sparse vector","CSeqFeature::LoadFeatures()"); tmp_phi_2_svecs.push_back(svec); } while(!featureFp.eof()); if(phi_2.size() < 1) throw CBMRMException("Feature file does not contain phi:2","CSeqFeature::LoadFeatures()"); } // data matrix density density = ((double)nnz/featureDimension)/numOfSeq; featureFp.close(); }